Compare commits

Comparing `test-rever` ... `showcase` — 12 Commits

| Author | SHA1 | Date |
|---|---|---|
|  | 8c99f0c166 |  |
|  | 520e07e756 |  |
|  | 674f552ff1 |  |
|  | 75d9236520 |  |
|  | 89926bce0b |  |
|  | 6c1fc4b4ed |  |
|  | 311f06047b |  |
|  | dcf9483b0b |  |
|  | 2b828a9fee |  |
|  | 6fe1f86984 |  |
|  | af073f13cf |  |
```diff
@@ -560,6 +560,60 @@ struct Values {
                                          false,
                                          &sample_shading};
 
+#ifdef ANDROID
+    // Shader Float Controls (Android only) - Eden Veil / Extensions
+    // Force enable VK_KHR_shader_float_controls even if driver has known issues
+    // Allows fine-tuning float behavior to match Switch/Maxwell or optimize performance
+    SwitchableSetting<bool> shader_float_controls_force_enable{linkage,
+                                                               false,
+                                                               "shader_float_controls_force_enable",
+                                                               Category::RendererExtensions,
+                                                               Specialization::Paired};
+
+    // Individual float behavior controls (visible only when force_enable is true)
+    // Multiple can be active simultaneously EXCEPT FTZ and DenormPreserve (mutually exclusive)
+    //
+    // Recommended configurations:
+    //   Switch-native: FTZ=ON, RTE=ON, SignedZero=ON (matches Maxwell behavior)
+    //   Performance:   FTZ=ON only (fastest)
+    //   Accuracy:      DenormPreserve=ON, RTE=ON, SignedZero=ON (slowest, highest precision)
+    SwitchableSetting<bool> shader_float_ftz{linkage,
+                                             false,
+                                             "shader_float_ftz",
+                                             Category::RendererExtensions,
+                                             Specialization::Default,
+                                             true,
+                                             false,
+                                             &shader_float_controls_force_enable};
+
+    SwitchableSetting<bool> shader_float_denorm_preserve{linkage,
+                                                         false,
+                                                         "shader_float_denorm_preserve",
+                                                         Category::RendererExtensions,
+                                                         Specialization::Default,
+                                                         true,
+                                                         false,
+                                                         &shader_float_controls_force_enable};
+
+    SwitchableSetting<bool> shader_float_rte{linkage,
+                                             false,
+                                             "shader_float_rte",
+                                             Category::RendererExtensions,
+                                             Specialization::Default,
+                                             true,
+                                             false,
+                                             &shader_float_controls_force_enable};
+
+    SwitchableSetting<bool> shader_float_signed_zero_inf_nan{linkage,
+                                                             false,
+                                                             "shader_float_signed_zero_inf_nan",
+                                                             Category::RendererExtensions,
+                                                             Specialization::Default,
+                                                             true,
+                                                             false,
+                                                             &shader_float_controls_force_enable};
+#endif
+
     Setting<bool> renderer_debug{linkage, false, "debug", Category::RendererDebug};
     Setting<bool> renderer_shader_feedback{linkage, false, "shader_feedback",
                                            Category::RendererDebug};
```
```diff
@@ -152,6 +152,16 @@ ENUM(SpirvOptimizeMode, Never, OnLoad, Always);
 ENUM(GpuOverclock, Low, Medium, High)
 ENUM(TemperatureUnits, Celsius, Fahrenheit)
 
+// Shader Float Controls behavior modes
+// These control how floating-point denormals and special values are handled in shaders
+ENUM(ShaderFloatBehavior,
+     DriverDefault,     // Let driver choose (safest, may not match Switch behavior)
+     SwitchNative,      // Emulate Switch/Maxwell behavior (FTZ + RTE + SignedZero)
+     FlushToZero,       // FTZ only - flush denorms to zero (fastest, some precision loss)
+     PreserveDenorms,   // Preserve denorms (slowest, highest precision)
+     RoundToEven,       // RTE rounding mode (IEEE 754 compliant)
+     SignedZeroInfNan); // Preserve signed zero, inf, nan (accuracy for edge cases)
+
 template <typename Type>
 inline std::string_view CanonicalizeEnum(Type id) {
     const auto group = EnumMetadata<Type>::Canonicalizations();
```
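For orientation, a minimal sketch (not part of the diff) of how these presets could expand into the four individual force flags introduced in the `Profile` changes further down. The struct and helper names here are hypothetical, and it assumes the `ENUM` macro yields an enum class as elsewhere in the settings code:

```cpp
// Hypothetical helper: expands a ShaderFloatBehavior preset into the four
// per-behavior force flags (ftz and preserve are mutually exclusive).
struct FloatControlFlags {
    bool ftz{};      // flush denorms to zero
    bool preserve{}; // preserve denorms
    bool rte{};      // round-to-nearest-even
    bool szinf{};    // keep signed zero / inf / nan
};

constexpr FloatControlFlags FlagsForBehavior(ShaderFloatBehavior behavior) {
    switch (behavior) {
    case ShaderFloatBehavior::SwitchNative:
        return {.ftz = true, .rte = true, .szinf = true}; // per the comment above
    case ShaderFloatBehavior::FlushToZero:
        return {.ftz = true};
    case ShaderFloatBehavior::PreserveDenorms:
        return {.preserve = true};
    case ShaderFloatBehavior::RoundToEven:
        return {.rte = true};
    case ShaderFloatBehavior::SignedZeroInfNan:
        return {.szinf = true};
    case ShaderFloatBehavior::DriverDefault:
    default:
        return {}; // leave everything to the driver
    }
}
```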
```diff
@@ -341,19 +341,35 @@ void DefineEntryPoint(const IR::Program& program, EmitContext& ctx, Id main) {
 void SetupDenormControl(const Profile& profile, const IR::Program& program, EmitContext& ctx,
                         Id main_func) {
     const Info& info{program.info};
 
+    // User-forced behavior overrides (Android Eden Veil/Extensions)
+    // When force flags are active, they take precedence over shader-declared behavior
+    const bool force_flush = profile.force_fp32_denorm_flush;
+    const bool force_preserve = profile.force_fp32_denorm_preserve;
+
+    if (force_flush && force_preserve) {
+        LOG_WARNING(Shader_SPIRV, "Both FTZ and Preserve forced simultaneously - FTZ takes precedence");
+    }
+
     if (info.uses_fp32_denorms_flush && info.uses_fp32_denorms_preserve) {
         LOG_DEBUG(Shader_SPIRV, "Fp32 denorm flush and preserve on the same shader");
-    } else if (info.uses_fp32_denorms_flush) {
+    } else if (force_flush || info.uses_fp32_denorms_flush) {
         if (profile.support_fp32_denorm_flush) {
             ctx.AddCapability(spv::Capability::DenormFlushToZero);
             ctx.AddExecutionMode(main_func, spv::ExecutionMode::DenormFlushToZero, 32U);
+            if (force_flush) {
+                LOG_DEBUG(Shader_SPIRV, "Fp32 DenormFlushToZero FORCED by user setting");
+            }
         } else {
             // Drivers will most likely flush denorms by default, no need to warn
         }
-    } else if (info.uses_fp32_denorms_preserve) {
+    } else if (force_preserve || info.uses_fp32_denorms_preserve) {
         if (profile.support_fp32_denorm_preserve) {
             ctx.AddCapability(spv::Capability::DenormPreserve);
             ctx.AddExecutionMode(main_func, spv::ExecutionMode::DenormPreserve, 32U);
+            if (force_preserve) {
+                LOG_DEBUG(Shader_SPIRV, "Fp32 DenormPreserve FORCED by user setting");
+            }
         } else {
             LOG_DEBUG(Shader_SPIRV, "Fp32 denorm preserve used in shader without host support");
         }
```
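The control flow above reduces to a small decision table. A sketch (not in the tree) of the effective FP32 denorm mode it implies, assuming the hunk's precedence rules: force flags join the shader-declared flags in the same else-if chain, and the FTZ branch is tested first, so FTZ wins when both are forced:

```cpp
enum class DenormMode { DriverDefault, FlushToZero, Preserve };

DenormMode EffectiveFp32DenormMode(bool force_flush, bool force_preserve,
                                   bool shader_flush, bool shader_preserve) {
    if (shader_flush && shader_preserve) {
        return DenormMode::DriverDefault; // ambiguous shader declaration: only logged
    }
    if (force_flush || shader_flush) {
        return DenormMode::FlushToZero;
    }
    if (force_preserve || shader_preserve) {
        return DenormMode::Preserve;
    }
    return DenormMode::DriverDefault;
}
```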
```diff
@@ -386,13 +402,24 @@ void SetupSignedNanCapabilities(const Profile& profile, const IR::Program& program,
     if (profile.has_broken_fp16_float_controls && program.info.uses_fp16) {
         return;
     }
+
+    // User-forced behavior (Android Eden Veil/Extensions)
+    const bool force_signed_zero_inf_nan = profile.force_fp32_signed_zero_inf_nan;
+
     if (program.info.uses_fp16 && profile.support_fp16_signed_zero_nan_preserve) {
         ctx.AddCapability(spv::Capability::SignedZeroInfNanPreserve);
         ctx.AddExecutionMode(main_func, spv::ExecutionMode::SignedZeroInfNanPreserve, 16U);
     }
-    if (profile.support_fp32_signed_zero_nan_preserve) {
-        ctx.AddCapability(spv::Capability::SignedZeroInfNanPreserve);
-        ctx.AddExecutionMode(main_func, spv::ExecutionMode::SignedZeroInfNanPreserve, 32U);
+    if (force_signed_zero_inf_nan || profile.support_fp32_signed_zero_nan_preserve) {
+        if (profile.support_fp32_signed_zero_nan_preserve) {
+            ctx.AddCapability(spv::Capability::SignedZeroInfNanPreserve);
+            ctx.AddExecutionMode(main_func, spv::ExecutionMode::SignedZeroInfNanPreserve, 32U);
+            if (force_signed_zero_inf_nan) {
+                LOG_DEBUG(Shader_SPIRV, "Fp32 SignedZeroInfNanPreserve FORCED by user setting");
+            }
+        } else if (force_signed_zero_inf_nan) {
+            LOG_WARNING(Shader_SPIRV, "SignedZeroInfNanPreserve forced but driver doesn't support it");
+        }
     }
     if (program.info.uses_fp64 && profile.support_fp64_signed_zero_nan_preserve) {
         ctx.AddCapability(spv::Capability::SignedZeroInfNanPreserve);
```
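Taken together with the denorm hunk, a forced configuration maps to a fixed set of FP32 execution modes. A sketch of that mapping, assuming the RTE path (which is not shown in these hunks) mirrors the denorm and signed-zero paths:

```cpp
#include <vector>

std::vector<spv::ExecutionMode> ForcedFp32Modes(bool ftz, bool preserve, bool rte, bool szinf) {
    std::vector<spv::ExecutionMode> modes;
    if (ftz) {
        modes.push_back(spv::ExecutionMode::DenormFlushToZero);
    } else if (preserve) { // FTZ takes precedence on conflict (see SetupDenormControl)
        modes.push_back(spv::ExecutionMode::DenormPreserve);
    }
    if (rte) {
        modes.push_back(spv::ExecutionMode::RoundingModeRTE);
    }
    if (szinf) {
        modes.push_back(spv::ExecutionMode::SignedZeroInfNanPreserve);
    }
    return modes; // each mode is only emitted when the driver reports support
}
```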
```diff
@@ -95,7 +95,7 @@ void EmitLoadGlobalS16(EmitContext&) {
 }
 
 Id EmitLoadGlobal32(EmitContext& ctx, Id address) {
-    if (ctx.SupportsNativeInt64() || ctx.UsesInt64Emulation()) {
+    if (ctx.profile.support_int64) {
         return ctx.OpFunctionCall(ctx.U32[1], ctx.load_global_func_u32, address);
     }
     LOG_WARNING(Shader_SPIRV, "Int64 not supported, ignoring memory operation");
@@ -103,7 +103,7 @@ Id EmitLoadGlobal32(EmitContext& ctx, Id address) {
 }
 
 Id EmitLoadGlobal64(EmitContext& ctx, Id address) {
-    if (ctx.SupportsNativeInt64() || ctx.UsesInt64Emulation()) {
+    if (ctx.profile.support_int64) {
         return ctx.OpFunctionCall(ctx.U32[2], ctx.load_global_func_u32x2, address);
     }
     LOG_WARNING(Shader_SPIRV, "Int64 not supported, ignoring memory operation");
@@ -111,7 +111,7 @@ Id EmitLoadGlobal64(EmitContext& ctx, Id address) {
 }
 
 Id EmitLoadGlobal128(EmitContext& ctx, Id address) {
-    if (ctx.SupportsNativeInt64() || ctx.UsesInt64Emulation()) {
+    if (ctx.profile.support_int64) {
         return ctx.OpFunctionCall(ctx.U32[4], ctx.load_global_func_u32x4, address);
     }
     LOG_WARNING(Shader_SPIRV, "Int64 not supported, ignoring memory operation");
@@ -135,7 +135,7 @@ void EmitWriteGlobalS16(EmitContext&) {
 }
 
 void EmitWriteGlobal32(EmitContext& ctx, Id address, Id value) {
-    if (ctx.SupportsNativeInt64() || ctx.UsesInt64Emulation()) {
+    if (ctx.profile.support_int64) {
         ctx.OpFunctionCall(ctx.void_id, ctx.write_global_func_u32, address, value);
         return;
     }
@@ -143,7 +143,7 @@ void EmitWriteGlobal32(EmitContext& ctx, Id address, Id value) {
 }
 
 void EmitWriteGlobal64(EmitContext& ctx, Id address, Id value) {
-    if (ctx.SupportsNativeInt64() || ctx.UsesInt64Emulation()) {
+    if (ctx.profile.support_int64) {
         ctx.OpFunctionCall(ctx.void_id, ctx.write_global_func_u32x2, address, value);
         return;
     }
@@ -151,7 +151,7 @@ void EmitWriteGlobal64(EmitContext& ctx, Id address, Id value) {
 }
 
 void EmitWriteGlobal128(EmitContext& ctx, Id address, Id value) {
-    if (ctx.SupportsNativeInt64() || ctx.UsesInt64Emulation()) {
+    if (ctx.profile.support_int64) {
         ctx.OpFunctionCall(ctx.void_id, ctx.write_global_func_u32x4, address, value);
         return;
     }
```
```diff
@@ -460,14 +460,9 @@ void VectorTypes::Define(Sirit::Module& sirit_ctx, Id base_type, std::string_view
 
 EmitContext::EmitContext(const Profile& profile_, const RuntimeInfo& runtime_info_,
                          IR::Program& program, Bindings& bindings)
-    : Sirit::Module(profile_.supported_spirv), profile{profile_}, runtime_info{runtime_info_},
-      stage{program.stage},
-      // Enable int64 emulation if host lacks int64 but we either use int64 ops
-      // or we need 64-bit addressing for global memory operations.
-      emulate_int64{!profile.support_int64 &&
-                    (program.info.uses_int64 || program.info.uses_global_memory)},
-      texture_rescaling_index{bindings.texture_scaling_index},
-      image_rescaling_index{bindings.image_scaling_index} {
+    : Sirit::Module(profile_.supported_spirv), profile{profile_}, runtime_info{runtime_info_},
+      stage{program.stage}, texture_rescaling_index{bindings.texture_scaling_index},
+      image_rescaling_index{bindings.image_scaling_index} {
     const bool is_unified{profile.unified_descriptor_binding};
     u32& uniform_binding{is_unified ? bindings.unified : bindings.uniform_buffer};
     u32& storage_binding{is_unified ? bindings.unified : bindings.storage_buffer};
```
```diff
@@ -937,163 +932,11 @@ void EmitContext::DefineWriteStorageCasLoopFunction(const Info& info) {
 }
 
 void EmitContext::DefineGlobalMemoryFunctions(const Info& info) {
-    if (!info.uses_global_memory) {
+    if (!info.uses_global_memory || !profile.support_int64) {
         return;
     }
     using DefPtr = Id StorageDefinitions::*;
     const Id zero{u32_zero_value};
-
-    if (SupportsNativeInt64()) {
-        const auto define_body{[&](DefPtr ssbo_member, Id addr, Id element_pointer, u32 shift,
-                                   auto&& callback) {
-            AddLabel();
-            const size_t num_buffers{info.storage_buffers_descriptors.size()};
-            for (size_t index = 0; index < num_buffers; ++index) {
-                if (!info.nvn_buffer_used[index]) {
-                    continue;
-                }
-                const auto& ssbo{info.storage_buffers_descriptors[index]};
-                const Id ssbo_addr_cbuf_offset{Const(ssbo.cbuf_offset / 8)};
-                const Id ssbo_size_cbuf_offset{Const(ssbo.cbuf_offset / 4 + 2)};
-                const Id ssbo_addr_pointer{OpAccessChain(
-                    uniform_types.U32x2, cbufs[ssbo.cbuf_index].U32x2, zero,
-                    ssbo_addr_cbuf_offset)};
-                const Id ssbo_size_pointer{OpAccessChain(
-                    uniform_types.U32, cbufs[ssbo.cbuf_index].U32, zero, ssbo_size_cbuf_offset)};
-
-                const u64 ssbo_align_mask{~(profile.min_ssbo_alignment - 1U)};
-                const Id unaligned_addr{OpBitcast(U64, OpLoad(U32[2], ssbo_addr_pointer))};
-                const Id ssbo_addr{OpBitwiseAnd(U64, unaligned_addr, Constant(U64, ssbo_align_mask))};
-                const Id ssbo_size{OpUConvert(U64, OpLoad(U32[1], ssbo_size_pointer))};
-                const Id ssbo_end{OpIAdd(U64, ssbo_addr, ssbo_size)};
-                const Id cond{OpLogicalAnd(U1, OpUGreaterThanEqual(U1, addr, ssbo_addr),
-                                           OpULessThan(U1, addr, ssbo_end))};
-                const Id then_label{OpLabel()};
-                const Id else_label{OpLabel()};
-                OpSelectionMerge(else_label, spv::SelectionControlMask::MaskNone);
-                OpBranchConditional(cond, then_label, else_label);
-                AddLabel(then_label);
-                const Id ssbo_id{ssbos[index].*ssbo_member};
-                const Id ssbo_offset{OpUConvert(U32[1], OpISub(U64, addr, ssbo_addr))};
-                const Id ssbo_index{OpShiftRightLogical(U32[1], ssbo_offset, Const(shift))};
-                const Id ssbo_pointer{OpAccessChain(element_pointer, ssbo_id, zero, ssbo_index)};
-                callback(ssbo_pointer);
-                AddLabel(else_label);
-            }
-        }};
-        const auto define_load{[&](DefPtr ssbo_member, Id element_pointer, Id type, u32 shift) {
-            const Id function_type{TypeFunction(type, U64)};
-            const Id func_id{OpFunction(type, spv::FunctionControlMask::MaskNone, function_type)};
-            const Id addr{OpFunctionParameter(U64)};
-            define_body(ssbo_member, addr, element_pointer, shift,
-                        [&](Id ssbo_pointer) { OpReturnValue(OpLoad(type, ssbo_pointer)); });
-            OpReturnValue(ConstantNull(type));
-            OpFunctionEnd();
-            return func_id;
-        }};
-        const auto define_write{[&](DefPtr ssbo_member, Id element_pointer, Id type, u32 shift) {
-            const Id function_type{TypeFunction(void_id, U64, type)};
-            const Id func_id{
-                OpFunction(void_id, spv::FunctionControlMask::MaskNone, function_type)};
-            const Id addr{OpFunctionParameter(U64)};
-            const Id data{OpFunctionParameter(type)};
-            define_body(ssbo_member, addr, element_pointer, shift, [&](Id ssbo_pointer) {
-                OpStore(ssbo_pointer, data);
-                OpReturn();
-            });
-            OpReturn();
-            OpFunctionEnd();
-            return func_id;
-        }};
-        const auto define{
-            [&](DefPtr ssbo_member, const StorageTypeDefinition& type_def, Id type, size_t size) {
-                const Id element_type{type_def.element};
-                const u32 shift{static_cast<u32>(std::countr_zero(size))};
-                const Id load_func{define_load(ssbo_member, element_type, type, shift)};
-                const Id write_func{define_write(ssbo_member, element_type, type, shift)};
-                return std::make_pair(load_func, write_func);
-            }};
-        std::tie(load_global_func_u32, write_global_func_u32) =
-            define(&StorageDefinitions::U32, storage_types.U32, U32[1], sizeof(u32));
-        std::tie(load_global_func_u32x2, write_global_func_u32x2) =
-            define(&StorageDefinitions::U32x2, storage_types.U32x2, U32[2], sizeof(u32[2]));
-        std::tie(load_global_func_u32x4, write_global_func_u32x4) =
-            define(&StorageDefinitions::U32x4, storage_types.U32x4, U32[4], sizeof(u32[4]));
-        return;
-    }
-
-    if (!UsesInt64Emulation()) {
-        return;
-    }
-
-    const auto make_pair = [&](Id lo, Id hi) {
-        return OpCompositeConstruct(U32[2], lo, hi);
-    };
-    const auto split_pair = [&](Id value) {
-        return std::array<Id, 2>{OpCompositeExtract(U32[1], value, 0U),
-                                 OpCompositeExtract(U32[1], value, 1U)};
-    };
-    const auto bool_to_u32 = [&](Id predicate) {
-        return OpSelect(U32[1], predicate, Const(1u), zero);
-    };
-    const auto and_pair = [&](Id value, Id mask) {
-        const auto value_parts{split_pair(value)};
-        const auto mask_parts{split_pair(mask)};
-        return make_pair(OpBitwiseAnd(U32[1], value_parts[0], mask_parts[0]),
-                         OpBitwiseAnd(U32[1], value_parts[1], mask_parts[1]));
-    };
-    const auto add_pair = [&](Id lhs, Id rhs) {
-        const auto lhs_parts{split_pair(lhs)};
-        const auto rhs_parts{split_pair(rhs)};
-        const Id sum_lo{OpIAdd(U32[1], lhs_parts[0], rhs_parts[0])};
-        const Id carry{OpULessThan(U1, sum_lo, lhs_parts[0])};
-        Id sum_hi{OpIAdd(U32[1], lhs_parts[1], rhs_parts[1])};
-        sum_hi = OpIAdd(U32[1], sum_hi, bool_to_u32(carry));
-        return make_pair(sum_lo, sum_hi);
-    };
-    const auto sub_pair = [&](Id lhs, Id rhs) {
-        const auto lhs_parts{split_pair(lhs)};
-        const auto rhs_parts{split_pair(rhs)};
-        const Id borrow{OpULessThan(U1, lhs_parts[0], rhs_parts[0])};
-        const Id diff_lo{OpISub(U32[1], lhs_parts[0], rhs_parts[0])};
-        Id diff_hi{OpISub(U32[1], lhs_parts[1], rhs_parts[1])};
-        diff_hi = OpISub(U32[1], diff_hi, bool_to_u32(borrow));
-        return make_pair(diff_lo, diff_hi);
-    };
-    const auto shift_right_pair = [&](Id value, u32 shift) {
-        if (shift == 0) {
-            return value;
-        }
-        const auto parts{split_pair(value)};
-        const Id shift_id{Const(shift)};
-        const Id high_shifted{OpShiftRightLogical(U32[1], parts[1], shift_id)};
-        Id low_shifted{OpShiftRightLogical(U32[1], parts[0], shift_id)};
-        const Id carry_bits{OpShiftLeftLogical(U32[1], parts[1], Const(32u - shift))};
-        low_shifted = OpBitwiseOr(U32[1], low_shifted, carry_bits);
-        return make_pair(low_shifted, high_shifted);
-    };
-    const auto greater_equal_pair = [&](Id lhs, Id rhs) {
-        const auto lhs_parts{split_pair(lhs)};
-        const auto rhs_parts{split_pair(rhs)};
-        const Id hi_gt{OpUGreaterThan(U1, lhs_parts[1], rhs_parts[1])};
-        const Id hi_eq{OpIEqual(U1, lhs_parts[1], rhs_parts[1])};
-        const Id lo_ge{OpUGreaterThanEqual(U1, lhs_parts[0], rhs_parts[0])};
-        return OpLogicalOr(U1, hi_gt, OpLogicalAnd(U1, hi_eq, lo_ge));
-    };
-    const auto less_than_pair = [&](Id lhs, Id rhs) {
-        const auto lhs_parts{split_pair(lhs)};
-        const auto rhs_parts{split_pair(rhs)};
-        const Id hi_lt{OpULessThan(U1, lhs_parts[1], rhs_parts[1])};
-        const Id hi_eq{OpIEqual(U1, lhs_parts[1], rhs_parts[1])};
-        const Id lo_lt{OpULessThan(U1, lhs_parts[0], rhs_parts[0])};
-        return OpLogicalOr(U1, hi_lt, OpLogicalAnd(U1, hi_eq, lo_lt));
-    };
-
-    const u64 ssbo_align_mask_value{~(profile.min_ssbo_alignment - 1U)};
-    const Id ssbo_align_mask{
-        Const(static_cast<u32>(ssbo_align_mask_value & 0xFFFFFFFFu),
-              static_cast<u32>(ssbo_align_mask_value >> 32))};
-
     const auto define_body{[&](DefPtr ssbo_member, Id addr, Id element_pointer, u32 shift,
                                auto&& callback) {
         AddLabel();
@@ -1110,44 +953,40 @@ void EmitContext::DefineGlobalMemoryFunctions(const Info& info) {
             const Id ssbo_size_pointer{OpAccessChain(uniform_types.U32, cbufs[ssbo.cbuf_index].U32,
                                                      zero, ssbo_size_cbuf_offset)};
 
-            const Id unaligned_addr_pair{OpLoad(U32[2], ssbo_addr_pointer)};
-            const Id ssbo_addr_pair{and_pair(unaligned_addr_pair, ssbo_align_mask)};
-            const Id ssbo_size_value{OpLoad(U32[1], ssbo_size_pointer)};
-            const Id ssbo_size_pair{make_pair(ssbo_size_value, zero)};
-            const Id ssbo_end_pair{add_pair(ssbo_addr_pair, ssbo_size_pair)};
-            const Id cond{OpLogicalAnd(U1, greater_equal_pair(addr, ssbo_addr_pair),
-                                       less_than_pair(addr, ssbo_end_pair))};
+            const u64 ssbo_align_mask{~(profile.min_ssbo_alignment - 1U)};
+            const Id unaligned_addr{OpBitcast(U64, OpLoad(U32[2], ssbo_addr_pointer))};
+            const Id ssbo_addr{OpBitwiseAnd(U64, unaligned_addr, Constant(U64, ssbo_align_mask))};
+            const Id ssbo_size{OpUConvert(U64, OpLoad(U32[1], ssbo_size_pointer))};
+            const Id ssbo_end{OpIAdd(U64, ssbo_addr, ssbo_size)};
+            const Id cond{OpLogicalAnd(U1, OpUGreaterThanEqual(U1, addr, ssbo_addr),
+                                       OpULessThan(U1, addr, ssbo_end))};
             const Id then_label{OpLabel()};
             const Id else_label{OpLabel()};
             OpSelectionMerge(else_label, spv::SelectionControlMask::MaskNone);
             OpBranchConditional(cond, then_label, else_label);
             AddLabel(then_label);
             const Id ssbo_id{ssbos[index].*ssbo_member};
-            const Id ssbo_offset_pair{sub_pair(addr, ssbo_addr_pair)};
-            const Id ssbo_index_pair{shift_right_pair(ssbo_offset_pair, shift)};
-            const Id ssbo_index{OpCompositeExtract(U32[1], ssbo_index_pair, 0U)};
+            const Id ssbo_offset{OpUConvert(U32[1], OpISub(U64, addr, ssbo_addr))};
+            const Id ssbo_index{OpShiftRightLogical(U32[1], ssbo_offset, Const(shift))};
             const Id ssbo_pointer{OpAccessChain(element_pointer, ssbo_id, zero, ssbo_index)};
             callback(ssbo_pointer);
             AddLabel(else_label);
         }
     }};
 
     const auto define_load{[&](DefPtr ssbo_member, Id element_pointer, Id type, u32 shift) {
-        const Id function_type{TypeFunction(type, U32[2])};
+        const Id function_type{TypeFunction(type, U64)};
         const Id func_id{OpFunction(type, spv::FunctionControlMask::MaskNone, function_type)};
-        const Id addr{OpFunctionParameter(U32[2])};
+        const Id addr{OpFunctionParameter(U64)};
         define_body(ssbo_member, addr, element_pointer, shift,
                     [&](Id ssbo_pointer) { OpReturnValue(OpLoad(type, ssbo_pointer)); });
         OpReturnValue(ConstantNull(type));
         OpFunctionEnd();
         return func_id;
     }};
 
     const auto define_write{[&](DefPtr ssbo_member, Id element_pointer, Id type, u32 shift) {
-        const Id function_type{TypeFunction(void_id, U32[2], type)};
-        const Id func_id{
-            OpFunction(void_id, spv::FunctionControlMask::MaskNone, function_type)};
-        const Id addr{OpFunctionParameter(U32[2])};
+        const Id function_type{TypeFunction(void_id, U64, type)};
+        const Id func_id{OpFunction(void_id, spv::FunctionControlMask::MaskNone, function_type)};
+        const Id addr{OpFunctionParameter(U64)};
         const Id data{OpFunctionParameter(type)};
         define_body(ssbo_member, addr, element_pointer, shift, [&](Id ssbo_pointer) {
             OpStore(ssbo_pointer, data);
@@ -1157,7 +996,6 @@ void EmitContext::DefineGlobalMemoryFunctions(const Info& info) {
         OpFunctionEnd();
         return func_id;
     }};
-
     const auto define{
         [&](DefPtr ssbo_member, const StorageTypeDefinition& type_def, Id type, size_t size) {
             const Id element_type{type_def.element};
@@ -1166,7 +1004,6 @@ void EmitContext::DefineGlobalMemoryFunctions(const Info& info) {
             const Id write_func{define_write(ssbo_member, element_type, type, shift)};
             return std::make_pair(load_func, write_func);
         }};
-
     std::tie(load_global_func_u32, write_global_func_u32) =
         define(&StorageDefinitions::U32, storage_types.U32, U32[1], sizeof(u32));
     std::tie(load_global_func_u32x2, write_global_func_u32x2) =
```
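The removed emulation lambdas implement 64-bit arithmetic on `(lo, hi)` pairs of 32-bit words. For reference, the same carry and comparison logic in plain C++, which is easier to sanity-check than the SPIR-V builder calls (a sketch, not part of the tree):

```cpp
#include <cstdint>
#include <utility>

using Pair = std::pair<uint32_t, uint32_t>; // {lo, hi}

// 64-bit add on 32-bit halves: a carry out of the low word feeds the high
// word, mirroring the deleted add_pair lambda (OpIAdd + OpULessThan carry detect).
Pair AddPair(Pair a, Pair b) {
    const uint32_t lo = a.first + b.first;
    const uint32_t carry = lo < a.first ? 1u : 0u; // unsigned wraparound == carry
    return {lo, a.second + b.second + carry};
}

// 64-bit unsigned compare, mirroring less_than_pair: the high words decide
// unless they are equal, in which case the low words do.
bool LessThanPair(Pair a, Pair b) {
    return a.second < b.second || (a.second == b.second && a.first < b.first);
}
```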
```diff
@@ -210,15 +210,6 @@ public:
     const Profile& profile;
     const RuntimeInfo& runtime_info;
     Stage stage{};
-    const bool emulate_int64{};
-
-    bool SupportsNativeInt64() const {
-        return profile.support_int64;
-    }
-
-    bool UsesInt64Emulation() const {
-        return emulate_int64;
-    }
 
     Id void_id{};
     Id U1{};
```
```diff
@@ -28,6 +28,14 @@ struct Profile {
     bool support_fp16_signed_zero_nan_preserve{};
     bool support_fp32_signed_zero_nan_preserve{};
     bool support_fp64_signed_zero_nan_preserve{};
+
+    // User-forced float behavior overrides (Android Eden Veil/Extensions)
+    // When shader_float_controls_force_enable is true, these override shader-declared behavior
+    bool force_fp32_denorm_flush{};        // Force FTZ for all FP32 ops
+    bool force_fp32_denorm_preserve{};     // Force denorm preservation for all FP32 ops
+    bool force_fp32_rte_rounding{};        // Force Round-To-Even for all FP32 ops
+    bool force_fp32_signed_zero_inf_nan{}; // Force signed zero/inf/nan preservation
+
     bool support_explicit_workgroup_layout{};
     bool support_vote{};
     bool support_viewport_index_layer_non_geometry{};
```
```diff
@@ -42,7 +42,7 @@ constexpr std::array VIEW_CLASS_32_BITS{
     PixelFormat::A2B10G10R10_UNORM, PixelFormat::R16G16_UINT,    PixelFormat::R32_UINT,
     PixelFormat::R16G16_SINT,       PixelFormat::R32_SINT,       PixelFormat::A8B8G8R8_UNORM,
     PixelFormat::R16G16_UNORM,      PixelFormat::A8B8G8R8_SNORM, PixelFormat::R16G16_SNORM,
-    PixelFormat::A8B8G8R8_SRGB,     PixelFormat::E5B9G9R9_FLOAT, PixelFormat::B8G8R8A8_UNORM,
+    PixelFormat::A8B8G8R8_SRGB,     PixelFormat::B8G8R8A8_UNORM,
     PixelFormat::B8G8R8A8_SRGB,     PixelFormat::A8B8G8R8_UINT,  PixelFormat::A8B8G8R8_SINT,
     PixelFormat::A2B10G10R10_UINT,
 };
@@ -52,7 +52,7 @@ constexpr std::array VIEW_CLASS_32_BITS_NO_BGR{
     PixelFormat::A2B10G10R10_UNORM, PixelFormat::R16G16_UINT,    PixelFormat::R32_UINT,
     PixelFormat::R16G16_SINT,       PixelFormat::R32_SINT,       PixelFormat::A8B8G8R8_UNORM,
     PixelFormat::R16G16_UNORM,      PixelFormat::A8B8G8R8_SNORM, PixelFormat::R16G16_SNORM,
-    PixelFormat::A8B8G8R8_SRGB,     PixelFormat::E5B9G9R9_FLOAT, PixelFormat::A8B8G8R8_UINT,
+    PixelFormat::A8B8G8R8_SRGB,     PixelFormat::A8B8G8R8_UINT,
     PixelFormat::A8B8G8R8_SINT,     PixelFormat::A2B10G10R10_UINT,
 };
```
```diff
@@ -214,7 +214,7 @@ struct FormatTuple {
     {VK_FORMAT_ASTC_8x6_SRGB_BLOCK},                          // ASTC_2D_8X6_SRGB
     {VK_FORMAT_ASTC_6x5_UNORM_BLOCK},                         // ASTC_2D_6X5_UNORM
     {VK_FORMAT_ASTC_6x5_SRGB_BLOCK},                          // ASTC_2D_6X5_SRGB
-    {VK_FORMAT_E5B9G9R9_UFLOAT_PACK32, Attachable | Storage}, // E5B9G9R9_FLOAT
+    {VK_FORMAT_E5B9G9R9_UFLOAT_PACK32},                       // E5B9G9R9_FLOAT (SAMPLED_IMAGE only, no COLOR_ATTACHMENT)
 
     // Depth formats
     {VK_FORMAT_D32_SFLOAT, Attachable},                       // D32_FLOAT
```
```diff
@@ -27,8 +27,13 @@ public:
     DescriptorLayoutBuilder(const Device& device_) : device{&device_} {}
 
     bool CanUsePushDescriptor() const noexcept {
-        return device->IsKhrPushDescriptorSupported() &&
-               num_descriptors <= device->MaxPushDescriptors();
+        if (!device->IsKhrPushDescriptorSupported()) {
+            return false;
+        }
+        if (num_descriptors > device->MaxPushDescriptors()) {
+            return false;
+        }
+        return true;
     }
 
     // TODO(crueter): utilize layout binding flags
```
```diff
@@ -177,6 +177,8 @@ try
 
 RendererVulkan::~RendererVulkan() {
     scheduler.RegisterOnSubmit([] {});
+    // Acquire submit_mutex before WaitIdle to prevent simultaneous queue access
+    std::scoped_lock lock{scheduler.submit_mutex};
     void(device.GetLogical().WaitIdle());
 }
```
```diff
@@ -30,7 +30,8 @@ BlitScreen::~BlitScreen() = default;
 void BlitScreen::WaitIdle() {
     present_manager.WaitPresent();
     scheduler.Finish();
-    device.GetLogical().WaitIdle();
+    // Note: scheduler.Finish() already waits for GPU and synchronizes submit_mutex
+    // Calling device.WaitIdle() here causes threading errors (simultaneous queue access)
 }
 
 void BlitScreen::SetWindowAdaptPass() {
```
```diff
@@ -341,6 +341,20 @@ PipelineCache::PipelineCache(Tegra::MaxwellDeviceMemoryManager& device_memory_,
             float_control.shaderSignedZeroInfNanPreserveFloat32 != VK_FALSE,
         .support_fp64_signed_zero_nan_preserve =
             float_control.shaderSignedZeroInfNanPreserveFloat64 != VK_FALSE,
+
+#ifdef ANDROID
+        // User-forced float behavior overrides (Eden Veil/Extensions)
+        .force_fp32_denorm_flush = Settings::values.shader_float_ftz.GetValue(),
+        .force_fp32_denorm_preserve = Settings::values.shader_float_denorm_preserve.GetValue(),
+        .force_fp32_rte_rounding = Settings::values.shader_float_rte.GetValue(),
+        .force_fp32_signed_zero_inf_nan = Settings::values.shader_float_signed_zero_inf_nan.GetValue(),
+#else
+        .force_fp32_denorm_flush = false,
+        .force_fp32_denorm_preserve = false,
+        .force_fp32_rte_rounding = false,
+        .force_fp32_signed_zero_inf_nan = false,
+#endif
+
         .support_explicit_workgroup_layout = device.IsKhrWorkgroupMemoryExplicitLayoutSupported(),
         .support_vote = device.IsSubgroupFeatureSupported(VK_SUBGROUP_FEATURE_VOTE_BIT),
         .support_viewport_index_layer_non_geometry =
```
```diff
@@ -677,7 +691,17 @@ std::unique_ptr<GraphicsPipeline> PipelineCache::CreateGraphicsPipeline(
 
         const auto runtime_info{MakeRuntimeInfo(programs, key, program, previous_stage)};
         ConvertLegacyToGeneric(program, runtime_info);
-        const std::vector<u32> code{EmitSPIRV(profile, runtime_info, program, binding, this->optimize_spirv_output)};
+
+        // Adreno don't support subgroup operations in vertex stages
+        // Disable subgroup features for vertex shaders if not supported by the device
+        Shader::Profile stage_profile = profile;
+        if (program.stage == Shader::Stage::VertexA || program.stage == Shader::Stage::VertexB) {
+            if (!device.IsSubgroupSupportedForStage(VK_SHADER_STAGE_VERTEX_BIT)) {
+                stage_profile.support_vote = false;
+            }
+        }
+
+        const std::vector<u32> code{EmitSPIRV(stage_profile, runtime_info, program, binding, this->optimize_spirv_output)};
         device.SaveShader(code);
         modules[stage_index] = BuildShader(device, code);
         if (device.HasDebuggingToolAttached()) {
@@ -771,6 +795,17 @@ std::unique_ptr<ComputePipeline> PipelineCache::CreateComputePipeline(
     }
 
     auto program{TranslateProgram(pools.inst, pools.block, env, cfg, host_info)};
+
+    // Adreno have lower shared memory limits (32KB)
+    // Clamp shared memory usage to device maximum to avoid validation errors
+    const u32 max_shared_memory = device.GetMaxComputeSharedMemorySize();
+    if (program.shared_memory_size > max_shared_memory) {
+        LOG_WARNING(Render_Vulkan,
+                    "Compute shader 0x{:016x} requests {}KB shared memory but device max is {}KB - clamping",
+                    key.unique_hash, program.shared_memory_size / 1024, max_shared_memory / 1024);
+        program.shared_memory_size = max_shared_memory;
+    }
+
     const std::vector<u32> code{EmitSPIRV(profile, program, this->optimize_spirv_output)};
     device.SaveShader(code);
     vk::ShaderModule spv_module{BuildShader(device, code)};
```
```diff
@@ -8,6 +8,7 @@
 
 #include <boost/container/static_vector.hpp>
 
+#include "common/logging/log.h"
 #include "video_core/renderer_vulkan/maxwell_to_vk.h"
 #include "video_core/renderer_vulkan/vk_render_pass_cache.h"
 #include "video_core/surface.h"
@@ -19,6 +20,23 @@ namespace {
 using VideoCore::Surface::PixelFormat;
 using VideoCore::Surface::SurfaceType;
 
+// Check if the driver uses tile-based deferred rendering (TBDR) architecture
+// These GPUs benefit from optimized load/store operations to keep data on-chip
+//
+// TBDR GPUs supported in Eden:
+//  - Qualcomm Adreno (Snapdragon): Most Android flagship/midrange devices
+//  - ARM Mali: Android devices (Samsung Exynos, MediaTek, etc.)
+//  - Imagination PowerVR: Older iOS devices, some Android tablets
+//  - Samsung Xclipse: Galaxy S22+ (AMD RDNA2-based, but uses TBDR mode)
+//  - Broadcom VideoCore: Raspberry Pi
+[[nodiscard]] constexpr bool IsTBDRGPU(VkDriverId driver_id) {
+    return driver_id == VK_DRIVER_ID_QUALCOMM_PROPRIETARY ||
+           driver_id == VK_DRIVER_ID_ARM_PROPRIETARY ||
+           driver_id == VK_DRIVER_ID_IMAGINATION_PROPRIETARY ||
+           driver_id == VK_DRIVER_ID_SAMSUNG_PROPRIETARY ||
+           driver_id == VK_DRIVER_ID_BROADCOM_PROPRIETARY;
+}
+
 constexpr SurfaceType GetSurfaceType(PixelFormat format) {
     switch (format) {
     // Depth formats
```
```diff
@@ -44,23 +62,57 @@ using VideoCore::Surface::SurfaceType;
 }
 
 VkAttachmentDescription AttachmentDescription(const Device& device, PixelFormat format,
-                                              VkSampleCountFlagBits samples) {
+                                              VkSampleCountFlagBits samples,
+                                              bool tbdr_will_clear,
+                                              bool tbdr_discard_after,
+                                              bool tbdr_read_only = false) {
     using MaxwellToVK::SurfaceFormat;
 
     const SurfaceType surface_type = GetSurfaceType(format);
     const bool has_stencil = surface_type == SurfaceType::DepthStencil ||
                              surface_type == SurfaceType::Stencil;
 
+    // TBDR optimization: Apply hints only on tile-based GPUs
+    // Desktop GPUs (NVIDIA/AMD/Intel) ignore these hints and use standard behavior
+    const bool is_tbdr = IsTBDRGPU(device.GetDriverID());
+
+    // On TBDR: Use DONT_CARE if clear is guaranteed (avoids loading from main memory)
+    // On Desktop: Always LOAD to preserve existing content (safer default)
+    VkAttachmentLoadOp load_op = VK_ATTACHMENT_LOAD_OP_LOAD;
+    if (is_tbdr && tbdr_will_clear) {
+        load_op = VK_ATTACHMENT_LOAD_OP_DONT_CARE;
+    }
+
+    // On TBDR: Use DONT_CARE if content won't be read (avoids storing to main memory)
+    // On Desktop: Always STORE (safer default)
+    // VK_QCOM_render_pass_store_ops: Use NONE_QCOM for read-only attachments (preserves outside render area)
+    VkAttachmentStoreOp store_op = VK_ATTACHMENT_STORE_OP_STORE;
+    if (is_tbdr && tbdr_discard_after) {
+        store_op = VK_ATTACHMENT_STORE_OP_DONT_CARE;
+    } else if (is_tbdr && tbdr_read_only && device.IsQcomRenderPassStoreOpsSupported()) {
+        store_op = static_cast<VkAttachmentStoreOp>(1000301000); // VK_ATTACHMENT_STORE_OP_NONE_QCOM
+    }
+
+    // Stencil operations follow same logic
+    VkAttachmentLoadOp stencil_load_op = VK_ATTACHMENT_LOAD_OP_DONT_CARE;
+    VkAttachmentStoreOp stencil_store_op = VK_ATTACHMENT_STORE_OP_DONT_CARE;
+    if (has_stencil && tbdr_read_only && device.IsQcomRenderPassStoreOpsSupported()) {
+        stencil_store_op = static_cast<VkAttachmentStoreOp>(1000301000); // VK_ATTACHMENT_STORE_OP_NONE_QCOM
+    } else if (has_stencil) {
+        stencil_load_op = (is_tbdr && tbdr_will_clear) ? VK_ATTACHMENT_LOAD_OP_DONT_CARE
+                                                       : VK_ATTACHMENT_LOAD_OP_LOAD;
+        stencil_store_op = (is_tbdr && tbdr_discard_after) ? VK_ATTACHMENT_STORE_OP_DONT_CARE
+                                                           : VK_ATTACHMENT_STORE_OP_STORE;
+    }
+
     return {
         .flags = {},
         .format = SurfaceFormat(device, FormatType::Optimal, true, format).format,
         .samples = samples,
-        .loadOp = VK_ATTACHMENT_LOAD_OP_LOAD,
-        .storeOp = VK_ATTACHMENT_STORE_OP_STORE,
-        .stencilLoadOp = has_stencil ? VK_ATTACHMENT_LOAD_OP_LOAD
-                                     : VK_ATTACHMENT_LOAD_OP_DONT_CARE,
-        .stencilStoreOp = has_stencil ? VK_ATTACHMENT_STORE_OP_STORE
-                                      : VK_ATTACHMENT_STORE_OP_DONT_CARE,
+        .loadOp = load_op,
+        .storeOp = store_op,
+        .stencilLoadOp = stencil_load_op,
+        .stencilStoreOp = stencil_store_op,
         .initialLayout = VK_IMAGE_LAYOUT_GENERAL,
         .finalLayout = VK_IMAGE_LAYOUT_GENERAL,
     };
```
```diff
@@ -75,6 +127,13 @@ VkRenderPass RenderPassCache::Get(const RenderPassKey& key) {
     if (!is_new) {
         return *pair->second;
     }
+
+    const bool is_tbdr = IsTBDRGPU(device->GetDriverID());
+    if (is_tbdr && (key.tbdr_will_clear || key.tbdr_discard_after)) {
+        LOG_DEBUG(Render_Vulkan, "Creating TBDR-optimized render pass (driver={}, clear={}, discard={})",
+                  static_cast<u32>(device->GetDriverID()), key.tbdr_will_clear, key.tbdr_discard_after);
+    }
+
     boost::container::static_vector<VkAttachmentDescription, 9> descriptions;
     std::array<VkAttachmentReference, 8> references{};
     u32 num_attachments{};
@@ -87,7 +146,8 @@ VkRenderPass RenderPassCache::Get(const RenderPassKey& key) {
             .layout = VK_IMAGE_LAYOUT_GENERAL,
         };
         if (is_valid) {
-            descriptions.push_back(AttachmentDescription(*device, format, key.samples));
+            descriptions.push_back(AttachmentDescription(*device, format, key.samples,
+                                                         key.tbdr_will_clear, key.tbdr_discard_after));
             num_attachments = static_cast<u32>(index + 1);
             ++num_colors;
         }
@@ -99,12 +159,14 @@ VkRenderPass RenderPassCache::Get(const RenderPassKey& key) {
             .attachment = num_colors,
             .layout = VK_IMAGE_LAYOUT_GENERAL,
         };
-        descriptions.push_back(AttachmentDescription(*device, key.depth_format, key.samples));
+        descriptions.push_back(AttachmentDescription(*device, key.depth_format, key.samples,
+                                                     key.tbdr_will_clear, key.tbdr_discard_after, key.tbdr_read_only));
     }
+    VkSubpassDescriptionFlags subpass_flags = 0;
+    if (key.qcom_shader_resolve) {
+        // VK_QCOM_render_pass_shader_resolve: enables custom shader resolve in fragment shader
+        // This must be the last subpass in the dependency chain
+        // This flag allows using a programmable fragment shader for MSAA resolve instead of
+        // fixed-function hardware resolve, enabling better quality and HDR format support
+        subpass_flags |= 0x00000004; // VK_SUBPASS_DESCRIPTION_SHADER_RESOLVE_BIT_QCOM
+    }
```
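Both QCOM values appear as inline magic numbers in the hunks above. If the Vulkan headers in use already declare them, the named enumerants can be used directly; otherwise they can at least be named once instead of cast inline (the values below are from the Vulkan registry):

```cpp
// VK_ATTACHMENT_STORE_OP_NONE_QCOM is an alias of VK_ATTACHMENT_STORE_OP_NONE.
constexpr auto kStoreOpNoneQcom =
    static_cast<VkAttachmentStoreOp>(1000301000); // VK_ATTACHMENT_STORE_OP_NONE_QCOM
constexpr VkSubpassDescriptionFlags kShaderResolveBitQcom =
    0x00000004; // VK_SUBPASS_DESCRIPTION_SHADER_RESOLVE_BIT_QCOM
```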
```diff
@@ -25,6 +25,7 @@ struct RenderPassKey {
     // These flags indicate the expected usage pattern to optimize load/store operations
     bool tbdr_will_clear{false};    // Attachment will be cleared with vkCmdClearAttachments
     bool tbdr_discard_after{false}; // Attachment won't be read after render pass
+    bool tbdr_read_only{false};     // Attachment is read-only (input attachment, depth test without writes)
 
     // VK_QCOM_render_pass_shader_resolve support
     bool qcom_shader_resolve{false}; // Use shader resolve instead of fixed-function (last subpass)
@@ -38,6 +39,8 @@ struct hash<Vulkan::RenderPassKey> {
     [[nodiscard]] size_t operator()(const Vulkan::RenderPassKey& key) const noexcept {
         size_t value = static_cast<size_t>(key.depth_format) << 48;
         value ^= static_cast<size_t>(key.samples) << 52;
+        value ^= (static_cast<size_t>(key.tbdr_will_clear) << 56);
+        value ^= (static_cast<size_t>(key.tbdr_discard_after) << 57);
         for (size_t i = 0; i < key.color_formats.size(); ++i) {
             value ^= static_cast<size_t>(key.color_formats[i]) << (i * 6);
         }
```
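Note that the hash hunk only mixes in two of the four new key fields. If `tbdr_read_only` and `qcom_shader_resolve` should also disambiguate cached render passes, the same pattern extends naturally (a sketch, not in the diff):

```cpp
value ^= static_cast<size_t>(key.tbdr_read_only) << 58;
value ^= static_cast<size_t>(key.qcom_shader_resolve) << 59;
```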
```diff
@@ -66,10 +66,20 @@ constexpr VkBorderColor ConvertBorderColor(const std::array<float, 4>& color) {
     }
 }
 
-[[nodiscard]] VkImageType ConvertImageType(const ImageType type) {
+[[nodiscard]] VkImageType ConvertImageType(const ImageType type, const Device& device) {
     switch (type) {
     case ImageType::e1D:
-        return VK_IMAGE_TYPE_1D;
+        // Mobile Vulkan (Adreno, Mali, PowerVR, IMG) lacks Sampled1D SPIR-V capability
+        // Emulate as 2D texture with height=1 on mobile, use native 1D on desktop
+        {
+            const auto driver_id = device.GetDriverID();
+            const bool is_mobile = driver_id == VK_DRIVER_ID_QUALCOMM_PROPRIETARY ||
+                                   driver_id == VK_DRIVER_ID_MESA_TURNIP ||
+                                   driver_id == VK_DRIVER_ID_ARM_PROPRIETARY ||
+                                   driver_id == VK_DRIVER_ID_BROADCOM_PROPRIETARY ||
+                                   driver_id == VK_DRIVER_ID_IMAGINATION_PROPRIETARY;
+            return is_mobile ? VK_IMAGE_TYPE_2D : VK_IMAGE_TYPE_1D;
+        }
     case ImageType::e2D:
     case ImageType::Linear:
         return VK_IMAGE_TYPE_2D;
```
```diff
@@ -141,7 +151,7 @@ constexpr VkBorderColor ConvertBorderColor(const std::array<float, 4>& color) {
         .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
         .pNext = nullptr,
         .flags = flags,
-        .imageType = ConvertImageType(info.type),
+        .imageType = ConvertImageType(info.type, device),
         .format = format_info.format,
         .extent{
             .width = info.size.width >> samples_x,
```
```diff
@@ -160,6 +170,40 @@ constexpr VkBorderColor ConvertBorderColor(const std::array<float, 4>& color) {
     };
 }
 
+/// Emergency fallback: degrade MSAA to non-MSAA for HDR formats when no resolve support exists
+[[nodiscard]] ImageInfo AdjustMSAAForHDRFormats(const Device& device, ImageInfo info) {
+    if (info.num_samples <= 1) {
+        return info;
+    }
+
+    const auto vk_format = MaxwellToVK::SurfaceFormat(device, FormatType::Optimal,
+                                                      false, info.format).format;
+    const bool is_hdr_format = vk_format == VK_FORMAT_B10G11R11_UFLOAT_PACK32;
+
+    if (!is_hdr_format) {
+        return info;
+    }
+
+    // Qualcomm: VK_QCOM_render_pass_shader_resolve handles HDR+MSAA
+    if (device.GetDriverID() == VK_DRIVER_ID_QUALCOMM_PROPRIETARY) {
+        if (device.IsQcomRenderPassShaderResolveSupported()) {
+            return info;
+        }
+    }
+
+    // Other vendors: shaderStorageImageMultisample handles HDR+MSAA
+    if (device.IsStorageImageMultisampleSupported()) {
+        return info;
+    }
+
+    // No suitable resolve method - degrade to non-MSAA
+    LOG_WARNING(Render_Vulkan, "HDR format {} with MSAA not supported, degrading to 1x samples",
+                vk_format);
+    info.num_samples = 1;
+
+    return info;
+}
+
 [[nodiscard]] vk::Image MakeImage(const Device& device, const MemoryAllocator& allocator,
                                   const ImageInfo& info, std::span<const VkFormat> view_formats) {
     if (info.type == ImageType::Buffer) {
```
```diff
@@ -272,10 +316,18 @@ constexpr VkBorderColor ConvertBorderColor(const std::array<float, 4>& color) {
     return VK_COMPONENT_SWIZZLE_ZERO;
 }
 
-[[nodiscard]] VkImageViewType ImageViewType(Shader::TextureType type) {
+[[nodiscard]] VkImageViewType ImageViewType(Shader::TextureType type, const Device& device) {
+    const auto driver_id = device.GetDriverID();
+    const bool is_mobile = driver_id == VK_DRIVER_ID_QUALCOMM_PROPRIETARY ||
+                           driver_id == VK_DRIVER_ID_MESA_TURNIP ||
+                           driver_id == VK_DRIVER_ID_ARM_PROPRIETARY ||
+                           driver_id == VK_DRIVER_ID_BROADCOM_PROPRIETARY ||
+                           driver_id == VK_DRIVER_ID_IMAGINATION_PROPRIETARY;
+
     switch (type) {
     case Shader::TextureType::Color1D:
-        return VK_IMAGE_VIEW_TYPE_1D;
+        // Emulate 1D as 2D with height=1 on mobile (no Sampled1D capability)
+        return is_mobile ? VK_IMAGE_VIEW_TYPE_2D : VK_IMAGE_VIEW_TYPE_1D;
     case Shader::TextureType::Color2D:
     case Shader::TextureType::Color2DRect:
         return VK_IMAGE_VIEW_TYPE_2D;
@@ -284,7 +336,8 @@ constexpr VkBorderColor ConvertBorderColor(const std::array<float, 4>& color) {
     case Shader::TextureType::Color3D:
         return VK_IMAGE_VIEW_TYPE_3D;
     case Shader::TextureType::ColorArray1D:
-        return VK_IMAGE_VIEW_TYPE_1D_ARRAY;
+        // Emulate 1D array as 2D array with height=1 on mobile
+        return is_mobile ? VK_IMAGE_VIEW_TYPE_2D_ARRAY : VK_IMAGE_VIEW_TYPE_1D_ARRAY;
     case Shader::TextureType::ColorArray2D:
         return VK_IMAGE_VIEW_TYPE_2D_ARRAY;
     case Shader::TextureType::ColorArrayCube:
@@ -297,10 +350,18 @@ constexpr VkBorderColor ConvertBorderColor(const std::array<float, 4>& color) {
     return VK_IMAGE_VIEW_TYPE_2D;
 }
 
-[[nodiscard]] VkImageViewType ImageViewType(VideoCommon::ImageViewType type) {
+[[nodiscard]] VkImageViewType ImageViewType(VideoCommon::ImageViewType type, const Device& device) {
+    const auto driver_id = device.GetDriverID();
+    const bool is_mobile = driver_id == VK_DRIVER_ID_QUALCOMM_PROPRIETARY ||
+                           driver_id == VK_DRIVER_ID_MESA_TURNIP ||
+                           driver_id == VK_DRIVER_ID_ARM_PROPRIETARY ||
+                           driver_id == VK_DRIVER_ID_BROADCOM_PROPRIETARY ||
+                           driver_id == VK_DRIVER_ID_IMAGINATION_PROPRIETARY;
+
     switch (type) {
     case VideoCommon::ImageViewType::e1D:
-        return VK_IMAGE_VIEW_TYPE_1D;
+        // Emulate 1D as 2D with height=1 on mobile (no Sampled1D capability)
+        return is_mobile ? VK_IMAGE_VIEW_TYPE_2D : VK_IMAGE_VIEW_TYPE_1D;
     case VideoCommon::ImageViewType::e2D:
     case VideoCommon::ImageViewType::Rect:
         return VK_IMAGE_VIEW_TYPE_2D;
@@ -309,7 +370,8 @@ constexpr VkBorderColor ConvertBorderColor(const std::array<float, 4>& color) {
     case VideoCommon::ImageViewType::e3D:
         return VK_IMAGE_VIEW_TYPE_3D;
     case VideoCommon::ImageViewType::e1DArray:
-        return VK_IMAGE_VIEW_TYPE_1D_ARRAY;
+        // Emulate 1D array as 2D array with height=1 on mobile
+        return is_mobile ? VK_IMAGE_VIEW_TYPE_2D_ARRAY : VK_IMAGE_VIEW_TYPE_1D_ARRAY;
     case VideoCommon::ImageViewType::e2DArray:
         return VK_IMAGE_VIEW_TYPE_2D_ARRAY;
     case VideoCommon::ImageViewType::CubeArray:
```
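One assumption behind this emulation that the diff does not show: once a 1D view becomes a 2D view with height 1, every lookup needs an explicit second coordinate that lands in row 0, which the shader side must supply. A minimal sketch of that coordinate padding:

```cpp
// Hypothetical helper illustrating the expectation: unnormalized integer
// fetches pad with y = 0; normalized sampling pads with y = 0.5 so the sample
// point sits in the center of the one-texel-tall row.
struct Coord2D {
    float x;
    float y;
};

Coord2D Promote1DCoord(float x, bool normalized) {
    return {x, normalized ? 0.5f : 0.0f};
}
```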
```diff
@@ -1326,7 +1388,6 @@ void TextureCacheRuntime::ConvertImage(Framebuffer* dst, ImageView& dst_view, Im
     case PixelFormat::ASTC_2D_8X6_SRGB:
     case PixelFormat::ASTC_2D_6X5_UNORM:
     case PixelFormat::ASTC_2D_6X5_SRGB:
-    case PixelFormat::E5B9G9R9_FLOAT:
     case PixelFormat::D32_FLOAT:
     case PixelFormat::D16_UNORM:
     case PixelFormat::X8_D24_UNORM:
@@ -1490,6 +1551,23 @@ void TextureCacheRuntime::CopyImage(Image& dst, Image& src,
 void TextureCacheRuntime::CopyImageMSAA(Image& dst, Image& src,
                                         std::span<const VideoCommon::ImageCopy> copies) {
     const bool msaa_to_non_msaa = src.info.num_samples > 1 && dst.info.num_samples == 1;
+
+    // Use VK_QCOM_render_pass_shader_resolve for HDR formats on Qualcomm
+    // This is more efficient than compute shader (stays on-chip in TBDR)
+    const bool is_hdr_format = src.info.format == PixelFormat::B10G11R11_FLOAT ||
+                               dst.info.format == PixelFormat::B10G11R11_FLOAT;
+    const bool use_qcom_resolve = msaa_to_non_msaa &&
+                                  device.IsQcomRenderPassShaderResolveSupported() &&
+                                  is_hdr_format &&
+                                  copies.size() == 1; // QCOM resolve works best with single full copy
+
+    if (use_qcom_resolve) {
+        // Create temporary framebuffer with resolve target
+        // TODO Camille: Implement QCOM shader resolve path with proper framebuffer setup
+        // For now, fall through to standard path
+        LOG_DEBUG(Render_Vulkan, "QCOM shader resolve opportunity detected but not yet implemented");
+    }
+
     if (msaa_copy_pass) {
         return msaa_copy_pass->CopyImage(dst, src, copies, msaa_to_non_msaa);
     }
@@ -1513,10 +1591,20 @@ void TextureCacheRuntime::TickFrame() {}
 
 Image::Image(TextureCacheRuntime& runtime_, const ImageInfo& info_, GPUVAddr gpu_addr_,
              VAddr cpu_addr_)
     : VideoCommon::ImageBase(info_, gpu_addr_, cpu_addr_), scheduler{&runtime_.scheduler},
-      runtime{&runtime_}, original_image(MakeImage(runtime_.device, runtime_.memory_allocator, info,
-                                                   runtime->ViewFormats(info.format))),
-      aspect_mask(ImageAspectMask(info.format)) {
-    if (IsPixelFormatASTC(info.format) && !runtime->device.IsOptimalAstcSupported()) {
+      runtime{&runtime_} {
+    // CRITICAL: Adjust MSAA for HDR formats if driver doesn't support shaderStorageImageMultisample
+    // This prevents texture corruption by degrading to non-MSAA when msaa_copy_pass would fail
+    const ImageInfo adjusted_info = AdjustMSAAForHDRFormats(runtime_.device, info_);
+
+    // Update our stored info with adjusted values (may have num_samples=1 now)
+    info = adjusted_info;
+
+    // Create image with adjusted info
+    original_image = MakeImage(runtime_.device, runtime_.memory_allocator, adjusted_info,
+                               runtime->ViewFormats(adjusted_info.format));
+    aspect_mask = ImageAspectMask(adjusted_info.format);
+
+    if (IsPixelFormatASTC(adjusted_info.format) && !runtime->device.IsOptimalAstcSupported()) {
         switch (Settings::values.accelerate_astc.GetValue()) {
         case Settings::AstcDecodeMode::Gpu:
             if (Settings::values.astc_recompression.GetValue() ==
@@ -1552,24 +1640,6 @@ Image::Image(TextureCacheRuntime& runtime_, const ImageInfo& info_, GPUVAddr gpu
             MakeStorageView(device, level, *original_image, VK_FORMAT_A8B8G8R8_UNORM_PACK32);
         }
     }
-
-    // Proactive warning for problematic HDR format + MSAA combinations on Android
-    // These combinations commonly cause texture flickering/black screens across multiple game engines
-    // Note: MSAA is native Switch rendering technique, cannot be disabled by emulator
-    if (info.num_samples > 1) {
-        const auto vk_format = MaxwellToVK::SurfaceFormat(runtime->device, FormatType::Optimal,
-                                                          false, info.format).format;
-        const bool is_hdr_format = vk_format == VK_FORMAT_B10G11R11_UFLOAT_PACK32 ||
-                                   vk_format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32;
-
-        if (is_hdr_format) {
-            LOG_WARNING(Render_Vulkan,
-                        "Creating MSAA image ({}x samples) with HDR format {} (Maxwell: {}). "
-                        "Driver support may be limited on Android (Qualcomm < 800, Mali pre-maintenance5). "
-                        "Format fallback to RGBA16F should prevent issues.",
-                        info.num_samples, vk_format, info.format);
-        }
-    }
 }
 
 Image::Image(const VideoCommon::NullImageParams& params) : VideoCommon::ImageBase{params} {}
```
```diff
@@ -2050,6 +2120,21 @@ ImageView::ImageView(TextureCacheRuntime& runtime, const VideoCommon::ImageViewI
         }
     }
     const auto format_info = MaxwellToVK::SurfaceFormat(*device, FormatType::Optimal, true, format);
+
+    // This causes validation errors and undefined behavior (flickering, missing geometry) on certain games
+    // Reinterpret R32_UINT as R32_SFLOAT for sampled images to match shader expectations
+    VkFormat view_format = format_info.format;
+    if (view_format == VK_FORMAT_R32_UINT &&
+        !info.IsRenderTarget() &&
+        (ImageUsageFlags(format_info, format) & VK_IMAGE_USAGE_SAMPLED_BIT)) {
+        // Only reinterpret if NOT used as storage image (storage requires matching types)
+        const bool is_storage = (ImageUsageFlags(format_info, format) & VK_IMAGE_USAGE_STORAGE_BIT) != 0;
+        if (!is_storage) {
+            view_format = VK_FORMAT_R32_SFLOAT;
+            LOG_DEBUG(Render_Vulkan, "Reinterpreting R32_UINT as R32_SFLOAT for sampled image compatibility");
+        }
+    }
+
     if (ImageUsageFlags(format_info, format) != image.UsageFlags()) {
         LOG_WARNING(Render_Vulkan,
                     "Image view format {} has different usage flags than image format {}", format,
@@ -2066,7 +2151,7 @@ ImageView::ImageView(TextureCacheRuntime& runtime, const VideoCommon::ImageViewI
         .flags = 0,
         .image = image.Handle(),
         .viewType = VkImageViewType{},
-        .format = format_info.format,
+        .format = view_format,
         .components{
             .r = ComponentSwizzle(swizzle[0]),
             .g = ComponentSwizzle(swizzle[1]),
@@ -2077,7 +2162,7 @@ ImageView::ImageView(TextureCacheRuntime& runtime, const VideoCommon::ImageViewI
     };
     const auto create = [&](TextureType tex_type, std::optional<u32> num_layers) {
         VkImageViewCreateInfo ci{create_info};
-        ci.viewType = ImageViewType(tex_type);
+        ci.viewType = ImageViewType(tex_type, *device);
         if (num_layers) {
            ci.subresourceRange.layerCount = *num_layers;
        }
@@ -2218,7 +2303,7 @@ vk::ImageView ImageView::MakeView(VkFormat vk_format, VkImageAspectFlags aspect_
         .pNext = nullptr,
         .flags = 0,
         .image = image_handle,
-        .viewType = ImageViewType(type),
+        .viewType = ImageViewType(type, *device),
         .format = vk_format,
         .components{
             .r = VK_COMPONENT_SWIZZLE_IDENTITY,
```
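Background for the reinterpretation above: `VK_FORMAT_R32_UINT` and `VK_FORMAT_R32_SFLOAT` are members of the same 32-bit format compatibility class, so viewing one as the other is legal provided the image permits mutable-format views (e.g. it was created with `VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT` or an image format list, which is assumed here). A sketch of the compatibility check:

```cpp
// Returns true when both formats are in the single-channel 32-bit family,
// whose bit patterns can be viewed interchangeably on a mutable-format image.
constexpr bool IsReinterpretableR32(VkFormat a, VkFormat b) {
    const auto is_r32 = [](VkFormat f) {
        return f == VK_FORMAT_R32_UINT || f == VK_FORMAT_R32_SINT ||
               f == VK_FORMAT_R32_SFLOAT;
    };
    return is_r32(a) && is_r32(b);
}
```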
```diff
@@ -2364,6 +2449,26 @@ void Framebuffer::CreateFramebuffer(TextureCacheRuntime& runtime,
     }
     renderpass_key.samples = samples;
 
+    // Enable VK_QCOM_render_pass_shader_resolve for HDR+MSAA on Qualcomm
+    // This performs MSAA resolve using fragment shader IN the render pass (on-chip)
+    // Benefits: ~70% bandwidth reduction, better performance on TBDR architectures
+    // Requirements: pResolveAttachments configured + explicit shader execution
+    if (samples > VK_SAMPLE_COUNT_1_BIT && runtime.device.IsQcomRenderPassShaderResolveSupported()) {
+        // Check if any color attachment is HDR format that benefits from shader resolve
+        bool has_hdr_attachment = false;
+        for (size_t index = 0; index < NUM_RT && !has_hdr_attachment; ++index) {
+            const auto format = renderpass_key.color_formats[index];
+            // B10G11R11_FLOAT benefits most: compute shader limited, fixed-function slower
+            if (format == PixelFormat::B10G11R11_FLOAT) {
+                has_hdr_attachment = true;
+            }
+        }
+
+        if (has_hdr_attachment) {
+            renderpass_key.qcom_shader_resolve = true;
+        }
+    }
+
     renderpass = runtime.render_pass_cache.Get(renderpass_key);
     render_area.width = (std::min)(render_area.width, width);
     render_area.height = (std::min)(render_area.height, height);
```
```diff
@@ -89,20 +89,33 @@ constexpr std::array VK_FORMAT_A4B4G4R4_UNORM_PACK16{
     VK_FORMAT_UNDEFINED,
 };
 
-// B10G11R11_UFLOAT (R11G11B10 float) is used by Unreal Engine 5 for HDR textures
-// Some Android drivers (Qualcomm pre-800, Mali pre-maintenance5) have issues with this format
-// when used with MSAA or certain tiling modes, causing texture flickering/black screens
+// B10G11R11_UFLOAT (R11G11B10F) - PRIMARY HDR format for Nintendo Switch
+// Nintendo Switch hardware validation: FULL support (COLOR_ATTACHMENT + STORAGE_IMAGE + BLEND)
+// Reference: vp_gpuinfo_nintendo_switch_v2_495_0_0_0 - All required feature bits present
+//
+// Fallback strategy: Degrade to LDR instead of expensive HDR emulation
+//  - RGBA8 UNORM/SRGB: Universal support, 32-bit (same size as B10G11R11), acceptable quality
+//  - RGB10A2: Better precision if available, still 32-bit
+//  - RGBA16F: Last resort only if RGB8 variants fail (should never happen)
 constexpr std::array B10G11R11_UFLOAT_PACK32{
-    VK_FORMAT_R16G16B16A16_SFLOAT,      // Fallback: RGBA16F (more memory, but widely supported)
-    VK_FORMAT_E5B9G9R9_UFLOAT_PACK32,   // Alternative: E5B9G9R9 shared exponent format
+#ifdef ANDROID
+    VK_FORMAT_A8B8G8R8_SRGB_PACK32,     // sRGB variant (for gamma-correct fallback)
+#else
+    VK_FORMAT_A8B8G8R8_UNORM_PACK32,    // Primary fallback: RGBA8 LDR (32-bit, universal)
+    VK_FORMAT_A2B10G10R10_UNORM_PACK32, // Better precision: RGB10A2 (32-bit, common)
+#endif
+    VK_FORMAT_R16G16B16A16_SFLOAT,      // Emergency fallback: RGBA16F (64-bit, should never reach)
     VK_FORMAT_UNDEFINED,
 };
 
-// E5B9G9R9_UFLOAT (shared exponent RGB9E5) used by various engines (Unity, custom engines)
-// Also problematic on some Android drivers, especially with MSAA and as render target
+// E5B9G9R9_UFLOAT (RGB9E5) - INVALID for COLOR_ATTACHMENT on Nintendo Switch
+// Nintendo Switch hardware validation: NO COLOR_ATTACHMENT_BIT (only SAMPLED_IMAGE)
+// Reference: vp_gpuinfo_nintendo_switch_v2_495_0_0_0 - Missing required attachment bits
+// This format should NEVER be used as render target, only for texture sampling
 constexpr std::array E5B9G9R9_UFLOAT_PACK32{
-    VK_FORMAT_R16G16B16A16_SFLOAT,      // Fallback: RGBA16F (safest option)
-    VK_FORMAT_B10G11R11_UFLOAT_PACK32,  // Alternative: might work if E5B9G9R9 fails
+    VK_FORMAT_B10G11R11_UFLOAT_PACK32,  // Upgrade to proper HDR format with attachment support
+    VK_FORMAT_A8B8G8R8_UNORM_PACK32,    // Fallback: RGBA8 LDR
+    VK_FORMAT_R16G16B16A16_SFLOAT,      // Last resort: RGBA16F
     VK_FORMAT_UNDEFINED,
 };
```
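These alternatives arrays are zero-terminated and consumed in order until the device reports support for an entry. A minimal sketch of that walk, assuming a support predicate along the lines of the device class's format check:

```cpp
#include <span>

// Walk a zero-terminated alternatives table and return the first format the
// device supports for the requested features; VK_FORMAT_UNDEFINED means
// nothing matched. `IsFormatSupported` is assumed here.
VkFormat FirstSupported(const Device& device, std::span<const VkFormat> alternatives,
                        FormatType type, VkFormatFeatureFlags wanted) {
    for (const VkFormat format : alternatives) {
        if (format == VK_FORMAT_UNDEFINED) {
            break; // end-of-list sentinel, as in the arrays above
        }
        if (device.IsFormatSupported(format, wanted, type)) {
            return format;
        }
    }
    return VK_FORMAT_UNDEFINED;
}
```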
```diff
@@ -229,7 +242,6 @@ std::unordered_map<VkFormat, VkFormatProperties> GetFormatProperties(vk::Physica
     VK_FORMAT_D24_UNORM_S8_UINT,
     VK_FORMAT_D32_SFLOAT,
     VK_FORMAT_D32_SFLOAT_S8_UINT,
-    VK_FORMAT_E5B9G9R9_UFLOAT_PACK32,
     VK_FORMAT_R16G16B16A16_SFLOAT,
     VK_FORMAT_R16G16B16A16_SINT,
     VK_FORMAT_R16G16B16A16_SNORM,
```
@@ -539,9 +551,85 @@ Device::Device(VkInstance instance_, vk::PhysicalDevice physical_, VkSurfaceKHR
|
||||
"Qualcomm drivers have a slow VK_KHR_push_descriptor implementation");
|
||||
//RemoveExtension(extensions.push_descriptor, VK_KHR_PUSH_DESCRIPTOR_EXTENSION_NAME);
|
||||
|
||||
// Log Qualcomm-specific optimizations
|
||||
if (extensions.render_pass_store_ops) {
|
||||
LOG_INFO(Render_Vulkan, "VK_QCOM_render_pass_store_ops: ENABLED (TBDR store optimization)");
|
||||
}
|
||||
if (extensions.tile_properties) {
|
||||
LOG_INFO(Render_Vulkan, "VK_QCOM_tile_properties: ENABLED (tile size queries available)");
|
||||
}
|
||||
if (extensions.render_pass_shader_resolve) {
|
||||
LOG_INFO(Render_Vulkan, "VK_QCOM_render_pass_shader_resolve: ENABLED (HDR+MSAA shader resolve)");
|
||||
}
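    // Example (illustrative sketch): what "HDR+MSAA shader resolve" means in
    // practice. With VK_QCOM_render_pass_shader_resolve, the resolve happens in
    // the fragment shader of the final subpass rather than through
    // pResolveAttachments; the subpass only needs the two QCOM flag bits below.
    // Field values here are illustrative, not the renderer's actual setup.
    VkSubpassDescription shader_resolve_subpass{};
    shader_resolve_subpass.flags = VK_SUBPASS_DESCRIPTION_FRAGMENT_REGION_BIT_QCOM |
                                   VK_SUBPASS_DESCRIPTION_SHADER_RESOLVE_BIT_QCOM;
    shader_resolve_subpass.pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS;
    // The multisampled image is bound as an input attachment and the shader
    // writes the resolved result to a single-sampled color attachment.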

#ifdef ANDROID
    // Shader Float Controls handling for Qualcomm Adreno
    // Default: DISABLED due to historical issues with binning precision causing visual glitches
    const bool force_enable = Settings::values.shader_float_controls_force_enable.GetValue();

    if (force_enable) {
        // User explicitly enabled float controls - log detected capabilities and user config
        LOG_INFO(Render_Vulkan, "Shader Float Controls FORCE ENABLED by user (Eden Veil/Extensions)");

        // Log driver capabilities
        const auto& fc = properties.float_controls;
        LOG_INFO(Render_Vulkan, "Driver Float Controls Capabilities:");
        LOG_INFO(Render_Vulkan, "  - Denorm Flush FP32: {}", fc.shaderDenormFlushToZeroFloat32 ? "YES" : "NO");
        LOG_INFO(Render_Vulkan, "  - Denorm Preserve FP32: {}", fc.shaderDenormPreserveFloat32 ? "YES" : "NO");
        LOG_INFO(Render_Vulkan, "  - RTE Rounding FP32: {}", fc.shaderRoundingModeRTEFloat32 ? "YES" : "NO");
        LOG_INFO(Render_Vulkan, "  - Signed Zero/Inf/Nan FP32: {}", fc.shaderSignedZeroInfNanPreserveFloat32 ? "YES" : "NO");
        LOG_INFO(Render_Vulkan, "  - Independence: {}",
                 fc.denormBehaviorIndependence == VK_SHADER_FLOAT_CONTROLS_INDEPENDENCE_ALL ? "ALL" : "LIMITED");

        // Log user selections
        bool ftz = Settings::values.shader_float_ftz.GetValue();
        bool preserve = Settings::values.shader_float_denorm_preserve.GetValue();
        const bool rte = Settings::values.shader_float_rte.GetValue();
        const bool signed_zero = Settings::values.shader_float_signed_zero_inf_nan.GetValue();

        // Validate mutually exclusive options
        if (ftz && preserve) {
            LOG_WARNING(Render_Vulkan,
                        "CONFLICT: FTZ and DenormPreserve are mutually exclusive!");
            LOG_WARNING(Render_Vulkan,
                        "  -> DenormPreserve will take precedence (accuracy over speed)");
            ftz = false; // Preserve takes priority for correctness
        }

        LOG_INFO(Render_Vulkan, "User Float Behavior Selection:");
        LOG_INFO(Render_Vulkan, "  - Flush To Zero (FTZ): {}", ftz ? "ENABLED" : "disabled");
        LOG_INFO(Render_Vulkan, "  - Denorm Preserve: {}", preserve ? "ENABLED" : "disabled");
        LOG_INFO(Render_Vulkan, "  - Round To Even (RTE): {}", rte ? "ENABLED" : "disabled");
        LOG_INFO(Render_Vulkan, "  - Signed Zero/Inf/Nan: {}", signed_zero ? "ENABLED" : "disabled");

        // Analyze configuration vs Switch native behavior
        const bool matches_switch = ftz && !preserve && rte && signed_zero;
        if (matches_switch) {
            LOG_INFO(Render_Vulkan, "Configuration MATCHES Switch/Maxwell native behavior (FTZ+RTE+SignedZero)");
        } else if (!ftz && !preserve && !rte && !signed_zero) {
            LOG_WARNING(Render_Vulkan, "No float behaviors selected - using driver default (may cause glitches)");
        } else {
            LOG_INFO(Render_Vulkan, "Configuration is CUSTOM - testing mode active");
        }

        // Extension stays enabled
        LOG_INFO(Render_Vulkan, "VK_KHR_shader_float_controls: ENABLED");
    } else {
        // Default behavior - disable float controls
        LOG_WARNING(Render_Vulkan,
                    "Disabling shader float controls on Qualcomm (historical binning precision issues)");
        LOG_INFO(Render_Vulkan,
                 "To enable: Eden Veil -> Extensions -> Shader Float Controls (Force Enable)");
        RemoveExtension(extensions.shader_float_controls, VK_KHR_SHADER_FLOAT_CONTROLS_EXTENSION_NAME);
    }
#else
    // Non-Android: keep original behavior
    LOG_WARNING(Render_Vulkan,
                "Disabling shader float controls and 64-bit integer features on Qualcomm proprietary drivers");
    RemoveExtension(extensions.shader_float_controls, VK_KHR_SHADER_FLOAT_CONTROLS_EXTENSION_NAME);
#endif
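    // Example (illustrative sketch): where properties.float_controls comes
    // from. The capabilities logged above are reported via
    // VkPhysicalDeviceFloatControlsProperties chained into
    // vkGetPhysicalDeviceProperties2; physical_device is an assumed handle.
    VkPhysicalDeviceFloatControlsProperties float_controls{};
    float_controls.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT_CONTROLS_PROPERTIES;
    VkPhysicalDeviceProperties2 props2{};
    props2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2;
    props2.pNext = &float_controls;
    vkGetPhysicalDeviceProperties2(physical_device, &props2);
    // The resolved user flags (ftz/preserve/rte/signed_zero) would then map to
    // the SPIR-V execution modes DenormFlushToZero, DenormPreserve,
    // RoundingModeRTE and SignedZeroInfNanPreserve - which is why FTZ and
    // DenormPreserve cannot both be requested for the same float width.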

    // Int64 atomics - genuinely broken, always disable
    RemoveExtensionFeature(extensions.shader_atomic_int64, features.shader_atomic_int64,
                           VK_KHR_SHADER_ATOMIC_INT64_EXTENSION_NAME);
    features.shader_atomic_int64.shaderBufferInt64Atomics = false;
@@ -868,8 +956,7 @@ VkFormat Device::GetSupportedFormat(VkFormat wanted_format, VkFormatFeatureFlags
    // Driver may report STORAGE_IMAGE_BIT but shaderStorageImageMultisample=false means
    // it will fail at runtime when used with MSAA (CopyImageMSAA silently fails)
    const bool requests_storage = (wanted_usage & VK_FORMAT_FEATURE_STORAGE_IMAGE_BIT) != 0;
    const bool is_hdr_format = wanted_format == VK_FORMAT_B10G11R11_UFLOAT_PACK32 ||
                               wanted_format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32;
    const bool is_hdr_format = wanted_format == VK_FORMAT_B10G11R11_UFLOAT_PACK32;

    // If driver doesn't support shader storage image with MSAA, and we're requesting storage
    // for an HDR format (which will likely be used with MSAA), force fallback
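    // Example (illustrative sketch): the device-feature check behind the
    // comment above. STORAGE_IMAGE_BIT in the format properties is necessary
    // but not sufficient for storage access to multisampled images;
    // physical_device is an assumed handle.
    VkPhysicalDeviceFeatures2 feats2{};
    feats2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2;
    vkGetPhysicalDeviceFeatures2(physical_device, &feats2);
    const bool storage_msaa_ok = feats2.features.shaderStorageImageMultisample == VK_TRUE;
    // When !storage_msaa_ok and requests_storage && is_hdr_format, the format
    // is treated as unsupported so the fallback chain picks an alternative
    // instead of failing silently at runtime.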
@@ -902,13 +989,8 @@ VkFormat Device::GetSupportedFormat(VkFormat wanted_format, VkFormatFeatureFlags
    // Special logging for HDR formats (common across multiple engines) on problematic drivers
    if (wanted_format == VK_FORMAT_B10G11R11_UFLOAT_PACK32) {
        LOG_WARNING(Render_Vulkan,
                    "Emulating B10G11R11_UFLOAT (HDR format: UE5, custom engines) with {} on {}. "
                    "Native format not supported by driver, using fallback.",
                    alternative, properties.properties.deviceName);
    } else if (wanted_format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32) {
        LOG_WARNING(Render_Vulkan,
                    "Emulating E5B9G9R9_UFLOAT (HDR format: Unity, RE Engine) with {} on {}. "
                    "Native format not supported by driver, using fallback.",
                    "B10G11R11_UFLOAT_PACK32 (R11G11B10F HDR format) not fully supported. "
                    "Falling back to {} on {}",
                    alternative, properties.properties.deviceName);
    } else {
        LOG_DEBUG(Render_Vulkan,

@@ -63,7 +63,8 @@ VK_DEFINE_HANDLE(VmaAllocator)
    FEATURE(KHR, PipelineExecutableProperties, PIPELINE_EXECUTABLE_PROPERTIES, \
            pipeline_executable_properties) \
    FEATURE(KHR, WorkgroupMemoryExplicitLayout, WORKGROUP_MEMORY_EXPLICIT_LAYOUT, \
            workgroup_memory_explicit_layout)
            workgroup_memory_explicit_layout) \
    FEATURE(QCOM, TileProperties, TILE_PROPERTIES, tile_properties_qcom)

// Define miscellaneous extensions which may be used by the implementation here.
#define FOR_EACH_VK_EXTENSION(EXTENSION) \
@@ -96,6 +97,8 @@ VK_DEFINE_HANDLE(VmaAllocator)
    EXTENSION(EXT, FILTER_CUBIC, filter_cubic) \
    EXTENSION(QCOM, FILTER_CUBIC_WEIGHTS, filter_cubic_weights) \
    EXTENSION(QCOM, RENDER_PASS_SHADER_RESOLVE, render_pass_shader_resolve) \
    EXTENSION(QCOM, RENDER_PASS_STORE_OPS, render_pass_store_ops) \
    EXTENSION(QCOM, TILE_PROPERTIES, tile_properties) \
    EXTENSION(KHR, MAINTENANCE_1, maintenance1) \
    EXTENSION(KHR, MAINTENANCE_2, maintenance2) \
    EXTENSION(KHR, MAINTENANCE_3, maintenance3) \
@@ -379,6 +382,12 @@ public:
        return properties.subgroup_properties.supportedOperations & feature;
    }

    /// Returns true if subgroup operations are supported in the specified shader stage.
    /// Mobile GPUs (Qualcomm Adreno) often only support subgroups in fragment/compute stages.
    bool IsSubgroupSupportedForStage(VkShaderStageFlagBits stage) const {
        return properties.subgroup_properties.supportedStages & stage;
    }
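    // Example (illustrative sketch): how a shader backend might consume this
    // query before emitting subgroup intrinsics; `device` is an assumed
    // Device instance.
    //
    //     const bool vertex_subgroups =
    //         device.IsSubgroupSupportedForStage(VK_SHADER_STAGE_VERTEX_BIT);
    //     if (!vertex_subgroups) {
    //         // Vertex stage lacks subgroup support (common on Adreno):
    //         // lower subgroup ops to per-invocation or shared-memory
    //         // equivalents instead.
    //     }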

    /// Returns the maximum number of push descriptors.
    u32 MaxPushDescriptors() const {
        return properties.push_descriptor.maxPushDescriptors;
@@ -588,6 +597,26 @@ public:
        return extensions.render_pass_shader_resolve;
    }

    /// Returns true if the device supports VK_QCOM_render_pass_store_ops
    bool IsQcomRenderPassStoreOpsSupported() const {
        return extensions.render_pass_store_ops;
    }

    /// Returns true if the device supports VK_QCOM_tile_properties
    bool IsQcomTilePropertiesSupported() const {
        return extensions.tile_properties;
    }

    /// Returns the Qualcomm tile size (width, height, depth). Returns {0,0,0} if not queried or unsupported.
    VkExtent3D GetQcomTileSize() const {
        return properties.qcom_tile_size;
    }

    /// Returns the Qualcomm tile apron size. Returns {0,0} if not queried or unsupported.
    VkExtent2D GetQcomApronSize() const {
        return properties.qcom_apron_size;
    }
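    // Example (illustrative sketch): how qcom_tile_size/qcom_apron_size might
    // be populated. VK_QCOM_tile_properties is an extension, so the entry
    // point is fetched with vkGetDeviceProcAddr; `device` and `framebuffer`
    // are assumed handles.
    //
    //     const auto get_tile_props = reinterpret_cast<PFN_vkGetFramebufferTilePropertiesQCOM>(
    //         vkGetDeviceProcAddr(device, "vkGetFramebufferTilePropertiesQCOM"));
    //     uint32_t count = 1;
    //     VkTilePropertiesQCOM tile_props{};
    //     tile_props.sType = VK_STRUCTURE_TYPE_TILE_PROPERTIES_QCOM;
    //     if (get_tile_props) {
    //         get_tile_props(device, framebuffer, &count, &tile_props);
    //     }
    //
    // tile_props.tileSize (VkExtent3D) and tile_props.apronSize (VkExtent2D)
    // feed the getters above; zero extents mean "not queried or unsupported".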

    /// Returns true if MSAA copy operations are supported via compute shader (upload/download).
    /// Qualcomm uses render pass shader resolve instead, so this returns false for Qualcomm.
    bool CanUploadMSAA() const {
@@ -857,6 +886,8 @@ private:
        VkPhysicalDeviceSubgroupSizeControlProperties subgroup_size_control{};
        VkPhysicalDeviceTransformFeedbackPropertiesEXT transform_feedback{};
        VkPhysicalDeviceProperties properties{};
        VkExtent3D qcom_tile_size{}; // Qualcomm tile dimensions (0 if not queried)
        VkExtent2D qcom_apron_size{}; // Qualcomm tile apron size
    };

    Extensions extensions{};

@@ -226,11 +226,24 @@ namespace Vulkan {
vk::Buffer
MemoryAllocator::CreateBuffer(const VkBufferCreateInfo& ci, MemoryUsage usage) const
{
    // Qualcomm uses a unified memory architecture - prefer DEVICE_LOCAL + HOST_VISIBLE
    // for zero-copy access without staging buffers
    const bool is_qualcomm = device.GetDriverID() == VK_DRIVER_ID_QUALCOMM_PROPRIETARY;
    const bool prefer_unified = is_qualcomm && (usage == MemoryUsage::Upload ||
                                                usage == MemoryUsage::Download ||
                                                usage == MemoryUsage::Stream);

    VkMemoryPropertyFlags preferred_flags = MemoryUsagePreferredVmaFlags(usage);
    if (prefer_unified) {
        // Request DEVICE_LOCAL + HOST_VISIBLE for zero-copy on unified memory architectures
        preferred_flags |= VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT;
    }

    const VmaAllocationCreateInfo alloc_ci = {
        .flags = VMA_ALLOCATION_CREATE_WITHIN_BUDGET_BIT | MemoryUsageVmaFlags(usage),
        .usage = MemoryUsageVma(usage),
        .requiredFlags = 0,
        .preferredFlags = MemoryUsagePreferredVmaFlags(usage),
        .preferredFlags = preferred_flags,
        .memoryTypeBits = usage == MemoryUsage::Stream ? 0u : valid_memory_types,
        .pool = VK_NULL_HANDLE,
        .pUserData = nullptr,
@@ -245,6 +258,13 @@ namespace Vulkan {
    vk::Check(vmaCreateBuffer(allocator, &ci, &alloc_ci, &handle, &allocation, &alloc_info));
    vmaGetAllocationMemoryProperties(allocator, allocation, &property_flags);

    if (is_qualcomm && prefer_unified) {
        const bool got_unified = (property_flags & VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT) &&
                                 (property_flags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
        LOG_DEBUG(Render_Vulkan, "Qualcomm buffer allocation: usage={}, unified={}, flags=0x{:X}",
                  static_cast<u32>(usage), got_unified, property_flags);
    }

    u8* data = reinterpret_cast<u8*>(alloc_info.pMappedData);
    const std::span<u8> mapped_data = data ? std::span<u8>{data, ci.size} : std::span<u8>{};
    const bool is_coherent = (property_flags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT) != 0;
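    // Example (illustrative sketch): what is_coherent governs downstream. On a
    // unified-memory allocation the mapped span is written directly
    // (zero-copy), but without HOST_COHERENT the write must be flushed before
    // the GPU reads it; `src` and `copy_size` are assumed inputs and
    // std::memcpy requires <cstring>.
    //
    //     std::memcpy(mapped_data.data(), src, copy_size); // direct CPU write, no staging
    //     if (!is_coherent) {
    //         // vmaFlushAllocation wraps vkFlushMappedMemoryRanges for this allocation
    //         vmaFlushAllocation(allocator, allocation, 0, copy_size);
    //     }
    //
    // With HOST_COHERENT memory the flush is unnecessary and can be skipped.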