commit c486334c78
parent 7dc30a5e2d
Date: 2026-01-08 16:20:36 -03:00


@@ -10,6 +10,7 @@
 #include <thread>
 #include <vector>
 extern "C" {
 #if defined(__GNUC__) || defined(__clang__)
 #pragma GCC diagnostic push
@@ -107,17 +108,8 @@ void Vic::Execute() {
     auto output_width{config.output_surface_config.out_surface_width + 1};
     auto output_height{config.output_surface_config.out_surface_height + 1};
-    // Only resize when dimensions change (huge performance boost for 1080p)
-    const auto required_size = output_width * output_height;
-    const bool size_changed = output_surface.size() != required_size;
-    if (size_changed) {
-        // Optimization: Only clear on first allocation, not on every resize
-        // This avoids expensive std::fill on large buffers (1080p = ~8MB)
-        const bool first_allocation = output_surface.size() == 0;
-        output_surface.resize_destructive(required_size);
-        if (first_allocation) {
+    output_surface.resize_destructive(output_width * output_height);
 
     // Initialize the surface with the appropriate black pixel
     Pixel black_pixel{};
     if (config.output_surface_config.out_pixel_format == VideoPixelFormat::Y8__V8U8_N420) {
@@ -128,8 +120,6 @@ void Vic::Execute() {
         black_pixel = {0, 0, 0, 0};
     }
     std::fill(output_surface.begin(), output_surface.end(), black_pixel);
-        }
-    }
 
     if (Settings::values.nvdec_emulation.GetValue() == Settings::NvdecEmulation::Off) [[unlikely]] {
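
A note on the initialization above: the intermediate surface stores 10-bit components (8-bit samples are widened with `<< 2`), so "black" depends on the output pixel format. RGB black is all zeros, but for YUV content zero chroma is not neutral; the no-color chroma point is 128 in 8 bits, i.e. 512 in 10 bits, the same value the slot-surface fill `Pixel{0, 512, 512, 0}` uses further down. A minimal sketch with illustrative names (`Pixel10` and `BlackFor` are not the emulator's types):

```cpp
#include <cstdint>

// The VIC intermediate surface holds 10-bit components: an 8-bit sample v
// is stored as (v << 2). For YUV pixels, r/g/b are reused as Y/U/V.
struct Pixel10 {
    uint16_t r, g, b, a;
};

// "Black" differs per color model: RGB black is all zeros, while YUV black
// needs neutral chroma, 128 in 8 bits = 512 in the 10-bit range.
constexpr Pixel10 BlackFor(bool is_yuv) {
    return is_yuv ? Pixel10{0, 512, 512, 0} // Y = 0, U = V = neutral
                  : Pixel10{0, 0, 0, 0};    // RGB black
}

static_assert(BlackFor(true).g == 128 << 2);
```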
@@ -203,16 +193,7 @@ void Vic::ReadProgressiveY8__V8U8_N420(const SlotStruct& slot, std::span<const P
         out_luma_height *= 2;
     }
-    // Only resize when dimensions change (avoids expensive reallocation)
-    const auto required_size = out_luma_width * out_luma_height;
-    if (slot_surface.size() != required_size) {
-        // Optimization: Only clear on first allocation to avoid expensive std::fill
-        const bool first_allocation = slot_surface.size() == 0;
-        slot_surface.resize_destructive(required_size);
-        if (first_allocation) {
-            std::fill(slot_surface.begin(), slot_surface.end(), Pixel{0, 512, 512, 0});
-        }
-    }
+    slot_surface.resize_destructive(out_luma_width * out_luma_height);
 
     const auto in_luma_width{(std::min)(frame->GetWidth(), s32(out_luma_width))};
     const auto in_luma_height{(std::min)(frame->GetHeight(), s32(out_luma_height))};
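
The hunks above and below replace the conditional-resize optimization with a plain `resize_destructive` call. Judging by the name alone (the project's real helper may differ), a destructive resize reallocates without preserving or zeroing the old contents, which is why the unconditional fill matters: skipping it on reuse, as the reverted code did, can leave stale pixels outside the decoded region. A rough sketch of those assumed semantics:

```cpp
#include <cstddef>
#include <memory>

// Sketch of resize_destructive-style semantics, assumed from the name and
// not taken from the project: grow-only allocation, old contents dropped,
// new contents left uninitialized.
template <typename T>
class ScratchBuffer {
public:
    void resize_destructive(std::size_t size) {
        if (size > capacity_) {
            // No copy of the old data and no value-initialization.
            data_ = std::make_unique_for_overwrite<T[]>(size);
            capacity_ = size;
        }
        size_ = size;
    }
    std::size_t size() const { return size_; }
    T* data() { return data_.get(); }

private:
    std::unique_ptr<T[]> data_;
    std::size_t size_ = 0;
    std::size_t capacity_ = 0;
};
```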
@@ -233,32 +214,45 @@ void Vic::ReadProgressiveY8__V8U8_N420(const SlotStruct& slot, std::span<const P
                          out_luma_height, out_luma_stride);
 
     const auto alpha{u16(slot.config.planar_alpha.Value())};
-    // Optimization: Separate luma and chroma processing for better cache locality
-    // Process entire scanlines at once for vectorization
     for (s32 y = 0; y < in_luma_height; y++) {
-        const auto src_luma{y * in_luma_stride};
-        const auto src_chroma{(y / 2) * in_chroma_stride};
-        const auto dst{y * out_luma_stride};
-        // Vectorized luma processing (compiler can auto-vectorize this)
-        for (s32 x = 0; x < in_luma_width; x++) {
-            auto& pixel = slot_surface[dst + x];
-            pixel.r = u16(luma_buffer[src_luma + x] << 2);
-            pixel.a = alpha;
-        }
-        // Vectorized chroma processing (separate loop for better cache)
-        if(planar) {
-            for (s32 x = 0; x < in_luma_width; x++) {
-                slot_surface[dst + x].g = u16(chroma_u_buffer[src_chroma + x / 2] << 2);
-                slot_surface[dst + x].b = u16(chroma_v_buffer[src_chroma + x / 2] << 2);
-            }
-        } else {
-            for (s32 x = 0; x < in_luma_width; x++) {
-                slot_surface[dst + x].g = u16(chroma_u_buffer[src_chroma + (x & ~1) + 0] << 2);
-                slot_surface[dst + x].b = u16(chroma_u_buffer[src_chroma + (x & ~1) + 1] << 2);
-            }
-        }
+        const u8* luma_ptr = luma_buffer + y * in_luma_stride;
+        const u8* chroma_u_ptr = chroma_u_buffer + (y / 2) * in_chroma_stride;
+        // For planar, the V buffer is separate. For NV12, it is not used directly in the same way.
+        const u8* chroma_v_ptr = planar ? (chroma_v_buffer + (y / 2) * in_chroma_stride) : nullptr;
+        Pixel* dst_ptr = &slot_surface[y * out_luma_stride];
+
+        for (s32 x = 0; x < in_luma_width; x += 2) {
+            u16 u_val, v_val;
+            if (planar) {
+                // YUV420P: U and V are in separate planes.
+                // 1 UV pair for 2 horizontal pixels.
+                u_val = u16(chroma_u_ptr[x / 2] << 2);
+                v_val = u16(chroma_v_ptr[x / 2] << 2);
+            } else {
+                // NV12: UV are interleaved in the second plane.
+                // U is at the even byte, V at the odd byte.
+                // x is even (0, 2, 4, ...), so x is also the byte offset into the interleaved buffer.
+                u_val = u16(chroma_u_ptr[x] << 2);
+                v_val = u16(chroma_u_ptr[x + 1] << 2);
+            }
+            // Pixel 1 (even x)
+            dst_ptr[0].r = u16(luma_ptr[x] << 2);
+            dst_ptr[0].g = u_val;
+            dst_ptr[0].b = v_val;
+            dst_ptr[0].a = alpha;
+            // Pixel 2 (odd x), check the boundary
+            if (x + 1 < in_luma_width) {
+                dst_ptr[1].r = u16(luma_ptr[x + 1] << 2);
+                dst_ptr[1].g = u_val;
+                dst_ptr[1].b = v_val;
+                dst_ptr[1].a = alpha;
+            }
+            dst_ptr += 2;
+        }
     }
 }
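
The rewritten loop above handles both 4:2:0 chroma layouts in one pass: YUV420P keeps U and V in separate half-resolution planes, while NV12 interleaves U and V byte-wise in a single plane. Either way one U/V pair is shared by two horizontal pixels, and every 8-bit sample is widened to the 10-bit surface range with `<< 2`. A self-contained sketch of the NV12 addressing (simplified types; the caller is assumed to pass `uv` already offset to chroma row `y / 2`):

```cpp
#include <cstdint>
#include <vector>

struct Yuv10 {
    uint16_t y, u, v;
};

// Expand one luma row of an NV12 image to full-resolution 10-bit YUV.
// `uv` points at the matching row of the interleaved chroma plane:
// U at even bytes, V at odd bytes.
std::vector<Yuv10> ExpandNv12Row(const uint8_t* luma, const uint8_t* uv, int width) {
    std::vector<Yuv10> out(width);
    for (int x = 0; x < width; ++x) {
        const int pair = (x / 2) * 2; // byte offset of this pixel's UV pair; same as x & ~1
        out[x].y = uint16_t(luma[x] << 2);
        out[x].u = uint16_t(uv[pair + 0] << 2);
        out[x].v = uint16_t(uv[pair + 1] << 2);
    }
    return out;
}
```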
@@ -272,16 +266,7 @@ void Vic::ReadInterlacedY8__V8U8_N420(const SlotStruct& slot, std::span<const Pl
     const auto out_luma_height{(slot.surface_config.slot_surface_height + 1) * 2};
     const auto out_luma_stride{out_luma_width};
-    // Only resize when dimensions change (avoids expensive reallocation)
-    const auto required_size = out_luma_width * out_luma_height;
-    if (slot_surface.size() != required_size) {
-        // Optimization: Only clear on first allocation to avoid expensive std::fill
-        const bool first_allocation = slot_surface.size() == 0;
-        slot_surface.resize_destructive(required_size);
-        if (first_allocation) {
-            std::fill(slot_surface.begin(), slot_surface.end(), Pixel{0, 512, 512, 0});
-        }
-    }
+    slot_surface.resize_destructive(out_luma_width * out_luma_height);
 
     const auto in_luma_width{(std::min)(frame->GetWidth(), s32(out_luma_width))};
     [[maybe_unused]] const auto in_luma_height{
@@ -306,33 +291,21 @@ void Vic::ReadInterlacedY8__V8U8_N420(const SlotStruct& slot, std::span<const Pl
     auto DecodeBobField = [&]() {
         const auto alpha{u16(slot.config.planar_alpha.Value())};
-        // Optimization: Vectorized interlaced processing like progressive
         for (s32 y = s32(top_field == false); y < in_chroma_height * 2; y += 2) {
             const auto src_luma{y * in_luma_stride};
             const auto src_chroma{(y / 2) * in_chroma_stride};
             const auto dst{y * out_luma_stride};
-            // Vectorized luma + alpha
-            for (s32 x = 0; x < in_luma_width; x++) {
-                auto& pixel = slot_surface[dst + x];
-                pixel.r = u16(luma_buffer[src_luma + x] << 2);
-                pixel.a = alpha;
-            }
-            // Vectorized chroma
-            if(planar) {
-                for (s32 x = 0; x < in_luma_width; x++) {
-                    slot_surface[dst + x].g = u16(chroma_u_buffer[src_chroma + x / 2] << 2);
-                    slot_surface[dst + x].b = u16(chroma_v_buffer[src_chroma + x / 2] << 2);
-                }
-            } else {
-                for (s32 x = 0; x < in_luma_width; x++) {
-                    slot_surface[dst + x].g = u16(chroma_u_buffer[src_chroma + (x & ~1) + 0] << 2);
-                    slot_surface[dst + x].b = u16(chroma_u_buffer[src_chroma + (x & ~1) + 1] << 2);
-                }
-            }
+            for (s32 x = 0; x < in_luma_width; x++) {
+                slot_surface[dst + x].r = u16(luma_buffer[src_luma + x] << 2);
+                if(planar) {
+                    slot_surface[dst + x].g = u16(chroma_u_buffer[src_chroma + x / 2] << 2);
+                    slot_surface[dst + x].b = u16(chroma_v_buffer[src_chroma + x / 2] << 2);
+                } else {
+                    slot_surface[dst + x].g = u16(chroma_u_buffer[src_chroma + (x & ~1) + 0] << 2);
+                    slot_surface[dst + x].b = u16(chroma_u_buffer[src_chroma + (x & ~1) + 1] << 2);
+                }
+                slot_surface[dst + x].a = alpha;
+            }
             s32 other_line = (top_field ? y + 1 : y - 1) * out_luma_stride;
             std::memcpy(&slot_surface[other_line], &slot_surface[dst], out_luma_width * sizeof(Pixel));
         }
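
`DecodeBobField` above is bob deinterlacing: an interlaced frame carries only one field, i.e. every other line (`y` starts at 0 for the top field, 1 for the bottom), so after each field line is converted the `std::memcpy` duplicates it into the neighbouring missing line. A standalone sketch of that fill pattern with placeholder types:

```cpp
#include <cstdint>
#include <cstring>
#include <vector>

struct Px {
    uint16_t r, g, b, a;
};

// Duplicate each decoded field line into the adjacent missing line
// (top field: copy y into y + 1; bottom field: copy y into y - 1).
void BobFill(std::vector<Px>& surface, int width, int height, bool top_field) {
    for (int y = top_field ? 0 : 1; y < height; y += 2) {
        const int other = top_field ? y + 1 : y - 1;
        if (other < 0 || other >= height) {
            continue; // keep the copy inside the surface
        }
        std::memcpy(&surface[other * width], &surface[y * width], width * sizeof(Px));
    }
}
```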
@@ -512,26 +485,20 @@ void Vic::WriteY8__V8U8_N420(const OutputSurfaceConfig& output_surface_config) {
     surface_height = (std::min)(surface_height, out_luma_height);
 
     auto Decode = [&](std::span<u8> out_luma, std::span<u8> out_chroma) {
-        // Optimization: Process entire scanlines at once for better vectorization
-        // Separate luma and chroma processing for better cache locality
         for (u32 y = 0; y < surface_height; ++y) {
             const auto src_luma = y * surface_stride;
             const auto dst_luma = y * out_luma_stride;
-            // Vectorized luma extraction (compiler can auto-vectorize with SSE/AVX)
-            for (u32 x = 0; x < surface_width; x++) {
-                out_luma[dst_luma + x] = u8(output_surface[src_luma + x].r >> 2);
-            }
-        }
-        // Vectorized chroma extraction (process every other line for 4:2:0 subsampling)
-        for (u32 y = 0; y < surface_height; y += 2) {
             const auto src_chroma = y * surface_stride;
             const auto dst_chroma = (y / 2) * out_chroma_stride;
             for (u32 x = 0; x < surface_width; x += 2) {
-                out_chroma[dst_chroma + x + 0] = u8(output_surface[src_chroma + x].g >> 2);
-                out_chroma[dst_chroma + x + 1] = u8(output_surface[src_chroma + x].b >> 2);
+                out_luma[dst_luma + x + 0] =
+                    u8(output_surface[src_luma + x + 0].r >> 2);
+                out_luma[dst_luma + x + 1] =
+                    u8(output_surface[src_luma + x + 1].r >> 2);
+                out_chroma[dst_chroma + x + 0] =
+                    u8(output_surface[src_chroma + x].g >> 2);
+                out_chroma[dst_chroma + x + 1] =
+                    u8(output_surface[src_chroma + x].b >> 2);
             }
         }
     };
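
The restored write-back above narrows the 10-bit intermediate pixels back to 8 bits with `>> 2` and re-subsamples chroma to 4:2:0 by emitting one interleaved U/V pair per two pixels, taken from the left (even-column) pixel. A simplified per-row sketch (hypothetical signature; assumes an even width for brevity):

```cpp
#include <cstdint>

struct Px10 {
    uint16_t r, g, b, a; // for YUV content: r = Y, g = U, b = V
};

// Write one row of Y and the matching row of interleaved UV for an
// NV12-style (Y8__V8U8 N420) output, narrowing 10-bit values with >> 2.
void WriteRowN420(const Px10* row, int width, uint8_t* out_luma, uint8_t* out_chroma) {
    for (int x = 0; x < width; x += 2) {
        out_luma[x + 0] = uint8_t(row[x + 0].r >> 2);
        out_luma[x + 1] = uint8_t(row[x + 1].r >> 2);
        // One UV pair per pixel pair, sampled from the left pixel.
        out_chroma[x + 0] = uint8_t(row[x].g >> 2);
        out_chroma[x + 1] = uint8_t(row[x].b >> 2);
    }
}
```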
@@ -626,31 +593,20 @@ void Vic::WriteABGR(const OutputSurfaceConfig& output_surface_config, VideoPixel
     surface_height = (std::min)(surface_height, out_luma_height);
 
     auto Decode = [&](std::span<u8> out_buffer) {
-        // Optimization: Better memory access pattern for vectorization
-        // Process entire scanlines with reduced array indirection
        for (u32 y = 0; y < surface_height; y++) {
            const auto src = y * surface_stride;
            const auto dst = y * out_luma_stride;
-            if(format == VideoPixelFormat::A8R8G8B8) {
-                // Vectorized ARGB processing
-                for (u32 x = 0; x < surface_width; x++) {
-                    const auto& pixel = output_surface[src + x];
-                    auto* out_pixel = &out_buffer[dst + x * 4];
-                    out_pixel[0] = u8(pixel.b >> 2);
-                    out_pixel[1] = u8(pixel.g >> 2);
-                    out_pixel[2] = u8(pixel.r >> 2);
-                    out_pixel[3] = u8(pixel.a >> 2);
-                }
-            } else {
-                // Vectorized ABGR processing
-                for (u32 x = 0; x < surface_width; x++) {
-                    const auto& pixel = output_surface[src + x];
-                    auto* out_pixel = &out_buffer[dst + x * 4];
-                    out_pixel[0] = u8(pixel.r >> 2);
-                    out_pixel[1] = u8(pixel.g >> 2);
-                    out_pixel[2] = u8(pixel.b >> 2);
-                    out_pixel[3] = u8(pixel.a >> 2);
-                }
-            }
+            for (u32 x = 0; x < surface_width; x++) {
+                if(format == VideoPixelFormat::A8R8G8B8) {
+                    out_buffer[dst + x * 4 + 0] = u8(output_surface[src + x].b >> 2);
+                    out_buffer[dst + x * 4 + 1] = u8(output_surface[src + x].g >> 2);
+                    out_buffer[dst + x * 4 + 2] = u8(output_surface[src + x].r >> 2);
+                    out_buffer[dst + x * 4 + 3] = u8(output_surface[src + x].a >> 2);
+                } else {
+                    out_buffer[dst + x * 4 + 0] = u8(output_surface[src + x].r >> 2);
+                    out_buffer[dst + x * 4 + 1] = u8(output_surface[src + x].g >> 2);
+                    out_buffer[dst + x * 4 + 2] = u8(output_surface[src + x].b >> 2);
+                    out_buffer[dst + x * 4 + 3] = u8(output_surface[src + x].a >> 2);
+                }
+            }
         }
     };
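
The two branches above differ only in where R and B land inside each 4-byte output pixel: the A8R8G8B8 path stores B, G, R, A in ascending byte order, the ABGR path stores R, G, B, A; both narrow the 10-bit components with `>> 2`. A minimal sketch of the packing (illustrative names):

```cpp
#include <cstdint>

struct Px10 {
    uint16_t r, g, b, a;
};

// Pack one 10-bit pixel into 4 output bytes. `argb` selects the
// A8R8G8B8 byte order (B,G,R,A in memory) over ABGR (R,G,B,A).
void PackPixel(const Px10& p, uint8_t* out, bool argb) {
    const uint8_t r = uint8_t(p.r >> 2);
    const uint8_t g = uint8_t(p.g >> 2);
    const uint8_t b = uint8_t(p.b >> 2);
    const uint8_t a = uint8_t(p.a >> 2);
    out[0] = argb ? b : r;
    out[1] = g;
    out[2] = argb ? r : b;
    out[3] = a;
}
```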