From c486334c78e82d0973fa985b365ab9193a6479aa Mon Sep 17 00:00:00 2001 From: DieguinDG Date: Thu, 8 Jan 2026 16:20:36 -0300 Subject: [PATCH] lol --- src/video_core/host1x/vic.cpp | 184 +++++++++++++--------------------- 1 file changed, 70 insertions(+), 114 deletions(-) diff --git a/src/video_core/host1x/vic.cpp b/src/video_core/host1x/vic.cpp index b42dee43a2..20eb3ddc5f 100644 --- a/src/video_core/host1x/vic.cpp +++ b/src/video_core/host1x/vic.cpp @@ -10,6 +10,7 @@ #include #include + extern "C" { #if defined(__GNUC__) || defined(__clang__) #pragma GCC diagnostic push @@ -107,29 +108,18 @@ void Vic::Execute() { auto output_width{config.output_surface_config.out_surface_width + 1}; auto output_height{config.output_surface_config.out_surface_height + 1}; + output_surface.resize_destructive(output_width * output_height); - // Only resize when dimensions change (huge performance boost for 1080p) - const auto required_size = output_width * output_height; - const bool size_changed = output_surface.size() != required_size; - if (size_changed) { - // Optimization: Only clear on first allocation, not on every resize - // This avoids expensive std::fill on large buffers (1080p = ~8MB) - const bool first_allocation = output_surface.size() == 0; - output_surface.resize_destructive(required_size); - - if (first_allocation) { - // Initialize the surface with the appropriate black pixel - Pixel black_pixel{}; - if (config.output_surface_config.out_pixel_format == VideoPixelFormat::Y8__V8U8_N420) { - // Y=0, U=512, V=512 (10-bit), A=0 - black_pixel = {0, 512, 512, 0}; - } else { - // R=0, G=0, B=0, A=0 - black_pixel = {0, 0, 0, 0}; - } - std::fill(output_surface.begin(), output_surface.end(), black_pixel); - } + // Initialize the surface with the appropriate black pixel + Pixel black_pixel{}; + if (config.output_surface_config.out_pixel_format == VideoPixelFormat::Y8__V8U8_N420) { + // Y=0, U=512, V=512 (10-bit), A=0 + black_pixel = {0, 512, 512, 0}; + } else { + // R=0, G=0, B=0, A=0 + black_pixel = {0, 0, 0, 0}; } + std::fill(output_surface.begin(), output_surface.end(), black_pixel); if (Settings::values.nvdec_emulation.GetValue() == Settings::NvdecEmulation::Off) [[unlikely]] { @@ -203,16 +193,7 @@ void Vic::ReadProgressiveY8__V8U8_N420(const SlotStruct& slot, std::spanGetWidth(), s32(out_luma_width))}; const auto in_luma_height{(std::min)(frame->GetHeight(), s32(out_luma_height))}; @@ -233,32 +214,45 @@ void Vic::ReadProgressiveY8__V8U8_N420(const SlotStruct& slot, std::spanGetWidth(), s32(out_luma_width))}; [[maybe_unused]] const auto in_luma_height{ @@ -306,33 +291,21 @@ void Vic::ReadInterlacedY8__V8U8_N420(const SlotStruct& slot, std::span out_luma, std::span out_chroma) { - // Optimization: Process entire scanlines at once for better vectorization - // Separate luma and chroma processing for better cache locality for (u32 y = 0; y < surface_height; ++y) { const auto src_luma = y * surface_stride; const auto dst_luma = y * out_luma_stride; - - // Vectorized luma extraction (compiler can auto-vectorize with SSE/AVX) - for (u32 x = 0; x < surface_width; x++) { - out_luma[dst_luma + x] = u8(output_surface[src_luma + x].r >> 2); - } - } - - // Vectorized chroma extraction (process every other line for 4:2:0 subsampling) - for (u32 y = 0; y < surface_height; y += 2) { const auto src_chroma = y * surface_stride; const auto dst_chroma = (y / 2) * out_chroma_stride; - for (u32 x = 0; x < surface_width; x += 2) { - out_chroma[dst_chroma + x + 0] = u8(output_surface[src_chroma + x].g >> 2); - out_chroma[dst_chroma + x + 1] = u8(output_surface[src_chroma + x].b >> 2); + out_luma[dst_luma + x + 0] = + u8(output_surface[src_luma + x + 0].r >> 2); + out_luma[dst_luma + x + 1] = + u8(output_surface[src_luma + x + 1].r >> 2); + out_chroma[dst_chroma + x + 0] = + u8(output_surface[src_chroma + x].g >> 2); + out_chroma[dst_chroma + x + 1] = + u8(output_surface[src_chroma + x].b >> 2); } } }; @@ -626,31 +593,20 @@ void Vic::WriteABGR(const OutputSurfaceConfig& output_surface_config, VideoPixel surface_height = (std::min)(surface_height, out_luma_height); auto Decode = [&](std::span out_buffer) { - // Optimization: Better memory access pattern for vectorization - // Process entire scanlines with reduced array indirection for (u32 y = 0; y < surface_height; y++) { const auto src = y * surface_stride; const auto dst = y * out_luma_stride; - - if(format == VideoPixelFormat::A8R8G8B8) { - // Vectorized ARGB processing - for (u32 x = 0; x < surface_width; x++) { - const auto& pixel = output_surface[src + x]; - auto* out_pixel = &out_buffer[dst + x * 4]; - out_pixel[0] = u8(pixel.b >> 2); - out_pixel[1] = u8(pixel.g >> 2); - out_pixel[2] = u8(pixel.r >> 2); - out_pixel[3] = u8(pixel.a >> 2); - } - } else { - // Vectorized ABGR processing - for (u32 x = 0; x < surface_width; x++) { - const auto& pixel = output_surface[src + x]; - auto* out_pixel = &out_buffer[dst + x * 4]; - out_pixel[0] = u8(pixel.r >> 2); - out_pixel[1] = u8(pixel.g >> 2); - out_pixel[2] = u8(pixel.b >> 2); - out_pixel[3] = u8(pixel.a >> 2); + for (u32 x = 0; x < surface_width; x++) { + if(format == VideoPixelFormat::A8R8G8B8) { + out_buffer[dst + x * 4 + 0] = u8(output_surface[src + x].b >> 2); + out_buffer[dst + x * 4 + 1] = u8(output_surface[src + x].g >> 2); + out_buffer[dst + x * 4 + 2] = u8(output_surface[src + x].r >> 2); + out_buffer[dst + x * 4 + 3] = u8(output_surface[src + x].a >> 2); + } else { + out_buffer[dst + x * 4 + 0] = u8(output_surface[src + x].r >> 2); + out_buffer[dst + x * 4 + 1] = u8(output_surface[src + x].g >> 2); + out_buffer[dst + x * 4 + 2] = u8(output_surface[src + x].b >> 2); + out_buffer[dst + x * 4 + 3] = u8(output_surface[src + x].a >> 2); } } }