From c486334c78e82d0973fa985b365ab9193a6479aa Mon Sep 17 00:00:00 2001
From: DieguinDG <araugolo997@gmail.com>
Date: Thu, 8 Jan 2026 16:20:36 -0300
Subject: [PATCH] video_core/vic: drop surface resize caching and split scanline loops

---
 src/video_core/host1x/vic.cpp | 184 +++++++++++++---------------------
 1 file changed, 70 insertions(+), 114 deletions(-)
diff --git a/src/video_core/host1x/vic.cpp b/src/video_core/host1x/vic.cpp
index b42dee43a2..20eb3ddc5f 100644
--- a/src/video_core/host1x/vic.cpp
+++ b/src/video_core/host1x/vic.cpp
@@ -10,6 +10,7 @@
 #include <thread>
 #include <vector>
 
+
 extern "C" {
 #if defined(__GNUC__) || defined(__clang__)
 #pragma GCC diagnostic push
@@ -107,29 +108,18 @@ void Vic::Execute() {
 
     auto output_width{config.output_surface_config.out_surface_width + 1};
     auto output_height{config.output_surface_config.out_surface_height + 1};
+    output_surface.resize_destructive(output_width * output_height);
 
-    // Only resize when dimensions change (huge performance boost for 1080p)
-    const auto required_size = output_width * output_height;
-    const bool size_changed = output_surface.size() != required_size;
-    if (size_changed) {
-        // Optimization: Only clear on first allocation, not on every resize
-        // This avoids expensive std::fill on large buffers (1080p = ~8MB)
-        const bool first_allocation = output_surface.size() == 0;
-        output_surface.resize_destructive(required_size);
-
-        if (first_allocation) {
-            // Initialize the surface with the appropriate black pixel
-            Pixel black_pixel{};
-            if (config.output_surface_config.out_pixel_format == VideoPixelFormat::Y8__V8U8_N420) {
-                // Y=0, U=512, V=512 (10-bit), A=0
-                black_pixel = {0, 512, 512, 0};
-            } else {
-                // R=0, G=0, B=0, A=0
-                black_pixel = {0, 0, 0, 0};
-            }
-            std::fill(output_surface.begin(), output_surface.end(), black_pixel);
-        }
+    // Initialize the surface with the appropriate black pixel
+    Pixel black_pixel{};
+    if (config.output_surface_config.out_pixel_format == VideoPixelFormat::Y8__V8U8_N420) {
+        // Y=0, U=512, V=512 (10-bit), A=0
+        black_pixel = {0, 512, 512, 0};
+    } else {
+        // R=0, G=0, B=0, A=0
+        black_pixel = {0, 0, 0, 0};
     }
+    std::fill(output_surface.begin(), output_surface.end(), black_pixel);
 
     if (Settings::values.nvdec_emulation.GetValue() == Settings::NvdecEmulation::Off) [[unlikely]] {
 
@@ -203,16 +193,7 @@ void Vic::ReadProgressiveY8__V8U8_N420(const SlotStruct& slot, std::span<const P
         out_luma_height *= 2;
     }
 
-    // Only resize when dimensions change (avoids expensive reallocation)
-    const auto required_size = out_luma_width * out_luma_height;
-    if (slot_surface.size() != required_size) {
-        // Optimization: Only clear on first allocation to avoid expensive std::fill
-        const bool first_allocation = slot_surface.size() == 0;
-        slot_surface.resize_destructive(required_size);
-        if (first_allocation) {
-            std::fill(slot_surface.begin(), slot_surface.end(), Pixel{0, 512, 512, 0});
-        }
-    }
+    slot_surface.resize_destructive(out_luma_width * out_luma_height);
 
     const auto in_luma_width{(std::min)(frame->GetWidth(), s32(out_luma_width))};
     const auto in_luma_height{(std::min)(frame->GetHeight(), s32(out_luma_height))};
@@ -233,32 +214,45 @@ void Vic::ReadProgressiveY8__V8U8_N420(const SlotStruct& slot, std::span<const P
               out_luma_height, out_luma_stride);
 
     const auto alpha{u16(slot.config.planar_alpha.Value())};
-
-    // Optimization: Separate luma and chroma processing for better cache locality
-    // Process entire scanlines at once for vectorization
     for (s32 y = 0; y < in_luma_height; y++) {
-        const auto src_luma{y * in_luma_stride};
-        const auto src_chroma{(y / 2) * in_chroma_stride};
-        const auto dst{y * out_luma_stride};
+        const u8* luma_ptr = luma_buffer + y * in_luma_stride;
+        const u8* chroma_u_ptr = chroma_u_buffer + (y / 2) * in_chroma_stride;
+        // Planar: V has its own plane. NV12: V is interleaved in the U plane, so no V pointer.
+        const u8* chroma_v_ptr = planar ? (chroma_v_buffer + (y / 2) * in_chroma_stride) : nullptr;
 
-        // Vectorized luma processing (compiler can auto-vectorize this)
-        for (s32 x = 0; x < in_luma_width; x++) {
-            auto& pixel = slot_surface[dst + x];
-            pixel.r = u16(luma_buffer[src_luma + x] << 2);
-            pixel.a = alpha;
-        }
+        Pixel* dst_ptr = &slot_surface[y * out_luma_stride];
 
-        // Vectorized chroma processing (separate loop for better cache)
-        if(planar) {
-            for (s32 x = 0; x < in_luma_width; x++) {
-                slot_surface[dst + x].g = u16(chroma_u_buffer[src_chroma + x / 2] << 2);
-                slot_surface[dst + x].b = u16(chroma_v_buffer[src_chroma + x / 2] << 2);
+        for (s32 x = 0; x < in_luma_width; x += 2) {
+            u16 u_val, v_val;
+
+            if (planar) {
+                // YUV420P: U and V are in separate planes.
+                // 1 UV pair for 2 horizontal pixels.
+                u_val = u16(chroma_u_ptr[x / 2] << 2);
+                v_val = u16(chroma_v_ptr[x / 2] << 2);
+            } else {
+                // NV12: UV are interleaved in the second plane.
+                // U is at even byte, V is at odd byte.
+                // x is always even, so the UV pair for pixels x and x+1 starts at byte x.
+                u_val = u16(chroma_u_ptr[x] << 2);
+                v_val = u16(chroma_u_ptr[x + 1] << 2);
             }
-        } else {
-            for (s32 x = 0; x < in_luma_width; x++) {
-                slot_surface[dst + x].g = u16(chroma_u_buffer[src_chroma + (x & ~1) + 0] << 2);
-                slot_surface[dst + x].b = u16(chroma_u_buffer[src_chroma + (x & ~1) + 1] << 2);
+
+            // Pixel 1 (Even x)
+            dst_ptr[0].r = u16(luma_ptr[x] << 2);
+            dst_ptr[0].g = u_val;
+            dst_ptr[0].b = v_val;
+            dst_ptr[0].a = alpha;
+
+            // Pixel 2 (odd x); skipped when in_luma_width is odd and x is the last column
+            if (x + 1 < in_luma_width) {
+                dst_ptr[1].r = u16(luma_ptr[x + 1] << 2);
+                dst_ptr[1].g = u_val;
+                dst_ptr[1].b = v_val;
+                dst_ptr[1].a = alpha;
             }
+
+            dst_ptr += 2;
         }
     }
 }
@@ -272,16 +266,7 @@ void Vic::ReadInterlacedY8__V8U8_N420(const SlotStruct& slot, std::span<const Pl
     const auto out_luma_height{(slot.surface_config.slot_surface_height + 1) * 2};
     const auto out_luma_stride{out_luma_width};
 
-    // Only resize when dimensions change (avoids expensive reallocation)
-    const auto required_size = out_luma_width * out_luma_height;
-    if (slot_surface.size() != required_size) {
-        // Optimization: Only clear on first allocation to avoid expensive std::fill
-        const bool first_allocation = slot_surface.size() == 0;
-        slot_surface.resize_destructive(required_size);
-        if (first_allocation) {
-            std::fill(slot_surface.begin(), slot_surface.end(), Pixel{0, 512, 512, 0});
-        }
-    }
+    slot_surface.resize_destructive(out_luma_width * out_luma_height);
 
     const auto in_luma_width{(std::min)(frame->GetWidth(), s32(out_luma_width))};
     [[maybe_unused]] const auto in_luma_height{
@@ -306,33 +291,21 @@ void Vic::ReadInterlacedY8__V8U8_N420(const SlotStruct& slot, std::span<const Pl
 
     auto DecodeBobField = [&]() {
         const auto alpha{u16(slot.config.planar_alpha.Value())};
-
-        // Optimization: Vectorized interlaced processing like progressive
         for (s32 y = s32(top_field == false); y < in_chroma_height * 2; y += 2) {
             const auto src_luma{y * in_luma_stride};
             const auto src_chroma{(y / 2) * in_chroma_stride};
             const auto dst{y * out_luma_stride};
-
-            // Vectorized luma + alpha
             for (s32 x = 0; x < in_luma_width; x++) {
-                auto& pixel = slot_surface[dst + x];
-                pixel.r = u16(luma_buffer[src_luma + x] << 2);
-                pixel.a = alpha;
-            }
-
-            // Vectorized chroma
-            if(planar) {
-                for (s32 x = 0; x < in_luma_width; x++) {
+                slot_surface[dst + x].r = u16(luma_buffer[src_luma + x] << 2);
+                if(planar) {
                     slot_surface[dst + x].g = u16(chroma_u_buffer[src_chroma + x / 2] << 2);
                     slot_surface[dst + x].b = u16(chroma_v_buffer[src_chroma + x / 2] << 2);
-                }
-            } else {
-                for (s32 x = 0; x < in_luma_width; x++) {
+                } else {
                     slot_surface[dst + x].g = u16(chroma_u_buffer[src_chroma + (x & ~1) + 0] << 2);
                     slot_surface[dst + x].b = u16(chroma_u_buffer[src_chroma + (x & ~1) + 1] << 2);
                 }
+                slot_surface[dst + x].a = alpha;
             }
-
             s32 other_line = (top_field ? y + 1 : y - 1) * out_luma_stride;
             std::memcpy(&slot_surface[other_line], &slot_surface[dst], out_luma_width * sizeof(Pixel));
         }
@@ -512,26 +485,20 @@ void Vic::WriteY8__V8U8_N420(const OutputSurfaceConfig& output_surface_config) {
     surface_height = (std::min)(surface_height, out_luma_height);
 
     auto Decode = [&](std::span<u8> out_luma, std::span<u8> out_chroma) {
-        // Optimization: Process entire scanlines at once for better vectorization
-        // Separate luma and chroma processing for better cache locality
         for (u32 y = 0; y < surface_height; ++y) {
             const auto src_luma = y * surface_stride;
             const auto dst_luma = y * out_luma_stride;
-
-            // Vectorized luma extraction (compiler can auto-vectorize with SSE/AVX)
-            for (u32 x = 0; x < surface_width; x++) {
-                out_luma[dst_luma + x] = u8(output_surface[src_luma + x].r >> 2);
-            }
-        }
-
-        // Vectorized chroma extraction (process every other line for 4:2:0 subsampling)
-        for (u32 y = 0; y < surface_height; y += 2) {
             const auto src_chroma = y * surface_stride;
             const auto dst_chroma = (y / 2) * out_chroma_stride;
-
             for (u32 x = 0; x < surface_width; x += 2) {
-                out_chroma[dst_chroma + x + 0] = u8(output_surface[src_chroma + x].g >> 2);
-                out_chroma[dst_chroma + x + 1] = u8(output_surface[src_chroma + x].b >> 2);
+                out_luma[dst_luma + x + 0] =
+                    u8(output_surface[src_luma + x + 0].r >> 2);
+                out_luma[dst_luma + x + 1] =
+                    u8(output_surface[src_luma + x + 1].r >> 2);
+                out_chroma[dst_chroma + x + 0] =
+                    u8(output_surface[src_chroma + x].g >> 2);
+                out_chroma[dst_chroma + x + 1] =
+                    u8(output_surface[src_chroma + x].b >> 2);
             }
         }
     };
@@ -626,31 +593,20 @@ void Vic::WriteABGR(const OutputSurfaceConfig& output_surface_config, VideoPixel
     surface_height = (std::min)(surface_height, out_luma_height);
 
     auto Decode = [&](std::span<u8> out_buffer) {
-        // Optimization: Better memory access pattern for vectorization
-        // Process entire scanlines with reduced array indirection
         for (u32 y = 0; y < surface_height; y++) {
             const auto src = y * surface_stride;
             const auto dst = y * out_luma_stride;
-
-            if(format == VideoPixelFormat::A8R8G8B8) {
-                // Vectorized ARGB processing
-                for (u32 x = 0; x < surface_width; x++) {
-                    const auto& pixel = output_surface[src + x];
-                    auto* out_pixel = &out_buffer[dst + x * 4];
-                    out_pixel[0] = u8(pixel.b >> 2);
-                    out_pixel[1] = u8(pixel.g >> 2);
-                    out_pixel[2] = u8(pixel.r >> 2);
-                    out_pixel[3] = u8(pixel.a >> 2);
-                }
-            } else {
-                // Vectorized ABGR processing
-                for (u32 x = 0; x < surface_width; x++) {
-                    const auto& pixel = output_surface[src + x];
-                    auto* out_pixel = &out_buffer[dst + x * 4];
-                    out_pixel[0] = u8(pixel.r >> 2);
-                    out_pixel[1] = u8(pixel.g >> 2);
-                    out_pixel[2] = u8(pixel.b >> 2);
-                    out_pixel[3] = u8(pixel.a >> 2);
+            for (u32 x = 0; x < surface_width; x++) {
+                if(format == VideoPixelFormat::A8R8G8B8) {
+                    out_buffer[dst + x * 4 + 0] = u8(output_surface[src + x].b >> 2);
+                    out_buffer[dst + x * 4 + 1] = u8(output_surface[src + x].g >> 2);
+                    out_buffer[dst + x * 4 + 2] = u8(output_surface[src + x].r >> 2);
+                    out_buffer[dst + x * 4 + 3] = u8(output_surface[src + x].a >> 2);
+                } else {
+                    out_buffer[dst + x * 4 + 0] = u8(output_surface[src + x].r >> 2);
+                    out_buffer[dst + x * 4 + 1] = u8(output_surface[src + x].g >> 2);
+                    out_buffer[dst + x * 4 + 2] = u8(output_surface[src + x].b >> 2);
+                    out_buffer[dst + x * 4 + 3] = u8(output_surface[src + x].a >> 2);
                 }
             }
         }