commit c486334c78
parent 7dc30a5e2d
Date: 2026-01-08 16:20:36 -03:00


@@ -10,6 +10,7 @@
 #include <thread>
 #include <vector>
 extern "C" {
 #if defined(__GNUC__) || defined(__clang__)
 #pragma GCC diagnostic push
@@ -107,17 +108,8 @@ void Vic::Execute() {
     auto output_width{config.output_surface_config.out_surface_width + 1};
     auto output_height{config.output_surface_config.out_surface_height + 1};
-    // Only resize when dimensions change (huge performance boost for 1080p)
-    const auto required_size = output_width * output_height;
-    const bool size_changed = output_surface.size() != required_size;
-    if (size_changed) {
-        // Optimization: Only clear on first allocation, not on every resize
-        // This avoids expensive std::fill on large buffers (1080p = ~8MB)
-        const bool first_allocation = output_surface.size() == 0;
-        output_surface.resize_destructive(required_size);
-        if (first_allocation) {
+    output_surface.resize_destructive(output_width * output_height);
 
     // Initialize the surface with the appropriate black pixel
     Pixel black_pixel{};
     if (config.output_surface_config.out_pixel_format == VideoPixelFormat::Y8__V8U8_N420) {
@@ -128,8 +120,6 @@ void Vic::Execute() {
         black_pixel = {0, 0, 0, 0};
     }
     std::fill(output_surface.begin(), output_surface.end(), black_pixel);
-        }
-    }
 
     if (Settings::values.nvdec_emulation.GetValue() == Settings::NvdecEmulation::Off) [[unlikely]] {
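
A note on the initialization above: the intermediate surface stores 10-bit components (8-bit samples are widened with `<< 2`), so "black" depends on the output pixel format. RGB black is all zeros, but for YUV content zero chroma is not neutral; the no-color chroma point is 128 in 8 bits, i.e. 512 in 10 bits, the same value the slot-surface fill `Pixel{0, 512, 512, 0}` uses further down. A minimal sketch with illustrative names (`Pixel10` and `BlackFor` are not the emulator's types):

```cpp
#include <cstdint>

// The VIC intermediate surface holds 10-bit components: an 8-bit sample v
// is stored as (v << 2). For YUV pixels, r/g/b are reused as Y/U/V.
struct Pixel10 {
    uint16_t r, g, b, a;
};

// "Black" differs per color model: RGB black is all zeros, while YUV black
// needs neutral chroma, 128 in 8 bits = 512 in the 10-bit range.
constexpr Pixel10 BlackFor(bool is_yuv) {
    return is_yuv ? Pixel10{0, 512, 512, 0} // Y = 0, U = V = neutral
                  : Pixel10{0, 0, 0, 0};    // RGB black
}

static_assert(BlackFor(true).g == 128 << 2);
```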
@@ -203,16 +193,7 @@ void Vic::ReadProgressiveY8__V8U8_N420(const SlotStruct& slot, std::span<const P
         out_luma_height *= 2;
     }
-    // Only resize when dimensions change (avoids expensive reallocation)
-    const auto required_size = out_luma_width * out_luma_height;
-    if (slot_surface.size() != required_size) {
-        // Optimization: Only clear on first allocation to avoid expensive std::fill
-        const bool first_allocation = slot_surface.size() == 0;
-        slot_surface.resize_destructive(required_size);
-        if (first_allocation) {
-            std::fill(slot_surface.begin(), slot_surface.end(), Pixel{0, 512, 512, 0});
-        }
-    }
+    slot_surface.resize_destructive(out_luma_width * out_luma_height);
 
     const auto in_luma_width{(std::min)(frame->GetWidth(), s32(out_luma_width))};
     const auto in_luma_height{(std::min)(frame->GetHeight(), s32(out_luma_height))};
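
The hunks above and below replace the conditional-resize optimization with a plain `resize_destructive` call. Judging by the name alone (the project's real helper may differ), a destructive resize reallocates without preserving or zeroing the old contents, which is why the unconditional fill matters: skipping it on reuse, as the reverted code did, can leave stale pixels outside the decoded region. A rough sketch of those assumed semantics:

```cpp
#include <cstddef>
#include <memory>

// Sketch of resize_destructive-style semantics, assumed from the name and
// not taken from the project: grow-only allocation, old contents dropped,
// new contents left uninitialized.
template <typename T>
class ScratchBuffer {
public:
    void resize_destructive(std::size_t size) {
        if (size > capacity_) {
            // No copy of the old data and no value-initialization.
            data_ = std::make_unique_for_overwrite<T[]>(size);
            capacity_ = size;
        }
        size_ = size;
    }
    std::size_t size() const { return size_; }
    T* data() { return data_.get(); }

private:
    std::unique_ptr<T[]> data_;
    std::size_t size_ = 0;
    std::size_t capacity_ = 0;
};
```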
@@ -233,32 +214,45 @@ void Vic::ReadProgressiveY8__V8U8_N420(const SlotStruct& slot, std::span<const P
                          out_luma_height, out_luma_stride);
 
     const auto alpha{u16(slot.config.planar_alpha.Value())};
-    // Optimization: Separate luma and chroma processing for better cache locality
-    // Process entire scanlines at once for vectorization
     for (s32 y = 0; y < in_luma_height; y++) {
-        const auto src_luma{y * in_luma_stride};
-        const auto src_chroma{(y / 2) * in_chroma_stride};
-        const auto dst{y * out_luma_stride};
-        // Vectorized luma processing (compiler can auto-vectorize this)
-        for (s32 x = 0; x < in_luma_width; x++) {
-            auto& pixel = slot_surface[dst + x];
-            pixel.r = u16(luma_buffer[src_luma + x] << 2);
-            pixel.a = alpha;
-        }
-        // Vectorized chroma processing (separate loop for better cache)
-        if(planar) {
-            for (s32 x = 0; x < in_luma_width; x++) {
-                slot_surface[dst + x].g = u16(chroma_u_buffer[src_chroma + x / 2] << 2);
-                slot_surface[dst + x].b = u16(chroma_v_buffer[src_chroma + x / 2] << 2);
-            }
-        } else {
-            for (s32 x = 0; x < in_luma_width; x++) {
-                slot_surface[dst + x].g = u16(chroma_u_buffer[src_chroma + (x & ~1) + 0] << 2);
-                slot_surface[dst + x].b = u16(chroma_u_buffer[src_chroma + (x & ~1) + 1] << 2);
-            }
-        }
+        const u8* luma_ptr = luma_buffer + y * in_luma_stride;
+        const u8* chroma_u_ptr = chroma_u_buffer + (y / 2) * in_chroma_stride;
+        // For planar, the V buffer is separate. For NV12, it is not used directly in the same way.
+        const u8* chroma_v_ptr = planar ? (chroma_v_buffer + (y / 2) * in_chroma_stride) : nullptr;
+        Pixel* dst_ptr = &slot_surface[y * out_luma_stride];
+
+        for (s32 x = 0; x < in_luma_width; x += 2) {
+            u16 u_val, v_val;
+            if (planar) {
+                // YUV420P: U and V are in separate planes.
+                // 1 UV pair for 2 horizontal pixels.
+                u_val = u16(chroma_u_ptr[x / 2] << 2);
+                v_val = u16(chroma_v_ptr[x / 2] << 2);
+            } else {
+                // NV12: UV are interleaved in the second plane.
+                // U is at the even byte, V at the odd byte.
+                // x is even (0, 2, 4, ...), so x is also the byte offset into the interleaved buffer.
+                u_val = u16(chroma_u_ptr[x] << 2);
+                v_val = u16(chroma_u_ptr[x + 1] << 2);
+            }
+            // Pixel 1 (even x)
+            dst_ptr[0].r = u16(luma_ptr[x] << 2);
+            dst_ptr[0].g = u_val;
+            dst_ptr[0].b = v_val;
+            dst_ptr[0].a = alpha;
+            // Pixel 2 (odd x), check the boundary
+            if (x + 1 < in_luma_width) {
+                dst_ptr[1].r = u16(luma_ptr[x + 1] << 2);
+                dst_ptr[1].g = u_val;
+                dst_ptr[1].b = v_val;
+                dst_ptr[1].a = alpha;
+            }
+            dst_ptr += 2;
+        }
     }
 }
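
The rewritten loop above handles both 4:2:0 chroma layouts in one pass: YUV420P keeps U and V in separate half-resolution planes, while NV12 interleaves U and V byte-wise in a single plane. Either way one U/V pair is shared by two horizontal pixels, and every 8-bit sample is widened to the 10-bit surface range with `<< 2`. A self-contained sketch of the NV12 addressing (simplified types; the caller is assumed to pass `uv` already offset to chroma row `y / 2`):

```cpp
#include <cstdint>
#include <vector>

struct Yuv10 {
    uint16_t y, u, v;
};

// Expand one luma row of an NV12 image to full-resolution 10-bit YUV.
// `uv` points at the matching row of the interleaved chroma plane:
// U at even bytes, V at odd bytes.
std::vector<Yuv10> ExpandNv12Row(const uint8_t* luma, const uint8_t* uv, int width) {
    std::vector<Yuv10> out(width);
    for (int x = 0; x < width; ++x) {
        const int pair = (x / 2) * 2; // byte offset of this pixel's UV pair; same as x & ~1
        out[x].y = uint16_t(luma[x] << 2);
        out[x].u = uint16_t(uv[pair + 0] << 2);
        out[x].v = uint16_t(uv[pair + 1] << 2);
    }
    return out;
}
```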
@@ -272,16 +266,7 @@ void Vic::ReadInterlacedY8__V8U8_N420(const SlotStruct& slot, std::span<const Pl
     const auto out_luma_height{(slot.surface_config.slot_surface_height + 1) * 2};
     const auto out_luma_stride{out_luma_width};
-    // Only resize when dimensions change (avoids expensive reallocation)
-    const auto required_size = out_luma_width * out_luma_height;
-    if (slot_surface.size() != required_size) {
-        // Optimization: Only clear on first allocation to avoid expensive std::fill
-        const bool first_allocation = slot_surface.size() == 0;
-        slot_surface.resize_destructive(required_size);
-        if (first_allocation) {
-            std::fill(slot_surface.begin(), slot_surface.end(), Pixel{0, 512, 512, 0});
-        }
-    }
+    slot_surface.resize_destructive(out_luma_width * out_luma_height);
 
     const auto in_luma_width{(std::min)(frame->GetWidth(), s32(out_luma_width))};
     [[maybe_unused]] const auto in_luma_height{
@@ -306,33 +291,21 @@ void Vic::ReadInterlacedY8__V8U8_N420(const SlotStruct& slot, std::span<const Pl
     auto DecodeBobField = [&]() {
         const auto alpha{u16(slot.config.planar_alpha.Value())};
-        // Optimization: Vectorized interlaced processing like progressive
         for (s32 y = s32(top_field == false); y < in_chroma_height * 2; y += 2) {
             const auto src_luma{y * in_luma_stride};
             const auto src_chroma{(y / 2) * in_chroma_stride};
             const auto dst{y * out_luma_stride};
-            // Vectorized luma + alpha
-            for (s32 x = 0; x < in_luma_width; x++) {
-                auto& pixel = slot_surface[dst + x];
-                pixel.r = u16(luma_buffer[src_luma + x] << 2);
-                pixel.a = alpha;
-            }
-            // Vectorized chroma
-            if(planar) {
-                for (s32 x = 0; x < in_luma_width; x++) {
-                    slot_surface[dst + x].g = u16(chroma_u_buffer[src_chroma + x / 2] << 2);
-                    slot_surface[dst + x].b = u16(chroma_v_buffer[src_chroma + x / 2] << 2);
-                }
-            } else {
-                for (s32 x = 0; x < in_luma_width; x++) {
-                    slot_surface[dst + x].g = u16(chroma_u_buffer[src_chroma + (x & ~1) + 0] << 2);
-                    slot_surface[dst + x].b = u16(chroma_u_buffer[src_chroma + (x & ~1) + 1] << 2);
-                }
-            }
+            for (s32 x = 0; x < in_luma_width; x++) {
+                slot_surface[dst + x].r = u16(luma_buffer[src_luma + x] << 2);
+                if(planar) {
+                    slot_surface[dst + x].g = u16(chroma_u_buffer[src_chroma + x / 2] << 2);
+                    slot_surface[dst + x].b = u16(chroma_v_buffer[src_chroma + x / 2] << 2);
+                } else {
+                    slot_surface[dst + x].g = u16(chroma_u_buffer[src_chroma + (x & ~1) + 0] << 2);
+                    slot_surface[dst + x].b = u16(chroma_u_buffer[src_chroma + (x & ~1) + 1] << 2);
+                }
+                slot_surface[dst + x].a = alpha;
+            }
             s32 other_line = (top_field ? y + 1 : y - 1) * out_luma_stride;
             std::memcpy(&slot_surface[other_line], &slot_surface[dst], out_luma_width * sizeof(Pixel));
         }
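
`DecodeBobField` above is bob deinterlacing: an interlaced frame carries only one field, i.e. every other line (`y` starts at 0 for the top field, 1 for the bottom), so after each field line is converted the `std::memcpy` duplicates it into the neighbouring missing line. A standalone sketch of that fill pattern with placeholder types:

```cpp
#include <cstdint>
#include <cstring>
#include <vector>

struct Px {
    uint16_t r, g, b, a;
};

// Duplicate each decoded field line into the adjacent missing line
// (top field: copy y into y + 1; bottom field: copy y into y - 1).
void BobFill(std::vector<Px>& surface, int width, int height, bool top_field) {
    for (int y = top_field ? 0 : 1; y < height; y += 2) {
        const int other = top_field ? y + 1 : y - 1;
        if (other < 0 || other >= height) {
            continue; // keep the copy inside the surface
        }
        std::memcpy(&surface[other * width], &surface[y * width], width * sizeof(Px));
    }
}
```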
@@ -512,26 +485,20 @@ void Vic::WriteY8__V8U8_N420(const OutputSurfaceConfig& output_surface_config) {
     surface_height = (std::min)(surface_height, out_luma_height);
 
     auto Decode = [&](std::span<u8> out_luma, std::span<u8> out_chroma) {
-        // Optimization: Process entire scanlines at once for better vectorization
-        // Separate luma and chroma processing for better cache locality
         for (u32 y = 0; y < surface_height; ++y) {
             const auto src_luma = y * surface_stride;
             const auto dst_luma = y * out_luma_stride;
-            // Vectorized luma extraction (compiler can auto-vectorize with SSE/AVX)
-            for (u32 x = 0; x < surface_width; x++) {
-                out_luma[dst_luma + x] = u8(output_surface[src_luma + x].r >> 2);
-            }
-        }
-        // Vectorized chroma extraction (process every other line for 4:2:0 subsampling)
-        for (u32 y = 0; y < surface_height; y += 2) {
             const auto src_chroma = y * surface_stride;
             const auto dst_chroma = (y / 2) * out_chroma_stride;
             for (u32 x = 0; x < surface_width; x += 2) {
-                out_chroma[dst_chroma + x + 0] = u8(output_surface[src_chroma + x].g >> 2);
-                out_chroma[dst_chroma + x + 1] = u8(output_surface[src_chroma + x].b >> 2);
+                out_luma[dst_luma + x + 0] =
+                    u8(output_surface[src_luma + x + 0].r >> 2);
+                out_luma[dst_luma + x + 1] =
+                    u8(output_surface[src_luma + x + 1].r >> 2);
+                out_chroma[dst_chroma + x + 0] =
+                    u8(output_surface[src_chroma + x].g >> 2);
+                out_chroma[dst_chroma + x + 1] =
+                    u8(output_surface[src_chroma + x].b >> 2);
             }
         }
     };
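
The restored write-back above narrows the 10-bit intermediate pixels back to 8 bits with `>> 2` and re-subsamples chroma to 4:2:0 by emitting one interleaved U/V pair per two pixels, taken from the left (even-column) pixel. A simplified per-row sketch (hypothetical signature; assumes an even width for brevity):

```cpp
#include <cstdint>

struct Px10 {
    uint16_t r, g, b, a; // for YUV content: r = Y, g = U, b = V
};

// Write one row of Y and the matching row of interleaved UV for an
// NV12-style (Y8__V8U8 N420) output, narrowing 10-bit values with >> 2.
void WriteRowN420(const Px10* row, int width, uint8_t* out_luma, uint8_t* out_chroma) {
    for (int x = 0; x < width; x += 2) {
        out_luma[x + 0] = uint8_t(row[x + 0].r >> 2);
        out_luma[x + 1] = uint8_t(row[x + 1].r >> 2);
        // One UV pair per pixel pair, sampled from the left pixel.
        out_chroma[x + 0] = uint8_t(row[x].g >> 2);
        out_chroma[x + 1] = uint8_t(row[x].b >> 2);
    }
}
```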
@@ -626,31 +593,20 @@ void Vic::WriteABGR(const OutputSurfaceConfig& output_surface_config, VideoPixel
     surface_height = (std::min)(surface_height, out_luma_height);
 
     auto Decode = [&](std::span<u8> out_buffer) {
-        // Optimization: Better memory access pattern for vectorization
-        // Process entire scanlines with reduced array indirection
        for (u32 y = 0; y < surface_height; y++) {
            const auto src = y * surface_stride;
            const auto dst = y * out_luma_stride;
-            if(format == VideoPixelFormat::A8R8G8B8) {
-                // Vectorized ARGB processing
-                for (u32 x = 0; x < surface_width; x++) {
-                    const auto& pixel = output_surface[src + x];
-                    auto* out_pixel = &out_buffer[dst + x * 4];
-                    out_pixel[0] = u8(pixel.b >> 2);
-                    out_pixel[1] = u8(pixel.g >> 2);
-                    out_pixel[2] = u8(pixel.r >> 2);
-                    out_pixel[3] = u8(pixel.a >> 2);
-                }
-            } else {
-                // Vectorized ABGR processing
-                for (u32 x = 0; x < surface_width; x++) {
-                    const auto& pixel = output_surface[src + x];
-                    auto* out_pixel = &out_buffer[dst + x * 4];
-                    out_pixel[0] = u8(pixel.r >> 2);
-                    out_pixel[1] = u8(pixel.g >> 2);
-                    out_pixel[2] = u8(pixel.b >> 2);
-                    out_pixel[3] = u8(pixel.a >> 2);
-                }
-            }
+            for (u32 x = 0; x < surface_width; x++) {
+                if(format == VideoPixelFormat::A8R8G8B8) {
+                    out_buffer[dst + x * 4 + 0] = u8(output_surface[src + x].b >> 2);
+                    out_buffer[dst + x * 4 + 1] = u8(output_surface[src + x].g >> 2);
+                    out_buffer[dst + x * 4 + 2] = u8(output_surface[src + x].r >> 2);
+                    out_buffer[dst + x * 4 + 3] = u8(output_surface[src + x].a >> 2);
+                } else {
+                    out_buffer[dst + x * 4 + 0] = u8(output_surface[src + x].r >> 2);
+                    out_buffer[dst + x * 4 + 1] = u8(output_surface[src + x].g >> 2);
+                    out_buffer[dst + x * 4 + 2] = u8(output_surface[src + x].b >> 2);
+                    out_buffer[dst + x * 4 + 3] = u8(output_surface[src + x].a >> 2);
+                }
+            }
         }
     };
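
The two branches above differ only in where R and B land inside each 4-byte output pixel: the A8R8G8B8 path stores B, G, R, A in ascending byte order, the ABGR path stores R, G, B, A; both narrow the 10-bit components with `>> 2`. A minimal sketch of the packing (illustrative names):

```cpp
#include <cstdint>

struct Px10 {
    uint16_t r, g, b, a;
};

// Pack one 10-bit pixel into 4 output bytes. `argb` selects the
// A8R8G8B8 byte order (B,G,R,A in memory) over ABGR (R,G,B,A).
void PackPixel(const Px10& p, uint8_t* out, bool argb) {
    const uint8_t r = uint8_t(p.r >> 2);
    const uint8_t g = uint8_t(p.g >> 2);
    const uint8_t b = uint8_t(p.b >> 2);
    const uint8_t a = uint8_t(p.a >> 2);
    out[0] = argb ? b : r;
    out[1] = g;
    out[2] = argb ? r : b;
    out[3] = a;
}
```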