lol
This commit is contained in:
@@ -10,6 +10,7 @@
|
|||||||
#include <thread>
|
#include <thread>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
|
|
||||||
extern "C" {
|
extern "C" {
|
||||||
#if defined(__GNUC__) || defined(__clang__)
|
#if defined(__GNUC__) || defined(__clang__)
|
||||||
#pragma GCC diagnostic push
|
#pragma GCC diagnostic push
|
||||||
@@ -107,17 +108,8 @@ void Vic::Execute() {
|
|||||||
|
|
||||||
auto output_width{config.output_surface_config.out_surface_width + 1};
|
auto output_width{config.output_surface_config.out_surface_width + 1};
|
||||||
auto output_height{config.output_surface_config.out_surface_height + 1};
|
auto output_height{config.output_surface_config.out_surface_height + 1};
|
||||||
|
output_surface.resize_destructive(output_width * output_height);
|
||||||
|
|
||||||
// Only resize when dimensions change (huge performance boost for 1080p)
|
|
||||||
const auto required_size = output_width * output_height;
|
|
||||||
const bool size_changed = output_surface.size() != required_size;
|
|
||||||
if (size_changed) {
|
|
||||||
// Optimization: Only clear on first allocation, not on every resize
|
|
||||||
// This avoids expensive std::fill on large buffers (1080p = ~8MB)
|
|
||||||
const bool first_allocation = output_surface.size() == 0;
|
|
||||||
output_surface.resize_destructive(required_size);
|
|
||||||
|
|
||||||
if (first_allocation) {
|
|
||||||
// Initialize the surface with the appropriate black pixel
|
// Initialize the surface with the appropriate black pixel
|
||||||
Pixel black_pixel{};
|
Pixel black_pixel{};
|
||||||
if (config.output_surface_config.out_pixel_format == VideoPixelFormat::Y8__V8U8_N420) {
|
if (config.output_surface_config.out_pixel_format == VideoPixelFormat::Y8__V8U8_N420) {
|
||||||
@@ -128,8 +120,6 @@ void Vic::Execute() {
|
|||||||
black_pixel = {0, 0, 0, 0};
|
black_pixel = {0, 0, 0, 0};
|
||||||
}
|
}
|
||||||
std::fill(output_surface.begin(), output_surface.end(), black_pixel);
|
std::fill(output_surface.begin(), output_surface.end(), black_pixel);
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (Settings::values.nvdec_emulation.GetValue() == Settings::NvdecEmulation::Off) [[unlikely]] {
|
if (Settings::values.nvdec_emulation.GetValue() == Settings::NvdecEmulation::Off) [[unlikely]] {
|
||||||
|
|
||||||
@@ -203,16 +193,7 @@ void Vic::ReadProgressiveY8__V8U8_N420(const SlotStruct& slot, std::span<const P
|
|||||||
out_luma_height *= 2;
|
out_luma_height *= 2;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Only resize when dimensions change (avoids expensive reallocation)
|
slot_surface.resize_destructive(out_luma_width * out_luma_height);
|
||||||
const auto required_size = out_luma_width * out_luma_height;
|
|
||||||
if (slot_surface.size() != required_size) {
|
|
||||||
// Optimization: Only clear on first allocation to avoid expensive std::fill
|
|
||||||
const bool first_allocation = slot_surface.size() == 0;
|
|
||||||
slot_surface.resize_destructive(required_size);
|
|
||||||
if (first_allocation) {
|
|
||||||
std::fill(slot_surface.begin(), slot_surface.end(), Pixel{0, 512, 512, 0});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
const auto in_luma_width{(std::min)(frame->GetWidth(), s32(out_luma_width))};
|
const auto in_luma_width{(std::min)(frame->GetWidth(), s32(out_luma_width))};
|
||||||
const auto in_luma_height{(std::min)(frame->GetHeight(), s32(out_luma_height))};
|
const auto in_luma_height{(std::min)(frame->GetHeight(), s32(out_luma_height))};
|
||||||
@@ -233,32 +214,45 @@ void Vic::ReadProgressiveY8__V8U8_N420(const SlotStruct& slot, std::span<const P
|
|||||||
out_luma_height, out_luma_stride);
|
out_luma_height, out_luma_stride);
|
||||||
|
|
||||||
const auto alpha{u16(slot.config.planar_alpha.Value())};
|
const auto alpha{u16(slot.config.planar_alpha.Value())};
|
||||||
|
|
||||||
// Optimization: Separate luma and chroma processing for better cache locality
|
|
||||||
// Process entire scanlines at once for vectorization
|
|
||||||
for (s32 y = 0; y < in_luma_height; y++) {
|
for (s32 y = 0; y < in_luma_height; y++) {
|
||||||
const auto src_luma{y * in_luma_stride};
|
const u8* luma_ptr = luma_buffer + y * in_luma_stride;
|
||||||
const auto src_chroma{(y / 2) * in_chroma_stride};
|
const u8* chroma_u_ptr = chroma_u_buffer + (y / 2) * in_chroma_stride;
|
||||||
const auto dst{y * out_luma_stride};
|
// For planar, V buffer is separate. For NV12, it is not used directly in the same way.
|
||||||
|
const u8* chroma_v_ptr = planar ? (chroma_v_buffer + (y / 2) * in_chroma_stride) : nullptr;
|
||||||
|
|
||||||
// Vectorized luma processing (compiler can auto-vectorize this)
|
Pixel* dst_ptr = &slot_surface[y * out_luma_stride];
|
||||||
for (s32 x = 0; x < in_luma_width; x++) {
|
|
||||||
auto& pixel = slot_surface[dst + x];
|
|
||||||
pixel.r = u16(luma_buffer[src_luma + x] << 2);
|
|
||||||
pixel.a = alpha;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Vectorized chroma processing (separate loop for better cache)
|
for (s32 x = 0; x < in_luma_width; x += 2) {
|
||||||
if(planar) {
|
u16 u_val, v_val;
|
||||||
for (s32 x = 0; x < in_luma_width; x++) {
|
|
||||||
slot_surface[dst + x].g = u16(chroma_u_buffer[src_chroma + x / 2] << 2);
|
if (planar) {
|
||||||
slot_surface[dst + x].b = u16(chroma_v_buffer[src_chroma + x / 2] << 2);
|
// YUV420P: U and V are in separate planes.
|
||||||
}
|
// 1 UV pair for 2 horizontal pixels.
|
||||||
|
u_val = u16(chroma_u_ptr[x / 2] << 2);
|
||||||
|
v_val = u16(chroma_v_ptr[x / 2] << 2);
|
||||||
} else {
|
} else {
|
||||||
for (s32 x = 0; x < in_luma_width; x++) {
|
// NV12: UV are interleaved in the second plane.
|
||||||
slot_surface[dst + x].g = u16(chroma_u_buffer[src_chroma + (x & ~1) + 0] << 2);
|
// U is at even byte, V is at odd byte.
|
||||||
slot_surface[dst + x].b = u16(chroma_u_buffer[src_chroma + (x & ~1) + 1] << 2);
|
// x is even (0, 2, 4...), so x corresponds to the byte offset in the interleaved buffer.
|
||||||
|
u_val = u16(chroma_u_ptr[x] << 2);
|
||||||
|
v_val = u16(chroma_u_ptr[x + 1] << 2);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Pixel 1 (Even x)
|
||||||
|
dst_ptr[0].r = u16(luma_ptr[x] << 2);
|
||||||
|
dst_ptr[0].g = u_val;
|
||||||
|
dst_ptr[0].b = v_val;
|
||||||
|
dst_ptr[0].a = alpha;
|
||||||
|
|
||||||
|
// Pixel 2 (Odd x), check boundary
|
||||||
|
if (x + 1 < in_luma_width) {
|
||||||
|
dst_ptr[1].r = u16(luma_ptr[x + 1] << 2);
|
||||||
|
dst_ptr[1].g = u_val;
|
||||||
|
dst_ptr[1].b = v_val;
|
||||||
|
dst_ptr[1].a = alpha;
|
||||||
|
}
|
||||||
|
|
||||||
|
dst_ptr += 2;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -272,16 +266,7 @@ void Vic::ReadInterlacedY8__V8U8_N420(const SlotStruct& slot, std::span<const Pl
|
|||||||
const auto out_luma_height{(slot.surface_config.slot_surface_height + 1) * 2};
|
const auto out_luma_height{(slot.surface_config.slot_surface_height + 1) * 2};
|
||||||
const auto out_luma_stride{out_luma_width};
|
const auto out_luma_stride{out_luma_width};
|
||||||
|
|
||||||
// Only resize when dimensions change (avoids expensive reallocation)
|
slot_surface.resize_destructive(out_luma_width * out_luma_height);
|
||||||
const auto required_size = out_luma_width * out_luma_height;
|
|
||||||
if (slot_surface.size() != required_size) {
|
|
||||||
// Optimization: Only clear on first allocation to avoid expensive std::fill
|
|
||||||
const bool first_allocation = slot_surface.size() == 0;
|
|
||||||
slot_surface.resize_destructive(required_size);
|
|
||||||
if (first_allocation) {
|
|
||||||
std::fill(slot_surface.begin(), slot_surface.end(), Pixel{0, 512, 512, 0});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
const auto in_luma_width{(std::min)(frame->GetWidth(), s32(out_luma_width))};
|
const auto in_luma_width{(std::min)(frame->GetWidth(), s32(out_luma_width))};
|
||||||
[[maybe_unused]] const auto in_luma_height{
|
[[maybe_unused]] const auto in_luma_height{
|
||||||
@@ -306,33 +291,21 @@ void Vic::ReadInterlacedY8__V8U8_N420(const SlotStruct& slot, std::span<const Pl
|
|||||||
|
|
||||||
auto DecodeBobField = [&]() {
|
auto DecodeBobField = [&]() {
|
||||||
const auto alpha{u16(slot.config.planar_alpha.Value())};
|
const auto alpha{u16(slot.config.planar_alpha.Value())};
|
||||||
|
|
||||||
// Optimization: Vectorized interlaced processing like progressive
|
|
||||||
for (s32 y = s32(top_field == false); y < in_chroma_height * 2; y += 2) {
|
for (s32 y = s32(top_field == false); y < in_chroma_height * 2; y += 2) {
|
||||||
const auto src_luma{y * in_luma_stride};
|
const auto src_luma{y * in_luma_stride};
|
||||||
const auto src_chroma{(y / 2) * in_chroma_stride};
|
const auto src_chroma{(y / 2) * in_chroma_stride};
|
||||||
const auto dst{y * out_luma_stride};
|
const auto dst{y * out_luma_stride};
|
||||||
|
|
||||||
// Vectorized luma + alpha
|
|
||||||
for (s32 x = 0; x < in_luma_width; x++) {
|
for (s32 x = 0; x < in_luma_width; x++) {
|
||||||
auto& pixel = slot_surface[dst + x];
|
slot_surface[dst + x].r = u16(luma_buffer[src_luma + x] << 2);
|
||||||
pixel.r = u16(luma_buffer[src_luma + x] << 2);
|
|
||||||
pixel.a = alpha;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Vectorized chroma
|
|
||||||
if(planar) {
|
if(planar) {
|
||||||
for (s32 x = 0; x < in_luma_width; x++) {
|
|
||||||
slot_surface[dst + x].g = u16(chroma_u_buffer[src_chroma + x / 2] << 2);
|
slot_surface[dst + x].g = u16(chroma_u_buffer[src_chroma + x / 2] << 2);
|
||||||
slot_surface[dst + x].b = u16(chroma_v_buffer[src_chroma + x / 2] << 2);
|
slot_surface[dst + x].b = u16(chroma_v_buffer[src_chroma + x / 2] << 2);
|
||||||
}
|
|
||||||
} else {
|
} else {
|
||||||
for (s32 x = 0; x < in_luma_width; x++) {
|
|
||||||
slot_surface[dst + x].g = u16(chroma_u_buffer[src_chroma + (x & ~1) + 0] << 2);
|
slot_surface[dst + x].g = u16(chroma_u_buffer[src_chroma + (x & ~1) + 0] << 2);
|
||||||
slot_surface[dst + x].b = u16(chroma_u_buffer[src_chroma + (x & ~1) + 1] << 2);
|
slot_surface[dst + x].b = u16(chroma_u_buffer[src_chroma + (x & ~1) + 1] << 2);
|
||||||
}
|
}
|
||||||
|
slot_surface[dst + x].a = alpha;
|
||||||
}
|
}
|
||||||
|
|
||||||
s32 other_line = (top_field ? y + 1 : y - 1) * out_luma_stride;
|
s32 other_line = (top_field ? y + 1 : y - 1) * out_luma_stride;
|
||||||
std::memcpy(&slot_surface[other_line], &slot_surface[dst], out_luma_width * sizeof(Pixel));
|
std::memcpy(&slot_surface[other_line], &slot_surface[dst], out_luma_width * sizeof(Pixel));
|
||||||
}
|
}
|
||||||
@@ -512,26 +485,20 @@ void Vic::WriteY8__V8U8_N420(const OutputSurfaceConfig& output_surface_config) {
|
|||||||
surface_height = (std::min)(surface_height, out_luma_height);
|
surface_height = (std::min)(surface_height, out_luma_height);
|
||||||
|
|
||||||
auto Decode = [&](std::span<u8> out_luma, std::span<u8> out_chroma) {
|
auto Decode = [&](std::span<u8> out_luma, std::span<u8> out_chroma) {
|
||||||
// Optimization: Process entire scanlines at once for better vectorization
|
|
||||||
// Separate luma and chroma processing for better cache locality
|
|
||||||
for (u32 y = 0; y < surface_height; ++y) {
|
for (u32 y = 0; y < surface_height; ++y) {
|
||||||
const auto src_luma = y * surface_stride;
|
const auto src_luma = y * surface_stride;
|
||||||
const auto dst_luma = y * out_luma_stride;
|
const auto dst_luma = y * out_luma_stride;
|
||||||
|
|
||||||
// Vectorized luma extraction (compiler can auto-vectorize with SSE/AVX)
|
|
||||||
for (u32 x = 0; x < surface_width; x++) {
|
|
||||||
out_luma[dst_luma + x] = u8(output_surface[src_luma + x].r >> 2);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Vectorized chroma extraction (process every other line for 4:2:0 subsampling)
|
|
||||||
for (u32 y = 0; y < surface_height; y += 2) {
|
|
||||||
const auto src_chroma = y * surface_stride;
|
const auto src_chroma = y * surface_stride;
|
||||||
const auto dst_chroma = (y / 2) * out_chroma_stride;
|
const auto dst_chroma = (y / 2) * out_chroma_stride;
|
||||||
|
|
||||||
for (u32 x = 0; x < surface_width; x += 2) {
|
for (u32 x = 0; x < surface_width; x += 2) {
|
||||||
out_chroma[dst_chroma + x + 0] = u8(output_surface[src_chroma + x].g >> 2);
|
out_luma[dst_luma + x + 0] =
|
||||||
out_chroma[dst_chroma + x + 1] = u8(output_surface[src_chroma + x].b >> 2);
|
u8(output_surface[src_luma + x + 0].r >> 2);
|
||||||
|
out_luma[dst_luma + x + 1] =
|
||||||
|
u8(output_surface[src_luma + x + 1].r >> 2);
|
||||||
|
out_chroma[dst_chroma + x + 0] =
|
||||||
|
u8(output_surface[src_chroma + x].g >> 2);
|
||||||
|
out_chroma[dst_chroma + x + 1] =
|
||||||
|
u8(output_surface[src_chroma + x].b >> 2);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
@@ -626,31 +593,20 @@ void Vic::WriteABGR(const OutputSurfaceConfig& output_surface_config, VideoPixel
|
|||||||
surface_height = (std::min)(surface_height, out_luma_height);
|
surface_height = (std::min)(surface_height, out_luma_height);
|
||||||
|
|
||||||
auto Decode = [&](std::span<u8> out_buffer) {
|
auto Decode = [&](std::span<u8> out_buffer) {
|
||||||
// Optimization: Better memory access pattern for vectorization
|
|
||||||
// Process entire scanlines with reduced array indirection
|
|
||||||
for (u32 y = 0; y < surface_height; y++) {
|
for (u32 y = 0; y < surface_height; y++) {
|
||||||
const auto src = y * surface_stride;
|
const auto src = y * surface_stride;
|
||||||
const auto dst = y * out_luma_stride;
|
const auto dst = y * out_luma_stride;
|
||||||
|
for (u32 x = 0; x < surface_width; x++) {
|
||||||
if(format == VideoPixelFormat::A8R8G8B8) {
|
if(format == VideoPixelFormat::A8R8G8B8) {
|
||||||
// Vectorized ARGB processing
|
out_buffer[dst + x * 4 + 0] = u8(output_surface[src + x].b >> 2);
|
||||||
for (u32 x = 0; x < surface_width; x++) {
|
out_buffer[dst + x * 4 + 1] = u8(output_surface[src + x].g >> 2);
|
||||||
const auto& pixel = output_surface[src + x];
|
out_buffer[dst + x * 4 + 2] = u8(output_surface[src + x].r >> 2);
|
||||||
auto* out_pixel = &out_buffer[dst + x * 4];
|
out_buffer[dst + x * 4 + 3] = u8(output_surface[src + x].a >> 2);
|
||||||
out_pixel[0] = u8(pixel.b >> 2);
|
|
||||||
out_pixel[1] = u8(pixel.g >> 2);
|
|
||||||
out_pixel[2] = u8(pixel.r >> 2);
|
|
||||||
out_pixel[3] = u8(pixel.a >> 2);
|
|
||||||
}
|
|
||||||
} else {
|
} else {
|
||||||
// Vectorized ABGR processing
|
out_buffer[dst + x * 4 + 0] = u8(output_surface[src + x].r >> 2);
|
||||||
for (u32 x = 0; x < surface_width; x++) {
|
out_buffer[dst + x * 4 + 1] = u8(output_surface[src + x].g >> 2);
|
||||||
const auto& pixel = output_surface[src + x];
|
out_buffer[dst + x * 4 + 2] = u8(output_surface[src + x].b >> 2);
|
||||||
auto* out_pixel = &out_buffer[dst + x * 4];
|
out_buffer[dst + x * 4 + 3] = u8(output_surface[src + x].a >> 2);
|
||||||
out_pixel[0] = u8(pixel.r >> 2);
|
|
||||||
out_pixel[1] = u8(pixel.g >> 2);
|
|
||||||
out_pixel[2] = u8(pixel.b >> 2);
|
|
||||||
out_pixel[3] = u8(pixel.a >> 2);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user