Compare commits

...

2 Commits

Author SHA1 Message Date
MaranBr
0f8a0adc6c Reduce synchronization overhead and improve performance in DMA operations 2025-12-22 12:56:28 +01:00
Caio Oliveira
3413fbd9da [FIXUP] Partially revert "[NCE] Fix cache invalidation and signal interrupt race condition (#3063)" (#3190)
* this fixes Jamboree and SSB

This reverts commit e3c942b209.

Reviewed-on: https://git.eden-emu.dev/eden-emu/eden/pulls/3190
Reviewed-by: Maufeat <sahyno1996@gmail.com>
Reviewed-by: MaranBr <maranbr@eden-emu.dev>
Co-authored-by: Caio Oliveira <caiooliveirafarias0@gmail.com>
Co-committed-by: Caio Oliveira <caiooliveirafarias0@gmail.com>
2025-12-22 02:58:40 +01:00
2 changed files with 42 additions and 79 deletions

src/core/arm/nce/arm_nce.cpp (View File)

@@ -391,28 +391,15 @@ const std::size_t CACHE_PAGE_SIZE = 4096;
 void ArmNce::ClearInstructionCache() {
 #ifdef __aarch64__
-    // Use IC IALLU to actually invalidate L1 instruction cache
+    // Ensure all previous memory operations complete
     asm volatile("dsb ish\n"
                  "ic iallu\n"
                  "dsb ish\n"
                  "isb" ::: "memory");
 #endif
 }
 
 void ArmNce::InvalidateCacheRange(u64 addr, std::size_t size) {
-#ifdef ARCHITECTURE_arm64
-    // Invalidate instruction cache for specific range instead of full flush
-    constexpr u64 cache_line_size = 64;
-    const u64 aligned_addr = addr & ~(cache_line_size - 1);
-    const u64 end_addr = (addr + size + cache_line_size - 1) & ~(cache_line_size - 1);
-    asm volatile("dsb ish" ::: "memory");
-    for (u64 i = aligned_addr; i < end_addr; i += cache_line_size) {
-        asm volatile("ic ivau, %0" :: "r"(i) : "memory");
-    }
-    asm volatile("dsb ish\n"
-                 "isb" ::: "memory");
-#endif
+    this->ClearInstructionCache();
 }
 
 } // namespace Core
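
For context on what the revert trades away, the standalone sketch below (not Eden code) contrasts the two maintenance strategies: the full IC IALLU flush the revert restores, and the per-line IC IVAU loop it removes. The names FlushICacheAll/FlushICacheRange are illustrative, and the sketch assumes an AArch64 target with a fixed 64-byte instruction-cache line; that fixed assumption is itself a known hazard of the ranged variant, since the real line size should come from CTR_EL0.

    #include <cstddef>
    #include <cstdint>

    // Full flush: IC IALLU drops this core's entire L1 instruction cache.
    void FlushICacheAll() {
    #ifdef __aarch64__
        asm volatile("dsb ish\n"   // complete pending writes so new code is visible
                     "ic iallu\n"  // invalidate the whole I-cache
                     "dsb ish\n"
                     "isb" ::: "memory");
    #endif
    }

    // Ranged flush: one IC IVAU per cache line overlapping [addr, addr + size).
    // The masks round outward: for addr = 0x1234, size = 0x10,
    // begin = 0x1234 & ~63 = 0x1200 and end = (0x1244 + 63) & ~63 = 0x1280,
    // so the two lines at 0x1200 and 0x1240 are invalidated.
    void FlushICacheRange(std::uintptr_t addr, std::size_t size) {
    #ifdef __aarch64__
        constexpr std::uintptr_t line = 64; // assumed, not read from CTR_EL0
        const std::uintptr_t begin = addr & ~(line - 1);
        const std::uintptr_t end = (addr + size + line - 1) & ~(line - 1);
        asm volatile("dsb ish" ::: "memory");
        for (std::uintptr_t p = begin; p < end; p += line) {
            asm volatile("ic ivau, %0" :: "r"(p) : "memory");
        }
        asm volatile("dsb ish\n"
                     "isb" ::: "memory");
    #endif
    }

Reading the line size from CTR_EL0, or simply using the compiler's __builtin___clear_cache, avoids baking the 64-byte assumption into the loop; the full flush sidesteps the question entirely at the cost of evicting unrelated code.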

src/video_core/dma_pusher.cpp (View File)

@@ -43,90 +43,66 @@ void DmaPusher::DispatchCalls() {
 bool DmaPusher::Step() {
     if (!ib_enable || dma_pushbuffer.empty()) {
         // pushbuffer empty and IB empty or nonexistent - nothing to do
         return false;
     }
 
-    CommandList& command_list{dma_pushbuffer.front()};
+    CommandList& command_list = dma_pushbuffer.front();
 
-    ASSERT_OR_EXECUTE(
-        command_list.command_lists.size() || command_list.prefetch_command_list.size(), {
-            // Somehow the command_list is empty, in order to avoid a crash
-            // We ignore it and assume its size is 0.
-            dma_pushbuffer.pop();
-            dma_pushbuffer_subindex = 0;
-            return true;
-        });
+    const size_t prefetch_size = command_list.prefetch_command_list.size();
+    const size_t command_list_size = command_list.command_lists.size();
 
-    if (command_list.prefetch_command_list.size()) {
-        // Prefetched command list from nvdrv, used for things like synchronization
-        ProcessCommands(VideoCommon::FixSmallVectorADL(command_list.prefetch_command_list));
+    if (prefetch_size == 0 && command_list_size == 0) {
         dma_pushbuffer.pop();
-    } else {
-        const CommandListHeader command_list_header{
-            command_list.command_lists[dma_pushbuffer_subindex++]};
+        dma_pushbuffer_subindex = 0;
+        return true;
+    }
 
-        if (signal_sync) {
-            std::unique_lock lk(sync_mutex);
-            sync_cv.wait(lk, [this]() { return synced; });
-            signal_sync = false;
-            synced = false;
-        }
+    if (prefetch_size > 0) {
+        ProcessCommands(command_list.prefetch_command_list);
+        dma_pushbuffer.pop();
+        return true;
+    }
 
-        dma_state.dma_get = command_list_header.addr;
+    auto& current_command = command_list.command_lists[dma_pushbuffer_subindex];
+    const CommandListHeader& header = current_command;
+    dma_state.dma_get = header.addr;
 
-        if (command_list_header.size == 0) {
-            return true;
-        }
+    if (signal_sync && !synced) {
+        std::unique_lock lk(sync_mutex);
+        sync_cv.wait(lk, [this]() { return synced; });
+        signal_sync = false;
+        synced = false;
+    }
 
-        // Push buffer non-empty, read a word
-        if (dma_state.method >= MacroRegistersStart) {
-            if (subchannels[dma_state.subchannel]) {
-                subchannels[dma_state.subchannel]->current_dirty = memory_manager.IsMemoryDirty(
-                    dma_state.dma_get, command_list_header.size * sizeof(u32));
-            }
-        }
+    if (header.size > 0 && dma_state.method >= MacroRegistersStart && subchannels[dma_state.subchannel]) {
+        subchannels[dma_state.subchannel]->current_dirty = memory_manager.IsMemoryDirty(dma_state.dma_get, header.size * sizeof(u32));
+    }
 
-        const auto safe_process = [&] {
-            Tegra::Memory::GpuGuestMemory<Tegra::CommandHeader,
-                                          Tegra::Memory::GuestMemoryFlags::SafeRead>
-                headers(memory_manager, dma_state.dma_get, command_list_header.size,
-                        &command_headers);
-            ProcessCommands(headers);
-        };
-        const auto unsafe_process = [&] {
-            Tegra::Memory::GpuGuestMemory<Tegra::CommandHeader,
-                                          Tegra::Memory::GuestMemoryFlags::UnsafeRead>
-                headers(memory_manager, dma_state.dma_get, command_list_header.size,
-                        &command_headers);
-            ProcessCommands(headers);
-        };
-        const bool use_safe = Settings::IsDMALevelDefault() ? (Settings::IsGPULevelMedium() || Settings::IsGPULevelHigh()) : Settings::IsDMALevelSafe();
-        if (use_safe) {
-            safe_process();
-        } else {
-            unsafe_process();
-        }
+    if (header.size > 0) {
+        if (Settings::IsDMALevelDefault() ? (Settings::IsGPULevelMedium() || Settings::IsGPULevelHigh()) : Settings::IsDMALevelSafe()) {
+            Tegra::Memory::GpuGuestMemory<Tegra::CommandHeader, Tegra::Memory::GuestMemoryFlags::SafeRead> headers(memory_manager, dma_state.dma_get, header.size, &command_headers);
+            ProcessCommands(headers);
+        } else {
+            Tegra::Memory::GpuGuestMemory<Tegra::CommandHeader, Tegra::Memory::GuestMemoryFlags::UnsafeRead> headers(memory_manager, dma_state.dma_get, header.size, &command_headers);
+            ProcessCommands(headers);
+        }
+    }
 
-        if (dma_pushbuffer_subindex >= command_list.command_lists.size()) {
-            // We've gone through the current list, remove it from the queue
-            dma_pushbuffer.pop();
-            dma_pushbuffer_subindex = 0;
-        } else if (command_list.command_lists[dma_pushbuffer_subindex].sync && Settings::values.sync_memory_operations.GetValue()) {
-            signal_sync = true;
-        }
+    if (++dma_pushbuffer_subindex >= command_list_size) {
+        dma_pushbuffer.pop();
+        dma_pushbuffer_subindex = 0;
+    } else {
+        signal_sync = command_list.command_lists[dma_pushbuffer_subindex].sync && Settings::values.sync_memory_operations.GetValue();
+    }
 
-        if (signal_sync) {
-            rasterizer->SignalFence([this]() {
-                std::scoped_lock lk(sync_mutex);
-                synced = true;
-                sync_cv.notify_all();
-            });
-        }
-    }
+    if (signal_sync) {
+        rasterizer->SignalFence([this]() {
+            std::scoped_lock lk(sync_mutex);
+            synced = true;
+            sync_cv.notify_all();
+        });
+    }
 
     return true;
 }
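
Taken together, the DmaPusher changes keep the producer running and defer the blocking wait: Step registers a fence through rasterizer->SignalFence, carries on, and only blocks on a later call once a sync point is outstanding and the fence has not yet fired. A minimal standalone sketch of that handshake follows (not Eden code: the FenceSync type, its method names, and the simulated GPU thread are illustrative stand-ins for sync_mutex, sync_cv, signal_sync, synced, and the rasterizer).

    #include <condition_variable>
    #include <functional>
    #include <mutex>
    #include <thread>

    // One condition variable guarded by one mutex, with two flags mirroring
    // signal_sync/synced in the diff above.
    struct FenceSync {
        std::mutex mutex;
        std::condition_variable cv;
        bool signal_requested = false; // a command entry asked for a sync point
        bool signaled = false;         // the fence callback has fired

        // Producer side: register the fence and keep going instead of blocking now.
        void RequestFence(const std::function<void(std::function<void()>)>& signal_fence) {
            signal_requested = true;
            signal_fence([this] {
                std::scoped_lock lk{mutex};
                signaled = true;
                cv.notify_all();
            });
        }

        // Later: block only if a fence is outstanding, then reset both flags.
        void WaitIfRequested() {
            if (!signal_requested) {
                return; // fast path: nothing pending, no lock taken
            }
            std::unique_lock lk{mutex};
            cv.wait(lk, [this] { return signaled; });
            signal_requested = false;
            signaled = false;
        }
    };

    int main() {
        FenceSync sync;
        std::thread gpu; // stands in for the rasterizer executing the fence
        sync.RequestFence([&](std::function<void()> on_signal) {
            gpu = std::thread{std::move(on_signal)};
        });
        sync.WaitIfRequested(); // returns once the callback flipped `signaled`
        gpu.join();
    }

The !synced guard in the rewritten wait condition appears to play the same role as the fast path here: when the fence has already fired by the time the next Step runs, the pusher skips the lock and never blocks.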