Compare commits


7 Commits

Author SHA1 Message Date
PavelBARABANOV
60b84b169f a small fix to the start menu 2025-12-11 18:34:26 +03:00
MaranBr
e2f175f82d Fix GPU mode descriptions 2025-12-10 08:15:25 -04:00
PavelBARABANOV
4edb70c8f0 [vk] Use point filter for D32->R32 blits to fix burnout blur (#3088)
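A hedged sketch of the technique the title names: in yuzu-derived renderers, D32 -> R32 "blits" are format conversions done with a sampled fullscreen draw rather than vkCmdBlitImage. Linear filtering blends neighboring depth texels, inventing values that never existed in the source (the reported blur); a point/nearest filter copies exact texels. The helper name below is illustrative, not Eden's actual code:

#include <vulkan/vulkan.h>

VkSamplerCreateInfo MakeDepthConversionSamplerInfo() {
    VkSamplerCreateInfo info{};
    info.sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO;
    info.magFilter = VK_FILTER_NEAREST;  // point filter: no cross-texel mixing
    info.minFilter = VK_FILTER_NEAREST;
    info.mipmapMode = VK_SAMPLER_MIPMAP_MODE_NEAREST;
    info.addressModeU = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE;
    info.addressModeV = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE;
    info.addressModeW = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE;
    return info;
}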
Reviewed-on: https://git.eden-emu.dev/eden-emu/eden/pulls/3088
Reviewed-by: crueter <crueter@eden-emu.dev>
Reviewed-by: CamilleLaVey <camillelavey99@gmail.com>
Co-authored-by: PavelBARABANOV <pavelbarabanov94@gmail.com>
Co-committed-by: PavelBARABANOV <pavelbarabanov94@gmail.com>
2025-12-09 21:11:08 +01:00
MaranBr
9da38715fe [video_core] Rework GPU Accuracy levels and remove Early Release Fences toggle (#3129)
The GPU Accuracy level is now divided into Performance, Balanced, and Accurate.

1. Performance prioritizes speed at all costs. It is the fastest, but it can be unstable and may exhibit bugs (which is expected).

2. Balanced maintains excellent performance while being safer against bugs and shader corruption.

3. Accurate is the most precise and the most demanding on hardware. Only a few games still need this level to work properly.

The Release Fences Early toggle has also been removed by @PavelBARABANOV, as it is no longer needed.
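A minimal sketch of how the renamed levels line up, based on the ENUM(GpuAccuracy, Low, Medium, High) and IsGPULevelLow/Medium/High helpers in the diffs below; the branch bodies are placeholders, not Eden's actual call sites:

#include <cstdio>

enum class GpuAccuracy { Low, Medium, High };  // was Normal / High / Extreme

void ApplyAccuracy(GpuAccuracy level) {
    switch (level) {
    case GpuAccuracy::Low:     // "Performance": speed at all costs
        std::puts("skip costly synchronization work");
        break;
    case GpuAccuracy::Medium:  // "Balanced": fast, but guards shader integrity
        std::puts("synchronize only where corruption is likely");
        break;
    case GpuAccuracy::High:    // "Accurate": full precision, highest hardware cost
        std::puts("fully synchronize GPU and memory state");
        break;
    }
}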

Co-authored-by: PavelBARABANOV <pavelbarabanov94@gmail.com>
Reviewed-on: https://git.eden-emu.dev/eden-emu/eden/pulls/3129
Reviewed-by: Caio Oliveira <caiooliveirafarias0@gmail.com>
Reviewed-by: Maufeat <sahyno1996@gmail.com>
Co-authored-by: MaranBr <maranbr@outlook.com>
Co-committed-by: MaranBr <maranbr@outlook.com>
2025-12-09 18:11:05 +01:00
MaranBr
7157d5167e [video_core] Fix inconsistency between EDS and VIDS settings (#3148)
Fixes an issue where selecting EDS 0 for a specific game would incorrectly disable VIDS globally.
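A hypothetical sketch of the bug class only, assuming EDS and VIDS refer to the Extended Dynamic State level and the Vertex Input Dynamic State toggle; names and structure are illustrative, not Eden's settings code:

#include <optional>

struct GraphicsSettings {
    int eds_global = 3;       // hypothetical global Extended Dynamic State level
    bool vids_global = true;  // hypothetical global Vertex Input Dynamic State toggle
};

bool VidsEnabledFor(const GraphicsSettings& global, std::optional<int> eds_per_game) {
    const int eds = eds_per_game.value_or(global.eds_global);
    // Derive the effective value locally; writing it back into vids_global is
    // what would make a per-game EDS 0 disable VIDS everywhere.
    return global.vids_global && eds > 0;
}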

Reviewed-on: https://git.eden-emu.dev/eden-emu/eden/pulls/3148
Reviewed-by: Lizzie <lizzie@eden-emu.dev>
Co-authored-by: MaranBr <maranbr@outlook.com>
Co-committed-by: MaranBr <maranbr@outlook.com>
2025-12-09 17:57:03 +01:00
lizzie
77d83b008a [dynarmic] avoid IsImmediate() comical call recursion (#3145)
Signed-off-by: lizzie <lizzie@eden-emu.dev>
Reviewed-on: https://git.eden-emu.dev/eden-emu/eden/pulls/3145
Reviewed-by: crueter <crueter@eden-emu.dev>
Reviewed-by: Caio Oliveira <caiooliveirafarias0@gmail.com>
Co-authored-by: lizzie <lizzie@eden-emu.dev>
Co-committed-by: lizzie <lizzie@eden-emu.dev>
2025-12-09 05:28:01 +01:00
lizzie
69a84ee0a6 [dynarmic] remove reg_alloc from all arguments on x86 emitter (#3150)
From my tests this decreases JIT latency twofold, though it may be placebo.
Saving reg_alloc while having it readily available is certainly a very interesting choice... after all, saving it onto %rdi is way cheaper, isn't it? :)
Please test for any performance regressions; I got +20 FPS on Rain World (unlocked) from this change alone.
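A minimal sketch of the shape of this refactor, inferred from the diffs further down; the types are stand-ins, not dynarmic's real definitions:

struct BlockOfCode {};  // stand-in for dynarmic's code buffer

struct RegAlloc {
    // After: the code pointer is a per-call parameter, not a stored member.
    int ScratchGpr(BlockOfCode* /*code*/) const { return 0; }  // stub body
};

struct Argument {
    // After: a plain value type with no RegAlloc& member, so building the
    // four-entry ArgumentInfo no longer saves the allocator reference each time.
    bool allocated = false;
    bool IsInGpr(RegAlloc& /*reg_alloc*/) const {
        return false;  // stub; the real check queries reg_alloc's location table
    }
};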

Reviewed-on: https://git.eden-emu.dev/eden-emu/eden/pulls/3150
Reviewed-by: crueter <crueter@eden-emu.dev>
Reviewed-by: Caio Oliveira <caiooliveirafarias0@gmail.com>
Co-authored-by: lizzie <lizzie@eden-emu.dev>
Co-committed-by: lizzie <lizzie@eden-emu.dev>
2025-12-09 03:53:58 +01:00
73 changed files with 2379 additions and 2675 deletions

View File

@@ -143,21 +143,8 @@ class ReleaseAdapter(
binding.containerDownloads.removeAllViews()
release.artifacts.forEach { artifact ->
val alreadyInstalled = try {
// Prefer fast check via ViewModel list; fallback to helper if needed
driverViewModel.driverData.any {
File(it.first).name.equals(artifact.name, ignoreCase = true)
} || GpuDriverHelper.isDriverZipInstalledByName(artifact.name)
} catch (_: Exception) {
false
}
val button = MaterialButton(binding.root.context).apply {
text = if (alreadyInstalled) {
context.getString(R.string.installed_label, artifact.name)
} else {
artifact.name
}
text = artifact.name
setTextAppearance(
com.google.android.material.R.style.TextAppearance_Material3_LabelLarge
)
@@ -167,7 +154,7 @@ class ReleaseAdapter(
com.google.android.material.R.color.m3_button_background_color_selector
)
)
setIconResource(if (alreadyInstalled) R.drawable.ic_check else R.drawable.ic_import)
setIconResource(R.drawable.ic_import)
iconTint = ColorStateList.valueOf(
MaterialColors.getColor(
this,
@@ -180,22 +167,7 @@ class ReleaseAdapter(
ViewGroup.LayoutParams.MATCH_PARENT,
ViewGroup.LayoutParams.WRAP_CONTENT
)
isEnabled = !alreadyInstalled
setOnClickListener {
// Double-check just before starting (race-proof)
if (GpuDriverHelper.isDriverZipInstalledByName(artifact.name)) {
Toast.makeText(
context,
context.getString(R.string.driver_already_installed),
Toast.LENGTH_SHORT
).show()
// Update UI to reflect installed state
this.isEnabled = false
this.text = context.getString(R.string.installed_label, artifact.name)
this.setIconResource(R.drawable.ic_check)
return@setOnClickListener
}
val dialogBinding =
DialogProgressBinding.inflate(LayoutInflater.from(context))
dialogBinding.progressBar.isIndeterminate = true
@@ -261,10 +233,6 @@ class ReleaseAdapter(
driverViewModel.onDriverAdded(Pair(driverPath, driverData))
progressDialog.dismiss()
// Update button to installed state
this@apply.isEnabled = false
this@apply.text = context.getString(R.string.installed_label, artifact.name)
this@apply.setIconResource(R.drawable.ic_check)
Toast.makeText(
context,
context.getString(

View File

@@ -25,7 +25,6 @@ enum class BooleanSetting(override val key: String) : AbstractBooleanSetting {
RENDERER_ASYNCHRONOUS_SHADERS("use_asynchronous_shaders"),
RENDERER_FAST_GPU("use_fast_gpu_time"),
RENDERER_REACTIVE_FLUSHING("use_reactive_flushing"),
RENDERER_EARLY_RELEASE_FENCES("early_release_fences"),
SYNC_MEMORY_OPERATIONS("sync_memory_operations"),
BUFFER_REORDER_DISABLE("disable_buffer_reorder"),
RENDERER_DEBUG("debug"),

View File

@@ -689,13 +689,6 @@ abstract class SettingsItem(
descriptionId = R.string.renderer_reactive_flushing_description
)
)
put(
SwitchSetting(
BooleanSetting.RENDERER_EARLY_RELEASE_FENCES,
titleId = R.string.renderer_early_release_fences,
descriptionId = R.string.renderer_early_release_fences_description
)
)
put(
SwitchSetting(
BooleanSetting.SYNC_MEMORY_OPERATIONS,

View File

@@ -460,7 +460,6 @@ class SettingsFragmentPresenter(
add(IntSetting.RENDERER_SAMPLE_SHADING_FRACTION.key)
add(HeaderSetting(R.string.veil_renderer))
add(BooleanSetting.RENDERER_EARLY_RELEASE_FENCES.key)
add(IntSetting.DMA_ACCURACY.key)
add(BooleanSetting.BUFFER_REORDER_DISABLE.key)
add(BooleanSetting.RENDERER_FAST_GPU.key)

View File

@@ -238,18 +238,4 @@ object GpuDriverHelper {
driverStorageDirectory.mkdirs()
}
}
/**
* Checks if a driver zip with the given filename is already present and valid in the
* internal driver storage directory. Validation requires a readable meta.json with a name.
*/
fun isDriverZipInstalledByName(fileName: String): Boolean {
// Normalize separators in case upstream sent a path
val baseName = fileName.substringAfterLast('/')
.substringAfterLast('\\')
val candidate = File("$driverStoragePath$baseName")
if (!candidate.exists() || candidate.length() == 0L) return false
val metadata = getMetadataFromZip(candidate)
return metadata.name != null
}
}

View File

@@ -1,6 +1,6 @@
<com.google.android.material.button.MaterialButton xmlns:android="http://schemas.android.com/apk/res/android"
xmlns:app="http://schemas.android.com/apk/res-auto"
android:layout_width="170dp"
android:layout_width="175dp"
android:layout_height="55dp"
android:layout_marginBottom="16dp"
app:iconTint="?attr/colorOnPrimary"

View File

@@ -55,7 +55,7 @@
<com.google.android.material.textview.MaterialTextView
android:id="@+id/text_confirmation"
style="@style/SynthwaveText.Accent"
android:layout_width="213dp"
android:layout_width="wrap_content"
android:layout_height="226dp"
android:paddingHorizontal="16dp"
android:paddingTop="24dp"

View File

@@ -98,8 +98,6 @@
<string name="sample_shading_fraction_description">كثافة تمرير تظليل العينة. تؤدي القيم الأعلى إلى تحسين الجودة بشكل أكبر، ولكنها تقلل أيضًا من الأداء إلى حد كبير.</string>
<string name="veil_renderer">العارض</string>
<string name="renderer_early_release_fences">إطلاق الأسوار مبكرًا</string>
<string name="renderer_early_release_fences_description">يساعد في إصلاح مشكلة 0 إطار في الثانية في ألعاب مثل DKCR:HD وSubnautica Below Zero وOri 2، ولكن قد يتسبب في تعطيل التحميل أو الأداء في ألعاب Unreal Engine.</string>
<string name="sync_memory_operations">مزامنة عمليات الذاكرة</string>
<string name="sync_memory_operations_description">يضمن اتساق البيانات بين عمليات الحوسبة والذاكرة. هذا الخيار قد يحل المشكلات في بعض الألعاب، ولكن قد يقلل الأداء في بعض الحالات. يبدو أن الألعاب التي تستخدم Unreal Engine 4 هي الأكثر تأثرًا.</string>
<string name="buffer_reorder_disable">تعطيل إعادة ترتيب المخزن المؤقت</string>
@@ -880,11 +878,6 @@
<string name="renderer_vulkan">Vulkan</string>
<string name="renderer_none">لاشيء</string>
<!-- Renderer Accuracy -->
<string name="renderer_accuracy_normal">عادي</string>
<string name="renderer_accuracy_high">عالي</string>
<string name="renderer_accuracy_extreme">أقصى</string>
<!-- DMA Accuracy -->
<string name="dma_accuracy_default">افتراضي</string>
<string name="dma_accuracy_unsafe">غير آمن</string>

View File

@@ -76,8 +76,6 @@
<string name="sample_shading_fraction_description">چڕی تێپەڕاندنی سێبەرکردنی نموونە. بەهای زیاتر کوالێتی باشتر دەکات بەڵام کارایی زیاتر کەم دەکاتەوە.</string>
<string name="veil_renderer">رێندرەر</string>
<string name="renderer_early_release_fences">زێدەکردنی پەرستارەکان زووتر</string>
<string name="renderer_early_release_fences_description">یارمەتی دەدات لە چارەسەری 0 FPS لە یارییەکانی وەک DKCR:HD، Subnautica Below Zero و Ori 2، بەڵام ڕەنگە بارکردن یان کارایی لە یارییەکانی Unreal Engine تێکبدات.</string>
<string name="sync_memory_operations">هاوبەشیی کردارەکانی بیرگە</string>
<string name="sync_memory_operations_description">دڵنیایی داتا لە نێوان کردارەکانی کۆمپیوتەر و بیرگە. ئەم هەڵبژاردە کێشەکان لە هەندێک یاری چارەسەر دەکات، بەڵام لە هەندێک حاڵەت کارایی کەم دەکاتەوە. وا دیارە یارییەکانی Unreal Engine 4 زۆرترین کاریگەریان هەیە.</string>
<string name="buffer_reorder_disable">ڕێکخستنەوەی بافر ناچالاک بکە</string>
@@ -625,9 +623,6 @@
<string name="renderer_vulkan">ڤوڵکان</string>
<string name="renderer_none">هیچ</string>
<!-- Renderer Accuracy -->
<string name="renderer_accuracy_normal">ئاسایی</string>
<string name="renderer_accuracy_high">بەرز</string>
<!-- DMA Accuracy -->
<string name="dma_accuracy_default">بنەڕەتی</string>
<!-- ASTC Decoding Method -->

View File

@@ -76,8 +76,6 @@
<string name="sample_shading_fraction_description">Intenzita průchodu stínování vzorku. Vyšší hodnoty zlepšují kvalitu, ale také výrazněji snižují výkon.</string>
<string name="veil_renderer">Renderer</string>
<string name="renderer_early_release_fences">Uvolnit ploty brzy</string>
<string name="renderer_early_release_fences_description">Pomáhá opravit 0 FPS v hrách jako DKCR:HD, Subnautica Below Zero a Ori 2, ale může narušit načítání nebo výkon v hrách na Unreal Engine.</string>
<string name="sync_memory_operations">Synchronizace paměťových operací</string>
<string name="sync_memory_operations_description">Zajišťuje konzistenci dat mezi výpočetními a paměťovými operacemi. Tato volba by měla opravit problémy v některých hrách, ale může v některých případech snížit výkon. Nejvíce postižené se zdají být hry s Unreal Engine 4.</string>
<string name="buffer_reorder_disable">Zakázat přeřazování vyrovnávací paměti</string>
@@ -612,9 +610,6 @@
<string name="renderer_vulkan">Vulkan</string>
<string name="renderer_none">Žádné</string>
<!-- Renderer Accuracy -->
<string name="renderer_accuracy_normal">Normální</string>
<string name="renderer_accuracy_high">Vysoká</string>
<!-- DMA Accuracy -->
<string name="dma_accuracy_default">Výchozí</string>
<!-- ASTC Decoding Method -->

View File

@@ -84,8 +84,6 @@
<string name="sample_shading_fraction_description">Die Intensität des Sample-Shading-Durchgangs. Höhere Werte verbessern die Qualität stärker, beeinträchtigen aber auch die Leistung stärker.</string>
<string name="veil_renderer">Renderer</string>
<string name="renderer_early_release_fences">Zäune früher freigeben</string>
<string name="renderer_early_release_fences_description">Behebt 0 FPS in Spielen wie DKCR:HD, Subnautica Below Zero und Ori 2, kann aber Ladezeiten oder Performance in Unreal Engine-Spielen beeinträchtigen.</string>
<string name="sync_memory_operations">Speicheroperationen synchronisieren</string>
<string name="sync_memory_operations_description">Stellt die Datenkonsistenz zwischen Compute- und Speicheroperationen sicher. Diese Option sollte Probleme in einigen Spielen beheben, kann aber in einigen Fällen die Leistung verringern. Spiele mit Unreal Engine 4 scheinen am stärksten betroffen zu sein.</string>
<string name="buffer_reorder_disable">Puffer-Neuanordnung deaktivieren</string>
@@ -790,9 +788,6 @@ Wirklich fortfahren?</string>
<string name="renderer_vulkan">Vulkan</string>
<string name="renderer_none">Keiner</string>
<!-- Renderer Accuracy -->
<string name="renderer_accuracy_normal">Normal</string>
<string name="renderer_accuracy_high">Hoch</string>
<!-- DMA Accuracy -->
<string name="dma_accuracy_default">Standard</string>
<!-- ASTC Decoding Method -->

View File

@@ -98,8 +98,6 @@
<string name="sample_shading_fraction_description">La intensidad del paso de sombreado de la muestra. Los valores más altos mejoran más la calidad, pero también reducen el rendimiento en mayor medida.</string>
<string name="veil_renderer">Renderizador</string>
<string name="renderer_early_release_fences">Liberar las vallas antes</string>
<string name="renderer_early_release_fences_description">Ayuda a arreglar 0 FPS en juegos como DKCR:HD, Subnautica Below Zero y Ori 2, pero puede romper la carga o el rendimiento en juegos de Unreal Engine.</string>
<string name="sync_memory_operations">Sincronizar operaciones de memoria</string>
<string name="sync_memory_operations_description">Garantiza la consistencia de los datos entre las operaciones de computación y memoria. Esta opción debería solucionar problemas en algunos juegos, pero también puede reducir el rendimiento en algunos casos. Los juegos de Unreal Engine 4 a menudo ven los cambios más significativos de los mismos.</string>
<string name="buffer_reorder_disable">Desactivar reordenamiento de búfer</string>
@@ -839,11 +837,6 @@
<string name="renderer_vulkan">Vulkan</string>
<string name="renderer_none">Ninguno</string>
<!-- Renderer Accuracy -->
<string name="renderer_accuracy_normal">Normal</string>
<string name="renderer_accuracy_high">Alto</string>
<string name="renderer_accuracy_extreme">Extremo</string>
<!-- DMA Accuracy -->
<string name="dma_accuracy_default">Predeterminado</string>
<string name="dma_accuracy_unsafe">Inseguro</string>

View File

@@ -65,8 +65,6 @@
<string name="veil_misc">پردازنده و حافظه</string>
<string name="eden_veil">پرده عدن</string>
<string name="eden_veil_description">تنظیمات آزمایشی برای بهبود عملکرد و قابلیت. این تنظیمات ممکن است باعث نمایش صفحه سیاه یا سایر مشکلات بازی شود.</string>
<string name="renderer_early_release_fences">رهاسازی حصارها زودتر</string>
<string name="renderer_early_release_fences_description">به رفع مشکل 0 فریم بر ثانیه در بازی‌هایی مانند DKCR:HD، Subnautica Below Zero و Ori 2 کمک می‌کند، اما ممکن است بارگذاری یا عملکرد بازی‌های Unreal Engine را مختل کند.</string>
<string name="sync_memory_operations">همگام‌سازی عملیات حافظه</string>
<string name="sync_memory_operations_description">اطمینان از سازگاری داده‌ها بین عملیات محاسباتی و حافظه. این گزینه ممکن است مشکلات برخی بازی‌ها را رفع کند، اما در برخی موارد ممکن است عملکرد را کاهش دهد. به نظر می‌رسد بازی‌های با Unreal Engine 4 بیشترین تأثیر را داشته باشند.</string>
<string name="buffer_reorder_disable">غیرفعال کردن مرتب‌سازی مجدد بافر</string>
@@ -760,11 +758,6 @@
<string name="renderer_vulkan">Vulkan</string>
<string name="renderer_none">هیچ</string>
<!-- Renderer Accuracy -->
<string name="renderer_accuracy_normal">معمولی</string>
<string name="renderer_accuracy_high">زیاد</string>
<string name="renderer_accuracy_extreme">افراطی (کند)</string>
<!-- DMA Accuracy -->
<string name="dma_accuracy_default">پیش فرض</string>
<string name="dma_accuracy_unsafe">ناایمن (سریع)</string>

View File

@@ -98,8 +98,6 @@
<string name="sample_shading_fraction_description">L\'intensité de la passe d\'ombrage d\'échantillon. Des valeurs plus élevées améliorent davantage la qualité mais réduisent aussi plus fortement les performances.</string>
<string name="veil_renderer">Rendu</string>
<string name="renderer_early_release_fences">Libérer les barrières plus tôt</string>
<string name="renderer_early_release_fences_description">Résout les problèmes de 0 FPS dans des jeux comme DKCR:HD, Subnautica Below Zero et Ori 2, mais peut perturber le chargement ou les performances des jeux Unreal Engine.</string>
<string name="sync_memory_operations">Synchroniser les opérations mémoire</string>
<string name="sync_memory_operations_description">Garantit la cohérence des données entre les opérations de calcul et de mémoire. Cette option devrait résoudre les problèmes dans certains jeux, mais peut réduire les performances dans certains cas. Les jeux utilisant Unreal Engine 4 semblent être les plus affectés.</string>
<string name="buffer_reorder_disable">Désactiver le réordonnancement du tampon</string>
@@ -852,11 +850,6 @@
<string name="renderer_vulkan">Vulkan</string>
<string name="renderer_none">Aucune</string>
<!-- Renderer Accuracy -->
<string name="renderer_accuracy_normal">Normal</string>
<string name="renderer_accuracy_high">Haut</string>
<string name="renderer_accuracy_extreme">Extrême</string>
<!-- DMA Accuracy -->
<string name="dma_accuracy_default">Défaut</string>
<string name="dma_accuracy_unsafe">Dangereux</string>

View File

@@ -76,8 +76,6 @@
<string name="sample_shading_fraction_description">עוצמת מעבר ההצללה לדוגמה. ערכים גבוהים יותר משפרים את האיכות יותר אך גם מפחיתים את הביצועים במידה רבה יותר.</string>
<string name="veil_renderer">רנדרר</string>
<string name="renderer_early_release_fences">שחרר גדרות מוקדם</string>
<string name="renderer_early_release_fences_description">עוזר לתקן 0 FPS במשחקים כמו DKCR:HD, Subnautica Below Zero ו-Ori 2, אך עלול לפגוע בטעינה או בביצועים במשחקי Unreal Engine.</string>
<string name="sync_memory_operations">סנכרון פעולות זיכרון</string>
<string name="sync_memory_operations_description">מבטיח עקביות נתונים בין פעולות חישוב וזיכרון. אפשרות זו אמורה לתקן בעיות במשחקים מסוימים, אך עלולה להפחית ביצועים במקרים מסוימים. נראה שהמשחקים עם Unreal Engine 4 הם המושפעים ביותר.</string>
<string name="buffer_reorder_disable">השבת סידור מחדש של חוצץ</string>
@@ -666,9 +664,6 @@
<string name="renderer_vulkan">Vulkan</string>
<string name="renderer_none">אין שום דבר</string>
<!-- Renderer Accuracy -->
<string name="renderer_accuracy_normal">רגיל</string>
<string name="renderer_accuracy_high">גבוה</string>
<!-- DMA Accuracy -->
<string name="dma_accuracy_default">ברירת מחדל</string>
<!-- ASTC Decoding Method -->

View File

@@ -76,8 +76,6 @@
<string name="sample_shading_fraction_description">A mintavételezés árnyékolási lépés intenzitása. A magasabb értékek jobb minőséget eredményeznek, de nagyobb mértékben csökkentik a teljesítményt.</string>
<string name="veil_renderer">Megjelenítő</string>
<string name="renderer_early_release_fences">Korai kerítés-felszabadítás</string>
<string name="renderer_early_release_fences_description">Segít javítani a 0 FPS-t olyan játékokban, mint a DKCR:HD, Subnautica Below Zero és az Ori 2, de ronthatja az Unreal Engine játékok betöltését vagy teljesítményét.</string>
<string name="sync_memory_operations">Memória-műveletek szinkronizálása</string>
<string name="sync_memory_operations_description">Biztosítja az adatok konzisztenciáját a számítási és memória-műveletek között. Ez az opciónak javítania kell néhány játékban előforduló problémát, de bizonyos esetekben csökkentheti a teljesítményt. Az Unreal Engine 4-et használó játékok látszanak a legérintettebbek.</string>
<string name="buffer_reorder_disable">Puffer újrarendezés letiltása</string>
@@ -761,9 +759,6 @@
<string name="renderer_vulkan">Vulkan</string>
<string name="renderer_none">Nincs</string>
<!-- Renderer Accuracy -->
<string name="renderer_accuracy_normal">Normál</string>
<string name="renderer_accuracy_high">Magas</string>
<!-- DMA Accuracy -->
<string name="dma_accuracy_default">Alapértelmezett</string>
<!-- ASTC Decoding Method -->

View File

@@ -98,8 +98,6 @@
<string name="sample_shading_fraction_description">Intensitas proses pencahayaan sampel. Nilai lebih tinggi meningkatkan kualitas lebih baik tetapi juga mengurangi performa lebih besar.</string>
<string name="veil_renderer">Renderer</string>
<string name="renderer_early_release_fences">Lepas Pagar Lebih Awal</string>
<string name="renderer_early_release_fences_description">Membantu memperbaiki 0 FPS di game seperti DKCR:HD, Subnautica Below Zero dan Ori 2, tapi mungkin mengganggu loading atau performa di game Unreal Engine.</string>
<string name="sync_memory_operations">Sinkronisasi Operasi Memori</string>
<string name="sync_memory_operations_description">Memastikan konsistensi data antara operasi komputasi dan memori. Opsi ini seharusnya memperbaiki masalah di beberapa game, tetapi mungkin mengurangi performa dalam beberapa kasus. Game dengan Unreal Engine 4 tampaknya yang paling terpengaruh.</string>
<string name="buffer_reorder_disable">Nonaktifkan Penyusunan Ulang Buffer</string>
@@ -811,9 +809,6 @@
<string name="renderer_vulkan">Vulkan</string>
<string name="renderer_none">Tak ada</string>
<!-- Renderer Accuracy -->
<string name="renderer_accuracy_normal">Normal</string>
<string name="renderer_accuracy_high">Tinggi</string>
<!-- DMA Accuracy -->
<string name="dma_accuracy_default">Bawaan</string>
<!-- ASTC Decoding Method -->

View File

@@ -98,8 +98,6 @@
<string name="sample_shading_fraction_description">L\'intensità della passata di ombreggiatura campione. Valori più alti migliorano la qualità ma riducono maggiormente le prestazioni.</string>
<string name="veil_renderer">Renderer</string>
<string name="renderer_early_release_fences">Rilascia le barriere prima</string>
<string name="renderer_early_release_fences_description">Risolve problemi di 0 FPS in giochi come DKCR:HD, Subnautica Below Zero e Ori 2, ma potrebbe compromettere caricamento o prestazioni in giochi Unreal Engine.</string>
<string name="sync_memory_operations">Sincronizza operazioni di memoria</string>
<string name="sync_memory_operations_description">Garantisce la coerenza dei dati tra le operazioni di calcolo e memoria. Questa opzione dovrebbe risolvere problemi in alcuni giochi, ma potrebbe ridurre le prestazioni in alcuni casi. I giochi con Unreal Engine 4 sembrano essere i più colpiti.</string>
<string name="buffer_reorder_disable">Disabilita riordino buffer</string>
@@ -852,11 +850,6 @@
<string name="renderer_vulkan">Vulkan</string>
<string name="renderer_none">Nessuna</string>
<!-- Renderer Accuracy -->
<string name="renderer_accuracy_normal">Normale</string>
<string name="renderer_accuracy_high">Alta</string>
<string name="renderer_accuracy_extreme">Estrema</string>
<!-- DMA Accuracy -->
<string name="dma_accuracy_default">Predefinito</string>
<string name="dma_accuracy_unsafe">Non sicura</string>

View File

@@ -76,8 +76,6 @@
<string name="sample_shading_fraction_description">サンプルシェーディング処理の強度。高い値ほど品質は向上しますが、パフォーマンスも大きく低下します。</string>
<string name="veil_renderer">レンダラー</string>
<string name="renderer_early_release_fences">フェンスを早期に解放</string>
<string name="renderer_early_release_fences_description">DKCR:HD、Subnautica Below Zero、Ori 2などのゲームで0 FPSを修正しますが、Unreal Engineゲームの読み込みやパフォーマンスに影響する可能性があります。</string>
<string name="sync_memory_operations">メモリ操作の同期</string>
<string name="sync_memory_operations_description">計算処理とメモリ操作間のデータ一貫性を保証します。 このオプションは一部のゲームの問題を修正しますが、場合によってはパフォーマンスが低下する可能性があります。 Unreal Engine 4のゲームが最も影響を受けるようです。</string>
<string name="buffer_reorder_disable">バッファの再並べ替えを無効化</string>
@@ -666,9 +664,6 @@
<string name="renderer_vulkan">Vulkan</string>
<string name="renderer_none">なし</string>
<!-- Renderer Accuracy -->
<string name="renderer_accuracy_normal">標準</string>
<string name="renderer_accuracy_high"></string>
<!-- DMA Accuracy -->
<string name="dma_accuracy_default">デフォルト</string>
<!-- ASTC Decoding Method -->

View File

@@ -76,8 +76,6 @@
<string name="sample_shading_fraction_description">샘플 쉐이딩 패스의 강도. 값이 높을수록 품질이 더 향상되지만 성능도 더 크게 저하됩니다.</string>
<string name="veil_renderer">렌더러</string>
<string name="renderer_early_release_fences">펜스 조기 해제</string>
<string name="renderer_early_release_fences_description">DKCR:HD, Subnautica Below Zero, Ori 2 등의 게임에서 0 FPS 현상을 해결하지만, Unreal Engine 게임의 로딩이나 성능에 문제를 일으킬 수 있습니다.</string>
<string name="sync_memory_operations">메모리 작업 동기화</string>
<string name="sync_memory_operations_description">컴퓨팅 및 메모리 작업 간 데이터 일관성을 보장합니다. 이 옵션은 일부 게임의 문제를 해결할 수 있지만 경우에 따라 성능이 저하될 수 있습니다. Unreal Engine 4 게임이 가장 큰 영향을 받는 것으로 보입니다.</string>
<string name="buffer_reorder_disable">버퍼 재정렬 비활성화</string>
@@ -720,9 +718,6 @@
<string name="renderer_vulkan">Vulcan</string>
<string name="renderer_none">없음</string>
<!-- Renderer Accuracy -->
<string name="renderer_accuracy_normal">보통</string>
<string name="renderer_accuracy_high">높음</string>
<!-- DMA Accuracy -->
<string name="dma_accuracy_default">기본값</string>
<!-- ASTC Decoding Method -->

View File

@@ -76,8 +76,6 @@
<string name="sample_shading_fraction_description">Intensiteten til prøveskyggepasseringen. Høyere verdier forbedrer kvaliteten mer, men reduserer også ytelsen i større grad.</string>
<string name="veil_renderer">Renderer</string>
<string name="renderer_early_release_fences">Frigjør gjerder tidlig</string>
<string name="renderer_early_release_fences_description">Løser 0 FPS i spill som DKCR:HD, Subnautica Below Zero og Ori 2, men kan forårsake problemer med lasting eller ytelse i Unreal Engine-spill.</string>
<string name="sync_memory_operations">Synkroniser minneoperasjoner</string>
<string name="sync_memory_operations_description">Sikrer datakonsistens mellom beregnings- og minneoperasjoner. Dette alternativet bør fikse problemer i noen spill, men kan redusere ytelsen i noen tilfeller. Spill med Unreal Engine 4 ser ut til å være de mest berørte.</string>
<string name="buffer_reorder_disable">Deaktiver bufferomorganisering</string>
@@ -638,9 +636,6 @@
<string name="renderer_vulkan">Vulkan</string>
<string name="renderer_none">Ingen</string>
<!-- Renderer Accuracy -->
<string name="renderer_accuracy_normal">Normal</string>
<string name="renderer_accuracy_high">Høy</string>
<!-- DMA Accuracy -->
<string name="dma_accuracy_default">Standard</string>
<!-- ASTC Decoding Method -->

View File

@@ -98,8 +98,6 @@
<string name="sample_shading_fraction_description">Intensywność przebiegu cieniowania próbki. Wyższe wartości poprawiają jakość, ale także w większym stopniu zmniejszają wydajność.</string>
<string name="veil_renderer">Renderer</string>
<string name="renderer_early_release_fences">Wcześniejsze zwalnianie zabezpieczeń</string>
<string name="renderer_early_release_fences_description">Pomaga naprawić 0 FPS w grach takich jak DKCR:HD, Subnautica Below Zero i Ori 2, ale może zaburzyć ładowanie lub wydajność w grach Unreal Engine.</string>
<string name="sync_memory_operations">Synchronizuj operacje pamięci</string>
<string name="sync_memory_operations_description">Zapewnia spójność danych między operacjami obliczeniowymi i pamięciowymi. Ta opcja powinna naprawiać problemy w niektórych grach, ale może zmniejszyć wydajność w niektórych przypadkach. Gry z Unreal Engine 4 wydają się być najbardziej dotknięte.</string>
<string name="buffer_reorder_disable">Wyłącz przestawianie bufora</string>
@@ -876,11 +874,6 @@
<string name="renderer_vulkan">Vulkan</string>
<string name="renderer_none">Żadny</string>
<!-- Renderer Accuracy -->
<string name="renderer_accuracy_normal">Normalny</string>
<string name="renderer_accuracy_high">Wysoki</string>
<string name="renderer_accuracy_extreme">Ekstremalne</string>
<!-- DMA Accuracy -->
<string name="dma_accuracy_default">預設</string>
<string name="dma_accuracy_unsafe">Niezalecane</string>

View File

@@ -98,8 +98,6 @@
<string name="sample_shading_fraction_description">Fração de Sombreamento de Amostra: Define a intensidade do sample shading. Quanto maior, melhor a qualidade, mas maior o impacto no desempenho.</string>
<string name="veil_renderer">Renderizador</string>
<string name="renderer_early_release_fences">Release Fences Early</string>
<string name="renderer_early_release_fences_description">Liberar Cercas Antecipadamente: Ajuda a corrigir 0 FPS em jogos como DKCR:HD, Subnautica Below Zero e Ori 2, mas pode prejudicar o carregamento ou o desempenho em jogos feitos com Unreal Engine.</string>
<string name="sync_memory_operations">Sincronizar Operações de Memória</string>
<string name="sync_memory_operations_description">Garante a consistência de dados entre operações de computação e memória. Esta opção pode corrigir problemas em alguns jogos, mas também pode reduzir o desempenho, sendo os jogos da Unreal Engine 4 os mais afetados.</string>
<string name="buffer_reorder_disable">Desativar Reordenação de Buffers</string>
@@ -840,11 +838,6 @@ uma tentativa de mapeamento automático</string>
<string name="renderer_vulkan">Vulkan</string>
<string name="renderer_none">Nenhum</string>
<!-- Renderer Accuracy -->
<string name="renderer_accuracy_normal">Normal</string>
<string name="renderer_accuracy_high">Alta</string>
<string name="renderer_accuracy_extreme">Extrema</string>
<!-- DMA Accuracy -->
<string name="dma_accuracy_default">Padrão</string>
<string name="dma_accuracy_unsafe">Insegura</string>

View File

@@ -76,8 +76,6 @@
<string name="sample_shading_fraction_description">A intensidade da passagem de sombreamento de amostra. Valores mais elevados melhoram a qualidade, mas também reduzem o desempenho numa maior medida.</string>
<string name="veil_renderer">Renderizador</string>
<string name="renderer_early_release_fences">Libertar barreiras antecipadamente</string>
<string name="renderer_early_release_fences_description">Ajuda a corrigir 0 FPS em jogos como DKCR:HD, Subnautica Below Zero e Ori 2, mas pode afetar carregamento ou desempenho em jogos Unreal Engine.</string>
<string name="sync_memory_operations">Sincronizar Operações de Memória</string>
<string name="sync_memory_operations_description">Garante a consistência dos dados entre operações de computação e memória. Esta opção deve corrigir problemas em alguns jogos, mas pode reduzir o desempenho nalguns casos. Os jogos com Unreal Engine 4 parecem ser os mais afectados.</string>
<string name="buffer_reorder_disable">Desativar reordenação de buffer</string>
@@ -773,9 +771,6 @@ uma tentativa de mapeamento automático</string>
<string name="renderer_vulkan">Vulcano</string>
<string name="renderer_none">Nenhum</string>
<!-- Renderer Accuracy -->
<string name="renderer_accuracy_normal">Normal</string>
<string name="renderer_accuracy_high">Alto</string>
<!-- DMA Accuracy -->
<string name="dma_accuracy_default">Predefinido</string>
<!-- ASTC Decoding Method -->

View File

@@ -98,8 +98,6 @@
<string name="sample_shading_fraction_description">Интенсивность прохода сэмплового затенения. Более высокие значения улучшают качество, но и сильнее снижают производительность.</string>
<string name="veil_renderer">Рендеринг</string>
<string name="renderer_early_release_fences">Ранний релиз ограждений</string>
<string name="renderer_early_release_fences_description">Помогает исправить 0 FPS в играх типа DKCR:HD, Subnautica Below Zero и Ori 2, но может нарушить загрузку или производительность в играх на Unreal Engine.</string>
<string name="sync_memory_operations">Синхронизация операций с памятью</string>
<string name="sync_memory_operations_description">Обеспечивает согласованность данных между вычислительными операциями и операциями с памятью. Эта опция должна исправлять проблемы в некоторых играх, но может снижать производительность в некоторых случаях. Наиболее сильно это затрагивает игры на Unreal Engine 4.</string>
<string name="buffer_reorder_disable">Отключить переупорядочивание буфера</string>
@@ -874,11 +872,6 @@
<string name="renderer_vulkan">Vulkan</string>
<string name="renderer_none">Никакой</string>
<!-- Renderer Accuracy -->
<string name="renderer_accuracy_normal">Нормальная</string>
<string name="renderer_accuracy_high">Высокая</string>
<string name="renderer_accuracy_extreme">Экстрим</string>
<!-- DMA Accuracy -->
<string name="dma_accuracy_default">По умолчанию</string>
<string name="dma_accuracy_unsafe">Небезопасно</string>

View File

@@ -75,8 +75,6 @@
<string name="sample_shading_fraction_description">Интензитет проласка сенчења узорка. Веће вредности побољшавају квалитет више, али такође више смањују перформансе.</string>
<string name="veil_renderer">Рендерер</string>
<string name="renderer_early_release_fences">Ranije oslobađanje ograda</string>
<string name="renderer_early_release_fences_description">Pomaže u popravci 0 FPS u igrama kao što su DKCR:HD, Subnautica Below Zero i Ori 2, ali može oštetiti učitavanje ili performanse u Unreal Engine igrama.</string>
<string name="sync_memory_operations">Синхронизација меморијских операција</string>
<string name="sync_memory_operations_description">Осигурава конзистентност података између рачунских и меморијских операција. Ова опција би требало да поправи проблеме у неким играма, али може смањити перформансе у неким случајевима. Чини се да су игре са Unreal Engine 4 највише погођене.</string>
<string name="buffer_reorder_disable">Онемогући преуређивање бафера</string>
@@ -771,9 +769,6 @@
<string name="renderer_vulkan">Вулкан</string>
<string name="renderer_none">Ниједан</string>
<!-- Renderer Accuracy -->
<string name="renderer_accuracy_normal">Нормалан</string>
<string name="renderer_accuracy_high">Високо</string>
<!-- DMA Accuracy -->
<string name="dma_accuracy_default">Подразумевано</string>
<!-- ASTC Decoding Method -->

View File

@@ -98,8 +98,6 @@
<string name="sample_shading_fraction_description">Інтенсивність проходу затінення зразка. Вищі значення покращують якість, але й сильніше знижують продуктивність.</string>
<string name="veil_renderer">Візуалізатор</string>
<string name="renderer_early_release_fences">Release fences early</string>
<string name="renderer_early_release_fences_description">Це налаштування може бути необхідним для виправлення помилок 0FPS у деяких іграх (зокрема DKCR:HD, Subnautica та Ori 2). Водночас інші ігри, особливо створені на рушії Unreal Engine, можуть працювати некоректно або взагалі не запускатися.</string>
<string name="sync_memory_operations">Синхронізація операцій з пам\'яттю</string>
<string name="sync_memory_operations_description">Забезпечує узгодженість даних між обчислювальними операціями та операціями з пам\'яттю. Ця опція має виправляти проблеми в деяких іграх, але може знижувати продуктивність у деяких випадках. Ігри на Unreal Engine 4, здається, найбільш постраждалі.</string>
<string name="buffer_reorder_disable">Вимкнути переупорядкування буфера</string>
@@ -876,11 +874,6 @@
<string name="renderer_vulkan">Vulkan</string>
<string name="renderer_none">Вимкнено</string>
<!-- Renderer Accuracy -->
<string name="renderer_accuracy_normal">Нормальна</string>
<string name="renderer_accuracy_high">Висока</string>
<string name="renderer_accuracy_extreme">Екстремально</string>
<!-- DMA Accuracy -->
<string name="dma_accuracy_default">Типово</string>
<string name="dma_accuracy_unsafe">Небезпечно</string>

View File

@@ -76,8 +76,6 @@
<string name="sample_shading_fraction_description">Cường độ của bước tô bóng mẫu. Giá trị cao hơn cải thiện chất lượng tốt hơn nhưng cũng giảm hiệu suất nhiều hơn.</string>
<string name="veil_renderer">Trình kết xuất</string>
<string name="renderer_early_release_fences">Giải phóng rào chắn sớm</string>
<string name="renderer_early_release_fences_description">Giúp sửa lỗi 0 FPS trong các trò chơi như DKCR:HD, Subnautica Below Zero và Ori 2, nhưng có thể ảnh hưởng đến tải hoặc hiệu suất trong trò chơi Unreal Engine.</string>
<string name="sync_memory_operations">Đồng bộ hoá thao tác bộ nhớ</string>
<string name="sync_memory_operations_description">Đảm bảo tính nhất quán dữ liệu giữa các thao tác tính toán và bộ nhớ. Tùy chọn này nên khắc phục sự cố trong một số trò chơi, nhưng có thể làm giảm hiệu suất trong một số trường hợp. Các trò chơi với Unreal Engine 4 có vẻ bị ảnh hưởng nhiều nhất.</string>
<string name="buffer_reorder_disable">Tắt sắp xếp lại bộ đệm</string>
@@ -638,9 +636,6 @@
<string name="renderer_vulkan">Vulkan</string>
<string name="renderer_none">Trống</string>
<!-- Renderer Accuracy -->
<string name="renderer_accuracy_normal">Trung bình</string>
<string name="renderer_accuracy_high">Khỏe</string>
<!-- DMA Accuracy -->
<string name="dma_accuracy_default">Mặc định</string>
<!-- ASTC Decoding Method -->

View File

@@ -98,8 +98,6 @@
<string name="sample_shading_fraction_description">采样着色处理的强度。值越高,质量改善越多,但性能降低也越明显。</string>
<string name="veil_renderer">渲染器</string>
<string name="renderer_early_release_fences">提前释放围栏</string>
<string name="renderer_early_release_fences_description">可修复《大金刚国度:热带寒流》《深海迷航:零度之下》和《奥日2》等游戏中的0 FPS问题但可能影响Unreal Engine游戏的加载或性能。</string>
<string name="sync_memory_operations">同步内存操作</string>
<string name="sync_memory_operations_description">确保计算和内存操作之间的数据一致性。 此选项应能修复某些游戏中的问题,但在某些情况下可能会降低性能。 使用Unreal Engine 4的游戏似乎受影响最大。</string>
<string name="buffer_reorder_disable">禁用缓冲重排序</string>
@@ -848,11 +846,6 @@
<string name="renderer_vulkan">Vulkan</string>
<string name="renderer_none"></string>
<!-- Renderer Accuracy -->
<string name="renderer_accuracy_normal">正常</string>
<string name="renderer_accuracy_high"></string>
<string name="renderer_accuracy_extreme">极致</string>
<!-- DMA Accuracy -->
<string name="dma_accuracy_default">默认</string>
<string name="dma_accuracy_unsafe">不安全</string>

View File

@@ -98,8 +98,6 @@
<string name="sample_shading_fraction_description">採樣著色處理的強度。數值越高,品質改善越多,但效能降低也越明顯。</string>
<string name="veil_renderer">渲染器</string>
<string name="renderer_early_release_fences">提前釋放圍欄</string>
<string name="renderer_early_release_fences_description">可修復《咚奇剛歸來HD》、《深海迷航冰點之下》和《聖靈之光2》等遊戲中的0 FPS問題但可能影響Unreal Engine遊戲的載入或效能。</string>
<string name="sync_memory_operations">同步記憶體操作</string>
<string name="sync_memory_operations_description">確保計算和記憶體操作之間的資料一致性。 此選項應能修復某些遊戲中的問題,但在某些情況下可能會降低效能。 使用Unreal Engine 4的遊戲似乎受影響最大。</string>
<string name="buffer_reorder_disable">停用緩衝區重新排序</string>
@@ -848,11 +846,6 @@
<string name="renderer_vulkan">Vulkan</string>
<string name="renderer_none"></string>
<!-- Renderer Accuracy -->
<string name="renderer_accuracy_normal">標準</string>
<string name="renderer_accuracy_high"></string>
<string name="renderer_accuracy_extreme">極高</string>
<!-- DMA Accuracy -->
<string name="dma_accuracy_default">預設</string>
<string name="dma_accuracy_unsafe">不安全</string>

View File

@@ -98,9 +98,9 @@
</integer-array>
<string-array name="rendererAccuracyNames">
<item>@string/renderer_accuracy_normal</item>
<item>@string/renderer_accuracy_low</item>
<item>@string/renderer_accuracy_medium</item>
<item>@string/renderer_accuracy_high</item>
<item>@string/renderer_accuracy_extreme</item>
</string-array>
<integer-array name="rendererAccuracyValues">

View File

@@ -105,8 +105,6 @@
<string name="sample_shading_fraction_description">The intensity of the sample shading pass. Higher values improve quality more but also reduce performance to a greater extent.</string>
<string name="veil_renderer">Renderer</string>
<string name="renderer_early_release_fences">Release Fences Early</string>
<string name="renderer_early_release_fences_description">Helps fix 0 FPS in games like DKCR:HD, Subnautica Below Zero and Ori 2, but may break loading or performance in Unreal Engine games.</string>
<string name="sync_memory_operations">Sync Memory Operations</string>
<string name="sync_memory_operations_description">Ensures data consistency between compute and memory operations. This option should fix issues in some games, but may also reduce performance in some cases. Unreal Engine 4 games often see the most significant changes thereof.</string>
<string name="buffer_reorder_disable">Disable Buffer Reorder</string>
@@ -661,7 +659,6 @@
<string name="select_gpu_driver_default">Default</string>
<string name="select_gpu_driver_error">Invalid driver selected</string>
<string name="driver_already_installed">Driver already installed</string>
<string name="installed_label">%1$s (Installed)</string>
<string name="system_gpu_driver">System GPU driver</string>
<string name="installing_driver">Installing driver…</string>
@@ -925,9 +922,9 @@
<string name="renderer_none">None</string>
<!-- Renderer Accuracy -->
<string name="renderer_accuracy_normal">Normal</string>
<string name="renderer_accuracy_high">High</string>
<string name="renderer_accuracy_extreme">Extreme</string>
<string name="renderer_accuracy_low">Fast</string>
<string name="renderer_accuracy_medium">Balanced</string>
<string name="renderer_accuracy_high">Accurate</string>
<!-- DMA Accuracy -->
<string name="dma_accuracy_default">Default</string>

View File

@@ -149,13 +149,16 @@ void UpdateGPUAccuracy() {
values.current_gpu_accuracy = values.gpu_accuracy.GetValue();
}
bool IsGPULevelExtreme() {
return values.current_gpu_accuracy == GpuAccuracy::Extreme;
bool IsGPULevelLow() {
return values.current_gpu_accuracy == GpuAccuracy::Low;
}
bool IsGPULevelMedium() {
return values.current_gpu_accuracy == GpuAccuracy::Medium;
}
bool IsGPULevelHigh() {
return values.current_gpu_accuracy == GpuAccuracy::Extreme ||
values.current_gpu_accuracy == GpuAccuracy::High;
return values.current_gpu_accuracy == GpuAccuracy::High;
}
bool IsDMALevelDefault() {

View File

@@ -414,9 +414,9 @@ struct Values {
SwitchableSetting<GpuAccuracy, true> gpu_accuracy{linkage,
#ifdef ANDROID
GpuAccuracy::Normal,
GpuAccuracy::Low,
#else
GpuAccuracy::High,
GpuAccuracy::Medium,
#endif
"gpu_accuracy",
Category::RendererAdvanced,
@@ -424,7 +424,7 @@ struct Values {
true,
true};
GpuAccuracy current_gpu_accuracy{GpuAccuracy::High};
GpuAccuracy current_gpu_accuracy{GpuAccuracy::Medium};
SwitchableSetting<DmaAccuracy, true> dma_accuracy{linkage,
DmaAccuracy::Default,
@@ -457,15 +457,6 @@ struct Values {
Specialization::Default,
true,
true};
#ifdef ANDROID
SwitchableSetting<bool> early_release_fences{linkage,
false,
"early_release_fences",
Category::RendererAdvanced,
Specialization::Default,
true,
true};
#endif
SwitchableSetting<bool> sync_memory_operations{linkage,
false,
"sync_memory_operations",
@@ -771,7 +762,8 @@ extern Values values;
bool getDebugKnobAt(u8 i);
void UpdateGPUAccuracy();
bool IsGPULevelExtreme();
bool IsGPULevelLow();
bool IsGPULevelMedium();
bool IsGPULevelHigh();
bool IsDMALevelDefault();

View File

@@ -133,7 +133,7 @@ ENUM(VSyncMode, Immediate, Mailbox, Fifo, FifoRelaxed);
ENUM(VramUsageMode, Conservative, Aggressive);
ENUM(RendererBackend, OpenGL, Vulkan, Null);
ENUM(ShaderBackend, Glsl, Glasm, SpirV);
ENUM(GpuAccuracy, Normal, High, Extreme);
ENUM(GpuAccuracy, Low, Medium, High);
ENUM(DmaAccuracy, Default, Unsafe, Safe);
ENUM(CpuBackend, Dynarmic, Nce);
ENUM(CpuAccuracy, Auto, Accurate, Unsafe, Paranoid, Debugging);

View File

@@ -77,9 +77,9 @@ void EmitX64::EmitPushRSB(IR::Block&, IR::Inst* inst) {
ASSERT(inst->GetArg(0).IsImmediate());
u64 imm64 = inst->GetArg(0).GetU64();
Xbyak::Reg64 code_ptr_reg = reg_alloc.ScratchGpr({HostLoc::RCX});
Xbyak::Reg64 loc_desc_reg = reg_alloc.ScratchGpr();
Xbyak::Reg32 index_reg = reg_alloc.ScratchGpr().cvt32();
Xbyak::Reg64 code_ptr_reg = reg_alloc.ScratchGpr(code, {HostLoc::RCX});
Xbyak::Reg64 loc_desc_reg = reg_alloc.ScratchGpr(code);
Xbyak::Reg32 index_reg = reg_alloc.ScratchGpr(code).cvt32();
u64 code_ptr = unique_hash_to_code_ptr.find(imm64) != unique_hash_to_code_ptr.end()
? u64(unique_hash_to_code_ptr[imm64])
: u64(code->GetReturnFromRunCodeAddress());

View File

@@ -175,7 +175,6 @@ if ("x86_64" IN_LIST ARCHITECTURE)
backend/x64/exclusive_monitor.cpp
backend/x64/exclusive_monitor_friend.h
backend/x64/host_feature.h
backend/x64/hostloc.cpp
backend/x64/hostloc.h
backend/x64/jitstate_info.h
backend/x64/oparg.h

View File

@@ -1,3 +1,6 @@
// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
// SPDX-License-Identifier: GPL-3.0-or-later
/* This file is part of the dynarmic project.
* Copyright (c) 2022 MerryMage
* SPDX-License-Identifier: 0BSD
@@ -60,7 +63,7 @@ void EmitIR<IR::Opcode::Pack2x32To1x64>(oaknut::CodeGenerator& code, EmitContext
template<>
void EmitIR<IR::Opcode::Pack2x64To1x128>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
bool const args_in_gpr[] = { args[0].IsInGpr(), args[1].IsInGpr() };
bool const args_in_gpr[] = { args[0].IsInGpr(ctx.reg_alloc), args[1].IsInGpr(ctx.reg_alloc) };
if (args_in_gpr[0] && args_in_gpr[1]) {
auto Xlo = ctx.reg_alloc.ReadX(args[0]);
auto Xhi = ctx.reg_alloc.ReadX(args[1]);

View File

@@ -84,7 +84,7 @@ IR::AccType Argument::GetImmediateAccType() const {
return value.GetAccType();
}
HostLoc::Kind Argument::CurrentLocationKind() const {
HostLoc::Kind Argument::CurrentLocationKind(RegAlloc& reg_alloc) const {
return reg_alloc.ValueLocation(value.GetInst())->kind;
}
@@ -131,7 +131,7 @@ void HostLocInfo::UpdateUses() {
}
RegAlloc::ArgumentInfo RegAlloc::GetArgumentInfo(IR::Inst* inst) {
ArgumentInfo ret = {Argument{*this}, Argument{*this}, Argument{*this}, Argument{*this}};
ArgumentInfo ret = {Argument{}, Argument{}, Argument{}, Argument{}};
for (size_t i = 0; i < inst->NumArgs(); i++) {
const IR::Value arg = inst->GetArg(i);
ret[i].value = arg;

View File

@@ -64,18 +64,18 @@ public:
IR::AccType GetImmediateAccType() const;
// Only valid if not immediate
HostLoc::Kind CurrentLocationKind() const;
bool IsInGpr() const { return !IsImmediate() && CurrentLocationKind() == HostLoc::Kind::Gpr; }
bool IsInFpr() const { return !IsImmediate() && CurrentLocationKind() == HostLoc::Kind::Fpr; }
HostLoc::Kind CurrentLocationKind(RegAlloc& reg_alloc) const;
bool IsInGpr(RegAlloc& reg_alloc) const {
return !IsImmediate() && CurrentLocationKind(reg_alloc) == HostLoc::Kind::Gpr;
}
bool IsInFpr(RegAlloc& reg_alloc) const {
return !IsImmediate() && CurrentLocationKind(reg_alloc) == HostLoc::Kind::Fpr;
}
private:
friend class RegAlloc;
explicit Argument(RegAlloc& reg_alloc)
: reg_alloc{reg_alloc} {}
bool allocated = false;
RegAlloc& reg_alloc;
IR::Value value;
bool allocated = false;
};
struct FlagsTag final {

View File

@@ -117,7 +117,7 @@ A32EmitX64::BlockDescriptor A32EmitX64::Emit(IR::Block& block) {
return gprs;
}();
new (&this->reg_alloc) RegAlloc(&code, gpr_order, any_xmm);
new (&this->reg_alloc) RegAlloc(gpr_order, any_xmm);
A32EmitContext ctx{conf, reg_alloc, block};
// Start emitting.
@@ -283,47 +283,47 @@ void A32EmitX64::GenTerminalHandlers() {
void A32EmitX64::EmitA32SetCheckBit(A32EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Reg8 to_store = ctx.reg_alloc.UseGpr(args[0]).cvt8();
const Xbyak::Reg8 to_store = ctx.reg_alloc.UseGpr(code, args[0]).cvt8();
code.mov(code.byte[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, check_bit)], to_store);
}
void A32EmitX64::EmitA32GetRegister(A32EmitContext& ctx, IR::Inst* inst) {
const A32::Reg reg = inst->GetArg(0).GetA32RegRef();
const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32();
const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr(code).cvt32();
code.mov(result, MJitStateReg(reg));
ctx.reg_alloc.DefineValue(inst, result);
ctx.reg_alloc.DefineValue(code, inst, result);
}
void A32EmitX64::EmitA32GetExtendedRegister32(A32EmitContext& ctx, IR::Inst* inst) {
const A32::ExtReg reg = inst->GetArg(0).GetA32ExtRegRef();
ASSERT(A32::IsSingleExtReg(reg));
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
code.movss(result, MJitStateExtReg(reg));
ctx.reg_alloc.DefineValue(inst, result);
ctx.reg_alloc.DefineValue(code, inst, result);
}
void A32EmitX64::EmitA32GetExtendedRegister64(A32EmitContext& ctx, IR::Inst* inst) {
const A32::ExtReg reg = inst->GetArg(0).GetA32ExtRegRef();
ASSERT(A32::IsDoubleExtReg(reg));
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
code.movsd(result, MJitStateExtReg(reg));
ctx.reg_alloc.DefineValue(inst, result);
ctx.reg_alloc.DefineValue(code, inst, result);
}
void A32EmitX64::EmitA32GetVector(A32EmitContext& ctx, IR::Inst* inst) {
const A32::ExtReg reg = inst->GetArg(0).GetA32ExtRegRef();
ASSERT(A32::IsDoubleExtReg(reg) || A32::IsQuadExtReg(reg));
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
if (A32::IsDoubleExtReg(reg)) {
code.movsd(result, MJitStateExtReg(reg));
} else {
code.movaps(result, MJitStateExtReg(reg));
}
ctx.reg_alloc.DefineValue(inst, result);
ctx.reg_alloc.DefineValue(code, inst, result);
}
void A32EmitX64::EmitA32SetRegister(A32EmitContext& ctx, IR::Inst* inst) {
@@ -332,11 +332,11 @@ void A32EmitX64::EmitA32SetRegister(A32EmitContext& ctx, IR::Inst* inst) {
if (args[1].IsImmediate()) {
code.mov(MJitStateReg(reg), args[1].GetImmediateU32());
} else if (args[1].IsInXmm()) {
const Xbyak::Xmm to_store = ctx.reg_alloc.UseXmm(args[1]);
} else if (args[1].IsInXmm(ctx.reg_alloc)) {
const Xbyak::Xmm to_store = ctx.reg_alloc.UseXmm(code, args[1]);
code.movd(MJitStateReg(reg), to_store);
} else {
const Xbyak::Reg32 to_store = ctx.reg_alloc.UseGpr(args[1]).cvt32();
const Xbyak::Reg32 to_store = ctx.reg_alloc.UseGpr(code, args[1]).cvt32();
code.mov(MJitStateReg(reg), to_store);
}
}
@@ -346,11 +346,11 @@ void A32EmitX64::EmitA32SetExtendedRegister32(A32EmitContext& ctx, IR::Inst* ins
const A32::ExtReg reg = inst->GetArg(0).GetA32ExtRegRef();
ASSERT(A32::IsSingleExtReg(reg));
if (args[1].IsInXmm()) {
Xbyak::Xmm to_store = ctx.reg_alloc.UseXmm(args[1]);
if (args[1].IsInXmm(ctx.reg_alloc)) {
Xbyak::Xmm to_store = ctx.reg_alloc.UseXmm(code, args[1]);
code.movss(MJitStateExtReg(reg), to_store);
} else {
Xbyak::Reg32 to_store = ctx.reg_alloc.UseGpr(args[1]).cvt32();
Xbyak::Reg32 to_store = ctx.reg_alloc.UseGpr(code, args[1]).cvt32();
code.mov(MJitStateExtReg(reg), to_store);
}
}
@@ -360,11 +360,11 @@ void A32EmitX64::EmitA32SetExtendedRegister64(A32EmitContext& ctx, IR::Inst* ins
const A32::ExtReg reg = inst->GetArg(0).GetA32ExtRegRef();
ASSERT(A32::IsDoubleExtReg(reg));
if (args[1].IsInXmm()) {
const Xbyak::Xmm to_store = ctx.reg_alloc.UseXmm(args[1]);
if (args[1].IsInXmm(ctx.reg_alloc)) {
const Xbyak::Xmm to_store = ctx.reg_alloc.UseXmm(code, args[1]);
code.movsd(MJitStateExtReg(reg), to_store);
} else {
const Xbyak::Reg64 to_store = ctx.reg_alloc.UseGpr(args[1]);
const Xbyak::Reg64 to_store = ctx.reg_alloc.UseGpr(code, args[1]);
code.mov(MJitStateExtReg(reg), to_store);
}
}
@@ -374,7 +374,7 @@ void A32EmitX64::EmitA32SetVector(A32EmitContext& ctx, IR::Inst* inst) {
const A32::ExtReg reg = inst->GetArg(0).GetA32ExtRegRef();
ASSERT(A32::IsDoubleExtReg(reg) || A32::IsQuadExtReg(reg));
const Xbyak::Xmm to_store = ctx.reg_alloc.UseXmm(args[1]);
const Xbyak::Xmm to_store = ctx.reg_alloc.UseXmm(code, args[1]);
if (A32::IsDoubleExtReg(reg)) {
code.movsd(MJitStateExtReg(reg), to_store);
} else {
@@ -383,9 +383,9 @@ void A32EmitX64::EmitA32SetVector(A32EmitContext& ctx, IR::Inst* inst) {
}
void A32EmitX64::EmitA32GetCpsr(A32EmitContext& ctx, IR::Inst* inst) {
const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32();
const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr().cvt32();
const Xbyak::Reg32 tmp2 = ctx.reg_alloc.ScratchGpr().cvt32();
const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr(code).cvt32();
const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr(code).cvt32();
const Xbyak::Reg32 tmp2 = ctx.reg_alloc.ScratchGpr(code).cvt32();
if (code.HasHostFeature(HostFeature::FastBMI2)) {
// Here we observe that cpsr_et and cpsr_ge are right next to each other in memory,
@@ -428,15 +428,15 @@ void A32EmitX64::EmitA32GetCpsr(A32EmitContext& ctx, IR::Inst* inst) {
code.or_(result, dword[code.ABI_JIT_PTR + offsetof(A32JitState, cpsr_jaifm)]);
ctx.reg_alloc.DefineValue(inst, result);
ctx.reg_alloc.DefineValue(code, inst, result);
}
void A32EmitX64::EmitA32SetCpsr(A32EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Reg32 cpsr = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr().cvt32();
const Xbyak::Reg32 tmp2 = ctx.reg_alloc.ScratchGpr().cvt32();
const Xbyak::Reg32 cpsr = ctx.reg_alloc.UseScratchGpr(code, args[0]).cvt32();
const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr(code).cvt32();
const Xbyak::Reg32 tmp2 = ctx.reg_alloc.ScratchGpr(code).cvt32();
if (conf.always_little_endian) {
code.and_(cpsr, 0xFFFFFDFF);
@@ -501,7 +501,7 @@ void A32EmitX64::EmitA32SetCpsr(A32EmitContext& ctx, IR::Inst* inst) {
void A32EmitX64::EmitA32SetCpsrNZCV(A32EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Reg32 to_store = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
const Xbyak::Reg32 to_store = ctx.reg_alloc.UseScratchGpr(code, args[0]).cvt32();
code.mov(dword[code.ABI_JIT_PTR + offsetof(A32JitState, cpsr_nzcv)], to_store);
}
@@ -512,15 +512,15 @@ void A32EmitX64::EmitA32SetCpsrNZCVRaw(A32EmitContext& ctx, IR::Inst* inst) {
code.mov(dword[code.ABI_JIT_PTR + offsetof(A32JitState, cpsr_nzcv)], NZCV::ToX64(imm));
} else if (code.HasHostFeature(HostFeature::FastBMI2)) {
const Xbyak::Reg32 a = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
const Xbyak::Reg32 b = ctx.reg_alloc.ScratchGpr().cvt32();
const Xbyak::Reg32 a = ctx.reg_alloc.UseScratchGpr(code, args[0]).cvt32();
const Xbyak::Reg32 b = ctx.reg_alloc.ScratchGpr(code).cvt32();
code.shr(a, 28);
code.mov(b, NZCV::x64_mask);
code.pdep(a, a, b);
code.mov(dword[code.ABI_JIT_PTR + offsetof(A32JitState, cpsr_nzcv)], a);
} else {
const Xbyak::Reg32 a = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
const Xbyak::Reg32 a = ctx.reg_alloc.UseScratchGpr(code, args[0]).cvt32();
code.shr(a, 28);
code.imul(a, a, NZCV::to_x64_multiplier);
@@ -537,8 +537,8 @@ void A32EmitX64::EmitA32SetCpsrNZCVQ(A32EmitContext& ctx, IR::Inst* inst) {
code.mov(dword[code.ABI_JIT_PTR + offsetof(A32JitState, cpsr_nzcv)], NZCV::ToX64(imm));
code.mov(code.byte[code.ABI_JIT_PTR + offsetof(A32JitState, cpsr_q)], u8((imm & 0x08000000) != 0 ? 1 : 0));
} else if (code.HasHostFeature(HostFeature::FastBMI2)) {
const Xbyak::Reg32 a = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
const Xbyak::Reg32 b = ctx.reg_alloc.ScratchGpr().cvt32();
const Xbyak::Reg32 a = ctx.reg_alloc.UseScratchGpr(code, args[0]).cvt32();
const Xbyak::Reg32 b = ctx.reg_alloc.ScratchGpr(code).cvt32();
code.shr(a, 28);
code.setc(code.byte[code.ABI_JIT_PTR + offsetof(A32JitState, cpsr_q)]);
@@ -546,7 +546,7 @@ void A32EmitX64::EmitA32SetCpsrNZCVQ(A32EmitContext& ctx, IR::Inst* inst) {
code.pdep(a, a, b);
code.mov(dword[code.ABI_JIT_PTR + offsetof(A32JitState, cpsr_nzcv)], a);
} else {
const Xbyak::Reg32 a = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
const Xbyak::Reg32 a = ctx.reg_alloc.UseScratchGpr(code, args[0]).cvt32();
code.shr(a, 28);
code.setc(code.byte[code.ABI_JIT_PTR + offsetof(A32JitState, cpsr_q)]);
@@ -559,8 +559,8 @@ void A32EmitX64::EmitA32SetCpsrNZCVQ(A32EmitContext& ctx, IR::Inst* inst) {
void A32EmitX64::EmitA32SetCpsrNZ(A32EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Reg32 nz = ctx.reg_alloc.UseGpr(args[0]).cvt32();
const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr().cvt32();
const Xbyak::Reg32 nz = ctx.reg_alloc.UseGpr(code, args[0]).cvt32();
const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr(code).cvt32();
code.movzx(tmp, code.byte[code.ABI_JIT_PTR + offsetof(A32JitState, cpsr_nzcv) + 1]);
code.and_(tmp, 1);
@@ -577,12 +577,12 @@ void A32EmitX64::EmitA32SetCpsrNZC(A32EmitContext& ctx, IR::Inst* inst) {
code.mov(code.byte[code.ABI_JIT_PTR + offsetof(A32JitState, cpsr_nzcv) + 1], c);
} else {
const Xbyak::Reg8 c = ctx.reg_alloc.UseGpr(args[1]).cvt8();
const Xbyak::Reg8 c = ctx.reg_alloc.UseGpr(code, args[1]).cvt8();
code.mov(code.byte[code.ABI_JIT_PTR + offsetof(A32JitState, cpsr_nzcv) + 1], c);
}
} else {
const Xbyak::Reg32 nz = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
const Xbyak::Reg32 nz = ctx.reg_alloc.UseScratchGpr(code, args[0]).cvt32();
if (args[1].IsImmediate()) {
const bool c = args[1].GetImmediateU1();
@@ -590,7 +590,7 @@ void A32EmitX64::EmitA32SetCpsrNZC(A32EmitContext& ctx, IR::Inst* inst) {
code.or_(nz, c);
code.mov(code.byte[code.ABI_JIT_PTR + offsetof(A32JitState, cpsr_nzcv) + 1], nz.cvt8());
} else {
const Xbyak::Reg32 c = ctx.reg_alloc.UseGpr(args[1]).cvt32();
const Xbyak::Reg32 c = ctx.reg_alloc.UseGpr(code, args[1]).cvt32();
code.or_(nz, c);
code.mov(code.byte[code.ABI_JIT_PTR + offsetof(A32JitState, cpsr_nzcv) + 1], nz.cvt8());
@@ -599,13 +599,13 @@ void A32EmitX64::EmitA32SetCpsrNZC(A32EmitContext& ctx, IR::Inst* inst) {
}
static void EmitGetFlag(BlockOfCode& code, A32EmitContext& ctx, IR::Inst* inst, size_t flag_bit) {
const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32();
const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr(code).cvt32();
code.mov(result, dword[code.ABI_JIT_PTR + offsetof(A32JitState, cpsr_nzcv)]);
if (flag_bit != 0) {
code.shr(result, static_cast<int>(flag_bit));
}
code.and_(result, 1);
ctx.reg_alloc.DefineValue(inst, result);
ctx.reg_alloc.DefineValue(code, inst, result);
}
void A32EmitX64::EmitA32GetCFlag(A32EmitContext& ctx, IR::Inst* inst) {
@@ -619,27 +619,27 @@ void A32EmitX64::EmitA32OrQFlag(A32EmitContext& ctx, IR::Inst* inst) {
code.mov(dword[code.ABI_JIT_PTR + offsetof(A32JitState, cpsr_q)], 1);
}
} else {
const Xbyak::Reg8 to_store = ctx.reg_alloc.UseGpr(args[0]).cvt8();
const Xbyak::Reg8 to_store = ctx.reg_alloc.UseGpr(code, args[0]).cvt8();
code.or_(code.byte[code.ABI_JIT_PTR + offsetof(A32JitState, cpsr_q)], to_store);
}
}
void A32EmitX64::EmitA32GetGEFlags(A32EmitContext& ctx, IR::Inst* inst) {
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
code.movd(result, dword[code.ABI_JIT_PTR + offsetof(A32JitState, cpsr_ge)]);
ctx.reg_alloc.DefineValue(inst, result);
ctx.reg_alloc.DefineValue(code, inst, result);
}
void A32EmitX64::EmitA32SetGEFlags(A32EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
ASSERT(!args[0].IsImmediate());
if (args[0].IsInXmm()) {
const Xbyak::Xmm to_store = ctx.reg_alloc.UseXmm(args[0]);
if (args[0].IsInXmm(ctx.reg_alloc)) {
const Xbyak::Xmm to_store = ctx.reg_alloc.UseXmm(code, args[0]);
code.movd(dword[code.ABI_JIT_PTR + offsetof(A32JitState, cpsr_ge)], to_store);
} else {
const Xbyak::Reg32 to_store = ctx.reg_alloc.UseGpr(args[0]).cvt32();
const Xbyak::Reg32 to_store = ctx.reg_alloc.UseGpr(code, args[0]).cvt32();
code.mov(dword[code.ABI_JIT_PTR + offsetof(A32JitState, cpsr_ge)], to_store);
}
}
@@ -656,8 +656,8 @@ void A32EmitX64::EmitA32SetGEFlagsCompressed(A32EmitContext& ctx, IR::Inst* inst
code.mov(dword[code.ABI_JIT_PTR + offsetof(A32JitState, cpsr_ge)], ge);
} else if (code.HasHostFeature(HostFeature::FastBMI2)) {
const Xbyak::Reg32 a = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
const Xbyak::Reg32 b = ctx.reg_alloc.ScratchGpr().cvt32();
const Xbyak::Reg32 a = ctx.reg_alloc.UseScratchGpr(code, args[0]).cvt32();
const Xbyak::Reg32 b = ctx.reg_alloc.ScratchGpr(code).cvt32();
code.mov(b, 0x01010101);
code.shr(a, 16);
@@ -665,7 +665,7 @@ void A32EmitX64::EmitA32SetGEFlagsCompressed(A32EmitContext& ctx, IR::Inst* inst
code.imul(a, a, 0xFF);
code.mov(dword[code.ABI_JIT_PTR + offsetof(A32JitState, cpsr_ge)], a);
} else {
const Xbyak::Reg32 a = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
const Xbyak::Reg32 a = ctx.reg_alloc.UseScratchGpr(code, args[0]).cvt32();
code.shr(a, 16);
code.and_(a, 0xF);
@@ -690,7 +690,7 @@ void A32EmitX64::EmitA32InstructionSynchronizationBarrier(A32EmitContext& ctx, I
return;
}
ctx.reg_alloc.HostCall(nullptr);
ctx.reg_alloc.HostCall(code, nullptr);
Devirtualize<&A32::UserCallbacks::InstructionSynchronizationBarrierRaised>(conf.callbacks).EmitCall(code);
}
@@ -718,9 +718,9 @@ void A32EmitX64::EmitA32BXWritePC(A32EmitContext& ctx, IR::Inst* inst) {
code.mov(MJitStateReg(A32::Reg::PC), new_pc & mask);
code.mov(dword[code.ABI_JIT_PTR + offsetof(A32JitState, upper_location_descriptor)], new_upper);
} else {
const Xbyak::Reg32 new_pc = ctx.reg_alloc.UseScratchGpr(arg).cvt32();
const Xbyak::Reg32 mask = ctx.reg_alloc.ScratchGpr().cvt32();
const Xbyak::Reg32 new_upper = ctx.reg_alloc.ScratchGpr().cvt32();
const Xbyak::Reg32 new_pc = ctx.reg_alloc.UseScratchGpr(code, arg).cvt32();
const Xbyak::Reg32 mask = ctx.reg_alloc.ScratchGpr(code).cvt32();
const Xbyak::Reg32 new_upper = ctx.reg_alloc.ScratchGpr(code).cvt32();
code.mov(mask, new_pc);
code.and_(mask, 1);
@@ -745,7 +745,7 @@ void A32EmitX64::EmitA32CallSupervisor(A32EmitContext& ctx, IR::Inst* inst) {
code.SwitchMxcsrOnExit();
if (conf.enable_cycle_counting) {
ctx.reg_alloc.HostCall(nullptr);
ctx.reg_alloc.HostCall(code, nullptr);
code.mov(code.ABI_PARAM2, qword[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, cycles_to_run)]);
code.sub(code.ABI_PARAM2, qword[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, cycles_remaining)]);
Devirtualize<&A32::UserCallbacks::AddTicks>(conf.callbacks).EmitCall(code);
@@ -753,7 +753,7 @@ void A32EmitX64::EmitA32CallSupervisor(A32EmitContext& ctx, IR::Inst* inst) {
}
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
ctx.reg_alloc.HostCall(nullptr, {}, args[0]);
ctx.reg_alloc.HostCall(code, nullptr, {}, args[0]);
Devirtualize<&A32::UserCallbacks::CallSVC>(conf.callbacks).EmitCall(code);
if (conf.enable_cycle_counting) {
@@ -767,7 +767,7 @@ void A32EmitX64::EmitA32CallSupervisor(A32EmitContext& ctx, IR::Inst* inst) {
void A32EmitX64::EmitA32ExceptionRaised(A32EmitContext& ctx, IR::Inst* inst) {
code.SwitchMxcsrOnExit();
ctx.reg_alloc.HostCall(nullptr);
ctx.reg_alloc.HostCall(code, nullptr);
if (conf.enable_cycle_counting) {
code.mov(code.ABI_PARAM2, qword[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, cycles_to_run)]);
code.sub(code.ABI_PARAM2, qword[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, cycles_remaining)]);
@@ -797,7 +797,7 @@ static u32 GetFpscrImpl(A32JitState* jit_state) {
}
void A32EmitX64::EmitA32GetFpscr(A32EmitContext& ctx, IR::Inst* inst) {
ctx.reg_alloc.HostCall(inst);
ctx.reg_alloc.HostCall(code, inst);
code.mov(code.ABI_PARAM1, code.ABI_JIT_PTR);
code.stmxcsr(code.dword[code.ABI_JIT_PTR + offsetof(A32JitState, guest_MXCSR)]);
@@ -810,7 +810,7 @@ static void SetFpscrImpl(u32 value, A32JitState* jit_state) {
void A32EmitX64::EmitA32SetFpscr(A32EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
ctx.reg_alloc.HostCall(nullptr, args[0]);
ctx.reg_alloc.HostCall(code, nullptr, args[0]);
code.mov(code.ABI_PARAM2, code.ABI_JIT_PTR);
code.CallFunction(&SetFpscrImpl);
@@ -818,17 +818,17 @@ void A32EmitX64::EmitA32SetFpscr(A32EmitContext& ctx, IR::Inst* inst) {
}
void A32EmitX64::EmitA32GetFpscrNZCV(A32EmitContext& ctx, IR::Inst* inst) {
const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32();
const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr(code).cvt32();
code.mov(result, dword[code.ABI_JIT_PTR + offsetof(A32JitState, fpsr_nzcv)]);
ctx.reg_alloc.DefineValue(inst, result);
ctx.reg_alloc.DefineValue(code, inst, result);
}
void A32EmitX64::EmitA32SetFpscrNZCV(A32EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.HasHostFeature(HostFeature::FastBMI2)) {
const Xbyak::Reg32 value = ctx.reg_alloc.UseGpr(args[0]).cvt32();
const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr().cvt32();
const Xbyak::Reg32 value = ctx.reg_alloc.UseGpr(code, args[0]).cvt32();
const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr(code).cvt32();
code.mov(tmp, NZCV::x64_mask);
code.pext(tmp, value, tmp);
@@ -838,7 +838,7 @@ void A32EmitX64::EmitA32SetFpscrNZCV(A32EmitContext& ctx, IR::Inst* inst) {
return;
}
const Xbyak::Reg32 value = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
const Xbyak::Reg32 value = ctx.reg_alloc.UseScratchGpr(code, args[0]).cvt32();
code.and_(value, NZCV::x64_mask);
code.imul(value, value, NZCV::from_x64_multiplier);
@@ -851,7 +851,7 @@ static void EmitCoprocessorException() {
}
static void CallCoprocCallback(BlockOfCode& code, RegAlloc& reg_alloc, A32::Coprocessor::Callback callback, IR::Inst* inst = nullptr, std::optional<Argument::copyable_reference> arg0 = {}, std::optional<Argument::copyable_reference> arg1 = {}) {
reg_alloc.HostCall(inst, {}, arg0, arg1);
reg_alloc.HostCall(code, inst, {}, arg0, arg1);
if (callback.user_arg) {
code.mov(code.ABI_PARAM1, reinterpret_cast<u64>(*callback.user_arg));
@@ -914,8 +914,8 @@ void A32EmitX64::EmitA32CoprocSendOneWord(A32EmitContext& ctx, IR::Inst* inst) {
}
if (const auto destination_ptr = std::get_if<u32*>(&action)) {
const Xbyak::Reg32 reg_word = ctx.reg_alloc.UseGpr(args[1]).cvt32();
const Xbyak::Reg64 reg_destination_addr = ctx.reg_alloc.ScratchGpr();
const Xbyak::Reg32 reg_word = ctx.reg_alloc.UseGpr(code, args[1]).cvt32();
const Xbyak::Reg64 reg_destination_addr = ctx.reg_alloc.ScratchGpr(code);
code.mov(reg_destination_addr, reinterpret_cast<u64>(*destination_ptr));
code.mov(code.dword[reg_destination_addr], reg_word);
@@ -954,9 +954,9 @@ void A32EmitX64::EmitA32CoprocSendTwoWords(A32EmitContext& ctx, IR::Inst* inst)
}
if (const auto destination_ptrs = std::get_if<std::array<u32*, 2>>(&action)) {
const Xbyak::Reg32 reg_word1 = ctx.reg_alloc.UseGpr(args[1]).cvt32();
const Xbyak::Reg32 reg_word2 = ctx.reg_alloc.UseGpr(args[2]).cvt32();
const Xbyak::Reg64 reg_destination_addr = ctx.reg_alloc.ScratchGpr();
const Xbyak::Reg32 reg_word1 = ctx.reg_alloc.UseGpr(code, args[1]).cvt32();
const Xbyak::Reg32 reg_word2 = ctx.reg_alloc.UseGpr(code, args[2]).cvt32();
const Xbyak::Reg64 reg_destination_addr = ctx.reg_alloc.ScratchGpr(code);
code.mov(reg_destination_addr, reinterpret_cast<u64>((*destination_ptrs)[0]));
code.mov(code.dword[reg_destination_addr], reg_word1);
@@ -998,13 +998,13 @@ void A32EmitX64::EmitA32CoprocGetOneWord(A32EmitContext& ctx, IR::Inst* inst) {
}
if (const auto source_ptr = std::get_if<u32*>(&action)) {
const Xbyak::Reg32 reg_word = ctx.reg_alloc.ScratchGpr().cvt32();
const Xbyak::Reg64 reg_source_addr = ctx.reg_alloc.ScratchGpr();
const Xbyak::Reg32 reg_word = ctx.reg_alloc.ScratchGpr(code).cvt32();
const Xbyak::Reg64 reg_source_addr = ctx.reg_alloc.ScratchGpr(code);
code.mov(reg_source_addr, reinterpret_cast<u64>(*source_ptr));
code.mov(reg_word, code.dword[reg_source_addr]);
ctx.reg_alloc.DefineValue(inst, reg_word);
ctx.reg_alloc.DefineValue(code, inst, reg_word);
return;
}
@@ -1038,9 +1038,9 @@ void A32EmitX64::EmitA32CoprocGetTwoWords(A32EmitContext& ctx, IR::Inst* inst) {
}
if (const auto source_ptrs = std::get_if<std::array<u32*, 2>>(&action)) {
const Xbyak::Reg64 reg_result = ctx.reg_alloc.ScratchGpr();
const Xbyak::Reg64 reg_destination_addr = ctx.reg_alloc.ScratchGpr();
const Xbyak::Reg64 reg_tmp = ctx.reg_alloc.ScratchGpr();
const Xbyak::Reg64 reg_result = ctx.reg_alloc.ScratchGpr(code);
const Xbyak::Reg64 reg_destination_addr = ctx.reg_alloc.ScratchGpr(code);
const Xbyak::Reg64 reg_tmp = ctx.reg_alloc.ScratchGpr(code);
code.mov(reg_destination_addr, reinterpret_cast<u64>((*source_ptrs)[1]));
code.mov(reg_result.cvt32(), code.dword[reg_destination_addr]);
@@ -1049,7 +1049,7 @@ void A32EmitX64::EmitA32CoprocGetTwoWords(A32EmitContext& ctx, IR::Inst* inst) {
code.mov(reg_tmp.cvt32(), code.dword[reg_destination_addr]);
code.or_(reg_result, reg_tmp);
ctx.reg_alloc.DefineValue(inst, reg_result);
ctx.reg_alloc.DefineValue(code, inst, reg_result);
return;
}
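Every hunk in this file applies the same mechanical substitution: RegAlloc no longer caches a BlockOfCode pointer, so each allocation and host-call helper now receives the emitter explicitly. A minimal interface sketch, reconstructed purely from the call sites in this diff; the names, overloads, and defaults are assumptions, not dynarmic's actual header:

```cpp
#include <cstddef>

class BlockOfCode;            // the Xbyak-based code emitter
class Argument;               // describes where an IR value currently lives
namespace IR { class Inst; }
namespace Xbyak { class Reg64; class Xmm; }
enum class HostLoc;

class RegAlloc {
public:
    // Construction no longer captures a BlockOfCode* (see the placement-new
    // hunk below: RegAlloc{&code, gpr_order, any_xmm} becomes
    // RegAlloc{gpr_order, any_xmm}); callers thread `code` through instead.
    Xbyak::Reg64 ScratchGpr(BlockOfCode& code);                    // was ScratchGpr()
    Xbyak::Reg64 ScratchGpr(BlockOfCode& code, HostLoc loc);       // pin one specific host register
    Xbyak::Xmm   ScratchXmm(BlockOfCode& code);
    Xbyak::Reg64 UseGpr(BlockOfCode& code, Argument& arg);         // read-only use of an argument
    Xbyak::Reg64 UseScratchGpr(BlockOfCode& code, Argument& arg);  // use it, caller may clobber
    Xbyak::Xmm   UseXmm(BlockOfCode& code, Argument& arg);
    Xbyak::Xmm   UseScratchXmm(BlockOfCode& code, Argument& arg);
    void DefineValue(BlockOfCode& code, IR::Inst* inst, const Xbyak::Reg64& reg);  // plus Xmm/Argument overloads
    void HostCall(BlockOfCode& code, IR::Inst* result);            // plus overloads taking up to three arguments
    void AllocStackSpace(BlockOfCode& code, std::size_t size);
    void ReleaseStackSpace(BlockOfCode& code, std::size_t size);
};
```

Passing the emitter explicitly keeps each helper free of a cached-member load; the remaining hunks in this comparison are the mechanical fallout of that signature change.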


@@ -91,7 +91,7 @@ A64EmitX64::BlockDescriptor A64EmitX64::Emit(IR::Block& block) noexcept {
return gprs;
}();
new (&this->reg_alloc) RegAlloc{&code, gpr_order, any_xmm};
new (&this->reg_alloc) RegAlloc{gpr_order, any_xmm};
A64EmitContext ctx{conf, reg_alloc, block};
// Start emitting.
@@ -159,7 +159,7 @@ finish_this_inst:
}
code.int3();
const size_t size = static_cast<size_t>(code.getCurr() - entrypoint);
const size_t size = size_t(code.getCurr() - entrypoint);
const A64::LocationDescriptor descriptor{block.Location()};
const A64::LocationDescriptor end_location{block.EndLocation()};
@@ -266,25 +266,25 @@ void A64EmitX64::EmitPushRSB(EmitContext& ctx, IR::Inst* inst) {
void A64EmitX64::EmitA64SetCheckBit(A64EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Reg8 to_store = ctx.reg_alloc.UseGpr(args[0]).cvt8();
const Xbyak::Reg8 to_store = ctx.reg_alloc.UseGpr(code, args[0]).cvt8();
code.mov(code.byte[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, check_bit)], to_store);
}
void A64EmitX64::EmitA64GetCFlag(A64EmitContext& ctx, IR::Inst* inst) {
const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32();
const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr(code).cvt32();
code.mov(result, dword[code.ABI_JIT_PTR + offsetof(A64JitState, cpsr_nzcv)]);
code.shr(result, NZCV::x64_c_flag_bit);
code.and_(result, 1);
ctx.reg_alloc.DefineValue(inst, result);
ctx.reg_alloc.DefineValue(code, inst, result);
}
void A64EmitX64::EmitA64GetNZCVRaw(A64EmitContext& ctx, IR::Inst* inst) {
const Xbyak::Reg32 nzcv_raw = ctx.reg_alloc.ScratchGpr().cvt32();
const Xbyak::Reg32 nzcv_raw = ctx.reg_alloc.ScratchGpr(code).cvt32();
code.mov(nzcv_raw, dword[code.ABI_JIT_PTR + offsetof(A64JitState, cpsr_nzcv)]);
if (code.HasHostFeature(HostFeature::FastBMI2)) {
const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr().cvt32();
const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr(code).cvt32();
code.mov(tmp, NZCV::x64_mask);
code.pext(nzcv_raw, nzcv_raw, tmp);
code.shl(nzcv_raw, 28);
@@ -294,16 +294,16 @@ void A64EmitX64::EmitA64GetNZCVRaw(A64EmitContext& ctx, IR::Inst* inst) {
code.and_(nzcv_raw, NZCV::arm_mask);
}
ctx.reg_alloc.DefineValue(inst, nzcv_raw);
ctx.reg_alloc.DefineValue(code, inst, nzcv_raw);
}
void A64EmitX64::EmitA64SetNZCVRaw(A64EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Reg32 nzcv_raw = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
const Xbyak::Reg32 nzcv_raw = ctx.reg_alloc.UseScratchGpr(code, args[0]).cvt32();
code.shr(nzcv_raw, 28);
if (code.HasHostFeature(HostFeature::FastBMI2)) {
const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr().cvt32();
const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr(code).cvt32();
code.mov(tmp, NZCV::x64_mask);
code.pdep(nzcv_raw, nzcv_raw, tmp);
} else {
@@ -315,63 +315,63 @@ void A64EmitX64::EmitA64SetNZCVRaw(A64EmitContext& ctx, IR::Inst* inst) {
void A64EmitX64::EmitA64SetNZCV(A64EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Reg32 to_store = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
const Xbyak::Reg32 to_store = ctx.reg_alloc.UseScratchGpr(code, args[0]).cvt32();
code.mov(dword[code.ABI_JIT_PTR + offsetof(A64JitState, cpsr_nzcv)], to_store);
}
void A64EmitX64::EmitA64GetW(A64EmitContext& ctx, IR::Inst* inst) {
const A64::Reg reg = inst->GetArg(0).GetA64RegRef();
const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32();
const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr(code).cvt32();
code.mov(result, dword[code.ABI_JIT_PTR + offsetof(A64JitState, reg) + sizeof(u64) * static_cast<size_t>(reg)]);
ctx.reg_alloc.DefineValue(inst, result);
ctx.reg_alloc.DefineValue(code, inst, result);
}
void A64EmitX64::EmitA64GetX(A64EmitContext& ctx, IR::Inst* inst) {
const A64::Reg reg = inst->GetArg(0).GetA64RegRef();
const Xbyak::Reg64 result = ctx.reg_alloc.ScratchGpr();
const Xbyak::Reg64 result = ctx.reg_alloc.ScratchGpr(code);
code.mov(result, qword[code.ABI_JIT_PTR + offsetof(A64JitState, reg) + sizeof(u64) * static_cast<size_t>(reg)]);
ctx.reg_alloc.DefineValue(inst, result);
ctx.reg_alloc.DefineValue(code, inst, result);
}
void A64EmitX64::EmitA64GetS(A64EmitContext& ctx, IR::Inst* inst) {
const A64::Vec vec = inst->GetArg(0).GetA64VecRef();
const auto addr = qword[code.ABI_JIT_PTR + offsetof(A64JitState, vec) + sizeof(u64) * 2 * static_cast<size_t>(vec)];
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
code.movd(result, addr);
ctx.reg_alloc.DefineValue(inst, result);
ctx.reg_alloc.DefineValue(code, inst, result);
}
void A64EmitX64::EmitA64GetD(A64EmitContext& ctx, IR::Inst* inst) {
const A64::Vec vec = inst->GetArg(0).GetA64VecRef();
const auto addr = qword[code.ABI_JIT_PTR + offsetof(A64JitState, vec) + sizeof(u64) * 2 * static_cast<size_t>(vec)];
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
code.movq(result, addr);
ctx.reg_alloc.DefineValue(inst, result);
ctx.reg_alloc.DefineValue(code, inst, result);
}
void A64EmitX64::EmitA64GetQ(A64EmitContext& ctx, IR::Inst* inst) {
const A64::Vec vec = inst->GetArg(0).GetA64VecRef();
const auto addr = xword[code.ABI_JIT_PTR + offsetof(A64JitState, vec) + sizeof(u64) * 2 * static_cast<size_t>(vec)];
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
code.movaps(result, addr);
ctx.reg_alloc.DefineValue(inst, result);
ctx.reg_alloc.DefineValue(code, inst, result);
}
void A64EmitX64::EmitA64GetSP(A64EmitContext& ctx, IR::Inst* inst) {
const Xbyak::Reg64 result = ctx.reg_alloc.ScratchGpr();
const Xbyak::Reg64 result = ctx.reg_alloc.ScratchGpr(code);
code.mov(result, qword[code.ABI_JIT_PTR + offsetof(A64JitState, sp)]);
ctx.reg_alloc.DefineValue(inst, result);
ctx.reg_alloc.DefineValue(code, inst, result);
}
void A64EmitX64::EmitA64GetFPCR(A64EmitContext& ctx, IR::Inst* inst) {
const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32();
const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr(code).cvt32();
code.mov(result, dword[code.ABI_JIT_PTR + offsetof(A64JitState, fpcr)]);
ctx.reg_alloc.DefineValue(inst, result);
ctx.reg_alloc.DefineValue(code, inst, result);
}
static u32 GetFPSRImpl(A64JitState* jit_state) {
@@ -379,7 +379,7 @@ static u32 GetFPSRImpl(A64JitState* jit_state) {
}
void A64EmitX64::EmitA64GetFPSR(A64EmitContext& ctx, IR::Inst* inst) {
ctx.reg_alloc.HostCall(inst);
ctx.reg_alloc.HostCall(code, inst);
code.mov(code.ABI_PARAM1, code.ABI_JIT_PTR);
code.stmxcsr(code.dword[code.ABI_JIT_PTR + offsetof(A64JitState, guest_MXCSR)]);
code.CallFunction(GetFPSRImpl);
@@ -393,7 +393,7 @@ void A64EmitX64::EmitA64SetW(A64EmitContext& ctx, IR::Inst* inst) {
code.mov(addr, args[1].GetImmediateS32());
} else {
// TODO: zext tracking, xmm variant
const Xbyak::Reg64 to_store = ctx.reg_alloc.UseScratchGpr(args[1]);
const Xbyak::Reg64 to_store = ctx.reg_alloc.UseScratchGpr(code, args[1]);
code.mov(to_store.cvt32(), to_store.cvt32());
code.mov(addr, to_store);
}
@@ -405,11 +405,11 @@ void A64EmitX64::EmitA64SetX(A64EmitContext& ctx, IR::Inst* inst) {
const auto addr = qword[code.ABI_JIT_PTR + offsetof(A64JitState, reg) + sizeof(u64) * static_cast<size_t>(reg)];
if (args[1].FitsInImmediateS32()) {
code.mov(addr, args[1].GetImmediateS32());
} else if (args[1].IsInXmm()) {
const Xbyak::Xmm to_store = ctx.reg_alloc.UseXmm(args[1]);
} else if (args[1].IsInXmm(ctx.reg_alloc)) {
const Xbyak::Xmm to_store = ctx.reg_alloc.UseXmm(code, args[1]);
code.movq(addr, to_store);
} else {
const Xbyak::Reg64 to_store = ctx.reg_alloc.UseGpr(args[1]);
const Xbyak::Reg64 to_store = ctx.reg_alloc.UseGpr(code, args[1]);
code.mov(addr, to_store);
}
}
@@ -419,8 +419,8 @@ void A64EmitX64::EmitA64SetS(A64EmitContext& ctx, IR::Inst* inst) {
const A64::Vec vec = inst->GetArg(0).GetA64VecRef();
const auto addr = xword[code.ABI_JIT_PTR + offsetof(A64JitState, vec) + sizeof(u64) * 2 * static_cast<size_t>(vec)];
const Xbyak::Xmm to_store = ctx.reg_alloc.UseXmm(args[1]);
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm to_store = ctx.reg_alloc.UseXmm(code, args[1]);
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
// TODO: Optimize
code.pxor(tmp, tmp);
code.movss(tmp, to_store);
@@ -432,7 +432,7 @@ void A64EmitX64::EmitA64SetD(A64EmitContext& ctx, IR::Inst* inst) {
const A64::Vec vec = inst->GetArg(0).GetA64VecRef();
const auto addr = xword[code.ABI_JIT_PTR + offsetof(A64JitState, vec) + sizeof(u64) * 2 * static_cast<size_t>(vec)];
const Xbyak::Xmm to_store = ctx.reg_alloc.UseScratchXmm(args[1]);
const Xbyak::Xmm to_store = ctx.reg_alloc.UseScratchXmm(code, args[1]);
code.movq(to_store, to_store); // TODO: Remove when able
code.movaps(addr, to_store);
}
@@ -442,7 +442,7 @@ void A64EmitX64::EmitA64SetQ(A64EmitContext& ctx, IR::Inst* inst) {
const A64::Vec vec = inst->GetArg(0).GetA64VecRef();
const auto addr = xword[code.ABI_JIT_PTR + offsetof(A64JitState, vec) + sizeof(u64) * 2 * static_cast<size_t>(vec)];
const Xbyak::Xmm to_store = ctx.reg_alloc.UseXmm(args[1]);
const Xbyak::Xmm to_store = ctx.reg_alloc.UseXmm(code, args[1]);
code.movaps(addr, to_store);
}
@@ -451,11 +451,11 @@ void A64EmitX64::EmitA64SetSP(A64EmitContext& ctx, IR::Inst* inst) {
const auto addr = qword[code.ABI_JIT_PTR + offsetof(A64JitState, sp)];
if (args[0].FitsInImmediateS32()) {
code.mov(addr, args[0].GetImmediateS32());
} else if (args[0].IsInXmm()) {
const Xbyak::Xmm to_store = ctx.reg_alloc.UseXmm(args[0]);
} else if (args[0].IsInXmm(ctx.reg_alloc)) {
const Xbyak::Xmm to_store = ctx.reg_alloc.UseXmm(code, args[0]);
code.movq(addr, to_store);
} else {
const Xbyak::Reg64 to_store = ctx.reg_alloc.UseGpr(args[0]);
const Xbyak::Reg64 to_store = ctx.reg_alloc.UseGpr(code, args[0]);
code.mov(addr, to_store);
}
}
@@ -466,7 +466,7 @@ static void SetFPCRImpl(A64JitState* jit_state, u32 value) {
void A64EmitX64::EmitA64SetFPCR(A64EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
ctx.reg_alloc.HostCall(nullptr, {}, args[0]);
ctx.reg_alloc.HostCall(code, nullptr, {}, args[0]);
code.mov(code.ABI_PARAM1, code.ABI_JIT_PTR);
code.CallFunction(SetFPCRImpl);
code.ldmxcsr(code.dword[code.ABI_JIT_PTR + offsetof(A64JitState, guest_MXCSR)]);
@@ -478,7 +478,7 @@ static void SetFPSRImpl(A64JitState* jit_state, u32 value) {
void A64EmitX64::EmitA64SetFPSR(A64EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
ctx.reg_alloc.HostCall(nullptr, {}, args[0]);
ctx.reg_alloc.HostCall(code, nullptr, {}, args[0]);
code.mov(code.ABI_PARAM1, code.ABI_JIT_PTR);
code.CallFunction(SetFPSRImpl);
code.ldmxcsr(code.dword[code.ABI_JIT_PTR + offsetof(A64JitState, guest_MXCSR)]);
@@ -489,17 +489,17 @@ void A64EmitX64::EmitA64SetPC(A64EmitContext& ctx, IR::Inst* inst) {
const auto addr = qword[code.ABI_JIT_PTR + offsetof(A64JitState, pc)];
if (args[0].FitsInImmediateS32()) {
code.mov(addr, args[0].GetImmediateS32());
} else if (args[0].IsInXmm()) {
const Xbyak::Xmm to_store = ctx.reg_alloc.UseXmm(args[0]);
} else if (args[0].IsInXmm(ctx.reg_alloc)) {
const Xbyak::Xmm to_store = ctx.reg_alloc.UseXmm(code, args[0]);
code.movq(addr, to_store);
} else {
const Xbyak::Reg64 to_store = ctx.reg_alloc.UseGpr(args[0]);
const Xbyak::Reg64 to_store = ctx.reg_alloc.UseGpr(code, args[0]);
code.mov(addr, to_store);
}
}
void A64EmitX64::EmitA64CallSupervisor(A64EmitContext& ctx, IR::Inst* inst) {
ctx.reg_alloc.HostCall(nullptr);
ctx.reg_alloc.HostCall(code, nullptr);
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
ASSERT(args[0].IsImmediate());
const u32 imm = args[0].GetImmediateU32();
@@ -511,7 +511,7 @@ void A64EmitX64::EmitA64CallSupervisor(A64EmitContext& ctx, IR::Inst* inst) {
}
void A64EmitX64::EmitA64ExceptionRaised(A64EmitContext& ctx, IR::Inst* inst) {
ctx.reg_alloc.HostCall(nullptr);
ctx.reg_alloc.HostCall(code, nullptr);
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
ASSERT(args[0].IsImmediate() && args[1].IsImmediate());
const u64 pc = args[0].GetImmediateU64();
@@ -524,13 +524,13 @@ void A64EmitX64::EmitA64ExceptionRaised(A64EmitContext& ctx, IR::Inst* inst) {
void A64EmitX64::EmitA64DataCacheOperationRaised(A64EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
ctx.reg_alloc.HostCall(nullptr, {}, args[1], args[2]);
ctx.reg_alloc.HostCall(code, nullptr, {}, args[1], args[2]);
Devirtualize<&A64::UserCallbacks::DataCacheOperationRaised>(conf.callbacks).EmitCall(code);
}
void A64EmitX64::EmitA64InstructionCacheOperationRaised(A64EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
ctx.reg_alloc.HostCall(nullptr, {}, args[0], args[1]);
ctx.reg_alloc.HostCall(code, nullptr, {}, args[0], args[1]);
Devirtualize<&A64::UserCallbacks::InstructionCacheOperationRaised>(conf.callbacks).EmitCall(code);
}
@@ -548,18 +548,18 @@ void A64EmitX64::EmitA64InstructionSynchronizationBarrier(A64EmitContext& ctx, I
return;
}
ctx.reg_alloc.HostCall(nullptr);
ctx.reg_alloc.HostCall(code, nullptr);
Devirtualize<&A64::UserCallbacks::InstructionSynchronizationBarrierRaised>(conf.callbacks).EmitCall(code);
}
void A64EmitX64::EmitA64GetCNTFRQ(A64EmitContext& ctx, IR::Inst* inst) {
const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32();
const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr(code).cvt32();
code.mov(result, conf.cntfrq_el0);
ctx.reg_alloc.DefineValue(inst, result);
ctx.reg_alloc.DefineValue(code, inst, result);
}
void A64EmitX64::EmitA64GetCNTPCT(A64EmitContext& ctx, IR::Inst* inst) {
ctx.reg_alloc.HostCall(inst);
ctx.reg_alloc.HostCall(code, inst);
if (!conf.wall_clock_cntpct) {
code.UpdateTicks();
}
@@ -567,43 +567,43 @@ void A64EmitX64::EmitA64GetCNTPCT(A64EmitContext& ctx, IR::Inst* inst) {
}
void A64EmitX64::EmitA64GetCTR(A64EmitContext& ctx, IR::Inst* inst) {
const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32();
const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr(code).cvt32();
code.mov(result, conf.ctr_el0);
ctx.reg_alloc.DefineValue(inst, result);
ctx.reg_alloc.DefineValue(code, inst, result);
}
void A64EmitX64::EmitA64GetDCZID(A64EmitContext& ctx, IR::Inst* inst) {
const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32();
const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr(code).cvt32();
code.mov(result, conf.dczid_el0);
ctx.reg_alloc.DefineValue(inst, result);
ctx.reg_alloc.DefineValue(code, inst, result);
}
void A64EmitX64::EmitA64GetTPIDR(A64EmitContext& ctx, IR::Inst* inst) {
const Xbyak::Reg64 result = ctx.reg_alloc.ScratchGpr();
const Xbyak::Reg64 result = ctx.reg_alloc.ScratchGpr(code);
if (conf.tpidr_el0) {
code.mov(result, u64(conf.tpidr_el0));
code.mov(result, qword[result]);
} else {
code.xor_(result.cvt32(), result.cvt32());
}
ctx.reg_alloc.DefineValue(inst, result);
ctx.reg_alloc.DefineValue(code, inst, result);
}
void A64EmitX64::EmitA64GetTPIDRRO(A64EmitContext& ctx, IR::Inst* inst) {
const Xbyak::Reg64 result = ctx.reg_alloc.ScratchGpr();
const Xbyak::Reg64 result = ctx.reg_alloc.ScratchGpr(code);
if (conf.tpidrro_el0) {
code.mov(result, u64(conf.tpidrro_el0));
code.mov(result, qword[result]);
} else {
code.xor_(result.cvt32(), result.cvt32());
}
ctx.reg_alloc.DefineValue(inst, result);
ctx.reg_alloc.DefineValue(code, inst, result);
}
void A64EmitX64::EmitA64SetTPIDR(A64EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Reg64 value = ctx.reg_alloc.UseGpr(args[0]);
const Xbyak::Reg64 addr = ctx.reg_alloc.ScratchGpr();
const Xbyak::Reg64 value = ctx.reg_alloc.UseGpr(code, args[0]);
const Xbyak::Reg64 addr = ctx.reg_alloc.ScratchGpr(code);
if (conf.tpidr_el0) {
code.mov(addr, u64(conf.tpidr_el0));
code.mov(qword[addr], value);


@@ -68,7 +68,7 @@ void EmitX64::EmitVoid(EmitContext&, IR::Inst*) {
void EmitX64::EmitIdentity(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (!args[0].IsImmediate()) {
ctx.reg_alloc.DefineValue(inst, args[0]);
ctx.reg_alloc.DefineValue(code, inst, args[0]);
}
}
@@ -78,7 +78,7 @@ void EmitX64::EmitBreakpoint(EmitContext&, IR::Inst*) {
void EmitX64::EmitCallHostFunction(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
ctx.reg_alloc.HostCall(nullptr, args[1], args[2], args[3]);
ctx.reg_alloc.HostCall(code, nullptr, args[1], args[2], args[3]);
code.mov(rax, args[0].GetImmediateU64());
code.call(rax);
}
@@ -120,7 +120,7 @@ void EmitX64::EmitVerboseDebuggingOutput(RegAlloc& reg_alloc) {
code.lea(rax, ptr[rsp + sizeof(RegisterData) + offsetof(StackLayout, spill)]);
code.mov(qword[rsp + offsetof(RegisterData, spill)], rax);
reg_alloc.EmitVerboseDebuggingOutput();
reg_alloc.EmitVerboseDebuggingOutput(code);
for (int i = 0; i < 16; i++) {
if (rsp.getIdx() == i) {
@@ -140,9 +140,9 @@ void EmitX64::EmitPushRSB(EmitContext& ctx, IR::Inst* inst) {
ASSERT(args[0].IsImmediate());
const u64 unique_hash_of_target = args[0].GetImmediateU64();
ctx.reg_alloc.ScratchGpr(HostLoc::RCX);
const Xbyak::Reg64 loc_desc_reg = ctx.reg_alloc.ScratchGpr();
const Xbyak::Reg64 index_reg = ctx.reg_alloc.ScratchGpr();
ctx.reg_alloc.ScratchGpr(code, HostLoc::RCX);
const Xbyak::Reg64 loc_desc_reg = ctx.reg_alloc.ScratchGpr(code);
const Xbyak::Reg64 index_reg = ctx.reg_alloc.ScratchGpr(code);
PushRSBHelper(loc_desc_reg, index_reg, IR::LocationDescriptor{unique_hash_of_target});
}
@@ -190,12 +190,12 @@ void EmitX64::EmitGetNZFromOp(EmitContext& ctx, IR::Inst* inst) {
}
}();
const Xbyak::Reg64 nz = ctx.reg_alloc.ScratchGpr(HostLoc::RAX);
const Xbyak::Reg value = ctx.reg_alloc.UseGpr(args[0]).changeBit(bitsize);
const Xbyak::Reg64 nz = ctx.reg_alloc.ScratchGpr(code, HostLoc::RAX);
const Xbyak::Reg value = ctx.reg_alloc.UseGpr(code, args[0]).changeBit(bitsize);
code.test(value, value);
code.lahf();
code.movzx(eax, ah);
ctx.reg_alloc.DefineValue(inst, nz);
ctx.reg_alloc.DefineValue(code, inst, nz);
}
void EmitX64::EmitGetNZCVFromOp(EmitContext& ctx, IR::Inst* inst) {
@@ -221,27 +221,27 @@ void EmitX64::EmitGetNZCVFromOp(EmitContext& ctx, IR::Inst* inst) {
}
}();
const Xbyak::Reg64 nzcv = ctx.reg_alloc.ScratchGpr(HostLoc::RAX);
const Xbyak::Reg value = ctx.reg_alloc.UseGpr(args[0]).changeBit(bitsize);
const Xbyak::Reg64 nzcv = ctx.reg_alloc.ScratchGpr(code, HostLoc::RAX);
const Xbyak::Reg value = ctx.reg_alloc.UseGpr(code, args[0]).changeBit(bitsize);
code.test(value, value);
code.lahf();
code.xor_(al, al);
ctx.reg_alloc.DefineValue(inst, nzcv);
ctx.reg_alloc.DefineValue(code, inst, nzcv);
}
void EmitX64::EmitGetCFlagFromNZCV(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (args[0].IsImmediate()) {
const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32();
const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr(code).cvt32();
const u32 value = (args[0].GetImmediateU32() >> 8) & 1;
code.mov(result, value);
ctx.reg_alloc.DefineValue(inst, result);
ctx.reg_alloc.DefineValue(code, inst, result);
} else {
const Xbyak::Reg32 result = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
const Xbyak::Reg32 result = ctx.reg_alloc.UseScratchGpr(code, args[0]).cvt32();
code.shr(result, 8);
code.and_(result, 1);
ctx.reg_alloc.DefineValue(inst, result);
ctx.reg_alloc.DefineValue(code, inst, result);
}
}
@@ -249,30 +249,30 @@ void EmitX64::EmitNZCVFromPackedFlags(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (args[0].IsImmediate()) {
const Xbyak::Reg32 nzcv = ctx.reg_alloc.ScratchGpr().cvt32();
const Xbyak::Reg32 nzcv = ctx.reg_alloc.ScratchGpr(code).cvt32();
u32 value = 0;
value |= mcl::bit::get_bit<31>(args[0].GetImmediateU32()) ? (1 << 15) : 0;
value |= mcl::bit::get_bit<30>(args[0].GetImmediateU32()) ? (1 << 14) : 0;
value |= mcl::bit::get_bit<29>(args[0].GetImmediateU32()) ? (1 << 8) : 0;
value |= mcl::bit::get_bit<28>(args[0].GetImmediateU32()) ? (1 << 0) : 0;
code.mov(nzcv, value);
ctx.reg_alloc.DefineValue(inst, nzcv);
ctx.reg_alloc.DefineValue(code, inst, nzcv);
} else if (code.HasHostFeature(HostFeature::FastBMI2)) {
const Xbyak::Reg32 nzcv = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr().cvt32();
const Xbyak::Reg32 nzcv = ctx.reg_alloc.UseScratchGpr(code, args[0]).cvt32();
const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr(code).cvt32();
code.shr(nzcv, 28);
code.mov(tmp, NZCV::x64_mask);
code.pdep(nzcv, nzcv, tmp);
ctx.reg_alloc.DefineValue(inst, nzcv);
ctx.reg_alloc.DefineValue(code, inst, nzcv);
} else {
const Xbyak::Reg32 nzcv = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
const Xbyak::Reg32 nzcv = ctx.reg_alloc.UseScratchGpr(code, args[0]).cvt32();
code.shr(nzcv, 28);
code.imul(nzcv, nzcv, NZCV::to_x64_multiplier);
code.and_(nzcv, NZCV::x64_mask);
ctx.reg_alloc.DefineValue(inst, nzcv);
ctx.reg_alloc.DefineValue(code, inst, nzcv);
}
}
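The pdep/imul pairs used throughout this file convert between the guest's packed NZCV nibble (bits 31..28) and the host-flags layout stored in cpsr_nzcv; the immediate case of EmitNZCVFromPackedFlags above spells out the target bit positions (N->15, Z->14, C->8, V->0). A scalar reference of both encodings, with the multiplier derived to match those positions (treat the exact constant as an assumption about NZCV::to_x64_multiplier):

```cpp
#include <cstdint>

// Host-flags layout per the immediate case above: N->bit15, Z->bit14, C->bit8, V->bit0.
constexpr uint32_t x64_mask = (1u << 15) | (1u << 14) | (1u << 8) | (1u << 0);  // 0xC101

// Equivalent of the BMI2 path: shr nzcv, 28; pdep nzcv, nzcv, x64_mask.
constexpr uint32_t pack_nzcv(uint32_t arm_nzcv) {
    const uint32_t n = (arm_nzcv >> 31) & 1, z = (arm_nzcv >> 30) & 1,
                   c = (arm_nzcv >> 29) & 1, v = (arm_nzcv >> 28) & 1;
    return (n << 15) | (z << 14) | (c << 8) | (v << 0);
}

// The non-BMI2 fallback reaches the same scatter with one multiply: each of
// the four source bits lands on a distinct destination bit (no two partial
// products overlap, so no carries occur), and the final AND with x64_mask
// drops the cross-term garbage.
constexpr uint32_t pack_nzcv_mul(uint32_t arm_nzcv) {
    return ((arm_nzcv >> 28) * 0x1081u) & x64_mask;  // 0x1081 = (1<<12)|(1<<7)|(1<<0)
}

static_assert(pack_nzcv(0xF0000000u) == x64_mask, "all four flags set");
static_assert(pack_nzcv_mul(0xA0000000u) == pack_nzcv(0xA0000000u), "N and C set");
```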


@@ -23,13 +23,13 @@ using AESFn = void(AES::State&, const AES::State&);
static void EmitAESFunction(RegAlloc::ArgumentInfo args, EmitContext& ctx, BlockOfCode& code, IR::Inst* inst, AESFn fn) {
constexpr u32 stack_space = static_cast<u32>(sizeof(AES::State)) * 2;
const Xbyak::Xmm input = ctx.reg_alloc.UseXmm(args[0]);
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm input = ctx.reg_alloc.UseXmm(code, args[0]);
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
ctx.reg_alloc.EndOfAllocScope();
ctx.reg_alloc.HostCall(nullptr);
ctx.reg_alloc.HostCall(code, nullptr);
ctx.reg_alloc.AllocStackSpace(stack_space + ABI_SHADOW_SPACE);
ctx.reg_alloc.AllocStackSpace(code, stack_space + ABI_SHADOW_SPACE);
code.lea(code.ABI_PARAM1, ptr[rsp + ABI_SHADOW_SPACE]);
code.lea(code.ABI_PARAM2, ptr[rsp + ABI_SHADOW_SPACE + sizeof(AES::State)]);
@@ -37,22 +37,22 @@ static void EmitAESFunction(RegAlloc::ArgumentInfo args, EmitContext& ctx, Block
code.CallFunction(fn);
code.movaps(result, xword[rsp + ABI_SHADOW_SPACE]);
ctx.reg_alloc.ReleaseStackSpace(stack_space + ABI_SHADOW_SPACE);
ctx.reg_alloc.ReleaseStackSpace(code, stack_space + ABI_SHADOW_SPACE);
ctx.reg_alloc.DefineValue(inst, result);
ctx.reg_alloc.DefineValue(code, inst, result);
}
void EmitX64::EmitAESDecryptSingleRound(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.HasHostFeature(HostFeature::AES)) {
const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm zero = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
const Xbyak::Xmm zero = ctx.reg_alloc.ScratchXmm(code);
code.pxor(zero, zero);
code.aesdeclast(data, zero);
ctx.reg_alloc.DefineValue(inst, data);
ctx.reg_alloc.DefineValue(code, inst, data);
return;
}
@@ -63,13 +63,13 @@ void EmitX64::EmitAESEncryptSingleRound(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.HasHostFeature(HostFeature::AES)) {
const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm zero = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
const Xbyak::Xmm zero = ctx.reg_alloc.ScratchXmm(code);
code.pxor(zero, zero);
code.aesenclast(data, zero);
ctx.reg_alloc.DefineValue(inst, data);
ctx.reg_alloc.DefineValue(code, inst, data);
return;
}
@@ -80,11 +80,11 @@ void EmitX64::EmitAESInverseMixColumns(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.HasHostFeature(HostFeature::AES)) {
const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
code.aesimc(data, data);
ctx.reg_alloc.DefineValue(inst, data);
ctx.reg_alloc.DefineValue(code, inst, data);
return;
}
@@ -95,14 +95,14 @@ void EmitX64::EmitAESMixColumns(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.HasHostFeature(HostFeature::AES)) {
const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm zero = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
const Xbyak::Xmm zero = ctx.reg_alloc.ScratchXmm(code);
code.pxor(zero, zero);
code.aesdeclast(data, zero);
code.aesenc(data, zero);
ctx.reg_alloc.DefineValue(inst, data);
ctx.reg_alloc.DefineValue(code, inst, data);
return;
}
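EmitAESMixColumns above builds a standalone MixColumns out of two hardware rounds with an all-zero round key: aesdeclast applies InvShiftRows and InvSubBytes, and the following aesenc applies ShiftRows, SubBytes, and MixColumns, so the shift/substitution steps cancel (bytewise SubBytes commutes with the ShiftRows byte permutation) and only MixColumns survives. The same identity with intrinsics, as an illustrative sketch (requires AES-NI, e.g. -maes):

```cpp
#include <wmmintrin.h>  // AES-NI intrinsics

// MixColumns(state) == aesenc(aesdeclast(state, 0), 0)
__m128i aes_mix_columns(__m128i state) {
    const __m128i zero = _mm_setzero_si128();
    state = _mm_aesdeclast_si128(state, zero);  // InvShiftRows, InvSubBytes, XOR 0
    return _mm_aesenc_si128(state, zero);       // ShiftRows, SubBytes, MixColumns, XOR 0
}
```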


@@ -1,3 +1,6 @@
// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
// SPDX-License-Identifier: GPL-3.0-or-later
/* This file is part of the dynarmic project.
* Copyright (c) 2018 MerryMage
* SPDX-License-Identifier: 0BSD
@@ -19,16 +22,16 @@ namespace CRC32 = Common::Crypto::CRC32;
static void EmitCRC32Castagnoli(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, const int data_size) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.HasHostFeature(HostFeature::SSE42)) {
const Xbyak::Reg32 crc = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
const Xbyak::Reg value = ctx.reg_alloc.UseGpr(args[1]).changeBit(data_size);
const Xbyak::Reg32 crc = ctx.reg_alloc.UseScratchGpr(code, args[0]).cvt32();
const Xbyak::Reg value = ctx.reg_alloc.UseGpr(code, args[1]).changeBit(data_size);
if (data_size != 64) {
code.crc32(crc, value);
} else {
code.crc32(crc.cvt64(), value);
}
ctx.reg_alloc.DefineValue(inst, crc);
ctx.reg_alloc.DefineValue(code, inst, crc);
} else {
ctx.reg_alloc.HostCall(inst, args[0], args[1], {});
ctx.reg_alloc.HostCall(code, inst, args[0], args[1], {});
code.mov(code.ABI_PARAM3.cvt32(), data_size / CHAR_BIT); //zext
code.CallFunction(&CRC32::ComputeCRC32Castagnoli);
}
@@ -38,11 +41,11 @@ static void EmitCRC32ISO(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, co
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.HasHostFeature(HostFeature::PCLMULQDQ) && data_size < 32) {
const Xbyak::Reg32 crc = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
const Xbyak::Reg64 value = ctx.reg_alloc.UseScratchGpr(args[1]);
const Xbyak::Xmm xmm_value = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm xmm_const = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm xmm_tmp = ctx.reg_alloc.ScratchXmm();
const Xbyak::Reg32 crc = ctx.reg_alloc.UseScratchGpr(code, args[0]).cvt32();
const Xbyak::Reg64 value = ctx.reg_alloc.UseScratchGpr(code, args[1]);
const Xbyak::Xmm xmm_value = ctx.reg_alloc.ScratchXmm(code);
const Xbyak::Xmm xmm_const = ctx.reg_alloc.ScratchXmm(code);
const Xbyak::Xmm xmm_tmp = ctx.reg_alloc.ScratchXmm(code);
code.movdqa(xmm_const, code.Const(xword, 0xb4e5b025'f7011641, 0x00000001'DB710641));
@@ -64,12 +67,12 @@ static void EmitCRC32ISO(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, co
code.pextrd(crc, xmm_value, 2);
ctx.reg_alloc.DefineValue(inst, crc);
ctx.reg_alloc.DefineValue(code, inst, crc);
} else if (code.HasHostFeature(HostFeature::PCLMULQDQ) && data_size == 32) {
const Xbyak::Reg32 crc = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
const Xbyak::Reg32 value = ctx.reg_alloc.UseGpr(args[1]).cvt32();
const Xbyak::Xmm xmm_value = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm xmm_const = ctx.reg_alloc.ScratchXmm();
const Xbyak::Reg32 crc = ctx.reg_alloc.UseScratchGpr(code, args[0]).cvt32();
const Xbyak::Reg32 value = ctx.reg_alloc.UseGpr(code, args[1]).cvt32();
const Xbyak::Xmm xmm_value = ctx.reg_alloc.ScratchXmm(code);
const Xbyak::Xmm xmm_const = ctx.reg_alloc.ScratchXmm(code);
code.movdqa(xmm_const, code.Const(xword, 0xb4e5b025'f7011641, 0x00000001'DB710641));
@@ -82,12 +85,12 @@ static void EmitCRC32ISO(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, co
code.pextrd(crc, xmm_value, 2);
ctx.reg_alloc.DefineValue(inst, crc);
ctx.reg_alloc.DefineValue(code, inst, crc);
} else if (code.HasHostFeature(HostFeature::PCLMULQDQ) && data_size == 64) {
const Xbyak::Reg32 crc = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
const Xbyak::Reg64 value = ctx.reg_alloc.UseGpr(args[1]);
const Xbyak::Xmm xmm_value = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm xmm_const = ctx.reg_alloc.ScratchXmm();
const Xbyak::Reg32 crc = ctx.reg_alloc.UseScratchGpr(code, args[0]).cvt32();
const Xbyak::Reg64 value = ctx.reg_alloc.UseGpr(code, args[1]);
const Xbyak::Xmm xmm_value = ctx.reg_alloc.ScratchXmm(code);
const Xbyak::Xmm xmm_const = ctx.reg_alloc.ScratchXmm(code);
code.movdqa(xmm_const, code.Const(xword, 0xb4e5b025'f7011641, 0x00000001'DB710641));
@@ -100,9 +103,9 @@ static void EmitCRC32ISO(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, co
code.pextrd(crc, xmm_value, 2);
ctx.reg_alloc.DefineValue(inst, crc);
ctx.reg_alloc.DefineValue(code, inst, crc);
} else {
ctx.reg_alloc.HostCall(inst, args[0], args[1], {});
ctx.reg_alloc.HostCall(code, inst, args[0], args[1], {});
code.mov(code.ABI_PARAM3, data_size / CHAR_BIT);
code.CallFunction(&CRC32::ComputeCRC32ISO);
}
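Without PCLMULQDQ the ISO variant falls back to CRC32::ComputeCRC32ISO through HostCall, passing the bit width in ABI_PARAM3. A bit-at-a-time reference of that computation: reflected CRC-32 with polynomial 0xEDB88320, the 32-bit reflection of the 0x00000001'DB710641 constant loaded for the clmul paths above (the fallback's exact implementation is an assumption; this is just the defining recurrence):

```cpp
#include <cstdint>

// Reflected CRC-32/ISO-HDLC over the low `data_size` bits of `value`;
// data_size is 8, 16, 32 or 64, matching the emitter's instantiations.
uint32_t crc32_iso_reference(uint32_t crc, uint64_t value, int data_size) {
    for (int i = 0; i < data_size; ++i) {
        const uint32_t bit = (crc ^ static_cast<uint32_t>(value >> i)) & 1u;
        crc = (crc >> 1) ^ (bit ? 0xEDB88320u : 0u);
    }
    return crc;
}
```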


@@ -54,14 +54,14 @@ void AxxEmitX64::EmitMemoryRead(AxxEmitContext& ctx, IR::Inst* inst) {
if (!conf.page_table && !fastmem_marker) {
// Neither fastmem nor page table: Use callbacks
if constexpr (bitsize == 128) {
ctx.reg_alloc.HostCall(nullptr, {}, args[1]);
ctx.reg_alloc.HostCall(code, nullptr, {}, args[1]);
if (ordered) {
code.mfence();
}
code.CallFunction(memory_read_128);
ctx.reg_alloc.DefineValue(inst, xmm1);
ctx.reg_alloc.DefineValue(code, inst, xmm1);
} else {
ctx.reg_alloc.HostCall(inst, {}, args[1]);
ctx.reg_alloc.HostCall(code, inst, {}, args[1]);
if (ordered) {
code.mfence();
}
@@ -74,14 +74,14 @@ void AxxEmitX64::EmitMemoryRead(AxxEmitContext& ctx, IR::Inst* inst) {
if (ordered && bitsize == 128) {
// Required for atomic 128-bit loads/stores
ctx.reg_alloc.ScratchGpr(HostLoc::RAX);
ctx.reg_alloc.ScratchGpr(HostLoc::RBX);
ctx.reg_alloc.ScratchGpr(HostLoc::RCX);
ctx.reg_alloc.ScratchGpr(HostLoc::RDX);
ctx.reg_alloc.ScratchGpr(code, HostLoc::RAX);
ctx.reg_alloc.ScratchGpr(code, HostLoc::RBX);
ctx.reg_alloc.ScratchGpr(code, HostLoc::RCX);
ctx.reg_alloc.ScratchGpr(code, HostLoc::RDX);
}
const Xbyak::Reg64 vaddr = ctx.reg_alloc.UseGpr(args[1]);
const int value_idx = bitsize == 128 ? ctx.reg_alloc.ScratchXmm().getIdx() : ctx.reg_alloc.ScratchGpr().getIdx();
const Xbyak::Reg64 vaddr = ctx.reg_alloc.UseGpr(code, args[1]);
const int value_idx = bitsize == 128 ? ctx.reg_alloc.ScratchXmm(code).getIdx() : ctx.reg_alloc.ScratchGpr(code).getIdx();
const auto wrapped_fn = read_fallbacks[std::make_tuple(ordered, bitsize, vaddr.getIdx(), value_idx)];
@@ -126,9 +126,9 @@ void AxxEmitX64::EmitMemoryRead(AxxEmitContext& ctx, IR::Inst* inst) {
code.L(*end);
if constexpr (bitsize == 128) {
ctx.reg_alloc.DefineValue(inst, Xbyak::Xmm{value_idx});
ctx.reg_alloc.DefineValue(code, inst, Xbyak::Xmm{value_idx});
} else {
ctx.reg_alloc.DefineValue(inst, Xbyak::Reg64{value_idx});
ctx.reg_alloc.DefineValue(code, inst, Xbyak::Reg64{value_idx});
}
}
@@ -141,13 +141,13 @@ void AxxEmitX64::EmitMemoryWrite(AxxEmitContext& ctx, IR::Inst* inst) {
if (!conf.page_table && !fastmem_marker) {
// Neither fastmem nor page table: Use callbacks
if constexpr (bitsize == 128) {
ctx.reg_alloc.Use(args[1], ABI_PARAM2);
ctx.reg_alloc.Use(args[2], HostLoc::XMM1);
ctx.reg_alloc.Use(code, args[1], ABI_PARAM2);
ctx.reg_alloc.Use(code, args[2], HostLoc::XMM1);
ctx.reg_alloc.EndOfAllocScope();
ctx.reg_alloc.HostCall(nullptr);
ctx.reg_alloc.HostCall(code, nullptr);
code.CallFunction(memory_write_128);
} else {
ctx.reg_alloc.HostCall(nullptr, {}, args[1], args[2]);
ctx.reg_alloc.HostCall(code, nullptr, {}, args[1], args[2]);
Devirtualize<callback>(conf.callbacks).EmitCall(code);
}
if (ordered) {
@@ -159,16 +159,16 @@ void AxxEmitX64::EmitMemoryWrite(AxxEmitContext& ctx, IR::Inst* inst) {
if (ordered && bitsize == 128) {
// Required for atomic 128-bit loads/stores
ctx.reg_alloc.ScratchGpr(HostLoc::RAX);
ctx.reg_alloc.ScratchGpr(HostLoc::RBX);
ctx.reg_alloc.ScratchGpr(HostLoc::RCX);
ctx.reg_alloc.ScratchGpr(HostLoc::RDX);
ctx.reg_alloc.ScratchGpr(code, HostLoc::RAX);
ctx.reg_alloc.ScratchGpr(code, HostLoc::RBX);
ctx.reg_alloc.ScratchGpr(code, HostLoc::RCX);
ctx.reg_alloc.ScratchGpr(code, HostLoc::RDX);
}
const Xbyak::Reg64 vaddr = ctx.reg_alloc.UseGpr(args[1]);
const Xbyak::Reg64 vaddr = ctx.reg_alloc.UseGpr(code, args[1]);
const int value_idx = bitsize == 128
? ctx.reg_alloc.UseXmm(args[2]).getIdx()
: (ordered ? ctx.reg_alloc.UseScratchGpr(args[2]).getIdx() : ctx.reg_alloc.UseGpr(args[2]).getIdx());
? ctx.reg_alloc.UseXmm(code, args[2]).getIdx()
: (ordered ? ctx.reg_alloc.UseScratchGpr(code, args[2]).getIdx() : ctx.reg_alloc.UseGpr(code, args[2]).getIdx());
const auto wrapped_fn = write_fallbacks[std::make_tuple(ordered, bitsize, vaddr.getIdx(), value_idx)];
@@ -222,7 +222,7 @@ void AxxEmitX64::EmitExclusiveReadMemory(AxxEmitContext& ctx, IR::Inst* inst) {
if constexpr (bitsize != 128) {
using T = mcl::unsigned_integer_of_size<bitsize>;
ctx.reg_alloc.HostCall(inst, {}, args[1]);
ctx.reg_alloc.HostCall(code, inst, {}, args[1]);
code.mov(code.byte[code.ABI_JIT_PTR + offsetof(AxxJitState, exclusive_state)], u8(1));
code.mov(code.ABI_PARAM1, reinterpret_cast<u64>(&conf));
@@ -237,14 +237,14 @@ void AxxEmitX64::EmitExclusiveReadMemory(AxxEmitContext& ctx, IR::Inst* inst) {
});
code.ZeroExtendFrom(bitsize, code.ABI_RETURN);
} else {
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
ctx.reg_alloc.Use(args[1], ABI_PARAM2);
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
ctx.reg_alloc.Use(code, args[1], ABI_PARAM2);
ctx.reg_alloc.EndOfAllocScope();
ctx.reg_alloc.HostCall(nullptr);
ctx.reg_alloc.HostCall(code, nullptr);
code.mov(code.byte[code.ABI_JIT_PTR + offsetof(AxxJitState, exclusive_state)], u8(1));
code.mov(code.ABI_PARAM1, reinterpret_cast<u64>(&conf));
ctx.reg_alloc.AllocStackSpace(16 + ABI_SHADOW_SPACE);
ctx.reg_alloc.AllocStackSpace(code, 16 + ABI_SHADOW_SPACE);
code.lea(code.ABI_PARAM3, ptr[rsp + ABI_SHADOW_SPACE]);
if (ordered) {
code.mfence();
@@ -256,9 +256,9 @@ void AxxEmitX64::EmitExclusiveReadMemory(AxxEmitContext& ctx, IR::Inst* inst) {
});
});
code.movups(result, xword[rsp + ABI_SHADOW_SPACE]);
ctx.reg_alloc.ReleaseStackSpace(16 + ABI_SHADOW_SPACE);
ctx.reg_alloc.ReleaseStackSpace(code, 16 + ABI_SHADOW_SPACE);
ctx.reg_alloc.DefineValue(inst, result);
ctx.reg_alloc.DefineValue(code, inst, result);
}
EmitCheckMemoryAbort(ctx, inst);
@@ -271,15 +271,15 @@ void AxxEmitX64::EmitExclusiveWriteMemory(AxxEmitContext& ctx, IR::Inst* inst) {
const bool ordered = IsOrdered(args[3].GetImmediateAccType());
if constexpr (bitsize == 128) {
ctx.reg_alloc.Use(args[1], ABI_PARAM2);
ctx.reg_alloc.Use(args[2], HostLoc::XMM1);
ctx.reg_alloc.Use(code, args[1], ABI_PARAM2);
ctx.reg_alloc.Use(code, args[2], HostLoc::XMM1);
ctx.reg_alloc.EndOfAllocScope();
ctx.reg_alloc.HostCall(inst);
ctx.reg_alloc.HostCall(code, inst);
} else {
ctx.reg_alloc.HostCall(inst, {}, args[1], args[2]);
ctx.reg_alloc.HostCall(code, inst, {}, args[1], args[2]);
}
const Xbyak::Reg64 tmp = ctx.reg_alloc.ScratchGpr();
const Xbyak::Reg64 tmp = ctx.reg_alloc.ScratchGpr(code);
Xbyak::Label end;
code.mov(code.ABI_RETURN, u32(1));
code.movzx(tmp.cvt32(), code.byte[code.ABI_JIT_PTR + offsetof(AxxJitState, exclusive_state)]);
@@ -299,7 +299,7 @@ void AxxEmitX64::EmitExclusiveWriteMemory(AxxEmitContext& ctx, IR::Inst* inst) {
code.mfence();
}
} else {
ctx.reg_alloc.AllocStackSpace(16 + ABI_SHADOW_SPACE);
ctx.reg_alloc.AllocStackSpace(code, 16 + ABI_SHADOW_SPACE);
code.lea(code.ABI_PARAM3, ptr[rsp + ABI_SHADOW_SPACE]);
code.movaps(xword[code.ABI_PARAM3], xmm1);
code.CallLambda([](AxxUserConfig& conf, Axx::VAddr vaddr, Vector& value) -> u32 {
@@ -310,7 +310,7 @@ void AxxEmitX64::EmitExclusiveWriteMemory(AxxEmitContext& ctx, IR::Inst* inst) {
if (ordered) {
code.mfence();
}
ctx.reg_alloc.ReleaseStackSpace(16 + ABI_SHADOW_SPACE);
ctx.reg_alloc.ReleaseStackSpace(code, 16 + ABI_SHADOW_SPACE);
}
code.L(end);
@@ -330,16 +330,16 @@ void AxxEmitX64::EmitExclusiveReadMemoryInline(AxxEmitContext& ctx, IR::Inst* in
if constexpr (ordered && bitsize == 128) {
// Required for atomic 128-bit loads/stores
ctx.reg_alloc.ScratchGpr(HostLoc::RAX);
ctx.reg_alloc.ScratchGpr(HostLoc::RBX);
ctx.reg_alloc.ScratchGpr(HostLoc::RCX);
ctx.reg_alloc.ScratchGpr(HostLoc::RDX);
ctx.reg_alloc.ScratchGpr(code, HostLoc::RAX);
ctx.reg_alloc.ScratchGpr(code, HostLoc::RBX);
ctx.reg_alloc.ScratchGpr(code, HostLoc::RCX);
ctx.reg_alloc.ScratchGpr(code, HostLoc::RDX);
}
const Xbyak::Reg64 vaddr = ctx.reg_alloc.UseGpr(args[1]);
const int value_idx = bitsize == 128 ? ctx.reg_alloc.ScratchXmm().getIdx() : ctx.reg_alloc.ScratchGpr().getIdx();
const Xbyak::Reg64 tmp = ctx.reg_alloc.ScratchGpr();
const Xbyak::Reg64 tmp2 = ctx.reg_alloc.ScratchGpr();
const Xbyak::Reg64 vaddr = ctx.reg_alloc.UseGpr(code, args[1]);
const int value_idx = bitsize == 128 ? ctx.reg_alloc.ScratchXmm(code).getIdx() : ctx.reg_alloc.ScratchGpr(code).getIdx();
const Xbyak::Reg64 tmp = ctx.reg_alloc.ScratchGpr(code);
const Xbyak::Reg64 tmp2 = ctx.reg_alloc.ScratchGpr(code);
const auto wrapped_fn = read_fallbacks[std::make_tuple(ordered, bitsize, vaddr.getIdx(), value_idx)];
@@ -386,9 +386,9 @@ void AxxEmitX64::EmitExclusiveReadMemoryInline(AxxEmitContext& ctx, IR::Inst* in
EmitExclusiveUnlock(code, conf, tmp, tmp2.cvt32());
if constexpr (bitsize == 128) {
ctx.reg_alloc.DefineValue(inst, Xbyak::Xmm{value_idx});
ctx.reg_alloc.DefineValue(code, inst, Xbyak::Xmm{value_idx});
} else {
ctx.reg_alloc.DefineValue(inst, Xbyak::Reg64{value_idx});
ctx.reg_alloc.DefineValue(code, inst, Xbyak::Reg64{value_idx});
}
EmitCheckMemoryAbort(ctx, inst);
@@ -407,19 +407,19 @@ void AxxEmitX64::EmitExclusiveWriteMemoryInline(AxxEmitContext& ctx, IR::Inst* i
const auto value = [&] {
if constexpr (bitsize == 128) {
ctx.reg_alloc.ScratchGpr(HostLoc::RAX);
ctx.reg_alloc.ScratchGpr(HostLoc::RBX);
ctx.reg_alloc.ScratchGpr(HostLoc::RCX);
ctx.reg_alloc.ScratchGpr(HostLoc::RDX);
return ctx.reg_alloc.UseXmm(args[2]);
ctx.reg_alloc.ScratchGpr(code, HostLoc::RAX);
ctx.reg_alloc.ScratchGpr(code, HostLoc::RBX);
ctx.reg_alloc.ScratchGpr(code, HostLoc::RCX);
ctx.reg_alloc.ScratchGpr(code, HostLoc::RDX);
return ctx.reg_alloc.UseXmm(code, args[2]);
} else {
ctx.reg_alloc.ScratchGpr(HostLoc::RAX);
return ctx.reg_alloc.UseGpr(args[2]);
ctx.reg_alloc.ScratchGpr(code, HostLoc::RAX);
return ctx.reg_alloc.UseGpr(code, args[2]);
}
}();
const Xbyak::Reg64 vaddr = ctx.reg_alloc.UseGpr(args[1]);
const Xbyak::Reg32 status = ctx.reg_alloc.ScratchGpr().cvt32();
const Xbyak::Reg64 tmp = ctx.reg_alloc.ScratchGpr();
const Xbyak::Reg64 vaddr = ctx.reg_alloc.UseGpr(code, args[1]);
const Xbyak::Reg32 status = ctx.reg_alloc.ScratchGpr(code).cvt32();
const Xbyak::Reg64 tmp = ctx.reg_alloc.ScratchGpr(code);
const auto wrapped_fn = exclusive_write_fallbacks[std::make_tuple(ordered, bitsize, vaddr.getIdx(), value.getIdx())];
@@ -518,7 +518,7 @@ void AxxEmitX64::EmitExclusiveWriteMemoryInline(AxxEmitContext& ctx, IR::Inst* i
code.L(*end);
EmitExclusiveUnlock(code, conf, tmp, eax);
ctx.reg_alloc.DefineValue(inst, status);
ctx.reg_alloc.DefineValue(code, inst, status);
EmitCheckMemoryAbort(ctx, inst);
}
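The repeated ScratchGpr(code, HostLoc::RAX/RBX/RCX/RDX) blocks in this file exist because the only generally guaranteed way for x86-64 to access 16 bytes atomically is LOCK CMPXCHG16B, whose operands are hardwired: RDX:RAX holds the compare value and RCX:RBX the replacement, so all four registers must be reserved before a 128-bit ordered access is emitted. A hypothetical illustration of the resulting load idiom in GCC inline assembly (not the emitter's actual fallback; needs -mcx16 and a 16-byte-aligned operand):

```cpp
#include <cstdint>

struct alignas(16) U128 { uint64_t lo, hi; };

// Atomic 128-bit load via compare-exchange: if *ptr == 0 it stores 0 back
// (a no-op write), otherwise the current value is loaded into RDX:RAX.
// Either way RDX:RAX ends up holding the memory contents, read atomically.
inline U128 atomic_load_128(U128* ptr) {
    U128 expected{0, 0};
    __asm__ __volatile__(
        "lock cmpxchg16b %[mem]"
        : "+a"(expected.lo), "+d"(expected.hi), [mem] "+m"(*ptr)
        : "b"(uint64_t{0}), "c"(uint64_t{0})
        : "cc");
    return expected;
}
```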


@@ -75,8 +75,8 @@ Xbyak::RegExp EmitVAddrLookup(BlockOfCode& code, EmitContext& ctx, size_t bitsiz
template<>
[[maybe_unused]] Xbyak::RegExp EmitVAddrLookup<A32EmitContext>(BlockOfCode& code, A32EmitContext& ctx, size_t bitsize, Xbyak::Label& abort, Xbyak::Reg64 vaddr) {
const Xbyak::Reg64 page = ctx.reg_alloc.ScratchGpr();
const Xbyak::Reg32 tmp = ctx.conf.absolute_offset_page_table ? page.cvt32() : ctx.reg_alloc.ScratchGpr().cvt32();
const Xbyak::Reg64 page = ctx.reg_alloc.ScratchGpr(code);
const Xbyak::Reg32 tmp = ctx.conf.absolute_offset_page_table ? page.cvt32() : ctx.reg_alloc.ScratchGpr(code).cvt32();
EmitDetectMisalignedVAddr(code, ctx, bitsize, abort, vaddr, tmp.cvt64());
@@ -105,8 +105,8 @@ template<>
const size_t valid_page_index_bits = ctx.conf.page_table_address_space_bits - page_bits;
const size_t unused_top_bits = 64 - ctx.conf.page_table_address_space_bits;
const Xbyak::Reg64 page = ctx.reg_alloc.ScratchGpr();
const Xbyak::Reg64 tmp = ctx.conf.absolute_offset_page_table ? page : ctx.reg_alloc.ScratchGpr();
const Xbyak::Reg64 page = ctx.reg_alloc.ScratchGpr(code);
const Xbyak::Reg64 tmp = ctx.conf.absolute_offset_page_table ? page : ctx.reg_alloc.ScratchGpr(code);
EmitDetectMisalignedVAddr(code, ctx, bitsize, abort, vaddr, tmp);
@@ -116,7 +116,7 @@ template<>
} else if (ctx.conf.silently_mirror_page_table) {
if (valid_page_index_bits >= 32) {
if (code.HasHostFeature(HostFeature::BMI2)) {
const Xbyak::Reg64 bit_count = ctx.reg_alloc.ScratchGpr();
const Xbyak::Reg64 bit_count = ctx.reg_alloc.ScratchGpr(code);
code.mov(bit_count, unused_top_bits);
code.bzhi(tmp, vaddr, bit_count);
code.shr(tmp, int(page_bits));
@@ -168,7 +168,7 @@ template<>
return r13 + vaddr;
} else if (ctx.conf.silently_mirror_fastmem) {
if (!tmp) {
tmp = ctx.reg_alloc.ScratchGpr();
tmp = ctx.reg_alloc.ScratchGpr(code);
}
if (unused_top_bits < 32) {
code.mov(*tmp, vaddr);
@@ -189,7 +189,7 @@ template<>
} else {
// TODO: Consider having TEST as above but coalesce 64-bit constant in register allocator
if (!tmp) {
tmp = ctx.reg_alloc.ScratchGpr();
tmp = ctx.reg_alloc.ScratchGpr(code);
}
code.mov(*tmp, vaddr);
code.shr(*tmp, int(ctx.conf.fastmem_address_space_bits));


@@ -1,3 +1,6 @@
// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
// SPDX-License-Identifier: GPL-3.0-or-later
/* This file is part of the dynarmic project.
* Copyright (c) 2016 MerryMage
* SPDX-License-Identifier: 0BSD
@@ -16,14 +19,14 @@ void EmitX64::EmitPackedAddU8(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp);
const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
code.paddb(xmm_a, xmm_b);
if (ge_inst) {
const Xbyak::Xmm xmm_ge = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm ones = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm xmm_ge = ctx.reg_alloc.ScratchXmm(code);
const Xbyak::Xmm ones = ctx.reg_alloc.ScratchXmm(code);
code.pcmpeqb(ones, ones);
@@ -32,21 +35,21 @@ void EmitX64::EmitPackedAddU8(EmitContext& ctx, IR::Inst* inst) {
code.pcmpeqb(xmm_ge, xmm_b);
code.pxor(xmm_ge, ones);
ctx.reg_alloc.DefineValue(ge_inst, xmm_ge);
ctx.reg_alloc.DefineValue(code, ge_inst, xmm_ge);
}
ctx.reg_alloc.DefineValue(inst, xmm_a);
ctx.reg_alloc.DefineValue(code, inst, xmm_a);
}
void EmitX64::EmitPackedAddS8(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp);
const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
if (ge_inst) {
const Xbyak::Xmm xmm_ge = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm xmm_ge = ctx.reg_alloc.ScratchXmm(code);
code.pcmpeqb(xmm0, xmm0);
@@ -54,27 +57,27 @@ void EmitX64::EmitPackedAddS8(EmitContext& ctx, IR::Inst* inst) {
code.paddsb(xmm_ge, xmm_b);
code.pcmpgtb(xmm_ge, xmm0);
ctx.reg_alloc.DefineValue(ge_inst, xmm_ge);
ctx.reg_alloc.DefineValue(code, ge_inst, xmm_ge);
}
code.paddb(xmm_a, xmm_b);
ctx.reg_alloc.DefineValue(inst, xmm_a);
ctx.reg_alloc.DefineValue(code, inst, xmm_a);
}
void EmitX64::EmitPackedAddU16(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp);
const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
code.paddw(xmm_a, xmm_b);
if (ge_inst) {
if (code.HasHostFeature(HostFeature::SSE41)) {
const Xbyak::Xmm xmm_ge = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm ones = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm xmm_ge = ctx.reg_alloc.ScratchXmm(code);
const Xbyak::Xmm ones = ctx.reg_alloc.ScratchXmm(code);
code.pcmpeqb(ones, ones);
@@ -83,10 +86,10 @@ void EmitX64::EmitPackedAddU16(EmitContext& ctx, IR::Inst* inst) {
code.pcmpeqw(xmm_ge, xmm_b);
code.pxor(xmm_ge, ones);
ctx.reg_alloc.DefineValue(ge_inst, xmm_ge);
ctx.reg_alloc.DefineValue(code, ge_inst, xmm_ge);
} else {
const Xbyak::Xmm tmp_a = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm tmp_b = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm tmp_a = ctx.reg_alloc.ScratchXmm(code);
const Xbyak::Xmm tmp_b = ctx.reg_alloc.ScratchXmm(code);
// !(b <= a+b) == b > a+b
code.movdqa(tmp_a, xmm_a);
@@ -95,22 +98,22 @@ void EmitX64::EmitPackedAddU16(EmitContext& ctx, IR::Inst* inst) {
code.paddw(tmp_b, code.Const(xword, 0x80008000));
code.pcmpgtw(tmp_b, tmp_a); // *Signed* comparison!
ctx.reg_alloc.DefineValue(ge_inst, tmp_b);
ctx.reg_alloc.DefineValue(code, ge_inst, tmp_b);
}
}
ctx.reg_alloc.DefineValue(inst, xmm_a);
ctx.reg_alloc.DefineValue(code, inst, xmm_a);
}
void EmitX64::EmitPackedAddS16(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp);
const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
if (ge_inst) {
const Xbyak::Xmm xmm_ge = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm xmm_ge = ctx.reg_alloc.ScratchXmm(code);
code.pcmpeqw(xmm0, xmm0);
@@ -118,45 +121,45 @@ void EmitX64::EmitPackedAddS16(EmitContext& ctx, IR::Inst* inst) {
code.paddsw(xmm_ge, xmm_b);
code.pcmpgtw(xmm_ge, xmm0);
ctx.reg_alloc.DefineValue(ge_inst, xmm_ge);
ctx.reg_alloc.DefineValue(code, ge_inst, xmm_ge);
}
code.paddw(xmm_a, xmm_b);
ctx.reg_alloc.DefineValue(inst, xmm_a);
ctx.reg_alloc.DefineValue(code, inst, xmm_a);
}
void EmitX64::EmitPackedSubU8(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp);
const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
if (ge_inst) {
const Xbyak::Xmm xmm_ge = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm xmm_ge = ctx.reg_alloc.ScratchXmm(code);
code.movdqa(xmm_ge, xmm_a);
code.pmaxub(xmm_ge, xmm_b);
code.pcmpeqb(xmm_ge, xmm_a);
ctx.reg_alloc.DefineValue(ge_inst, xmm_ge);
ctx.reg_alloc.DefineValue(code, ge_inst, xmm_ge);
}
code.psubb(xmm_a, xmm_b);
ctx.reg_alloc.DefineValue(inst, xmm_a);
ctx.reg_alloc.DefineValue(code, inst, xmm_a);
}
void EmitX64::EmitPackedSubS8(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp);
const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
if (ge_inst) {
const Xbyak::Xmm xmm_ge = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm xmm_ge = ctx.reg_alloc.ScratchXmm(code);
code.pcmpeqb(xmm0, xmm0);
@@ -164,12 +167,12 @@ void EmitX64::EmitPackedSubS8(EmitContext& ctx, IR::Inst* inst) {
code.psubsb(xmm_ge, xmm_b);
code.pcmpgtb(xmm_ge, xmm0);
ctx.reg_alloc.DefineValue(ge_inst, xmm_ge);
ctx.reg_alloc.DefineValue(code, ge_inst, xmm_ge);
}
code.psubb(xmm_a, xmm_b);
ctx.reg_alloc.DefineValue(inst, xmm_a);
ctx.reg_alloc.DefineValue(code, inst, xmm_a);
}
void EmitX64::EmitPackedSubU16(EmitContext& ctx, IR::Inst* inst) {
@@ -177,19 +180,19 @@ void EmitX64::EmitPackedSubU16(EmitContext& ctx, IR::Inst* inst) {
const auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp);
if (!ge_inst) {
const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
code.psubw(xmm_a, xmm_b);
ctx.reg_alloc.DefineValue(inst, xmm_a);
ctx.reg_alloc.DefineValue(code, inst, xmm_a);
return;
}
if (code.HasHostFeature(HostFeature::SSE41)) {
const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
const Xbyak::Xmm xmm_ge = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
const Xbyak::Xmm xmm_ge = ctx.reg_alloc.ScratchXmm(code);
code.movdqa(xmm_ge, xmm_a);
code.pmaxuw(xmm_ge, xmm_b); // Requires SSE 4.1
@@ -197,15 +200,15 @@ void EmitX64::EmitPackedSubU16(EmitContext& ctx, IR::Inst* inst) {
code.psubw(xmm_a, xmm_b);
ctx.reg_alloc.DefineValue(ge_inst, xmm_ge);
ctx.reg_alloc.DefineValue(inst, xmm_a);
ctx.reg_alloc.DefineValue(code, ge_inst, xmm_ge);
ctx.reg_alloc.DefineValue(code, inst, xmm_a);
return;
}
const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseScratchXmm(args[1]);
const Xbyak::Xmm xmm_ge = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm ones = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
const Xbyak::Xmm xmm_ge = ctx.reg_alloc.ScratchXmm(code);
const Xbyak::Xmm ones = ctx.reg_alloc.ScratchXmm(code);
// (a >= b) == !(b > a)
code.pcmpeqb(ones, ones);
@@ -217,19 +220,19 @@ void EmitX64::EmitPackedSubU16(EmitContext& ctx, IR::Inst* inst) {
code.psubw(xmm_a, xmm_b);
ctx.reg_alloc.DefineValue(ge_inst, xmm_ge);
ctx.reg_alloc.DefineValue(inst, xmm_a);
ctx.reg_alloc.DefineValue(code, ge_inst, xmm_ge);
ctx.reg_alloc.DefineValue(code, inst, xmm_a);
}
void EmitX64::EmitPackedSubS16(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp);
const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
if (ge_inst) {
const Xbyak::Xmm xmm_ge = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm xmm_ge = ctx.reg_alloc.ScratchXmm(code);
code.pcmpeqw(xmm0, xmm0);
@@ -237,21 +240,21 @@ void EmitX64::EmitPackedSubS16(EmitContext& ctx, IR::Inst* inst) {
code.psubsw(xmm_ge, xmm_b);
code.pcmpgtw(xmm_ge, xmm0);
ctx.reg_alloc.DefineValue(ge_inst, xmm_ge);
ctx.reg_alloc.DefineValue(code, ge_inst, xmm_ge);
}
code.psubw(xmm_a, xmm_b);
ctx.reg_alloc.DefineValue(inst, xmm_a);
ctx.reg_alloc.DefineValue(code, inst, xmm_a);
}
void EmitX64::EmitPackedHalvingAddU8(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (args[0].IsInXmm() || args[1].IsInXmm()) {
const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseScratchXmm(args[1]);
const Xbyak::Xmm ones = ctx.reg_alloc.ScratchXmm();
if (args[0].IsInXmm(ctx.reg_alloc) || args[1].IsInXmm(ctx.reg_alloc)) {
const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
const Xbyak::Xmm ones = ctx.reg_alloc.ScratchXmm(code);
// Since,
// pavg(a, b) == (a + b + 1) >> 1
@@ -264,11 +267,11 @@ void EmitX64::EmitPackedHalvingAddU8(EmitContext& ctx, IR::Inst* inst) {
code.pavgb(xmm_a, xmm_b);
code.pxor(xmm_a, ones);
ctx.reg_alloc.DefineValue(inst, xmm_a);
ctx.reg_alloc.DefineValue(code, inst, xmm_a);
} else {
const Xbyak::Reg32 reg_a = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
const Xbyak::Reg32 reg_b = ctx.reg_alloc.UseGpr(args[1]).cvt32();
const Xbyak::Reg32 xor_a_b = ctx.reg_alloc.ScratchGpr().cvt32();
const Xbyak::Reg32 reg_a = ctx.reg_alloc.UseScratchGpr(code, args[0]).cvt32();
const Xbyak::Reg32 reg_b = ctx.reg_alloc.UseGpr(code, args[1]).cvt32();
const Xbyak::Reg32 xor_a_b = ctx.reg_alloc.ScratchGpr(code).cvt32();
const Xbyak::Reg32 and_a_b = reg_a;
const Xbyak::Reg32 result = reg_a;
@@ -284,17 +287,17 @@ void EmitX64::EmitPackedHalvingAddU8(EmitContext& ctx, IR::Inst* inst) {
code.and_(xor_a_b, 0x7F7F7F7F);
code.add(result, xor_a_b);
ctx.reg_alloc.DefineValue(inst, result);
ctx.reg_alloc.DefineValue(code, inst, result);
}
}
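
The GPR branch is a SWAR rendering of the identity x + y == 2*(x & y) + (x ^ y): halving both sides gives (x + y) >> 1 == (x & y) + ((x ^ y) >> 1), which cannot overflow a lane, and the 0x7F7F7F7F mask discards the bit each byte would otherwise receive from its neighbour during the shared 32-bit shift. A sketch of all four lanes at once (illustrative):

#include <cstdint>

// Four packed u8 halving adds in one 32-bit register, mirroring the
// xor/and/shr/and/add sequence emitted above.
inline std::uint32_t HalvingAddU8x4(std::uint32_t a, std::uint32_t b) {
    const std::uint32_t and_ab = a & b;
    std::uint32_t xor_ab = a ^ b;
    xor_ab = (xor_ab >> 1) & 0x7F7F7F7Fu;  // per-lane >>1; mask cross-lane bits
    return and_ab + xor_ab;                // (a & b) + ((a ^ b) >> 1), lane-wise
}
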
void EmitX64::EmitPackedHalvingAddU16(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (args[0].IsInXmm() || args[1].IsInXmm()) {
const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
if (args[0].IsInXmm(ctx.reg_alloc) || args[1].IsInXmm(ctx.reg_alloc)) {
const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
code.movdqa(tmp, xmm_a);
code.pand(xmm_a, xmm_b);
@@ -302,11 +305,11 @@ void EmitX64::EmitPackedHalvingAddU16(EmitContext& ctx, IR::Inst* inst) {
code.psrlw(tmp, 1);
code.paddw(xmm_a, tmp);
ctx.reg_alloc.DefineValue(inst, xmm_a);
ctx.reg_alloc.DefineValue(code, inst, xmm_a);
} else {
const Xbyak::Reg32 reg_a = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
const Xbyak::Reg32 reg_b = ctx.reg_alloc.UseGpr(args[1]).cvt32();
const Xbyak::Reg32 xor_a_b = ctx.reg_alloc.ScratchGpr().cvt32();
const Xbyak::Reg32 reg_a = ctx.reg_alloc.UseScratchGpr(code, args[0]).cvt32();
const Xbyak::Reg32 reg_b = ctx.reg_alloc.UseGpr(code, args[1]).cvt32();
const Xbyak::Reg32 xor_a_b = ctx.reg_alloc.ScratchGpr(code).cvt32();
const Xbyak::Reg32 and_a_b = reg_a;
const Xbyak::Reg32 result = reg_a;
@@ -322,19 +325,19 @@ void EmitX64::EmitPackedHalvingAddU16(EmitContext& ctx, IR::Inst* inst) {
code.and_(xor_a_b, 0x7FFF7FFF);
code.add(result, xor_a_b);
ctx.reg_alloc.DefineValue(inst, result);
ctx.reg_alloc.DefineValue(code, inst, result);
}
}
void EmitX64::EmitPackedHalvingAddS8(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Reg32 reg_a = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
const Xbyak::Reg32 reg_b = ctx.reg_alloc.UseGpr(args[1]).cvt32();
const Xbyak::Reg32 xor_a_b = ctx.reg_alloc.ScratchGpr().cvt32();
const Xbyak::Reg32 reg_a = ctx.reg_alloc.UseScratchGpr(code, args[0]).cvt32();
const Xbyak::Reg32 reg_b = ctx.reg_alloc.UseGpr(code, args[1]).cvt32();
const Xbyak::Reg32 xor_a_b = ctx.reg_alloc.ScratchGpr(code).cvt32();
const Xbyak::Reg32 and_a_b = reg_a;
const Xbyak::Reg32 result = reg_a;
const Xbyak::Reg32 carry = ctx.reg_alloc.ScratchGpr().cvt32();
const Xbyak::Reg32 carry = ctx.reg_alloc.ScratchGpr(code).cvt32();
// This relies on the equality x+y == ((x&y) << 1) + (x^y).
// Note that x^y always contains the LSB of the result.
@@ -352,15 +355,15 @@ void EmitX64::EmitPackedHalvingAddS8(EmitContext& ctx, IR::Inst* inst) {
code.add(result, xor_a_b);
code.xor_(result, carry);
ctx.reg_alloc.DefineValue(inst, result);
ctx.reg_alloc.DefineValue(code, inst, result);
}
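
The signed byte variant uses the same decomposition, but each lane needs an arithmetic shift; the extra carry register and the final xor re-insert the per-lane sign bits that the masked logical shift discarded. One lane of the intended result (illustrative; assumes the usual arithmetic right shift of negative values, guaranteed since C++20):

#include <cstdint>

// One lane of SHADD8: (x + y) >> 1 without intermediate overflow,
// via x + y == 2*(x & y) + (x ^ y).
inline std::int8_t HalvingAddS8Lane(std::int8_t x, std::int8_t y) {
    const std::int8_t and_xy = x & y;
    const std::int8_t xor_xy = x ^ y;
    return static_cast<std::int8_t>(and_xy + (xor_xy >> 1));  // arithmetic shift
}
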
void EmitX64::EmitPackedHalvingAddS16(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
// This relies on the equality x+y == ((x&y) << 1) + (x^y).
// Note that x^y always contains the LSB of the result.
@@ -373,14 +376,14 @@ void EmitX64::EmitPackedHalvingAddS16(EmitContext& ctx, IR::Inst* inst) {
code.psraw(tmp, 1);
code.paddw(xmm_a, tmp);
ctx.reg_alloc.DefineValue(inst, xmm_a);
ctx.reg_alloc.DefineValue(code, inst, xmm_a);
}
void EmitX64::EmitPackedHalvingSubU8(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Reg32 minuend = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
const Xbyak::Reg32 subtrahend = ctx.reg_alloc.UseScratchGpr(args[1]).cvt32();
const Xbyak::Reg32 minuend = ctx.reg_alloc.UseScratchGpr(code, args[0]).cvt32();
const Xbyak::Reg32 subtrahend = ctx.reg_alloc.UseScratchGpr(code, args[1]).cvt32();
// This relies on the equality x-y == (x^y) - (((x^y)&y) << 1).
// Note that x^y always contains the LSB of the result.
@@ -403,16 +406,16 @@ void EmitX64::EmitPackedHalvingSubU8(EmitContext& ctx, IR::Inst* inst) {
code.xor_(minuend, 0x80808080);
// minuend now contains the desired result.
ctx.reg_alloc.DefineValue(inst, minuend);
ctx.reg_alloc.DefineValue(code, inst, minuend);
}
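
Subtraction gets the dual identity: x - y == (x ^ y) - (((x ^ y) & y) << 1), so (x - y) >> 1 == ((x ^ y) >> 1) - ((x ^ y) & y), and the trailing xor with 0x80808080 undoes the bias that kept the per-lane arithmetic unsigned. One lane (illustrative):

#include <cstdint>

// One lane of UHSUB8-style halving subtract; wraps modulo 256 per lane,
// matching the packed hardware behaviour.
inline std::uint8_t HalvingSubU8Lane(std::uint8_t x, std::uint8_t y) {
    const std::uint8_t d = x ^ y;
    return static_cast<std::uint8_t>((d >> 1) - (d & y));
}
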
void EmitX64::EmitPackedHalvingSubS8(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Reg32 minuend = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
const Xbyak::Reg32 subtrahend = ctx.reg_alloc.UseScratchGpr(args[1]).cvt32();
const Xbyak::Reg32 minuend = ctx.reg_alloc.UseScratchGpr(code, args[0]).cvt32();
const Xbyak::Reg32 subtrahend = ctx.reg_alloc.UseScratchGpr(code, args[1]).cvt32();
const Xbyak::Reg32 carry = ctx.reg_alloc.ScratchGpr().cvt32();
const Xbyak::Reg32 carry = ctx.reg_alloc.ScratchGpr(code).cvt32();
// This relies on the equality x-y == (x^y) - (((x^y)&y) << 1).
// Note that x^y always contains the LSB of the result.
@@ -439,14 +442,14 @@ void EmitX64::EmitPackedHalvingSubS8(EmitContext& ctx, IR::Inst* inst) {
code.xor_(minuend, 0x80808080);
code.xor_(minuend, carry);
ctx.reg_alloc.DefineValue(inst, minuend);
ctx.reg_alloc.DefineValue(code, inst, minuend);
}
void EmitX64::EmitPackedHalvingSubU16(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm minuend = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm subtrahend = ctx.reg_alloc.UseScratchXmm(args[1]);
const Xbyak::Xmm minuend = ctx.reg_alloc.UseScratchXmm(code, args[0]);
const Xbyak::Xmm subtrahend = ctx.reg_alloc.UseScratchXmm(code, args[1]);
// This relies on the equality x-y == (x^y) - (((x^y)&y) << 1).
// Note that x^y always contains the LSB of the result.
@@ -462,14 +465,14 @@ void EmitX64::EmitPackedHalvingSubU16(EmitContext& ctx, IR::Inst* inst) {
code.psubw(minuend, subtrahend);
ctx.reg_alloc.DefineValue(inst, minuend);
ctx.reg_alloc.DefineValue(code, inst, minuend);
}
void EmitX64::EmitPackedHalvingSubS16(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm minuend = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm subtrahend = ctx.reg_alloc.UseScratchXmm(args[1]);
const Xbyak::Xmm minuend = ctx.reg_alloc.UseScratchXmm(code, args[0]);
const Xbyak::Xmm subtrahend = ctx.reg_alloc.UseScratchXmm(code, args[1]);
// This relies on the equality x-y == (x^y) - (((x^y)&y) << 1).
// Note that x^y always contains the LSB of the result.
@@ -485,17 +488,17 @@ void EmitX64::EmitPackedHalvingSubS16(EmitContext& ctx, IR::Inst* inst) {
code.psubw(minuend, subtrahend);
ctx.reg_alloc.DefineValue(inst, minuend);
ctx.reg_alloc.DefineValue(code, inst, minuend);
}
static void EmitPackedSubAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, bool hi_is_sum, bool is_signed, bool is_halving) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp);
const Xbyak::Reg32 reg_a_hi = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
const Xbyak::Reg32 reg_b_hi = ctx.reg_alloc.UseScratchGpr(args[1]).cvt32();
const Xbyak::Reg32 reg_a_lo = ctx.reg_alloc.ScratchGpr().cvt32();
const Xbyak::Reg32 reg_b_lo = ctx.reg_alloc.ScratchGpr().cvt32();
const Xbyak::Reg32 reg_a_hi = ctx.reg_alloc.UseScratchGpr(code, args[0]).cvt32();
const Xbyak::Reg32 reg_b_hi = ctx.reg_alloc.UseScratchGpr(code, args[1]).cvt32();
const Xbyak::Reg32 reg_a_lo = ctx.reg_alloc.ScratchGpr(code).cvt32();
const Xbyak::Reg32 reg_b_lo = ctx.reg_alloc.ScratchGpr(code).cvt32();
Xbyak::Reg32 reg_sum, reg_diff;
if (is_signed) {
@@ -543,7 +546,7 @@ static void EmitPackedSubAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst
code.and_(ge_diff, hi_is_sum ? 0x0000FFFF : 0xFFFF0000);
code.or_(ge_sum, ge_diff);
ctx.reg_alloc.DefineValue(ge_inst, ge_sum);
ctx.reg_alloc.DefineValue(code, ge_inst, ge_sum);
}
if (is_halving) {
@@ -557,7 +560,7 @@ static void EmitPackedSubAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst
// Merge them.
code.shld(reg_a_hi, reg_a_lo, 16);
ctx.reg_alloc.DefineValue(inst, reg_a_hi);
ctx.reg_alloc.DefineValue(code, inst, reg_a_hi);
}
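
EmitPackedSubAdd covers the ARM ASX/SAX family: one 16-bit half receives a sum and the other a difference of the crossed operands, optionally halved, with GE bits produced for the non-halving forms. A compact scalar model of the non-halving unsigned case (illustrative; hi_is_sum == true is read here as the UASX-style layout with the sum in the high half):

#include <cstdint>

inline std::uint32_t PackedAddSubU16(std::uint32_t a, std::uint32_t b, bool hi_is_sum) {
    const auto a_lo = static_cast<std::uint16_t>(a), a_hi = static_cast<std::uint16_t>(a >> 16);
    const auto b_lo = static_cast<std::uint16_t>(b), b_hi = static_cast<std::uint16_t>(b >> 16);
    const auto sum  = static_cast<std::uint16_t>(hi_is_sum ? a_hi + b_lo : a_lo + b_hi);
    const auto diff = static_cast<std::uint16_t>(hi_is_sum ? a_lo - b_hi : a_hi - b_lo);
    return hi_is_sum ? (std::uint32_t{sum} << 16) | diff
                     : (std::uint32_t{diff} << 16) | sum;
}
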
void EmitX64::EmitPackedAddSubU16(EmitContext& ctx, IR::Inst* inst) {
@@ -595,12 +598,12 @@ void EmitX64::EmitPackedHalvingSubAddS16(EmitContext& ctx, IR::Inst* inst) {
static void EmitPackedOperation(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, void (Xbyak::CodeGenerator::*fn)(const Xbyak::Mmx& mmx, const Xbyak::Operand&)) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
(code.*fn)(xmm_a, xmm_b);
ctx.reg_alloc.DefineValue(inst, xmm_a);
ctx.reg_alloc.DefineValue(code, inst, xmm_a);
}
void EmitX64::EmitPackedSaturatedAddU8(EmitContext& ctx, IR::Inst* inst) {
@@ -638,9 +641,9 @@ void EmitX64::EmitPackedSaturatedSubS16(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitPackedAbsDiffSumU8(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseScratchXmm(args[1]);
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
// TODO: Optimize with zero-extension detection
code.movaps(tmp, code.Const(xword, 0x0000'0000'ffff'ffff));
@@ -648,45 +651,45 @@ void EmitX64::EmitPackedAbsDiffSumU8(EmitContext& ctx, IR::Inst* inst) {
code.pand(xmm_b, tmp);
code.psadbw(xmm_a, xmm_b);
ctx.reg_alloc.DefineValue(inst, xmm_a);
ctx.reg_alloc.DefineValue(code, inst, xmm_a);
}
void EmitX64::EmitPackedSelect(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const size_t num_args_in_xmm = args[0].IsInXmm() + args[1].IsInXmm() + args[2].IsInXmm();
const size_t num_args_in_xmm = args[0].IsInXmm(ctx.reg_alloc) + args[1].IsInXmm(ctx.reg_alloc) + args[2].IsInXmm(ctx.reg_alloc);
if (num_args_in_xmm >= 2) {
const Xbyak::Xmm ge = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm to = ctx.reg_alloc.UseXmm(args[1]);
const Xbyak::Xmm from = ctx.reg_alloc.UseScratchXmm(args[2]);
const Xbyak::Xmm ge = ctx.reg_alloc.UseScratchXmm(code, args[0]);
const Xbyak::Xmm to = ctx.reg_alloc.UseXmm(code, args[1]);
const Xbyak::Xmm from = ctx.reg_alloc.UseScratchXmm(code, args[2]);
code.pand(from, ge);
code.pandn(ge, to);
code.por(from, ge);
ctx.reg_alloc.DefineValue(inst, from);
ctx.reg_alloc.DefineValue(code, inst, from);
} else if (code.HasHostFeature(HostFeature::BMI1)) {
const Xbyak::Reg32 ge = ctx.reg_alloc.UseGpr(args[0]).cvt32();
const Xbyak::Reg32 to = ctx.reg_alloc.UseScratchGpr(args[1]).cvt32();
const Xbyak::Reg32 from = ctx.reg_alloc.UseScratchGpr(args[2]).cvt32();
const Xbyak::Reg32 ge = ctx.reg_alloc.UseGpr(code, args[0]).cvt32();
const Xbyak::Reg32 to = ctx.reg_alloc.UseScratchGpr(code, args[1]).cvt32();
const Xbyak::Reg32 from = ctx.reg_alloc.UseScratchGpr(code, args[2]).cvt32();
code.and_(from, ge);
code.andn(to, ge, to);
code.or_(from, to);
ctx.reg_alloc.DefineValue(inst, from);
ctx.reg_alloc.DefineValue(code, inst, from);
} else {
const Xbyak::Reg32 ge = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
const Xbyak::Reg32 to = ctx.reg_alloc.UseGpr(args[1]).cvt32();
const Xbyak::Reg32 from = ctx.reg_alloc.UseScratchGpr(args[2]).cvt32();
const Xbyak::Reg32 ge = ctx.reg_alloc.UseScratchGpr(code, args[0]).cvt32();
const Xbyak::Reg32 to = ctx.reg_alloc.UseGpr(code, args[1]).cvt32();
const Xbyak::Reg32 from = ctx.reg_alloc.UseScratchGpr(code, args[2]).cvt32();
code.and_(from, ge);
code.not_(ge);
code.and_(ge, to);
code.or_(from, ge);
ctx.reg_alloc.DefineValue(inst, from);
ctx.reg_alloc.DefineValue(code, inst, from);
}
}
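
All three paths above compute the same bitwise select that the ARM SEL instruction specifies; only the instruction mix changes with the register class and the availability of BMI1's andn. Scalar equivalent (illustrative):

#include <cstdint>

// SEL: take bits of `from` where the GE mask is set, bits of `to` elsewhere.
inline std::uint32_t PackedSelect(std::uint32_t ge, std::uint32_t to, std::uint32_t from) {
    return (from & ge) | (to & ~ge);  // pand/pandn/por, andn, or and_/not_/and_/or_
}
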


@@ -34,9 +34,9 @@ template<Op op, size_t size, bool has_overflow_inst = false>
void EmitSignedSaturatedOp(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
Xbyak::Reg result = ctx.reg_alloc.UseScratchGpr(args[0]).changeBit(size);
Xbyak::Reg addend = ctx.reg_alloc.UseGpr(args[1]).changeBit(size);
Xbyak::Reg overflow = ctx.reg_alloc.ScratchGpr().changeBit(size);
Xbyak::Reg result = ctx.reg_alloc.UseScratchGpr(code, args[0]).changeBit(size);
Xbyak::Reg addend = ctx.reg_alloc.UseGpr(code, args[1]).changeBit(size);
Xbyak::Reg overflow = ctx.reg_alloc.ScratchGpr(code).changeBit(size);
constexpr u64 int_max = static_cast<u64>((std::numeric_limits<mcl::signed_integer_of_size<size>>::max)());
if constexpr (size < 64) {
@@ -66,21 +66,21 @@ void EmitSignedSaturatedOp(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst)
code.seto(overflow.cvt8());
if constexpr (has_overflow_inst) {
if (const auto overflow_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetOverflowFromOp)) {
ctx.reg_alloc.DefineValue(overflow_inst, overflow);
ctx.reg_alloc.DefineValue(code, overflow_inst, overflow);
}
} else {
code.or_(code.byte[code.ABI_JIT_PTR + code.GetJitStateInfo().offsetof_fpsr_qc], overflow.cvt8());
}
ctx.reg_alloc.DefineValue(inst, result);
ctx.reg_alloc.DefineValue(code, inst, result);
}
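
The template collapses every operand width to one pattern: perform the operation, detect signed overflow (seto), and on overflow replace the result with the saturation boundary on the overflowing side; when no GetOverflowFromOp pseudo-op consumes the flag, it is OR-ed into the sticky FPSR.QC bit instead. A 32-bit scalar sketch of the add case (illustrative; uses the GCC/Clang __builtin_add_overflow intrinsic):

#include <cstdint>

inline std::int32_t SignedSaturatedAdd32(std::int32_t a, std::int32_t b, bool& overflow) {
    std::int32_t result;
    overflow = __builtin_add_overflow(a, b, &result);  // seto analogue
    if (overflow)
        result = a < 0 ? INT32_MIN : INT32_MAX;        // boundary on the overflowing side
    return result;
}
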
template<Op op, size_t size>
void EmitUnsignedSaturatedOp(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
Xbyak::Reg op_result = ctx.reg_alloc.UseScratchGpr(args[0]).changeBit(size);
Xbyak::Reg addend = ctx.reg_alloc.UseScratchGpr(args[1]).changeBit(size);
Xbyak::Reg op_result = ctx.reg_alloc.UseScratchGpr(code, args[0]).changeBit(size);
Xbyak::Reg addend = ctx.reg_alloc.UseScratchGpr(code, args[1]).changeBit(size);
constexpr u64 boundary = op == Op::Add ? (std::numeric_limits<mcl::unsigned_integer_of_size<size>>::max)() : 0;
@@ -96,11 +96,11 @@ void EmitUnsignedSaturatedOp(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst
code.cmovae(addend, op_result);
}
const Xbyak::Reg overflow = ctx.reg_alloc.ScratchGpr();
const Xbyak::Reg overflow = ctx.reg_alloc.ScratchGpr(code);
code.setb(overflow.cvt8());
code.or_(code.byte[code.ABI_JIT_PTR + code.GetJitStateInfo().offsetof_fpsr_qc], overflow.cvt8());
ctx.reg_alloc.DefineValue(inst, addend);
ctx.reg_alloc.DefineValue(code, inst, addend);
}
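
The unsigned version needs no sign analysis: after the add or sub, the carry/borrow flag alone decides whether to clamp to the type's maximum or to zero, which the cmov + setb pair does branchlessly before folding the flag into FPSR.QC. Scalar sketch of the add case (illustrative):

#include <cstdint>

inline std::uint32_t UnsignedSaturatedAdd32(std::uint32_t a, std::uint32_t b, bool& q) {
    const std::uint32_t sum = a + b;
    const bool carry = sum < a;       // setb analogue
    q = q || carry;                   // sticky FPSR.QC
    return carry ? UINT32_MAX : sum;  // cmov against the boundary
}
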
} // anonymous namespace
@@ -126,10 +126,10 @@ void EmitX64::EmitSignedSaturation(EmitContext& ctx, IR::Inst* inst) {
overflow_inst->ReplaceUsesWith(no_overflow);
}
// TODO: DefineValue directly on Argument
const Xbyak::Reg64 result = ctx.reg_alloc.ScratchGpr();
const Xbyak::Reg64 source = ctx.reg_alloc.UseGpr(args[0]);
const Xbyak::Reg64 result = ctx.reg_alloc.ScratchGpr(code);
const Xbyak::Reg64 source = ctx.reg_alloc.UseGpr(code, args[0]);
code.mov(result.cvt32(), source.cvt32());
ctx.reg_alloc.DefineValue(inst, result);
ctx.reg_alloc.DefineValue(code, inst, result);
return;
}
@@ -137,9 +137,9 @@ void EmitX64::EmitSignedSaturation(EmitContext& ctx, IR::Inst* inst) {
const u32 positive_saturated_value = (1u << (N - 1)) - 1;
const u32 negative_saturated_value = 1u << (N - 1);
const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32();
const Xbyak::Reg32 reg_a = ctx.reg_alloc.UseGpr(args[0]).cvt32();
const Xbyak::Reg32 overflow = ctx.reg_alloc.ScratchGpr().cvt32();
const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr(code).cvt32();
const Xbyak::Reg32 reg_a = ctx.reg_alloc.UseGpr(code, args[0]).cvt32();
const Xbyak::Reg32 overflow = ctx.reg_alloc.ScratchGpr(code).cvt32();
// overflow now contains a value between 0 and mask if it was originally between {negative,positive}_saturated_value.
code.lea(overflow, code.ptr[reg_a.cvt64() + negative_saturated_value]);
@@ -156,10 +156,10 @@ void EmitX64::EmitSignedSaturation(EmitContext& ctx, IR::Inst* inst) {
if (overflow_inst) {
code.seta(overflow.cvt8());
ctx.reg_alloc.DefineValue(overflow_inst, overflow);
ctx.reg_alloc.DefineValue(code, overflow_inst, overflow);
}
ctx.reg_alloc.DefineValue(inst, result);
ctx.reg_alloc.DefineValue(code, inst, result);
}
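
The range check behind the lea is the usual shifted-interval trick: x fits in N signed bits iff x + 2^(N-1) fits in N unsigned bits, so one unsigned compare against the mask classifies the value, and cmovs pick either x or the boundary matching its sign. A sketch for 1 <= N < 32 (illustrative):

#include <cstdint>

inline std::int32_t SignedSaturateToN(std::int32_t x, unsigned N) {
    const std::uint32_t mask = (1u << N) - 1;
    const std::uint32_t shifted = static_cast<std::uint32_t>(x) + (1u << (N - 1));  // the lea
    if (shifted <= mask)
        return x;  // in range, no saturation
    const auto pos = static_cast<std::int32_t>((1u << (N - 1)) - 1);
    const auto neg = -static_cast<std::int32_t>(1u << (N - 1));
    return x < 0 ? neg : pos;
}
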
void EmitX64::EmitUnsignedSaturation(EmitContext& ctx, IR::Inst* inst) {
@@ -171,9 +171,9 @@ void EmitX64::EmitUnsignedSaturation(EmitContext& ctx, IR::Inst* inst) {
const u32 saturated_value = (1u << N) - 1;
const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32();
const Xbyak::Reg32 reg_a = ctx.reg_alloc.UseGpr(args[0]).cvt32();
const Xbyak::Reg32 overflow = ctx.reg_alloc.ScratchGpr().cvt32();
const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr(code).cvt32();
const Xbyak::Reg32 reg_a = ctx.reg_alloc.UseGpr(code, args[0]).cvt32();
const Xbyak::Reg32 overflow = ctx.reg_alloc.ScratchGpr(code).cvt32();
// Pseudocode: result = clamp(reg_a, 0, saturated_value);
code.xor_(overflow, overflow);
@@ -185,10 +185,10 @@ void EmitX64::EmitUnsignedSaturation(EmitContext& ctx, IR::Inst* inst) {
if (overflow_inst) {
code.seta(overflow.cvt8());
ctx.reg_alloc.DefineValue(overflow_inst, overflow);
ctx.reg_alloc.DefineValue(code, overflow_inst, overflow);
}
ctx.reg_alloc.DefineValue(inst, result);
ctx.reg_alloc.DefineValue(code, inst, result);
}
void EmitX64::EmitSignedSaturatedAdd8(EmitContext& ctx, IR::Inst* inst) {
@@ -210,9 +210,9 @@ void EmitX64::EmitSignedSaturatedAdd64(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitSignedSaturatedDoublingMultiplyReturnHigh16(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Reg32 x = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
const Xbyak::Reg32 y = ctx.reg_alloc.UseScratchGpr(args[1]).cvt32();
const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr().cvt32();
const Xbyak::Reg32 x = ctx.reg_alloc.UseScratchGpr(code, args[0]).cvt32();
const Xbyak::Reg32 y = ctx.reg_alloc.UseScratchGpr(code, args[1]).cvt32();
const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr(code).cvt32();
code.movsx(x, x.cvt16());
code.movsx(y, y.cvt16());
@@ -228,15 +228,15 @@ void EmitX64::EmitSignedSaturatedDoublingMultiplyReturnHigh16(EmitContext& ctx,
code.sets(tmp.cvt8());
code.or_(code.byte[code.ABI_JIT_PTR + code.GetJitStateInfo().offsetof_fpsr_qc], tmp.cvt8());
ctx.reg_alloc.DefineValue(inst, y);
ctx.reg_alloc.DefineValue(code, inst, y);
}
void EmitX64::EmitSignedSaturatedDoublingMultiplyReturnHigh32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Reg64 x = ctx.reg_alloc.UseScratchGpr(args[0]);
const Xbyak::Reg64 y = ctx.reg_alloc.UseScratchGpr(args[1]);
const Xbyak::Reg64 tmp = ctx.reg_alloc.ScratchGpr();
const Xbyak::Reg64 x = ctx.reg_alloc.UseScratchGpr(code, args[0]);
const Xbyak::Reg64 y = ctx.reg_alloc.UseScratchGpr(code, args[1]);
const Xbyak::Reg64 tmp = ctx.reg_alloc.ScratchGpr(code);
code.movsxd(x, x.cvt32());
code.movsxd(y, y.cvt32());
@@ -252,7 +252,7 @@ void EmitX64::EmitSignedSaturatedDoublingMultiplyReturnHigh32(EmitContext& ctx,
code.sets(tmp.cvt8());
code.or_(code.byte[code.ABI_JIT_PTR + code.GetJitStateInfo().offsetof_fpsr_qc], tmp.cvt8());
ctx.reg_alloc.DefineValue(inst, y);
ctx.reg_alloc.DefineValue(code, inst, y);
}
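
Both ReturnHigh helpers implement SQDMULH: take the high half of the doubled product and set the sticky FPSR.QC bit when the doubling saturates, which can only happen for INT_MIN * INT_MIN. A 16-bit scalar model (illustrative):

#include <cstdint>

// SQDMULH.16: high half of sat(2*x*y), with a sticky Q flag.
inline std::int16_t DoublingMulHighS16(std::int16_t x, std::int16_t y, bool& q) {
    const std::int64_t product2 = 2 * std::int64_t{x} * std::int64_t{y};
    if (product2 == (std::int64_t{1} << 31)) {  // only INT16_MIN * INT16_MIN lands here
        q = true;
        return INT16_MAX;
    }
    return static_cast<std::int16_t>(product2 >> 16);
}
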
void EmitX64::EmitSignedSaturatedSub8(EmitContext& ctx, IR::Inst* inst) {


@@ -1,3 +1,6 @@
// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
// SPDX-License-Identifier: GPL-3.0-or-later
/* This file is part of the dynarmic project.
* Copyright (c) 2022 MerryMage
* SPDX-License-Identifier: 0BSD
@@ -22,9 +25,9 @@ void EmitX64::EmitSHA256Hash(EmitContext& ctx, IR::Inst* inst) {
// y = h g f e
// w = wk3 wk2 wk1 wk0
const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(args[1]);
const Xbyak::Xmm w = ctx.reg_alloc.UseXmm(args[2]);
const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
const Xbyak::Xmm w = ctx.reg_alloc.UseXmm(code, args[2]);
// x64 expects:
// 3 2 1 0
@@ -45,7 +48,7 @@ void EmitX64::EmitSHA256Hash(EmitContext& ctx, IR::Inst* inst) {
code.shufps(y, x, part1 ? 0b10111011 : 0b00010001);
ctx.reg_alloc.DefineValue(inst, y);
ctx.reg_alloc.DefineValue(code, inst, y);
}
void EmitX64::EmitSHA256MessageSchedule0(EmitContext& ctx, IR::Inst* inst) {
@@ -53,12 +56,12 @@ void EmitX64::EmitSHA256MessageSchedule0(EmitContext& ctx, IR::Inst* inst) {
ASSERT(code.HasHostFeature(HostFeature::SHA));
const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(args[1]);
const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(code, args[1]);
code.sha256msg1(x, y);
ctx.reg_alloc.DefineValue(inst, x);
ctx.reg_alloc.DefineValue(code, inst, x);
}
void EmitX64::EmitSHA256MessageSchedule1(EmitContext& ctx, IR::Inst* inst) {
@@ -66,16 +69,16 @@ void EmitX64::EmitSHA256MessageSchedule1(EmitContext& ctx, IR::Inst* inst) {
ASSERT(code.HasHostFeature(HostFeature::SHA));
const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(args[1]);
const Xbyak::Xmm z = ctx.reg_alloc.UseXmm(args[2]);
const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(code, args[1]);
const Xbyak::Xmm z = ctx.reg_alloc.UseXmm(code, args[2]);
code.movaps(xmm0, z);
code.palignr(xmm0, y, 4);
code.paddd(x, xmm0);
code.sha256msg2(x, z);
ctx.reg_alloc.DefineValue(inst, x);
ctx.reg_alloc.DefineValue(code, inst, x);
}
} // namespace Dynarmic::Backend::X64


@@ -1,3 +1,6 @@
// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
// SPDX-License-Identifier: GPL-3.0-or-later
/* This file is part of the dynarmic project.
* Copyright (c) 2018 MerryMage
* SPDX-License-Identifier: 0BSD
@@ -13,7 +16,7 @@ namespace Dynarmic::Backend::X64 {
void EmitX64::EmitSM4AccessSubstitutionBox(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
ctx.reg_alloc.HostCall(inst, args[0]);
ctx.reg_alloc.HostCall(code, inst, args[0]);
code.CallFunction(&Common::Crypto::SM4::AccessSubstitutionBox);
code.movzx(code.ABI_RETURN.cvt32(), code.ABI_RETURN.cvt8());
}

File diff suppressed because it is too large.


@@ -96,7 +96,7 @@ void HandleNaNs(BlockOfCode& code, EmitContext& ctx, bool fpcr_controlled, std::
if (code.HasHostFeature(HostFeature::SSE41)) {
code.ptest(nan_mask, nan_mask);
} else {
const Xbyak::Reg32 bitmask = ctx.reg_alloc.ScratchGpr().cvt32();
const Xbyak::Reg32 bitmask = ctx.reg_alloc.ScratchGpr(code).cvt32();
code.movmskps(bitmask, nan_mask);
code.cmp(bitmask, 0);
}
@@ -312,13 +312,13 @@ void EmitTwoOpVectorOperation(BlockOfCode& code, EmitContext& ctx, IR::Inst* ins
Xbyak::Xmm result;
if constexpr (std::is_member_function_pointer_v<Function>) {
result = ctx.reg_alloc.UseScratchXmm(args[0]);
result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] {
(code.*fn)(result);
});
} else {
const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseXmm(args[0]);
result = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseXmm(code, args[0]);
result = ctx.reg_alloc.ScratchXmm(code);
MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] {
fn(result, xmm_a);
});
@@ -328,13 +328,13 @@ void EmitTwoOpVectorOperation(BlockOfCode& code, EmitContext& ctx, IR::Inst* ins
ForceToDefaultNaN<fsize>(code, ctx.FPCR(fpcr_controlled), result);
}
ctx.reg_alloc.DefineValue(inst, result);
ctx.reg_alloc.DefineValue(code, inst, result);
return;
}
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseXmm(args[0]);
const Xbyak::Xmm nan_mask = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseXmm(code, args[0]);
const Xbyak::Xmm nan_mask = ctx.reg_alloc.ScratchXmm(code);
if constexpr (std::is_member_function_pointer_v<Function>) {
code.movaps(result, xmm_a);
@@ -352,7 +352,7 @@ void EmitTwoOpVectorOperation(BlockOfCode& code, EmitContext& ctx, IR::Inst* ins
HandleNaNs<fsize, 1>(code, ctx, fpcr_controlled, {result, xmm_a}, nan_mask, nan_handler);
ctx.reg_alloc.DefineValue(inst, result);
ctx.reg_alloc.DefineValue(code, inst, result);
}
enum class CheckInputNaN {
@@ -368,8 +368,8 @@ void EmitThreeOpVectorOperation(BlockOfCode& code, EmitContext& ctx, IR::Inst* i
const bool fpcr_controlled = args[2].GetImmediateU1();
if (ctx.FPCR(fpcr_controlled).DN() || ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) {
const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
if constexpr (std::is_member_function_pointer_v<Function>) {
MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] {
@@ -385,14 +385,14 @@ void EmitThreeOpVectorOperation(BlockOfCode& code, EmitContext& ctx, IR::Inst* i
ForceToDefaultNaN<fsize>(code, ctx.FPCR(fpcr_controlled), xmm_a);
}
ctx.reg_alloc.DefineValue(inst, xmm_a);
ctx.reg_alloc.DefineValue(code, inst, xmm_a);
return;
}
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseXmm(args[0]);
const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
const Xbyak::Xmm nan_mask = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseXmm(code, args[0]);
const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
const Xbyak::Xmm nan_mask = ctx.reg_alloc.ScratchXmm(code);
code.movaps(result, xmm_a);
@@ -422,7 +422,7 @@ void EmitThreeOpVectorOperation(BlockOfCode& code, EmitContext& ctx, IR::Inst* i
HandleNaNs<fsize, 2>(code, ctx, fpcr_controlled, {result, xmm_a, xmm_b}, nan_mask, nan_handler);
ctx.reg_alloc.DefineValue(inst, result);
ctx.reg_alloc.DefineValue(code, inst, result);
}
template<typename F>
@@ -448,16 +448,16 @@ void EmitTwoOpFallbackWithoutRegAlloc(BlockOfCode& code, EmitContext& ctx, Xbyak
template<size_t fpcr_controlled_arg_index = 1, typename F>
void EmitTwoOpFallback(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, F lambda) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm arg1 = ctx.reg_alloc.UseXmm(args[0]);
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm arg1 = ctx.reg_alloc.UseXmm(code, args[0]);
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
ctx.reg_alloc.EndOfAllocScope();
ctx.reg_alloc.HostCall(nullptr);
ctx.reg_alloc.HostCall(code, nullptr);
const bool fpcr_controlled = args[fpcr_controlled_arg_index].GetImmediateU1();
EmitTwoOpFallbackWithoutRegAlloc(code, ctx, result, arg1, lambda, fpcr_controlled);
ctx.reg_alloc.DefineValue(inst, result);
ctx.reg_alloc.DefineValue(code, inst, result);
}
template<typename Lambda>
@@ -501,17 +501,17 @@ void EmitThreeOpFallbackWithoutRegAlloc(BlockOfCode& code, EmitContext& ctx, Xby
template<typename Lambda>
void EmitThreeOpFallback(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Lambda lambda) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm arg1 = ctx.reg_alloc.UseXmm(args[0]);
const Xbyak::Xmm arg2 = ctx.reg_alloc.UseXmm(args[1]);
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm arg1 = ctx.reg_alloc.UseXmm(code, args[0]);
const Xbyak::Xmm arg2 = ctx.reg_alloc.UseXmm(code, args[1]);
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
ctx.reg_alloc.EndOfAllocScope();
ctx.reg_alloc.HostCall(nullptr);
ctx.reg_alloc.HostCall(code, nullptr);
const bool fpcr_controlled = args[2].GetImmediateU1();
EmitThreeOpFallbackWithoutRegAlloc(code, ctx, result, arg1, arg2, lambda, fpcr_controlled);
ctx.reg_alloc.DefineValue(inst, result);
ctx.reg_alloc.DefineValue(code, inst, result);
}
enum class LoadPreviousResult {
@@ -565,16 +565,16 @@ template<typename Lambda>
void EmitFourOpFallback(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Lambda lambda) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const bool fpcr_controlled = args[3].GetImmediateU1();
const Xbyak::Xmm arg1 = ctx.reg_alloc.UseXmm(args[0]);
const Xbyak::Xmm arg2 = ctx.reg_alloc.UseXmm(args[1]);
const Xbyak::Xmm arg3 = ctx.reg_alloc.UseXmm(args[2]);
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm arg1 = ctx.reg_alloc.UseXmm(code, args[0]);
const Xbyak::Xmm arg2 = ctx.reg_alloc.UseXmm(code, args[1]);
const Xbyak::Xmm arg3 = ctx.reg_alloc.UseXmm(code, args[2]);
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
ctx.reg_alloc.EndOfAllocScope();
ctx.reg_alloc.HostCall(nullptr);
ctx.reg_alloc.HostCall(code, nullptr);
EmitFourOpFallbackWithoutRegAlloc(code, ctx, result, arg1, arg2, arg3, lambda, fpcr_controlled);
ctx.reg_alloc.DefineValue(inst, result);
ctx.reg_alloc.DefineValue(code, inst, result);
}
} // anonymous namespace
@@ -582,9 +582,9 @@ void EmitFourOpFallback(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Lam
template<size_t fsize>
void FPVectorAbs(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
code.andps(a, GetNonSignMaskVector<fsize>(code));
ctx.reg_alloc.DefineValue(inst, a);
ctx.reg_alloc.DefineValue(code, inst, a);
}
void EmitX64::EmitFPVectorAbs16(EmitContext& ctx, IR::Inst* inst) {
@@ -626,29 +626,29 @@ void EmitX64::EmitFPVectorEqual16(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitFPVectorEqual32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const bool fpcr_controlled = args[2].GetImmediateU1();
const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm b = ctx.FPCR(fpcr_controlled).FZ() ? ctx.reg_alloc.UseScratchXmm(args[1]) : ctx.reg_alloc.UseXmm(args[1]);
const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
const Xbyak::Xmm b = ctx.FPCR(fpcr_controlled).FZ() ? ctx.reg_alloc.UseScratchXmm(code, args[1]) : ctx.reg_alloc.UseXmm(code, args[1]);
MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] {
DenormalsAreZero<32>(code, ctx.FPCR(fpcr_controlled), {a, b}, xmm0);
code.cmpeqps(a, b);
});
ctx.reg_alloc.DefineValue(inst, a);
ctx.reg_alloc.DefineValue(code, inst, a);
}
void EmitX64::EmitFPVectorEqual64(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const bool fpcr_controlled = args[2].GetImmediateU1();
const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm b = ctx.FPCR(fpcr_controlled).FZ() ? ctx.reg_alloc.UseScratchXmm(args[1]) : ctx.reg_alloc.UseXmm(args[1]);
const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
const Xbyak::Xmm b = ctx.FPCR(fpcr_controlled).FZ() ? ctx.reg_alloc.UseScratchXmm(code, args[1]) : ctx.reg_alloc.UseXmm(code, args[1]);
MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] {
DenormalsAreZero<64>(code, ctx.FPCR(fpcr_controlled), {a, b}, xmm0);
code.cmpeqpd(a, b);
});
ctx.reg_alloc.DefineValue(inst, a);
ctx.reg_alloc.DefineValue(code, inst, a);
}
template<FP::RoundingMode rounding_mode>
@@ -664,13 +664,13 @@ void EmitX64::EmitFPVectorFromHalf32(EmitContext& ctx, IR::Inst* inst) {
if (code.HasHostFeature(HostFeature::F16C) && !ctx.FPCR().AHP() && !ctx.FPCR().FZ16()) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm value = ctx.reg_alloc.UseXmm(args[0]);
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
const Xbyak::Xmm value = ctx.reg_alloc.UseXmm(code, args[0]);
code.vcvtph2ps(result, value);
ForceToDefaultNaN<32>(code, ctx.FPCR(fpcr_controlled), result);
ctx.reg_alloc.DefineValue(inst, result);
ctx.reg_alloc.DefineValue(code, inst, result);
} else {
switch (rounding_mode) {
case FP::RoundingMode::ToNearest_TieEven:
@@ -696,7 +696,7 @@ void EmitX64::EmitFPVectorFromHalf32(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitFPVectorFromSignedFixed32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm xmm = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm xmm = ctx.reg_alloc.UseScratchXmm(code, args[0]);
const int fbits = args[1].GetImmediateU8();
const FP::RoundingMode rounding_mode = static_cast<FP::RoundingMode>(args[2].GetImmediateU8());
const bool fpcr_controlled = args[3].GetImmediateU1();
@@ -709,12 +709,12 @@ void EmitX64::EmitFPVectorFromSignedFixed32(EmitContext& ctx, IR::Inst* inst) {
}
});
ctx.reg_alloc.DefineValue(inst, xmm);
ctx.reg_alloc.DefineValue(code, inst, xmm);
}
void EmitX64::EmitFPVectorFromSignedFixed64(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm xmm = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm xmm = ctx.reg_alloc.UseScratchXmm(code, args[0]);
const int fbits = args[1].GetImmediateU8();
const FP::RoundingMode rounding_mode = static_cast<FP::RoundingMode>(args[2].GetImmediateU8());
const bool fpcr_controlled = args[3].GetImmediateU1();
@@ -724,8 +724,8 @@ void EmitX64::EmitFPVectorFromSignedFixed64(EmitContext& ctx, IR::Inst* inst) {
if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) {
code.vcvtqq2pd(xmm, xmm);
} else if (code.HasHostFeature(HostFeature::SSE41)) {
const Xbyak::Xmm xmm_tmp = ctx.reg_alloc.ScratchXmm();
const Xbyak::Reg64 tmp = ctx.reg_alloc.ScratchGpr();
const Xbyak::Xmm xmm_tmp = ctx.reg_alloc.ScratchXmm(code);
const Xbyak::Reg64 tmp = ctx.reg_alloc.ScratchGpr(code);
// First quadword
code.movq(tmp, xmm);
@@ -738,9 +738,9 @@ void EmitX64::EmitFPVectorFromSignedFixed64(EmitContext& ctx, IR::Inst* inst) {
// Combine
code.unpcklpd(xmm, xmm_tmp);
} else {
const Xbyak::Xmm high_xmm = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm xmm_tmp = ctx.reg_alloc.ScratchXmm();
const Xbyak::Reg64 tmp = ctx.reg_alloc.ScratchGpr();
const Xbyak::Xmm high_xmm = ctx.reg_alloc.ScratchXmm(code);
const Xbyak::Xmm xmm_tmp = ctx.reg_alloc.ScratchXmm(code);
const Xbyak::Reg64 tmp = ctx.reg_alloc.ScratchGpr(code);
// First quadword
code.movhlps(high_xmm, xmm);
@@ -760,12 +760,12 @@ void EmitX64::EmitFPVectorFromSignedFixed64(EmitContext& ctx, IR::Inst* inst) {
}
});
ctx.reg_alloc.DefineValue(inst, xmm);
ctx.reg_alloc.DefineValue(code, inst, xmm);
}
void EmitX64::EmitFPVectorFromUnsignedFixed32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm xmm = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm xmm = ctx.reg_alloc.UseScratchXmm(code, args[0]);
const int fbits = args[1].GetImmediateU8();
const FP::RoundingMode rounding_mode = static_cast<FP::RoundingMode>(args[2].GetImmediateU8());
const bool fpcr_controlled = args[3].GetImmediateU1();
@@ -779,7 +779,7 @@ void EmitX64::EmitFPVectorFromUnsignedFixed32(EmitContext& ctx, IR::Inst* inst)
const Xbyak::Address mem_53000000 = code.BConst<32>(xword, 0x53000000);
const Xbyak::Address mem_D3000080 = code.BConst<32>(xword, 0xD3000080);
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
if (code.HasHostFeature(HostFeature::AVX)) {
code.vpblendw(tmp, xmm, mem_4B000000, 0b10101010);
@@ -810,12 +810,12 @@ void EmitX64::EmitFPVectorFromUnsignedFixed32(EmitContext& ctx, IR::Inst* inst)
}
});
ctx.reg_alloc.DefineValue(inst, xmm);
ctx.reg_alloc.DefineValue(code, inst, xmm);
}
void EmitX64::EmitFPVectorFromUnsignedFixed64(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm xmm = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm xmm = ctx.reg_alloc.UseScratchXmm(code, args[0]);
const int fbits = args[1].GetImmediateU8();
const FP::RoundingMode rounding_mode = static_cast<FP::RoundingMode>(args[2].GetImmediateU8());
const bool fpcr_controlled = args[3].GetImmediateU1();
@@ -828,9 +828,9 @@ void EmitX64::EmitFPVectorFromUnsignedFixed64(EmitContext& ctx, IR::Inst* inst)
const Xbyak::Address unpack = code.Const(xword, 0x4530000043300000, 0);
const Xbyak::Address subtrahend = code.Const(xword, 0x4330000000000000, 0x4530000000000000);
const Xbyak::Xmm unpack_reg = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm subtrahend_reg = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm tmp1 = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm unpack_reg = ctx.reg_alloc.ScratchXmm(code);
const Xbyak::Xmm subtrahend_reg = ctx.reg_alloc.ScratchXmm(code);
const Xbyak::Xmm tmp1 = ctx.reg_alloc.ScratchXmm(code);
if (code.HasHostFeature(HostFeature::AVX)) {
code.vmovapd(unpack_reg, unpack);
@@ -846,7 +846,7 @@ void EmitX64::EmitFPVectorFromUnsignedFixed64(EmitContext& ctx, IR::Inst* inst)
code.vhaddpd(xmm, tmp1, xmm);
} else {
const Xbyak::Xmm tmp2 = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm tmp2 = ctx.reg_alloc.ScratchXmm(code);
code.movapd(unpack_reg, unpack);
code.movapd(subtrahend_reg, subtrahend);
@@ -877,63 +877,63 @@ void EmitX64::EmitFPVectorFromUnsignedFixed64(EmitContext& ctx, IR::Inst* inst)
}
});
ctx.reg_alloc.DefineValue(inst, xmm);
ctx.reg_alloc.DefineValue(code, inst, xmm);
}
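
The unpack/subtrahend constants implement the classic bit-trick for u64 -> f64 on hardware without a native conversion: each 32-bit half is spliced into a double under a magic exponent (0x43300000... encodes 2^52, 0x45300000... encodes 2^84), subtracting the magic values recovers both halves as exact doubles, and one final add rounds once. A scalar sketch (illustrative; assumes IEEE-754 doubles):

#include <cstdint>
#include <cstring>

inline double U64ToF64(std::uint64_t v) {
    std::uint64_t lo_bits = 0x4330000000000000ull | (v & 0xFFFFFFFFull);  // 2^52 + lo32
    std::uint64_t hi_bits = 0x4530000000000000ull | (v >> 32);            // 2^84 + hi32 * 2^32
    double lo, hi;
    std::memcpy(&lo, &lo_bits, sizeof lo);
    std::memcpy(&hi, &hi_bits, sizeof hi);
    lo -= 4503599627370496.0;            // 2^52, as the packed subtrahend removes
    hi -= 19342813113834066795298816.0;  // 2^84
    return hi + lo;                      // single rounding
}
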
void EmitX64::EmitFPVectorGreater32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const bool fpcr_controlled = args[2].GetImmediateU1();
const Xbyak::Xmm a = ctx.FPCR(fpcr_controlled).FZ() ? ctx.reg_alloc.UseScratchXmm(args[0]) : ctx.reg_alloc.UseXmm(args[0]);
const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(args[1]);
const Xbyak::Xmm a = ctx.FPCR(fpcr_controlled).FZ() ? ctx.reg_alloc.UseScratchXmm(code, args[0]) : ctx.reg_alloc.UseXmm(code, args[0]);
const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] {
DenormalsAreZero<32>(code, ctx.FPCR(fpcr_controlled), {a, b}, xmm0);
code.cmpltps(b, a);
});
ctx.reg_alloc.DefineValue(inst, b);
ctx.reg_alloc.DefineValue(code, inst, b);
}
void EmitX64::EmitFPVectorGreater64(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const bool fpcr_controlled = args[2].GetImmediateU1();
const Xbyak::Xmm a = ctx.FPCR(fpcr_controlled).FZ() ? ctx.reg_alloc.UseScratchXmm(args[0]) : ctx.reg_alloc.UseXmm(args[0]);
const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(args[1]);
const Xbyak::Xmm a = ctx.FPCR(fpcr_controlled).FZ() ? ctx.reg_alloc.UseScratchXmm(code, args[0]) : ctx.reg_alloc.UseXmm(code, args[0]);
const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] {
DenormalsAreZero<64>(code, ctx.FPCR(fpcr_controlled), {a, b}, xmm0);
code.cmpltpd(b, a);
});
ctx.reg_alloc.DefineValue(inst, b);
ctx.reg_alloc.DefineValue(code, inst, b);
}
void EmitX64::EmitFPVectorGreaterEqual32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const bool fpcr_controlled = args[2].GetImmediateU1();
const Xbyak::Xmm a = ctx.FPCR(fpcr_controlled).FZ() ? ctx.reg_alloc.UseScratchXmm(args[0]) : ctx.reg_alloc.UseXmm(args[0]);
const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(args[1]);
const Xbyak::Xmm a = ctx.FPCR(fpcr_controlled).FZ() ? ctx.reg_alloc.UseScratchXmm(code, args[0]) : ctx.reg_alloc.UseXmm(code, args[0]);
const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] {
DenormalsAreZero<32>(code, ctx.FPCR(fpcr_controlled), {a, b}, xmm0);
code.cmpleps(b, a);
});
ctx.reg_alloc.DefineValue(inst, b);
ctx.reg_alloc.DefineValue(code, inst, b);
}
void EmitX64::EmitFPVectorGreaterEqual64(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const bool fpcr_controlled = args[2].GetImmediateU1();
const Xbyak::Xmm a = ctx.FPCR(fpcr_controlled).FZ() ? ctx.reg_alloc.UseScratchXmm(args[0]) : ctx.reg_alloc.UseXmm(args[0]);
const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(args[1]);
const Xbyak::Xmm a = ctx.FPCR(fpcr_controlled).FZ() ? ctx.reg_alloc.UseScratchXmm(code, args[0]) : ctx.reg_alloc.UseXmm(code, args[0]);
const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] {
DenormalsAreZero<64>(code, ctx.FPCR(fpcr_controlled), {a, b}, xmm0);
code.cmplepd(b, a);
});
ctx.reg_alloc.DefineValue(inst, b);
ctx.reg_alloc.DefineValue(code, inst, b);
}
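
Note the operand order in all four comparisons: the legacy SSE encodings only offer lt/le predicates, so a > b is emitted as cmpltps(b, a) and a >= b as cmpleps(b, a), leaving the mask in b. Per-lane model (illustrative):

#include <cstdint>

// Model of one lane of cmpltps(b, a): all-ones where a > b, zero otherwise.
// Any NaN operand compares false, so the mask correctly ends up zero.
inline std::uint32_t GreaterMaskLane(float a, float b) {
    return (b < a) ? 0xFFFFFFFFu : 0u;
}
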
template<size_t fsize, bool is_max>
@@ -942,12 +942,12 @@ static void EmitFPVectorMinMax(BlockOfCode& code, EmitContext& ctx, IR::Inst* in
if (ctx.FPCR(fpcr_controlled).DN()) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm xmm_b = ctx.FPCR(fpcr_controlled).FZ() ? ctx.reg_alloc.UseScratchXmm(args[1]) : ctx.reg_alloc.UseXmm(args[1]);
const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
const Xbyak::Xmm xmm_b = ctx.FPCR(fpcr_controlled).FZ() ? ctx.reg_alloc.UseScratchXmm(code, args[1]) : ctx.reg_alloc.UseXmm(code, args[1]);
const Xbyak::Xmm mask = xmm0;
const Xbyak::Xmm eq = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm nan_mask = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm eq = ctx.reg_alloc.ScratchXmm(code);
const Xbyak::Xmm nan_mask = ctx.reg_alloc.ScratchXmm(code);
MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] {
DenormalsAreZero<fsize>(code, ctx.FPCR(fpcr_controlled), {result, xmm_b}, mask);
@@ -994,7 +994,7 @@ static void EmitFPVectorMinMax(BlockOfCode& code, EmitContext& ctx, IR::Inst* in
}
});
ctx.reg_alloc.DefineValue(inst, result);
ctx.reg_alloc.DefineValue(code, inst, result);
return;
}
@@ -1002,11 +1002,11 @@ static void EmitFPVectorMinMax(BlockOfCode& code, EmitContext& ctx, IR::Inst* in
EmitThreeOpVectorOperation<fsize, DefaultIndexer>(
code, ctx, inst, [&](const Xbyak::Xmm& result, Xbyak::Xmm xmm_b) {
const Xbyak::Xmm mask = xmm0;
const Xbyak::Xmm eq = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm eq = ctx.reg_alloc.ScratchXmm(code);
if (ctx.FPCR(fpcr_controlled).FZ()) {
const Xbyak::Xmm prev_xmm_b = xmm_b;
xmm_b = ctx.reg_alloc.ScratchXmm();
xmm_b = ctx.reg_alloc.ScratchXmm(code);
code.movaps(xmm_b, prev_xmm_b);
DenormalsAreZero<fsize>(code, ctx.FPCR(fpcr_controlled), {result, xmm_b}, mask);
}
@@ -1053,13 +1053,13 @@ static void EmitFPVectorMinMaxNumeric(BlockOfCode& code, EmitContext& ctx, IR::I
const bool fpcr_controlled = inst->GetArg(2).GetU1();
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseScratchXmm(args[1]);
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm intermediate_result = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
const Xbyak::Xmm intermediate_result = ctx.reg_alloc.ScratchXmm(code);
const Xbyak::Xmm tmp1 = xmm0;
const Xbyak::Xmm tmp2 = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm tmp2 = ctx.reg_alloc.ScratchXmm(code);
// NaN requirements:
// op1 op2 result
@@ -1139,7 +1139,7 @@ static void EmitFPVectorMinMaxNumeric(BlockOfCode& code, EmitContext& ctx, IR::I
}
});
ctx.reg_alloc.DefineValue(inst, result);
ctx.reg_alloc.DefineValue(code, inst, result);
return;
}
@@ -1230,7 +1230,7 @@ static void EmitFPVectorMinMaxNumeric(BlockOfCode& code, EmitContext& ctx, IR::I
}
});
ctx.reg_alloc.DefineValue(inst, result);
ctx.reg_alloc.DefineValue(code, inst, result);
}
void EmitX64::EmitFPVectorMax32(EmitContext& ctx, IR::Inst* inst) {
@@ -1316,27 +1316,27 @@ void EmitFPVectorMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
if (code.HasHostFeature(HostFeature::FMA) && !needs_rounding_correction && !needs_nan_correction) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
const Xbyak::Xmm xmm_c = ctx.reg_alloc.UseXmm(args[2]);
const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
const Xbyak::Xmm xmm_c = ctx.reg_alloc.UseXmm(code, args[2]);
MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] {
FCODE(vfmadd231p)(result, xmm_b, xmm_c);
ForceToDefaultNaN<fsize>(code, ctx.FPCR(fpcr_controlled), result);
});
ctx.reg_alloc.DefineValue(inst, result);
ctx.reg_alloc.DefineValue(code, inst, result);
return;
}
if (code.HasHostFeature(HostFeature::FMA | HostFeature::AVX)) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseXmm(args[0]);
const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
const Xbyak::Xmm xmm_c = ctx.reg_alloc.UseXmm(args[2]);
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseXmm(code, args[0]);
const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
const Xbyak::Xmm xmm_c = ctx.reg_alloc.UseXmm(code, args[2]);
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
SharedLabel end = GenSharedLabel(), fallback = GenSharedLabel();
@@ -1375,21 +1375,21 @@ void EmitFPVectorMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
code.jmp(*end, code.T_NEAR);
});
ctx.reg_alloc.DefineValue(inst, result);
ctx.reg_alloc.DefineValue(code, inst, result);
return;
}
if (ctx.HasOptimization(OptimizationFlag::Unsafe_UnfuseFMA)) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm operand1 = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm operand2 = ctx.reg_alloc.UseScratchXmm(args[1]);
const Xbyak::Xmm operand3 = ctx.reg_alloc.UseXmm(args[2]);
const Xbyak::Xmm operand1 = ctx.reg_alloc.UseScratchXmm(code, args[0]);
const Xbyak::Xmm operand2 = ctx.reg_alloc.UseScratchXmm(code, args[1]);
const Xbyak::Xmm operand3 = ctx.reg_alloc.UseXmm(code, args[2]);
FCODE(mulp)(operand2, operand3);
FCODE(addp)(operand1, operand2);
ctx.reg_alloc.DefineValue(inst, operand1);
ctx.reg_alloc.DefineValue(code, inst, operand1);
return;
}
}
@@ -1417,10 +1417,10 @@ static void EmitFPVectorMulX(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst
const bool fpcr_controlled = args[2].GetImmediateU1();
if (ctx.FPCR(fpcr_controlled).DN() && code.HasHostFeature(HostFeature::AVX)) {
const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm operand = ctx.reg_alloc.UseXmm(args[1]);
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm twos = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
const Xbyak::Xmm operand = ctx.reg_alloc.UseXmm(code, args[1]);
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
const Xbyak::Xmm twos = ctx.reg_alloc.ScratchXmm(code);
MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] {
FCODE(vcmpunordp)(xmm0, result, operand);
@@ -1434,14 +1434,14 @@ static void EmitFPVectorMulX(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst
FCODE(blendvp)(result, twos);
});
ctx.reg_alloc.DefineValue(inst, result);
ctx.reg_alloc.DefineValue(code, inst, result);
return;
}
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseXmm(args[0]);
const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
const Xbyak::Xmm nan_mask = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseXmm(code, args[0]);
const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
const Xbyak::Xmm nan_mask = ctx.reg_alloc.ScratchXmm(code);
code.movaps(nan_mask, xmm_b);
code.movaps(result, xmm_a);
@@ -1464,7 +1464,7 @@ static void EmitFPVectorMulX(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst
HandleNaNs<fsize, 2>(code, ctx, fpcr_controlled, {result, xmm_a, xmm_b}, nan_mask, nan_handler);
ctx.reg_alloc.DefineValue(inst, result);
ctx.reg_alloc.DefineValue(code, inst, result);
}
void EmitX64::EmitFPVectorMulX32(EmitContext& ctx, IR::Inst* inst) {
@@ -1482,12 +1482,12 @@ void FPVectorNeg(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
const Xbyak::Address mask = code.BConst<fsize>(xword, sign_mask);
code.xorps(a, mask);
ctx.reg_alloc.DefineValue(inst, a);
ctx.reg_alloc.DefineValue(code, inst, a);
}
void EmitX64::EmitFPVectorNeg16(EmitContext& ctx, IR::Inst* inst) {
@@ -1512,7 +1512,7 @@ void EmitX64::EmitFPVectorPairedAdd64(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitFPVectorPairedAddLower32(EmitContext& ctx, IR::Inst* inst) {
EmitThreeOpVectorOperation<32, PairedLowerIndexer>(code, ctx, inst, [&](Xbyak::Xmm result, Xbyak::Xmm xmm_b) {
const Xbyak::Xmm zero = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm zero = ctx.reg_alloc.ScratchXmm(code);
code.xorps(zero, zero);
code.punpcklqdq(result, xmm_b);
code.haddps(result, zero);
@@ -1521,7 +1521,7 @@ void EmitX64::EmitFPVectorPairedAddLower32(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitFPVectorPairedAddLower64(EmitContext& ctx, IR::Inst* inst) {
EmitThreeOpVectorOperation<64, PairedLowerIndexer>(code, ctx, inst, [&](Xbyak::Xmm result, Xbyak::Xmm xmm_b) {
const Xbyak::Xmm zero = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm zero = ctx.reg_alloc.ScratchXmm(code);
code.xorps(zero, zero);
code.punpcklqdq(result, xmm_b);
code.haddpd(result, zero);
@@ -1535,8 +1535,8 @@ static void EmitRecipEstimate(BlockOfCode& code, EmitContext& ctx, IR::Inst* ins
if constexpr (fsize != 16) {
if (ctx.HasOptimization(OptimizationFlag::Unsafe_ReducedErrorFP)) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm operand = ctx.reg_alloc.UseXmm(args[0]);
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm operand = ctx.reg_alloc.UseXmm(code, args[0]);
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) {
FCODE(vrcp14p)(result, operand);
@@ -1550,7 +1550,7 @@ static void EmitRecipEstimate(BlockOfCode& code, EmitContext& ctx, IR::Inst* ins
}
}
ctx.reg_alloc.DefineValue(inst, result);
ctx.reg_alloc.DefineValue(code, inst, result);
return;
}
}
@@ -1589,16 +1589,16 @@ static void EmitRecipStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst* in
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const bool fpcr_controlled = args[2].GetImmediateU1();
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(args[0]);
const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(code, args[0]);
const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(code, args[1]);
MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] {
code.movaps(result, GetVectorOf<fsize, false, 0, 2>(code));
FCODE(vfnmadd231p)(result, operand1, operand2);
});
ctx.reg_alloc.DefineValue(inst, result);
ctx.reg_alloc.DefineValue(code, inst, result);
return;
}
@@ -1606,10 +1606,10 @@ static void EmitRecipStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst* in
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const bool fpcr_controlled = args[2].GetImmediateU1();
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(args[0]);
const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(code, args[0]);
const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(code, args[1]);
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
SharedLabel end = GenSharedLabel(), fallback = GenSharedLabel();
@@ -1633,22 +1633,22 @@ static void EmitRecipStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst* in
code.jmp(*end, code.T_NEAR);
});
ctx.reg_alloc.DefineValue(inst, result);
ctx.reg_alloc.DefineValue(code, inst, result);
return;
}
if (ctx.HasOptimization(OptimizationFlag::Unsafe_UnfuseFMA)) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm operand1 = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm operand1 = ctx.reg_alloc.UseScratchXmm(code, args[0]);
const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(code, args[1]);
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
code.movaps(result, GetVectorOf<fsize, false, 0, 2>(code));
FCODE(mulp)(operand1, operand2);
FCODE(subp)(result, operand1);
ctx.reg_alloc.DefineValue(inst, result);
ctx.reg_alloc.DefineValue(code, inst, result);
return;
}
}
@@ -1757,8 +1757,8 @@ static void EmitRSqrtEstimate(BlockOfCode& code, EmitContext& ctx, IR::Inst* ins
if constexpr (fsize != 16) {
if (ctx.HasOptimization(OptimizationFlag::Unsafe_ReducedErrorFP)) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm operand = ctx.reg_alloc.UseXmm(args[0]);
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm operand = ctx.reg_alloc.UseXmm(code, args[0]);
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) {
FCODE(vrsqrt14p)(result, operand);
@@ -1772,7 +1772,7 @@ static void EmitRSqrtEstimate(BlockOfCode& code, EmitContext& ctx, IR::Inst* ins
}
}
ctx.reg_alloc.DefineValue(inst, result);
ctx.reg_alloc.DefineValue(code, inst, result);
return;
}
@@ -1780,9 +1780,9 @@ static void EmitRSqrtEstimate(BlockOfCode& code, EmitContext& ctx, IR::Inst* ins
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const bool fpcr_controlled = args[1].GetImmediateU1();
const Xbyak::Xmm operand = ctx.reg_alloc.UseXmm(args[0]);
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm value = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm operand = ctx.reg_alloc.UseXmm(code, args[0]);
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
const Xbyak::Xmm value = ctx.reg_alloc.ScratchXmm(code);
SharedLabel bad_values = GenSharedLabel(), end = GenSharedLabel();
@@ -1816,7 +1816,7 @@ static void EmitRSqrtEstimate(BlockOfCode& code, EmitContext& ctx, IR::Inst* ins
code.jmp(*end, code.T_NEAR);
});
ctx.reg_alloc.DefineValue(inst, result);
ctx.reg_alloc.DefineValue(code, inst, result);
return;
}
}
@@ -1851,9 +1851,9 @@ static void EmitRSqrtStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst* in
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const bool fpcr_controlled = args[2].GetImmediateU1();
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(args[0]);
const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(code, args[0]);
const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(code, args[1]);
MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] {
code.vmovaps(result, GetVectorOf<fsize, false, 0, 3>(code));
@@ -1861,7 +1861,7 @@ static void EmitRSqrtStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst* in
FCODE(vmulp)(result, result, GetVectorOf<fsize, false, -1, 1>(code));
});
ctx.reg_alloc.DefineValue(inst, result);
ctx.reg_alloc.DefineValue(code, inst, result);
return;
}
@@ -1869,11 +1869,11 @@ static void EmitRSqrtStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst* in
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const bool fpcr_controlled = args[2].GetImmediateU1();
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(args[0]);
const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm mask = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(code, args[0]);
const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(code, args[1]);
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
const Xbyak::Xmm mask = ctx.reg_alloc.ScratchXmm(code);
SharedLabel end = GenSharedLabel(), fallback = GenSharedLabel();
@@ -1902,23 +1902,23 @@ static void EmitRSqrtStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst* in
code.jmp(*end, code.T_NEAR);
});
ctx.reg_alloc.DefineValue(inst, result);
ctx.reg_alloc.DefineValue(code, inst, result);
return;
}
if (ctx.HasOptimization(OptimizationFlag::Unsafe_UnfuseFMA)) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm operand1 = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm operand1 = ctx.reg_alloc.UseScratchXmm(code, args[0]);
const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(code, args[1]);
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
code.movaps(result, GetVectorOf<fsize, false, 0, 3>(code));
FCODE(mulp)(operand1, operand2);
FCODE(subp)(result, operand1);
FCODE(mulp)(result, GetVectorOf<fsize, false, -1, 1>(code));
ctx.reg_alloc.DefineValue(inst, result);
ctx.reg_alloc.DefineValue(code, inst, result);
return;
}
}
@@ -1972,12 +1972,12 @@ void EmitX64::EmitFPVectorToHalf32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const auto round_imm = ConvertRoundingModeToX64Immediate(rounding_mode);
const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
ForceToDefaultNaN<32>(code, ctx.FPCR(fpcr_controlled), result);
code.vcvtps2ph(result, result, u8(*round_imm));
ctx.reg_alloc.DefineValue(inst, result);
ctx.reg_alloc.DefineValue(code, inst, result);
} else {
switch (rounding_mode) {
case FP::RoundingMode::ToNearest_TieEven:
@@ -2018,7 +2018,7 @@ void EmitFPVectorToFixed(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
if (code.HasHostFeature(HostFeature::SSE41) && rounding != FP::RoundingMode::ToNearest_TieAwayFromZero) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm src = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm src = ctx.reg_alloc.UseScratchXmm(code, args[0]);
MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] {
const int round_imm = [&] {
@@ -2045,8 +2045,8 @@ void EmitFPVectorToFixed(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) {
code.vcvttpd2qq(src, src);
} else {
const Xbyak::Reg64 hi = ctx.reg_alloc.ScratchGpr();
const Xbyak::Reg64 lo = ctx.reg_alloc.ScratchGpr();
const Xbyak::Reg64 hi = ctx.reg_alloc.ScratchGpr(code);
const Xbyak::Reg64 lo = ctx.reg_alloc.ScratchGpr(code);
code.cvttsd2si(lo, src);
code.punpckhqdq(src, src);
@@ -2093,12 +2093,12 @@ void EmitFPVectorToFixed(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
FCODE(andp)(src, xmm0);
// Will we exceed unsigned range?
const Xbyak::Xmm exceed_unsigned = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm exceed_unsigned = ctx.reg_alloc.ScratchXmm(code);
code.movaps(exceed_unsigned, GetVectorOf<fsize, float_upper_limit_unsigned>(code));
FCODE(cmplep)(exceed_unsigned, src);
// Will we exceed signed range?
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
code.movaps(tmp, GetVectorOf<fsize, float_upper_limit_signed>(code));
code.movaps(xmm0, tmp);
FCODE(cmplep)(xmm0, src);
@@ -2122,7 +2122,7 @@ void EmitFPVectorToFixed(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
}
});
ctx.reg_alloc.DefineValue(inst, src);
ctx.reg_alloc.DefineValue(code, inst, src);
return;
}
}

View File

@@ -26,9 +26,9 @@ namespace {
void EmitVectorSaturatedNative(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, void (Xbyak::CodeGenerator::*saturated_fn)(const Xbyak::Mmx& mmx, const Xbyak::Operand&), void (Xbyak::CodeGenerator::*unsaturated_fn)(const Xbyak::Mmx& mmx, const Xbyak::Operand&), void (Xbyak::CodeGenerator::*sub_fn)(const Xbyak::Mmx& mmx, const Xbyak::Operand&)) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm addend = ctx.reg_alloc.UseXmm(args[1]);
const Xbyak::Reg8 overflow = ctx.reg_alloc.ScratchGpr().cvt8();
const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
const Xbyak::Xmm addend = ctx.reg_alloc.UseXmm(code, args[1]);
const Xbyak::Reg8 overflow = ctx.reg_alloc.ScratchGpr(code).cvt8();
code.movaps(xmm0, result);
@@ -39,7 +39,7 @@ void EmitVectorSaturatedNative(BlockOfCode& code, EmitContext& ctx, IR::Inst* in
if (code.HasHostFeature(HostFeature::SSE41)) {
code.ptest(xmm0, xmm0);
} else {
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
code.pxor(tmp, tmp);
code.pcmpeqw(xmm0, tmp);
code.pmovmskb(overflow.cvt32(), xmm0);
@@ -49,7 +49,7 @@ void EmitVectorSaturatedNative(BlockOfCode& code, EmitContext& ctx, IR::Inst* in
code.setnz(overflow);
code.or_(code.byte[code.ABI_JIT_PTR + code.GetJitStateInfo().offsetof_fpsr_qc], overflow);
ctx.reg_alloc.DefineValue(inst, result);
ctx.reg_alloc.DefineValue(code, inst, result);
}
enum class Op {
@@ -65,10 +65,10 @@ void EmitVectorSignedSaturated(BlockOfCode& code, EmitContext& ctx, IR::Inst* in
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512DQ)) {
const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(args[0]);
const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
const Xbyak::Reg8 overflow = ctx.reg_alloc.ScratchGpr().cvt8();
const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(code, args[0]);
const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(code, args[1]);
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
const Xbyak::Reg8 overflow = ctx.reg_alloc.ScratchGpr(code).cvt8();
code.movaps(xmm0, operand1);
@@ -91,15 +91,15 @@ void EmitVectorSignedSaturated(BlockOfCode& code, EmitContext& ctx, IR::Inst* in
code.setnz(overflow);
code.or_(code.byte[code.ABI_JIT_PTR + code.GetJitStateInfo().offsetof_fpsr_qc], overflow);
ctx.reg_alloc.DefineValue(inst, result);
ctx.reg_alloc.DefineValue(code, inst, result);
return;
}
const Xbyak::Xmm operand1 = code.HasHostFeature(HostFeature::AVX) ? ctx.reg_alloc.UseXmm(args[0]) : ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
const Xbyak::Xmm result = code.HasHostFeature(HostFeature::AVX) ? ctx.reg_alloc.ScratchXmm() : operand1;
const Xbyak::Reg8 overflow = ctx.reg_alloc.ScratchGpr().cvt8();
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm operand1 = code.HasHostFeature(HostFeature::AVX) ? ctx.reg_alloc.UseXmm(code, args[0]) : ctx.reg_alloc.UseScratchXmm(code, args[0]);
const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(code, args[1]);
const Xbyak::Xmm result = code.HasHostFeature(HostFeature::AVX) ? ctx.reg_alloc.ScratchXmm(code) : operand1;
const Xbyak::Reg8 overflow = ctx.reg_alloc.ScratchGpr(code).cvt8();
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
if (code.HasHostFeature(HostFeature::AVX)) {
if constexpr (op == Op::Add) {
@@ -150,7 +150,7 @@ void EmitVectorSignedSaturated(BlockOfCode& code, EmitContext& ctx, IR::Inst* in
if (code.HasHostFeature(HostFeature::SSE41)) {
FCODE(blendvp)(result, tmp);
ctx.reg_alloc.DefineValue(inst, result);
ctx.reg_alloc.DefineValue(code, inst, result);
} else {
code.psrad(xmm0, 31);
if constexpr (esize == 64) {
@@ -161,7 +161,7 @@ void EmitVectorSignedSaturated(BlockOfCode& code, EmitContext& ctx, IR::Inst* in
code.pandn(xmm0, result);
code.por(tmp, xmm0);
ctx.reg_alloc.DefineValue(inst, tmp);
ctx.reg_alloc.DefineValue(code, inst, tmp);
}
}
@@ -172,10 +172,10 @@ void EmitVectorUnsignedSaturated(BlockOfCode& code, EmitContext& ctx, IR::Inst*
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512DQ)) {
const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(args[0]);
const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
const Xbyak::Reg8 overflow = ctx.reg_alloc.ScratchGpr().cvt8();
const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(code, args[0]);
const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(code, args[1]);
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
const Xbyak::Reg8 overflow = ctx.reg_alloc.ScratchGpr(code).cvt8();
if constexpr (op == Op::Add) {
ICODE(vpadd)(result, operand1, operand2);
@@ -191,15 +191,15 @@ void EmitVectorUnsignedSaturated(BlockOfCode& code, EmitContext& ctx, IR::Inst*
code.setnz(overflow);
code.or_(code.byte[code.ABI_JIT_PTR + code.GetJitStateInfo().offsetof_fpsr_qc], overflow);
ctx.reg_alloc.DefineValue(inst, result);
ctx.reg_alloc.DefineValue(code, inst, result);
return;
}
const Xbyak::Xmm operand1 = code.HasHostFeature(HostFeature::AVX) ? ctx.reg_alloc.UseXmm(args[0]) : ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
const Xbyak::Xmm result = code.HasHostFeature(HostFeature::AVX) ? ctx.reg_alloc.ScratchXmm() : operand1;
const Xbyak::Reg8 overflow = ctx.reg_alloc.ScratchGpr().cvt8();
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm operand1 = code.HasHostFeature(HostFeature::AVX) ? ctx.reg_alloc.UseXmm(code, args[0]) : ctx.reg_alloc.UseScratchXmm(code, args[0]);
const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(code, args[1]);
const Xbyak::Xmm result = code.HasHostFeature(HostFeature::AVX) ? ctx.reg_alloc.ScratchXmm(code) : operand1;
const Xbyak::Reg8 overflow = ctx.reg_alloc.ScratchGpr(code).cvt8();
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
if constexpr (op == Op::Add) {
if (code.HasHostFeature(HostFeature::AVX)) {
@@ -252,10 +252,10 @@ void EmitVectorUnsignedSaturated(BlockOfCode& code, EmitContext& ctx, IR::Inst*
if constexpr (op == Op::Add) {
code.por(result, tmp);
ctx.reg_alloc.DefineValue(inst, result);
ctx.reg_alloc.DefineValue(code, inst, result);
} else {
code.pandn(tmp, result);
ctx.reg_alloc.DefineValue(inst, tmp);
ctx.reg_alloc.DefineValue(code, inst, tmp);
}
}
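Both saturating helpers finish with the same sticky-flag idiom: compare the saturated result against the plain result, setnz into a scratch byte, and OR it into FPSR.QC so saturation stays latched until the guest clears it. A scalar restatement of that idea in plain C++ (illustration only, not the vectorized emitter path):

#include <algorithm>
#include <cstdint>

// Signed saturating 32-bit add with ARM-style sticky QC reporting.
int32_t SaturatedAdd(int32_t a, int32_t b, bool& qc) {
    const int64_t wide = int64_t(a) + int64_t(b);
    const int64_t clamped = std::clamp<int64_t>(wide, INT32_MIN, INT32_MAX);
    qc |= (clamped != wide);  // sticky: once saturation happens, the flag stays set
    return int32_t(clamped);
}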

View File

@@ -1,25 +0,0 @@
/* This file is part of the dynarmic project.
* Copyright (c) 2016 MerryMage
* SPDX-License-Identifier: 0BSD
*/
#include "dynarmic/backend/x64/hostloc.h"
#include <xbyak/xbyak.h>
#include "dynarmic/backend/x64/abi.h"
#include "dynarmic/backend/x64/stack_layout.h"
namespace Dynarmic::Backend::X64 {
Xbyak::Reg64 HostLocToReg64(HostLoc loc) {
ASSERT(HostLocIsGPR(loc));
return Xbyak::Reg64(static_cast<int>(loc));
}
Xbyak::Xmm HostLocToXmm(HostLoc loc) {
ASSERT(HostLocIsXMM(loc));
return Xbyak::Xmm(static_cast<int>(loc) - static_cast<int>(HostLoc::XMM0));
}
} // namespace Dynarmic::Backend::X64

View File

@@ -152,7 +152,14 @@ const HostLocList any_xmm = {
HostLoc::XMM15,
};
Xbyak::Reg64 HostLocToReg64(HostLoc loc);
Xbyak::Xmm HostLocToXmm(HostLoc loc);
inline Xbyak::Reg64 HostLocToReg64(HostLoc loc) noexcept {
ASSERT(HostLocIsGPR(loc));
return Xbyak::Reg64(int(loc));
}
inline Xbyak::Xmm HostLocToXmm(HostLoc loc) noexcept {
ASSERT(HostLocIsXMM(loc));
return Xbyak::Xmm(int(loc) - int(HostLoc::XMM0));
}
} // namespace Dynarmic::Backend::X64
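hostloc.cpp (deleted above) held nothing but these two one-line converters; defining them inline in the header removes a cross-TU call from every register lookup. The index arithmetic assumes GPR enumerators start at zero and XMM enumerators are contiguous from HostLoc::XMM0; a toy illustration with assumed enumerator values:

// Assumed layout for illustration; the real enum lives earlier in this header.
enum class HostLocToy : int { RAX = 0, RCX = 1, /* ... */ XMM0 = 16, XMM1 = 17 };

static_assert(int(HostLocToy::RCX) == 1);                          // -> Xbyak::Reg64(1) == rcx
static_assert(int(HostLocToy::XMM1) - int(HostLocToy::XMM0) == 1); // -> Xbyak::Xmm(1)  == xmm1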

View File

@@ -24,15 +24,6 @@
namespace Dynarmic::Backend::X64 {
#define MAYBE_AVX(OPCODE, ...) \
[&] { \
if (code->HasHostFeature(HostFeature::AVX)) { \
code->v##OPCODE(__VA_ARGS__); \
} else { \
code->OPCODE(__VA_ARGS__); \
} \
}()
static inline bool CanExchange(const HostLoc a, const HostLoc b) noexcept {
return HostLocIsGPR(a) && HostLocIsGPR(b);
}
@@ -107,14 +98,14 @@ void HostLocInfo::AddValue(IR::Inst* inst) noexcept {
max_bit_width = std::max<uint8_t>(max_bit_width, std::countr_zero(GetBitWidth(inst->GetType())));
}
void HostLocInfo::EmitVerboseDebuggingOutput(BlockOfCode* code, size_t host_loc_index) const noexcept {
void HostLocInfo::EmitVerboseDebuggingOutput(BlockOfCode& code, size_t host_loc_index) const noexcept {
using namespace Xbyak::util;
for (auto const value : values) {
code->mov(code->ABI_PARAM1, rsp);
code->mov(code->ABI_PARAM2, host_loc_index);
code->mov(code->ABI_PARAM3, value->GetName());
code->mov(code->ABI_PARAM4, GetBitWidth(value->GetType()));
code->CallFunction(PrintVerboseDebuggingOutputLine);
code.mov(code.ABI_PARAM1, rsp);
code.mov(code.ABI_PARAM2, host_loc_index);
code.mov(code.ABI_PARAM3, value->GetName());
code.mov(code.ABI_PARAM4, GetBitWidth(value->GetType()));
code.CallFunction(PrintVerboseDebuggingOutputLine);
}
}
@@ -128,7 +119,7 @@ bool Argument::FitsInImmediateU32() const noexcept {
bool Argument::FitsInImmediateS32() const noexcept {
if (!IsImmediate())
return false;
const s64 imm = static_cast<s64>(value.GetImmediateAsU64());
const s64 imm = s64(value.GetImmediateAsU64());
return -s64(0x80000000) <= imm && imm <= s64(0x7FFFFFFF);
}
@@ -174,36 +165,38 @@ IR::AccType Argument::GetImmediateAccType() const noexcept {
}
/// Is this value currently in a GPR?
bool Argument::IsInGpr() const noexcept {
bool Argument::IsInGpr(RegAlloc& reg_alloc) const noexcept {
if (IsImmediate())
return false;
return HostLocIsGPR(*reg_alloc.ValueLocation(value.GetInst()));
}
/// Is this value currently in a XMM?
bool Argument::IsInXmm() const noexcept {
bool Argument::IsInXmm(RegAlloc& reg_alloc) const noexcept {
if (IsImmediate())
return false;
return HostLocIsXMM(*reg_alloc.ValueLocation(value.GetInst()));
}
/// Is this value currently in memory?
bool Argument::IsInMemory() const noexcept {
bool Argument::IsInMemory(RegAlloc& reg_alloc) const noexcept {
if (IsImmediate())
return false;
return HostLocIsSpill(*reg_alloc.ValueLocation(value.GetInst()));
}
RegAlloc::RegAlloc(BlockOfCode* code, boost::container::static_vector<HostLoc, 28> gpr_order, boost::container::static_vector<HostLoc, 28> xmm_order) noexcept
RegAlloc::RegAlloc(boost::container::static_vector<HostLoc, 28> gpr_order, boost::container::static_vector<HostLoc, 28> xmm_order) noexcept
: gpr_order(gpr_order),
xmm_order(xmm_order),
code(code)
xmm_order(xmm_order)
{}
//static std::uint64_t Zfncwjkrt_blockOfCodeShim = 0;
RegAlloc::ArgumentInfo RegAlloc::GetArgumentInfo(const IR::Inst* inst) noexcept {
ArgumentInfo ret{Argument{*this}, Argument{*this}, Argument{*this}, Argument{*this}};
ArgumentInfo ret{
Argument{},
Argument{},
Argument{},
Argument{}
};
for (size_t i = 0; i < inst->NumArgs(); i++) {
const auto arg = inst->GetArg(i);
ret[i].value = arg;
@@ -228,34 +221,34 @@ void RegAlloc::RegisterPseudoOperation(const IR::Inst* inst) noexcept {
}
}
Xbyak::Reg64 RegAlloc::UseScratchGpr(Argument& arg) noexcept {
Xbyak::Reg64 RegAlloc::UseScratchGpr(BlockOfCode& code, Argument& arg) noexcept {
ASSERT(!arg.allocated);
arg.allocated = true;
return HostLocToReg64(UseScratchImpl(arg.value, gpr_order));
return HostLocToReg64(UseScratchImpl(code, arg.value, gpr_order));
}
Xbyak::Xmm RegAlloc::UseScratchXmm(Argument& arg) noexcept {
Xbyak::Xmm RegAlloc::UseScratchXmm(BlockOfCode& code, Argument& arg) noexcept {
ASSERT(!arg.allocated);
arg.allocated = true;
return HostLocToXmm(UseScratchImpl(arg.value, xmm_order));
return HostLocToXmm(UseScratchImpl(code, arg.value, xmm_order));
}
void RegAlloc::UseScratch(Argument& arg, HostLoc host_loc) noexcept {
void RegAlloc::UseScratch(BlockOfCode& code, Argument& arg, HostLoc host_loc) noexcept {
ASSERT(!arg.allocated);
arg.allocated = true;
UseScratchImpl(arg.value, {host_loc});
UseScratchImpl(code, arg.value, {host_loc});
}
void RegAlloc::DefineValue(IR::Inst* inst, const Xbyak::Reg& reg) noexcept {
void RegAlloc::DefineValue(BlockOfCode& code, IR::Inst* inst, const Xbyak::Reg& reg) noexcept {
ASSERT(reg.getKind() == Xbyak::Operand::XMM || reg.getKind() == Xbyak::Operand::REG);
const auto hostloc = static_cast<HostLoc>(reg.getIdx() + static_cast<size_t>(reg.getKind() == Xbyak::Operand::XMM ? HostLoc::XMM0 : HostLoc::RAX));
DefineValueImpl(inst, hostloc);
DefineValueImpl(code, inst, hostloc);
}
void RegAlloc::DefineValue(IR::Inst* inst, Argument& arg) noexcept {
void RegAlloc::DefineValue(BlockOfCode& code, IR::Inst* inst, Argument& arg) noexcept {
ASSERT(!arg.allocated);
arg.allocated = true;
DefineValueImpl(inst, arg.value);
DefineValueImpl(code, inst, arg.value);
}
void RegAlloc::Release(const Xbyak::Reg& reg) noexcept {
@@ -264,9 +257,9 @@ void RegAlloc::Release(const Xbyak::Reg& reg) noexcept {
LocInfo(hostloc).ReleaseOne();
}
HostLoc RegAlloc::UseImpl(IR::Value use_value, const boost::container::static_vector<HostLoc, 28>& desired_locations) noexcept {
HostLoc RegAlloc::UseImpl(BlockOfCode& code, IR::Value use_value, const boost::container::static_vector<HostLoc, 28>& desired_locations) noexcept {
if (use_value.IsImmediate()) {
return LoadImmediate(use_value, ScratchImpl(desired_locations));
return LoadImmediate(code, use_value, ScratchImpl(code, desired_locations));
}
const auto* use_inst = use_value.GetInst();
@@ -280,25 +273,25 @@ HostLoc RegAlloc::UseImpl(IR::Value use_value, const boost::container::static_ve
}
if (LocInfo(current_location).IsLocked()) {
return UseScratchImpl(use_value, desired_locations);
return UseScratchImpl(code, use_value, desired_locations);
}
const HostLoc destination_location = SelectARegister(desired_locations);
if (max_bit_width > HostLocBitWidth(destination_location)) {
return UseScratchImpl(use_value, desired_locations);
return UseScratchImpl(code, use_value, desired_locations);
} else if (CanExchange(destination_location, current_location)) {
Exchange(destination_location, current_location);
Exchange(code, destination_location, current_location);
} else {
MoveOutOfTheWay(destination_location);
Move(destination_location, current_location);
MoveOutOfTheWay(code, destination_location);
Move(code, destination_location, current_location);
}
LocInfo(destination_location).ReadLock();
return destination_location;
}
HostLoc RegAlloc::UseScratchImpl(IR::Value use_value, const boost::container::static_vector<HostLoc, 28>& desired_locations) noexcept {
HostLoc RegAlloc::UseScratchImpl(BlockOfCode& code, IR::Value use_value, const boost::container::static_vector<HostLoc, 28>& desired_locations) noexcept {
if (use_value.IsImmediate()) {
return LoadImmediate(use_value, ScratchImpl(desired_locations));
return LoadImmediate(code, use_value, ScratchImpl(code, desired_locations));
}
const auto* use_inst = use_value.GetInst();
@@ -308,7 +301,7 @@ HostLoc RegAlloc::UseScratchImpl(IR::Value use_value, const boost::container::st
const bool can_use_current_location = std::find(desired_locations.begin(), desired_locations.end(), current_location) != desired_locations.end();
if (can_use_current_location && !LocInfo(current_location).IsLocked()) {
if (!LocInfo(current_location).IsLastUse()) {
MoveOutOfTheWay(current_location);
MoveOutOfTheWay(code, current_location);
} else {
LocInfo(current_location).SetLastUse();
}
@@ -317,20 +310,22 @@ HostLoc RegAlloc::UseScratchImpl(IR::Value use_value, const boost::container::st
}
const HostLoc destination_location = SelectARegister(desired_locations);
MoveOutOfTheWay(destination_location);
CopyToScratch(bit_width, destination_location, current_location);
MoveOutOfTheWay(code, destination_location);
CopyToScratch(code, bit_width, destination_location, current_location);
LocInfo(destination_location).WriteLock();
return destination_location;
}
HostLoc RegAlloc::ScratchImpl(const boost::container::static_vector<HostLoc, 28>& desired_locations) noexcept {
HostLoc RegAlloc::ScratchImpl(BlockOfCode& code, const boost::container::static_vector<HostLoc, 28>& desired_locations) noexcept {
const HostLoc location = SelectARegister(desired_locations);
MoveOutOfTheWay(location);
MoveOutOfTheWay(code, location);
LocInfo(location).WriteLock();
return location;
}
void RegAlloc::HostCall(IR::Inst* result_def,
void RegAlloc::HostCall(
BlockOfCode& code,
IR::Inst* result_def,
const std::optional<Argument::copyable_reference> arg0,
const std::optional<Argument::copyable_reference> arg1,
const std::optional<Argument::copyable_reference> arg2,
@@ -348,20 +343,20 @@ void RegAlloc::HostCall(IR::Inst* result_def,
return ret;
}();
ScratchGpr(ABI_RETURN);
if (result_def) {
DefineValueImpl(result_def, ABI_RETURN);
}
ScratchGpr(code, ABI_RETURN);
if (result_def)
DefineValueImpl(code, result_def, ABI_RETURN);
for (size_t i = 0; i < args.size(); i++) {
if (args[i]) {
UseScratch(*args[i], args_hostloc[i]);
UseScratch(code, *args[i], args_hostloc[i]);
} else {
ScratchGpr(args_hostloc[i]); // TODO: Force spill
ScratchGpr(code, args_hostloc[i]); // TODO: Force spill
}
}
// Must match with ScratchImpl
for (auto const gpr : other_caller_save) {
MoveOutOfTheWay(gpr);
MoveOutOfTheWay(code, gpr);
LocInfo(gpr).WriteLock();
}
for (size_t i = 0; i < args.size(); i++) {
@@ -370,13 +365,13 @@ void RegAlloc::HostCall(IR::Inst* result_def,
const Xbyak::Reg64 reg = HostLocToReg64(args_hostloc[i]);
switch (args[i]->get().GetType()) {
case IR::Type::U8:
code->movzx(reg.cvt32(), reg.cvt8());
code.movzx(reg.cvt32(), reg.cvt8());
break;
case IR::Type::U16:
code->movzx(reg.cvt32(), reg.cvt16());
code.movzx(reg.cvt32(), reg.cvt16());
break;
case IR::Type::U32:
code->mov(reg.cvt32(), reg.cvt32());
code.mov(reg.cvt32(), reg.cvt32());
break;
case IR::Type::U64:
break; // no-op
@@ -387,18 +382,18 @@ void RegAlloc::HostCall(IR::Inst* result_def,
}
}
void RegAlloc::AllocStackSpace(const size_t stack_space) noexcept {
void RegAlloc::AllocStackSpace(BlockOfCode& code, const size_t stack_space) noexcept {
ASSERT(stack_space < size_t((std::numeric_limits<s32>::max)()));
ASSERT(reserved_stack_space == 0);
reserved_stack_space = stack_space;
code->sub(code->rsp, u32(stack_space));
code.sub(code.rsp, u32(stack_space));
}
void RegAlloc::ReleaseStackSpace(const size_t stack_space) noexcept {
void RegAlloc::ReleaseStackSpace(BlockOfCode& code, const size_t stack_space) noexcept {
ASSERT(stack_space < size_t((std::numeric_limits<s32>::max)()));
ASSERT(reserved_stack_space == stack_space);
reserved_stack_space = 0;
code->add(code->rsp, u32(stack_space));
code.add(code.rsp, u32(stack_space));
}
HostLoc RegAlloc::SelectARegister(const boost::container::static_vector<HostLoc, 28>& desired_locations) const noexcept {
@@ -458,92 +453,75 @@ HostLoc RegAlloc::SelectARegister(const boost::container::static_vector<HostLoc,
return *it_final;
}
void RegAlloc::DefineValueImpl(IR::Inst* def_inst, HostLoc host_loc) noexcept {
std::optional<HostLoc> RegAlloc::ValueLocation(const IR::Inst* value) const noexcept {
for (size_t i = 0; i < hostloc_info.size(); i++)
if (hostloc_info[i].ContainsValue(value))
return HostLoc(i);
return std::nullopt;
}
void RegAlloc::DefineValueImpl(BlockOfCode& code, IR::Inst* def_inst, HostLoc host_loc) noexcept {
ASSERT(!ValueLocation(def_inst) && "def_inst has already been defined");
LocInfo(host_loc).AddValue(def_inst);
}
void RegAlloc::DefineValueImpl(IR::Inst* def_inst, const IR::Value& use_inst) noexcept {
void RegAlloc::DefineValueImpl(BlockOfCode& code, IR::Inst* def_inst, const IR::Value& use_inst) noexcept {
ASSERT(!ValueLocation(def_inst) && "def_inst has already been defined");
if (use_inst.IsImmediate()) {
const HostLoc location = ScratchImpl(gpr_order);
DefineValueImpl(def_inst, location);
LoadImmediate(use_inst, location);
const HostLoc location = ScratchImpl(code, gpr_order);
DefineValueImpl(code, def_inst, location);
LoadImmediate(code, use_inst, location);
return;
}
ASSERT(ValueLocation(use_inst.GetInst()) && "use_inst must already be defined");
const HostLoc location = *ValueLocation(use_inst.GetInst());
DefineValueImpl(def_inst, location);
DefineValueImpl(code, def_inst, location);
}
HostLoc RegAlloc::LoadImmediate(IR::Value imm, HostLoc host_loc) noexcept {
ASSERT(imm.IsImmediate() && "imm is not an immediate");
if (HostLocIsGPR(host_loc)) {
const Xbyak::Reg64 reg = HostLocToReg64(host_loc);
const u64 imm_value = imm.GetImmediateAsU64();
if (imm_value == 0) {
code->xor_(reg.cvt32(), reg.cvt32());
} else {
code->mov(reg, imm_value);
}
} else if (HostLocIsXMM(host_loc)) {
const Xbyak::Xmm reg = HostLocToXmm(host_loc);
const u64 imm_value = imm.GetImmediateAsU64();
if (imm_value == 0) {
MAYBE_AVX(xorps, reg, reg);
} else {
MAYBE_AVX(movaps, reg, code->Const(code->xword, imm_value));
}
} else {
UNREACHABLE();
}
return host_loc;
}
void RegAlloc::Move(HostLoc to, HostLoc from) noexcept {
void RegAlloc::Move(BlockOfCode& code, HostLoc to, HostLoc from) noexcept {
const size_t bit_width = LocInfo(from).GetMaxBitWidth();
ASSERT(LocInfo(to).IsEmpty() && !LocInfo(from).IsLocked());
ASSERT(bit_width <= HostLocBitWidth(to));
ASSERT(!LocInfo(from).IsEmpty() && "Mov eliminated");
EmitMove(bit_width, to, from);
EmitMove(code, bit_width, to, from);
LocInfo(to) = std::exchange(LocInfo(from), {});
}
void RegAlloc::CopyToScratch(size_t bit_width, HostLoc to, HostLoc from) noexcept {
void RegAlloc::CopyToScratch(BlockOfCode& code, size_t bit_width, HostLoc to, HostLoc from) noexcept {
ASSERT(LocInfo(to).IsEmpty() && !LocInfo(from).IsEmpty());
EmitMove(bit_width, to, from);
EmitMove(code, bit_width, to, from);
}
void RegAlloc::Exchange(HostLoc a, HostLoc b) noexcept {
void RegAlloc::Exchange(BlockOfCode& code, HostLoc a, HostLoc b) noexcept {
ASSERT(!LocInfo(a).IsLocked() && !LocInfo(b).IsLocked());
ASSERT(LocInfo(a).GetMaxBitWidth() <= HostLocBitWidth(b));
ASSERT(LocInfo(b).GetMaxBitWidth() <= HostLocBitWidth(a));
if (LocInfo(a).IsEmpty()) {
Move(a, b);
Move(code, a, b);
} else if (LocInfo(b).IsEmpty()) {
Move(b, a);
Move(code, b, a);
} else {
EmitExchange(a, b);
EmitExchange(code, a, b);
std::swap(LocInfo(a), LocInfo(b));
}
}
void RegAlloc::MoveOutOfTheWay(HostLoc reg) noexcept {
void RegAlloc::MoveOutOfTheWay(BlockOfCode& code, HostLoc reg) noexcept {
ASSERT(!LocInfo(reg).IsLocked());
if (!LocInfo(reg).IsEmpty()) {
SpillRegister(reg);
SpillRegister(code, reg);
}
}
void RegAlloc::SpillRegister(HostLoc loc) noexcept {
void RegAlloc::SpillRegister(BlockOfCode& code, HostLoc loc) noexcept {
ASSERT(HostLocIsRegister(loc) && "Only registers can be spilled");
ASSERT(!LocInfo(loc).IsEmpty() && "There is no need to spill unoccupied registers");
ASSERT(!LocInfo(loc).IsLocked() && "Registers that have been allocated must not be spilt");
auto const new_loc = FindFreeSpill(HostLocIsXMM(loc));
Move(new_loc, loc);
Move(code, new_loc, loc);
}
HostLoc RegAlloc::FindFreeSpill(bool is_xmm) const noexcept {
@@ -568,9 +546,39 @@ HostLoc RegAlloc::FindFreeSpill(bool is_xmm) const noexcept {
if (const auto loc = HostLoc(i); LocInfo(loc).IsEmpty())
return loc;
UNREACHABLE();
};
}
void RegAlloc::EmitMove(const size_t bit_width, const HostLoc to, const HostLoc from) noexcept {
#define MAYBE_AVX(OPCODE, ...) \
[&] { \
if (code.HasHostFeature(HostFeature::AVX)) code.v##OPCODE(__VA_ARGS__); \
else code.OPCODE(__VA_ARGS__); \
}()
HostLoc RegAlloc::LoadImmediate(BlockOfCode& code, IR::Value imm, HostLoc host_loc) noexcept {
ASSERT(imm.IsImmediate() && "imm is not an immediate");
if (HostLocIsGPR(host_loc)) {
const Xbyak::Reg64 reg = HostLocToReg64(host_loc);
const u64 imm_value = imm.GetImmediateAsU64();
if (imm_value == 0) {
code.xor_(reg.cvt32(), reg.cvt32());
} else {
code.mov(reg, imm_value);
}
} else if (HostLocIsXMM(host_loc)) {
const Xbyak::Xmm reg = HostLocToXmm(host_loc);
const u64 imm_value = imm.GetImmediateAsU64();
if (imm_value == 0) {
MAYBE_AVX(xorps, reg, reg);
} else {
MAYBE_AVX(movaps, reg, code.Const(code.xword, imm_value));
}
} else {
UNREACHABLE();
}
return host_loc;
}
void RegAlloc::EmitMove(BlockOfCode& code, const size_t bit_width, const HostLoc to, const HostLoc from) noexcept {
auto const spill_to_op_arg_helper = [&](HostLoc loc, size_t reserved_stack_space) {
ASSERT(HostLocIsSpill(loc));
size_t i = size_t(loc) - size_t(HostLoc::FirstSpill);
@@ -585,9 +593,9 @@ void RegAlloc::EmitMove(const size_t bit_width, const HostLoc to, const HostLoc
} else if (HostLocIsGPR(to) && HostLocIsGPR(from)) {
ASSERT(bit_width != 128);
if (bit_width == 64) {
code->mov(HostLocToReg64(to), HostLocToReg64(from));
code.mov(HostLocToReg64(to), HostLocToReg64(from));
} else {
code->mov(HostLocToReg64(to).cvt32(), HostLocToReg64(from).cvt32());
code.mov(HostLocToReg64(to).cvt32(), HostLocToReg64(from).cvt32());
}
} else if (HostLocIsXMM(to) && HostLocIsGPR(from)) {
ASSERT(bit_width != 128);
@@ -642,25 +650,26 @@ void RegAlloc::EmitMove(const size_t bit_width, const HostLoc to, const HostLoc
} else if (HostLocIsGPR(to) && HostLocIsSpill(from)) {
ASSERT(bit_width != 128);
if (bit_width == 64) {
code->mov(HostLocToReg64(to), Xbyak::util::qword[spill_to_op_arg_helper(from, reserved_stack_space)]);
code.mov(HostLocToReg64(to), Xbyak::util::qword[spill_to_op_arg_helper(from, reserved_stack_space)]);
} else {
code->mov(HostLocToReg64(to).cvt32(), Xbyak::util::dword[spill_to_op_arg_helper(from, reserved_stack_space)]);
code.mov(HostLocToReg64(to).cvt32(), Xbyak::util::dword[spill_to_op_arg_helper(from, reserved_stack_space)]);
}
} else if (HostLocIsSpill(to) && HostLocIsGPR(from)) {
ASSERT(bit_width != 128);
if (bit_width == 64) {
code->mov(Xbyak::util::qword[spill_to_op_arg_helper(to, reserved_stack_space)], HostLocToReg64(from));
code.mov(Xbyak::util::qword[spill_to_op_arg_helper(to, reserved_stack_space)], HostLocToReg64(from));
} else {
code->mov(Xbyak::util::dword[spill_to_op_arg_helper(to, reserved_stack_space)], HostLocToReg64(from).cvt32());
code.mov(Xbyak::util::dword[spill_to_op_arg_helper(to, reserved_stack_space)], HostLocToReg64(from).cvt32());
}
} else {
UNREACHABLE();
}
}
#undef MAYBE_AVX
void RegAlloc::EmitExchange(const HostLoc a, const HostLoc b) noexcept {
void RegAlloc::EmitExchange(BlockOfCode& code, const HostLoc a, const HostLoc b) noexcept {
ASSERT(HostLocIsGPR(a) && HostLocIsGPR(b) && "Exchanging XMM registers is unneeded OR invalid emit");
code->xchg(HostLocToReg64(a), HostLocToReg64(b));
code.xchg(HostLocToReg64(a), HostLocToReg64(b));
}
} // namespace Dynarmic::Backend::X64
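MAYBE_AVX, now scoped tightly around its remaining users (LoadImmediate and, per the #undef placement, EmitMove), picks the VEX-encoded form when the host supports AVX. A call such as MAYBE_AVX(xorps, reg, reg) expands, in effect, to:

if (code.HasHostFeature(HostFeature::AVX)) {
    code.vxorps(reg, reg);  // VEX encoding, avoids SSE/AVX transition stalls
} else {
    code.xorps(reg, reg);
}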

View File

@@ -81,7 +81,7 @@ public:
return 1 << max_bit_width;
}
void AddValue(IR::Inst* inst) noexcept;
void EmitVerboseDebuggingOutput(BlockOfCode* code, size_t host_loc_index) const noexcept;
void EmitVerboseDebuggingOutput(BlockOfCode& code, size_t host_loc_index) const noexcept;
private:
//non trivial
boost::container::small_vector<IR::Inst*, 3> values; //24
@@ -129,16 +129,15 @@ public:
IR::AccType GetImmediateAccType() const noexcept;
/// Is this value currently in a GPR?
bool IsInGpr() const noexcept;
bool IsInXmm() const noexcept;
bool IsInMemory() const noexcept;
bool IsInGpr(RegAlloc& reg_alloc) const noexcept;
bool IsInXmm(RegAlloc& reg_alloc) const noexcept;
bool IsInMemory(RegAlloc& reg_alloc) const noexcept;
private:
friend class RegAlloc;
explicit Argument(RegAlloc& reg_alloc) : reg_alloc(reg_alloc) {}
explicit Argument() {}
//data
IR::Value value; //8
RegAlloc& reg_alloc; //8
bool allocated = false; //1
};
@@ -146,55 +145,57 @@ class RegAlloc final {
public:
using ArgumentInfo = std::array<Argument, IR::max_arg_count>;
RegAlloc() noexcept = default;
RegAlloc(BlockOfCode* code, boost::container::static_vector<HostLoc, 28> gpr_order, boost::container::static_vector<HostLoc, 28> xmm_order) noexcept;
RegAlloc(boost::container::static_vector<HostLoc, 28> gpr_order, boost::container::static_vector<HostLoc, 28> xmm_order) noexcept;
ArgumentInfo GetArgumentInfo(const IR::Inst* inst) noexcept;
void RegisterPseudoOperation(const IR::Inst* inst) noexcept;
inline bool IsValueLive(const IR::Inst* inst) const noexcept {
return !!ValueLocation(inst);
}
inline Xbyak::Reg64 UseGpr(Argument& arg) noexcept {
inline Xbyak::Reg64 UseGpr(BlockOfCode& code, Argument& arg) noexcept {
ASSERT(!arg.allocated);
arg.allocated = true;
return HostLocToReg64(UseImpl(arg.value, gpr_order));
return HostLocToReg64(UseImpl(code, arg.value, gpr_order));
}
inline Xbyak::Xmm UseXmm(Argument& arg) noexcept {
inline Xbyak::Xmm UseXmm(BlockOfCode& code, Argument& arg) noexcept {
ASSERT(!arg.allocated);
arg.allocated = true;
return HostLocToXmm(UseImpl(arg.value, xmm_order));
return HostLocToXmm(UseImpl(code, arg.value, xmm_order));
}
inline OpArg UseOpArg(Argument& arg) noexcept {
return UseGpr(arg);
inline OpArg UseOpArg(BlockOfCode& code, Argument& arg) noexcept {
return UseGpr(code, arg);
}
inline void Use(Argument& arg, const HostLoc host_loc) noexcept {
inline void Use(BlockOfCode& code, Argument& arg, const HostLoc host_loc) noexcept {
ASSERT(!arg.allocated);
arg.allocated = true;
UseImpl(arg.value, {host_loc});
UseImpl(code, arg.value, {host_loc});
}
Xbyak::Reg64 UseScratchGpr(Argument& arg) noexcept;
Xbyak::Xmm UseScratchXmm(Argument& arg) noexcept;
void UseScratch(Argument& arg, HostLoc host_loc) noexcept;
Xbyak::Reg64 UseScratchGpr(BlockOfCode& code, Argument& arg) noexcept;
Xbyak::Xmm UseScratchXmm(BlockOfCode& code, Argument& arg) noexcept;
void UseScratch(BlockOfCode& code, Argument& arg, HostLoc host_loc) noexcept;
void DefineValue(IR::Inst* inst, const Xbyak::Reg& reg) noexcept;
void DefineValue(IR::Inst* inst, Argument& arg) noexcept;
void DefineValue(BlockOfCode& code, IR::Inst* inst, const Xbyak::Reg& reg) noexcept;
void DefineValue(BlockOfCode& code, IR::Inst* inst, Argument& arg) noexcept;
void Release(const Xbyak::Reg& reg) noexcept;
inline Xbyak::Reg64 ScratchGpr() noexcept {
return HostLocToReg64(ScratchImpl(gpr_order));
inline Xbyak::Reg64 ScratchGpr(BlockOfCode& code) noexcept {
return HostLocToReg64(ScratchImpl(code, gpr_order));
}
inline Xbyak::Reg64 ScratchGpr(const HostLoc desired_location) noexcept {
return HostLocToReg64(ScratchImpl({desired_location}));
inline Xbyak::Reg64 ScratchGpr(BlockOfCode& code, const HostLoc desired_location) noexcept {
return HostLocToReg64(ScratchImpl(code, {desired_location}));
}
inline Xbyak::Xmm ScratchXmm() noexcept {
return HostLocToXmm(ScratchImpl(xmm_order));
inline Xbyak::Xmm ScratchXmm(BlockOfCode& code) noexcept {
return HostLocToXmm(ScratchImpl(code, xmm_order));
}
inline Xbyak::Xmm ScratchXmm(HostLoc desired_location) noexcept {
return HostLocToXmm(ScratchImpl({desired_location}));
inline Xbyak::Xmm ScratchXmm(BlockOfCode& code, HostLoc desired_location) noexcept {
return HostLocToXmm(ScratchImpl(code, {desired_location}));
}
void HostCall(IR::Inst* result_def = nullptr,
void HostCall(
BlockOfCode& code,
IR::Inst* result_def = nullptr,
const std::optional<Argument::copyable_reference> arg0 = {},
const std::optional<Argument::copyable_reference> arg1 = {},
const std::optional<Argument::copyable_reference> arg2 = {},
@@ -202,67 +203,56 @@ public:
) noexcept;
// TODO: Values in host flags
void AllocStackSpace(const size_t stack_space) noexcept;
void ReleaseStackSpace(const size_t stack_space) noexcept;
void AllocStackSpace(BlockOfCode& code, const size_t stack_space) noexcept;
void ReleaseStackSpace(BlockOfCode& code, const size_t stack_space) noexcept;
inline void EndOfAllocScope() noexcept {
for (auto& iter : hostloc_info) {
for (auto& iter : hostloc_info)
iter.ReleaseAll();
}
}
inline void AssertNoMoreUses() noexcept {
ASSERT(std::all_of(hostloc_info.begin(), hostloc_info.end(), [](const auto& i) noexcept { return i.IsEmpty(); }));
}
inline void EmitVerboseDebuggingOutput() noexcept {
for (size_t i = 0; i < hostloc_info.size(); i++) {
inline void EmitVerboseDebuggingOutput(BlockOfCode& code) noexcept {
for (size_t i = 0; i < hostloc_info.size(); i++)
hostloc_info[i].EmitVerboseDebuggingOutput(code, i);
}
}
private:
friend struct Argument;
HostLoc SelectARegister(const boost::container::static_vector<HostLoc, 28>& desired_locations) const noexcept;
inline std::optional<HostLoc> ValueLocation(const IR::Inst* value) const noexcept {
for (size_t i = 0; i < hostloc_info.size(); i++) {
if (hostloc_info[i].ContainsValue(value)) {
return HostLoc(i);
}
}
return std::nullopt;
}
std::optional<HostLoc> ValueLocation(const IR::Inst* value) const noexcept;
HostLoc UseImpl(BlockOfCode& code, IR::Value use_value, const boost::container::static_vector<HostLoc, 28>& desired_locations) noexcept;
HostLoc UseScratchImpl(BlockOfCode& code, IR::Value use_value, const boost::container::static_vector<HostLoc, 28>& desired_locations) noexcept;
HostLoc ScratchImpl(BlockOfCode& code, const boost::container::static_vector<HostLoc, 28>& desired_locations) noexcept;
void DefineValueImpl(BlockOfCode& code, IR::Inst* def_inst, HostLoc host_loc) noexcept;
void DefineValueImpl(BlockOfCode& code, IR::Inst* def_inst, const IR::Value& use_inst) noexcept;
HostLoc UseImpl(IR::Value use_value, const boost::container::static_vector<HostLoc, 28>& desired_locations) noexcept;
HostLoc UseScratchImpl(IR::Value use_value, const boost::container::static_vector<HostLoc, 28>& desired_locations) noexcept;
HostLoc ScratchImpl(const boost::container::static_vector<HostLoc, 28>& desired_locations) noexcept;
void DefineValueImpl(IR::Inst* def_inst, HostLoc host_loc) noexcept;
void DefineValueImpl(IR::Inst* def_inst, const IR::Value& use_inst) noexcept;
HostLoc LoadImmediate(BlockOfCode& code, IR::Value imm, HostLoc host_loc) noexcept;
void Move(BlockOfCode& code, HostLoc to, HostLoc from) noexcept;
void CopyToScratch(BlockOfCode& code, size_t bit_width, HostLoc to, HostLoc from) noexcept;
void Exchange(BlockOfCode& code, HostLoc a, HostLoc b) noexcept;
void MoveOutOfTheWay(BlockOfCode& code, HostLoc reg) noexcept;
HostLoc LoadImmediate(IR::Value imm, HostLoc host_loc) noexcept;
void Move(HostLoc to, HostLoc from) noexcept;
void CopyToScratch(size_t bit_width, HostLoc to, HostLoc from) noexcept;
void Exchange(HostLoc a, HostLoc b) noexcept;
void MoveOutOfTheWay(HostLoc reg) noexcept;
void SpillRegister(HostLoc loc) noexcept;
void SpillRegister(BlockOfCode& code, HostLoc loc) noexcept;
HostLoc FindFreeSpill(bool is_xmm) const noexcept;
inline HostLocInfo& LocInfo(const HostLoc loc) noexcept {
ASSERT(loc != HostLoc::RSP && loc != ABI_JIT_PTR);
return hostloc_info[static_cast<size_t>(loc)];
return hostloc_info[size_t(loc)];
}
inline const HostLocInfo& LocInfo(const HostLoc loc) const noexcept {
ASSERT(loc != HostLoc::RSP && loc != ABI_JIT_PTR);
return hostloc_info[static_cast<size_t>(loc)];
return hostloc_info[size_t(loc)];
}
void EmitMove(const size_t bit_width, const HostLoc to, const HostLoc from) noexcept;
void EmitExchange(const HostLoc a, const HostLoc b) noexcept;
void EmitMove(BlockOfCode& code, const size_t bit_width, const HostLoc to, const HostLoc from) noexcept;
void EmitExchange(BlockOfCode& code, const HostLoc a, const HostLoc b) noexcept;
//data
alignas(64) boost::container::static_vector<HostLoc, 28> gpr_order;
alignas(64) boost::container::static_vector<HostLoc, 28> xmm_order;
alignas(64) std::array<HostLocInfo, NonSpillHostLocCount + SpillCount> hostloc_info;
BlockOfCode* code = nullptr;
size_t reserved_stack_space = 0;
};
// Ensure a cache line (or less) is used; this is essential
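The trailing comment points at a layout check that this hunk cuts off; given the annotated field sizes above (//24, //8, //1), a hypothetical form of such an assertion would be:

// Hypothetical; the actual assertion is outside the visible hunk.
static_assert(sizeof(HostLocInfo) <= 64, "HostLocInfo should fit in a single cache line");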

View File

@@ -100,9 +100,14 @@ bool Value::IsEmpty() const noexcept {
}
bool Value::IsImmediate() const noexcept {
if (IsIdentity())
return inner.inst->GetArg(0).IsImmediate();
return type != Type::Opaque;
IR::Type current_type = type;
IR::Inst const* current_inst = inner.inst;
while (current_type == Type::Opaque && current_inst->GetOpcode() == Opcode::Identity) {
Value const& arg = current_inst->GetArg(0);
current_type = arg.type;
current_inst = arg.inner.inst;
}
return current_type != Type::Opaque;
}
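The old body re-entered IsImmediate() once per Identity node, so a long chain of identities meant a correspondingly deep call stack; the new body flattens the chain in one loop with constant stack usage. The same transformation on a self-contained toy model of the IR (toy names, not the real types):

// Toy model: a value is either concrete or an Identity wrapper over another value.
enum class ToyType { Opaque, U32 };

struct ToyValue {
    ToyType type;
    const ToyValue* identity_arg;  // non-null when this node is an Identity
};

bool IsImmediate(const ToyValue* v) {
    while (v->type == ToyType::Opaque && v->identity_arg != nullptr)
        v = v->identity_arg;       // walk the chain iteratively, no recursion
    return v->type != ToyType::Opaque;
}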
Type Value::GetType() const noexcept {

View File

@@ -277,9 +277,8 @@ std::unique_ptr<TranslationMap> InitializeTranslations(QObject* parent)
INSERT(Settings,
gpu_accuracy,
tr("GPU Accuracy:"),
tr("Controls the GPU emulation accuracy.\nMost games render fine with Normal, but High is still "
"required for some.\nParticles tend to only render correctly with High "
"accuracy.\nExtreme should only be used as a last resort."));
tr("Controls the GPU emulation accuracy.\nMost games render fine with Fast or Balanced modes, but Accurate is still "
"required for some.\nParticles tend to only render correctly with Accurate mode."));
INSERT(Settings,
dma_accuracy,
tr("DMA Accuracy:"),
@@ -507,9 +506,9 @@ std::unique_ptr<ComboboxTranslationMap> ComboboxEnumeration(QObject* parent)
}});
translations->insert({Settings::EnumMetadata<Settings::GpuAccuracy>::Index(),
{
PAIR(GpuAccuracy, Normal, tr("Normal")),
PAIR(GpuAccuracy, High, tr("High")),
PAIR(GpuAccuracy, Extreme, tr("Extreme")),
PAIR(GpuAccuracy, Low, tr("Fast")),
PAIR(GpuAccuracy, Medium, tr("Balanced")),
PAIR(GpuAccuracy, High, tr("Accurate")),
}});
translations->insert({Settings::EnumMetadata<Settings::DmaAccuracy>::Index(),
{

View File

@@ -61,9 +61,9 @@ static const std::map<Settings::ConsoleMode, QString> use_docked_mode_texts_map
};
static const std::map<Settings::GpuAccuracy, QString> gpu_accuracy_texts_map = {
{Settings::GpuAccuracy::Normal, QStringLiteral(QT_TRANSLATE_NOOP("MainWindow", "Normal"))},
{Settings::GpuAccuracy::High, QStringLiteral(QT_TRANSLATE_NOOP("MainWindow", "High"))},
{Settings::GpuAccuracy::Extreme, QStringLiteral(QT_TRANSLATE_NOOP("MainWindow", "Extreme"))},
{Settings::GpuAccuracy::Low, QStringLiteral(QT_TRANSLATE_NOOP("MainWindow", "Fast"))},
{Settings::GpuAccuracy::Medium, QStringLiteral(QT_TRANSLATE_NOOP("MainWindow", "Balanced"))},
{Settings::GpuAccuracy::High, QStringLiteral(QT_TRANSLATE_NOOP("MainWindow", "Accurate"))},
};
static const std::map<Settings::RendererBackend, QString> renderer_backend_texts_map = {

View File

@@ -103,7 +103,7 @@ bool DmaPusher::Step() {
ProcessCommands(headers);
};
const bool use_safe = Settings::IsDMALevelDefault() ? Settings::IsGPULevelHigh() : Settings::IsDMALevelSafe();
const bool use_safe = Settings::IsDMALevelDefault() ? (Settings::IsGPULevelMedium() || Settings::IsGPULevelHigh()) : Settings::IsDMALevelSafe();
if (use_safe) {
safe_process();
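Previously the safe DMA path was taken only at the High GPU level when DMA accuracy was left at Default; Balanced now opts in as well. Flattened for readability, the condition above reads:

const bool use_safe = Settings::IsDMALevelDefault()
    ? (Settings::IsGPULevelMedium() || Settings::IsGPULevelHigh())  // Balanced or Accurate
    : Settings::IsDMALevelSafe();                                   // explicit user override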

View File

@@ -72,66 +72,33 @@ public:
}
void SignalFence(std::function<void()>&& func) {
const bool delay_fence = Settings::IsGPULevelHigh();
#ifdef __ANDROID__
const bool use_optimized = Settings::values.early_release_fences.GetValue();
#else
constexpr bool use_optimized = false;
#endif
if constexpr (!can_async_check) {
TryReleasePendingFences<false>();
}
const bool should_flush = ShouldFlush();
CommitAsyncFlushes();
TFence new_fence = CreateFence(!should_flush);
if (use_optimized) {
if (!delay_fence) {
TryReleasePendingFences<false>();
}
if (delay_fence) {
guard.lock();
uncommitted_operations.emplace_back(std::move(func));
}
} else {
if constexpr (!can_async_check) {
TryReleasePendingFences<false>();
}
if constexpr (can_async_check) {
guard.lock();
}
if (delay_fence) {
uncommitted_operations.emplace_back(std::move(func));
}
if constexpr (can_async_check) {
guard.lock();
}
pending_operations.emplace_back(std::move(uncommitted_operations));
QueueFence(new_fence);
if (!delay_fence) {
if (Settings::IsGPULevelLow() || (Settings::IsGPULevelMedium() && !should_flush)) {
func();
} else {
uncommitted_operations.emplace_back(std::move(func));
}
if (!uncommitted_operations.empty()) {
pending_operations.emplace_back(std::move(uncommitted_operations));
uncommitted_operations.clear();
}
QueueFence(new_fence);
fences.push(std::move(new_fence));
if (should_flush) {
rasterizer.FlushCommands();
}
if (use_optimized) {
if (delay_fence) {
guard.unlock();
cv.notify_all();
}
} else {
if constexpr (can_async_check) {
guard.unlock();
cv.notify_all();
}
if constexpr (can_async_check) {
guard.unlock();
cv.notify_all();
}
rasterizer.InvalidateGPUCache();
}
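After dropping the Android-only early-release branch (and with it the early_release_fences setting), only one decision remains in SignalFence: run the callback now, or attach it to the fence. Condensed, the new flow is:

if (Settings::IsGPULevelLow() || (Settings::IsGPULevelMedium() && !should_flush)) {
    func();  // Performance always, Balanced when nothing needs flushing
} else {
    uncommitted_operations.emplace_back(std::move(func));  // ride on the fence
}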

View File

@@ -79,15 +79,8 @@ void ThreadManager::FlushRegion(DAddr addr, u64 size) {
if (!is_async) {
// Always flush with synchronous GPU mode
PushCommand(FlushRegionCommand(addr, size));
return;
}
if (!Settings::IsGPULevelExtreme()) {
return;
}
auto& gpu = system.GPU();
u64 fence = gpu.RequestFlush(addr, size);
TickGPU();
gpu.WaitForSyncOperation(fence);
return;
}
void ThreadManager::TickGPU() {

View File

@@ -629,9 +629,6 @@ void RasterizerOpenGL::ReleaseFences(bool force) {
void RasterizerOpenGL::FlushAndInvalidateRegion(DAddr addr, u64 size,
VideoCommon::CacheType which) {
if (Settings::IsGPULevelExtreme()) {
FlushRegion(addr, size, which);
}
InvalidateRegion(addr, size, which);
}

View File

@@ -80,7 +80,9 @@ void MasterSemaphore::Wait(u64 tick) {
if (!semaphore) {
// If we don't support timeline semaphores, wait for the value normally
std::unique_lock lk{free_mutex};
free_cv.wait(lk, [&] { return gpu_tick.load(std::memory_order_relaxed) >= tick; });
free_cv.wait(lk, [&] {
return gpu_tick.load(std::memory_order_acquire) >= tick;
});
return;
}
@@ -216,15 +218,32 @@ void MasterSemaphore::WaitThread(std::stop_token token) {
wait_queue.pop();
}
#ifdef ANDROID
VkResult status;
do {
status = fence.GetStatus();
if (status == VK_NOT_READY) {
std::this_thread::sleep_for(std::chrono::microseconds(100));
}
} while (status == VK_NOT_READY);
if (status == VK_SUCCESS) {
fence.Reset();
} else {
vk::Check(status);
continue;
}
#else
fence.Wait();
fence.Reset();
#endif
{
std::scoped_lock lock{free_mutex};
free_queue.push_front(std::move(fence));
gpu_tick.store(host_tick);
gpu_tick.store(host_tick, std::memory_order_release);
}
free_cv.notify_one();
free_cv.notify_all();
}
}
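Two fixes travel together here: the tick store/load pair is upgraded from relaxed to release/acquire so a waiter observes everything the wait thread did before publishing, and notify_one becomes notify_all so waiters blocked on different target ticks all recheck. A self-contained sketch of the pairing (simplified names, same idea as the hunk):

#include <atomic>
#include <condition_variable>
#include <cstdint>
#include <mutex>

std::atomic<uint64_t> gpu_tick{0};
std::mutex free_mutex;
std::condition_variable free_cv;

// Wait thread: publish the completed tick, then wake every waiter.
void Publish(uint64_t host_tick) {
    {
        std::scoped_lock lock{free_mutex};
        gpu_tick.store(host_tick, std::memory_order_release);
    }
    free_cv.notify_all();  // waiters may each be blocked on a different tick
}

// Caller: the acquire load pairs with the release store, ordering prior GPU work.
void WaitFor(uint64_t tick) {
    std::unique_lock lock{free_mutex};
    free_cv.wait(lock, [&] { return gpu_tick.load(std::memory_order_acquire) >= tick; });
}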

View File

@@ -1415,14 +1415,10 @@ bool QueryCacheRuntime::HostConditionalRenderingCompareValues(VideoCommon::Looku
return false;
}
const bool is_gpu_high = Settings::IsGPULevelHigh();
if (!is_gpu_high && impl->device.GetDriverID() == VK_DRIVER_ID_INTEL_PROPRIETARY_WINDOWS) {
return true;
}
auto driver_id = impl->device.GetDriverID();
if (driver_id == VK_DRIVER_ID_QUALCOMM_PROPRIETARY ||
driver_id == VK_DRIVER_ID_ARM_PROPRIETARY || driver_id == VK_DRIVER_ID_MESA_TURNIP) {
const bool is_gpu_high = Settings::IsGPULevelHigh();
if ((!is_gpu_high && driver_id == VK_DRIVER_ID_INTEL_PROPRIETARY_WINDOWS) || driver_id == VK_DRIVER_ID_QUALCOMM_PROPRIETARY || driver_id == VK_DRIVER_ID_ARM_PROPRIETARY || driver_id == VK_DRIVER_ID_MESA_TURNIP) {
return true;
}
@@ -1443,7 +1439,6 @@ bool QueryCacheRuntime::HostConditionalRenderingCompareValues(VideoCommon::Looku
}
if (!is_in_bc[0] && !is_in_bc[1]) {
// Both queries are in query cache, it's best to just flush.
return true;
}
HostConditionalRenderingCompareBCImpl(object_1.address, equal_check);

View File

@@ -730,9 +730,6 @@ void RasterizerVulkan::ReleaseFences(bool force) {
void RasterizerVulkan::FlushAndInvalidateRegion(DAddr addr, u64 size,
VideoCommon::CacheType which) {
if (Settings::IsGPULevelExtreme()) {
FlushRegion(addr, size, which);
}
InvalidateRegion(addr, size, which);
}

View File

@@ -1269,6 +1269,17 @@ void TextureCacheRuntime::ConvertImage(Framebuffer* dst, ImageView& dst_view, Im
case PixelFormat::R32G32_FLOAT:
case PixelFormat::R32G32_SINT:
case PixelFormat::R32_FLOAT:
if (src_view.format == PixelFormat::D32_FLOAT) {
const Region2D region{
.start = {0, 0},
.end = {static_cast<s32>(dst->RenderArea().width),
static_cast<s32>(dst->RenderArea().height)},
};
return blit_image_helper.BlitColor(dst, src_view, region, region,
Tegra::Engines::Fermi2D::Filter::Point,
Tegra::Engines::Fermi2D::Operation::SrcCopy);
}
break;
case PixelFormat::R16_FLOAT:
case PixelFormat::R16_UNORM:
case PixelFormat::R16_SNORM:

View File

@@ -663,11 +663,7 @@ Device::Device(VkInstance instance_, vk::PhysicalDevice physical_, VkSurfaceKHR
break;
}
if (!extensions.extended_dynamic_state) {
Settings::values.vertex_input_dynamic_state.SetValue(false);
}
if (!Settings::values.vertex_input_dynamic_state.GetValue()) {
if (!Settings::values.vertex_input_dynamic_state.GetValue() || !extensions.extended_dynamic_state) {
RemoveExtensionFeature(extensions.vertex_input_dynamic_state, features.vertex_input_dynamic_state, VK_EXT_VERTEX_INPUT_DYNAMIC_STATE_EXTENSION_NAME);
}

View File

@@ -1267,9 +1267,6 @@ void MainWindow::InitializeWidgets() {
QMenu context_menu;
for (auto const& gpu_accuracy_pair : ConfigurationShared::gpu_accuracy_texts_map) {
if (gpu_accuracy_pair.first == Settings::GpuAccuracy::Extreme) {
continue;
}
context_menu.addAction(gpu_accuracy_pair.second, [this, gpu_accuracy_pair] {
Settings::values.gpu_accuracy.SetValue(gpu_accuracy_pair.first);
UpdateGPUAccuracyButton();
@@ -3563,16 +3560,15 @@ void MainWindow::OnToggleDockedMode() {
void MainWindow::OnToggleGpuAccuracy() {
switch (Settings::values.gpu_accuracy.GetValue()) {
case Settings::GpuAccuracy::High: {
Settings::values.gpu_accuracy.SetValue(Settings::GpuAccuracy::Normal);
case Settings::GpuAccuracy::Low:
Settings::values.gpu_accuracy.SetValue(Settings::GpuAccuracy::Medium);
break;
}
case Settings::GpuAccuracy::Normal:
case Settings::GpuAccuracy::Extreme:
default: {
case Settings::GpuAccuracy::Medium:
Settings::values.gpu_accuracy.SetValue(Settings::GpuAccuracy::High);
break;
}
case Settings::GpuAccuracy::High:
Settings::values.gpu_accuracy.SetValue(Settings::GpuAccuracy::Low);
break;
}
QtCommon::system->ApplySettings();
@@ -4260,7 +4256,7 @@ void MainWindow::UpdateGPUAccuracyButton() {
const auto gpu_accuracy_text =
ConfigurationShared::gpu_accuracy_texts_map.find(gpu_accuracy)->second;
gpu_accuracy_button->setText(gpu_accuracy_text.toUpper());
gpu_accuracy_button->setChecked(gpu_accuracy != Settings::GpuAccuracy::Normal);
gpu_accuracy_button->setChecked(gpu_accuracy != Settings::GpuAccuracy::Low);
}
void MainWindow::UpdateDockedButton() {