4040#include "xe_gt_printk.h"
4141#include "xe_gt_sriov_vf.h"
4242#include "xe_guc.h"
43+ #include "xe_guc_pc.h"
4344#include "xe_hw_engine_group.h"
4445#include "xe_hwmon.h"
4546#include "xe_irq.h"
@@ -986,38 +987,15 @@ void xe_device_wmb(struct xe_device *xe)
986987 xe_mmio_write32 (xe_root_tile_mmio (xe ), VF_CAP_REG , 0 );
987988}
988989
989- /**
990- * xe_device_td_flush() - Flush transient L3 cache entries
991- * @xe: The device
992- *
993- * Display engine has direct access to memory and is never coherent with L3/L4
994- * caches (or CPU caches), however KMD is responsible for specifically flushing
995- * transient L3 GPU cache entries prior to the flip sequence to ensure scanout
996- * can happen from such a surface without seeing corruption.
997- *
998- * Display surfaces can be tagged as transient by mapping it using one of the
999- * various L3:XD PAT index modes on Xe2.
1000- *
1001- * Note: On non-discrete xe2 platforms, like LNL, the entire L3 cache is flushed
1002- * at the end of each submission via PIPE_CONTROL for compute/render, since SA
1003- * Media is not coherent with L3 and we want to support render-vs-media
1004- * usescases. For other engines like copy/blt the HW internally forces uncached
1005- * behaviour, hence why we can skip the TDF on such platforms.
990+ /*
991+ * Issue a TRANSIENT_FLUSH_REQUEST and wait for completion on each gt.
1006992 */
1007- void xe_device_td_flush (struct xe_device * xe )
993+ static void tdf_request_sync (struct xe_device * xe )
1008994{
1009- struct xe_gt * gt ;
1010995 unsigned int fw_ref ;
996+ struct xe_gt * gt ;
1011997 u8 id ;
1012998
1013- if (!IS_DGFX (xe ) || GRAPHICS_VER (xe ) < 20 )
1014- return ;
1015-
1016- if (XE_WA (xe_root_mmio_gt (xe ), 16023588340 )) {
1017- xe_device_l2_flush (xe );
1018- return ;
1019- }
1020-
1021999 for_each_gt (gt , xe , id ) {
10221000 if (xe_gt_is_media_type (gt ))
10231001 continue ;
@@ -1027,6 +1005,7 @@ void xe_device_td_flush(struct xe_device *xe)
10271005 return ;
10281006
10291007 xe_mmio_write32 (& gt -> mmio , XE2_TDF_CTRL , TRANSIENT_FLUSH_REQUEST );
1008+
10301009 /*
10311010 * FIXME: We can likely do better here with our choice of
10321011 * timeout. Currently we just assume the worst case, i.e. 150us,
@@ -1057,15 +1036,52 @@ void xe_device_l2_flush(struct xe_device *xe)
10571036 return ;
10581037
10591038 spin_lock (& gt -> global_invl_lock );
1060- xe_mmio_write32 (& gt -> mmio , XE2_GLOBAL_INVAL , 0x1 );
10611039
1040+ xe_mmio_write32 (& gt -> mmio , XE2_GLOBAL_INVAL , 0x1 );
10621041 if (xe_mmio_wait32 (& gt -> mmio , XE2_GLOBAL_INVAL , 0x1 , 0x0 , 500 , NULL , true))
10631042 xe_gt_err_once (gt , "Global invalidation timeout\n" );
1043+
10641044 spin_unlock (& gt -> global_invl_lock );
10651045
10661046 xe_force_wake_put (gt_to_fw (gt ), fw_ref );
10671047}
10681048
1049+ /**
1050+ * xe_device_td_flush() - Flush transient L3 cache entries
1051+ * @xe: The device
1052+ *
1053+ * Display engine has direct access to memory and is never coherent with L3/L4
1054+ * caches (or CPU caches), however KMD is responsible for specifically flushing
1055+ * transient L3 GPU cache entries prior to the flip sequence to ensure scanout
1056+ * can happen from such a surface without seeing corruption.
1057+ *
1058+ * Display surfaces can be tagged as transient by mapping them using one of the
1059+ * various L3:XD PAT index modes on Xe2.
1060+ *
1061+ * Note: On non-discrete xe2 platforms, like LNL, the entire L3 cache is flushed
1062+ * at the end of each submission via PIPE_CONTROL for compute/render, since SA
1063+ * Media is not coherent with L3 and we want to support render-vs-media
1064+ * use cases. For other engines like copy/blt the HW internally forces uncached
1065+ * behaviour, hence why we can skip the TDF on such platforms.
1066+ */
1067+ void xe_device_td_flush (struct xe_device * xe )
1068+ {
1069+ struct xe_gt * root_gt ;
1070+
1071+ if (!IS_DGFX (xe ) || GRAPHICS_VER (xe ) < 20 )
1072+ return ;
1073+
1074+ root_gt = xe_root_mmio_gt (xe );
1075+ if (XE_WA (root_gt , 16023588340 )) {
1076+ /* With WA 16023588340 a transient flush is not sufficient: flush the L2 */
1077+ xe_device_l2_flush (xe );
1078+ } else {
1079+ xe_guc_pc_apply_flush_freq_limit (& root_gt -> uc .guc .pc );
1080+ tdf_request_sync (xe );
1081+ xe_guc_pc_remove_flush_freq_limit (& root_gt -> uc .guc .pc );
1082+ }
1083+ }
1084+
10691085u32 xe_device_ccs_bytes (struct xe_device * xe , u64 size )
10701086{
10711087 return xe_device_has_flat_ccs (xe ) ?
0 commit comments