 
 #define XE_GUC_EXEC_QUEUE_CGP_CONTEXT_ERROR_LEN 6
 
+static int guc_submit_reset_prepare(struct xe_guc *guc);
+
 static struct xe_guc *
 exec_queue_to_guc(struct xe_exec_queue *q)
 {
@@ -239,7 +241,7 @@ static bool exec_queue_killed_or_banned_or_wedged(struct xe_exec_queue *q)
 		 EXEC_QUEUE_STATE_BANNED));
 }
 
-static void guc_submit_fini(struct drm_device *drm, void *arg)
+static void guc_submit_sw_fini(struct drm_device *drm, void *arg)
 {
 	struct xe_guc *guc = arg;
 	struct xe_device *xe = guc_to_xe(guc);
@@ -257,6 +259,19 @@ static void guc_submit_fini(struct drm_device *drm, void *arg)
 	xa_destroy(&guc->submission_state.exec_queue_lookup);
 }
 
+static void guc_submit_fini(void *arg)
+{
+	struct xe_guc *guc = arg;
+
+	/* Forcefully kill any remaining exec queues */
+	xe_guc_ct_stop(&guc->ct);
+	guc_submit_reset_prepare(guc);
+	xe_guc_softreset(guc);
+	xe_guc_submit_stop(guc);
+	xe_uc_fw_sanitize(&guc->fw);
+	xe_guc_submit_pause_abort(guc);
+}
+
 static void guc_submit_wedged_fini(void *arg)
 {
 	struct xe_guc *guc = arg;
@@ -326,7 +341,11 @@ int xe_guc_submit_init(struct xe_guc *guc, unsigned int num_ids)
 
 	guc->submission_state.initialized = true;
 
-	return drmm_add_action_or_reset(&xe->drm, guc_submit_fini, guc);
+	err = drmm_add_action_or_reset(&xe->drm, guc_submit_sw_fini, guc);
+	if (err)
+		return err;
+
+	return devm_add_action_or_reset(xe->drm.dev, guc_submit_fini, guc);
 }
 
 /*
@@ -1252,6 +1271,7 @@ static void disable_scheduling_deregister(struct xe_guc *guc,
  */
 void xe_guc_submit_wedge(struct xe_guc *guc)
 {
+	struct xe_device *xe = guc_to_xe(guc);
 	struct xe_gt *gt = guc_to_gt(guc);
 	struct xe_exec_queue *q;
 	unsigned long index;
@@ -1266,20 +1286,28 @@ void xe_guc_submit_wedge(struct xe_guc *guc)
 	if (!guc->submission_state.initialized)
 		return;
 
-	err = devm_add_action_or_reset(guc_to_xe(guc)->drm.dev,
-				       guc_submit_wedged_fini, guc);
-	if (err) {
-		xe_gt_err(gt, "Failed to register clean-up in wedged.mode=%s; "
-			  "Although device is wedged.\n",
-			  xe_wedged_mode_to_string(XE_WEDGED_MODE_UPON_ANY_HANG_NO_RESET));
-		return;
-	}
+	if (xe->wedged.mode == 2) {
+		err = devm_add_action_or_reset(guc_to_xe(guc)->drm.dev,
+					       guc_submit_wedged_fini, guc);
+		if (err) {
+			xe_gt_err(gt, "Failed to register clean-up on wedged.mode=2; "
+				  "Although device is wedged.\n");
+			return;
+		}
 
-	mutex_lock(&guc->submission_state.lock);
-	xa_for_each(&guc->submission_state.exec_queue_lookup, index, q)
-		if (xe_exec_queue_get_unless_zero(q))
-			set_exec_queue_wedged(q);
-	mutex_unlock(&guc->submission_state.lock);
+		mutex_lock(&guc->submission_state.lock);
+		xa_for_each(&guc->submission_state.exec_queue_lookup, index, q)
+			if (xe_exec_queue_get_unless_zero(q))
+				set_exec_queue_wedged(q);
+		mutex_unlock(&guc->submission_state.lock);
+	} else {
+		/* Forcefully kill any remaining exec queues, signal fences */
+		guc_submit_reset_prepare(guc);
+		xe_guc_submit_stop(guc);
+		xe_guc_softreset(guc);
+		xe_uc_fw_sanitize(&guc->fw);
+		xe_guc_submit_pause_abort(guc);
+	}
 }
 
 static bool guc_submit_hint_wedged(struct xe_guc *guc)
@@ -2230,14 +2258,15 @@ static const struct xe_exec_queue_ops guc_exec_queue_ops = {
 static void guc_exec_queue_stop(struct xe_guc *guc, struct xe_exec_queue *q)
 {
 	struct xe_gpu_scheduler *sched = &q->guc->sched;
+	bool do_destroy = false;
 
 	/* Stop scheduling + flush any DRM scheduler operations */
 	xe_sched_submission_stop(sched);
 
 	/* Clean up lost G2H + reset engine state */
 	if (exec_queue_registered(q)) {
 		if (exec_queue_destroyed(q))
-			__guc_exec_queue_destroy(guc, q);
+			do_destroy = true;
 	}
 	if (q->guc->suspend_pending) {
 		set_exec_queue_suspended(q);
@@ -2273,18 +2302,15 @@ static void guc_exec_queue_stop(struct xe_guc *guc, struct xe_exec_queue *q)
 			xe_guc_exec_queue_trigger_cleanup(q);
 		}
 	}
+
+	if (do_destroy)
+		__guc_exec_queue_destroy(guc, q);
 }
 
-int xe_guc_submit_reset_prepare(struct xe_guc *guc)
+static int guc_submit_reset_prepare(struct xe_guc *guc)
 {
 	int ret;
 
-	if (xe_gt_WARN_ON(guc_to_gt(guc), vf_recovery(guc)))
-		return 0;
-
-	if (!guc->submission_state.initialized)
-		return 0;
-
 	/*
 	 * Using an atomic here rather than submission_state.lock as this
 	 * function can be called while holding the CT lock (engine reset
@@ -2299,6 +2325,17 @@ int xe_guc_submit_reset_prepare(struct xe_guc *guc)
 	return ret;
 }
 
+int xe_guc_submit_reset_prepare(struct xe_guc *guc)
+{
+	if (xe_gt_WARN_ON(guc_to_gt(guc), vf_recovery(guc)))
+		return 0;
+
+	if (!guc->submission_state.initialized)
+		return 0;
+
+	return guc_submit_reset_prepare(guc);
+}
+
 void xe_guc_submit_reset_wait(struct xe_guc *guc)
 {
 	wait_event(guc->ct.wq, xe_device_wedged(guc_to_xe(guc)) ||
@@ -2695,8 +2732,7 @@ void xe_guc_submit_pause_abort(struct xe_guc *guc)
 			continue;
 
 		xe_sched_submission_start(sched);
-		if (exec_queue_killed_or_banned_or_wedged(q))
-			xe_guc_exec_queue_trigger_cleanup(q);
+		guc_exec_queue_kill(q);
 	}
 	mutex_unlock(&guc->submission_state.lock);
 }
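
Side note on the cleanup split introduced above: actions registered with drmm_add_action_or_reset() run only when the last reference to the DRM device is dropped, whereas actions registered with devm_add_action_or_reset() run earlier, at driver unbind, while the hardware is still reachable. Below is a minimal sketch of that registration pattern, not the driver code itself; struct my_hw_state, sw_fini(), hw_fini() and register_cleanup() are illustrative names that do not appear in this patch.

#include <linux/device.h>
#include <drm/drm_device.h>
#include <drm/drm_managed.h>

struct my_hw_state {
	struct drm_device *drm;
	/* ... driver-private bookkeeping ... */
};

/* drmm action: frees software-only state; runs at final DRM device put */
static void sw_fini(struct drm_device *drm, void *arg)
{
	struct my_hw_state *s = arg;

	(void)s; /* free lookup tables, IDs, allocations, ... */
}

/* devm action: quiesces hardware; runs at device unbind */
static void hw_fini(void *arg)
{
	struct my_hw_state *s = arg;

	(void)s; /* stop command transport, reset firmware, ... */
}

static int register_cleanup(struct my_hw_state *s)
{
	int err;

	/* Software teardown tied to the DRM device lifetime */
	err = drmm_add_action_or_reset(s->drm, sw_fini, s);
	if (err)
		return err;

	/* Hardware teardown tied to the bound struct device */
	return devm_add_action_or_reset(s->drm->dev, hw_fini, s);
}

Splitting the teardown this way lets the hardware be quiesced at unbind even if userspace still holds the DRM device open, while the software bookkeeping remains valid for as long as the DRM device itself exists.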