/*
 * Patch summary: add a "wedged" state to the Xe device.
 *
 * xe_device_types.h
 *   Adds an atomic_t field, wedged, to struct xe_device.
 *
 * xe_device.h
 *   xe_device_wedged(xe) returns the atomic_read() of xe->wedged as a bool.
 *   xe_device_declare_wedged(xe) does atomic_xchg(&xe->wedged, 1); only when
 *   the previous value was 0 does it set xe->needs_flr_on_fini and emit the
 *   drm_err() message, so the message is printed once per 0 -> 1 transition.
 *   Nothing in the hunks below ever writes 0 back to xe->wedged.
 *
 * xe_device.c
 *   xe_drm_ioctl() and xe_drm_compat_ioctl() return -ECANCELED when the
 *   device is wedged; the check sits ahead of xe_pm_runtime_get_ioctl(), so
 *   that call is not made on this path.
 *
 * xe_gt.c
 *   gt_reset() returns -ECANCELED up front when the device is wedged. Its
 *   err_fail path now calls xe_device_declare_wedged() where it used to set
 *   needs_flr_on_fini directly (the helper still sets that flag).
 *
 * xe_guc_pc.c
 *   xe_guc_pc_fini() returns early when the device is wedged, skipping the
 *   xe_force_wake_get(), xe_guc_pc_gucrc_disable() and xe_guc_pc_stop()
 *   calls that follow.
 *
 * NOTE(review): this patch text had been flattened onto two lines. Line
 * breaks, tab indentation and blank context lines were rebuilt from the hunk
 * headers and kernel coding style; confirm against the blob ids on the
 * index lines before applying.
 */
diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
index 55bbc8b8df159ae3667df8c82c75eaca4bad031d..76a7b37a4a532cc904a910b6a6b94d459980bb54 100644
--- a/drivers/gpu/drm/xe/xe_device.c
+++ b/drivers/gpu/drm/xe/xe_device.c
@@ -137,6 +137,9 @@ static long xe_drm_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	struct xe_device *xe = to_xe_device(file_priv->minor->dev);
 	long ret;
 
+	if (xe_device_wedged(xe))
+		return -ECANCELED;
+
 	ret = xe_pm_runtime_get_ioctl(xe);
 	if (ret >= 0)
 		ret = drm_ioctl(file, cmd, arg);
@@ -152,6 +155,9 @@ static long xe_drm_compat_ioctl(struct file *file, unsigned int cmd, unsigned lo
 	struct xe_device *xe = to_xe_device(file_priv->minor->dev);
 	long ret;
 
+	if (xe_device_wedged(xe))
+		return -ECANCELED;
+
 	ret = xe_pm_runtime_get_ioctl(xe);
 	if (ret >= 0)
 		ret = drm_compat_ioctl(file, cmd, arg);
diff --git a/drivers/gpu/drm/xe/xe_device.h b/drivers/gpu/drm/xe/xe_device.h
index 36d4434ebcccb5c800267e6c92f34d0f97f1ab1c..d2e4249d37ce7a9bdb40dff98d9d0f6db980d945 100644
--- a/drivers/gpu/drm/xe/xe_device.h
+++ b/drivers/gpu/drm/xe/xe_device.h
@@ -167,4 +167,24 @@ void xe_device_snapshot_print(struct xe_device *xe, struct drm_printer *p);
 u64 xe_device_canonicalize_addr(struct xe_device *xe, u64 address);
 u64 xe_device_uncanonicalize_addr(struct xe_device *xe, u64 address);
 
+static inline bool xe_device_wedged(struct xe_device *xe)
+{
+	return atomic_read(&xe->wedged);
+}
+
+static inline void xe_device_declare_wedged(struct xe_device *xe)
+{
+	if (!atomic_xchg(&xe->wedged, 1)) {
+		xe->needs_flr_on_fini = true;
+		drm_err(&xe->drm,
+			"CRITICAL: Xe has declared device %s as wedged.\n"
+			"IOCTLs and executions are blocked until device is probed again with unbind and bind operations:\n"
+			"echo '%s' > /sys/bus/pci/drivers/xe/unbind\n"
+			"echo '%s' > /sys/bus/pci/drivers/xe/bind\n"
+			"Please file a _new_ bug report at https://gitlab.freedesktop.org/drm/xe/kernel/issues/new\n",
+			dev_name(xe->drm.dev), dev_name(xe->drm.dev),
+			dev_name(xe->drm.dev));
+	}
+}
+
 #endif
diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h
index 8a9f12a8d7c171c106f02d878cff9881b79a0be1..91c720d6ad29aabc70d23b3af2a7dbc62a3e2da9 100644
--- a/drivers/gpu/drm/xe/xe_device_types.h
+++ b/drivers/gpu/drm/xe/xe_device_types.h
@@ -459,6 +459,9 @@ struct xe_device {
 	/** @needs_flr_on_fini: requests function-reset on fini */
 	bool needs_flr_on_fini;
 
+	/** @wedged: Xe device faced a critical error and is now blocked. */
+	atomic_t wedged;
+
 	/* private: */
 
 #if IS_ENABLED(CONFIG_DRM_XE_DISPLAY)
diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c
index 491d0413de15ffb56213993ecc81fcac5f713bfd..e922e77f5010ea7e89b49bd16364d270724ddae6 100644
--- a/drivers/gpu/drm/xe/xe_gt.c
+++ b/drivers/gpu/drm/xe/xe_gt.c
@@ -633,6 +633,9 @@ static int gt_reset(struct xe_gt *gt)
 {
 	int err;
 
+	if (xe_device_wedged(gt_to_xe(gt)))
+		return -ECANCELED;
+
 	/* We only support GT resets with GuC submission */
 	if (!xe_device_uc_enabled(gt_to_xe(gt)))
 		return -ENODEV;
@@ -685,7 +688,7 @@ static int gt_reset(struct xe_gt *gt)
 err_fail:
 	xe_gt_err(gt, "reset failed (%pe)\n", ERR_PTR(err));
 
-	gt_to_xe(gt)->needs_flr_on_fini = true;
+	xe_device_declare_wedged(gt_to_xe(gt));
 
 	return err;
 }
diff --git a/drivers/gpu/drm/xe/xe_guc_pc.c b/drivers/gpu/drm/xe/xe_guc_pc.c
index 509649d0e65e1c5aef5cc1518ca23335075fa4ab..8fc757900ed18f8e657df6205b4d3cc0c4413621 100644
--- a/drivers/gpu/drm/xe/xe_guc_pc.c
+++ b/drivers/gpu/drm/xe/xe_guc_pc.c
@@ -902,6 +902,9 @@ static void xe_guc_pc_fini(struct drm_device *drm, void *arg)
 		return;
 	}
 
+	if (xe_device_wedged(xe))
+		return;
+
 	XE_WARN_ON(xe_force_wake_get(gt_to_fw(pc_to_gt(pc)), XE_FORCEWAKE_ALL));
 	XE_WARN_ON(xe_guc_pc_gucrc_disable(pc));
 	XE_WARN_ON(xe_guc_pc_stop(pc));