accel/ivpu: Return correct job error status

Currently the driver returns ABORTED for all errors that trigger engine
reset. It is better to distinguish between different error types by
returning the actual error code reported by firmware.
This allows userspace to take different actions based on the error type
and improves debuggability.

Refactor ivpu_job_signal_and_destroy() by extracting engine error
handling logic into a new function ivpu_job_handle_engine_error().
This simplifies the engine error handling logic by removing the need to
call ivpu_job_signal_and_destroy() multiple times for a single job and
changing its behavior based on job status.

Signed-off-by: Andrzej Kacprowski <andrzej.kacprowski@linux.intel.com>
Reviewed-by: Jeff Hugo <jeff.hugo@oss.qualcomm.com>
Signed-off-by: Karol Wachowski <karol.wachowski@linux.intel.com>
Link: https://lore.kernel.org/r/20251008061255.2909794-1-karol.wachowski@linux.intel.com
This commit is contained in:
Andrzej Kacprowski
2025-10-08 08:12:55 +02:00
committed by Karol Wachowski
parent 4139eb2490
commit 40527034d1
2 changed files with 52 additions and 30 deletions

View File

@@ -564,6 +564,44 @@ static struct ivpu_job *ivpu_job_remove_from_submitted_jobs(struct ivpu_device *
return job;
}
/*
 * ivpu_job_handle_engine_error() - Handle a firmware job status that requires
 * an engine reset.
 * @vdev: VPU device
 * @job_id: ID of the job the firmware reported status for
 * @job_status: status code reported by firmware for this job
 *
 * Must be called with vdev->submitted_jobs_lock held (asserted below).
 *
 * Return: true if the job was handed off to the context abort work handler
 * (the caller must NOT complete/destroy it); false if the status does not
 * require an engine reset, the job is no longer tracked, or the context is
 * already marked faulty — in all of those cases the caller proceeds with
 * normal job completion.
 */
bool ivpu_job_handle_engine_error(struct ivpu_device *vdev, u32 job_id, u32 job_status)
{
	lockdep_assert_held(&vdev->submitted_jobs_lock);
	switch (job_status) {
	case VPU_JSM_STATUS_PROCESSING_ERR:
	case VPU_JSM_STATUS_ENGINE_RESET_REQUIRED_MIN ... VPU_JSM_STATUS_ENGINE_RESET_REQUIRED_MAX:
	{
		struct ivpu_job *job = xa_load(&vdev->submitted_jobs_xa, job_id);
		/* Job already removed from the tracking xarray - nothing to defer */
		if (!job)
			return false;
		/* Trigger an engine reset */
		guard(mutex)(&job->file_priv->lock);
		/* Record the raw firmware status so it can be reported to userspace */
		job->job_status = job_status;
		/*
		 * Context is already flagged faulty, so the abort work is (or will
		 * be) handling it; let the caller complete this job normally.
		 */
		if (job->file_priv->has_mmu_faults)
			return false;
		/*
		 * Mark context as faulty and defer destruction of the job to jobs abort thread
		 * handler to synchronize between both faults and jobs returning context violation
		 * status and ensure both are handled in the same way
		 */
		job->file_priv->has_mmu_faults = true;
		queue_work(system_wq, &vdev->context_abort_work);
		return true;
	}
	default:
		/* Complete job with error status, engine reset not required */
		break;
	}
	return false;
}
static int ivpu_job_signal_and_destroy(struct ivpu_device *vdev, u32 job_id, u32 job_status)
{
struct ivpu_job *job;
@@ -574,43 +612,22 @@ static int ivpu_job_signal_and_destroy(struct ivpu_device *vdev, u32 job_id, u32
if (!job)
return -ENOENT;
switch (job_status) {
case VPU_JSM_STATUS_PROCESSING_ERR:
case VPU_JSM_STATUS_ENGINE_RESET_REQUIRED_MIN ... VPU_JSM_STATUS_ENGINE_RESET_REQUIRED_MAX:
{
/* Trigger an engine reset */
guard(mutex)(&job->file_priv->lock);
ivpu_job_remove_from_submitted_jobs(vdev, job_id);
if (job->job_status == VPU_JSM_STATUS_SUCCESS) {
if (job->file_priv->has_mmu_faults)
return 0;
/*
* Mark context as faulty and defer destruction of the job to jobs abort thread
* handler to synchronize between both faults and jobs returning context violation
* status and ensure both are handled in the same way
*/
job->file_priv->has_mmu_faults = true;
queue_work(system_wq, &vdev->context_abort_work);
return 0;
}
default:
/* Complete job with error status, engine reset not required */
break;
job->job_status = DRM_IVPU_JOB_STATUS_ABORTED;
else
job->job_status = job_status;
}
job = ivpu_job_remove_from_submitted_jobs(vdev, job_id);
if (!job)
return -ENOENT;
if (job->file_priv->has_mmu_faults)
job_status = DRM_IVPU_JOB_STATUS_ABORTED;
job->bos[CMD_BUF_IDX]->job_status = job_status;
job->bos[CMD_BUF_IDX]->job_status = job->job_status;
dma_fence_signal(job->done_fence);
trace_job("done", job);
ivpu_dbg(vdev, JOB, "Job complete: id %3u ctx %2d cmdq_id %u engine %d status 0x%x\n",
job->job_id, job->file_priv->ctx.id, job->cmdq_id, job->engine_idx, job_status);
job->job_id, job->file_priv->ctx.id, job->cmdq_id, job->engine_idx,
job->job_status);
ivpu_job_destroy(job);
ivpu_stop_job_timeout_detection(vdev);
@@ -1030,7 +1047,9 @@ ivpu_job_done_callback(struct ivpu_device *vdev, struct ivpu_ipc_hdr *ipc_hdr,
payload = (struct vpu_ipc_msg_payload_job_done *)&jsm_msg->payload;
mutex_lock(&vdev->submitted_jobs_lock);
ivpu_job_signal_and_destroy(vdev, payload->job_id, payload->job_status);
if (!ivpu_job_handle_engine_error(vdev, payload->job_id, payload->job_status))
/* No engine error, complete the job normally */
ivpu_job_signal_and_destroy(vdev, payload->job_id, payload->job_status);
mutex_unlock(&vdev->submitted_jobs_lock);
}

View File

@@ -51,6 +51,7 @@ struct ivpu_cmdq {
* @cmdq_id: Command queue ID used for submission
* @job_id: Unique job ID for tracking and status reporting
* @engine_idx: Engine index for job execution
* @job_status: Status reported by firmware for this job
* @primary_preempt_buf: Primary preemption buffer for job
* @secondary_preempt_buf: Secondary preemption buffer for job (optional)
* @bo_count: Number of buffer objects associated with this job
@@ -64,6 +65,7 @@ struct ivpu_job {
u32 cmdq_id;
u32 job_id;
u32 engine_idx;
u32 job_status;
struct ivpu_bo *primary_preempt_buf;
struct ivpu_bo *secondary_preempt_buf;
size_t bo_count;
@@ -83,6 +85,7 @@ void ivpu_cmdq_abort_all_jobs(struct ivpu_device *vdev, u32 ctx_id, u32 cmdq_id)
void ivpu_job_done_consumer_init(struct ivpu_device *vdev);
void ivpu_job_done_consumer_fini(struct ivpu_device *vdev);
bool ivpu_job_handle_engine_error(struct ivpu_device *vdev, u32 job_id, u32 job_status);
void ivpu_context_abort_work_fn(struct work_struct *work);
void ivpu_jobs_abort_all(struct ivpu_device *vdev);