drm/amdgpu: suspend ras module before gpu reset
During gpu reset, all GPU-related resources are inaccessible. To avoid
affecting ras functionality, suspend the ras module before gpu reset and
resume it after gpu reset is complete.

V2: Rename functions to avoid misunderstanding.
V3: Move flush_delayed_work to amdgpu_ras_process_pause and
    schedule_delayed_work to amdgpu_ras_process_unpause.
V4: Rename functions.
V5: Move the function to amdgpu_ras.c.

Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Acked-by: Lijo Lazar <lijo.lazar@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
commit d95ca7f515
parent d4432f16d3
committed by Alex Deucher
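The mechanism is a pause/completion handshake: each RAS event-processing pass brackets itself with amdgpu_ras_process_begin()/amdgpu_ras_process_end(), while amdgpu_ras_process_pre_reset() sets is_paused (so new passes bail out with -EAGAIN) and then waits, with a timeout, for any pass already in flight to signal the completion. Below is a minimal userspace model of that handshake, using a pthread mutex and condition variable in place of the kernel's bool plus struct completion; all names here (struct quiesce, handler_begin(), and so on) are illustrative, not the driver's API.

	#include <pthread.h>
	#include <stdbool.h>

	/*
	 * Userspace model of the patch's pause/completion handshake.
	 * The kernel code pairs a bool (ras_mgr->is_paused) with a
	 * struct completion (ras_mgr->ras_event_done); a mutex plus
	 * condition variable play those roles here.
	 */
	struct quiesce {
		pthread_mutex_t lock;
		pthread_cond_t  done;      /* stands in for ras_event_done */
		bool            is_paused; /* stands in for ras_mgr->is_paused */
		bool            in_flight; /* an event pass is currently running */
	};

	static struct quiesce rasq = {
		.lock = PTHREAD_MUTEX_INITIALIZER,
		.done = PTHREAD_COND_INITIALIZER,
	};

	/* Event side: refuse to start a pass while paused
	 * (amdgpu_ras_process_begin() returns -EAGAIN). */
	static int handler_begin(struct quiesce *q)
	{
		int ret = 0;

		pthread_mutex_lock(&q->lock);
		if (q->is_paused)
			ret = -1;
		else
			q->in_flight = true;
		pthread_mutex_unlock(&q->lock);
		return ret;
	}

	/* Event side: signal waiters, like complete(&ras_event_done). */
	static void handler_end(struct quiesce *q)
	{
		pthread_mutex_lock(&q->lock);
		q->in_flight = false;
		pthread_cond_broadcast(&q->done);
		pthread_mutex_unlock(&q->lock);
	}

	/* Reset side: block new passes, then wait out the one in flight. */
	static void quiesce_pre_reset(struct quiesce *q)
	{
		pthread_mutex_lock(&q->lock);
		q->is_paused = true;
		while (q->in_flight)
			pthread_cond_wait(&q->done, &q->lock);
		pthread_mutex_unlock(&q->lock);
	}

	/* Reset side: let event processing run again. */
	static void quiesce_post_reset(struct quiesce *q)
	{
		pthread_mutex_lock(&q->lock);
		q->is_paused = false;
		pthread_mutex_unlock(&q->lock);
	}

Two deliberate differences in the kernel version: it waits with a timeout (msecs_to_jiffies(RAS_EVENT_PROCESS_TIMEOUT), i.e. 1200 ms) rather than indefinitely, and an interrupted or timed-out wait only logs a warning, presumably because a stuck RAS pass must not be allowed to block GPU recovery.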
@@ -71,6 +71,7 @@
 
 #include "amdgpu_xgmi.h"
 #include "amdgpu_ras.h"
+#include "amdgpu_ras_mgr.h"
 #include "amdgpu_pmu.h"
 #include "amdgpu_fru_eeprom.h"
 #include "amdgpu_reset.h"
@@ -6660,6 +6661,9 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 		goto end_reset;
 	}
 
+	/* Cannot be called after locking reset domain */
+	amdgpu_ras_pre_reset(adev, &device_list);
+
 	/* We need to lock reset domain only once both for XGMI and single device */
 	amdgpu_device_recovery_get_reset_lock(adev, &device_list);
 
@@ -6691,6 +6695,7 @@ skip_sched_resume:
 reset_unlock:
 	amdgpu_device_recovery_put_reset_lock(adev, &device_list);
 end_reset:
+	amdgpu_ras_post_reset(adev, &device_list);
 	if (hive) {
 		mutex_unlock(&hive->hive_lock);
 		amdgpu_put_xgmi_hive(hive);
@@ -2921,8 +2921,12 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
 		type = amdgpu_ras_get_fatal_error_event(adev);
 		list_for_each_entry(remote_adev,
 				device_list_handle, gmc.xgmi.head) {
-			amdgpu_ras_query_err_status(remote_adev);
-			amdgpu_ras_log_on_err_counter(remote_adev, type);
+			if (amdgpu_uniras_enabled(remote_adev)) {
+				amdgpu_ras_mgr_update_ras_ecc(remote_adev);
+			} else {
+				amdgpu_ras_query_err_status(remote_adev);
+				amdgpu_ras_log_on_err_counter(remote_adev, type);
+			}
 		}
 
 	}
@@ -5673,3 +5677,25 @@ bool amdgpu_ras_check_critical_address(struct amdgpu_device *adev, uint64_t addr
 
 	return ret;
 }
+
+void amdgpu_ras_pre_reset(struct amdgpu_device *adev,
+			  struct list_head *device_list)
+{
+	struct amdgpu_device *tmp_adev = NULL;
+
+	list_for_each_entry(tmp_adev, device_list, reset_list) {
+		if (amdgpu_uniras_enabled(tmp_adev))
+			amdgpu_ras_mgr_pre_reset(tmp_adev);
+	}
+}
+
+void amdgpu_ras_post_reset(struct amdgpu_device *adev,
+			   struct list_head *device_list)
+{
+	struct amdgpu_device *tmp_adev = NULL;
+
+	list_for_each_entry(tmp_adev, device_list, reset_list) {
+		if (amdgpu_uniras_enabled(tmp_adev))
+			amdgpu_ras_mgr_post_reset(tmp_adev);
+	}
+}
@@ -1039,4 +1039,9 @@ void amdgpu_ras_event_log_print(struct amdgpu_device *adev, u64 event_id,
 			const char *fmt, ...);
 
 bool amdgpu_ras_is_rma(struct amdgpu_device *adev);
+
+void amdgpu_ras_pre_reset(struct amdgpu_device *adev,
+			  struct list_head *device_list);
+void amdgpu_ras_post_reset(struct amdgpu_device *adev,
+			   struct list_head *device_list);
 #endif
@@ -624,3 +624,25 @@ int amdgpu_ras_mgr_handle_ras_cmd(struct amdgpu_device *adev,
 
 	return ret;
 }
+
+int amdgpu_ras_mgr_pre_reset(struct amdgpu_device *adev)
+{
+	if (!amdgpu_ras_mgr_is_ready(adev)) {
+		RAS_DEV_ERR(adev, "Invalid ras suspend!\n");
+		return -EPERM;
+	}
+
+	amdgpu_ras_process_pre_reset(adev);
+	return 0;
+}
+
+int amdgpu_ras_mgr_post_reset(struct amdgpu_device *adev)
+{
+	if (!amdgpu_ras_mgr_is_ready(adev)) {
+		RAS_DEV_ERR(adev, "Invalid ras resume!\n");
+		return -EPERM;
+	}
+
+	amdgpu_ras_process_post_reset(adev);
+	return 0;
+}
@@ -52,6 +52,9 @@ struct amdgpu_ras_mgr {
 	struct ras_event_manager ras_event_mgr;
 	uint64_t last_poison_consumption_seqno;
 	bool ras_is_ready;
+
+	bool is_paused;
+	struct completion ras_event_done;
 };
 
 extern const struct amdgpu_ip_block_version ras_v1_0_ip_block;
@@ -75,4 +78,6 @@ bool amdgpu_ras_mgr_is_rma(struct amdgpu_device *adev);
 int amdgpu_ras_mgr_handle_ras_cmd(struct amdgpu_device *adev,
 		uint32_t cmd_id, void *input, uint32_t input_size,
 		void *output, uint32_t out_size);
+int amdgpu_ras_mgr_pre_reset(struct amdgpu_device *adev);
+int amdgpu_ras_mgr_post_reset(struct amdgpu_device *adev);
 #endif
@@ -29,6 +29,7 @@
 #include "amdgpu_ras_process.h"
 
 #define RAS_MGR_RETIRE_PAGE_INTERVAL 100
+#define RAS_EVENT_PROCESS_TIMEOUT 1200
 
 static void ras_process_retire_page_dwork(struct work_struct *work)
 {
@@ -57,6 +58,9 @@ int amdgpu_ras_process_init(struct amdgpu_device *adev)
 {
 	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
 
+	ras_mgr->is_paused = false;
+	init_completion(&ras_mgr->ras_event_done);
+
 	INIT_DELAYED_WORK(&ras_mgr->retire_page_dwork, ras_process_retire_page_dwork);
 
 	return 0;
@@ -66,6 +70,7 @@ int amdgpu_ras_process_fini(struct amdgpu_device *adev)
 {
 	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
 
+	ras_mgr->is_paused = false;
 	/* Save all cached bad pages to eeprom */
 	flush_delayed_work(&ras_mgr->retire_page_dwork);
 	cancel_delayed_work_sync(&ras_mgr->retire_page_dwork);
@@ -124,3 +129,62 @@ int amdgpu_ras_process_handle_consumption_interrupt(struct amdgpu_device *adev,
 
 	return ras_process_add_interrupt_req(ras_mgr->ras_core, &req, false);
 }
+
+int amdgpu_ras_process_begin(struct amdgpu_device *adev)
+{
+	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
+
+	if (ras_mgr->is_paused)
+		return -EAGAIN;
+
+	reinit_completion(&ras_mgr->ras_event_done);
+	return 0;
+}
+
+int amdgpu_ras_process_end(struct amdgpu_device *adev)
+{
+	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
+
+	complete(&ras_mgr->ras_event_done);
+	return 0;
+}
+
+int amdgpu_ras_process_pre_reset(struct amdgpu_device *adev)
+{
+	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
+	long rc;
+
+	if (!ras_mgr || !ras_mgr->ras_core)
+		return -EINVAL;
+
+	if (!ras_mgr->ras_core->is_initialized)
+		return -EPERM;
+
+	ras_mgr->is_paused = true;
+
+	/* Wait for RAS event processing to complete */
+	rc = wait_for_completion_interruptible_timeout(&ras_mgr->ras_event_done,
+			msecs_to_jiffies(RAS_EVENT_PROCESS_TIMEOUT));
+	if (rc <= 0)
+		RAS_DEV_WARN(adev, "Waiting for ras process to complete %s\n",
+			     rc ? "interrupted" : "timeout");
+
+	flush_delayed_work(&ras_mgr->retire_page_dwork);
+	return 0;
+}
+
+int amdgpu_ras_process_post_reset(struct amdgpu_device *adev)
+{
+	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
+
+	if (!ras_mgr || !ras_mgr->ras_core)
+		return -EINVAL;
+
+	if (!ras_mgr->ras_core->is_initialized)
+		return -EPERM;
+
+	ras_mgr->is_paused = false;
+
+	schedule_delayed_work(&ras_mgr->retire_page_dwork, 0);
+	return 0;
+}
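One detail of amdgpu_ras_process_pre_reset() above worth spelling out: the rc <= 0 test relies on the return convention of wait_for_completion_interruptible_timeout() from the kernel completion API. A short annotated reminder (not patch code):

	rc = wait_for_completion_interruptible_timeout(&done, timeout);
	/*
	 * rc > 0:  completed; rc is the remaining time in jiffies
	 * rc == 0: timed out
	 * rc < 0:  interrupted by a signal (-ERESTARTSYS)
	 */

This is why the warning above prints "interrupted" for a nonzero (negative) rc and "timeout" for zero.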
@@ -34,4 +34,8 @@ int amdgpu_ras_process_handle_unexpected_interrupt(struct amdgpu_device *adev,
 			void *data);
 int amdgpu_ras_process_handle_consumption_interrupt(struct amdgpu_device *adev,
 			void *data);
+int amdgpu_ras_process_begin(struct amdgpu_device *adev);
+int amdgpu_ras_process_end(struct amdgpu_device *adev);
+int amdgpu_ras_process_pre_reset(struct amdgpu_device *adev);
+int amdgpu_ras_process_post_reset(struct amdgpu_device *adev);
 #endif
@@ -142,6 +142,12 @@ static int amdgpu_ras_sys_event_notifier(struct ras_core_context *ras_core,
 	case RAS_EVENT_ID__RESET_GPU:
 		ret = amdgpu_ras_mgr_reset_gpu(ras_core->dev, *(uint32_t *)data);
 		break;
+	case RAS_EVENT_ID__RAS_EVENT_PROC_BEGIN:
+		ret = amdgpu_ras_process_begin(ras_core->dev);
+		break;
+	case RAS_EVENT_ID__RAS_EVENT_PROC_END:
+		ret = amdgpu_ras_process_end(ras_core->dev);
+		break;
 	default:
 		RAS_DEV_WARN(ras_core->dev, "Invalid ras notify event:%d\n", event_id);
 		break;
@@ -115,6 +115,8 @@ enum ras_notify_event {
 	RAS_EVENT_ID__FATAL_ERROR_DETECTED,
 	RAS_EVENT_ID__RESET_GPU,
 	RAS_EVENT_ID__RESET_VF,
+	RAS_EVENT_ID__RAS_EVENT_PROC_BEGIN,
+	RAS_EVENT_ID__RAS_EVENT_PROC_END,
 };
 
 enum ras_gpu_status {
@@ -162,6 +162,11 @@ int ras_process_handle_ras_event(struct ras_core_context *ras_core)
 	uint32_t umc_event_count;
 	int ret;
 
+	ret = ras_core_event_notify(ras_core,
+			RAS_EVENT_ID__RAS_EVENT_PROC_BEGIN, NULL);
+	if (ret)
+		return ret;
+
 	ras_aca_clear_fatal_flag(ras_core);
 	ras_umc_log_pending_bad_bank(ras_core);
 
@@ -185,6 +190,8 @@ int ras_process_handle_ras_event(struct ras_core_context *ras_core)
 		atomic_set(&ras_proc->umc_interrupt_count, 0);
 	}
 
+	ras_core_event_notify(ras_core,
+			RAS_EVENT_ID__RAS_EVENT_PROC_END, NULL);
 	return ret;
 }
 
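Taken together, the ordering in amdgpu_device_gpu_recover() is the heart of the patch: RAS processing is paused before the reset domain is locked (the new comment notes it "Cannot be called after locking reset domain", presumably because the in-flight RAS work being waited on can itself request a GPU reset), and resumed only after the lock is released. Condensed from the hunks above, as a paraphrase rather than literal driver code:

	amdgpu_ras_pre_reset(adev, &device_list);       /* pause RAS, wait for in-flight event work */
	amdgpu_device_recovery_get_reset_lock(adev, &device_list);
	/* ... perform the actual reset ... */
	amdgpu_device_recovery_put_reset_lock(adev, &device_list);
	amdgpu_ras_post_reset(adev, &device_list);      /* unpause, reschedule bad-page retirement */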