mirror of
https://github.com/torvalds/linux.git
synced 2025-12-07 20:06:24 +00:00
drm/amdgpu: Add umc page retirement for umc v12_0
Add umc page retirement for umc v12_0.
V2:
1. Changed umc page retirement check condition
to call umc_v12_0_is_uncorrectable_error.
2. Use memset to clear the contents of the umc
error address structure.
Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
committed by
Alex Deucher
parent
6fe08f56db
commit
99cab331a4
@@ -26,6 +26,7 @@
|
||||
#include "amdgpu.h"
|
||||
#include "umc/umc_12_0_0_offset.h"
|
||||
#include "umc/umc_12_0_0_sh_mask.h"
|
||||
#include "mp/mp_13_0_6_sh_mask.h"
|
||||
|
||||
const uint32_t
|
||||
umc_v12_0_channel_idx_tbl[]
|
||||
@@ -370,6 +371,59 @@ static int umc_v12_0_err_cnt_init_per_channel(struct amdgpu_device *adev,
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void umc_v12_0_ecc_info_query_ras_error_count(struct amdgpu_device *adev,
|
||||
void *ras_error_status)
|
||||
{
|
||||
amdgpu_mca_smu_log_ras_error(adev,
|
||||
AMDGPU_RAS_BLOCK__UMC, AMDGPU_MCA_ERROR_TYPE_CE, ras_error_status);
|
||||
amdgpu_mca_smu_log_ras_error(adev,
|
||||
AMDGPU_RAS_BLOCK__UMC, AMDGPU_MCA_ERROR_TYPE_UE, ras_error_status);
|
||||
}
|
||||
|
||||
static void umc_v12_0_ecc_info_query_ras_error_address(struct amdgpu_device *adev,
|
||||
void *ras_error_status)
|
||||
{
|
||||
struct ras_err_node *err_node;
|
||||
uint64_t mc_umc_status;
|
||||
struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
|
||||
|
||||
for_each_ras_error(err_node, err_data) {
|
||||
mc_umc_status = err_node->err_info.err_addr.err_status;
|
||||
if (!mc_umc_status)
|
||||
continue;
|
||||
|
||||
if (umc_v12_0_is_uncorrectable_error(adev, mc_umc_status)) {
|
||||
uint64_t mca_addr, err_addr, mca_ipid;
|
||||
uint32_t InstanceIdLo;
|
||||
struct amdgpu_smuio_mcm_config_info *mcm_info;
|
||||
|
||||
mcm_info = &err_node->err_info.mcm_info;
|
||||
mca_addr = err_node->err_info.err_addr.err_addr;
|
||||
mca_ipid = err_node->err_info.err_addr.err_ipid;
|
||||
|
||||
err_addr = REG_GET_FIELD(mca_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);
|
||||
InstanceIdLo = REG_GET_FIELD(mca_ipid, MCMP1_IPIDT0, InstanceIdLo);
|
||||
|
||||
dev_info(adev->dev, "UMC:IPID:0x%llx, aid:%d, inst:%d, ch:%d, err_addr:0x%llx\n",
|
||||
mca_ipid,
|
||||
mcm_info->die_id,
|
||||
MCA_IPID_LO_2_UMC_INST(InstanceIdLo),
|
||||
MCA_IPID_LO_2_UMC_CH(InstanceIdLo),
|
||||
err_addr);
|
||||
|
||||
umc_v12_0_convert_error_address(adev,
|
||||
err_data, err_addr,
|
||||
MCA_IPID_LO_2_UMC_CH(InstanceIdLo),
|
||||
MCA_IPID_LO_2_UMC_INST(InstanceIdLo),
|
||||
mcm_info->die_id);
|
||||
|
||||
/* Clear umc error address content */
|
||||
memset(&err_node->err_info.err_addr,
|
||||
0, sizeof(err_node->err_info.err_addr));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void umc_v12_0_err_cnt_init(struct amdgpu_device *adev)
|
||||
{
|
||||
amdgpu_umc_loop_channels(adev,
|
||||
@@ -396,4 +450,6 @@ struct amdgpu_umc_ras umc_v12_0_ras = {
|
||||
},
|
||||
.err_cnt_init = umc_v12_0_err_cnt_init,
|
||||
.query_ras_poison_mode = umc_v12_0_query_ras_poison_mode,
|
||||
.ecc_info_query_ras_error_count = umc_v12_0_ecc_info_query_ras_error_count,
|
||||
.ecc_info_query_ras_error_address = umc_v12_0_ecc_info_query_ras_error_address,
|
||||
};
|
||||
|
||||
@@ -117,6 +117,10 @@
|
||||
(pa) |= (UMC_V12_0_CHANNEL_HASH_CH6(channel_idx, pa) << UMC_V12_0_PA_CH6_BIT); \
|
||||
} while (0)
|
||||
|
||||
#define MCA_IPID_LO_2_UMC_CH(_ipid_lo) (((((_ipid_lo) >> 20) & 0x1) * 4) + \
|
||||
(((_ipid_lo) >> 12) & 0xF))
|
||||
#define MCA_IPID_LO_2_UMC_INST(_ipid_lo) (((_ipid_lo) >> 21) & 0x7)
|
||||
|
||||
bool umc_v12_0_is_uncorrectable_error(struct amdgpu_device *adev, uint64_t mc_umc_status);
|
||||
bool umc_v12_0_is_correctable_error(struct amdgpu_device *adev, uint64_t mc_umc_status);
|
||||
|
||||
|
||||
Reference in New Issue
Block a user