drm/amd/ras: Add psp ras common functions

Add psp ras common functions.

Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
YiPeng Chai
2025-06-05 17:46:51 +08:00
committed by Alex Deucher
parent 9f3083dc9f
commit c49ef01183
3 changed files with 1126 additions and 0 deletions

View File

@@ -0,0 +1,750 @@
// SPDX-License-Identifier: MIT
/*
* Copyright 2025 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*
*/
#include "ras.h"
#include "ras_ta_if.h"
#include "ras_psp.h"
#include "ras_psp_v13_0.h"
/*
 * Layout of ras_ta_trigger_error_input.sub_block_index as built by
 * trigger_ras_ta_error(): the sub block id uses the lower 12 bits and the
 * instance mask is packed into the bits above it.
 *
 * RAS_TA_INST_MASK  - bits [31:12], the field that carries the instance mask
 * RAS_TA_INST_SHIFT - left shift (12) that moves the instance mask into place
 */
#define RAS_TA_INST_MASK 0xfffff000
#define RAS_TA_INST_SHIFT 0xc
/*
 * Select the PSP IP callback table matching @ip_version.
 *
 * Returns the v13.0 table for PSP 13.0.6, 13.0.12 and 13.0.14. Any other
 * version is reported through RAS_DEV_ERR() and yields NULL.
 */
static const struct ras_psp_ip_func *ras_psp_get_ip_funcs(
		struct ras_core_context *ras_core, uint32_t ip_version)
{
	const struct ras_psp_ip_func *ip_funcs = NULL;

	switch (ip_version) {
	case IP_VERSION(13, 0, 6):
	case IP_VERSION(13, 0, 12):
	case IP_VERSION(13, 0, 14):
		ip_funcs = &ras_psp_v13_0;
		break;
	default:
		RAS_DEV_ERR(ras_core->dev,
			"psp ip version(0x%x) is not supported!\n", ip_version);
		break;
	}

	return ip_funcs;
}
static int ras_psp_sync_system_ras_psp_status(struct ras_core_context *ras_core)
{
struct ras_psp *psp = &ras_core->ras_psp;
struct ras_ta_ctx *ta_ctx = &ras_core->ras_psp.ta_ctx;
struct ras_psp_ctx *psp_ctx = &ras_core->ras_psp.psp_ctx;
struct ras_psp_sys_status status = {0};
int ret;
if (psp->sys_func && psp->sys_func->get_ras_psp_system_status) {
ret = psp->sys_func->get_ras_psp_system_status(ras_core, &status);
if (ret)
return ret;
if (status.initialized) {
ta_ctx->preload_ras_ta_enabled = true;
ta_ctx->ras_ta_initialized = status.initialized;
ta_ctx->session_id = status.session_id;
}
psp_ctx->external_mutex = status.psp_cmd_mutex;
}
return 0;
}
static int ras_psp_get_ras_ta_init_param(struct ras_core_context *ras_core,
struct ras_ta_init_param *ras_ta_param)
{
struct ras_psp *psp = &ras_core->ras_psp;
if (psp->sys_func && psp->sys_func->get_ras_ta_init_param)
return psp->sys_func->get_ras_ta_init_param(ras_core, ras_ta_param);
RAS_DEV_ERR(ras_core->dev, "Not config get_ras_ta_init_param API!!\n");
return -EACCES;
}
/*
 * Take a reference on the GPU memory block that backs @mem_type.
 *
 * The first reference asks ras_core_get_gpu_mem() to fill in the block;
 * later references only bump ref_count. Every successful call must be
 * paired with ras_psp_put_gpu_mem().
 *
 * Returns the block, or NULL for an unknown type or when the block cannot
 * be obtained.
 */
static struct gpu_mem_block *ras_psp_get_gpu_mem(struct ras_core_context *ras_core,
		enum gpu_mem_type mem_type)
{
	struct ras_psp *psp = &ras_core->ras_psp;
	struct gpu_mem_block *block;

	switch (mem_type) {
	case GPU_MEM_TYPE_RAS_PSP_RING:
		block = &psp->psp_ring.ras_ring_gpu_mem;
		break;
	case GPU_MEM_TYPE_RAS_PSP_CMD:
		block = &psp->psp_ctx.psp_cmd_gpu_mem;
		break;
	case GPU_MEM_TYPE_RAS_PSP_FENCE:
		block = &psp->psp_ctx.out_fence_gpu_mem;
		break;
	case GPU_MEM_TYPE_RAS_TA_FW:
		block = &psp->ta_ctx.fw_gpu_mem;
		break;
	case GPU_MEM_TYPE_RAS_TA_CMD:
		block = &psp->ta_ctx.cmd_gpu_mem;
		break;
	default:
		return NULL;
	}

	/* Only the first user has to populate the block. */
	if (block->ref_count == 0) {
		if (ras_core_get_gpu_mem(ras_core, mem_type, block))
			return NULL;
		block->mem_type = mem_type;
	}

	block->ref_count++;

	return block;
}
/*
 * Drop one reference on @gpu_mem, releasing the block when the last
 * reference goes away.
 *
 * @ras_core: RAS core context that owns the block
 * @gpu_mem:  block returned by ras_psp_get_gpu_mem(); NULL is a no-op
 *
 * An unbalanced put is reported and the count is pinned at zero. Letting
 * the count go negative would make the next ras_psp_get_gpu_mem() skip
 * re-acquiring the memory (it only does so when ref_count is 0) and hand
 * out a cleared block.
 *
 * Always returns 0.
 */
static int ras_psp_put_gpu_mem(struct ras_core_context *ras_core,
		struct gpu_mem_block *gpu_mem)
{
	if (!gpu_mem)
		return 0;

	if (gpu_mem->ref_count <= 0) {
		RAS_DEV_WARN(ras_core->dev,
			"Duplicate free gpu memory %u\n", gpu_mem->mem_type);
		gpu_mem->ref_count = 0;
		return 0;
	}

	if (--gpu_mem->ref_count > 0)
		return 0;

	/* Last user: hand the memory back and forget everything about it. */
	ras_core_put_gpu_mem(ras_core, gpu_mem->mem_type, gpu_mem);
	memset(gpu_mem, 0, sizeof(*gpu_mem));

	return 0;
}
/*
 * Serialize PSP command submission. The mutex supplied by the system layer
 * is preferred; the module's own mutex is used when none was supplied.
 */
static void __acquire_psp_cmd_lock(struct ras_core_context *ras_core)
{
	struct ras_psp_ctx *ctx = &ras_core->ras_psp.psp_ctx;

	if (!ctx->external_mutex) {
		mutex_lock(&ctx->internal_mutex);
		return;
	}

	mutex_lock(ctx->external_mutex);
}
/*
 * Counterpart of __acquire_psp_cmd_lock(): unlock the external mutex when
 * one is set, otherwise the internal mutex.
 */
static void __release_psp_cmd_lock(struct ras_core_context *ras_core)
{
	struct ras_psp_ctx *ctx = &ras_core->ras_psp.psp_ctx;

	if (!ctx->external_mutex) {
		mutex_unlock(&ctx->internal_mutex);
		return;
	}

	mutex_unlock(ctx->external_mutex);
}
static uint32_t __get_ring_frame_slot(struct ras_core_context *ras_core)
{
struct ras_psp *psp = &ras_core->ras_psp;
uint32_t ras_ring_wptr_dw;
ras_ring_wptr_dw = psp->ip_func->psp_ras_ring_wptr_get(ras_core);
return (ras_ring_wptr_dw << 2) / sizeof(struct psp_gfx_rb_frame);
}
static int __set_ring_frame_slot(struct ras_core_context *ras_core,
uint32_t slot)
{
struct ras_psp *psp = &ras_core->ras_psp;
return psp->ip_func->psp_ras_ring_wptr_set(ras_core,
(slot * sizeof(struct psp_gfx_rb_frame)) >> 2);
}
/*
 * Copy one command frame into the next free slot of the RAS PSP ring and
 * advance the ring write pointer past it.
 *
 * @ras_core: RAS core context that owns the ring
 * @frame:    fully populated frame to submit
 *
 * Returns 0 on success, -ENOMEM when the ring memory cannot be obtained,
 * -EINVAL when the ring has no CPU mapping or cannot hold a single frame,
 * -EACCES when the frame does not read back as written, or the error
 * returned while updating the write pointer.
 */
static int write_frame_to_ras_psp_ring(struct ras_core_context *ras_core,
		struct psp_gfx_rb_frame *frame)
{
	struct gpu_mem_block *ring_mem;
	struct psp_gfx_rb_frame *rb_frame;
	uint32_t max_frame_slot;
	uint32_t slot_idx;
	uint32_t write_flush_read_back = 0;
	int ret = 0;

	ring_mem = ras_psp_get_gpu_mem(ras_core, GPU_MEM_TYPE_RAS_PSP_RING);
	if (!ring_mem)
		return -ENOMEM;

	max_frame_slot =
		ring_mem->mem_size / sizeof(struct psp_gfx_rb_frame);
	rb_frame =
		(struct psp_gfx_rb_frame *)ring_mem->mem_cpu_addr;

	/* Slot 0 is the wrap-around target, so it must exist and be mapped. */
	if (!rb_frame || !max_frame_slot) {
		RAS_DEV_ERR(ras_core->dev, "Invalid RAS PSP ring buffer!\n");
		ret = -EINVAL;
		goto err;
	}

	slot_idx = __get_ring_frame_slot(ras_core);
	if (slot_idx >= max_frame_slot)
		slot_idx = 0;

	memcpy(&rb_frame[slot_idx], frame, sizeof(*frame));

	/* Do a read to force the write of the frame before writing
	 * write pointer.
	 */
	write_flush_read_back = rb_frame[slot_idx].fence_value;
	if (write_flush_read_back != frame->fence_value) {
		RAS_DEV_ERR(ras_core->dev,
			"Failed to submit ring cmd! cmd:0x%x:0x%x, fence:0x%x:0x%x value:%u, expected:%u\n",
			rb_frame[slot_idx].cmd_buf_addr_hi,
			rb_frame[slot_idx].cmd_buf_addr_lo,
			rb_frame[slot_idx].fence_addr_hi,
			rb_frame[slot_idx].fence_addr_lo,
			write_flush_read_back, frame->fence_value);
		ret = -EACCES;
		goto err;
	}

	slot_idx++;
	if (slot_idx >= max_frame_slot)
		slot_idx = 0;

	/*
	 * The frame is only consumed once the write pointer moves, so a
	 * failure here means the command was never submitted.
	 */
	ret = __set_ring_frame_slot(ras_core, slot_idx);
	if (ret)
		RAS_DEV_ERR(ras_core->dev,
			"Failed to update ring write pointer! ret:%d\n", ret);

err:
	ras_psp_put_gpu_mem(ras_core, ring_mem);
	return ret;
}
/*
 * Submit one GFX command to the PSP through the RAS ring and wait for its
 * fence.
 *
 * @ras_core:   RAS core context
 * @gfx_cmd_id: PSP GFX command id
 * @cmd_data:   command payload, copied into the shared command buffer
 * @cmd_size:   payload size, at most sizeof(union psp_gfx_commands)
 * @resp:       receives the status and session id read back from the
 *              command buffer
 *
 * The wait polls the fence up to 1000 times with a 2 ms sleep in between
 * and is abandoned early when a RAS interrupt is detected.
 *
 * Returns 0 when the command was submitted and the wait ended by fence or
 * RAS interrupt, -EINVAL for bad arguments, -ENOMEM when a buffer cannot be
 * obtained, the submit error from write_frame_to_ras_psp_ring(), or -ETIME
 * when the fence never arrived. @resp is filled in on -ETIME as well so
 * callers can log it.
 */
static int send_psp_cmd(struct ras_core_context *ras_core,
		enum psp_gfx_cmd_id gfx_cmd_id, void *cmd_data,
		uint32_t cmd_size, struct psp_cmd_resp *resp)
{
	struct ras_psp_ctx *psp_ctx = &ras_core->ras_psp.psp_ctx;
	struct gpu_mem_block *psp_cmd_buf = NULL;
	struct gpu_mem_block *psp_fence_buf = NULL;
	struct psp_gfx_cmd_resp *gfx_cmd;
	struct psp_gfx_rb_frame rb_frame;
	int ret = 0;
	int timeout = 1000;

	if (!cmd_data || (cmd_size > sizeof(union psp_gfx_commands)) || !resp) {
		RAS_DEV_ERR(ras_core->dev, "Invalid RAS PSP command, id: %u\n", gfx_cmd_id);
		return -EINVAL;
	}

	__acquire_psp_cmd_lock(ras_core);

	psp_cmd_buf = ras_psp_get_gpu_mem(ras_core, GPU_MEM_TYPE_RAS_PSP_CMD);
	if (!psp_cmd_buf) {
		ret = -ENOMEM;
		goto exit;
	}

	psp_fence_buf = ras_psp_get_gpu_mem(ras_core, GPU_MEM_TYPE_RAS_PSP_FENCE);
	if (!psp_fence_buf) {
		ret = -ENOMEM;
		goto exit;
	}

	gfx_cmd = (struct psp_gfx_cmd_resp *)psp_cmd_buf->mem_cpu_addr;
	memset(gfx_cmd, 0, sizeof(*gfx_cmd));
	gfx_cmd->cmd_id = gfx_cmd_id;
	memcpy(&gfx_cmd->cmd, cmd_data, cmd_size);

	psp_ctx->in_fence_value++;

	memset(&rb_frame, 0, sizeof(rb_frame));
	rb_frame.cmd_buf_addr_hi = upper_32_bits(psp_cmd_buf->mem_mc_addr);
	rb_frame.cmd_buf_addr_lo = lower_32_bits(psp_cmd_buf->mem_mc_addr);
	rb_frame.fence_addr_hi = upper_32_bits(psp_fence_buf->mem_mc_addr);
	rb_frame.fence_addr_lo = lower_32_bits(psp_fence_buf->mem_mc_addr);
	rb_frame.fence_value = psp_ctx->in_fence_value;

	ret = write_frame_to_ras_psp_ring(ras_core, &rb_frame);
	if (ret) {
		/* Nothing was submitted; give the fence value back. */
		psp_ctx->in_fence_value--;
		goto exit;
	}

	while (*((uint64_t *)psp_fence_buf->mem_cpu_addr) !=
			psp_ctx->in_fence_value) {
		if (--timeout == 0)
			break;
		/*
		 * Shouldn't wait for timeout when err_event_athub occurs,
		 * because gpu reset thread triggered and lock resource should
		 * be released for psp resume sequence.
		 */
		if (ras_core_ras_interrupt_detected(ras_core))
			break;

		msleep(2);
	}

	resp->status = gfx_cmd->resp.status;
	resp->session_id = gfx_cmd->resp.session_id;

	/*
	 * timeout only reaches 0 when the poll budget ran out. Without this
	 * check an unanswered command would be reported as success, because
	 * the status field was zeroed before submission.
	 */
	if (!timeout &&
	    *((uint64_t *)psp_fence_buf->mem_cpu_addr) != psp_ctx->in_fence_value) {
		RAS_DEV_ERR(ras_core->dev,
			"RAS PSP command timed out, id: %u\n", gfx_cmd_id);
		ret = -ETIME;
	}

exit:
	ras_psp_put_gpu_mem(ras_core, psp_cmd_buf);
	ras_psp_put_gpu_mem(ras_core, psp_fence_buf);

	__release_psp_cmd_lock(ras_core);

	return ret;
}
static void __check_ras_ta_cmd_resp(struct ras_core_context *ras_core,
struct ras_ta_cmd *ras_cmd)
{
if (ras_cmd->ras_out_message.flags.err_inject_switch_disable_flag) {
RAS_DEV_WARN(ras_core->dev, "ECC switch disabled\n");
ras_cmd->ras_status = RAS_TA_STATUS__ERROR_RAS_NOT_AVAILABLE;
} else if (ras_cmd->ras_out_message.flags.reg_access_failure_flag)
RAS_DEV_WARN(ras_core->dev, "RAS internal register access blocked\n");
switch (ras_cmd->ras_status) {
case RAS_TA_STATUS__ERROR_UNSUPPORTED_IP:
RAS_DEV_WARN(ras_core->dev,
"RAS WARNING: cmd failed due to unsupported ip\n");
break;
case RAS_TA_STATUS__ERROR_UNSUPPORTED_ERROR_INJ:
RAS_DEV_WARN(ras_core->dev,
"RAS WARNING: cmd failed due to unsupported error injection\n");
break;
case RAS_TA_STATUS__SUCCESS:
break;
case RAS_TA_STATUS__TEE_ERROR_ACCESS_DENIED:
if (ras_cmd->cmd_id == RAS_TA_CMD_ID__TRIGGER_ERROR)
RAS_DEV_WARN(ras_core->dev,
"RAS WARNING: Inject error to critical region is not allowed\n");
break;
default:
RAS_DEV_WARN(ras_core->dev,
"RAS WARNING: ras status = 0x%X\n", ras_cmd->ras_status);
break;
}
}
static int send_ras_ta_runtime_cmd(struct ras_core_context *ras_core,
enum ras_ta_cmd_id cmd_id, void *in, uint32_t in_size,
void *out, uint32_t out_size)
{
struct ras_ta_ctx *ta_ctx = &ras_core->ras_psp.ta_ctx;
struct gpu_mem_block *cmd_mem;
struct ras_ta_cmd *ras_cmd;
struct psp_gfx_cmd_invoke_cmd invoke_cmd = {0};
struct psp_cmd_resp resp = {0};
int ret = 0;
if (!in || (in_size > sizeof(union ras_ta_cmd_input)) ||
(cmd_id >= MAX_RAS_TA_CMD_ID)) {
RAS_DEV_ERR(ras_core->dev, "Invalid RAS TA command, id: %u\n", cmd_id);
return -EINVAL;
}
ras_psp_sync_system_ras_psp_status(ras_core);
cmd_mem = ras_psp_get_gpu_mem(ras_core, GPU_MEM_TYPE_RAS_TA_CMD);
if (!cmd_mem)
return -ENOMEM;
if (!ras_core_down_trylock_gpu_reset_lock(ras_core)) {
ret = -EACCES;
goto out;
}
ras_cmd = (struct ras_ta_cmd *)cmd_mem->mem_cpu_addr;
mutex_lock(&ta_ctx->ta_mutex);
memset(ras_cmd, 0, sizeof(*ras_cmd));
ras_cmd->cmd_id = cmd_id;
memcpy(&ras_cmd->ras_in_message, in, in_size);
invoke_cmd.ta_cmd_id = cmd_id;
invoke_cmd.session_id = ta_ctx->session_id;
ret = send_psp_cmd(ras_core, GFX_CMD_ID_INVOKE_CMD,
&invoke_cmd, sizeof(invoke_cmd), &resp);
/* If err_event_athub occurs error inject was successful, however
* return status from TA is no long reliable
*/
if (ras_core_ras_interrupt_detected(ras_core)) {
ret = 0;
goto unlock;
}
if (ret || resp.status) {
RAS_DEV_ERR(ras_core->dev,
"RAS: Failed to send psp cmd! ret:%d, status:%u\n",
ret, resp.status);
ret = -ESTRPIPE;
goto unlock;
}
if (ras_cmd->if_version > RAS_TA_HOST_IF_VER) {
RAS_DEV_WARN(ras_core->dev, "RAS: Unsupported Interface\n");
ret = -EINVAL;
goto unlock;
}
if (!ras_cmd->ras_status && out && out_size)
memcpy(out, &ras_cmd->ras_out_message, out_size);
__check_ras_ta_cmd_resp(ras_core, ras_cmd);
unlock:
mutex_unlock(&ta_ctx->ta_mutex);
ras_core_up_gpu_reset_lock(ras_core);
out:
ras_psp_put_gpu_mem(ras_core, cmd_mem);
return ret;
}
/*
 * Translate an injection request into the form expected by the RAS TA and
 * send it.
 *
 * For GFX the sub block index is first converted by
 * ras_gfx_get_ta_subblock(). The instance mask is mapped through
 * RAS_GET_MASK() for GFX, SDMA, VCN and JPEG and used as is for every other
 * block, then packed into the upper bits of info->sub_block_index.
 *
 * Note that @info is modified in place.
 *
 * Returns -EINVAL when the GFX sub block cannot be converted, otherwise the
 * result of send_ras_ta_runtime_cmd().
 */
static int trigger_ras_ta_error(struct ras_core_context *ras_core,
		struct ras_ta_trigger_error_input *info, uint32_t instance_mask)
{
	uint32_t inst_bits;

	switch (info->block_id) {
	case RAS_TA_BLOCK__GFX:
		if (ras_gfx_get_ta_subblock(ras_core, info->inject_error_type,
				info->sub_block_index, &info->sub_block_index))
			return -EINVAL;

		inst_bits = RAS_GET_MASK(ras_core->dev, GC, instance_mask);
		break;
	case RAS_TA_BLOCK__SDMA:
		inst_bits = RAS_GET_MASK(ras_core->dev, SDMA0, instance_mask);
		break;
	case RAS_TA_BLOCK__VCN:
	case RAS_TA_BLOCK__JPEG:
		inst_bits = RAS_GET_MASK(ras_core->dev, VCN, instance_mask);
		break;
	default:
		inst_bits = instance_mask;
		break;
	}

	/* reuse sub_block_index for backward compatibility */
	inst_bits = (inst_bits << RAS_TA_INST_SHIFT) & RAS_TA_INST_MASK;
	info->sub_block_index |= inst_bits;

	return send_ras_ta_runtime_cmd(ras_core, RAS_TA_CMD_ID__TRIGGER_ERROR,
			info, sizeof(*info), NULL, 0);
}
static int send_load_ta_fw_cmd(struct ras_core_context *ras_core,
struct ras_ta_ctx *ta_ctx)
{
struct ras_ta_fw_bin *fw_bin = &ta_ctx->fw_bin;
struct gpu_mem_block *fw_mem;
struct gpu_mem_block *cmd_mem;
struct ras_ta_cmd *ta_cmd;
struct ras_ta_init_flags *ta_init_flags;
struct psp_gfx_cmd_load_ta psp_load_ta_cmd;
struct psp_cmd_resp resp = {0};
struct ras_ta_image_header *fw_hdr = NULL;
int ret;
fw_mem = ras_psp_get_gpu_mem(ras_core, GPU_MEM_TYPE_RAS_TA_FW);
if (!fw_mem)
return -ENOMEM;
cmd_mem = ras_psp_get_gpu_mem(ras_core, GPU_MEM_TYPE_RAS_TA_CMD);
if (!cmd_mem) {
ret = -ENOMEM;
goto err;
}
ret = ras_psp_get_ras_ta_init_param(ras_core, &ta_ctx->init_param);
if (ret)
goto err;
if (!ras_core_down_trylock_gpu_reset_lock(ras_core)) {
ret = -EACCES;
goto err;
}
/* copy ras ta binary to shared gpu memory */
memcpy(fw_mem->mem_cpu_addr, fw_bin->bin_addr, fw_bin->bin_size);
fw_mem->mem_size = fw_bin->bin_size;
/* Initialize ras ta startup parameter */
ta_cmd = (struct ras_ta_cmd *)cmd_mem->mem_cpu_addr;
ta_init_flags = &ta_cmd->ras_in_message.init_flags;
ta_init_flags->poison_mode_en = ta_ctx->init_param.poison_mode_en;
ta_init_flags->dgpu_mode = ta_ctx->init_param.dgpu_mode;
ta_init_flags->xcc_mask = ta_ctx->init_param.xcc_mask;
ta_init_flags->channel_dis_num = ta_ctx->init_param.channel_dis_num;
ta_init_flags->nps_mode = ta_ctx->init_param.nps_mode;
ta_init_flags->active_umc_mask = ta_ctx->init_param.active_umc_mask;
/* Setup load ras ta command */
memset(&psp_load_ta_cmd, 0, sizeof(psp_load_ta_cmd));
psp_load_ta_cmd.app_phy_addr_lo = lower_32_bits(fw_mem->mem_mc_addr);
psp_load_ta_cmd.app_phy_addr_hi = upper_32_bits(fw_mem->mem_mc_addr);
psp_load_ta_cmd.app_len = fw_mem->mem_size;
psp_load_ta_cmd.cmd_buf_phy_addr_lo = lower_32_bits(cmd_mem->mem_mc_addr);
psp_load_ta_cmd.cmd_buf_phy_addr_hi = upper_32_bits(cmd_mem->mem_mc_addr);
psp_load_ta_cmd.cmd_buf_len = cmd_mem->mem_size;
ret = send_psp_cmd(ras_core, GFX_CMD_ID_LOAD_TA,
&psp_load_ta_cmd, sizeof(psp_load_ta_cmd), &resp);
if (!ret && !resp.status) {
/* Read TA version at FW offset 0x60 if TA version not found*/
fw_hdr = (struct ras_ta_image_header *)fw_bin->bin_addr;
RAS_DEV_INFO(ras_core->dev, "PSP: RAS TA(version:%X.%X.%X.%X) is loaded.\n",
(fw_hdr->image_version >> 24) & 0xFF, (fw_hdr->image_version >> 16) & 0xFF,
(fw_hdr->image_version >> 8) & 0xFF, fw_hdr->image_version & 0xFF);
ta_ctx->ta_version = fw_hdr->image_version;
ta_ctx->session_id = resp.session_id;
ta_ctx->ras_ta_initialized = true;
} else {
RAS_DEV_ERR(ras_core->dev,
"Failed to load RAS TA! ret:%d, status:%d\n", ret, resp.status);
}
ras_core_up_gpu_reset_lock(ras_core);
err:
ras_psp_put_gpu_mem(ras_core, fw_mem);
ras_psp_put_gpu_mem(ras_core, cmd_mem);
return ret;
}
/*
 * Record the firmware description from @ras_ta_load in the TA context and
 * load it through send_load_ta_fw_cmd().
 *
 * On success the resulting session id and TA version are written back to
 * the out_* fields of @ras_ta_load. Returns the load result.
 */
static int load_ras_ta_firmware(struct ras_core_context *ras_core,
		struct ras_psp_ta_load *ras_ta_load)
{
	struct ras_ta_ctx *ta_ctx = &ras_core->ras_psp.ta_ctx;
	int err;

	ta_ctx->fw_bin.bin_addr = ras_ta_load->bin_addr;
	ta_ctx->fw_bin.bin_size = ras_ta_load->bin_size;
	ta_ctx->fw_bin.fw_version = ras_ta_load->fw_version;
	ta_ctx->fw_bin.feature_version = ras_ta_load->feature_version;

	err = send_load_ta_fw_cmd(ras_core, ta_ctx);
	if (err)
		return err;

	ras_ta_load->out_session_id = ta_ctx->session_id;
	ras_ta_load->out_loaded_ta_version = ta_ctx->ta_version;

	return 0;
}
/*
 * Ask the PSP to unload the RAS TA of the current session and, on success,
 * drop all cached firmware state.
 *
 * @ras_core:      RAS core context
 * @ras_ta_unload: unload request; not read here, the session id cached in
 *                 the TA context is used
 *
 * On success the firmware binary recorded at load time is released with
 * kfree() and the TA context is reset.
 *
 * Returns 0 on success, -EACCES when the GPU reset lock cannot be taken,
 * the error from send_psp_cmd(), or -ESTRPIPE when the PSP rejects the
 * unload with a non-zero status.
 */
static int unload_ras_ta_firmware(struct ras_core_context *ras_core,
		struct ras_psp_ta_unload *ras_ta_unload)
{
	struct ras_ta_ctx *ta_ctx = &ras_core->ras_psp.ta_ctx;
	struct psp_gfx_cmd_unload_ta cmd_unload_ta = {0};
	struct psp_cmd_resp resp = {0};
	int ret;

	if (!ras_core_down_trylock_gpu_reset_lock(ras_core))
		return -EACCES;

	cmd_unload_ta.session_id = ta_ctx->session_id;
	ret = send_psp_cmd(ras_core, GFX_CMD_ID_UNLOAD_TA,
			&cmd_unload_ta, sizeof(cmd_unload_ta), &resp);
	if (ret || resp.status) {
		RAS_DEV_ERR(ras_core->dev,
			"Failed to unload RAS TA! ret:%d, status:%u\n",
			ret, resp.status);
		/*
		 * The TA state is left untouched on this path, so the caller
		 * must not be told that the unload succeeded.
		 */
		if (!ret)
			ret = -ESTRPIPE;
		goto unlock;
	}

	kfree(ta_ctx->fw_bin.bin_addr);
	memset(&ta_ctx->fw_bin, 0, sizeof(ta_ctx->fw_bin));
	ta_ctx->ta_version = 0;
	ta_ctx->ras_ta_initialized = false;
	ta_ctx->session_id = 0;

unlock:
	ras_core_up_gpu_reset_lock(ras_core);

	return ret;
}
int ras_psp_load_firmware(struct ras_core_context *ras_core,
struct ras_psp_ta_load *ras_ta_load)
{
struct ras_ta_ctx *ta_ctx = &ras_core->ras_psp.ta_ctx;
struct ras_psp_ta_unload ras_ta_unload = {0};
int ret;
if (ta_ctx->preload_ras_ta_enabled)
return 0;
if (!ras_ta_load)
return -EINVAL;
if (ta_ctx->ras_ta_initialized) {
ras_ta_unload.ras_session_id = ta_ctx->session_id;
ret = unload_ras_ta_firmware(ras_core, &ras_ta_unload);
if (ret)
return ret;
}
return load_ras_ta_firmware(ras_core, ras_ta_load);
}
/*
 * Unload the RAS TA identified by @ras_ta_unload.
 *
 * Nothing is done (and 0 is returned) when the TA was preloaded by the
 * system. Returns -EINVAL when the request is NULL or names a session other
 * than the current one, otherwise the unload result.
 */
int ras_psp_unload_firmware(struct ras_core_context *ras_core,
		struct ras_psp_ta_unload *ras_ta_unload)
{
	struct ras_ta_ctx *ta_ctx = &ras_core->ras_psp.ta_ctx;

	if (ta_ctx->preload_ras_ta_enabled)
		return 0;

	if (!ras_ta_unload)
		return -EINVAL;

	if (ras_ta_unload->ras_session_id != ta_ctx->session_id)
		return -EINVAL;

	return unload_ras_ta_firmware(ras_core, ras_ta_unload);
}
/*
 * Inject an error through the RAS TA.
 *
 * @ras_core:      RAS core context
 * @info:          injection request; modified in place while it is
 *                 prepared for the TA
 * @instance_mask: instances to inject into
 *
 * Returns -ENOEXEC when no RAS TA is available (neither preloaded nor
 * loaded by this module), -EINVAL when @info is NULL, otherwise the result
 * of trigger_ras_ta_error().
 */
int ras_psp_trigger_error(struct ras_core_context *ras_core,
		struct ras_ta_trigger_error_input *info, uint32_t instance_mask)
{
	struct ras_ta_ctx *ta_ctx = &ras_core->ras_psp.ta_ctx;

	if (!ta_ctx->preload_ras_ta_enabled && !ta_ctx->ras_ta_initialized) {
		/* Terminated with a newline like every other message here. */
		RAS_DEV_ERR(ras_core->dev, "RAS: ras firmware not initialized!\n");
		return -ENOEXEC;
	}

	if (!info)
		return -EINVAL;

	return trigger_ras_ta_error(ras_core, info, instance_mask);
}
/*
 * Ask the RAS TA to translate an address.
 *
 * @ras_core: RAS core context
 * @addr_in:  query input
 * @addr_out: receives the TA output; only written when the TA reports a
 *            zero ras_status
 *
 * Returns -ENOEXEC when no RAS TA is available (neither preloaded nor
 * loaded by this module), -EINVAL when either buffer is NULL, otherwise the
 * result of send_ras_ta_runtime_cmd().
 */
int ras_psp_query_address(struct ras_core_context *ras_core,
		struct ras_ta_query_address_input *addr_in,
		struct ras_ta_query_address_output *addr_out)
{
	struct ras_ta_ctx *ta_ctx = &ras_core->ras_psp.ta_ctx;

	if (!ta_ctx->preload_ras_ta_enabled &&
	    !ta_ctx->ras_ta_initialized) {
		/* Terminated with a newline like every other message here. */
		RAS_DEV_ERR(ras_core->dev, "RAS: ras firmware not initialized!\n");
		return -ENOEXEC;
	}

	if (!addr_in || !addr_out)
		return -EINVAL;

	return send_ras_ta_runtime_cmd(ras_core, RAS_TA_CMD_ID__QUERY_ADDRESS,
			addr_in, sizeof(*addr_in), addr_out, sizeof(*addr_out));
}
/*
 * Reset the RAS PSP software state and bind the system callbacks taken from
 * the core configuration.
 *
 * Returns 0 on success, or -EINVAL when no system callback table is
 * configured; in that case the mutexes are not initialized.
 */
int ras_psp_sw_init(struct ras_core_context *ras_core)
{
	struct ras_psp *ras_psp = &ras_core->ras_psp;

	memset(ras_psp, 0, sizeof(*ras_psp));

	ras_psp->sys_func = ras_core->config->psp_cfg.psp_sys_fn;
	if (ras_psp->sys_func) {
		mutex_init(&ras_psp->psp_ctx.internal_mutex);
		mutex_init(&ras_psp->ta_ctx.ta_mutex);
		return 0;
	}

	RAS_DEV_ERR(ras_core->dev, "RAS psp sys function not configured!\n");
	return -EINVAL;
}
/*
 * Tear down the RAS PSP software state: destroy both mutexes and clear the
 * whole structure. Always returns 0.
 */
int ras_psp_sw_fini(struct ras_core_context *ras_core)
{
	struct ras_psp *ras_psp = &ras_core->ras_psp;

	mutex_destroy(&ras_psp->psp_ctx.internal_mutex);
	mutex_destroy(&ras_psp->ta_ctx.ta_mutex);

	memset(ras_psp, 0, sizeof(*ras_psp));

	return 0;
}
/*
 * Bind the IP callbacks for the configured PSP version and refresh the
 * cached system RAS PSP status.
 *
 * Returns 0 on success or -EINVAL when the PSP IP version is not supported.
 * A failure of the status refresh is not reported.
 */
int ras_psp_hw_init(struct ras_core_context *ras_core)
{
	struct ras_psp *ras_psp = &ras_core->ras_psp;
	uint32_t ip_version = ras_core->config->psp_ip_version;

	ras_psp->psp_ip_version = ip_version;
	ras_psp->ip_func = ras_psp_get_ip_funcs(ras_core, ip_version);
	if (!ras_psp->ip_func)
		return -EINVAL;

	/* After GPU reset, the system RAS PSP status may change.
	 * therefore, it is necessary to synchronize the system status again.
	 */
	ras_psp_sync_system_ras_psp_status(ras_core);

	return 0;
}
/*
 * Hardware teardown counterpart of ras_psp_hw_init(). There is currently
 * nothing to undo, so this always returns 0.
 */
int ras_psp_hw_fini(struct ras_core_context *ras_core)
{
	return 0;
}
/*
 * Report whether @cmd_id can currently be sent to the RAS TA.
 *
 * No command is supported while no TA is available. Error injection is
 * supported with any available TA; address queries only with a preloaded
 * TA. Every other command is reported as unsupported.
 */
bool ras_psp_check_supported_cmd(struct ras_core_context *ras_core,
		enum ras_ta_cmd_id cmd_id)
{
	struct ras_ta_ctx *ta_ctx = &ras_core->ras_psp.ta_ctx;

	if (!ta_ctx->preload_ras_ta_enabled && !ta_ctx->ras_ta_initialized)
		return false;

	if (cmd_id == RAS_TA_CMD_ID__TRIGGER_ERROR)
		return true;

	/* Currently, querying the address from RAS TA is only supported
	 * when the RAS TA firmware is loaded during driver installation.
	 */
	if (cmd_id == RAS_TA_CMD_ID__QUERY_ADDRESS)
		return ta_ctx->preload_ras_ta_enabled;

	return false;
}

View File

@@ -0,0 +1,145 @@
/* SPDX-License-Identifier: MIT */
/*
* Copyright 2025 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*
*/
#ifndef __RAS_PSP_H__
#define __RAS_PSP_H__
#include "ras.h"
#include "ras_ta_if.h"
/* Forward declarations for types that the prototypes below only use by
 * pointer or by value of an enum.
 */
struct ras_core_context;
struct ras_ta_trigger_error_input;
struct ras_ta_query_address_input;
struct ras_ta_query_address_output;
enum ras_ta_cmd_id;
/*
 * Overlay for the start of a RAS TA firmware image. Only image_version is
 * interpreted: it sits at byte offset 0x60 (24 dwords in) and is read after
 * a successful load to report the TA version. The overlay spans 64 dwords.
 */
struct ras_ta_image_header {
uint32_t reserved1[24];
uint32_t image_version; /* [0x60] Off Chip Firmware Version */
uint32_t reserved2[39];
};
/*
 * RAS PSP state reported by the system layer through the
 * get_ras_psp_system_status callback.
 */
struct ras_psp_sys_status {
/* true when the system already has an initialized RAS TA */
bool initialized;
/* session id of that TA; only adopted when initialized is true */
uint32_t session_id;
/* mutex to use for PSP command submission; may be NULL, in which case
 * the module's internal mutex is used
 */
void *psp_cmd_mutex;
};
/*
 * RAS TA start-up parameters supplied by the system layer. Each field is
 * copied to the field of the same name in struct ras_ta_init_flags when the
 * TA is loaded.
 */
struct ras_ta_init_param {
uint8_t poison_mode_en;
uint8_t dgpu_mode;
uint16_t xcc_mask;
uint8_t channel_dis_num;
uint8_t nps_mode;
uint32_t active_umc_mask;
};
/*
 * Reference-counted descriptor of one GPU memory buffer used for PSP/TA
 * communication.
 */
struct gpu_mem_block {
/* enum gpu_mem_type value recorded when the block is first acquired */
uint32_t mem_type;
/* not touched by the RAS PSP code; presumably the backing buffer object
 * owned by the system layer - TODO confirm
 */
void *mem_bo;
/* GPU address of the buffer; handed to the PSP split into hi/lo halves */
uint64_t mem_mc_addr;
/* CPU mapping used to read and write the buffer */
void *mem_cpu_addr;
/* size in bytes */
uint32_t mem_size;
/* number of outstanding users; the block is released and cleared when
 * it drops to zero
 */
int ref_count;
/* not touched by the RAS PSP code; presumably system layer private data
 * - TODO confirm
 */
void *private;
};
/*
 * PSP IP version specific callbacks for the RAS ring write pointer. The
 * write pointer is exchanged in dwords: callers convert between dwords and
 * frame slots.
 */
struct ras_psp_ip_func {
/* read the current ring write pointer */
uint32_t (*psp_ras_ring_wptr_get)(struct ras_core_context *ras_core);
/* program a new ring write pointer; returns 0 or an error code */
int (*psp_ras_ring_wptr_set)(struct ras_core_context *ras_core, uint32_t wptr);
};
/* PSP command ring state: the memory block holding the ring frames. */
struct ras_psp_ring {
struct gpu_mem_block ras_ring_gpu_mem;
};
/*
 * Result of a PSP command as copied out of the shared command buffer after
 * the fence wait. A non-zero status is treated as failure by the callers.
 */
struct psp_cmd_resp {
uint32_t status;
uint32_t session_id;
};
/* State needed to submit commands to the PSP. */
struct ras_psp_ctx {
/* mutex provided by the system layer; used instead of internal_mutex
 * whenever it is non-NULL
 */
void *external_mutex;
/* fallback lock serializing command submission */
struct mutex internal_mutex;
/* fence value of the most recently submitted command; incremented for
 * every submission
 */
uint64_t in_fence_value;
/* shared buffer holding the command and its response */
struct gpu_mem_block psp_cmd_gpu_mem;
/* buffer polled for the fence value */
struct gpu_mem_block out_fence_gpu_mem;
};
/*
 * Description of the RAS TA firmware binary recorded at load time.
 * bin_addr is released with kfree() after a successful unload.
 */
struct ras_ta_fw_bin {
uint32_t fw_version;
uint32_t feature_version;
/* size of the binary in bytes */
uint32_t bin_size;
/* CPU pointer to the binary */
uint8_t *bin_addr;
};
/* State of the RAS trusted application (TA). */
struct ras_ta_ctx {
/* set when the system layer reports an already initialized TA; load and
 * unload requests are then ignored
 */
bool preload_ras_ta_enabled;
/* true while a TA is available */
bool ras_ta_initialized;
/* PSP session id used when invoking or unloading the TA */
uint32_t session_id;
/* not referenced by the RAS PSP code shown with this header */
uint32_t resp_status;
/* image_version of the TA loaded by this module, 0 when none */
uint32_t ta_version;
/* serializes use of the shared TA command buffer for runtime commands */
struct mutex ta_mutex;
struct ras_ta_fw_bin fw_bin;
struct ras_ta_init_param init_param;
/* buffer the firmware binary is copied into for loading */
struct gpu_mem_block fw_gpu_mem;
/* shared buffer holding struct ras_ta_cmd */
struct gpu_mem_block cmd_gpu_mem;
};
/* Top-level RAS PSP state embedded in the RAS core context. */
struct ras_psp {
/* PSP IP version taken from the core configuration at hw init */
uint32_t psp_ip_version;
struct ras_psp_ring psp_ring;
struct ras_psp_ctx psp_ctx;
struct ras_ta_ctx ta_ctx;
/* callbacks selected by PSP IP version at hw init */
const struct ras_psp_ip_func *ip_func;
/* system layer callbacks taken from the core configuration at sw init */
const struct ras_psp_sys_func *sys_func;
};
/*
 * Request/response for ras_psp_load_firmware(). The first four fields are
 * inputs; the out_* fields are filled in on success.
 */
struct ras_psp_ta_load {
uint32_t fw_version;
uint32_t feature_version;
/* size of the binary in bytes */
uint32_t bin_size;
/* firmware binary; the pointer is kept by the RAS PSP code and passed to
 * kfree() after a later successful unload
 */
uint8_t *bin_addr;
/* session id assigned to the loaded TA */
uint64_t out_session_id;
/* image version of the loaded TA */
uint32_t out_loaded_ta_version;
};
/*
 * Request for ras_psp_unload_firmware(); ras_session_id must match the
 * session id of the currently loaded TA.
 */
struct ras_psp_ta_unload {
uint64_t ras_session_id;
};
/* Reset the RAS PSP state and bind the configured system callbacks. */
int ras_psp_sw_init(struct ras_core_context *ras_core);
/* Destroy the mutexes and clear the RAS PSP state. */
int ras_psp_sw_fini(struct ras_core_context *ras_core);
/* Bind the IP callbacks for the configured PSP version and sync the system
 * RAS PSP status.
 */
int ras_psp_hw_init(struct ras_core_context *ras_core);
/* Currently a no-op that returns 0. */
int ras_psp_hw_fini(struct ras_core_context *ras_core);
/* Load the RAS TA; a no-op returning 0 when the TA is preloaded. */
int ras_psp_load_firmware(struct ras_core_context *ras_core,
struct ras_psp_ta_load *ras_ta_load);
/* Unload the RAS TA; a no-op returning 0 when the TA is preloaded. */
int ras_psp_unload_firmware(struct ras_core_context *ras_core,
struct ras_psp_ta_unload *ras_ta_unload);
/* Inject an error through the RAS TA; info is modified in place. */
int ras_psp_trigger_error(struct ras_core_context *ras_core,
struct ras_ta_trigger_error_input *info, uint32_t instance_mask);
/* Translate an address through the RAS TA. */
int ras_psp_query_address(struct ras_core_context *ras_core,
struct ras_ta_query_address_input *addr_in,
struct ras_ta_query_address_output *addr_out);
/* Tell whether cmd_id can currently be sent to the RAS TA. */
bool ras_psp_check_supported_cmd(struct ras_core_context *ras_core,
enum ras_ta_cmd_id cmd_id);
#endif

View File

@@ -0,0 +1,231 @@
/* SPDX-License-Identifier: MIT */
/*
* Copyright 2025 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*
*/
#ifndef _RAS_TA_IF_H
#define _RAS_TA_IF_H
#include "ras.h"
/* Interface version implemented by the host; TA replies whose if_version is
 * greater than this are rejected as unsupported.
 */
#define RAS_TA_HOST_IF_VER 0
/* Responses have bit 31 set */
#define RSP_ID_MASK (1U << 31)
/* Build the response id that corresponds to command id cmdId. */
#define RSP_ID(cmdId) (((uint32_t)(cmdId)) | RSP_ID_MASK)
/* invalid node instance value */
#define RAS_TA_INV_NODE 0xffff
/* RAS related enumerations */
/**********************************************************/
/*
 * Command ids understood by the RAS TA. The values are part of the
 * host/TA interface, so they are spelled out explicitly.
 */
enum ras_ta_cmd_id {
	RAS_TA_CMD_ID__ENABLE_FEATURES      = 0,
	RAS_TA_CMD_ID__DISABLE_FEATURES     = 1,
	RAS_TA_CMD_ID__TRIGGER_ERROR        = 2,
	RAS_TA_CMD_ID__QUERY_BLOCK_INFO     = 3,
	RAS_TA_CMD_ID__QUERY_SUB_BLOCK_INFO = 4,
	RAS_TA_CMD_ID__QUERY_ADDRESS        = 5,
	/* number of commands; not a valid command id */
	MAX_RAS_TA_CMD_ID                   = 6
};
/*
 * Status codes returned by the RAS TA in ras_ta_cmd.ras_status. Zero means
 * success; every error code lies in the 0xA0xx range.
 */
enum ras_ta_status {
	RAS_TA_STATUS__SUCCESS                    = 0x0000,
	RAS_TA_STATUS__RESET_NEEDED               = 0xA001,
	RAS_TA_STATUS__ERROR_INVALID_PARAMETER    = 0xA002,
	RAS_TA_STATUS__ERROR_RAS_NOT_AVAILABLE    = 0xA003,
	RAS_TA_STATUS__ERROR_RAS_DUPLICATE_CMD    = 0xA004,
	RAS_TA_STATUS__ERROR_INJECTION_FAILED     = 0xA005,
	RAS_TA_STATUS__ERROR_ASD_READ_WRITE       = 0xA006,
	RAS_TA_STATUS__ERROR_TOGGLE_DF_CSTATE     = 0xA007,
	RAS_TA_STATUS__ERROR_TIMEOUT              = 0xA008,
	RAS_TA_STATUS__ERROR_BLOCK_DISABLED       = 0xA009,
	RAS_TA_STATUS__ERROR_GENERIC              = 0xA00A,
	RAS_TA_STATUS__ERROR_RAS_MMHUB_INIT       = 0xA00B,
	RAS_TA_STATUS__ERROR_GET_DEV_INFO         = 0xA00C,
	RAS_TA_STATUS__ERROR_UNSUPPORTED_DEV      = 0xA00D,
	RAS_TA_STATUS__ERROR_NOT_INITIALIZED      = 0xA00E,
	RAS_TA_STATUS__ERROR_TEE_INTERNAL         = 0xA00F,
	RAS_TA_STATUS__ERROR_UNSUPPORTED_FUNCTION = 0xA010,
	RAS_TA_STATUS__ERROR_SYS_DRV_REG_ACCESS   = 0xA011,
	RAS_TA_STATUS__ERROR_RAS_READ_WRITE       = 0xA012,
	RAS_TA_STATUS__ERROR_NULL_PTR             = 0xA013,
	RAS_TA_STATUS__ERROR_UNSUPPORTED_IP       = 0xA014,
	RAS_TA_STATUS__ERROR_PCS_STATE_QUIET      = 0xA015,
	RAS_TA_STATUS__ERROR_PCS_STATE_ERROR      = 0xA016,
	RAS_TA_STATUS__ERROR_PCS_STATE_HANG       = 0xA017,
	RAS_TA_STATUS__ERROR_PCS_STATE_UNKNOWN    = 0xA018,
	RAS_TA_STATUS__ERROR_UNSUPPORTED_ERROR_INJ = 0xA019,
	RAS_TA_STATUS__TEE_ERROR_ACCESS_DENIED    = 0xA01A
};
/*
 * Hardware blocks addressable through the RAS TA. The values are part of
 * the host/TA interface, so they are spelled out explicitly.
 */
enum ras_ta_block {
	RAS_TA_BLOCK__UMC       = 0,
	RAS_TA_BLOCK__SDMA      = 1,
	RAS_TA_BLOCK__GFX       = 2,
	RAS_TA_BLOCK__MMHUB     = 3,
	RAS_TA_BLOCK__ATHUB     = 4,
	RAS_TA_BLOCK__PCIE_BIF  = 5,
	RAS_TA_BLOCK__HDP       = 6,
	RAS_TA_BLOCK__XGMI_WAFL = 7,
	RAS_TA_BLOCK__DF        = 8,
	RAS_TA_BLOCK__SMN       = 9,
	RAS_TA_BLOCK__SEM       = 10,
	RAS_TA_BLOCK__MP0       = 11,
	RAS_TA_BLOCK__MP1       = 12,
	RAS_TA_BLOCK__FUSE      = 13,
	RAS_TA_BLOCK__MCA       = 14,
	RAS_TA_BLOCK__VCN       = 15,
	RAS_TA_BLOCK__JPEG      = 16,
	RAS_TA_BLOCK__IH        = 17,
	RAS_TA_BLOCK__MPIO      = 18,
	RAS_TA_BLOCK__MMSCH     = 19,
	/* number of blocks; not a valid block id */
	RAS_TA_NUM_BLOCK_MAX    = 20
};
/* MCA sub blocks known to the RAS TA. */
enum ras_ta_mca_block {
	RAS_TA_MCA_BLOCK__MP0    = 0,
	RAS_TA_MCA_BLOCK__MP1    = 1,
	RAS_TA_MCA_BLOCK__MPIO   = 2,
	RAS_TA_MCA_BLOCK__IOHC   = 3,
	/* number of MCA blocks; not a valid block id */
	RAS_TA_MCA_NUM_BLOCK_MAX = 4
};
/* Error types; every type other than NONE occupies its own bit. */
enum ras_ta_error_type {
	RAS_TA_ERROR__NONE                = 0,
	RAS_TA_ERROR__PARITY              = 1 << 0,
	RAS_TA_ERROR__SINGLE_CORRECTABLE  = 1 << 1,
	RAS_TA_ERROR__MULTI_UNCORRECTABLE = 1 << 2,
	RAS_TA_ERROR__POISON              = 1 << 3,
};
/* Direction of an address query. */
enum ras_ta_address_type {
	RAS_TA_MCA_TO_PA = 0,
	RAS_TA_PA_TO_MCA = 1,
};
/* NPS modes; each known mode is encoded as the number in its name. */
enum ras_ta_nps_mode {
	RAS_TA_UNKNOWN_MODE = 0,
	RAS_TA_NPS1_MODE    = 1,
	RAS_TA_NPS2_MODE    = 2,
	RAS_TA_NPS4_MODE    = 4,
	RAS_TA_NPS8_MODE    = 8,
};
/* Input/output structures for RAS commands */
/**********************************************************/
/* Input of RAS_TA_CMD_ID__ENABLE_FEATURES: a block and an error type. */
struct ras_ta_enable_features_input {
enum ras_ta_block block_id;
enum ras_ta_error_type error_type;
};
/* Input of RAS_TA_CMD_ID__DISABLE_FEATURES: a block and an error type. */
struct ras_ta_disable_features_input {
enum ras_ta_block block_id;
enum ras_ta_error_type error_type;
};
/* Input of RAS_TA_CMD_ID__TRIGGER_ERROR. */
struct ras_ta_trigger_error_input {
/* ras-block. i.e. umc, gfx */
enum ras_ta_block block_id;
/* type of error. i.e. single_correctable */
enum ras_ta_error_type inject_error_type;
/* mem block. i.e. hbm, sram etc. The host additionally packs the
 * instance mask into the bits above the lower 12 bits.
 */
uint32_t sub_block_index;
/* explicit address of error */
uint64_t address;
/* method of error injection. i.e persistent, coherent etc. */
uint64_t value;
};
/*
 * Start-up parameters placed in the command input message when the TA is
 * loaded. The fields mirror struct ras_ta_init_param one to one.
 */
struct ras_ta_init_flags {
uint8_t poison_mode_en;
uint8_t dgpu_mode;
uint16_t xcc_mask;
uint8_t channel_dis_num;
uint8_t nps_mode;
uint32_t active_umc_mask;
};
/* MCA side of an address query: an error address plus the channel, UMC,
 * node and socket it belongs to.
 */
struct ras_ta_mca_addr {
uint64_t err_addr;
uint32_t ch_inst;
uint32_t umc_inst;
uint32_t node_inst;
uint32_t socket_id;
};
/* Physical side of an address query: an address plus bank and channel
 * index.
 */
struct ras_ta_phy_addr {
uint64_t pa;
uint32_t bank;
uint32_t channel_idx;
};
/*
 * Input of RAS_TA_CMD_ID__QUERY_ADDRESS. addr_type selects the direction
 * of the translation; presumably only the member matching the source side
 * needs to be filled in - TODO confirm.
 */
struct ras_ta_query_address_input {
enum ras_ta_address_type addr_type;
struct ras_ta_mca_addr ma;
struct ras_ta_phy_addr pa;
};
/* Flags the TA reports at the start of every output message. */
struct ras_ta_output_flags {
uint8_t ras_init_success_flag;
/* when set, the host logs "ECC switch disabled" and treats the command
 * as RAS_TA_STATUS__ERROR_RAS_NOT_AVAILABLE
 */
uint8_t err_inject_switch_disable_flag;
/* when set, the host logs that RAS register access was blocked */
uint8_t reg_access_failure_flag;
};
/* Output of RAS_TA_CMD_ID__QUERY_ADDRESS. */
struct ras_ta_query_address_output {
/* don't use the flags here */
struct ras_ta_output_flags flags;
struct ras_ta_mca_addr ma;
struct ras_ta_phy_addr pa;
};
/* Common input structure for RAS callbacks */
/**********************************************************/
/*
 * Input message of a RAS TA command. Which member is valid depends on the
 * command; reserve_pad reserves 256 dwords for the message.
 */
union ras_ta_cmd_input {
struct ras_ta_init_flags init_flags;
struct ras_ta_enable_features_input enable_features;
struct ras_ta_disable_features_input disable_features;
struct ras_ta_trigger_error_input trigger_error;
struct ras_ta_query_address_input address;
uint32_t reserve_pad[256];
};
/*
 * Output message of a RAS TA command. flags overlays the start of every
 * member; reserve_pad reserves 256 dwords for the message.
 */
union ras_ta_cmd_output {
struct ras_ta_output_flags flags;
struct ras_ta_query_address_output address;
uint32_t reserve_pad[256];
};
/* Layout of the command buffer shared between the host and the RAS TA. */
struct ras_ta_cmd {
/* enum ras_ta_cmd_id of the request */
uint32_t cmd_id;
/* not read by the host code in this change; see RSP_ID() */
uint32_t resp_id;
/* enum ras_ta_status reported by the TA */
uint32_t ras_status;
/* interface version reported by the TA, checked against
 * RAS_TA_HOST_IF_VER
 */
uint32_t if_version;
union ras_ta_cmd_input ras_in_message;
union ras_ta_cmd_output ras_out_message;
};
#endif