From ab66c832847fcdffc97d4591ba5547e3990d9d33 Mon Sep 17 00:00:00 2001 From: Zhigang Luo Date: Thu, 29 Feb 2024 16:04:35 -0500 Subject: drm/amdgpu: trigger flr_work if reading pf2vf data failed if reading pf2vf data failed 30 times continuously, it means something is wrong. Need to trigger flr_work to recover the issue. also use dev_err to print the error message to get which device has issue and add warning message if waiting IDH_FLR_NOTIFICATION_CMPL timeout. Signed-off-by: Zhigang Luo Acked-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c index 7a4eae36778a..aed60aaf1a55 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c @@ -32,6 +32,7 @@ #include "amdgpu.h" #include "amdgpu_ras.h" +#include "amdgpu_reset.h" #include "vi.h" #include "soc15.h" #include "nv.h" @@ -424,7 +425,7 @@ static int amdgpu_virt_read_pf2vf_data(struct amdgpu_device *adev) return -EINVAL; if (pf2vf_info->size > 1024) { - DRM_ERROR("invalid pf2vf message size\n"); + dev_err(adev->dev, "invalid pf2vf message size: 0x%x\n", pf2vf_info->size); return -EINVAL; } @@ -435,7 +436,9 @@ static int amdgpu_virt_read_pf2vf_data(struct amdgpu_device *adev) adev->virt.fw_reserve.p_pf2vf, pf2vf_info->size, adev->virt.fw_reserve.checksum_key, checksum); if (checksum != checkval) { - DRM_ERROR("invalid pf2vf message\n"); + dev_err(adev->dev, + "invalid pf2vf message: header checksum=0x%x calculated checksum=0x%x\n", + checksum, checkval); return -EINVAL; } @@ -449,7 +452,9 @@ static int amdgpu_virt_read_pf2vf_data(struct amdgpu_device *adev) adev->virt.fw_reserve.p_pf2vf, pf2vf_info->size, 0, checksum); if (checksum != checkval) { - DRM_ERROR("invalid pf2vf message\n"); + dev_err(adev->dev, + "invalid pf2vf message: header checksum=0x%x calculated checksum=0x%x\n", + checksum, checkval); return -EINVAL; } @@ -485,7 +490,7 @@ static int amdgpu_virt_read_pf2vf_data(struct amdgpu_device *adev) ((struct amd_sriov_msg_pf2vf_info *)pf2vf_info)->uuid; break; default: - DRM_ERROR("invalid pf2vf version\n"); + dev_err(adev->dev, "invalid pf2vf version: 0x%x\n", pf2vf_info->version); return -EINVAL; } @@ -584,8 +589,21 @@ static void amdgpu_virt_update_vf2pf_work_item(struct work_struct *work) int ret; ret = amdgpu_virt_read_pf2vf_data(adev); - if (ret) + if (ret) { + adev->virt.vf2pf_update_retry_cnt++; + if ((adev->virt.vf2pf_update_retry_cnt >= AMDGPU_VF2PF_UPDATE_MAX_RETRY_LIMIT) && + amdgpu_sriov_runtime(adev) && !amdgpu_in_reset(adev)) { + if (amdgpu_reset_domain_schedule(adev->reset_domain, + &adev->virt.flr_work)) + return; + else + dev_err(adev->dev, "Failed to queue work! at %s", __func__); + } + goto out; + } + + adev->virt.vf2pf_update_retry_cnt = 0; amdgpu_virt_write_vf2pf_data(adev); out: @@ -606,6 +624,7 @@ void amdgpu_virt_init_data_exchange(struct amdgpu_device *adev) adev->virt.fw_reserve.p_pf2vf = NULL; adev->virt.fw_reserve.p_vf2pf = NULL; adev->virt.vf2pf_update_interval_ms = 0; + adev->virt.vf2pf_update_retry_cnt = 0; if (adev->mman.fw_vram_usage_va && adev->mman.drv_vram_usage_va) { DRM_WARN("Currently fw_vram and drv_vram should not have values at the same time!"); -- cgit v1.2.3 From 3e2dacca540643ee35e3deb1d60873e7138a6af3 Mon Sep 17 00:00:00 2001 From: Danijel Slivka Date: Wed, 27 Mar 2024 23:56:23 +0100 Subject: drm/amdgpu: use vm_update_mode=0 as default in sriov for gfx10.3 onwards Apply this rule to all newer asics in sriov case. For asic with VF MMIO access protection avoid using CPU for VM table updates. CPU pagetable updates have issues with HDP flush as VF MMIO access protection blocks write to BIF_BX_DEV0_EPF0_VF0_HDP_MEM_COHERENCY_FLUSH_CNTL register during sriov runtime. Moved the check to amdgpu_device_init() to ensure it is done after amdgpu_device_ip_early_init() where the IP versions are discovered. Signed-off-by: Danijel Slivka Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 7 +++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 6 ------ 2 files changed, 7 insertions(+), 6 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 12dc71a6b5db..66a5979e075d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4072,6 +4072,13 @@ int amdgpu_device_init(struct amdgpu_device *adev, /* Enable TMZ based on IP_VERSION */ amdgpu_gmc_tmz_set(adev); + if (amdgpu_sriov_vf(adev) && + amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 3, 0)) + /* VF MMIO access (except mailbox range) from CPU + * will be blocked during sriov runtime + */ + adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT; + amdgpu_gmc_noretry_set(adev); /* Need to get xgmi info early to decide the reset behavior*/ if (adev->gmc.xgmi.supported) { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c index aed60aaf1a55..6f01de220c44 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c @@ -724,12 +724,6 @@ void amdgpu_detect_virtualization(struct amdgpu_device *adev) adev->virt.caps |= AMDGPU_PASSTHROUGH_MODE; } - if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) - /* VF MMIO access (except mailbox range) from CPU - * will be blocked during sriov runtime - */ - adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT; - /* we have the ability to check now */ if (amdgpu_sriov_vf(adev)) { switch (adev->asic_type) { -- cgit v1.2.3 From f6ac0842364a5721c02e9dd1c956eb51c7431ff3 Mon Sep 17 00:00:00 2001 From: chongli2 Date: Tue, 26 Mar 2024 13:24:21 +0800 Subject: drm/amd/amdgpu: support MES command SET_HW_RESOURCE1 in sriov support MES command SET_HW_RESOURCE1 in sriov Signed-off-by: chongli2 Reviewed-by: Jingwen Chen Acked-by: Jingwen Chen Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h | 6 ++++ drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 5 ++++ drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 4 +++ drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h | 9 ++++-- drivers/gpu/drm/amd/amdgpu/mes_v11_0.c | 43 +++++++++++++++++++++++++++ drivers/gpu/drm/amd/include/mes_v11_api_def.h | 21 +++++++++++++ 6 files changed, 85 insertions(+), 3 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h index 4c8fc3117ef8..6b3e1844eac5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h @@ -141,6 +141,12 @@ struct amdgpu_mes { /* ip specific functions */ const struct amdgpu_mes_funcs *funcs; + + /* mes resource_1 bo*/ + struct amdgpu_bo *resource_1; + uint64_t resource_1_gpu_addr; + void *resource_1_addr; + }; struct amdgpu_mes_process { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c index 6f01de220c44..461753904dea 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c @@ -576,6 +576,11 @@ static int amdgpu_virt_write_vf2pf_data(struct amdgpu_device *adev) vf2pf_info->decode_usage = 0; vf2pf_info->dummy_page_addr = (uint64_t)adev->dummy_page_addr; + vf2pf_info->mes_info_addr = (uint64_t)adev->mes.resource_1_gpu_addr; + + if (adev->mes.resource_1) { + vf2pf_info->mes_info_size = adev->mes.resource_1->tbo.base.size; + } vf2pf_info->checksum = amd_sriov_msg_checksum( vf2pf_info, vf2pf_info->header.size, 0, 0); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h index a858bc98cad4..a9f2f0c4f799 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h @@ -132,6 +132,8 @@ enum AMDGIM_FEATURE_FLAG { AMDGIM_FEATURE_AV1_SUPPORT = (1 << 6), /* VCN RB decouple */ AMDGIM_FEATURE_VCN_RB_DECOUPLE = (1 << 7), + /* MES info */ + AMDGIM_FEATURE_MES_INFO_ENABLE = (1 << 8), }; enum AMDGIM_REG_ACCESS_FLAG { @@ -335,6 +337,8 @@ static inline bool is_virtual_machine(void) ((adev)->virt.gim_feature & AMDGIM_FEATURE_AV1_SUPPORT) #define amdgpu_sriov_is_vcn_rb_decouple(adev) \ ((adev)->virt.gim_feature & AMDGIM_FEATURE_VCN_RB_DECOUPLE) +#define amdgpu_sriov_is_mes_info_enable(adev) \ + ((adev)->virt.gim_feature & AMDGIM_FEATURE_MES_INFO_ENABLE) bool amdgpu_virt_mmio_blocked(struct amdgpu_device *adev); void amdgpu_virt_init_setting(struct amdgpu_device *adev); int amdgpu_virt_request_full_gpu(struct amdgpu_device *adev, bool init); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h b/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h index 51a14f6d93bd..0de78d6a83fe 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h @@ -94,7 +94,8 @@ union amd_sriov_msg_feature_flags { uint32_t reg_indirect_acc : 1; uint32_t av1_support : 1; uint32_t vcn_rb_decouple : 1; - uint32_t reserved : 24; + uint32_t mes_info_enable : 1; + uint32_t reserved : 23; } flags; uint32_t all; }; @@ -221,7 +222,7 @@ struct amd_sriov_msg_vf2pf_info_header { uint32_t reserved[2]; }; -#define AMD_SRIOV_MSG_VF2PF_INFO_FILLED_SIZE (70) +#define AMD_SRIOV_MSG_VF2PF_INFO_FILLED_SIZE (73) struct amd_sriov_msg_vf2pf_info { /* header contains size and version */ struct amd_sriov_msg_vf2pf_info_header header; @@ -265,7 +266,9 @@ struct amd_sriov_msg_vf2pf_info { uint32_t version; } ucode_info[AMD_SRIOV_MSG_RESERVE_UCODE]; uint64_t dummy_page_addr; - + /* FB allocated for guest MES to record UQ info */ + uint64_t mes_info_addr; + uint32_t mes_info_size; /* reserved */ uint32_t reserved[256 - AMD_SRIOV_MSG_VF2PF_INFO_FILLED_SIZE]; }; diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c index 63f281a9984d..e5230078a4cd 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c @@ -422,6 +422,36 @@ static int mes_v11_0_set_hw_resources(struct amdgpu_mes *mes) offsetof(union MESAPI_SET_HW_RESOURCES, api_status)); } +static int mes_v11_0_set_hw_resources_1(struct amdgpu_mes *mes) +{ + int size = 128 * PAGE_SIZE; + int ret = 0; + struct amdgpu_device *adev = mes->adev; + union MESAPI_SET_HW_RESOURCES_1 mes_set_hw_res_pkt; + memset(&mes_set_hw_res_pkt, 0, sizeof(mes_set_hw_res_pkt)); + + mes_set_hw_res_pkt.header.type = MES_API_TYPE_SCHEDULER; + mes_set_hw_res_pkt.header.opcode = MES_SCH_API_SET_HW_RSRC_1; + mes_set_hw_res_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS; + mes_set_hw_res_pkt.enable_mes_info_ctx = 1; + + ret = amdgpu_bo_create_kernel(adev, size, PAGE_SIZE, + AMDGPU_GEM_DOMAIN_VRAM, + &mes->resource_1, + &mes->resource_1_gpu_addr, + &mes->resource_1_addr); + if (ret) { + dev_err(adev->dev, "(%d) failed to create mes resource_1 bo\n", ret); + return ret; + } + + mes_set_hw_res_pkt.mes_info_ctx_mc_addr = mes->resource_1_gpu_addr; + mes_set_hw_res_pkt.mes_info_ctx_size = mes->resource_1->tbo.base.size; + return mes_v11_0_submit_pkt_and_poll_completion(mes, + &mes_set_hw_res_pkt, sizeof(mes_set_hw_res_pkt), + offsetof(union MESAPI_SET_HW_RESOURCES_1, api_status)); +} + static const struct amdgpu_mes_funcs mes_v11_0_funcs = { .add_hw_queue = mes_v11_0_add_hw_queue, .remove_hw_queue = mes_v11_0_remove_hw_queue, @@ -1203,6 +1233,14 @@ static int mes_v11_0_hw_init(void *handle) if (r) goto failure; + if (amdgpu_sriov_is_mes_info_enable(adev)) { + r = mes_v11_0_set_hw_resources_1(&adev->mes); + if (r) { + DRM_ERROR("failed mes_v11_0_set_hw_resources_1, r=%d\n", r); + goto failure; + } + } + r = mes_v11_0_query_sched_status(&adev->mes); if (r) { DRM_ERROR("MES is busy\n"); @@ -1226,6 +1264,11 @@ failure: static int mes_v11_0_hw_fini(void *handle) { + struct amdgpu_device *adev = (struct amdgpu_device *)handle; + if (amdgpu_sriov_is_mes_info_enable(adev)) { + amdgpu_bo_free_kernel(&adev->mes.resource_1, &adev->mes.resource_1_gpu_addr, + &adev->mes.resource_1_addr); + } return 0; } diff --git a/drivers/gpu/drm/amd/include/mes_v11_api_def.h b/drivers/gpu/drm/amd/include/mes_v11_api_def.h index ec5b9ab67c5e..410c8d664336 100644 --- a/drivers/gpu/drm/amd/include/mes_v11_api_def.h +++ b/drivers/gpu/drm/amd/include/mes_v11_api_def.h @@ -61,6 +61,7 @@ enum MES_SCH_API_OPCODE { MES_SCH_API_MISC = 14, MES_SCH_API_UPDATE_ROOT_PAGE_TABLE = 15, MES_SCH_API_AMD_LOG = 16, + MES_SCH_API_SET_HW_RSRC_1 = 19, MES_SCH_API_MAX = 0xFF }; @@ -238,6 +239,26 @@ union MESAPI_SET_HW_RESOURCES { uint32_t max_dwords_in_api[API_FRAME_SIZE_IN_DWORDS]; }; +union MESAPI_SET_HW_RESOURCES_1 { + struct { + union MES_API_HEADER header; + struct MES_API_STATUS api_status; + uint64_t timestamp; + union { + struct { + uint32_t enable_mes_info_ctx : 1; + uint32_t reserved : 31; + }; + uint32_t uint32_all; + }; + uint64_t mes_info_ctx_mc_addr; + uint32_t mes_info_ctx_size; + uint32_t mes_kiq_unmap_timeout; // unit is 100ms + }; + + uint32_t max_dwords_in_api[API_FRAME_SIZE_IN_DWORDS]; +}; + union MESAPI__ADD_QUEUE { struct { union MES_API_HEADER header; -- cgit v1.2.3 From d1999b4017d485a3168b4ba1316937c82454165a Mon Sep 17 00:00:00 2001 From: Zhigang Luo Date: Wed, 20 Mar 2024 10:40:27 -0400 Subject: amd/amdgpu: improve VF recover time 1. change AMDGPU_VF2PF_UPDATE_MAX_RETRY_LIMIT from 30 to 5. 2. set fatel error detected flag. Signed-off-by: Zhigang Luo Reviewed-by: Lijo Lazar Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 2 +- 3 files changed, 3 insertions(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index f830024cffca..170da4551ed5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4994,6 +4994,7 @@ retry: r = amdgpu_virt_reset_gpu(adev); if (r) return r; + amdgpu_ras_set_fed(adev, false); amdgpu_irq_gpu_reset_resume_helper(adev); /* some sw clean up VF needs to do before recover */ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c index 461753904dea..54ab51a4ada7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c @@ -598,6 +598,7 @@ static void amdgpu_virt_update_vf2pf_work_item(struct work_struct *work) adev->virt.vf2pf_update_retry_cnt++; if ((adev->virt.vf2pf_update_retry_cnt >= AMDGPU_VF2PF_UPDATE_MAX_RETRY_LIMIT) && amdgpu_sriov_runtime(adev) && !amdgpu_in_reset(adev)) { + amdgpu_ras_set_fed(adev, true); if (amdgpu_reset_domain_schedule(adev->reset_domain, &adev->virt.flr_work)) return; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h index a9f2f0c4f799..642f1fd287d8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h @@ -52,7 +52,7 @@ /* tonga/fiji use this offset */ #define mmBIF_IOV_FUNC_IDENTIFIER 0x1503 -#define AMDGPU_VF2PF_UPDATE_MAX_RETRY_LIMIT 30 +#define AMDGPU_VF2PF_UPDATE_MAX_RETRY_LIMIT 5 enum amdgpu_sriov_vf_mode { SRIOV_VF_MODE_BARE_METAL = 0, -- cgit v1.2.3