From abc3b5d21d34d21914b8e3caa75690f72baa2f36 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Wed, 31 Jan 2024 17:48:34 +0800 Subject: drm/amdgpu: add new aca_smu_type support Add new types to distinguish between ACA error type and smu mca type. e.g.: the ACA_ERROR_TYPE_DEFERRED is not matched any smu mca valid bank channel, so add new type 'aca_smu_type' to distinguish aca error type and smu mca type. Signed-off-by: Yang Wang Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h index 2da50e095883..a48f4abc345c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h @@ -99,7 +99,14 @@ enum aca_error_type { ACA_ERROR_TYPE_COUNT }; +enum aca_smu_type { + ACA_SMU_TYPE_UE = 0, + ACA_SMU_TYPE_CE, + ACA_SMU_TYPE_COUNT, +}; + struct aca_bank { + enum aca_smu_type type; u64 regs[ACA_MAX_REGS_COUNT]; }; @@ -157,9 +164,9 @@ struct aca_handle { }; struct aca_bank_ops { - int (*aca_bank_generate_report)(struct aca_handle *handle, struct aca_bank *bank, enum aca_error_type type, + int (*aca_bank_generate_report)(struct aca_handle *handle, struct aca_bank *bank, enum aca_smu_type type, struct aca_bank_report *report, void *data); - bool (*aca_bank_is_valid)(struct aca_handle *handle, struct aca_bank *bank, enum aca_error_type type, + bool (*aca_bank_is_valid)(struct aca_handle *handle, struct aca_bank *bank, enum aca_smu_type type, void *data); }; @@ -167,8 +174,8 @@ struct aca_smu_funcs { int max_ue_bank_count; int max_ce_bank_count; int (*set_debug_mode)(struct amdgpu_device *adev, bool enable); - int (*get_valid_aca_count)(struct amdgpu_device *adev, enum aca_error_type type, u32 *count); - int (*get_valid_aca_bank)(struct amdgpu_device *adev, enum aca_error_type type, int idx, struct aca_bank *bank); + int (*get_valid_aca_count)(struct amdgpu_device *adev, enum aca_smu_type type, u32 *count); + int (*get_valid_aca_bank)(struct amdgpu_device *adev, enum aca_smu_type type, int idx, struct aca_bank *bank); }; struct amdgpu_aca { -- cgit v1.2.3 From 949899cbacf54f1611a7c343093069462bbb6625 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Tue, 6 Feb 2024 13:24:16 +0800 Subject: drm/amdgpu: add new api to save error count into aca cache add new api to save error count into aca cache. Signed-off-by: Yang Wang Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c | 33 +++++++-------------------------- drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h | 4 +++- 2 files changed, 10 insertions(+), 27 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c index 2ad2d874f42f..7cbff74c0f2f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c @@ -274,25 +274,25 @@ static struct aca_bank_error *get_bank_error(struct aca_error *aerr, struct aca_ return new_bank_error(aerr, info); } -static int aca_log_errors(struct aca_handle *handle, enum aca_error_type type, - struct aca_bank_report *report) +int aca_error_cache_log_bank_error(struct aca_handle *handle, struct aca_bank_info *info, + enum aca_error_type type, u64 count) { struct aca_error_cache *error_cache = &handle->error_cache; struct aca_bank_error *bank_error; struct aca_error *aerr; - if (!handle || !report) + if (!handle || !info || type >= ACA_ERROR_TYPE_COUNT) return -EINVAL; - if (!report->count[type]) + if (!count) return 0; aerr = &error_cache->errors[type]; - bank_error = get_bank_error(aerr, &report->info); + bank_error = get_bank_error(aerr, info); if (!bank_error) return -ENOMEM; - bank_error->count[type] += report->count[type]; + bank_error->count += count; return 0; } @@ -317,31 +317,12 @@ static int handler_aca_log_bank_error(struct aca_handle *handle, struct aca_bank enum aca_smu_type smu_type, void *data) { struct aca_bank_report report; - enum aca_error_type type; int ret; - switch (smu_type) { - case ACA_SMU_TYPE_UE: - type = ACA_ERROR_TYPE_UE; - break; - case ACA_SMU_TYPE_CE: - type = ACA_ERROR_TYPE_CE; - break; - default: - return -EINVAL; - } - ret = aca_generate_bank_report(handle, bank, smu_type, &report); if (ret) return ret; - if (!report.count[type]) - return 0; - - ret = aca_log_errors(handle, type, &report); - if (ret) - return ret; - return 0; } @@ -440,7 +421,7 @@ static int aca_log_aca_error_data(struct aca_bank_error *bank_error, enum aca_er if (type >= ACA_ERROR_TYPE_COUNT) return -EINVAL; - count = bank_error->count[type]; + count = bank_error->count; if (!count) return 0; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h index a48f4abc345c..470efefcde10 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h @@ -130,7 +130,7 @@ struct aca_bank_report { struct aca_bank_error { struct list_head node; struct aca_bank_info info; - u64 count[ACA_ERROR_TYPE_COUNT]; + u64 count; }; struct aca_error { @@ -206,4 +206,6 @@ int amdgpu_aca_get_error_data(struct amdgpu_device *adev, struct aca_handle *han enum aca_error_type type, void *data); int amdgpu_aca_smu_set_debug_mode(struct amdgpu_device *adev, bool en); void amdgpu_aca_smu_debugfs_init(struct amdgpu_device *adev, struct dentry *root); +int aca_error_cache_log_bank_error(struct aca_handle *handle, struct aca_bank_info *info, + enum aca_error_type type, u64 count); #endif -- cgit v1.2.3 From e3d4de8d8b24933a9588019bf53891bca520b226 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Thu, 22 Feb 2024 14:03:14 +0800 Subject: drm/amdgpu: retire unused aca_bank_report data structure retire unused aca_bank_report data structure. Signed-off-by: Yang Wang Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c | 17 +++++++---------- drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h | 8 +------- drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 6 +++--- drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 8 ++++---- drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c | 7 +++---- drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 7 +++---- drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 6 +++--- 7 files changed, 24 insertions(+), 35 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c index 7cbff74c0f2f..a0b7f0d0c513 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c @@ -297,29 +297,26 @@ int aca_error_cache_log_bank_error(struct aca_handle *handle, struct aca_bank_in return 0; } -static int aca_generate_bank_report(struct aca_handle *handle, struct aca_bank *bank, - enum aca_smu_type type, struct aca_bank_report *report) +static int aca_bank_parser(struct aca_handle *handle, struct aca_bank *bank, enum aca_smu_type type) { const struct aca_bank_ops *bank_ops = handle->bank_ops; - if (!bank || !report) + if (!bank) return -EINVAL; - if (!bank_ops->aca_bank_generate_report) + if (!bank_ops->aca_bank_parser) return -EOPNOTSUPP; - memset(report, 0, sizeof(*report)); - return bank_ops->aca_bank_generate_report(handle, bank, type, - report, handle->data); + return bank_ops->aca_bank_parser(handle, bank, type, + handle->data); } static int handler_aca_log_bank_error(struct aca_handle *handle, struct aca_bank *bank, - enum aca_smu_type smu_type, void *data) + enum aca_smu_type type, void *data) { - struct aca_bank_report report; int ret; - ret = aca_generate_bank_report(handle, bank, smu_type, &report); + ret = aca_bank_parser(handle, bank, type); if (ret) return ret; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h index 470efefcde10..6eb4c0341278 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h @@ -122,11 +122,6 @@ struct aca_bank_info { int mcatype; }; -struct aca_bank_report { - struct aca_bank_info info; - u64 count[ACA_ERROR_TYPE_COUNT]; -}; - struct aca_bank_error { struct list_head node; struct aca_bank_info info; @@ -164,8 +159,7 @@ struct aca_handle { }; struct aca_bank_ops { - int (*aca_bank_generate_report)(struct aca_handle *handle, struct aca_bank *bank, enum aca_smu_type type, - struct aca_bank_report *report, void *data); + int (*aca_bank_parser)(struct aca_handle *handle, struct aca_bank *bank, enum aca_smu_type type, void *data); bool (*aca_bank_is_valid)(struct aca_handle *handle, struct aca_bank *bank, enum aca_smu_type type, void *data); }; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c index 3e7f0199926e..523827e8e7a8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c @@ -1035,8 +1035,8 @@ int amdgpu_xgmi_remove_device(struct amdgpu_device *adev) return 0; } -static int xgmi_v6_4_0_aca_bank_generate_report(struct aca_handle *handle, struct aca_bank *bank, enum aca_smu_type type, - struct aca_bank_report *report, void *data) +static int xgmi_v6_4_0_aca_bank_parser(struct aca_handle *handle, struct aca_bank *bank, + enum aca_smu_type type, void *data) { struct amdgpu_device *adev = handle->adev; struct aca_bank_info info; @@ -1075,7 +1075,7 @@ static int xgmi_v6_4_0_aca_bank_generate_report(struct aca_handle *handle, struc } static const struct aca_bank_ops xgmi_v6_4_0_aca_bank_ops = { - .aca_bank_generate_report = xgmi_v6_4_0_aca_bank_generate_report, + .aca_bank_parser = xgmi_v6_4_0_aca_bank_parser, }; static const struct aca_info xgmi_v6_4_0_aca_info = { diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c index cc7cccf0b2b2..fc33354f1d3d 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c @@ -680,9 +680,9 @@ static const struct amdgpu_gfx_funcs gfx_v9_4_3_gfx_funcs = { .ih_node_to_logical_xcc = &gfx_v9_4_3_ih_to_xcc_inst, }; -static int gfx_v9_4_3_aca_bank_generate_report(struct aca_handle *handle, - struct aca_bank *bank, enum aca_smu_type type, - struct aca_bank_report *report, void *data) +static int gfx_v9_4_3_aca_bank_parser(struct aca_handle *handle, + struct aca_bank *bank, enum aca_smu_type type, + void *data) { struct aca_bank_info info; u64 misc0; @@ -736,7 +736,7 @@ static bool gfx_v9_4_3_aca_bank_is_valid(struct aca_handle *handle, struct aca_b } static const struct aca_bank_ops gfx_v9_4_3_aca_bank_ops = { - .aca_bank_generate_report = gfx_v9_4_3_aca_bank_generate_report, + .aca_bank_parser = gfx_v9_4_3_aca_bank_parser, .aca_bank_is_valid = gfx_v9_4_3_aca_bank_is_valid, }; diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c index 50113cadaa0d..7a1ff298417a 100644 --- a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c +++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c @@ -721,9 +721,8 @@ static const struct amdgpu_ras_block_hw_ops mmhub_v1_8_ras_hw_ops = { .reset_ras_error_count = mmhub_v1_8_reset_ras_error_count, }; -static int mmhub_v1_8_aca_bank_generate_report(struct aca_handle *handle, - struct aca_bank *bank, enum aca_smu_type type, - struct aca_bank_report *report, void *data) +static int mmhub_v1_8_aca_bank_parser(struct aca_handle *handle, struct aca_bank *bank, + enum aca_smu_type type, void *data) { struct aca_bank_info info; u64 misc0; @@ -780,7 +779,7 @@ static bool mmhub_v1_8_aca_bank_is_valid(struct aca_handle *handle, struct aca_b } static const struct aca_bank_ops mmhub_v1_8_aca_bank_ops = { - .aca_bank_generate_report = mmhub_v1_8_aca_bank_generate_report, + .aca_bank_parser = mmhub_v1_8_aca_bank_parser, .aca_bank_is_valid = mmhub_v1_8_aca_bank_is_valid, }; diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c index 02fc11b98b20..71c2f50530cb 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c @@ -2189,9 +2189,8 @@ static const struct amdgpu_ras_block_hw_ops sdma_v4_4_2_ras_hw_ops = { .reset_ras_error_count = sdma_v4_4_2_reset_ras_error_count, }; -static int sdma_v4_4_2_aca_bank_generate_report(struct aca_handle *handle, - struct aca_bank *bank, enum aca_smu_type type, - struct aca_bank_report *report, void *data) +static int sdma_v4_4_2_aca_bank_parser(struct aca_handle *handle, struct aca_bank *bank, + enum aca_smu_type type, void *data) { struct aca_bank_info info; u64 misc0; @@ -2241,7 +2240,7 @@ static bool sdma_v4_4_2_aca_bank_is_valid(struct aca_handle *handle, struct aca_ } static const struct aca_bank_ops sdma_v4_4_2_aca_bank_ops = { - .aca_bank_generate_report = sdma_v4_4_2_aca_bank_generate_report, + .aca_bank_parser = sdma_v4_4_2_aca_bank_parser, .aca_bank_is_valid = sdma_v4_4_2_aca_bank_is_valid, }; diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c index b512a4b38067..e2e18ccbbc62 100644 --- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c @@ -504,8 +504,8 @@ const struct amdgpu_ras_block_hw_ops umc_v12_0_ras_hw_ops = { .query_ras_error_address = umc_v12_0_query_ras_error_address, }; -static int umc_v12_0_aca_bank_generate_report(struct aca_handle *handle, struct aca_bank *bank, enum aca_smu_type type, - struct aca_bank_report *report, void *data) +static int umc_v12_0_aca_bank_parser(struct aca_handle *handle, struct aca_bank *bank, + enum aca_smu_type type, void *data) { struct amdgpu_device *adev = handle->adev; struct aca_bank_info info; @@ -542,7 +542,7 @@ static int umc_v12_0_aca_bank_generate_report(struct aca_handle *handle, struct } static const struct aca_bank_ops umc_v12_0_aca_bank_ops = { - .aca_bank_generate_report = umc_v12_0_aca_bank_generate_report, + .aca_bank_parser = umc_v12_0_aca_bank_parser, }; const struct aca_info umc_v12_0_aca_info = { -- cgit v1.2.3 From bd15bf742f6d869998d331f01496e8ed54bcf237 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Sun, 3 Mar 2024 19:01:23 +0800 Subject: drm/amdgpu: avoid update aca bank multi times during ras isr Because the UE Valid MCA count will only be cleared after reset, in order to avoid repeated counting of the error count, the aca bank is only updated once during ras isr. Signed-off-by: Yang Wang Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c | 27 +++++++++++++++++++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h | 1 + 2 files changed, 28 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c index 611c1751577a..53ad76f590a1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c @@ -369,6 +369,26 @@ static int aca_dispatch_banks(struct aca_handle_manager *mgr, struct aca_banks * return 0; } +static bool aca_bank_should_update(struct amdgpu_device *adev, enum aca_smu_type type) +{ + struct amdgpu_aca *aca = &adev->aca; + bool ret = true; + + /* + * Because the UE Valid MCA count will only be cleared after reset, + * in order to avoid repeated counting of the error count, + * the aca bank is only updated once during the gpu recovery stage. + */ + if (type == ACA_SMU_TYPE_UE) { + if (amdgpu_ras_intr_triggered()) + ret = atomic_cmpxchg(&aca->ue_update_flag, 0, 1) == 0; + else + atomic_set(&aca->ue_update_flag, 0); + } + + return ret; +} + static int aca_banks_update(struct amdgpu_device *adev, enum aca_smu_type type, bank_handler_t handler, void *data) { @@ -380,6 +400,9 @@ static int aca_banks_update(struct amdgpu_device *adev, enum aca_smu_type type, if (list_empty(&aca->mgr.list)) return 0; + if (!aca_bank_should_update(adev, type)) + return 0; + ret = aca_smu_get_valid_aca_count(adev, type, &count); if (ret) return ret; @@ -670,6 +693,8 @@ int amdgpu_aca_init(struct amdgpu_device *adev) struct amdgpu_aca *aca = &adev->aca; int ret; + atomic_set(&aca->ue_update_flag, 0); + ret = aca_manager_init(&aca->mgr); if (ret) return ret; @@ -682,6 +707,8 @@ void amdgpu_aca_fini(struct amdgpu_device *adev) struct amdgpu_aca *aca = &adev->aca; aca_manager_fini(&aca->mgr); + + atomic_set(&aca->ue_update_flag, 0); } int amdgpu_aca_reset(struct amdgpu_device *adev) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h index 6eb4c0341278..674a5a9da862 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h @@ -175,6 +175,7 @@ struct aca_smu_funcs { struct amdgpu_aca { struct aca_handle_manager mgr; const struct aca_smu_funcs *smu_funcs; + atomic_t ue_update_flag; bool is_enabled; }; -- cgit v1.2.3 From 31fd330b97ba334e00ea6102dfb68565eeb7a23f Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Mon, 18 Mar 2024 09:32:36 +0800 Subject: drm/amdgpu: add ras event id support for ACA add ras event id support for ACA. Signed-off-by: Yang Wang Reviewed-by: Hawking Zhang Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c | 29 ++++++++++++++++------------- drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 11 ++++++----- 3 files changed, 23 insertions(+), 19 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c index 53ad76f590a1..c3242dbf8355 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c @@ -116,20 +116,22 @@ static struct aca_regs_dump { {"CONTROL_MASK", ACA_REG_IDX_CTL_MASK}, }; -static void aca_smu_bank_dump(struct amdgpu_device *adev, int idx, int total, struct aca_bank *bank) +static void aca_smu_bank_dump(struct amdgpu_device *adev, int idx, int total, struct aca_bank *bank, + struct ras_query_context *qctx) { + u64 event_id = qctx ? qctx->event_id : 0ULL; int i; - dev_info(adev->dev, HW_ERR "Accelerator Check Architecture events logged\n"); + RAS_EVENT_LOG(adev, event_id, HW_ERR "Accelerator Check Architecture events logged\n"); /* plus 1 for output format, e.g: ACA[08/08]: xxxx */ for (i = 0; i < ARRAY_SIZE(aca_regs); i++) - dev_info(adev->dev, HW_ERR "ACA[%02d/%02d].%s=0x%016llx\n", - idx + 1, total, aca_regs[i].name, bank->regs[aca_regs[i].reg_idx]); + RAS_EVENT_LOG(adev, event_id, HW_ERR "ACA[%02d/%02d].%s=0x%016llx\n", + idx + 1, total, aca_regs[i].name, bank->regs[aca_regs[i].reg_idx]); } static int aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum aca_smu_type type, int start, int count, - struct aca_banks *banks) + struct aca_banks *banks, struct ras_query_context *qctx) { struct amdgpu_aca *aca = &adev->aca; const struct aca_smu_funcs *smu_funcs = aca->smu_funcs; @@ -165,7 +167,7 @@ static int aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum aca_smu_ bank.type = type; - aca_smu_bank_dump(adev, i, count, &bank); + aca_smu_bank_dump(adev, i, count, &bank, qctx); ret = aca_banks_add_bank(banks, &bank); if (ret) @@ -390,7 +392,7 @@ static bool aca_bank_should_update(struct amdgpu_device *adev, enum aca_smu_type } static int aca_banks_update(struct amdgpu_device *adev, enum aca_smu_type type, - bank_handler_t handler, void *data) + bank_handler_t handler, struct ras_query_context *qctx, void *data) { struct amdgpu_aca *aca = &adev->aca; struct aca_banks banks; @@ -412,7 +414,7 @@ static int aca_banks_update(struct amdgpu_device *adev, enum aca_smu_type type, aca_banks_init(&banks); - ret = aca_smu_get_valid_aca_banks(adev, type, 0, count, &banks); + ret = aca_smu_get_valid_aca_banks(adev, type, 0, count, &banks, qctx); if (ret) goto err_release_banks; @@ -489,7 +491,7 @@ out_unlock: } static int __aca_get_error_data(struct amdgpu_device *adev, struct aca_handle *handle, enum aca_error_type type, - struct ras_err_data *err_data) + struct ras_err_data *err_data, struct ras_query_context *qctx) { enum aca_smu_type smu_type; int ret; @@ -507,7 +509,7 @@ static int __aca_get_error_data(struct amdgpu_device *adev, struct aca_handle *h } /* udpate aca bank to aca source error_cache first */ - ret = aca_banks_update(adev, smu_type, handler_aca_log_bank_error, NULL); + ret = aca_banks_update(adev, smu_type, handler_aca_log_bank_error, qctx, NULL); if (ret) return ret; @@ -523,7 +525,7 @@ static bool aca_handle_is_valid(struct aca_handle *handle) } int amdgpu_aca_get_error_data(struct amdgpu_device *adev, struct aca_handle *handle, - enum aca_error_type type, void *data) + enum aca_error_type type, void *data, void *qctx) { struct ras_err_data *err_data = (struct ras_err_data *)data; @@ -536,7 +538,8 @@ int amdgpu_aca_get_error_data(struct amdgpu_device *adev, struct aca_handle *han if (!(BIT(type) & handle->mask)) return 0; - return __aca_get_error_data(adev, handle, type, err_data); + return __aca_get_error_data(adev, handle, type, err_data, + (struct ras_query_context *)qctx); } static void aca_error_init(struct aca_error *aerr, enum aca_error_type type) @@ -853,7 +856,7 @@ static int aca_dump_show(struct seq_file *m, enum aca_smu_type type) .idx = 0, }; - return aca_banks_update(adev, type, handler_aca_bank_dump, (void *)&context); + return aca_banks_update(adev, type, handler_aca_bank_dump, NULL, (void *)&context); } static int aca_dump_ce_show(struct seq_file *m, void *unused) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h index 674a5a9da862..247968d6a925 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h @@ -198,7 +198,7 @@ int amdgpu_aca_add_handle(struct amdgpu_device *adev, struct aca_handle *handle, const char *name, const struct aca_info *aca_info, void *data); void amdgpu_aca_remove_handle(struct aca_handle *handle); int amdgpu_aca_get_error_data(struct amdgpu_device *adev, struct aca_handle *handle, - enum aca_error_type type, void *data); + enum aca_error_type type, void *data, void *qctx); int amdgpu_aca_smu_set_debug_mode(struct amdgpu_device *adev, bool en); void amdgpu_aca_smu_debugfs_init(struct amdgpu_device *adev, struct dentry *root); int aca_error_cache_log_bank_error(struct aca_handle *handle, struct aca_bank_info *info, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 2bc02b809246..b8c7d0bf8fb1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1269,7 +1269,8 @@ int amdgpu_ras_unbind_aca(struct amdgpu_device *adev, enum amdgpu_ras_block blk) } static int amdgpu_aca_log_ras_error_data(struct amdgpu_device *adev, enum amdgpu_ras_block blk, - enum aca_error_type type, struct ras_err_data *err_data) + enum aca_error_type type, struct ras_err_data *err_data, + struct ras_query_context *qctx) { struct ras_manager *obj; @@ -1277,7 +1278,7 @@ static int amdgpu_aca_log_ras_error_data(struct amdgpu_device *adev, enum amdgpu if (!obj) return -EINVAL; - return amdgpu_aca_get_error_data(adev, &obj->aca_handle, type, err_data); + return amdgpu_aca_get_error_data(adev, &obj->aca_handle, type, err_data, qctx); } ssize_t amdgpu_ras_aca_sysfs_read(struct device *dev, struct device_attribute *attr, @@ -1334,15 +1335,15 @@ static int amdgpu_ras_query_error_status_helper(struct amdgpu_device *adev, } } else { if (amdgpu_aca_is_enabled(adev)) { - ret = amdgpu_aca_log_ras_error_data(adev, blk, ACA_ERROR_TYPE_UE, err_data); + ret = amdgpu_aca_log_ras_error_data(adev, blk, ACA_ERROR_TYPE_UE, err_data, qctx); if (ret) return ret; - ret = amdgpu_aca_log_ras_error_data(adev, blk, ACA_ERROR_TYPE_CE, err_data); + ret = amdgpu_aca_log_ras_error_data(adev, blk, ACA_ERROR_TYPE_CE, err_data, qctx); if (ret) return ret; - ret = amdgpu_aca_log_ras_error_data(adev, blk, ACA_ERROR_TYPE_DEFERRED, err_data); + ret = amdgpu_aca_log_ras_error_data(adev, blk, ACA_ERROR_TYPE_DEFERRED, err_data, qctx); if (ret) return ret; } else { -- cgit v1.2.3 From 81d96e8b5a85e3ebb85342525aab02e89bec72dc Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Thu, 28 Mar 2024 13:46:14 +0800 Subject: drm/amdgpu: refine function signature of amdgpu_aca_get_error_data() refine function signature of amdgpu_aca_get_error_data(); Signed-off-by: Yang Wang Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c | 8 +++----- drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h | 6 +++++- 2 files changed, 8 insertions(+), 6 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c index c3242dbf8355..c02634a124dd 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c @@ -525,10 +525,9 @@ static bool aca_handle_is_valid(struct aca_handle *handle) } int amdgpu_aca_get_error_data(struct amdgpu_device *adev, struct aca_handle *handle, - enum aca_error_type type, void *data, void *qctx) + enum aca_error_type type, struct ras_err_data *err_data, + struct ras_query_context *qctx) { - struct ras_err_data *err_data = (struct ras_err_data *)data; - if (!handle || !err_data) return -EINVAL; @@ -538,8 +537,7 @@ int amdgpu_aca_get_error_data(struct amdgpu_device *adev, struct aca_handle *han if (!(BIT(type) & handle->mask)) return 0; - return __aca_get_error_data(adev, handle, type, err_data, - (struct ras_query_context *)qctx); + return __aca_get_error_data(adev, handle, type, err_data, qctx); } static void aca_error_init(struct aca_error *aerr, enum aca_error_type type) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h index 247968d6a925..3765843ea648 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h @@ -26,6 +26,9 @@ #include +struct ras_err_data; +struct ras_query_context; + #define ACA_MAX_REGS_COUNT (16) #define ACA_REG_FIELD(x, h, l) (((x) & GENMASK_ULL(h, l)) >> l) @@ -198,7 +201,8 @@ int amdgpu_aca_add_handle(struct amdgpu_device *adev, struct aca_handle *handle, const char *name, const struct aca_info *aca_info, void *data); void amdgpu_aca_remove_handle(struct aca_handle *handle); int amdgpu_aca_get_error_data(struct amdgpu_device *adev, struct aca_handle *handle, - enum aca_error_type type, void *data, void *qctx); + enum aca_error_type type, struct ras_err_data *err_data, + struct ras_query_context *qctx); int amdgpu_aca_smu_set_debug_mode(struct amdgpu_device *adev, bool en); void amdgpu_aca_smu_debugfs_init(struct amdgpu_device *adev, struct dentry *root); int aca_error_cache_log_bank_error(struct aca_handle *handle, struct aca_bank_info *info, -- cgit v1.2.3 From f23558627f2bb28df3b7f3d1d53b1e7f4bd1e250 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Fri, 12 Apr 2024 09:53:15 +0800 Subject: drm/amdgpu: add new aca smu callback func parse_error_code() add new aca smu callback parse_error_code{} to avoid specific asic check in amdgpu_aca.c file Signed-off-by: Yang Wang Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c | 23 ++++++++-------------- drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h | 1 + .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 13 ++++++++++++ 3 files changed, 22 insertions(+), 15 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c index c02634a124dd..c50202215f6b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c @@ -753,23 +753,13 @@ int aca_bank_info_decode(struct aca_bank *bank, struct aca_bank_info *info) static int aca_bank_get_error_code(struct amdgpu_device *adev, struct aca_bank *bank) { - int error_code; - - switch (amdgpu_ip_version(adev, MP1_HWIP, 0)) { - case IP_VERSION(13, 0, 6): - if (!(adev->flags & AMD_IS_APU) && adev->pm.fw_version >= 0x00555600) { - error_code = ACA_REG__SYND__ERRORINFORMATION(bank->regs[ACA_REG_IDX_SYND]); - return error_code & 0xff; - } - break; - default: - break; - } + struct amdgpu_aca *aca = &adev->aca; + const struct aca_smu_funcs *smu_funcs = aca->smu_funcs; - /* NOTE: the true error code is encoded in status.errorcode[0:7] */ - error_code = ACA_REG__STATUS__ERRORCODE(bank->regs[ACA_REG_IDX_STATUS]); + if (!smu_funcs || !smu_funcs->parse_error_code) + return -EOPNOTSUPP; - return error_code & 0xff; + return smu_funcs->parse_error_code(adev, bank); } int aca_bank_check_error_codes(struct amdgpu_device *adev, struct aca_bank *bank, int *err_codes, int size) @@ -780,6 +770,9 @@ int aca_bank_check_error_codes(struct amdgpu_device *adev, struct aca_bank *bank return -EINVAL; error_code = aca_bank_get_error_code(adev, bank); + if (error_code < 0) + return error_code; + for (i = 0; i < size; i++) { if (err_codes[i] == error_code) return 0; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h index 3765843ea648..5ef6b745f222 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h @@ -173,6 +173,7 @@ struct aca_smu_funcs { int (*set_debug_mode)(struct amdgpu_device *adev, bool enable); int (*get_valid_aca_count)(struct amdgpu_device *adev, enum aca_smu_type type, u32 *count); int (*get_valid_aca_bank)(struct amdgpu_device *adev, enum aca_smu_type type, int idx, struct aca_bank *bank); + int (*parse_error_code)(struct amdgpu_device *adev, struct aca_bank *bank); }; struct amdgpu_aca { diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c index 16179e170874..bc1f3fe4d82a 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c @@ -3118,12 +3118,25 @@ static int aca_smu_get_valid_aca_bank(struct amdgpu_device *adev, return 0; } +static int aca_smu_parse_error_code(struct amdgpu_device *adev, struct aca_bank *bank) +{ + int error_code; + + if (!(adev->flags & AMD_IS_APU) && adev->pm.fw_version >= 0x00555600) + error_code = ACA_REG__SYND__ERRORINFORMATION(bank->regs[ACA_REG_IDX_SYND]); + else + error_code = ACA_REG__STATUS__ERRORCODE(bank->regs[ACA_REG_IDX_STATUS]); + + return error_code & 0xff; +} + static const struct aca_smu_funcs smu_v13_0_6_aca_smu_funcs = { .max_ue_bank_count = 12, .max_ce_bank_count = 12, .set_debug_mode = aca_smu_set_debug_mode, .get_valid_aca_count = aca_smu_get_valid_aca_count, .get_valid_aca_bank = aca_smu_get_valid_aca_bank, + .parse_error_code = aca_smu_parse_error_code, }; static int smu_v13_0_6_select_xgmi_plpd_policy(struct smu_context *smu, -- cgit v1.2.3