aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2024-05-15 09:43:42 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2024-05-15 09:43:42 -0700
commitdb5d28c0bfe566908719bec8e25443aabecbb802 (patch)
treec113e307ba7a5964ff174f590cd58bce07e2e4ee /drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
parentMerge tag 'asymmetric-keys-next-6.10-rc1' of git://git.kernel.org/pub/scm/lin... (diff)
parentMerge tag 'drm-xe-next-fixes-2024-05-09-1' of https://gitlab.freedesktop.org/... (diff)
downloadlinux-db5d28c0bfe566908719bec8e25443aabecbb802.tar.gz
linux-db5d28c0bfe566908719bec8e25443aabecbb802.zip
Merge tag 'drm-next-2024-05-15' of https://gitlab.freedesktop.org/drm/kernel
Pull drm updates from Dave Airlie: "This is the main pull request for the drm subsystems for 6.10. In drivers the main thing is a new driver for ARM Mali firmware based GPUs, otherwise there are a lot of changes to amdgpu/xe/i915/msm and scattered changes to everything else. In the core a bunch of headers and Kconfig was refactored, along with the addition of a new panic handler which is meant to provide a user friendly message when a panic happens and graphical display is enabled. New drivers: - panthor: ARM Mali/Immortalis CSF-based GPU driver Core: - add a CONFIG_DRM_WERROR option - make more headers self-contained - grab resv lock in pin/unpin - fix vmap resv locking - EDID/eDP panel matching - Kconfig cleanups - DT sound bindings - Add SIZE_HINTS property for cursor planes - Add struct drm_edid_product_id and helpers. - Use drm device based logging in more drm functions. - drop seq_file.h from a bunch of places - use drm_edid driver conversions dp: - DP Tunnel documentation - MST read sideband cap - Adaptive sync SDP prep work ttm: - improve placement for TTM BOs in idle/busy handling panic: - Fixes for drm-panic, and option to test it. - Add drm panic to simpledrm, mgag200, imx, ast bridge: - improve init ordering - adv7511: allow GPIO pin sharing - tc358775: add tc358675 support panel: - AUO B120XAN01.0 - Samsung s6e3fa7 - BOE NT116WHM-N44 - CMN N116BCA-EA1, - CrystalClear CMT430B19N00 - Startek KD050HDFIA020-C020A - powertip PH128800T006-ZHC01 - Innolux G121X1-L03 - LG sw43408 - Khadas TS050 V2 - EDO RM69380 OLED - CSOT MNB601LS1-1 amdgpu: - HDCP/ODM/RAS fixes - Devcoredump improvements - Expose VCN activity via sysfs - SMY 13.0.x updates - Enable fast updates on DCN 3.1.4 - Add dclk and vclk reporting on additional devices - Add ACA RAS infrastructure - Implement TLB flush fence - EEPROM handling fixes - SMUIO 14.0.2 support - SMU 14.0.1 Updates - SMU 14.0.2 support - Sync page table freeing with TLB flushes - DML2 refactor - DC debug improvements - DCN 3.5.x Updates - GPU reset fixes - HDP fix for second GFX pipe on GC 10.x - Enable secondary GFX pipe on GC 10.3 - Refactor and clean up BACO/BOCO/BAMACO handling - Remove invalid TTM resource start check - UAF fix in VA IOCTL - GPUVM page fault redirection to secondary IH rings for IH 6.x - Initial support for mapping kernel queues via MES - Fix VRAM memory accounting amdkfd: - MQD handling cleanup - Preemption handling fixes for XCDs - TLB flush fix for GC 9.4.2 - Properly clean up workqueue during module unload - Fix memory leak process create failure - Range check CP bad op exception targets to avoid reporting invalid exceptions to userspace - Fix eviction fence handling - Fix leak in GPU memory allocation failure case - DMABuf import handling fix - Enable SQ watchpoint for gfx10 i915: - Adding new DG2 PCI ID - add context hints for GT frequency - enable only one CCS for compute workloads - new workarounds - Fix UAF on destroy against retire race and remove two earlier partial fixes - Limit the reserved VM space to only the platforms that need it - Fix gt reset with GuC submission is disable - Add and use gt_to_guc() wrapper i915/xe display: - Lunar Lake display enabling, including cdclk and other refactors - BIOS/VBT/opregion related refactor - Digital port related refactor/clean-up - Fix 2s boot time regression on DP panel replay init - Remove duplication on audio enable/disable on SDVO and g4x+ DP - Disable AuxCCS framebuffers if built for Xe - Make crtc disable more atomic - Increase DP idle pattern wait timeout to 2ms - Start using container_of_const() for some extra const safety - Fix Jasper Lake boot freeze - Enable MST mode for 128b/132b single-stream sideband - Enable Adaptive Sync SDP Support for DP - Fix MTL supported DP rates - removal of UHBR13.5 - PLL refactoring - Limit eDP MSO pipe only for display version 20 - More display refactor towards independence from i915 dev_priv - Convert i915/xe fbdev to DRM client - More initial work to make display code more independent from i915 xe: - improved error capture - clean up some uAPI leftovers - devcoredump update - Add BMG mocs table - Handle GSCCS ER interrupt - Implement xe2- and GuC workarounds - struct xe_device cleanup - Hwmon updates - Add LRC parsing for more GPU instruction - Increase VM_BIND number of per-ioctl Ops - drm/xe: Add XE_BO_GGTT_INVALIDATE flag - Initial development for SR-IOV support - Add new PCI IDs to DG2 platform - Move userptr over to start using hmm_range_fault msm: - Switched to generating register header files during build process instead of shipping pre-generated headers - Merged DPU and MDP4 format databases. - DP: - Stop using compat string to distinguish DP and eDP cases - Added support for X Elite platform (X1E80100) - Reworked DP aux/audio support - Added SM6350 DP to the bindings - GPU: - a7xx perfcntr reg fixes - MAINTAINERS updates - a750 devcoredump support radeon: - Silence UBSAN warnings related to flexible arrays nouveau: - move some uAPI objects to uapi headers omapdrm: - console fix ast: - add i2c polling qaic: - add debugfs entries exynos: - fix platform_driver .owner - drop cleanup code mediatek: - Use devm_platform_get_and_ioremap_resource() in mtk_hdmi_ddc_probe() - Add GAMMA 12-bit LUT support for MT8188 - Rename mtk_drm_* to mtk_* - Drop driver owner initialization - Correct calculation formula of PHY Timing" * tag 'drm-next-2024-05-15' of https://gitlab.freedesktop.org/drm/kernel: (1477 commits) drm/xe/ads: Use flexible-array drm/xe: Use ordered WQ for G2H handler drm/msm/gen_header: allow skipping the validation drm/msm/a6xx: Cleanup indexed regs const'ness drm/msm: Add devcoredump support for a750 drm/msm: Adjust a7xx GBIF debugbus dumping drm/msm: Update a6xx registers XML drm/msm: Fix imported a750 snapshot header for upstream drm/msm: Import a750 snapshot registers from kgsl MAINTAINERS: Add Konrad Dybcio as a reviewer for the Adreno driver MAINTAINERS: Add a separate entry for Qualcomm Adreno GPU drivers drm/msm/a6xx: Avoid a nullptr dereference when speedbin setting fails drm/msm/adreno: fix CP cycles stat retrieval on a7xx drm/msm/a7xx: allow writing to CP_BV counter selection registers drm: zynqmp_dpsub: Always register bridge Revert "drm/bridge: ti-sn65dsi83: Fix enable error path" drm/fb_dma: Add checks in drm_fb_dma_get_scanout_buffer() drm/fbdev-generic: Do not set physical framebuffer address drm/panthor: Fix the FW reset logic drm/panthor: Make sure we handle 'unknown group state' case properly ...
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c133
1 files changed, 114 insertions, 19 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index 20436f81856a..540e0f066b26 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -21,10 +21,13 @@
*
*/
+#include <linux/sort.h>
#include "amdgpu.h"
#include "umc_v6_7.h"
#define MAX_UMC_POISON_POLLING_TIME_SYNC 20 //ms
+#define MAX_UMC_HASH_STRING_SIZE 256
+
static int amdgpu_umc_convert_error_address(struct amdgpu_device *adev,
struct ras_err_data *err_data, uint64_t err_addr,
uint32_t ch_inst, uint32_t umc_inst)
@@ -63,6 +66,8 @@ int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev,
goto out_fini_err_data;
}
+ err_data.err_addr_len = adev->umc.max_ras_err_cnt_per_query;
+
/*
* Translate UMC channel address to Physical address
*/
@@ -86,7 +91,7 @@ out_fini_err_data:
return ret;
}
-static void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev,
+void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev,
void *ras_error_status)
{
struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
@@ -118,6 +123,8 @@ static void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev,
if(!err_data->err_addr)
dev_warn(adev->dev, "Failed to alloc memory for "
"umc error address record!\n");
+ else
+ err_data->err_addr_len = adev->umc.max_ras_err_cnt_per_query;
/* umc query_ras_error_address is also responsible for clearing
* error status
@@ -143,6 +150,8 @@ static void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev,
if(!err_data->err_addr)
dev_warn(adev->dev, "Failed to alloc memory for "
"umc error address record!\n");
+ else
+ err_data->err_addr_len = adev->umc.max_ras_err_cnt_per_query;
/* umc query_ras_error_address is also responsible for clearing
* error status
@@ -170,6 +179,7 @@ static void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev,
}
kfree(err_data->err_addr);
+ err_data->err_addr = NULL;
mutex_unlock(&con->page_retirement_lock);
}
@@ -177,7 +187,7 @@ static void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev,
static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
void *ras_error_status,
struct amdgpu_iv_entry *entry,
- bool reset)
+ uint32_t reset)
{
struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
@@ -186,9 +196,7 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
amdgpu_umc_handle_bad_pages(adev, ras_error_status);
if (err_data->ue_count && reset) {
- /* use mode-2 reset for poison consumption */
- if (!entry)
- con->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE2_RESET;
+ con->gpu_reset_flags |= reset;
amdgpu_ras_reset_gpu(adev);
}
@@ -196,7 +204,7 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
}
int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev,
- bool reset, uint32_t timeout_ms)
+ uint32_t reset, uint32_t timeout_ms)
{
struct ras_err_data err_data;
struct ras_common_if head = {
@@ -238,16 +246,16 @@ int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev,
if (reset) {
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
- /* use mode-2 reset for poison consumption */
- con->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE2_RESET;
+ con->gpu_reset_flags |= reset;
amdgpu_ras_reset_gpu(adev);
}
return 0;
}
-int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
- enum amdgpu_ras_block block, bool reset)
+int amdgpu_umc_pasid_poison_handler(struct amdgpu_device *adev,
+ enum amdgpu_ras_block block, uint16_t pasid,
+ pasid_notify pasid_fn, void *data, uint32_t reset)
{
int ret = AMDGPU_RAS_SUCCESS;
@@ -285,16 +293,14 @@ int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
amdgpu_ras_error_data_fini(&err_data);
} else {
- if (reset) {
- amdgpu_umc_bad_page_polling_timeout(adev,
- reset, MAX_UMC_POISON_POLLING_TIME_SYNC);
- } else {
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+ amdgpu_ras_put_poison_req(adev,
+ block, pasid, pasid_fn, data, reset);
+
atomic_inc(&con->page_retirement_req_cnt);
wake_up(&con->page_retirement_wq);
- }
}
} else {
if (adev->virt.ops && adev->virt.ops->ras_poison_handler)
@@ -307,11 +313,19 @@ int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
return ret;
}
+int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
+ enum amdgpu_ras_block block, uint32_t reset)
+{
+ return amdgpu_umc_pasid_poison_handler(adev,
+ block, 0, NULL, NULL, reset);
+}
+
int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
void *ras_error_status,
struct amdgpu_iv_entry *entry)
{
- return amdgpu_umc_do_page_retirement(adev, ras_error_status, entry, true);
+ return amdgpu_umc_do_page_retirement(adev, ras_error_status, entry,
+ AMDGPU_RAS_GPU_RESET_MODE1_RESET);
}
int amdgpu_umc_ras_sw_init(struct amdgpu_device *adev)
@@ -388,14 +402,20 @@ int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev,
return 0;
}
-void amdgpu_umc_fill_error_record(struct ras_err_data *err_data,
+int amdgpu_umc_fill_error_record(struct ras_err_data *err_data,
uint64_t err_addr,
uint64_t retired_page,
uint32_t channel_index,
uint32_t umc_inst)
{
- struct eeprom_table_record *err_rec =
- &err_data->err_addr[err_data->err_addr_cnt];
+ struct eeprom_table_record *err_rec;
+
+ if (!err_data ||
+ !err_data->err_addr ||
+ (err_data->err_addr_cnt >= err_data->err_addr_len))
+ return -EINVAL;
+
+ err_rec = &err_data->err_addr[err_data->err_addr_cnt];
err_rec->address = err_addr;
/* page frame address is saved */
@@ -407,6 +427,8 @@ void amdgpu_umc_fill_error_record(struct ras_err_data *err_data,
err_rec->mcumc_id = umc_inst;
err_data->err_addr_cnt++;
+
+ return 0;
}
int amdgpu_umc_loop_channels(struct amdgpu_device *adev,
@@ -439,3 +461,76 @@ int amdgpu_umc_loop_channels(struct amdgpu_device *adev,
return 0;
}
+
+int amdgpu_umc_update_ecc_status(struct amdgpu_device *adev,
+ uint64_t status, uint64_t ipid, uint64_t addr)
+{
+ if (adev->umc.ras->update_ecc_status)
+ return adev->umc.ras->update_ecc_status(adev,
+ status, ipid, addr);
+ return 0;
+}
+
+static int amdgpu_umc_uint64_cmp(const void *a, const void *b)
+{
+ uint64_t *addr_a = (uint64_t *)a;
+ uint64_t *addr_b = (uint64_t *)b;
+
+ if (*addr_a > *addr_b)
+ return 1;
+ else if (*addr_a < *addr_b)
+ return -1;
+ else
+ return 0;
+}
+
+/* Use string hash to avoid logging the same bad pages repeatedly */
+int amdgpu_umc_build_pages_hash(struct amdgpu_device *adev,
+ uint64_t *pfns, int len, uint64_t *val)
+{
+ struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+ char buf[MAX_UMC_HASH_STRING_SIZE] = {0};
+ int offset = 0, i = 0;
+ uint64_t hash_val;
+
+ if (!pfns || !len)
+ return -EINVAL;
+
+ sort(pfns, len, sizeof(uint64_t), amdgpu_umc_uint64_cmp, NULL);
+
+ for (i = 0; i < len; i++)
+ offset += snprintf(&buf[offset], sizeof(buf) - offset, "%llx", pfns[i]);
+
+ hash_val = siphash(buf, offset, &con->umc_ecc_log.ecc_key);
+
+ *val = hash_val;
+
+ return 0;
+}
+
+int amdgpu_umc_logs_ecc_err(struct amdgpu_device *adev,
+ struct radix_tree_root *ecc_tree, struct ras_ecc_err *ecc_err)
+{
+ struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+ struct ras_ecc_log_info *ecc_log;
+ int ret;
+
+ ecc_log = &con->umc_ecc_log;
+
+ mutex_lock(&ecc_log->lock);
+ ret = radix_tree_insert(ecc_tree, ecc_err->hash_index, ecc_err);
+ if (!ret) {
+ struct ras_err_pages *err_pages = &ecc_err->err_pages;
+ int i;
+
+ /* Reserve memory */
+ for (i = 0; i < err_pages->count; i++)
+ amdgpu_ras_reserve_page(adev, err_pages->pfn[i]);
+
+ radix_tree_tag_set(ecc_tree,
+ ecc_err->hash_index, UMC_ECC_NEW_DETECTED_TAG);
+ }
+ mutex_unlock(&ecc_log->lock);
+
+ return ret;
+}