diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2024-05-15 09:43:42 -0700 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2024-05-15 09:43:42 -0700 |
| commit | db5d28c0bfe566908719bec8e25443aabecbb802 (patch) | |
| tree | c113e307ba7a5964ff174f590cd58bce07e2e4ee /drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | |
| parent | Merge tag 'asymmetric-keys-next-6.10-rc1' of git://git.kernel.org/pub/scm/lin... (diff) | |
| parent | Merge tag 'drm-xe-next-fixes-2024-05-09-1' of https://gitlab.freedesktop.org/... (diff) | |
| download | linux-db5d28c0bfe566908719bec8e25443aabecbb802.tar.gz linux-db5d28c0bfe566908719bec8e25443aabecbb802.zip | |
Merge tag 'drm-next-2024-05-15' of https://gitlab.freedesktop.org/drm/kernel
Pull drm updates from Dave Airlie:
"This is the main pull request for the drm subsystems for 6.10.
In drivers the main thing is a new driver for ARM Mali firmware based
GPUs, otherwise there are a lot of changes to amdgpu/xe/i915/msm and
scattered changes to everything else.
In the core a bunch of headers and Kconfig was refactored, along with
the addition of a new panic handler which is meant to provide a user
friendly message when a panic happens and graphical display is
enabled.
New drivers:
- panthor: ARM Mali/Immortalis CSF-based GPU driver
Core:
- add a CONFIG_DRM_WERROR option
- make more headers self-contained
- grab resv lock in pin/unpin
- fix vmap resv locking
- EDID/eDP panel matching
- Kconfig cleanups
- DT sound bindings
- Add SIZE_HINTS property for cursor planes
- Add struct drm_edid_product_id and helpers.
- Use drm device based logging in more drm functions.
- drop seq_file.h from a bunch of places
- use drm_edid driver conversions
dp:
- DP Tunnel documentation
- MST read sideband cap
- Adaptive sync SDP prep work
ttm:
- improve placement for TTM BOs in idle/busy handling
panic:
- Fixes for drm-panic, and option to test it.
- Add drm panic to simpledrm, mgag200, imx, ast
bridge:
- improve init ordering
- adv7511: allow GPIO pin sharing
- tc358775: add tc358675 support
panel:
- AUO B120XAN01.0
- Samsung s6e3fa7
- BOE NT116WHM-N44
- CMN N116BCA-EA1,
- CrystalClear CMT430B19N00
- Startek KD050HDFIA020-C020A
- powertip PH128800T006-ZHC01
- Innolux G121X1-L03
- LG sw43408
- Khadas TS050 V2
- EDO RM69380 OLED
- CSOT MNB601LS1-1
amdgpu:
- HDCP/ODM/RAS fixes
- Devcoredump improvements
- Expose VCN activity via sysfs
- SMY 13.0.x updates
- Enable fast updates on DCN 3.1.4
- Add dclk and vclk reporting on additional devices
- Add ACA RAS infrastructure
- Implement TLB flush fence
- EEPROM handling fixes
- SMUIO 14.0.2 support
- SMU 14.0.1 Updates
- SMU 14.0.2 support
- Sync page table freeing with TLB flushes
- DML2 refactor
- DC debug improvements
- DCN 3.5.x Updates
- GPU reset fixes
- HDP fix for second GFX pipe on GC 10.x
- Enable secondary GFX pipe on GC 10.3
- Refactor and clean up BACO/BOCO/BAMACO handling
- Remove invalid TTM resource start check
- UAF fix in VA IOCTL
- GPUVM page fault redirection to secondary IH rings for IH 6.x
- Initial support for mapping kernel queues via MES
- Fix VRAM memory accounting
amdkfd:
- MQD handling cleanup
- Preemption handling fixes for XCDs
- TLB flush fix for GC 9.4.2
- Properly clean up workqueue during module unload
- Fix memory leak process create failure
- Range check CP bad op exception targets to avoid reporting invalid exceptions to userspace
- Fix eviction fence handling
- Fix leak in GPU memory allocation failure case
- DMABuf import handling fix
- Enable SQ watchpoint for gfx10
i915:
- Adding new DG2 PCI ID
- add context hints for GT frequency
- enable only one CCS for compute workloads
- new workarounds
- Fix UAF on destroy against retire race and remove two earlier partial fixes
- Limit the reserved VM space to only the platforms that need it
- Fix gt reset with GuC submission is disable
- Add and use gt_to_guc() wrapper
i915/xe display:
- Lunar Lake display enabling, including cdclk and other refactors
- BIOS/VBT/opregion related refactor
- Digital port related refactor/clean-up
- Fix 2s boot time regression on DP panel replay init
- Remove duplication on audio enable/disable on SDVO and g4x+ DP
- Disable AuxCCS framebuffers if built for Xe
- Make crtc disable more atomic
- Increase DP idle pattern wait timeout to 2ms
- Start using container_of_const() for some extra const safety
- Fix Jasper Lake boot freeze
- Enable MST mode for 128b/132b single-stream sideband
- Enable Adaptive Sync SDP Support for DP
- Fix MTL supported DP rates - removal of UHBR13.5
- PLL refactoring
- Limit eDP MSO pipe only for display version 20
- More display refactor towards independence from i915 dev_priv
- Convert i915/xe fbdev to DRM client
- More initial work to make display code more independent from i915
xe:
- improved error capture
- clean up some uAPI leftovers
- devcoredump update
- Add BMG mocs table
- Handle GSCCS ER interrupt
- Implement xe2- and GuC workarounds
- struct xe_device cleanup
- Hwmon updates
- Add LRC parsing for more GPU instruction
- Increase VM_BIND number of per-ioctl Ops
- drm/xe: Add XE_BO_GGTT_INVALIDATE flag
- Initial development for SR-IOV support
- Add new PCI IDs to DG2 platform
- Move userptr over to start using hmm_range_fault
msm:
- Switched to generating register header files during build process
instead of shipping pre-generated headers
- Merged DPU and MDP4 format databases.
- DP:
- Stop using compat string to distinguish DP and eDP cases
- Added support for X Elite platform (X1E80100)
- Reworked DP aux/audio support
- Added SM6350 DP to the bindings
- GPU:
- a7xx perfcntr reg fixes
- MAINTAINERS updates
- a750 devcoredump support
radeon:
- Silence UBSAN warnings related to flexible arrays
nouveau:
- move some uAPI objects to uapi headers
omapdrm:
- console fix
ast:
- add i2c polling
qaic:
- add debugfs entries
exynos:
- fix platform_driver .owner
- drop cleanup code
mediatek:
- Use devm_platform_get_and_ioremap_resource() in mtk_hdmi_ddc_probe()
- Add GAMMA 12-bit LUT support for MT8188
- Rename mtk_drm_* to mtk_*
- Drop driver owner initialization
- Correct calculation formula of PHY Timing"
* tag 'drm-next-2024-05-15' of https://gitlab.freedesktop.org/drm/kernel: (1477 commits)
drm/xe/ads: Use flexible-array
drm/xe: Use ordered WQ for G2H handler
drm/msm/gen_header: allow skipping the validation
drm/msm/a6xx: Cleanup indexed regs const'ness
drm/msm: Add devcoredump support for a750
drm/msm: Adjust a7xx GBIF debugbus dumping
drm/msm: Update a6xx registers XML
drm/msm: Fix imported a750 snapshot header for upstream
drm/msm: Import a750 snapshot registers from kgsl
MAINTAINERS: Add Konrad Dybcio as a reviewer for the Adreno driver
MAINTAINERS: Add a separate entry for Qualcomm Adreno GPU drivers
drm/msm/a6xx: Avoid a nullptr dereference when speedbin setting fails
drm/msm/adreno: fix CP cycles stat retrieval on a7xx
drm/msm/a7xx: allow writing to CP_BV counter selection registers
drm: zynqmp_dpsub: Always register bridge
Revert "drm/bridge: ti-sn65dsi83: Fix enable error path"
drm/fb_dma: Add checks in drm_fb_dma_get_scanout_buffer()
drm/fbdev-generic: Do not set physical framebuffer address
drm/panthor: Fix the FW reset logic
drm/panthor: Make sure we handle 'unknown group state' case properly
...
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c')
| -rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 133 |
1 files changed, 114 insertions, 19 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c index 20436f81856a..540e0f066b26 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c @@ -21,10 +21,13 @@ * */ +#include <linux/sort.h> #include "amdgpu.h" #include "umc_v6_7.h" #define MAX_UMC_POISON_POLLING_TIME_SYNC 20 //ms +#define MAX_UMC_HASH_STRING_SIZE 256 + static int amdgpu_umc_convert_error_address(struct amdgpu_device *adev, struct ras_err_data *err_data, uint64_t err_addr, uint32_t ch_inst, uint32_t umc_inst) @@ -63,6 +66,8 @@ int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev, goto out_fini_err_data; } + err_data.err_addr_len = adev->umc.max_ras_err_cnt_per_query; + /* * Translate UMC channel address to Physical address */ @@ -86,7 +91,7 @@ out_fini_err_data: return ret; } -static void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev, +void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev, void *ras_error_status) { struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status; @@ -118,6 +123,8 @@ static void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev, if(!err_data->err_addr) dev_warn(adev->dev, "Failed to alloc memory for " "umc error address record!\n"); + else + err_data->err_addr_len = adev->umc.max_ras_err_cnt_per_query; /* umc query_ras_error_address is also responsible for clearing * error status @@ -143,6 +150,8 @@ static void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev, if(!err_data->err_addr) dev_warn(adev->dev, "Failed to alloc memory for " "umc error address record!\n"); + else + err_data->err_addr_len = adev->umc.max_ras_err_cnt_per_query; /* umc query_ras_error_address is also responsible for clearing * error status @@ -170,6 +179,7 @@ static void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev, } kfree(err_data->err_addr); + err_data->err_addr = NULL; mutex_unlock(&con->page_retirement_lock); } @@ -177,7 +187,7 @@ static void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev, static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev, void *ras_error_status, struct amdgpu_iv_entry *entry, - bool reset) + uint32_t reset) { struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status; struct amdgpu_ras *con = amdgpu_ras_get_context(adev); @@ -186,9 +196,7 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev, amdgpu_umc_handle_bad_pages(adev, ras_error_status); if (err_data->ue_count && reset) { - /* use mode-2 reset for poison consumption */ - if (!entry) - con->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE2_RESET; + con->gpu_reset_flags |= reset; amdgpu_ras_reset_gpu(adev); } @@ -196,7 +204,7 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev, } int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev, - bool reset, uint32_t timeout_ms) + uint32_t reset, uint32_t timeout_ms) { struct ras_err_data err_data; struct ras_common_if head = { @@ -238,16 +246,16 @@ int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev, if (reset) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); - /* use mode-2 reset for poison consumption */ - con->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE2_RESET; + con->gpu_reset_flags |= reset; amdgpu_ras_reset_gpu(adev); } return 0; } -int amdgpu_umc_poison_handler(struct amdgpu_device *adev, - enum amdgpu_ras_block block, bool reset) +int amdgpu_umc_pasid_poison_handler(struct amdgpu_device *adev, + enum amdgpu_ras_block block, uint16_t pasid, + pasid_notify pasid_fn, void *data, uint32_t reset) { int ret = AMDGPU_RAS_SUCCESS; @@ -285,16 +293,14 @@ int amdgpu_umc_poison_handler(struct amdgpu_device *adev, amdgpu_ras_error_data_fini(&err_data); } else { - if (reset) { - amdgpu_umc_bad_page_polling_timeout(adev, - reset, MAX_UMC_POISON_POLLING_TIME_SYNC); - } else { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + amdgpu_ras_put_poison_req(adev, + block, pasid, pasid_fn, data, reset); + atomic_inc(&con->page_retirement_req_cnt); wake_up(&con->page_retirement_wq); - } } } else { if (adev->virt.ops && adev->virt.ops->ras_poison_handler) @@ -307,11 +313,19 @@ int amdgpu_umc_poison_handler(struct amdgpu_device *adev, return ret; } +int amdgpu_umc_poison_handler(struct amdgpu_device *adev, + enum amdgpu_ras_block block, uint32_t reset) +{ + return amdgpu_umc_pasid_poison_handler(adev, + block, 0, NULL, NULL, reset); +} + int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev, void *ras_error_status, struct amdgpu_iv_entry *entry) { - return amdgpu_umc_do_page_retirement(adev, ras_error_status, entry, true); + return amdgpu_umc_do_page_retirement(adev, ras_error_status, entry, + AMDGPU_RAS_GPU_RESET_MODE1_RESET); } int amdgpu_umc_ras_sw_init(struct amdgpu_device *adev) @@ -388,14 +402,20 @@ int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev, return 0; } -void amdgpu_umc_fill_error_record(struct ras_err_data *err_data, +int amdgpu_umc_fill_error_record(struct ras_err_data *err_data, uint64_t err_addr, uint64_t retired_page, uint32_t channel_index, uint32_t umc_inst) { - struct eeprom_table_record *err_rec = - &err_data->err_addr[err_data->err_addr_cnt]; + struct eeprom_table_record *err_rec; + + if (!err_data || + !err_data->err_addr || + (err_data->err_addr_cnt >= err_data->err_addr_len)) + return -EINVAL; + + err_rec = &err_data->err_addr[err_data->err_addr_cnt]; err_rec->address = err_addr; /* page frame address is saved */ @@ -407,6 +427,8 @@ void amdgpu_umc_fill_error_record(struct ras_err_data *err_data, err_rec->mcumc_id = umc_inst; err_data->err_addr_cnt++; + + return 0; } int amdgpu_umc_loop_channels(struct amdgpu_device *adev, @@ -439,3 +461,76 @@ int amdgpu_umc_loop_channels(struct amdgpu_device *adev, return 0; } + +int amdgpu_umc_update_ecc_status(struct amdgpu_device *adev, + uint64_t status, uint64_t ipid, uint64_t addr) +{ + if (adev->umc.ras->update_ecc_status) + return adev->umc.ras->update_ecc_status(adev, + status, ipid, addr); + return 0; +} + +static int amdgpu_umc_uint64_cmp(const void *a, const void *b) +{ + uint64_t *addr_a = (uint64_t *)a; + uint64_t *addr_b = (uint64_t *)b; + + if (*addr_a > *addr_b) + return 1; + else if (*addr_a < *addr_b) + return -1; + else + return 0; +} + +/* Use string hash to avoid logging the same bad pages repeatedly */ +int amdgpu_umc_build_pages_hash(struct amdgpu_device *adev, + uint64_t *pfns, int len, uint64_t *val) +{ + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + char buf[MAX_UMC_HASH_STRING_SIZE] = {0}; + int offset = 0, i = 0; + uint64_t hash_val; + + if (!pfns || !len) + return -EINVAL; + + sort(pfns, len, sizeof(uint64_t), amdgpu_umc_uint64_cmp, NULL); + + for (i = 0; i < len; i++) + offset += snprintf(&buf[offset], sizeof(buf) - offset, "%llx", pfns[i]); + + hash_val = siphash(buf, offset, &con->umc_ecc_log.ecc_key); + + *val = hash_val; + + return 0; +} + +int amdgpu_umc_logs_ecc_err(struct amdgpu_device *adev, + struct radix_tree_root *ecc_tree, struct ras_ecc_err *ecc_err) +{ + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + struct ras_ecc_log_info *ecc_log; + int ret; + + ecc_log = &con->umc_ecc_log; + + mutex_lock(&ecc_log->lock); + ret = radix_tree_insert(ecc_tree, ecc_err->hash_index, ecc_err); + if (!ret) { + struct ras_err_pages *err_pages = &ecc_err->err_pages; + int i; + + /* Reserve memory */ + for (i = 0; i < err_pages->count; i++) + amdgpu_ras_reserve_page(adev, err_pages->pfn[i]); + + radix_tree_tag_set(ecc_tree, + ecc_err->hash_index, UMC_ECC_NEW_DETECTED_TAG); + } + mutex_unlock(&ecc_log->lock); + + return ret; +} |
