From 42a66677805d03df9e2600fab82d0cbe855500e1 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Wed, 16 Apr 2025 17:49:45 -0400 Subject: drm/amdgpu/userq: use consistent function naming s/userqueue/userq/ 1. remove the mix of amdgpu_userqueue and amdgpu_userq 2. to be consistent with other amdgpu_userq_fence.c 3. it's shorter Reviewed-by: Prike Liang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 915 ++++++++++++++++++++++++++++++ 1 file changed, 915 insertions(+) create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c new file mode 100644 index 000000000000..4be72bebcf34 --- /dev/null +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c @@ -0,0 +1,915 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ * + */ + +#include +#include +#include + +#include "amdgpu.h" +#include "amdgpu_vm.h" +#include "amdgpu_userq.h" +#include "amdgpu_userq_fence.h" + +u32 amdgpu_userq_get_supported_ip_mask(struct amdgpu_device *adev) +{ + int i; + u32 userq_ip_mask = 0; + + for (i = 0; i < AMDGPU_HW_IP_NUM; i++) { + if (adev->userq_funcs[i]) + userq_ip_mask |= (1 << i); + } + + return userq_ip_mask; +} + +static int +amdgpu_userq_unmap_helper(struct amdgpu_userq_mgr *uq_mgr, + struct amdgpu_usermode_queue *queue) +{ + struct amdgpu_device *adev = uq_mgr->adev; + const struct amdgpu_userq_funcs *userq_funcs = + adev->userq_funcs[queue->queue_type]; + int r = 0; + + if (queue->state == AMDGPU_USERQ_STATE_MAPPED) { + r = userq_funcs->unmap(uq_mgr, queue); + if (r) + queue->state = AMDGPU_USERQ_STATE_HUNG; + else + queue->state = AMDGPU_USERQ_STATE_UNMAPPED; + } + return r; +} + +static int +amdgpu_userq_map_helper(struct amdgpu_userq_mgr *uq_mgr, + struct amdgpu_usermode_queue *queue) +{ + struct amdgpu_device *adev = uq_mgr->adev; + const struct amdgpu_userq_funcs *userq_funcs = + adev->userq_funcs[queue->queue_type]; + int r = 0; + + if (queue->state == AMDGPU_USERQ_STATE_UNMAPPED) { + r = userq_funcs->map(uq_mgr, queue); + if (r) { + queue->state = AMDGPU_USERQ_STATE_HUNG; + } else { + queue->state = AMDGPU_USERQ_STATE_MAPPED; + } + } + return r; +} + +static void +amdgpu_userq_wait_for_last_fence(struct amdgpu_userq_mgr *uq_mgr, + struct amdgpu_usermode_queue *queue) +{ + struct amdgpu_device *adev = uq_mgr->adev; + struct dma_fence *f = queue->last_fence; + int ret; + + if (f && !dma_fence_is_signaled(f)) { + ret = dma_fence_wait_timeout(f, true, msecs_to_jiffies(100)); + if (ret <= 0) + dev_err(adev->dev, "Timed out waiting for fence f=%p\n", f); + } +} + +static void +amdgpu_userq_cleanup(struct amdgpu_userq_mgr *uq_mgr, + struct amdgpu_usermode_queue *queue, + int queue_id) +{ + struct amdgpu_device *adev = uq_mgr->adev; + const struct amdgpu_userq_funcs *uq_funcs = adev->userq_funcs[queue->queue_type]; + + uq_funcs->mqd_destroy(uq_mgr, queue); + amdgpu_userq_fence_driver_free(queue); + idr_remove(&uq_mgr->userq_idr, queue_id); + kfree(queue); +} + +int +amdgpu_userq_active(struct amdgpu_userq_mgr *uq_mgr) +{ + struct amdgpu_usermode_queue *queue; + int queue_id; + int ret = 0; + + mutex_lock(&uq_mgr->userq_mutex); + /* Resume all the queues for this process */ + idr_for_each_entry(&uq_mgr->userq_idr, queue, queue_id) + ret += queue->state == AMDGPU_USERQ_STATE_MAPPED; + + mutex_unlock(&uq_mgr->userq_mutex); + return ret; +} + +#ifdef CONFIG_DRM_AMDGPU_NAVI3X_USERQ +static struct amdgpu_usermode_queue * +amdgpu_userq_find(struct amdgpu_userq_mgr *uq_mgr, int qid) +{ + return idr_find(&uq_mgr->userq_idr, qid); +} + +void +amdgpu_userq_ensure_ev_fence(struct amdgpu_userq_mgr *uq_mgr, + struct amdgpu_eviction_fence_mgr *evf_mgr) +{ + struct amdgpu_eviction_fence *ev_fence; + +retry: + /* Flush any pending resume work to create ev_fence */ + flush_delayed_work(&uq_mgr->resume_work); + + mutex_lock(&uq_mgr->userq_mutex); + spin_lock(&evf_mgr->ev_fence_lock); + ev_fence = evf_mgr->ev_fence; + spin_unlock(&evf_mgr->ev_fence_lock); + if (!ev_fence || dma_fence_is_signaled(&ev_fence->base)) { + mutex_unlock(&uq_mgr->userq_mutex); + /* + * Looks like there was no pending resume work, + * add one now to create a valid eviction fence + */ + schedule_delayed_work(&uq_mgr->resume_work, 0); + goto retry; + } +} + +int amdgpu_userq_create_object(struct amdgpu_userq_mgr *uq_mgr, + struct amdgpu_userq_obj *userq_obj, 
+ int size) +{ + struct amdgpu_device *adev = uq_mgr->adev; + struct amdgpu_bo_param bp; + int r; + + memset(&bp, 0, sizeof(bp)); + bp.byte_align = PAGE_SIZE; + bp.domain = AMDGPU_GEM_DOMAIN_GTT; + bp.flags = AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS | + AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED; + bp.type = ttm_bo_type_kernel; + bp.size = size; + bp.resv = NULL; + bp.bo_ptr_size = sizeof(struct amdgpu_bo); + + r = amdgpu_bo_create(adev, &bp, &userq_obj->obj); + if (r) { + DRM_ERROR("Failed to allocate BO for userqueue (%d)", r); + return r; + } + + r = amdgpu_bo_reserve(userq_obj->obj, true); + if (r) { + DRM_ERROR("Failed to reserve BO to map (%d)", r); + goto free_obj; + } + + r = amdgpu_ttm_alloc_gart(&(userq_obj->obj)->tbo); + if (r) { + DRM_ERROR("Failed to alloc GART for userqueue object (%d)", r); + goto unresv; + } + + r = amdgpu_bo_kmap(userq_obj->obj, &userq_obj->cpu_ptr); + if (r) { + DRM_ERROR("Failed to map BO for userqueue (%d)", r); + goto unresv; + } + + userq_obj->gpu_addr = amdgpu_bo_gpu_offset(userq_obj->obj); + amdgpu_bo_unreserve(userq_obj->obj); + memset(userq_obj->cpu_ptr, 0, size); + return 0; + +unresv: + amdgpu_bo_unreserve(userq_obj->obj); + +free_obj: + amdgpu_bo_unref(&userq_obj->obj); + return r; +} + +void amdgpu_userq_destroy_object(struct amdgpu_userq_mgr *uq_mgr, + struct amdgpu_userq_obj *userq_obj) +{ + amdgpu_bo_kunmap(userq_obj->obj); + amdgpu_bo_unref(&userq_obj->obj); +} + +uint64_t +amdgpu_userq_get_doorbell_index(struct amdgpu_userq_mgr *uq_mgr, + struct amdgpu_db_info *db_info, + struct drm_file *filp) +{ + uint64_t index; + struct drm_gem_object *gobj; + struct amdgpu_userq_obj *db_obj = db_info->db_obj; + int r, db_size; + + gobj = drm_gem_object_lookup(filp, db_info->doorbell_handle); + if (gobj == NULL) { + DRM_ERROR("Can't find GEM object for doorbell\n"); + return -EINVAL; + } + + db_obj->obj = amdgpu_bo_ref(gem_to_amdgpu_bo(gobj)); + drm_gem_object_put(gobj); + + /* Pin the BO before generating the index, unpin in queue destroy */ + r = amdgpu_bo_pin(db_obj->obj, AMDGPU_GEM_DOMAIN_DOORBELL); + if (r) { + DRM_ERROR("[Usermode queues] Failed to pin doorbell object\n"); + goto unref_bo; + } + + r = amdgpu_bo_reserve(db_obj->obj, true); + if (r) { + DRM_ERROR("[Usermode queues] Failed to pin doorbell object\n"); + goto unpin_bo; + } + + switch (db_info->queue_type) { + case AMDGPU_HW_IP_GFX: + case AMDGPU_HW_IP_COMPUTE: + case AMDGPU_HW_IP_DMA: + db_size = sizeof(u64); + break; + + case AMDGPU_HW_IP_VCN_ENC: + db_size = sizeof(u32); + db_info->doorbell_offset += AMDGPU_NAVI10_DOORBELL64_VCN0_1 << 1; + break; + + case AMDGPU_HW_IP_VPE: + db_size = sizeof(u32); + db_info->doorbell_offset += AMDGPU_NAVI10_DOORBELL64_VPE << 1; + break; + + default: + DRM_ERROR("[Usermode queues] IP %d not support\n", db_info->queue_type); + r = -EINVAL; + goto unpin_bo; + } + + index = amdgpu_doorbell_index_on_bar(uq_mgr->adev, db_obj->obj, + db_info->doorbell_offset, db_size); + DRM_DEBUG_DRIVER("[Usermode queues] doorbell index=%lld\n", index); + amdgpu_bo_unreserve(db_obj->obj); + return index; + +unpin_bo: + amdgpu_bo_unpin(db_obj->obj); + +unref_bo: + amdgpu_bo_unref(&db_obj->obj); + return r; +} + +static int +amdgpu_userq_destroy(struct drm_file *filp, int queue_id) +{ + struct amdgpu_fpriv *fpriv = filp->driver_priv; + struct amdgpu_userq_mgr *uq_mgr = &fpriv->userq_mgr; + struct amdgpu_device *adev = uq_mgr->adev; + struct amdgpu_usermode_queue *queue; + int r = 0; + + cancel_delayed_work(&uq_mgr->resume_work); + mutex_lock(&uq_mgr->userq_mutex); + + queue = 
amdgpu_userq_find(uq_mgr, queue_id); + if (!queue) { + DRM_DEBUG_DRIVER("Invalid queue id to destroy\n"); + mutex_unlock(&uq_mgr->userq_mutex); + return -EINVAL; + } + amdgpu_userq_wait_for_last_fence(uq_mgr, queue); + r = amdgpu_userq_unmap_helper(uq_mgr, queue); + amdgpu_bo_unpin(queue->db_obj.obj); + amdgpu_bo_unref(&queue->db_obj.obj); + amdgpu_userq_cleanup(uq_mgr, queue, queue_id); + mutex_unlock(&uq_mgr->userq_mutex); + + pm_runtime_mark_last_busy(adev_to_drm(adev)->dev); + pm_runtime_put_autosuspend(adev_to_drm(adev)->dev); + + return r; +} + +static int amdgpu_userq_priority_permit(struct drm_file *filp, + int priority) +{ + if (priority < AMDGPU_USERQ_CREATE_FLAGS_QUEUE_PRIORITY_HIGH) + return 0; + + if (capable(CAP_SYS_NICE)) + return 0; + + if (drm_is_current_master(filp)) + return 0; + + return -EACCES; +} + +static int +amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args) +{ + struct amdgpu_fpriv *fpriv = filp->driver_priv; + struct amdgpu_userq_mgr *uq_mgr = &fpriv->userq_mgr; + struct amdgpu_device *adev = uq_mgr->adev; + const struct amdgpu_userq_funcs *uq_funcs; + struct amdgpu_usermode_queue *queue; + struct amdgpu_db_info db_info; + bool skip_map_queue; + uint64_t index; + int qid, r = 0; + int priority = + (args->in.flags & AMDGPU_USERQ_CREATE_FLAGS_QUEUE_PRIORITY_MASK) >> + AMDGPU_USERQ_CREATE_FLAGS_QUEUE_PRIORITY_SHIFT; + + /* Usermode queues are only supported for GFX IP as of now */ + if (args->in.ip_type != AMDGPU_HW_IP_GFX && + args->in.ip_type != AMDGPU_HW_IP_DMA && + args->in.ip_type != AMDGPU_HW_IP_COMPUTE) { + DRM_ERROR("Usermode queue doesn't support IP type %u\n", args->in.ip_type); + return -EINVAL; + } + + r = amdgpu_userq_priority_permit(filp, priority); + if (r) + return r; + + if ((args->in.flags & AMDGPU_USERQ_CREATE_FLAGS_QUEUE_SECURE) && + (args->in.ip_type != AMDGPU_HW_IP_GFX) && + (args->in.ip_type != AMDGPU_HW_IP_COMPUTE) && + !amdgpu_is_tmz(adev)) { + drm_err(adev_to_drm(adev), "Secure only supported on GFX/Compute queues\n"); + return -EINVAL; + } + + r = pm_runtime_get_sync(adev_to_drm(adev)->dev); + if (r < 0) { + dev_err(adev->dev, "pm_runtime_get_sync() failed for userqueue create\n"); + pm_runtime_put_autosuspend(adev_to_drm(adev)->dev); + return r; + } + + /* + * There could be a situation that we are creating a new queue while + * the other queues under this UQ_mgr are suspended. So if there is any + * resume work pending, wait for it to get done. + * + * This will also make sure we have a valid eviction fence ready to be used. 
+ */ + amdgpu_userq_ensure_ev_fence(&fpriv->userq_mgr, &fpriv->evf_mgr); + + uq_funcs = adev->userq_funcs[args->in.ip_type]; + if (!uq_funcs) { + DRM_ERROR("Usermode queue is not supported for this IP (%u)\n", args->in.ip_type); + r = -EINVAL; + goto unlock; + } + + queue = kzalloc(sizeof(struct amdgpu_usermode_queue), GFP_KERNEL); + if (!queue) { + DRM_ERROR("Failed to allocate memory for queue\n"); + r = -ENOMEM; + goto unlock; + } + queue->doorbell_handle = args->in.doorbell_handle; + queue->queue_type = args->in.ip_type; + queue->vm = &fpriv->vm; + queue->priority = priority; + + db_info.queue_type = queue->queue_type; + db_info.doorbell_handle = queue->doorbell_handle; + db_info.db_obj = &queue->db_obj; + db_info.doorbell_offset = args->in.doorbell_offset; + + /* Convert relative doorbell offset into absolute doorbell index */ + index = amdgpu_userq_get_doorbell_index(uq_mgr, &db_info, filp); + if (index == (uint64_t)-EINVAL) { + DRM_ERROR("Failed to get doorbell for queue\n"); + kfree(queue); + goto unlock; + } + + queue->doorbell_index = index; + xa_init_flags(&queue->fence_drv_xa, XA_FLAGS_ALLOC); + r = amdgpu_userq_fence_driver_alloc(adev, queue); + if (r) { + DRM_ERROR("Failed to alloc fence driver\n"); + goto unlock; + } + + r = uq_funcs->mqd_create(uq_mgr, &args->in, queue); + if (r) { + DRM_ERROR("Failed to create Queue\n"); + amdgpu_userq_fence_driver_free(queue); + kfree(queue); + goto unlock; + } + + + qid = idr_alloc(&uq_mgr->userq_idr, queue, 1, AMDGPU_MAX_USERQ_COUNT, GFP_KERNEL); + if (qid < 0) { + DRM_ERROR("Failed to allocate a queue id\n"); + amdgpu_userq_fence_driver_free(queue); + uq_funcs->mqd_destroy(uq_mgr, queue); + kfree(queue); + r = -ENOMEM; + goto unlock; + } + + /* don't map the queue if scheduling is halted */ + mutex_lock(&adev->userq_mutex); + if (adev->userq_halt_for_enforce_isolation && + ((queue->queue_type == AMDGPU_HW_IP_GFX) || + (queue->queue_type == AMDGPU_HW_IP_COMPUTE))) + skip_map_queue = true; + else + skip_map_queue = false; + if (!skip_map_queue) { + r = amdgpu_userq_map_helper(uq_mgr, queue); + if (r) { + mutex_unlock(&adev->userq_mutex); + DRM_ERROR("Failed to map Queue\n"); + idr_remove(&uq_mgr->userq_idr, qid); + amdgpu_userq_fence_driver_free(queue); + uq_funcs->mqd_destroy(uq_mgr, queue); + kfree(queue); + goto unlock; + } + } + mutex_unlock(&adev->userq_mutex); + + + args->out.queue_id = qid; + +unlock: + mutex_unlock(&uq_mgr->userq_mutex); + + return r; +} + +int amdgpu_userq_ioctl(struct drm_device *dev, void *data, + struct drm_file *filp) +{ + union drm_amdgpu_userq *args = data; + int r; + + switch (args->in.op) { + case AMDGPU_USERQ_OP_CREATE: + if (args->in.flags & ~(AMDGPU_USERQ_CREATE_FLAGS_QUEUE_PRIORITY_MASK | + AMDGPU_USERQ_CREATE_FLAGS_QUEUE_SECURE)) + return -EINVAL; + r = amdgpu_userq_create(filp, args); + if (r) + DRM_ERROR("Failed to create usermode queue\n"); + break; + + case AMDGPU_USERQ_OP_FREE: + if (args->in.ip_type || + args->in.doorbell_handle || + args->in.doorbell_offset || + args->in.flags || + args->in.queue_va || + args->in.queue_size || + args->in.rptr_va || + args->in.wptr_va || + args->in.wptr_va || + args->in.mqd || + args->in.mqd_size) + return -EINVAL; + r = amdgpu_userq_destroy(filp, args->in.queue_id); + if (r) + DRM_ERROR("Failed to destroy usermode queue\n"); + break; + + default: + DRM_DEBUG_DRIVER("Invalid user queue op specified: %d\n", args->in.op); + return -EINVAL; + } + + return r; +} +#else +int amdgpu_userq_ioctl(struct drm_device *dev, void *data, + struct drm_file *filp) +{ + return 
-ENOTSUPP; +} +#endif + +static int +amdgpu_userq_restore_all(struct amdgpu_userq_mgr *uq_mgr) +{ + struct amdgpu_device *adev = uq_mgr->adev; + struct amdgpu_usermode_queue *queue; + int queue_id; + int ret = 0, r; + + /* Resume all the queues for this process */ + idr_for_each_entry(&uq_mgr->userq_idr, queue, queue_id) { + r = amdgpu_userq_map_helper(uq_mgr, queue); + if (r) + ret = r; + } + + if (ret) + dev_err(adev->dev, "Failed to map all the queues\n"); + return ret; +} + +static int +amdgpu_userq_validate_vm_bo(void *_unused, struct amdgpu_bo *bo) +{ + struct ttm_operation_ctx ctx = { false, false }; + int ret; + + amdgpu_bo_placement_from_domain(bo, bo->allowed_domains); + + ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx); + if (ret) + DRM_ERROR("Fail to validate\n"); + + return ret; +} + +static int +amdgpu_userq_validate_bos(struct amdgpu_userq_mgr *uq_mgr) +{ + struct amdgpu_fpriv *fpriv = uq_mgr_to_fpriv(uq_mgr); + struct amdgpu_vm *vm = &fpriv->vm; + struct amdgpu_device *adev = uq_mgr->adev; + struct amdgpu_bo_va *bo_va; + struct ww_acquire_ctx *ticket; + struct drm_exec exec; + struct amdgpu_bo *bo; + struct dma_resv *resv; + bool clear, unlock; + int ret = 0; + + drm_exec_init(&exec, DRM_EXEC_IGNORE_DUPLICATES, 0); + drm_exec_until_all_locked(&exec) { + ret = amdgpu_vm_lock_pd(vm, &exec, 2); + drm_exec_retry_on_contention(&exec); + if (unlikely(ret)) { + DRM_ERROR("Failed to lock PD\n"); + goto unlock_all; + } + + /* Lock the done list */ + list_for_each_entry(bo_va, &vm->done, base.vm_status) { + bo = bo_va->base.bo; + if (!bo) + continue; + + ret = drm_exec_lock_obj(&exec, &bo->tbo.base); + drm_exec_retry_on_contention(&exec); + if (unlikely(ret)) + goto unlock_all; + } + } + + spin_lock(&vm->status_lock); + while (!list_empty(&vm->moved)) { + bo_va = list_first_entry(&vm->moved, struct amdgpu_bo_va, + base.vm_status); + spin_unlock(&vm->status_lock); + + /* Per VM BOs never need to bo cleared in the page tables */ + ret = amdgpu_vm_bo_update(adev, bo_va, false); + if (ret) + goto unlock_all; + spin_lock(&vm->status_lock); + } + + ticket = &exec.ticket; + while (!list_empty(&vm->invalidated)) { + bo_va = list_first_entry(&vm->invalidated, struct amdgpu_bo_va, + base.vm_status); + resv = bo_va->base.bo->tbo.base.resv; + spin_unlock(&vm->status_lock); + + bo = bo_va->base.bo; + ret = amdgpu_userq_validate_vm_bo(NULL, bo); + if (ret) { + DRM_ERROR("Failed to validate BO\n"); + goto unlock_all; + } + + /* Try to reserve the BO to avoid clearing its ptes */ + if (!adev->debug_vm && dma_resv_trylock(resv)) { + clear = false; + unlock = true; + /* The caller is already holding the reservation lock */ + } else if (ticket && dma_resv_locking_ctx(resv) == ticket) { + clear = false; + unlock = false; + /* Somebody else is using the BO right now */ + } else { + clear = true; + unlock = false; + } + + ret = amdgpu_vm_bo_update(adev, bo_va, clear); + + if (unlock) + dma_resv_unlock(resv); + if (ret) + goto unlock_all; + + spin_lock(&vm->status_lock); + } + spin_unlock(&vm->status_lock); + + ret = amdgpu_eviction_fence_replace_fence(&fpriv->evf_mgr, &exec); + if (ret) + DRM_ERROR("Failed to replace eviction fence\n"); + +unlock_all: + drm_exec_fini(&exec); + return ret; +} + +static void amdgpu_userq_restore_worker(struct work_struct *work) +{ + struct amdgpu_userq_mgr *uq_mgr = work_to_uq_mgr(work, resume_work.work); + struct amdgpu_fpriv *fpriv = uq_mgr_to_fpriv(uq_mgr); + int ret; + + flush_work(&fpriv->evf_mgr.suspend_work.work); + + mutex_lock(&uq_mgr->userq_mutex); + + ret = 
amdgpu_userq_validate_bos(uq_mgr); + if (ret) { + DRM_ERROR("Failed to validate BOs to restore\n"); + goto unlock; + } + + ret = amdgpu_userq_restore_all(uq_mgr); + if (ret) { + DRM_ERROR("Failed to restore all queues\n"); + goto unlock; + } + +unlock: + mutex_unlock(&uq_mgr->userq_mutex); +} + +static int +amdgpu_userq_evict_all(struct amdgpu_userq_mgr *uq_mgr) +{ + struct amdgpu_device *adev = uq_mgr->adev; + struct amdgpu_usermode_queue *queue; + int queue_id; + int ret = 0, r; + + /* Try to unmap all the queues in this process ctx */ + idr_for_each_entry(&uq_mgr->userq_idr, queue, queue_id) { + r = amdgpu_userq_unmap_helper(uq_mgr, queue); + if (r) + ret = r; + } + + if (ret) + dev_err(adev->dev, "Couldn't unmap all the queues\n"); + return ret; +} + +static int +amdgpu_userq_wait_for_signal(struct amdgpu_userq_mgr *uq_mgr) +{ + struct amdgpu_usermode_queue *queue; + int queue_id, ret; + + idr_for_each_entry(&uq_mgr->userq_idr, queue, queue_id) { + struct dma_fence *f = queue->last_fence; + + if (!f || dma_fence_is_signaled(f)) + continue; + ret = dma_fence_wait_timeout(f, true, msecs_to_jiffies(100)); + if (ret <= 0) { + DRM_ERROR("Timed out waiting for fence=%llu:%llu\n", + f->context, f->seqno); + return -ETIMEDOUT; + } + } + + return 0; +} + +void +amdgpu_userq_evict(struct amdgpu_userq_mgr *uq_mgr, + struct amdgpu_eviction_fence *ev_fence) +{ + int ret; + struct amdgpu_fpriv *fpriv = uq_mgr_to_fpriv(uq_mgr); + struct amdgpu_eviction_fence_mgr *evf_mgr = &fpriv->evf_mgr; + + /* Wait for any pending userqueue fence work to finish */ + ret = amdgpu_userq_wait_for_signal(uq_mgr); + if (ret) { + DRM_ERROR("Not evicting userqueue, timeout waiting for work\n"); + return; + } + + ret = amdgpu_userq_evict_all(uq_mgr); + if (ret) { + DRM_ERROR("Failed to evict userqueue\n"); + return; + } + + /* Signal current eviction fence */ + amdgpu_eviction_fence_signal(evf_mgr, ev_fence); + + if (evf_mgr->fd_closing) { + cancel_delayed_work(&uq_mgr->resume_work); + return; + } + + /* Schedule a resume work */ + schedule_delayed_work(&uq_mgr->resume_work, 0); +} + +int amdgpu_userq_mgr_init(struct amdgpu_userq_mgr *userq_mgr, struct amdgpu_device *adev) +{ + mutex_init(&userq_mgr->userq_mutex); + idr_init_base(&userq_mgr->userq_idr, 1); + userq_mgr->adev = adev; + + mutex_lock(&adev->userq_mutex); + list_add(&userq_mgr->list, &adev->userq_mgr_list); + mutex_unlock(&adev->userq_mutex); + + INIT_DELAYED_WORK(&userq_mgr->resume_work, amdgpu_userq_restore_worker); + return 0; +} + +void amdgpu_userq_mgr_fini(struct amdgpu_userq_mgr *userq_mgr) +{ + struct amdgpu_device *adev = userq_mgr->adev; + struct amdgpu_usermode_queue *queue; + struct amdgpu_userq_mgr *uqm, *tmp; + uint32_t queue_id; + + cancel_delayed_work(&userq_mgr->resume_work); + + mutex_lock(&userq_mgr->userq_mutex); + idr_for_each_entry(&userq_mgr->userq_idr, queue, queue_id) { + amdgpu_userq_wait_for_last_fence(userq_mgr, queue); + amdgpu_userq_unmap_helper(userq_mgr, queue); + amdgpu_userq_cleanup(userq_mgr, queue, queue_id); + } + mutex_lock(&adev->userq_mutex); + list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) { + if (uqm == userq_mgr) { + list_del(&uqm->list); + break; + } + } + mutex_unlock(&adev->userq_mutex); + idr_destroy(&userq_mgr->userq_idr); + mutex_unlock(&userq_mgr->userq_mutex); + mutex_destroy(&userq_mgr->userq_mutex); +} + +int amdgpu_userq_suspend(struct amdgpu_device *adev) +{ + u32 ip_mask = amdgpu_userq_get_supported_ip_mask(adev); + struct amdgpu_usermode_queue *queue; + struct amdgpu_userq_mgr *uqm, 
*tmp; + int queue_id; + int ret = 0, r; + + if (!ip_mask) + return 0; + + mutex_lock(&adev->userq_mutex); + list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) { + cancel_delayed_work_sync(&uqm->resume_work); + idr_for_each_entry(&uqm->userq_idr, queue, queue_id) { + r = amdgpu_userq_unmap_helper(uqm, queue); + if (r) + ret = r; + } + } + mutex_unlock(&adev->userq_mutex); + return ret; +} + +int amdgpu_userq_resume(struct amdgpu_device *adev) +{ + u32 ip_mask = amdgpu_userq_get_supported_ip_mask(adev); + struct amdgpu_usermode_queue *queue; + struct amdgpu_userq_mgr *uqm, *tmp; + int queue_id; + int ret = 0, r; + + if (!ip_mask) + return 0; + + mutex_lock(&adev->userq_mutex); + list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) { + idr_for_each_entry(&uqm->userq_idr, queue, queue_id) { + r = amdgpu_userq_map_helper(uqm, queue); + if (r) + ret = r; + } + } + mutex_unlock(&adev->userq_mutex); + return ret; +} + +int amdgpu_userq_stop_sched_for_enforce_isolation(struct amdgpu_device *adev, + u32 idx) +{ + u32 ip_mask = amdgpu_userq_get_supported_ip_mask(adev); + struct amdgpu_usermode_queue *queue; + struct amdgpu_userq_mgr *uqm, *tmp; + int queue_id; + int ret = 0, r; + + /* only need to stop gfx/compute */ + if (!(ip_mask & ((1 << AMDGPU_HW_IP_GFX) | (1 << AMDGPU_HW_IP_COMPUTE)))) + return 0; + + mutex_lock(&adev->userq_mutex); + if (adev->userq_halt_for_enforce_isolation) + dev_warn(adev->dev, "userq scheduling already stopped!\n"); + adev->userq_halt_for_enforce_isolation = true; + list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) { + cancel_delayed_work_sync(&uqm->resume_work); + idr_for_each_entry(&uqm->userq_idr, queue, queue_id) { + if (((queue->queue_type == AMDGPU_HW_IP_GFX) || + (queue->queue_type == AMDGPU_HW_IP_COMPUTE)) && + (queue->xcp_id == idx)) { + r = amdgpu_userq_unmap_helper(uqm, queue); + if (r) + ret = r; + } + } + } + mutex_unlock(&adev->userq_mutex); + return ret; +} + +int amdgpu_userq_start_sched_for_enforce_isolation(struct amdgpu_device *adev, + u32 idx) +{ + u32 ip_mask = amdgpu_userq_get_supported_ip_mask(adev); + struct amdgpu_usermode_queue *queue; + struct amdgpu_userq_mgr *uqm, *tmp; + int queue_id; + int ret = 0, r; + + /* only need to stop gfx/compute */ + if (!(ip_mask & ((1 << AMDGPU_HW_IP_GFX) | (1 << AMDGPU_HW_IP_COMPUTE)))) + return 0; + + mutex_lock(&adev->userq_mutex); + if (!adev->userq_halt_for_enforce_isolation) + dev_warn(adev->dev, "userq scheduling already started!\n"); + adev->userq_halt_for_enforce_isolation = false; + list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) { + idr_for_each_entry(&uqm->userq_idr, queue, queue_id) { + if (((queue->queue_type == AMDGPU_HW_IP_GFX) || + (queue->queue_type == AMDGPU_HW_IP_COMPUTE)) && + (queue->xcp_id == idx)) { + r = amdgpu_userq_map_helper(uqm, queue); + if (r) + ret = r; + } + } + } + mutex_unlock(&adev->userq_mutex); + return ret; +} -- cgit v1.2.3 From 127e612bf16726620e431b6e0f771424916492be Mon Sep 17 00:00:00 2001 From: Sunil Khatri Date: Mon, 21 Apr 2025 17:45:49 +0530 Subject: drm/amdgpu: update fence ptr with context:seqno log context:seqno of the fence during timeout rather than logging fence pointer. 
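For context, a minimal sketch of the wait-and-log pattern this change adopts; the function name and parameters here are illustrative, not taken from the patch:

#include <linux/device.h>
#include <linux/dma-fence.h>
#include <linux/jiffies.h>

static void example_wait_and_log(struct device *dev, struct dma_fence *f)
{
	long ret;

	if (!f || dma_fence_is_signaled(f))
		return;

	ret = dma_fence_wait_timeout(f, true, msecs_to_jiffies(100));
	if (ret <= 0)
		/*
		 * context:seqno uniquely names the fence on its timeline;
		 * a raw pointer is hashed by %p in dmesg and may be reused
		 * once the fence is freed.
		 */
		dev_err(dev, "Timed out waiting for fence=%llu:%llu\n",
			f->context, f->seqno);
}

The same context and seqno fields are recorded by the dma_fence trace events, so a timeout in dmesg can be matched against a trace capture.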
Reviewed-by: Arvind Yadav Cc: Tvrtko Ursulin Signed-off-by: Sunil Khatri Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c index 4be72bebcf34..b0e8098a3988 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c @@ -94,7 +94,8 @@ amdgpu_userq_wait_for_last_fence(struct amdgpu_userq_mgr *uq_mgr, if (f && !dma_fence_is_signaled(f)) { ret = dma_fence_wait_timeout(f, true, msecs_to_jiffies(100)); if (ret <= 0) - dev_err(adev->dev, "Timed out waiting for fence f=%p\n", f); + dev_err(adev->dev, "Timed out waiting for fence=%llu:%llu\n", + f->context, f->seqno); } } -- cgit v1.2.3 From 56801cb83c8c95ae23ea576570c75a01c1f07774 Mon Sep 17 00:00:00 2001 From: Arvind Yadav Date: Tue, 22 Apr 2025 19:29:03 +0530 Subject: drm/amdgpu: remove DRM_AMDGPU_NAVI3X_USERQ config for UQ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit DRM_AMDGPU_NAVI3X_USERQ config support is not required for usermode queue. v2: rebase. Cc: Arunpravin Paneer Selvam Reviewed-by: Sunil Khatri Reviewed-by: Alex Deucher Reviewed-by: Christian König Signed-off-by: Arvind Yadav Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/Kconfig | 8 -------- drivers/gpu/drm/amd/amdgpu/Makefile | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 7 +------ drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 5 +---- drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 8 -------- drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c | 18 ------------------ drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 4 ---- drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c | 2 -- drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c | 3 +-- drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c | 3 --- 10 files changed, 4 insertions(+), 56 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c') diff --git a/drivers/gpu/drm/amd/amdgpu/Kconfig b/drivers/gpu/drm/amd/amdgpu/Kconfig index 7b95221d2f3d..1a11cab741ac 100644 --- a/drivers/gpu/drm/amd/amdgpu/Kconfig +++ b/drivers/gpu/drm/amd/amdgpu/Kconfig @@ -96,14 +96,6 @@ config DRM_AMDGPU_WERROR Add -Werror to the build flags for amdgpu.ko. Only enable this if you are warning code for amdgpu.ko. -config DRM_AMDGPU_NAVI3X_USERQ - bool "Enable amdgpu usermode queues" - depends on DRM_AMDGPU - default n - help - Choose this option to enable GFX usermode queue support for GFX/SDMA/Compute - workload submission. This feature is experimental and supported on GFX11+. 
- source "drivers/gpu/drm/amd/acp/Kconfig" source "drivers/gpu/drm/amd/display/Kconfig" source "drivers/gpu/drm/amd/amdkfd/Kconfig" diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile b/drivers/gpu/drm/amd/amdgpu/Makefile index 8595e05c691b..87080c06e5fc 100644 --- a/drivers/gpu/drm/amd/amdgpu/Makefile +++ b/drivers/gpu/drm/amd/amdgpu/Makefile @@ -177,7 +177,7 @@ amdgpu-y += \ mes_v12_0.o \ # add GFX userqueue support -amdgpu-$(CONFIG_DRM_AMDGPU_NAVI3X_USERQ) += mes_userqueue.o +amdgpu-y += mes_userqueue.o # add UVD block amdgpu-y += \ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index b96e0613ea7e..fe68ba9997ae 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3513,9 +3513,7 @@ static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); amdgpu_amdkfd_suspend(adev, false); -#ifdef CONFIG_DRM_AMDGPU_NAVI3X_USERQ amdgpu_userq_suspend(adev); -#endif /* Workaround for ASICs need to disable SMC first */ amdgpu_device_smu_fini_early(adev); @@ -5086,9 +5084,7 @@ int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients) if (!adev->in_s0ix) { amdgpu_amdkfd_suspend(adev, adev->in_runpm); -#ifdef CONFIG_DRM_AMDGPU_NAVI3X_USERQ amdgpu_userq_suspend(adev); -#endif } r = amdgpu_device_evict_resources(adev); @@ -5156,11 +5152,10 @@ int amdgpu_device_resume(struct drm_device *dev, bool notify_clients) r = amdgpu_amdkfd_resume(adev, adev->in_runpm); if (r) goto exit; -#ifdef CONFIG_DRM_AMDGPU_NAVI3X_USERQ + r = amdgpu_userq_resume(adev); if (r) goto exit; -#endif } r = amdgpu_device_ip_late_init(adev); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index e0cc2bb083cb..8f1a2f7b03c1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -1981,9 +1981,7 @@ static void amdgpu_gfx_kfd_sch_ctrl(struct amdgpu_device *adev, u32 idx, if (adev->gfx.userq_sch_req_count[idx] == 0) { cancel_delayed_work_sync(&adev->gfx.enforce_isolation[idx].work); if (!adev->gfx.userq_sch_inactive[idx]) { -#ifdef CONFIG_DRM_AMDGPU_NAVI3X_USERQ amdgpu_userq_stop_sched_for_enforce_isolation(adev, idx); -#endif if (adev->kfd.init_complete) amdgpu_amdkfd_stop_sched(adev, idx); adev->gfx.userq_sch_inactive[idx] = true; @@ -2041,9 +2039,8 @@ void amdgpu_gfx_enforce_isolation_handler(struct work_struct *work) /* Tell KFD to resume the runqueue */ WARN_ON_ONCE(!adev->gfx.userq_sch_inactive[idx]); WARN_ON_ONCE(adev->gfx.userq_sch_req_count[idx]); -#ifdef CONFIG_DRM_AMDGPU_NAVI3X_USERQ + amdgpu_userq_start_sched_for_enforce_isolation(adev, idx); -#endif if (adev->kfd.init_complete) amdgpu_amdkfd_start_sched(adev, idx); adev->gfx.userq_sch_inactive[idx] = false; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c index b0e8098a3988..451890ee3fb7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c @@ -129,7 +129,6 @@ amdgpu_userq_active(struct amdgpu_userq_mgr *uq_mgr) return ret; } -#ifdef CONFIG_DRM_AMDGPU_NAVI3X_USERQ static struct amdgpu_usermode_queue * amdgpu_userq_find(struct amdgpu_userq_mgr *uq_mgr, int qid) { @@ -520,13 +519,6 @@ int amdgpu_userq_ioctl(struct drm_device *dev, void *data, return r; } -#else -int amdgpu_userq_ioctl(struct drm_device *dev, void *data, - struct drm_file *filp) -{ - return -ENOTSUPP; -} -#endif static int 
amdgpu_userq_restore_all(struct amdgpu_userq_mgr *uq_mgr) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c index be068e8e37d1..3288c2ff692e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c @@ -216,7 +216,6 @@ void amdgpu_userq_fence_driver_put(struct amdgpu_userq_fence_driver *fence_drv) kref_put(&fence_drv->refcount, amdgpu_userq_fence_driver_destroy); } -#ifdef CONFIG_DRM_AMDGPU_NAVI3X_USERQ static int amdgpu_userq_fence_alloc(struct amdgpu_userq_fence **userq_fence) { *userq_fence = kmem_cache_alloc(amdgpu_userq_fence_slab, GFP_ATOMIC); @@ -288,7 +287,6 @@ static int amdgpu_userq_fence_create(struct amdgpu_usermode_queue *userq, return 0; } -#endif static const char *amdgpu_userq_fence_get_driver_name(struct dma_fence *f) { @@ -343,7 +341,6 @@ static const struct dma_fence_ops amdgpu_userq_fence_ops = { .release = amdgpu_userq_fence_release, }; -#ifdef CONFIG_DRM_AMDGPU_NAVI3X_USERQ /** * amdgpu_userq_fence_read_wptr - Read the userq wptr value * @@ -594,15 +591,7 @@ free_syncobj_handles: return r; } -#else -int amdgpu_userq_signal_ioctl(struct drm_device *dev, void *data, - struct drm_file *filp) -{ - return -ENOTSUPP; -} -#endif -#ifdef CONFIG_DRM_AMDGPU_NAVI3X_USERQ int amdgpu_userq_wait_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) { @@ -968,10 +957,3 @@ free_bo_handles_read: return r; } -#else -int amdgpu_userq_wait_ioctl(struct drm_device *dev, void *data, - struct drm_file *filp) -{ - return -ENOTSUPP; -} -#endif diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c index 2df11f4127cc..3f4ee4b3b0a4 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c @@ -1606,7 +1606,6 @@ static int gfx_v11_0_sw_init(struct amdgpu_ip_block *ip_block) case IP_VERSION(11, 0, 0): case IP_VERSION(11, 0, 2): case IP_VERSION(11, 0, 3): -#ifdef CONFIG_DRM_AMDGPU_NAVI3X_USERQ if (!adev->gfx.disable_uq && adev->gfx.me_fw_version >= 2390 && adev->gfx.pfp_fw_version >= 2530 && @@ -1615,7 +1614,6 @@ static int gfx_v11_0_sw_init(struct amdgpu_ip_block *ip_block) adev->userq_funcs[AMDGPU_HW_IP_GFX] = &userq_mes_funcs; adev->userq_funcs[AMDGPU_HW_IP_COMPUTE] = &userq_mes_funcs; } -#endif break; case IP_VERSION(11, 0, 1): case IP_VERSION(11, 0, 4): @@ -1623,13 +1621,11 @@ static int gfx_v11_0_sw_init(struct amdgpu_ip_block *ip_block) case IP_VERSION(11, 5, 1): case IP_VERSION(11, 5, 2): case IP_VERSION(11, 5, 3): -#ifdef CONFIG_DRM_AMDGPU_NAVI3X_USERQ /* add firmware version checks here */ if (0 && !adev->gfx.disable_uq) { adev->userq_funcs[AMDGPU_HW_IP_GFX] = &userq_mes_funcs; adev->userq_funcs[AMDGPU_HW_IP_COMPUTE] = &userq_mes_funcs; } -#endif break; default: break; diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c index dfa0830a4eb1..f09d96bfee16 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c @@ -1416,7 +1416,6 @@ static int gfx_v12_0_sw_init(struct amdgpu_ip_block *ip_block) switch (amdgpu_ip_version(adev, GC_HWIP, 0)) { case IP_VERSION(12, 0, 0): case IP_VERSION(12, 0, 1): -#ifdef CONFIG_DRM_AMDGPU_NAVI3X_USERQ if (!adev->gfx.disable_uq && adev->gfx.me_fw_version >= 2780 && adev->gfx.pfp_fw_version >= 2840 && @@ -1425,7 +1424,6 @@ static int gfx_v12_0_sw_init(struct amdgpu_ip_block *ip_block) adev->userq_funcs[AMDGPU_HW_IP_GFX] = &userq_mes_funcs; adev->userq_funcs[AMDGPU_HW_IP_COMPUTE] = 
&userq_mes_funcs; } -#endif break; default: break; diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c index 6bb36187a53d..da5b5d64f137 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c @@ -1363,11 +1363,10 @@ static int sdma_v6_0_sw_init(struct amdgpu_ip_block *ip_block) else DRM_ERROR("Failed to allocated memory for SDMA IP Dump\n"); -#ifdef CONFIG_DRM_AMDGPU_NAVI3X_USERQ /* add firmware version checks here */ if (0 && !adev->sdma.disable_uq) adev->userq_funcs[AMDGPU_HW_IP_DMA] = &userq_mes_funcs; -#endif + r = amdgpu_sdma_sysfs_reset_mask_init(adev); if (r) return r; diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c index 943c6446a0a7..befe013b11a7 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c @@ -1338,12 +1338,9 @@ static int sdma_v7_0_sw_init(struct amdgpu_ip_block *ip_block) else DRM_ERROR("Failed to allocated memory for SDMA IP Dump\n"); -#ifdef CONFIG_DRM_AMDGPU_NAVI3X_USERQ /* add firmware version checks here */ if (0 && !adev->sdma.disable_uq) adev->userq_funcs[AMDGPU_HW_IP_DMA] = &userq_mes_funcs; -#endif - return r; } -- cgit v1.2.3 From c5e02d6588f2498cf726e13504593f7325f1ee61 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Fri, 25 Apr 2025 12:07:06 -0400 Subject: drm/amdgpu/userq: take the userq_mgr lock in suspend/resume Add the missing locking. Fixes: 73e12e98ec0c ("drm/amdgpu/userq: add suspend and resume helpers") Reviewed-by: Arvind Yadav Reviewed-by: Prike Liang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c index 451890ee3fb7..1fa9d2be87f3 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c @@ -809,11 +809,13 @@ int amdgpu_userq_suspend(struct amdgpu_device *adev) mutex_lock(&adev->userq_mutex); list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) { cancel_delayed_work_sync(&uqm->resume_work); + mutex_lock(&uqm->userq_mutex); idr_for_each_entry(&uqm->userq_idr, queue, queue_id) { r = amdgpu_userq_unmap_helper(uqm, queue); if (r) ret = r; } + mutex_unlock(&uqm->userq_mutex); } mutex_unlock(&adev->userq_mutex); return ret; @@ -832,11 +834,13 @@ int amdgpu_userq_resume(struct amdgpu_device *adev) mutex_lock(&adev->userq_mutex); list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) { + mutex_lock(&uqm->userq_mutex); idr_for_each_entry(&uqm->userq_idr, queue, queue_id) { r = amdgpu_userq_map_helper(uqm, queue); if (r) ret = r; } + mutex_unlock(&uqm->userq_mutex); } mutex_unlock(&adev->userq_mutex); return ret; -- cgit v1.2.3 From 482d48533257fdaa05b0f847ee919305f1668790 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Fri, 25 Apr 2025 12:11:45 -0400 Subject: drm/amdgpu/userq: take the userq_mgr lock in enforce isolation Add the missing locking. 
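Together with the previous suspend/resume fix, the rule both commits establish is the same: the device-wide adev->userq_mutex guarding the manager list is the outer lock, and each manager's own userq_mutex guarding its queue IDR is the inner lock, since idr_for_each_entry() is not safe against a concurrent idr_remove() from queue destroy. A reduced sketch with illustrative struct names (example_dev and example_mgr are not driver types):

#include <linux/idr.h>
#include <linux/list.h>
#include <linux/mutex.h>

struct example_mgr {			/* per-process manager */
	struct mutex userq_mutex;	/* protects userq_idr */
	struct idr userq_idr;
	struct list_head list;
};

struct example_dev {			/* device-wide state */
	struct mutex userq_mutex;	/* protects userq_mgr_list */
	struct list_head userq_mgr_list;
};

static void example_walk_all_queues(struct example_dev *dev)
{
	struct example_mgr *uqm;
	void *queue;
	int queue_id;

	mutex_lock(&dev->userq_mutex);		/* outer: manager list */
	list_for_each_entry(uqm, &dev->userq_mgr_list, list) {
		mutex_lock(&uqm->userq_mutex);	/* inner: queue IDR */
		idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
			(void)queue;	/* a map/unmap helper would consume this */
		}
		mutex_unlock(&uqm->userq_mutex);
	}
	mutex_unlock(&dev->userq_mutex);
}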
Fixes: 94976e7e5ede ("drm/amdgpu/userq: add helpers to start/stop scheduling") Reviewed-by: Arvind Yadav Reviewed-by: Prike Liang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c index 1fa9d2be87f3..afbe01149ed3 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c @@ -865,6 +865,7 @@ int amdgpu_userq_stop_sched_for_enforce_isolation(struct amdgpu_device *adev, adev->userq_halt_for_enforce_isolation = true; list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) { cancel_delayed_work_sync(&uqm->resume_work); + mutex_lock(&uqm->userq_mutex); idr_for_each_entry(&uqm->userq_idr, queue, queue_id) { if (((queue->queue_type == AMDGPU_HW_IP_GFX) || (queue->queue_type == AMDGPU_HW_IP_COMPUTE)) && @@ -874,6 +875,7 @@ int amdgpu_userq_stop_sched_for_enforce_isolation(struct amdgpu_device *adev, ret = r; } } + mutex_unlock(&uqm->userq_mutex); } mutex_unlock(&adev->userq_mutex); return ret; @@ -897,6 +899,7 @@ int amdgpu_userq_start_sched_for_enforce_isolation(struct amdgpu_device *adev, dev_warn(adev->dev, "userq scheduling already started!\n"); adev->userq_halt_for_enforce_isolation = false; list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) { + mutex_lock(&uqm->userq_mutex); idr_for_each_entry(&uqm->userq_idr, queue, queue_id) { if (((queue->queue_type == AMDGPU_HW_IP_GFX) || (queue->queue_type == AMDGPU_HW_IP_COMPUTE)) && @@ -906,6 +909,7 @@ int amdgpu_userq_start_sched_for_enforce_isolation(struct amdgpu_device *adev, ret = r; } } + mutex_unlock(&uqm->userq_mutex); } mutex_unlock(&adev->userq_mutex); return ret; -- cgit v1.2.3 From 97c39b4da606f6e6ec62689d1963ee6a5ad7f8ac Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 30 Apr 2025 11:05:13 +0300 Subject: drm/amdgpu/userq: remove unnecessary NULL check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The "ticket" pointer points to in the middle of the &exec struct so it can't be NULL. Remove the check. Reviewed-by: Christian König Acked-by: Shashank Sharma Signed-off-by: Dan Carpenter Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c index afbe01149ed3..33544586ffaa 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c @@ -623,7 +623,7 @@ amdgpu_userq_validate_bos(struct amdgpu_userq_mgr *uq_mgr) clear = false; unlock = true; /* The caller is already holding the reservation lock */ - } else if (ticket && dma_resv_locking_ctx(resv) == ticket) { + } else if (dma_resv_locking_ctx(resv) == ticket) { clear = false; unlock = false; /* Somebody else is using the BO right now */ -- cgit v1.2.3 From 30ff75809d0359566fda48883dcc15bf427558e9 Mon Sep 17 00:00:00 2001 From: Sunil Khatri Date: Tue, 22 Apr 2025 17:55:50 +0530 Subject: drm/amdgpu: add drm_file reference in userq_mgr MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit drm_file will be used in usermode queues code to enable better process information in logging and hence add drm_file part of the userq_mgr struct. 
update the drm_file pointer in userq_mgr for each amdgpu_driver_open_kms. Signed-off-by: Sunil Khatri Reviewed-by: Tvrtko Ursulin Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 4 +++- drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h | 4 +++- 3 files changed, 7 insertions(+), 3 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c index 8f992314c5a1..8d4a2aed7231 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c @@ -1431,7 +1431,7 @@ int amdgpu_driver_open_kms(struct drm_device *dev, struct drm_file *file_priv) amdgpu_ctx_mgr_init(&fpriv->ctx_mgr, adev); - r = amdgpu_userq_mgr_init(&fpriv->userq_mgr, adev); + r = amdgpu_userq_mgr_init(&fpriv->userq_mgr, file_priv, adev); if (r) DRM_WARN("Can't setup usermode queues, use legacy workload submission only\n"); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c index 33544586ffaa..c7a3abfccd13 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c @@ -753,11 +753,13 @@ amdgpu_userq_evict(struct amdgpu_userq_mgr *uq_mgr, schedule_delayed_work(&uq_mgr->resume_work, 0); } -int amdgpu_userq_mgr_init(struct amdgpu_userq_mgr *userq_mgr, struct amdgpu_device *adev) +int amdgpu_userq_mgr_init(struct amdgpu_userq_mgr *userq_mgr, struct drm_file *file_priv, + struct amdgpu_device *adev) { mutex_init(&userq_mgr->userq_mutex); idr_init_base(&userq_mgr->userq_idr, 1); userq_mgr->adev = adev; + userq_mgr->file = file_priv; mutex_lock(&adev->userq_mutex); list_add(&userq_mgr->list, &adev->userq_mgr_list); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h index 4d3eb651acf1..ec040c2fd6c9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h @@ -86,6 +86,7 @@ struct amdgpu_userq_mgr { struct amdgpu_device *adev; struct delayed_work resume_work; struct list_head list; + struct drm_file *file; }; struct amdgpu_db_info { @@ -97,7 +98,8 @@ struct amdgpu_db_info { int amdgpu_userq_ioctl(struct drm_device *dev, void *data, struct drm_file *filp); -int amdgpu_userq_mgr_init(struct amdgpu_userq_mgr *userq_mgr, struct amdgpu_device *adev); +int amdgpu_userq_mgr_init(struct amdgpu_userq_mgr *userq_mgr, struct drm_file *file_priv, + struct amdgpu_device *adev); void amdgpu_userq_mgr_fini(struct amdgpu_userq_mgr *userq_mgr); -- cgit v1.2.3 From 8c97cdb1a692eafcf5b6e8fc7e7ecc7eff28984e Mon Sep 17 00:00:00 2001 From: Sunil Khatri Date: Tue, 22 Apr 2025 18:10:22 +0530 Subject: drm/amdgpu: use drm_file_err in fence timeouts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit use drm_file_err instead of DRM_ERROR which adds process and pid information in the userqueue error logging. Sample log: [ 19.802315] amdgpu 0000:0a:00.0: [drm] *ERROR* comm: ibus-x11 pid: 2055 client: Unset ... Couldn't unmap all the queues [ 19.802319] amdgpu 0000:0a:00.0: [drm] *ERROR* comm: ibus-x11 pid: 2055 client: Unset ... Failed to evict userqueue [ 19.838432] amdgpu 0000:0a:00.0: [drm] *ERROR* comm: systemd-logind pid: 1042 client: Unset ... Couldn't unmap all the queues [ 19.838436] amdgpu 0000:0a:00.0: [drm] *ERROR* comm: systemd-logind pid: 1042 client: Unset ... 
Failed to evict userqueue Signed-off-by: Sunil Khatri Reviewed-by: Tvrtko Ursulin Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c index c7a3abfccd13..8b1323babb96 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c @@ -87,15 +87,14 @@ static void amdgpu_userq_wait_for_last_fence(struct amdgpu_userq_mgr *uq_mgr, struct amdgpu_usermode_queue *queue) { - struct amdgpu_device *adev = uq_mgr->adev; struct dma_fence *f = queue->last_fence; int ret; if (f && !dma_fence_is_signaled(f)) { ret = dma_fence_wait_timeout(f, true, msecs_to_jiffies(100)); if (ret <= 0) - dev_err(adev->dev, "Timed out waiting for fence=%llu:%llu\n", - f->context, f->seqno); + drm_file_err(uq_mgr->file, "Timed out waiting for fence=%llu:%llu\n", + f->context, f->seqno); } } @@ -711,8 +710,8 @@ amdgpu_userq_wait_for_signal(struct amdgpu_userq_mgr *uq_mgr) continue; ret = dma_fence_wait_timeout(f, true, msecs_to_jiffies(100)); if (ret <= 0) { - DRM_ERROR("Timed out waiting for fence=%llu:%llu\n", - f->context, f->seqno); + drm_file_err(uq_mgr->file, "Timed out waiting for fence=%llu:%llu\n", + f->context, f->seqno); return -ETIMEDOUT; } } -- cgit v1.2.3 From c46a37628a274ff557e6c1f45e6c00c3c6db60df Mon Sep 17 00:00:00 2001 From: Sunil Khatri Date: Tue, 22 Apr 2025 18:40:03 +0530 Subject: drm/amdgpu: change DRM_ERROR to drm_file_err in amdgpu_userq.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit change the DRM_ERROR and drm_err to drm_file_err to add process name and pid to the logging. 
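For reference, a minimal usage sketch; report_unmap_failure() is illustrative, while drm_file_err() is the real helper this series calls with the struct drm_file cached as uq_mgr->file:

#include <drm/drm_file.h>

static void report_unmap_failure(struct drm_file *file, int err)
{
	/*
	 * drm_file_err() prefixes the message with the client's comm,
	 * pid and DRM client name, as in the sample log above.
	 */
	drm_file_err(file, "Couldn't unmap all the queues (%d)\n", err);
}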
Signed-off-by: Sunil Khatri Reviewed-by: Tvrtko Ursulin Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 63 ++++++++++++++++--------------- 1 file changed, 32 insertions(+), 31 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c index 8b1323babb96..854577d3c37c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c @@ -179,25 +179,25 @@ int amdgpu_userq_create_object(struct amdgpu_userq_mgr *uq_mgr, r = amdgpu_bo_create(adev, &bp, &userq_obj->obj); if (r) { - DRM_ERROR("Failed to allocate BO for userqueue (%d)", r); + drm_file_err(uq_mgr->file, "Failed to allocate BO for userqueue (%d)", r); return r; } r = amdgpu_bo_reserve(userq_obj->obj, true); if (r) { - DRM_ERROR("Failed to reserve BO to map (%d)", r); + drm_file_err(uq_mgr->file, "Failed to reserve BO to map (%d)", r); goto free_obj; } r = amdgpu_ttm_alloc_gart(&(userq_obj->obj)->tbo); if (r) { - DRM_ERROR("Failed to alloc GART for userqueue object (%d)", r); + drm_file_err(uq_mgr->file, "Failed to alloc GART for userqueue object (%d)", r); goto unresv; } r = amdgpu_bo_kmap(userq_obj->obj, &userq_obj->cpu_ptr); if (r) { - DRM_ERROR("Failed to map BO for userqueue (%d)", r); + drm_file_err(uq_mgr->file, "Failed to map BO for userqueue (%d)", r); goto unresv; } @@ -233,7 +233,7 @@ amdgpu_userq_get_doorbell_index(struct amdgpu_userq_mgr *uq_mgr, gobj = drm_gem_object_lookup(filp, db_info->doorbell_handle); if (gobj == NULL) { - DRM_ERROR("Can't find GEM object for doorbell\n"); + drm_file_err(uq_mgr->file, "Can't find GEM object for doorbell\n"); return -EINVAL; } @@ -243,13 +243,13 @@ amdgpu_userq_get_doorbell_index(struct amdgpu_userq_mgr *uq_mgr, /* Pin the BO before generating the index, unpin in queue destroy */ r = amdgpu_bo_pin(db_obj->obj, AMDGPU_GEM_DOMAIN_DOORBELL); if (r) { - DRM_ERROR("[Usermode queues] Failed to pin doorbell object\n"); + drm_file_err(uq_mgr->file, "[Usermode queues] Failed to pin doorbell object\n"); goto unref_bo; } r = amdgpu_bo_reserve(db_obj->obj, true); if (r) { - DRM_ERROR("[Usermode queues] Failed to pin doorbell object\n"); + drm_file_err(uq_mgr->file, "[Usermode queues] Failed to pin doorbell object\n"); goto unpin_bo; } @@ -271,7 +271,8 @@ amdgpu_userq_get_doorbell_index(struct amdgpu_userq_mgr *uq_mgr, break; default: - DRM_ERROR("[Usermode queues] IP %d not support\n", db_info->queue_type); + drm_file_err(uq_mgr->file, "[Usermode queues] IP %d not support\n", + db_info->queue_type); r = -EINVAL; goto unpin_bo; } @@ -356,7 +357,8 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args) if (args->in.ip_type != AMDGPU_HW_IP_GFX && args->in.ip_type != AMDGPU_HW_IP_DMA && args->in.ip_type != AMDGPU_HW_IP_COMPUTE) { - DRM_ERROR("Usermode queue doesn't support IP type %u\n", args->in.ip_type); + drm_file_err(uq_mgr->file, "Usermode queue doesn't support IP type %u\n", + args->in.ip_type); return -EINVAL; } @@ -368,13 +370,13 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args) (args->in.ip_type != AMDGPU_HW_IP_GFX) && (args->in.ip_type != AMDGPU_HW_IP_COMPUTE) && !amdgpu_is_tmz(adev)) { - drm_err(adev_to_drm(adev), "Secure only supported on GFX/Compute queues\n"); + drm_file_err(uq_mgr->file, "Secure only supported on GFX/Compute queues\n"); return -EINVAL; } r = pm_runtime_get_sync(adev_to_drm(adev)->dev); if (r < 0) { - dev_err(adev->dev, 
"pm_runtime_get_sync() failed for userqueue create\n"); + drm_file_err(uq_mgr->file, "pm_runtime_get_sync() failed for userqueue create\n"); pm_runtime_put_autosuspend(adev_to_drm(adev)->dev); return r; } @@ -390,14 +392,15 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args) uq_funcs = adev->userq_funcs[args->in.ip_type]; if (!uq_funcs) { - DRM_ERROR("Usermode queue is not supported for this IP (%u)\n", args->in.ip_type); + drm_file_err(uq_mgr->file, "Usermode queue is not supported for this IP (%u)\n", + args->in.ip_type); r = -EINVAL; goto unlock; } queue = kzalloc(sizeof(struct amdgpu_usermode_queue), GFP_KERNEL); if (!queue) { - DRM_ERROR("Failed to allocate memory for queue\n"); + drm_file_err(uq_mgr->file, "Failed to allocate memory for queue\n"); r = -ENOMEM; goto unlock; } @@ -414,7 +417,7 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args) /* Convert relative doorbell offset into absolute doorbell index */ index = amdgpu_userq_get_doorbell_index(uq_mgr, &db_info, filp); if (index == (uint64_t)-EINVAL) { - DRM_ERROR("Failed to get doorbell for queue\n"); + drm_file_err(uq_mgr->file, "Failed to get doorbell for queue\n"); kfree(queue); goto unlock; } @@ -423,13 +426,13 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args) xa_init_flags(&queue->fence_drv_xa, XA_FLAGS_ALLOC); r = amdgpu_userq_fence_driver_alloc(adev, queue); if (r) { - DRM_ERROR("Failed to alloc fence driver\n"); + drm_file_err(uq_mgr->file, "Failed to alloc fence driver\n"); goto unlock; } r = uq_funcs->mqd_create(uq_mgr, &args->in, queue); if (r) { - DRM_ERROR("Failed to create Queue\n"); + drm_file_err(uq_mgr->file, "Failed to create Queue\n"); amdgpu_userq_fence_driver_free(queue); kfree(queue); goto unlock; @@ -438,7 +441,7 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args) qid = idr_alloc(&uq_mgr->userq_idr, queue, 1, AMDGPU_MAX_USERQ_COUNT, GFP_KERNEL); if (qid < 0) { - DRM_ERROR("Failed to allocate a queue id\n"); + drm_file_err(uq_mgr->file, "Failed to allocate a queue id\n"); amdgpu_userq_fence_driver_free(queue); uq_funcs->mqd_destroy(uq_mgr, queue); kfree(queue); @@ -458,7 +461,7 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args) r = amdgpu_userq_map_helper(uq_mgr, queue); if (r) { mutex_unlock(&adev->userq_mutex); - DRM_ERROR("Failed to map Queue\n"); + drm_file_err(uq_mgr->file, "Failed to map Queue\n"); idr_remove(&uq_mgr->userq_idr, qid); amdgpu_userq_fence_driver_free(queue); uq_funcs->mqd_destroy(uq_mgr, queue); @@ -490,7 +493,7 @@ int amdgpu_userq_ioctl(struct drm_device *dev, void *data, return -EINVAL; r = amdgpu_userq_create(filp, args); if (r) - DRM_ERROR("Failed to create usermode queue\n"); + drm_file_err(filp, "Failed to create usermode queue\n"); break; case AMDGPU_USERQ_OP_FREE: @@ -508,7 +511,7 @@ int amdgpu_userq_ioctl(struct drm_device *dev, void *data, return -EINVAL; r = amdgpu_userq_destroy(filp, args->in.queue_id); if (r) - DRM_ERROR("Failed to destroy usermode queue\n"); + drm_file_err(filp, "Failed to destroy usermode queue\n"); break; default: @@ -522,7 +525,6 @@ int amdgpu_userq_ioctl(struct drm_device *dev, void *data, static int amdgpu_userq_restore_all(struct amdgpu_userq_mgr *uq_mgr) { - struct amdgpu_device *adev = uq_mgr->adev; struct amdgpu_usermode_queue *queue; int queue_id; int ret = 0, r; @@ -535,7 +537,7 @@ amdgpu_userq_restore_all(struct amdgpu_userq_mgr *uq_mgr) } if (ret) - dev_err(adev->dev, "Failed to map all the queues\n"); + 
drm_file_err(uq_mgr->file, "Failed to map all the queues\n"); return ret; } @@ -573,7 +575,7 @@ amdgpu_userq_validate_bos(struct amdgpu_userq_mgr *uq_mgr) ret = amdgpu_vm_lock_pd(vm, &exec, 2); drm_exec_retry_on_contention(&exec); if (unlikely(ret)) { - DRM_ERROR("Failed to lock PD\n"); + drm_file_err(uq_mgr->file, "Failed to lock PD\n"); goto unlock_all; } @@ -613,7 +615,7 @@ amdgpu_userq_validate_bos(struct amdgpu_userq_mgr *uq_mgr) bo = bo_va->base.bo; ret = amdgpu_userq_validate_vm_bo(NULL, bo); if (ret) { - DRM_ERROR("Failed to validate BO\n"); + drm_file_err(uq_mgr->file, "Failed to validate BO\n"); goto unlock_all; } @@ -644,7 +646,7 @@ amdgpu_userq_validate_bos(struct amdgpu_userq_mgr *uq_mgr) ret = amdgpu_eviction_fence_replace_fence(&fpriv->evf_mgr, &exec); if (ret) - DRM_ERROR("Failed to replace eviction fence\n"); + drm_file_err(uq_mgr->file, "Failed to replace eviction fence\n"); unlock_all: drm_exec_fini(&exec); @@ -663,13 +665,13 @@ static void amdgpu_userq_restore_worker(struct work_struct *work) ret = amdgpu_userq_validate_bos(uq_mgr); if (ret) { - DRM_ERROR("Failed to validate BOs to restore\n"); + drm_file_err(uq_mgr->file, "Failed to validate BOs to restore\n"); goto unlock; } ret = amdgpu_userq_restore_all(uq_mgr); if (ret) { - DRM_ERROR("Failed to restore all queues\n"); + drm_file_err(uq_mgr->file, "Failed to restore all queues\n"); goto unlock; } @@ -680,7 +682,6 @@ unlock: static int amdgpu_userq_evict_all(struct amdgpu_userq_mgr *uq_mgr) { - struct amdgpu_device *adev = uq_mgr->adev; struct amdgpu_usermode_queue *queue; int queue_id; int ret = 0, r; @@ -693,7 +694,7 @@ amdgpu_userq_evict_all(struct amdgpu_userq_mgr *uq_mgr) } if (ret) - dev_err(adev->dev, "Couldn't unmap all the queues\n"); + drm_file_err(uq_mgr->file, "Couldn't unmap all the queues\n"); return ret; } @@ -730,13 +731,13 @@ amdgpu_userq_evict(struct amdgpu_userq_mgr *uq_mgr, /* Wait for any pending userqueue fence work to finish */ ret = amdgpu_userq_wait_for_signal(uq_mgr); if (ret) { - DRM_ERROR("Not evicting userqueue, timeout waiting for work\n"); + drm_file_err(uq_mgr->file, "Not evicting userqueue, timeout waiting for work\n"); return; } ret = amdgpu_userq_evict_all(uq_mgr); if (ret) { - DRM_ERROR("Failed to evict userqueue\n"); + drm_file_err(uq_mgr->file, "Failed to evict userqueue\n"); return; } -- cgit v1.2.3 From 71353c1a4f91a11c20eca51a48f550341e2e1556 Mon Sep 17 00:00:00 2001 From: Sunil Khatri Date: Tue, 22 Apr 2025 18:49:02 +0530 Subject: drm/amdgpu: change DRM_DBG_DRIVER to drm_dbg_driver MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit update the functions in amdgpu_userqueues.c from DRM_DBG_DRIVER to drm_dbg_driver so multi gpu instance can be logged in. 
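A small illustrative sketch of the difference (log_index() is not a driver function): the legacy macro carries no device context, while the drm_* variant ties the message to a specific struct drm_device, so output from multiple GPUs can be told apart in dmesg:

#include <drm/drm_device.h>
#include <drm/drm_print.h>

static void log_index(struct drm_device *drm, u64 index)
{
	DRM_DEBUG_DRIVER("doorbell index=%llu\n", index);	/* no device prefix */
	drm_dbg_driver(drm, "doorbell index=%llu\n", index);	/* per-device prefix */
}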
Signed-off-by: Sunil Khatri
Reviewed-by: Tvrtko Ursulin
Reviewed-by: Christian König
Signed-off-by: Alex Deucher
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c')

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
index 854577d3c37c..394b036be1d9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
@@ -279,7 +279,8 @@ amdgpu_userq_get_doorbell_index(struct amdgpu_userq_mgr *uq_mgr,
 	index = amdgpu_doorbell_index_on_bar(uq_mgr->adev, db_obj->obj,
 					     db_info->doorbell_offset, db_size);
 
-	DRM_DEBUG_DRIVER("[Usermode queues] doorbell index=%lld\n", index);
+	drm_dbg_driver(adev_to_drm(uq_mgr->adev),
+		       "[Usermode queues] doorbell index=%lld\n", index);
 	amdgpu_bo_unreserve(db_obj->obj);
 
 	return index;
@@ -305,7 +306,7 @@ amdgpu_userq_destroy(struct drm_file *filp, int queue_id)
 
 	queue = amdgpu_userq_find(uq_mgr, queue_id);
 	if (!queue) {
-		DRM_DEBUG_DRIVER("Invalid queue id to destroy\n");
+		drm_dbg_driver(adev_to_drm(uq_mgr->adev), "Invalid queue id to destroy\n");
 		mutex_unlock(&uq_mgr->userq_mutex);
 		return -EINVAL;
 	}
@@ -515,7 +516,7 @@ int amdgpu_userq_ioctl(struct drm_device *dev, void *data,
 		break;
 
 	default:
-		DRM_DEBUG_DRIVER("Invalid user queue op specified: %d\n", args->in.op);
+		drm_dbg_driver(dev, "Invalid user queue op specified: %d\n", args->in.op);
 		return -EINVAL;
 	}
--
cgit v1.2.3


From f10eb185ad0552d1ba3d16b559c30d7d1b9481fa Mon Sep 17 00:00:00 2001
From: Arvind Yadav
Date: Wed, 7 May 2025 22:06:17 +0530
Subject: drm/amdgpu: Fix NULL dereference in amdgpu_userq_restore_worker
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Switch cancel_delayed_work() to cancel_delayed_work_sync() to ensure
the delayed work has finished executing before proceeding with
resource cleanup. This prevents a potential use-after-free or NULL
dereference if the resume_work is still running during finalization.

BUG: kernel NULL pointer dereference, address: 0000000000000140
[ +0.000050] #PF: supervisor read access in kernel mode
[ +0.000019] #PF: error_code(0x0000) - not-present page
[ +0.000021] PGD 0 P4D 0
[ +0.000015] Oops: Oops: 0000 [#1] PREEMPT SMP NOPTI
[ +0.000021] CPU: 17 UID: 0 PID: 196299 Comm: kworker/17:0 Tainted: G U 6.14.0-org-staging #1
[ +0.000032] Tainted: [U]=USER
[ +0.000015] Hardware name: Gigabyte Technology Co., Ltd. X570 AORUS ELITE/X570 AORUS ELITE, BIOS F39 03/22/2024
[ +0.000029] Workqueue: events amdgpu_userq_restore_worker [amdgpu]
[ +0.000426] RIP: 0010:drm_exec_lock_obj+0x32/0x210 [drm_exec]
[ +0.000025] Code: e5 41 57 41 56 41 55 49 89 f5 41 54 49 89 fc 48 83 ec 08 4c 8b 77 30 4d 85 f6 0f 85 c0 00 00 00 4c 8d 7f 08 48 39 77 38 74 54 <49> 8b bd f8 00 00 00 4c 89 fe 41 f6 04 24 01 75 3c e8 08 50 bc e0
[ +0.000046] RSP: 0018:ffffab1b04da3ce8 EFLAGS: 00010297
[ +0.000020] RAX: 0000000000000001 RBX: ffff930cc60e4bc0 RCX: 0000000000000000
[ +0.000025] RDX: 0000000000000004 RSI: 0000000000000048 RDI: ffffab1b04da3d88
[ +0.000028] RBP: ffffab1b04da3d10 R08: ffff930cc60e4000 R09: 0000000000000000
[ +0.000022] R10: ffffab1b04da3d18 R11: 0000000000000001 R12: ffffab1b04da3d88
[ +0.000023] R13: 0000000000000048 R14: 0000000000000000 R15: ffffab1b04da3d90
[ +0.000023] FS: 0000000000000000(0000) GS:ffff9313dea80000(0000) knlGS:0000000000000000
[ +0.000024] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ +0.000021] CR2: 0000000000000140 CR3: 000000018351a000 CR4: 0000000000350ef0
[ +0.000025] Call Trace:
[ +0.000018] <TASK>
[ +0.000015] ? show_regs+0x69/0x80
[ +0.000022] ? __die+0x25/0x70
[ +0.000019] ? page_fault_oops+0x15d/0x510
[ +0.000024] ? do_user_addr_fault+0x312/0x690
[ +0.000024] ? sched_clock_cpu+0x10/0x1a0
[ +0.000028] ? exc_page_fault+0x78/0x1b0
[ +0.000025] ? asm_exc_page_fault+0x27/0x30
[ +0.000024] ? drm_exec_lock_obj+0x32/0x210 [drm_exec]
[ +0.000024] drm_exec_prepare_obj+0x21/0x60 [drm_exec]
[ +0.000021] amdgpu_vm_lock_pd+0x22/0x30 [amdgpu]
[ +0.000266] amdgpu_userq_validate_bos+0x6c/0x320 [amdgpu]
[ +0.000333] amdgpu_userq_restore_worker+0x4a/0x120 [amdgpu]
[ +0.000316] process_one_work+0x189/0x3c0
[ +0.000021] worker_thread+0x2a4/0x3b0
[ +0.000022] kthread+0x109/0x220
[ +0.000018] ? __pfx_worker_thread+0x10/0x10
[ +0.000779] ? _raw_spin_unlock_irq+0x1f/0x40
[ +0.000560] ? __pfx_kthread+0x10/0x10
[ +0.000543] ret_from_fork+0x3c/0x60
[ +0.000507] ? __pfx_kthread+0x10/0x10
[ +0.000515] ret_from_fork_asm+0x1a/0x30
[ +0.000515] </TASK>

v2: Replace cancel_delayed_work() with cancel_delayed_work_sync() in
amdgpu_userq_destroy() and amdgpu_userq_evict().
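In sketch form, the difference between the two cancel calls on a
teardown path (simplified; surrounding cleanup elided):

    /* cancel_delayed_work() only removes a not-yet-started work item.
     * If amdgpu_userq_restore_worker() is already executing, it keeps
     * running while the caller frees the state it dereferences -- the
     * use-after-free / NULL dereference captured above.
     */
    cancel_delayed_work(&uq_mgr->resume_work);

    /* cancel_delayed_work_sync() additionally waits for a running
     * handler to finish, so the teardown that follows cannot race
     * with the worker.
     */
    cancel_delayed_work_sync(&uq_mgr->resume_work);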
Cc: Alex Deucher
Reviewed-by: Christian König
Reviewed-by: Sunil Khatri
Signed-off-by: Arvind Yadav
Signed-off-by: Alex Deucher
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c')

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
index 394b036be1d9..ac3ab4cc8e5a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
@@ -301,7 +301,7 @@ amdgpu_userq_destroy(struct drm_file *filp, int queue_id)
 	struct amdgpu_usermode_queue *queue;
 	int r = 0;
 
-	cancel_delayed_work(&uq_mgr->resume_work);
+	cancel_delayed_work_sync(&uq_mgr->resume_work);
 	mutex_lock(&uq_mgr->userq_mutex);
 
 	queue = amdgpu_userq_find(uq_mgr, queue_id);
@@ -746,7 +746,7 @@ amdgpu_userq_evict(struct amdgpu_userq_mgr *uq_mgr,
 	amdgpu_eviction_fence_signal(evf_mgr, ev_fence);
 
 	if (evf_mgr->fd_closing) {
-		cancel_delayed_work(&uq_mgr->resume_work);
+		cancel_delayed_work_sync(&uq_mgr->resume_work);
 		return;
 	}
@@ -777,7 +777,7 @@ void amdgpu_userq_mgr_fini(struct amdgpu_userq_mgr *userq_mgr)
 	struct amdgpu_userq_mgr *uqm, *tmp;
 	uint32_t queue_id;
 
-	cancel_delayed_work(&userq_mgr->resume_work);
+	cancel_delayed_work_sync(&userq_mgr->resume_work);
 	mutex_lock(&userq_mgr->userq_mutex);
 
 	idr_for_each_entry(&userq_mgr->userq_idr, queue, queue_id) {
--
cgit v1.2.3


From 648a0dc0d78c369233b16878e4f351efe7fd8df6 Mon Sep 17 00:00:00 2001
From: "Jesse.Zhang"
Date: Fri, 9 May 2025 17:18:16 +0800
Subject: drm/amdgpu: Fix user queue deadlock by reordering mutex locking
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This resolves a deadlock between user queue management and GPU reset
paths by enforcing consistent lock ordering.

The deadlock occurred when:

1. Process exit path (amdgpu_userq_mgr_fini) would:
   - Take uqm->userq_mutex
   - Then try to take adev->userq_mutex for list operations

2. GPU reset path (amdgpu_userq_pre_reset) would:
   - Take adev->userq_mutex first (for list traversal)
   - Then take uqm->userq_mutex

The solution establishes a strict top-down locking order:

1. Always take adev->userq_mutex before any uqm->userq_mutex
2. Maintain this order consistently across all code paths

Changes made:
- Reordered locking in amdgpu_userq_mgr_fini() to take the device lock first
- Kept the existing proper order in amdgpu_userq_pre_reset()
- Simplified the fini flow by removing redundant operations

This prevents circular dependencies while maintaining thread safety
during both normal operation and GPU reset scenarios.
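Sketched as simplified pseudocode of the fini path (cleanup details
elided, not the exact driver code), the ordering discipline is:

    /* Any path that needs both locks takes them top-down: */
    mutex_lock(&adev->userq_mutex);        /* 1: device-global lock  */
    mutex_lock(&userq_mgr->userq_mutex);   /* 2: per-file queue lock */

    /* ... unmap and clean up queues, list_del() from
     * adev->userq_mgr_list ...
     */

    mutex_unlock(&userq_mgr->userq_mutex);
    mutex_unlock(&adev->userq_mutex);

Because amdgpu_userq_pre_reset() already takes the locks in this
order, neither path can hold one mutex while waiting for the other,
which breaks the AB-BA cycle.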
Fixes: 4ce60dbada96 ("drm/amdgpu: store userq_managers in a list in adev")
Reviewed-by: Christian König
Reviewed-by: Arvind Yadav
Signed-off-by: Jesse Zhang
Signed-off-by: Alex Deucher
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c')

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
index ac3ab4cc8e5a..9d1f185a815e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
@@ -779,22 +779,23 @@ void amdgpu_userq_mgr_fini(struct amdgpu_userq_mgr *userq_mgr)
 
 	cancel_delayed_work_sync(&userq_mgr->resume_work);
 
+	mutex_lock(&adev->userq_mutex);
 	mutex_lock(&userq_mgr->userq_mutex);
 	idr_for_each_entry(&userq_mgr->userq_idr, queue, queue_id) {
 		amdgpu_userq_wait_for_last_fence(userq_mgr, queue);
 		amdgpu_userq_unmap_helper(userq_mgr, queue);
 		amdgpu_userq_cleanup(userq_mgr, queue, queue_id);
 	}
-	mutex_lock(&adev->userq_mutex);
+
 	list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
 		if (uqm == userq_mgr) {
 			list_del(&uqm->list);
 			break;
 		}
 	}
-	mutex_unlock(&adev->userq_mutex);
+
 	idr_destroy(&userq_mgr->userq_idr);
 	mutex_unlock(&userq_mgr->userq_mutex);
+	mutex_unlock(&adev->userq_mutex);
 	mutex_destroy(&userq_mgr->userq_mutex);
 }
--
cgit v1.2.3


From bc5bab82d36067206a5cb865313f354e0dc911ae Mon Sep 17 00:00:00 2001
From: Arunpravin Paneer Selvam
Date: Tue, 13 May 2025 17:07:58 +0530
Subject: drm/amdgpu: Fix userq ttm_bo_pin and ttm_bo_unpin lockdep warnings
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The ttm_bo_pin and ttm_bo_unpin lockdep warnings are resolved by
moving the doorbell BO reserve up, so the BO is reserved before it is
pinned or unpinned.

WARNING: CPU: 11 PID: 1818 at drivers/gpu/drm/ttm/ttm_bo.c:592 ttm_bo_pin+0x1f6/0x270 [ttm]
[ +0.000277] CPU: 11 UID: 1000 PID: 1818 Comm: Xwayland Tainted: G W 6.12.0+ #15
[ +0.000006] Tainted: [W]=WARN
[ +0.000004] Hardware name: ASUS System Product Name/TUF GAMING B650-PLUS, BIOS 3072 12/20/2024
[ +0.000004] RIP: 0010:ttm_bo_pin+0x1f6/0x270 [ttm]
[ +0.000005] RSP: 0018:ffff88846ca879d0 EFLAGS: 00010246
[ +0.000007] RAX: 0000000000000000 RBX: ffff88810b7ca848 RCX: 0000000000000000
[ +0.000004] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000
[ +0.000005] RBP: ffff88846ca879e8 R08: 0000000000000000 R09: 0000000000000000
[ +0.000004] R10: 0000000000000000 R11: 0000000000000000 R12: ffff88810b7ca848
[ +0.000004] R13: ffff88846c666250 R14: 1ffff1108d950f44 R15: ffff88846ca87aa0
[ +0.000005] FS: 00007c45ff436d00(0000) GS:ffff888409580000(0000) knlGS:0000000000000000
[ +0.000004] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ +0.000005] CR2: 00005b0c142a60e0 CR3: 000000012ce5a000 CR4: 0000000000f50ef0
[ +0.000004] PKRU: 55555554
[ +0.000004] Call Trace:
[ +0.000004] <TASK>
[ +0.000005] ? show_regs+0x6c/0x80
[ +0.000007] ? __warn+0xd2/0x2d0
[ +0.000007] ? ttm_bo_pin+0x1f6/0x270 [ttm]
[ +0.000031] ? report_bug+0x282/0x2f0
[ +0.000012] ? handle_bug+0x6e/0xc0
[ +0.000007] ? exc_invalid_op+0x18/0x50
[ +0.000007] ? asm_exc_invalid_op+0x1b/0x20
[ +0.000017] ? ttm_bo_pin+0x1f6/0x270 [ttm]
[ +0.000014] amdgpu_bo_pin+0x365/0x9d0 [amdgpu]
[ +0.000191] ? __pfx_amdgpu_bo_pin+0x10/0x10 [amdgpu]
[ +0.000185] ? drm_gem_object_lookup+0x81/0xc0
[ +0.000008] ? kasan_save_alloc_info+0x37/0x60
[ +0.000007] ? __kasan_kmalloc+0xc3/0xd0
[ +0.000013] amdgpu_userqueue_get_doorbell_index+0xee/0x5f0 [amdgpu]
[ +0.000209] amdgpu_userq_ioctl+0x6b4/0xd40 [amdgpu]
[ +0.000193] ? __pfx_amdgpu_userq_ioctl+0x10/0x10 [amdgpu]
[ +0.000211] ? lock_acquire+0x7c/0xc0
[ +0.000006] ? drm_dev_enter+0x51/0x190
[ +0.000015] drm_ioctl_kernel+0x18b/0x330
[ +0.000007] ? __pfx_amdgpu_userq_ioctl+0x10/0x10 [amdgpu]
[ +0.000190] ? __pfx_drm_ioctl_kernel+0x10/0x10
[ +0.000005] ? lock_acquire+0x7c/0xc0
[ +0.000009] ? srso_alias_return_thunk+0x5/0xfbef5
[ +0.000005] ? __kasan_check_write+0x14/0x30
[ +0.000005] ? srso_alias_return_thunk+0x5/0xfbef5
[ +0.000011] drm_ioctl+0x589/0xd00
[ +0.000005] ? srso_alias_return_thunk+0x5/0xfbef5
[ +0.000006] ? __pfx_amdgpu_userq_ioctl+0x10/0x10 [amdgpu]
[ +0.000194] ? __pfx_drm_ioctl+0x10/0x10
[ +0.000006] ? __pm_runtime_resume+0x80/0x110
[ +0.000021] ? srso_alias_return_thunk+0x5/0xfbef5
[ +0.000005] ? trace_hardirqs_on+0x53/0x60
[ +0.000005] ? srso_alias_return_thunk+0x5/0xfbef5
[ +0.000005] ? _raw_spin_unlock_irqrestore+0x51/0x80
[ +0.000013] amdgpu_drm_ioctl+0xd2/0x1c0 [amdgpu]
[ +0.000185] __x64_sys_ioctl+0x13a/0x1c0
[ +0.000010] x64_sys_call+0x11ad/0x25f0
[ +0.000007] do_syscall_64+0x91/0x180
[ +0.000007] ? srso_alias_return_thunk+0x5/0xfbef5
[ +0.000005] ? irqentry_exit+0x77/0xb0
[ +0.000005] ? srso_alias_return_thunk+0x5/0xfbef5
[ +0.000005] ? exc_page_fault+0x93/0x150
[ +0.000009] entry_SYSCALL_64_after_hwframe+0x76/0x7e
[ +0.000005] RIP: 0033:0x7c45ff924ded
[ +0.000005] RSP: 002b:00007ffff7167810 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
[ +0.000008] RAX: ffffffffffffffda RBX: 00000000c0486456 RCX: 00007c45ff924ded
[ +0.000004] RDX: 00007ffff7167870 RSI: 00000000c0486456 RDI: 000000000000000b
[ +0.000004] RBP: 00007ffff7167860 R08: ffff800100000000 R09: 0000000000010000
[ +0.000005] R10: 00007ffff7167950 R11: 0000000000000246 R12: 00005b0c2a51bc48
[ +0.000004] R13: 000000000000000b R14: 0000000000000000 R15: 00007ffff7167950
[ +0.000022] </TASK>
[ +0.000004] irq event stamp: 80693
[ +0.000004] hardirqs last enabled at (80699): [] __up_console_sem+0x79/0xa0
[ +0.000005] hardirqs last disabled at (80704): [] __up_console_sem+0x5e/0xa0
[ +0.000005] softirqs last enabled at (80390): [] __irq_exit_rcu+0x17e/0x1d0
[ +0.000005] softirqs last disabled at (80385): [] __irq_exit_rcu+0x17e/0x1d0
[ +0.000006] ---[ end trace 0000000000000000 ]---
------------------------------------------------------------------------------------------------------
[ +0.000006] WARNING: CPU: 10 PID: 1818 at drivers/gpu/drm/ttm/ttm_bo.c:611 ttm_bo_unpin+0x21f/0x2c0 [ttm]
[ +0.000280] CPU: 10 UID: 1000 PID: 1818 Comm: Xwayland Tainted: G W 6.12.0+ #15
[ +0.000006] Tainted: [W]=WARN
[ +0.000004] Hardware name: ASUS System Product Name/TUF GAMING B650-PLUS, BIOS 3072 12/20/2024
[ +0.000004] RIP: 0010:ttm_bo_unpin+0x21f/0x2c0 [ttm]
[ +0.000005] RSP: 0018:ffff88846ca87888 EFLAGS: 00010246
[ +0.000007] RAX: 0000000000000000 RBX: ffff88810b7ca848 RCX: 0000000000000000
[ +0.000005] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000
[ +0.000004] RBP: ffff88846ca878a0 R08: 0000000000000000 R09: 0000000000000000
[ +0.000004] R10: 0000000000000000 R11: 0000000000000000 R12: ffff888164e90050
[ +0.000005] R13: ffff88846c666200 R14: 0000000000000001 R15: ffff888168402d28
[ +0.000004] FS: 00007c45ff436d00(0000) GS:ffff888409500000(0000) knlGS:0000000000000000
[ +0.000005] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ +0.000004] CR2: 00007c45f7373b20 CR3: 000000012ce5a000 CR4: 0000000000f50ef0
[ +0.000005] PKRU: 55555554
[ +0.000004] Call Trace:
[ +0.000004] <TASK>
[ +0.000005] ? show_regs+0x6c/0x80
[ +0.000008] ? __warn+0xd2/0x2d0
[ +0.000007] ? ttm_bo_unpin+0x21f/0x2c0 [ttm]
[ +0.000012] ? report_bug+0x282/0x2f0
[ +0.000013] ? handle_bug+0x6e/0xc0
[ +0.000006] ? exc_invalid_op+0x18/0x50
[ +0.000008] ? asm_exc_invalid_op+0x1b/0x20
[ +0.000017] ? ttm_bo_unpin+0x21f/0x2c0 [ttm]
[ +0.000011] ? ttm_bo_unpin+0x217/0x2c0 [ttm]
[ +0.000011] amdgpu_bo_unpin+0x45/0x250 [amdgpu]
[ +0.000216] amdgpu_userq_ioctl+0x2c3/0xd40 [amdgpu]
[ +0.000226] ? drm_dev_exit+0x2d/0x60
[ +0.000010] ? __pfx_amdgpu_userq_ioctl+0x10/0x10 [amdgpu]
[ +0.000201] ? srso_alias_return_thunk+0x5/0xfbef5
[ +0.000005] ? lock_acquire+0x7c/0xc0
[ +0.000006] ? drm_dev_enter+0x51/0x190
[ +0.000015] drm_ioctl_kernel+0x18b/0x330
[ +0.000007] ? __pfx_amdgpu_userq_ioctl+0x10/0x10 [amdgpu]
[ +0.000188] ? __pfx_drm_ioctl_kernel+0x10/0x10
[ +0.000006] ? lock_acquire+0x7c/0xc0
[ +0.000008] ? srso_alias_return_thunk+0x5/0xfbef5
[ +0.000005] ? __kasan_check_write+0x14/0x30
[ +0.000006] ? srso_alias_return_thunk+0x5/0xfbef5
[ +0.000010] drm_ioctl+0x589/0xd00
[ +0.000005] ? srso_alias_return_thunk+0x5/0xfbef5
[ +0.000006] ? __pfx_amdgpu_userq_ioctl+0x10/0x10 [amdgpu]
[ +0.000211] ? __pfx_drm_ioctl+0x10/0x10
[ +0.000006] ? __pm_runtime_resume+0x80/0x110
[ +0.000020] ? srso_alias_return_thunk+0x5/0xfbef5
[ +0.000006] ? trace_hardirqs_on+0x53/0x60
[ +0.000005] ? srso_alias_return_thunk+0x5/0xfbef5
[ +0.000005] ? _raw_spin_unlock_irqrestore+0x51/0x80
[ +0.000013] amdgpu_drm_ioctl+0xd2/0x1c0 [amdgpu]
[ +0.000186] __x64_sys_ioctl+0x13a/0x1c0
[ +0.000010] x64_sys_call+0x11ad/0x25f0
[ +0.000007] do_syscall_64+0x91/0x180
[ +0.000007] ? srso_alias_return_thunk+0x5/0xfbef5
[ +0.000005] ? do_syscall_64+0x9d/0x180
[ +0.000007] ? srso_alias_return_thunk+0x5/0xfbef5
[ +0.000010] ? __pfx___rseq_handle_notify_resume+0x10/0x10
[ +0.000005] ? __pfx_blkcg_maybe_throttle_current+0x10/0x10
[ +0.000013] ? srso_alias_return_thunk+0x5/0xfbef5
[ +0.000009] ? srso_alias_return_thunk+0x5/0xfbef5
[ +0.000008] ? srso_alias_return_thunk+0x5/0xfbef5
[ +0.000005] ? syscall_exit_to_user_mode+0x95/0x260
[ +0.000008] ? srso_alias_return_thunk+0x5/0xfbef5
[ +0.000005] ? do_syscall_64+0x9d/0x180
[ +0.000007] ? srso_alias_return_thunk+0x5/0xfbef5
[ +0.000005] ? do_syscall_64+0x9d/0x180
[ +0.000011] ? srso_alias_return_thunk+0x5/0xfbef5
[ +0.000010] ? srso_alias_return_thunk+0x5/0xfbef5
[ +0.000009] ? srso_alias_return_thunk+0x5/0xfbef5
[ +0.000008] ? srso_alias_return_thunk+0x5/0xfbef5
[ +0.000005] ? irqentry_exit_to_user_mode+0x8b/0x260
[ +0.000007] ? srso_alias_return_thunk+0x5/0xfbef5
[ +0.000006] ? irqentry_exit+0x77/0xb0
[ +0.000004] ? srso_alias_return_thunk+0x5/0xfbef5
[ +0.000005] ? exc_page_fault+0x93/0x150
[ +0.000010] entry_SYSCALL_64_after_hwframe+0x76/0x7e
[ +0.000005] RIP: 0033:0x7c45ff924ded
[ +0.000005] RSP: 002b:00007ffff7168790 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
[ +0.000008] RAX: ffffffffffffffda RBX: 00000000c0486456 RCX: 00007c45ff924ded
[ +0.000005] RDX: 00007ffff71687f0 RSI: 00000000c0486456 RDI: 000000000000000b
[ +0.000004] RBP: 00007ffff71687e0 R08: 00005b0c2a49b010 R09: 0000000000000007
[ +0.000004] R10: 00005b0c2a4d7140 R11: 0000000000000246 R12: 000000000000000b
[ +0.000004] R13: 00007c45ff19e5cc R14: 00005b0c2a51c538 R15: 00005b0c2a51bbd8
[ +0.000022] </TASK>
[ +0.000005] irq event stamp: 87419
[ +0.000004] hardirqs last enabled at (87425): [] __up_console_sem+0x79/0xa0
[ +0.000005] hardirqs last disabled at (87430): [] __up_console_sem+0x5e/0xa0
[ +0.000005] softirqs last enabled at (87058): [] __irq_exit_rcu+0x17e/0x1d0
[ +0.000005] softirqs last disabled at (87053): [] __irq_exit_rcu+0x17e/0x1d0
[ +0.000005] ---[ end trace 0000000000000000 ]---

Signed-off-by: Arunpravin Paneer Selvam
Reviewed-by: Christian König
Reviewed-by: Jesse Zhang
Signed-off-by: Alex Deucher
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

(limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c')

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
index 9d1f185a815e..80401a37af77 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
@@ -240,17 +240,17 @@ amdgpu_userq_get_doorbell_index(struct amdgpu_userq_mgr *uq_mgr,
 	db_obj->obj = amdgpu_bo_ref(gem_to_amdgpu_bo(gobj));
 	drm_gem_object_put(gobj);
 
-	/* Pin the BO before generating the index, unpin in queue destroy */
-	r = amdgpu_bo_pin(db_obj->obj, AMDGPU_GEM_DOMAIN_DOORBELL);
+	r = amdgpu_bo_reserve(db_obj->obj, true);
 	if (r) {
 		drm_file_err(uq_mgr->file, "[Usermode queues] Failed to pin doorbell object\n");
 		goto unref_bo;
 	}
 
-	r = amdgpu_bo_reserve(db_obj->obj, true);
+	/* Pin the BO before generating the index, unpin in queue destroy */
+	r = amdgpu_bo_pin(db_obj->obj, AMDGPU_GEM_DOMAIN_DOORBELL);
 	if (r) {
 		drm_file_err(uq_mgr->file, "[Usermode queues] Failed to pin doorbell object\n");
-		goto unpin_bo;
+		goto unresv_bo;
 	}
 
 	switch (db_info->queue_type) {
@@ -286,7 +286,8 @@ amdgpu_userq_get_doorbell_index(struct amdgpu_userq_mgr *uq_mgr,
 
 unpin_bo:
 	amdgpu_bo_unpin(db_obj->obj);
-
+unresv_bo:
+	amdgpu_bo_unreserve(db_obj->obj);
 unref_bo:
 	amdgpu_bo_unref(&db_obj->obj);
 	return r;
@@ -311,9 +312,13 @@ amdgpu_userq_destroy(struct drm_file *filp, int queue_id)
 		return -EINVAL;
 	}
 	amdgpu_userq_wait_for_last_fence(uq_mgr, queue);
-	r = amdgpu_userq_unmap_helper(uq_mgr, queue);
-	amdgpu_bo_unpin(queue->db_obj.obj);
+	r = amdgpu_bo_reserve(queue->db_obj.obj, true);
+	if (!r) {
+		amdgpu_bo_unpin(queue->db_obj.obj);
+		amdgpu_bo_unreserve(queue->db_obj.obj);
+	}
 	amdgpu_bo_unref(&queue->db_obj.obj);
+	r = amdgpu_userq_unmap_helper(uq_mgr, queue);
 	amdgpu_userq_cleanup(uq_mgr, queue, queue_id);
 
 	mutex_unlock(&uq_mgr->userq_mutex);
--
cgit v1.2.3


From 96a86dcb5b5ca65564efeaaeecfca669e79a57e1 Mon Sep 17 00:00:00 2001
From: "Jesse.Zhang"
Date: Tue, 13 May 2025 10:09:04 +0800
Subject: drm/amdgpu: Fix circular locking in userq creation

A circular locking dependency was detected between the global
`adev->userq_mutex` and per-file `userq_mgr->userq_mutex` when
creating user queues. The issue occurs because:
1. `amdgpu_userq_suspend()` and `amdgpu_userq_resume()` take
   `adev->userq_mutex` first, then `userq_mgr->userq_mutex`
2. `amdgpu_userq_create()` takes them in the reverse order

This patch resolves the issue by:

1. Moving the `adev->userq_mutex` lock earlier in
   `amdgpu_userq_create()` to cover the
   `amdgpu_userq_ensure_ev_fence()` call
2. Releasing it after we're done with both queue creation and the
   scheduling halt check

v2: remove unused adev->userq_mutex lock (Prike)

Signed-off-by: Jesse Zhang
Reviewed-by: Prike Liang
Signed-off-by: Alex Deucher
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c')

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
index 80401a37af77..295e7186e156 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
@@ -394,6 +394,7 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args)
 	 *
 	 * This will also make sure we have a valid eviction fence ready to be used.
 	 */
+	mutex_lock(&adev->userq_mutex);
 	amdgpu_userq_ensure_ev_fence(&fpriv->userq_mgr, &fpriv->evf_mgr);
 
 	uq_funcs = adev->userq_funcs[args->in.ip_type];
@@ -456,7 +457,6 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args)
 	}
 
 	/* don't map the queue if scheduling is halted */
-	mutex_lock(&adev->userq_mutex);
 	if (adev->userq_halt_for_enforce_isolation &&
 	    ((queue->queue_type == AMDGPU_HW_IP_GFX) ||
 	     (queue->queue_type == AMDGPU_HW_IP_COMPUTE)))
@@ -466,7 +466,6 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args)
 	if (!skip_map_queue) {
 		r = amdgpu_userq_map_helper(uq_mgr, queue);
 		if (r) {
-			mutex_unlock(&adev->userq_mutex);
 			drm_file_err(uq_mgr->file, "Failed to map Queue\n");
 			idr_remove(&uq_mgr->userq_idr, qid);
 			amdgpu_userq_fence_driver_free(queue);
@@ -475,13 +474,13 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args)
 			goto unlock;
 		}
 	}
-	mutex_unlock(&adev->userq_mutex);
 
 	args->out.queue_id = qid;
 
 unlock:
 	mutex_unlock(&uq_mgr->userq_mutex);
+	mutex_unlock(&adev->userq_mutex);
 
 	return r;
 }
--
cgit v1.2.3
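Sketched, the corrected create path now nests the per-file lock inside
the global one, matching suspend/resume (simplified; queue setup and
error paths elided):

    mutex_lock(&adev->userq_mutex);        /* global lock first */

    /* amdgpu_userq_ensure_ev_fence() returns with
     * fpriv->userq_mgr.userq_mutex held, so the per-file lock is now
     * always acquired after the global one.
     */
    amdgpu_userq_ensure_ev_fence(&fpriv->userq_mgr, &fpriv->evf_mgr);

    /* ... create the MQD, allocate a queue id, map the queue unless
     * scheduling is halted for enforce-isolation ...
     */

    mutex_unlock(&uq_mgr->userq_mutex);    /* per-file lock released */
    mutex_unlock(&adev->userq_mutex);      /* global lock released   */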