Skip to content

Commit 0ef930e

Browse files
jokim-amd authored and alexdeucher committed
drm/amdgpu: fix hung reset queue array memory allocation
By design, the MES will return an array result that is twice the number of hung doorbells it can report, i.e. if up to k reported doorbells are supported, then the second half of the array, also of length k, holds the HQD information (type/queue/pipe), where queue 1 corresponds to index 0 and k, queue 2 corresponds to index 1 and k + 1, etc. The driver will use the HQD info to target queue/pipe reset for hardware scheduled user compute queues. Signed-off-by: Jonathan Kim <[email protected]> Reviewed-by: Alex Deucher <[email protected]> Signed-off-by: Alex Deucher <[email protected]>
1 parent 8745ca5 commit 0ef930e

5 files changed

Lines changed: 20 additions & 10 deletions

File tree

drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -420,12 +420,17 @@ int amdgpu_mes_detect_and_reset_hung_queues(struct amdgpu_device *adev,
420420
dev_err(adev->dev, "failed to detect and reset\n");
421421
} else {
422422
*hung_db_num = 0;
423-
for (i = 0; i < adev->mes.hung_queue_db_array_size; i++) {
423+
for (i = 0; i < adev->mes.hung_queue_hqd_info_offset; i++) {
424424
if (db_array[i] != AMDGPU_MES_INVALID_DB_OFFSET) {
425425
hung_db_array[i] = db_array[i];
426426
*hung_db_num += 1;
427427
}
428428
}
429+
430+
/*
431+
* TODO: return HQD info for MES scheduled user compute queue reset cases
432+
* stored in hung_db_array hqd info offset to full array size
433+
*/
429434
}
430435

431436
return r;

drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,7 @@ struct amdgpu_mes {
149149
void *resource_1_addr[AMDGPU_MAX_MES_PIPES];
150150

151151
int hung_queue_db_array_size;
152+
int hung_queue_hqd_info_offset;
152153
struct amdgpu_bo *hung_queue_db_array_gpu_obj;
153154
uint64_t hung_queue_db_array_gpu_addr;
154155
void *hung_queue_db_array_cpu_addr;

drivers/gpu/drm/amd/amdgpu/mes_userqueue.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -208,10 +208,10 @@ static int mes_userq_detect_and_reset(struct amdgpu_device *adev,
208208
struct amdgpu_userq_mgr *uqm, *tmp;
209209
unsigned int hung_db_num = 0;
210210
int queue_id, r, i;
211-
u32 db_array[4];
211+
u32 db_array[8];
212212

213-
if (db_array_size > 4) {
214-
dev_err(adev->dev, "DB array size (%d vs 4) too small\n",
213+
if (db_array_size > 8) {
214+
dev_err(adev->dev, "DB array size (%d vs 8) too small\n",
215215
db_array_size);
216216
return -EINVAL;
217217
}

drivers/gpu/drm/amd/amdgpu/mes_v11_0.c

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,8 @@ static int mes_v11_0_kiq_hw_fini(struct amdgpu_device *adev);
6666
#define GFX_MES_DRAM_SIZE 0x80000
6767
#define MES11_HW_RESOURCE_1_SIZE (128 * AMDGPU_GPU_PAGE_SIZE)
6868

69-
#define MES11_HUNG_DB_OFFSET_ARRAY_SIZE 4
69+
#define MES11_HUNG_DB_OFFSET_ARRAY_SIZE 8 /* [0:3] = db offset, [4:7] = hqd info */
70+
#define MES11_HUNG_HQD_INFO_OFFSET 4
7071

7172
static void mes_v11_0_ring_set_wptr(struct amdgpu_ring *ring)
7273
{
@@ -1720,8 +1721,9 @@ static int mes_v11_0_early_init(struct amdgpu_ip_block *ip_block)
17201721
struct amdgpu_device *adev = ip_block->adev;
17211722
int pipe, r;
17221723

1723-
adev->mes.hung_queue_db_array_size =
1724-
MES11_HUNG_DB_OFFSET_ARRAY_SIZE;
1724+
adev->mes.hung_queue_db_array_size = MES11_HUNG_DB_OFFSET_ARRAY_SIZE;
1725+
adev->mes.hung_queue_hqd_info_offset = MES11_HUNG_HQD_INFO_OFFSET;
1726+
17251727
for (pipe = 0; pipe < AMDGPU_MAX_MES_PIPES; pipe++) {
17261728
if (!adev->enable_mes_kiq && pipe == AMDGPU_MES_KIQ_PIPE)
17271729
continue;

drivers/gpu/drm/amd/amdgpu/mes_v12_0.c

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,8 @@ static int mes_v12_0_kiq_hw_fini(struct amdgpu_device *adev);
4747

4848
#define MES_EOP_SIZE 2048
4949

50-
#define MES12_HUNG_DB_OFFSET_ARRAY_SIZE 4
50+
#define MES12_HUNG_DB_OFFSET_ARRAY_SIZE 8 /* [0:3] = db offset [4:7] hqd info */
51+
#define MES12_HUNG_HQD_INFO_OFFSET 4
5152

5253
static void mes_v12_0_ring_set_wptr(struct amdgpu_ring *ring)
5354
{
@@ -1904,8 +1905,9 @@ static int mes_v12_0_early_init(struct amdgpu_ip_block *ip_block)
19041905
struct amdgpu_device *adev = ip_block->adev;
19051906
int pipe, r;
19061907

1907-
adev->mes.hung_queue_db_array_size =
1908-
MES12_HUNG_DB_OFFSET_ARRAY_SIZE;
1908+
adev->mes.hung_queue_db_array_size = MES12_HUNG_DB_OFFSET_ARRAY_SIZE;
1909+
adev->mes.hung_queue_hqd_info_offset = MES12_HUNG_HQD_INFO_OFFSET;
1910+
19091911
for (pipe = 0; pipe < AMDGPU_MAX_MES_PIPES; pipe++) {
19101912
r = amdgpu_mes_init_microcode(adev, pipe);
19111913
if (r)

0 commit comments

Comments
 (0)