summaryrefslogtreecommitdiff
path: root/pkgs/development/rocm-modules/rocm-runtime/queue-failure.patch
blob: f9087144bc229bdfd47cc24e55807815172ff422 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
From b7717446c85d08b2d7c0c60ba3ac0eff11ee6120 Mon Sep 17 00:00:00 2001
From: Luna Nova <git@lunnova.dev>
Date: Tue, 20 Jan 2026 12:55:45 -0800
Subject: [PATCH 1/2] rocm-runtime: fix crash in QueueCreate due to trying to
 free non allocated scratch

Check that scratch.main_queue_base != nullptr before calling
ReleaseQueueMainScratch, because ReleaseQueueMainScratch is only valid if
main_queue_base is set and the scope guard can fire when allocating the
queue fails.
---
 .../runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp        | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp b/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp
index 01b01fe869..83db40dacc 100644
--- a/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp
+++ b/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp
@@ -1792,7 +1792,9 @@ hsa_status_t GpuAgent::QueueCreate(size_t size, hsa_queue_type32_t queue_type, u
   scratch.main_queue_base = nullptr;
   scratch.main_queue_process_offset = 0;
 
-  MAKE_NAMED_SCOPE_GUARD(scratchGuard, [&]() { ReleaseQueueMainScratch(scratch); });
+  MAKE_NAMED_SCOPE_GUARD(scratchGuard, [&]() {
+    if (scratch.main_queue_base != nullptr) ReleaseQueueMainScratch(scratch);
+  });
 
   if (scratch.main_size != 0) {
     AcquireQueueMainScratch(scratch);
-- 
2.52.0


From 9c1746cd76a703e4d2321dc2ffe85fc61bfd2f21 Mon Sep 17 00:00:00 2001
From: Luna Nova <git@lunnova.dev>
Date: Tue, 20 Jan 2026 13:00:32 -0800
Subject: [PATCH 2/2] rocm-runtime: log for errors in QueueCreate

---
 .../runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp   | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp b/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp
index 83db40dacc..ae68732eb5 100644
--- a/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp
+++ b/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp
@@ -1799,6 +1799,9 @@ hsa_status_t GpuAgent::QueueCreate(size_t size, hsa_queue_type32_t queue_type, u
   if (scratch.main_size != 0) {
     AcquireQueueMainScratch(scratch);
     if (scratch.main_queue_base == nullptr) {
+      LogPrint(HSA_AMD_LOG_FLAG_INFO,
+               "Failed to allocate scratch memory for queue, size=%zu, node=%u",
+               scratch.main_size, node_id());
       return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
     }
   }
@@ -1827,7 +1830,11 @@ hsa_status_t GpuAgent::QueueCreate(size_t size, hsa_queue_type32_t queue_type, u
             node_id()));
   }
 
-  if (!shared_queue) return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
+  if (!shared_queue) {
+    LogPrint(HSA_AMD_LOG_FLAG_INFO,
+             "Failed to allocate shared queue descriptor memory, node=%u", node_id());
+    return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
+  }
 
   auto aql_queue = new AqlQueue(shared_queue, this, size, node_id(), scratch, event_callback, data,
                                 flags);
-- 
2.52.0