intel · sophimao · Oct 25, 2022 · Oct 27, 2022 · Oct 28, 2022 · Oct 28, 2022
diff --git a/include/acl_types.h b/include/acl_types.h
@@ -1636,6 +1636,9 @@ typedef struct _cl_platform_id
   // The device operation queue.
   // These are the operations that can run immediately on the device.
   acl_device_op_queue_t device_op_queue;
+  // Thread used to update device_op_queue when kernel interrupt triggers
+  acl_thread_t device_op_queue_update_thread;
+  bool outstanding_interrupt;
 
   // Limits. See clGetDeviceInfo for semantics.
   unsigned int max_param_size;

diff --git a/lib/acl_threadsupport/include/acl_threadsupport/acl_threadsupport.h b/lib/acl_threadsupport/include/acl_threadsupport/acl_threadsupport.h
@@ -189,7 +189,7 @@ int acl_sem_destroy(acl_sem_t *sem);
 // See this Microsoft Research paper on how to implement condition
 // variables with only semaphores
 //    http://research.microsoft.com/pubs/64242/implementingcvs.pdf
-// It's veyr instructive, but we can't use its implementation because:
+// It's very instructive, but we can't use its implementation because:
 //    - The signaler acquires a mutex
 //    - It keeps an explicit linked list of waiters
 //

diff --git a/src/acl_kernel.cpp b/src/acl_kernel.cpp
@@ -3176,6 +3176,10 @@ void acl_receive_kernel_update(int activation_id, cl_int status) {
   std::unique_lock lock{acl_mutex_wrapper, std::defer_lock};
   if (!acl_is_inside_sig()) {
     lock.lock();
+  } else {
+    // Let the device op queue update thread know there is an interrupt from
+    // the kernel interrupt signal handler
+    acl_platform.outstanding_interrupt = 1;
   }
 
   if (activation_id >= 0 && activation_id < doq->max_ops) {

diff --git a/src/acl_platform.cpp b/src/acl_platform.cpp
@@ -78,6 +78,7 @@ static void l_initialize_devices(const acl_system_def_t *present_board_def,
                                  int offline_mode, unsigned int num_devices,
                                  const cl_device_id *devices);
 static void l_add_device(int idx);
+static void *l_eagerly_update_device_op_queue(void *arg);
 
 //////////////////////////////
 // OpenCL API
@@ -412,6 +413,10 @@ void acl_init_platform(void) {
 
   // Device operation queue.
   acl_init_device_op_queue(&acl_platform.device_op_queue);
+  // Send off device_op_queue update thread
+  acl_platform.outstanding_interrupt = 0;
+  acl_thread_create(&acl_platform.device_op_queue_update_thread, 0,
+                    l_eagerly_update_device_op_queue, NULL);
 
   // Initialize sampler allocator.
   for (int i = 0; i < ACL_MAX_SAMPLER; i++) {
@@ -737,6 +742,44 @@ static void l_add_device(int idx) {
   device->address_bits = 64;              // Yes, our devices are 64-bit.
 }
 
+// Entry point of the device op queue update thread started by
+// acl_init_platform(). Each pass locks acl_mutex_wrapper, sleeps in
+// acl_wait_for_device_update() until woken, and then runs
+// acl_update_device_op_queue() if the kernel interrupt signal handler has
+// flagged an outstanding interrupt. The loop ends once
+// acl_platform.initialized reads as false after a wake-up.
+//
+// arg is unused (the thread is created with a NULL argument). Always
+// returns a null pointer.
+//
+// NOTE(review): `initialized` is only tested after the wait returns, so a
+// shutdown request signalled while this thread is not blocked in the wait
+// looks like it could be missed -- confirm the condvar retains the signal.
+// NOTE(review): `outstanding_interrupt` is a plain bool that the signal
+// handler writes without holding the lock; presumably it should be a
+// `volatile sig_atomic_t` or `std::atomic<bool>` -- TODO confirm.
+void *l_eagerly_update_device_op_queue([[maybe_unused]] void *arg) {
+  while (true) {
+    std::scoped_lock lock{acl_mutex_wrapper};
+
+    // Sleep until a device update is signalled.
+    acl_wait_for_device_update(nullptr);
+
+    if (!acl_platform.initialized) {
+      break;
+    }
+    if (acl_platform.outstanding_interrupt) {
+      acl_print_debug_msg("Serving outstanding kernel interrupt...\n");
+      // Clear the flag before servicing the queue so that an interrupt raised
+      // while the update is running stays recorded for the next pass instead
+      // of being wiped out afterwards.
+      acl_platform.outstanding_interrupt = false;
+      acl_update_device_op_queue(&(acl_platform.device_op_queue));
+    }
+  }
+  return nullptr;
+}
+
 // These functions check to see if a given object is known to the system.
 // acl_*_is_valid( * );
 // This is simple because everything is statically allocated.

diff --git a/src/acl_thread.cpp b/src/acl_thread.cpp
@@ -9,6 +9,7 @@
 #include <acl_context.h>
 #include <acl_hal.h>
 #include <acl_thread.h>
+#include <acl_util.h>
 
 ACL_TLS int acl_global_lock_count = 0;
 ACL_TLS int acl_inside_sig_flag = 0;
@@ -55,7 +56,7 @@ void acl_mutex_wrapper_t::resume_lock(int lock_count) {
 
 void acl_wait_for_device_update(cl_context context) {
   acl_assert_locked();
-  if (acl_get_hal()->get_debug_verbosity &&
+  if (acl_context_is_valid(context) && acl_get_hal()->get_debug_verbosity &&
       acl_get_hal()->get_debug_verbosity() > 0) {
     unsigned timeout = 5; // Seconds
     // Keep waiting until signal is received
@@ -102,6 +103,19 @@ __attribute__((constructor)) static void l_global_lock_init() {
 }
 
 __attribute__((destructor)) static void l_global_lock_uninit() {
+  // Tell the device op queue update thread to exit: clear `initialized` and
+  // wake up the waiting thread while holding the lock. The thread's loop
+  // breaks once it sees `initialized` cleared.
+  std::unique_lock shutdown_guard{acl_mutex_wrapper};
+  acl_get_platform()->initialized = 0;
+  acl_signal_condvar(&l_acl_global_condvar);
+  shutdown_guard.unlock();
+
+  // Join with the lock released, then reset the condvar.
+  // NOTE(review): the join is unconditional; presumably the thread handle is
+  // only valid after acl_init_platform() has run -- confirm this is safe when
+  // the platform was never initialized.
+  acl_thread_join(&acl_get_platform()->device_op_queue_update_thread);
   acl_reset_condvar(&l_acl_global_condvar);
 }