intel · sophimao · Oct 25, 2022 · Oct 27, 2022 · Oct 28, 2022 · Oct 28, 2022
diff --git a/include/acl_globals.h b/include/acl_globals.h
@@ -22,6 +22,8 @@ int acl_present_board_is_valid(void);
 // Can't use ACL after this.
 // Undoes acl_init().
 void acl_reset(void);
+// Unit-test-only variant of acl_reset() that also joins the update thread.
+void acl_reset_join_thread(void);
 
 // Initializes the HAL and loads the builtin system definition.
 //

diff --git a/include/acl_types.h b/include/acl_types.h
@@ -1636,6 +1636,9 @@ typedef struct _cl_platform_id
   // The device operation queue.
   // These are the operations that can run immediately on the device.
   acl_device_op_queue_t device_op_queue;
+  // Thread used to update device_op_queue when a kernel interrupt triggers
+  acl_thread_t device_op_queue_update_thread;
+  bool outstanding_interrupt;
 
   // Limits. See clGetDeviceInfo for semantics.
   unsigned int max_param_size;

diff --git a/lib/acl_threadsupport/include/acl_threadsupport/acl_threadsupport.h b/lib/acl_threadsupport/include/acl_threadsupport/acl_threadsupport.h
@@ -189,7 +189,7 @@ int acl_sem_destroy(acl_sem_t *sem);
 // See this Microsoft Research paper on how to implement condition
 // variables with only semaphores
 //    http://research.microsoft.com/pubs/64242/implementingcvs.pdf
-// It's veyr instructive, but we can't use its implementation because:
+// It's very instructive, but we can't use its implementation because:
 //    - The signaler acquires a mutex
 //    - It keeps an explicit linked list of waiters
 //

diff --git a/src/acl_globals.cpp b/src/acl_globals.cpp
@@ -231,6 +231,35 @@ void acl_reset(void) {
   acl_platform.initialized = 0;
 }
 
+// Unit-test-only reset: clears platform state, then joins the update thread.
+void acl_reset_join_thread(void) {
+  {
+    std::scoped_lock lock{acl_mutex_wrapper};
+
+    l_reset_present_board();
+
+    acl_platform.offline_device = "";
+    acl_platform.num_devices = 0;
+    for (unsigned i = 0; i < ACL_MAX_DEVICE; ++i) {
+      acl_platform.device[i] = _cl_device_id();
+    }
+    acl_platform.initialized = 0; // update thread exits once it sees this
+    acl_signal_device_update();
+  }
+  // Unit test groups run sequentially; acl_init and acl_reset are each called
+  // once per group, at setup and teardown respectively. Since acl_init is not
+  // called again until this reset has finished, it is safe to block here
+  // until the device op queue update thread has exited.
+
+  // The join must happen without holding the acl global lock: if this
+  // function waited while holding it, the device op queue update thread could
+  // never acquire the lock to observe the reset, deadlocking the unit test.
+  if (acl_platform.device_op_queue_update_thread != 0) {
+    acl_thread_join(&acl_platform.device_op_queue_update_thread);
+    acl_platform.device_op_queue_update_thread = 0;
+  }
+}
+
 ////////////////////////////////////////////////////
 // Static functions
 

diff --git a/src/acl_kernel.cpp b/src/acl_kernel.cpp
@@ -3176,6 +3176,10 @@ void acl_receive_kernel_update(int activation_id, cl_int status) {
   std::unique_lock lock{acl_mutex_wrapper, std::defer_lock};
   if (!acl_is_inside_sig()) {
     lock.lock();
+  } else {
+    // Let the device op queue update thread know there is an interrupt from
+    // the kernel interrupt signal handler
+    acl_platform.outstanding_interrupt = 1;
   }
 
   if (activation_id >= 0 && activation_id < doq->max_ops) {

diff --git a/src/acl_platform.cpp b/src/acl_platform.cpp
@@ -78,6 +78,7 @@ static void l_initialize_devices(const acl_system_def_t *present_board_def,
                                  int offline_mode, unsigned int num_devices,
                                  const cl_device_id *devices);
 static void l_add_device(int idx);
+void *l_eagerly_update_device_op_queue(void *arg);
 
 //////////////////////////////
 // OpenCL API
@@ -412,6 +413,10 @@ void acl_init_platform(void) {
 
   // Device operation queue.
   acl_init_device_op_queue(&acl_platform.device_op_queue);
+  // Send off device_op_queue update thread
+  acl_platform.outstanding_interrupt = 0;
+  acl_thread_create(&acl_platform.device_op_queue_update_thread, 0,
+                    l_eagerly_update_device_op_queue, NULL);
 
   // Initialize sampler allocator.
   for (int i = 0; i < ACL_MAX_SAMPLER; i++) {
@@ -737,6 +742,25 @@ static void l_add_device(int idx) {
   device->address_bits = 64;              // Yes, our devices are 64-bit.
 }
 
+void *l_eagerly_update_device_op_queue(void *arg) { // thread entry; arg unused
+  while (true) {
+    std::scoped_lock lock{acl_mutex_wrapper}; // re-acquired every iteration
+
+    // Sleep until a device update is signalled; no context is associated
+    acl_wait_for_device_update(NULL);
+
+    if (!acl_platform.initialized) { // initialized was cleared: stop the thread
+      break;
+    }
+    if (acl_platform.outstanding_interrupt) { // set in acl_receive_kernel_update
+      acl_print_debug_msg("Serving outstanding kernel interrupt...\n");
+      acl_update_device_op_queue(&(acl_platform.device_op_queue));
+      acl_platform.outstanding_interrupt = 0; // request has been served
+    }
+  }
+  return NULL;
+}
+
 // These functions check to see if a given object is known to the system.
 // acl_*_is_valid( * );
 // This is simple because everything is statically allocated.

diff --git a/src/acl_thread.cpp b/src/acl_thread.cpp
@@ -9,6 +9,7 @@
 #include <acl_context.h>
 #include <acl_hal.h>
 #include <acl_thread.h>
+#include <acl_util.h>
 
 ACL_TLS int acl_global_lock_count = 0;
 ACL_TLS int acl_inside_sig_flag = 0;
@@ -55,7 +56,7 @@ void acl_mutex_wrapper_t::resume_lock(int lock_count) {
 
 void acl_wait_for_device_update(cl_context context) {
   acl_assert_locked();
-  if (acl_get_hal()->get_debug_verbosity &&
+  if (acl_context_is_valid(context) && acl_get_hal()->get_debug_verbosity &&
       acl_get_hal()->get_debug_verbosity() > 0) {
     unsigned timeout = 5; // Seconds
     // Keep waiting until signal is received
@@ -102,6 +103,14 @@ __attribute__((constructor)) static void l_global_lock_init() {
 }
 
 __attribute__((destructor)) static void l_global_lock_uninit() {
+  if (acl_get_platform()->device_op_queue_update_thread) {
+    {
+      std::scoped_lock lock{acl_mutex_wrapper};
+      acl_get_platform()->initialized = 0; // update thread exits on seeing 0
+      acl_signal_condvar(&l_acl_global_condvar); // wake up the waiting thread
+    } // release the lock before joining so the thread can acquire it
+    acl_thread_join(&acl_get_platform()->device_op_queue_update_thread);
+  }
   acl_reset_condvar(&l_acl_global_condvar);
 }
 

diff --git a/test/acl_command_queue_test.cpp b/test/acl_command_queue_test.cpp
@@ -40,7 +40,7 @@ MT_TEST_GROUP(acl_command_queue) {
     syncThreads();
 
     if (threadNum() == 0) {
-      ACL_LOCKED(acl_test_teardown_generic_system());
+      acl_test_teardown_generic_system();
     }
 
     acl_test_run_standard_teardown_checks();

diff --git a/test/acl_context_test.cpp b/test/acl_context_test.cpp
@@ -54,7 +54,7 @@ MT_TEST_GROUP(Context) {
     syncThreads();
 
     if (threadNum() == 0) {
-      ACL_LOCKED(acl_test_teardown_generic_system());
+      acl_test_teardown_generic_system();
     }
     acl_test_run_standard_teardown_checks();
   }

diff --git a/test/acl_device_op_test.cpp b/test/acl_device_op_test.cpp
@@ -151,8 +151,8 @@ TEST_GROUP(device_op) {
 
   virtual void teardown() {
     unload();
-    acl_test_teardown_generic_system();
     acl_mutex_wrapper.unlock();
+    acl_test_teardown_generic_system();
     acl_test_run_standard_teardown_checks();
   }
 

diff --git a/test/acl_device_test.cpp b/test/acl_device_test.cpp
@@ -29,7 +29,7 @@ syncThreads();
 void teardown() {
   syncThreads();
   if (threadNum() == 0) {
-    ACL_LOCKED(acl_test_teardown_generic_system());
+    acl_test_teardown_generic_system();
   }
   acl_test_run_standard_teardown_checks();
 }

diff --git a/test/acl_event_test.cpp b/test/acl_event_test.cpp
@@ -44,7 +44,7 @@ MT_TEST_GROUP(acl_event) {
     syncThreads();
 
     if (threadNum() == 0) {
-      ACL_LOCKED(acl_test_teardown_generic_system());
+      acl_test_teardown_generic_system();
     }
 
     acl_test_run_standard_teardown_checks();
@@ -129,7 +129,7 @@ MT_TEST_GROUP(acl_event_default_config) {
 
     syncThreads();
     if (threadNum() == 0) {
-      ACL_LOCKED(acl_test_teardown_generic_system());
+      acl_test_teardown_generic_system();
     }
 
     acl_test_run_standard_teardown_checks();

diff --git a/test/acl_globals_test.cpp b/test/acl_globals_test.cpp
@@ -716,7 +716,9 @@ TEST(acl_globals_undef, valid_init_simple) {
   CHECK(0 != acl_present_board_def());
   CHECK(0 != acl_present_board_is_valid());
   // Teardown
-  acl_reset();
+  acl_mutex_wrapper.unlock();
+  acl_reset_join_thread();
+  acl_mutex_wrapper.lock();
   CHECK(0 == acl_present_board_def());
   CHECK(0 == acl_present_board_is_valid());
 }
@@ -726,7 +728,9 @@ TEST(acl_globals_undef, valid_init_empty) {
   CHECK(0 != acl_present_board_def());
   CHECK(0 != acl_present_board_is_valid());
   // Teardown
-  acl_reset();
+  acl_mutex_wrapper.unlock();
+  acl_reset_join_thread();
+  acl_mutex_wrapper.lock();
   CHECK(0 == acl_present_board_def());
   CHECK(0 == acl_present_board_is_valid());
 }
@@ -735,6 +739,8 @@ TEST(acl_globals_undef, valid_init_complex) {
   CHECK_EQUAL(1, acl_init(&acltest_complex_system));
   CHECK(0 != acl_present_board_def());
   // Teardown
-  acl_reset();
+  acl_mutex_wrapper.unlock();
+  acl_reset_join_thread();
+  acl_mutex_wrapper.lock();
   CHECK_EQUAL(0, acl_present_board_def());
 }
diff --git a/test/acl_platform_test.cpp b/test/acl_platform_test.cpp
@@ -378,7 +378,7 @@ TEST(offline_device, offline_hal) {
   cl_bool result;
 
   acl_test_setenv(m_env, offline_device);
-  ACL_LOCKED(acl_reset());
+  acl_reset_join_thread();
   ACL_LOCKED(result = acl_init_from_hal_discovery());
   CHECK_EQUAL(CL_TRUE, result);
   // Exercise the offline HAL: printing, and the timestamps.
@@ -390,6 +390,11 @@ TEST(offline_device, offline_hal) {
   ACL_LOCKED(now = acl_get_hal()->get_timestamp());
   ACL_LOCKED(acl_print_debug_msg("offline hal time is %08x%08x", (now >> 32),
                                  (now & 0xffffffff)));
+
+  syncThreads();
+  if (threadNum() == 0) {
+    acl_test_teardown_system();
+  }
 }
 
 struct live_info_t {
@@ -435,7 +440,7 @@ MT_TEST_GROUP(track_object) {
     syncThreads();
     if (threadNum() == 0) {
       acl_test_unsetenv(m_offline_env);
-      ACL_LOCKED(acl_reset());
+      acl_reset_join_thread();
     }
     acl_test_run_standard_teardown_checks();
   }

diff --git a/test/acl_support_test.cpp b/test/acl_support_test.cpp
@@ -27,8 +27,8 @@ TEST_GROUP(support){void setup(){acl_mutex_wrapper.lock();
 acl_test_setup_generic_system();
 }
 void teardown() {
-  acl_test_teardown_generic_system();
   acl_mutex_wrapper.unlock();
+  acl_test_teardown_generic_system();
   acl_test_run_standard_teardown_checks();
 }
 

diff --git a/test/acl_test.cpp b/test/acl_test.cpp
@@ -117,8 +117,8 @@ void acl_test_teardown_sample_default_board_system(void) {
 
 void acl_test_teardown_generic_system(void) { acl_test_teardown_system(); }
 void acl_test_teardown_system(void) {
+  acl_reset_join_thread();
   acl_mutex_wrapper.lock();
-  acl_reset();
   acl_reset_hal();
   acltest_hal_teardown();
   acl_mutex_wrapper.unlock();
@@ -358,7 +358,7 @@ static void l_load_example_binary(void) {
     acl_test_setenv(envvar_program_lib, program_lib_old_value);
   }
 
-  ACL_LOCKED(acl_test_teardown_generic_system());
+  acl_test_teardown_generic_system();
 }
 
 // Return a context properties array that specifies preloaded binary only.