Fix row conversion test #1577

Merged

merged 3 commits on Nov 21, 2023
12 changes: 9 additions & 3 deletions src/main/cpp/src/row_conversion.cu
@@ -81,6 +81,7 @@ constexpr auto NUM_WARPS_IN_BLOCK = 32;
#pragma nv_diag_suppress static_var_with_dynamic_init

using namespace cudf;
+using detail::make_device_uvector_sync;
using detail::make_device_uvector_async;
using rmm::device_uvector;

@@ -231,7 +232,7 @@ build_string_row_offsets(table_view const& tbl,
offsets_iter + tbl.num_columns(),
std::back_inserter(offsets_iterators),
[](auto const& offset_ptr) { return offset_ptr != nullptr; });
-  return make_device_uvector_async(
+  return make_device_uvector_sync(
offsets_iterators, stream, rmm::mr::get_current_device_resource());
}();

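For context on the async-to-sync swap in the hunk above (and the matching `using detail::make_device_uvector_sync;` added earlier): `make_device_uvector_async` only enqueues the host-to-device copy on the stream, while `make_device_uvector_sync` also waits for it to complete. Since `offsets_iterators` appears to be a short-lived host vector local to the enclosing lambda, the synchronous variant guarantees the copy finishes before that vector is destroyed. A minimal sketch of the hazard, not taken from this PR; the function names are illustrative:

// Sketch only, assuming the libcudf detail factories: the async variant
// enqueues the H2D copy without waiting, the sync variant also synchronizes.
#include <cudf/detail/utilities/vector_factories.hpp>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/mr/device/per_device_resource.hpp>
#include <vector>

rmm::device_uvector<int> risky(rmm::cuda_stream_view stream)
{
  std::vector<int> host{1, 2, 3};
  // The copy is merely enqueued on `stream`; `host` is destroyed when this
  // function returns, so the device may read from a freed host buffer.
  return cudf::detail::make_device_uvector_async(
    host, stream, rmm::mr::get_current_device_resource());
}

rmm::device_uvector<int> safe(rmm::cuda_stream_view stream)
{
  std::vector<int> host{1, 2, 3};
  // Synchronizes the stream before returning, so `host` outlives the copy.
  return cudf::detail::make_device_uvector_sync(
    host, stream, rmm::mr::get_current_device_resource());
}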
@@ -1556,8 +1557,13 @@ batch_data build_batches(size_type num_rows,
batch_row_boundaries.push_back(0);
size_type last_row_end = 0;
device_uvector<uint64_t> cumulative_row_sizes(num_rows, stream);
-  thrust::inclusive_scan(
-    rmm::exec_policy(stream), row_sizes, row_sizes + num_rows, cumulative_row_sizes.begin());
+
+  // Evaluate the row size values before calling `inclusive_scan` to work around
+  // the memory issue in https://github.com/NVIDIA/spark-rapids-jni/issues/1567.
+  thrust::copy(rmm::exec_policy(stream), row_sizes, row_sizes + num_rows,
+               cumulative_row_sizes.begin());
+  thrust::inclusive_scan(rmm::exec_policy(stream), cumulative_row_sizes.begin(),
+                         cumulative_row_sizes.end(), cumulative_row_sizes.begin());

// This needs to be split into 2 gig batches. Care must be taken to avoid a batch larger than
// 2 gigs. Imagine a table with 900 meg rows. The batches should occur every 2 rows, but if a
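The new comment captures the workaround pattern: `row_sizes` is first materialized into `cumulative_row_sizes` with `thrust::copy`, and the prefix sum then runs in place over plain device memory rather than re-reading the original iterator. A self-contained sketch of this copy-then-scan pattern, using a counting iterator as a stand-in for the real `row_sizes` input:

// Sketch of the copy-then-scan workaround: evaluate the input iterator once
// into owned device memory, then run the inclusive scan in place.
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/exec_policy.hpp>
#include <thrust/copy.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/scan.h>
#include <cstdint>

rmm::device_uvector<uint64_t> cumulative_sizes(int num_rows,
                                               rmm::cuda_stream_view stream)
{
  // Stand-in for the PR's `row_sizes` device iterator.
  auto row_sizes = thrust::make_counting_iterator<uint64_t>(1);

  rmm::device_uvector<uint64_t> cumulative(num_rows, stream);

  // Step 1: materialize each row size exactly once.
  thrust::copy(rmm::exec_policy(stream), row_sizes, row_sizes + num_rows,
               cumulative.begin());
  // Step 2: in-place inclusive scan over the materialized values.
  thrust::inclusive_scan(rmm::exec_policy(stream), cumulative.begin(),
                         cumulative.end(), cumulative.begin());
  return cumulative;
}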