From 7402dbe4c2342c7d96a776d4188475a0f4c96abd Mon Sep 17 00:00:00 2001 From: Aristotelis Vontzalidis Date: Sun, 27 Oct 2024 20:13:26 +0000 Subject: [PATCH] [Bugfix] HDFS reading csv kernel - Distributed workers could start reading from the wrong row within a CSV file if different file segments contained different numbers of rows. - Fixed by correctly using the offset (starting row) provided by the utility function (HDFSUtils::findSegmendAndOffset). --- src/runtime/local/io/HDFS/ReadHDFSCsv.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/runtime/local/io/HDFS/ReadHDFSCsv.h b/src/runtime/local/io/HDFS/ReadHDFSCsv.h index 9787ed269..3f89d7147 100644 --- a/src/runtime/local/io/HDFS/ReadHDFSCsv.h +++ b/src/runtime/local/io/HDFS/ReadHDFSCsv.h @@ -76,8 +76,8 @@ template struct ReadHDFSCsv> { std::cerr << "Error connecting to HDFS" << std::endl; } - [[maybe_unused]] auto [startSegment, dummy] = - HDFSUtils::findSegmendAndOffset(*fs, 0, startRow, hdfsDir, numCols * sizeof(VT)); + [[maybe_unused]] auto [startSegment, startRowWithinSegment] = + HDFSUtils::findSegmendAndOffset(*fs, 0, startRow, hdfsDir, 1); // TODO verify file exists size_t parsedRows = 0; @@ -124,8 +124,9 @@ template struct ReadHDFSCsv> { } } while (cur == nullptr); // If first segment, skip rows - if (parsedRows == 0 && startRow > (segment - 2) * segFmd.numRows + r) + if (parsedRows == 0 && startRowWithinSegment > r) { continue; + } size_t pos = 0; for (size_t c = 0; c < numCols; c++) {