Skip to content

Commit

Permalink
[Bugfix] HDFS reading csv kernel
Browse files Browse the repository at this point in the history
- Distributed workers could start reading from the wrong row within a CSV
file if different file segments contained different numbers of rows.
- Fixed by correctly using the offset (the starting row within the segment)
returned by the utility function (HDFSUtils::findSegmentAndOffset).
  • Loading branch information
aristotelis96 committed Oct 27, 2024
1 parent 84981ac commit 7402dbe
Showing 1 changed file with 4 additions and 3 deletions.
7 changes: 4 additions & 3 deletions src/runtime/local/io/HDFS/ReadHDFSCsv.h
Original file line number Diff line number Diff line change
Expand Up @@ -76,8 +76,8 @@ template <typename VT> struct ReadHDFSCsv<DenseMatrix<VT>> {
std::cerr << "Error connecting to HDFS" << std::endl;
}

[[maybe_unused]] auto [startSegment, dummy] =
HDFSUtils::findSegmendAndOffset(*fs, 0, startRow, hdfsDir, numCols * sizeof(VT));
[[maybe_unused]] auto [startSegment, startRowWithinSegment] =
HDFSUtils::findSegmendAndOffset(*fs, 0, startRow, hdfsDir, 1);
// TODO verify file exists

size_t parsedRows = 0;
Expand Down Expand Up @@ -124,8 +124,9 @@ template <typename VT> struct ReadHDFSCsv<DenseMatrix<VT>> {
}
} while (cur == nullptr);
// If first segment, skip rows
if (parsedRows == 0 && startRow > (segment - 2) * segFmd.numRows + r)
if (parsedRows == 0 && startRowWithinSegment > r) {
continue;
}

size_t pos = 0;
for (size_t c = 0; c < numCols; c++) {
Expand Down

0 comments on commit 7402dbe

Please sign in to comment.