Skip to content

Commit

Permalink
[DAPHNE-#834] support string value type in frame (#860)
Browse files Browse the repository at this point in the history
This commit closes #834 by adding support for reading CSV files containing string columns into frames.

Changes:
- General support for string columns in Frames at kernel level:
  - String values can now be used as column value types within Frames, enhancing type support.
- Support for reading string columns in Frames at kernel level:
  - The system now supports reading string values into Frames at the kernel level, ensuring better handling of mixed data types.
- Support for reading string columns from CSV files into Frames in DaphneDSL:
  - When reading a CSV file with string columns, the value type of each string column must be set to "str" in the CSV file's accompanying .meta file so that the column is recognized as a string column.

Testing: New test cases have been added to validate reading CSV files containing string columns into Frames (at both kernel and DaphneDSL level) and constructing Frames with string columns.
  • Loading branch information
saminbassiri authored Oct 22, 2024
1 parent 9f27ac3 commit 649b2da
Show file tree
Hide file tree
Showing 12 changed files with 152 additions and 11 deletions.
21 changes: 17 additions & 4 deletions src/runtime/local/datastructures/Frame.h
Original file line number Diff line number Diff line change
Expand Up @@ -140,10 +140,22 @@ class Frame : public Structure {
this->schema[i] = schema[i];
this->labels[i] = labels ? labels[i] : getDefaultLabel(i);
const size_t sizeAlloc = maxNumRows * ValueTypeUtils::sizeOf(schema[i]);
this->columns[i] =
std::shared_ptr<ColByteType>(new ColByteType[sizeAlloc], std::default_delete<ColByteType[]>());
if (zero)
memset(this->columns[i].get(), 0, sizeAlloc);

if (this->schema[i] == ValueTypeCode::STR) {
// If this is a string column, we must make sure that the column array contains only valid std::string
// objects.
this->columns[i] =
std::shared_ptr<ColByteType>(reinterpret_cast<ColByteType *>(new std::string[maxNumRows]),
[](ColByteType *p) { delete[] reinterpret_cast<std::string *>(p); });
if (zero)
std::fill(reinterpret_cast<std::string *>(this->columns[i].get()),
reinterpret_cast<std::string *>(this->columns[i].get()) + maxNumRows, std::string(""));
} else {
this->columns[i] =
std::shared_ptr<ColByteType>(new ColByteType[sizeAlloc], std::default_delete<ColByteType[]>());
if (zero)
memset(this->columns[i].get(), 0, sizeAlloc);
}
}
initLabels2Idxs();
}
Expand Down Expand Up @@ -226,6 +238,7 @@ class Frame : public Structure {
found = found || tryValueType<uint64_t>(colMat, schema + c, columns + c);
found = found || tryValueType<float>(colMat, schema + c, columns + c);
found = found || tryValueType<double>(colMat, schema + c, columns + c);
found = found || tryValueType<std::string>(colMat, schema + c, columns + c);
if (!found)
throw std::runtime_error("unsupported value type");
}
Expand Down
7 changes: 7 additions & 0 deletions src/runtime/local/datastructures/ValueTypeUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@ size_t ValueTypeUtils::sizeOf(ValueTypeCode type) {
return sizeof(double);
case ValueTypeCode::STR:
return sizeof(std::string);
case ValueTypeCode::FIXEDSTR16:
return sizeof(FixedStr16);
default:
throw std::runtime_error("ValueTypeUtils::sizeOf: unknown value type code");
}
Expand Down Expand Up @@ -77,6 +79,9 @@ void ValueTypeUtils::printValue(std::ostream &os, ValueTypeCode type, const void
case ValueTypeCode::F64:
os << reinterpret_cast<const double *>(array)[pos];
break;
case ValueTypeCode::STR:
os << reinterpret_cast<const std::string *>(array)[pos];
break;
default:
throw std::runtime_error("ValueTypeUtils::printValue: unknown value type code");
}
Expand Down Expand Up @@ -146,6 +151,8 @@ const std::string ValueTypeUtils::cppNameForCode(ValueTypeCode type) {
return cppNameFor<float>;
case ValueTypeCode::F64:
return cppNameFor<double>;
case ValueTypeCode::STR:
return cppNameFor<std::string>;
default:
throw std::runtime_error("ValueTypeUtils::cppNameForCode: unknown value type code");
}
Expand Down
12 changes: 12 additions & 0 deletions src/runtime/local/io/ReadCsvFile.h
Original file line number Diff line number Diff line change
Expand Up @@ -365,6 +365,18 @@ template <> struct ReadCsvFile<Frame> {
convertCstr(file->line + pos, &val_f64);
reinterpret_cast<double *>(rawCols[col])[row] = val_f64;
break;
case ValueTypeCode::STR: {
std::string val_str = "";
pos = setCString(file, pos, &val_str, delim);
reinterpret_cast<std::string *>(rawCols[col])[row] = val_str;
break;
}
case ValueTypeCode::FIXEDSTR16: {
std::string val_str = "";
pos = setCString(file, pos, &val_str, delim);
reinterpret_cast<FixedStr16 *>(rawCols[col])[row] = FixedStr16(val_str);
break;
}
default:
throw std::runtime_error("ReadCsvFile::apply: unknown value type code");
}
Expand Down
4 changes: 4 additions & 0 deletions test/api/cli/io/ReadCsv3.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
1,-1,""
2,-2,
3,-3,"multi-line,"
4,-4,simple string
18 changes: 18 additions & 0 deletions test/api/cli/io/ReadCsv3.csv.meta
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
{
"numRows": 4,
"numCols": 3,
"schema": [
{
"label": "a",
"valueType": "si8"
},
{
"label": "b",
"valueType": "ui8"
},
{
"label": "c",
"valueType": "str"
}
]
}
4 changes: 4 additions & 0 deletions test/api/cli/io/ReadTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,10 @@ TEST_CASE("readFrameFromCSV", TAG_IO) {
compareDaphneToRef(dirPath + "testReadFrame.txt", dirPath + "testReadFrame.daphne");
}

// Runs the DaphneDSL script that reads a CSV file with a string column into a
// frame and prints it, then compares the output against the reference file.
TEST_CASE("readStringValuesIntoFrameFromCSV", TAG_IO) {
    const auto refFilePath = dirPath + "testReadStringIntoFrame.txt";
    const auto scriptFilePath = dirPath + "testReadStringIntoFrame.daphne";
    compareDaphneToRef(refFilePath, scriptFilePath);
}

// Runs the matrix-reading DaphneDSL script and compares its output against
// the reference file.
TEST_CASE("readMatrixFromCSV", TAG_IO) {
    const auto refFilePath = dirPath + "testReadMatrix.txt";
    const auto scriptFilePath = dirPath + "testReadMatrix.daphne";
    compareDaphneToRef(refFilePath, scriptFilePath);
}
Expand Down
3 changes: 3 additions & 0 deletions test/api/cli/io/testReadStringIntoFrame.daphne
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
// Test reading frame with string columns.

print(readFrame("test/api/cli/io/ReadCsv3.csv"));
5 changes: 5 additions & 0 deletions test/api/cli/io/testReadStringIntoFrame.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
Frame(4x3, [a:int8_t, b:uint8_t, c:std::string])
1 255
2 254
3 253 multi-line,
4 252 simple string
33 changes: 26 additions & 7 deletions test/runtime/local/datastructures/FrameTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,32 +34,35 @@ TEST_CASE("Frame allocates enough space", TAG_DATASTRUCTURES) {
// crashing.

const size_t numRows = 10000;
const ValueTypeCode schema[] = {ValueTypeCode::SI8, ValueTypeCode::UI32, ValueTypeCode::F64};
const ValueTypeCode schema[] = {ValueTypeCode::SI8, ValueTypeCode::UI32, ValueTypeCode::F64, ValueTypeCode::STR};
const size_t numCols = sizeof(schema) / sizeof(ValueTypeCode);

Frame *f = DataObjectFactory::create<Frame>(numRows, numCols, schema, nullptr, false);

int8_t *col0 = f->getColumn<int8_t>(0)->getValues();
uint32_t *col1 = f->getColumn<uint32_t>(1)->getValues();
double *col2 = f->getColumn<double>(2)->getValues();
std::string *col3 = f->getColumn<std::string>(3)->getValues();

// Fill the column arrays with ones of the respective value type.
for (size_t i = 0; i < numRows; i++) {
col0[i] = int8_t(1);
col1[i] = uint32_t(1);
col2[i] = double(1);
col3[i] = std::string("a");
}

DataObjectFactory::destroy(f);
}

TEST_CASE("Frame sub-frame works properly", TAG_DATASTRUCTURES) {
const size_t numRowsOrig = 10;
const ValueTypeCode schemaOrig[] = {ValueTypeCode::SI8, ValueTypeCode::UI32, ValueTypeCode::F64};
const ValueTypeCode schemaOrig[] = {ValueTypeCode::SI8, ValueTypeCode::UI32, ValueTypeCode::F64,
ValueTypeCode::STR};
const size_t numColsOrig = sizeof(schemaOrig) / sizeof(ValueTypeCode);

Frame *fOrig = DataObjectFactory::create<Frame>(numRowsOrig, numColsOrig, schemaOrig, nullptr, true);
const size_t colIdxsSub[] = {2, 0};
const size_t colIdxsSub[] = {2, 0, 3};
const size_t numColsSub = sizeof(colIdxsSub) / sizeof(size_t);
Frame *fSub = DataObjectFactory::create<Frame>(fOrig, 3, 5, numColsSub, colIdxsSub);

Expand All @@ -70,18 +73,24 @@ TEST_CASE("Frame sub-frame works properly", TAG_DATASTRUCTURES) {
// Sub-frame schema is as expected.
CHECK(fSub->getColumnType(0) == ValueTypeCode::F64);
CHECK(fSub->getColumnType(1) == ValueTypeCode::SI8);
CHECK(fSub->getColumnType(2) == ValueTypeCode::STR);

// Sub-frame shares data arrays with original.
int8_t *colOrig0 = fOrig->getColumn<int8_t>(0)->getValues();
double *colOrig2 = fOrig->getColumn<double>(2)->getValues();
std::string *colOrig3 = fOrig->getColumn<std::string>(3)->getValues();
double *colSub0 = fSub->getColumn<double>(0)->getValues();
int8_t *colSub1 = fSub->getColumn<int8_t>(1)->getValues();
std::string *colSub2 = fSub->getColumn<std::string>(2)->getValues();
CHECK((colSub0 >= colOrig2 && colSub0 < colOrig2 + numRowsOrig));
CHECK((colSub1 >= colOrig0 && colSub1 < colOrig0 + numRowsOrig));
CHECK((colSub2 >= colOrig3 && colSub2 < colOrig3 + numRowsOrig));
colSub0[0] = double(123);
colSub1[0] = int8_t(456);
colSub2[0] = std::string("abcd");
CHECK(colOrig2[3] == double(123));
CHECK(colOrig0[3] == int8_t(456));
CHECK(colOrig3[3] == std::string("abcd"));

// Freeing both frames does not result in double-free errors.
SECTION("Freeing the original frame first is fine") {
Expand All @@ -98,33 +107,38 @@ TEST_CASE("Frame columns can be accessed by label", TAG_DATASTRUCTURES) {
auto c0 = genGivenVals<DenseMatrix<int64_t>>(3, {-1, -2, -3});
auto c1 = genGivenVals<DenseMatrix<double>>(3, {1.1, 2.2, 3.3});
auto c2 = genGivenVals<DenseMatrix<uint8_t>>(3, {10, 20, 30});
auto c3 = genGivenVals<DenseMatrix<std::string>>(3, {"abcdefj", "", "12345"});

std::vector<Structure *> colMats = {c0, c1, c2};
std::vector<Structure *> colMats = {c0, c1, c2, c3};

SECTION("implit labels") {
auto f = DataObjectFactory::create<Frame>(colMats, nullptr);

auto c0_ = f->getColumn<int64_t>("col_0");
auto c1_ = f->getColumn<double>("col_1");
auto c2_ = f->getColumn<uint8_t>("col_2");
auto c3_ = f->getColumn<std::string>("col_3");

CHECK(*c0_ == *c0);
CHECK(*c1_ == *c1);
CHECK(*c2_ == *c2);
CHECK(*c3_ == *c3);

DataObjectFactory::destroy(f);
}
SECTION("explicit labels") {
const std::string labels[] = {"zero", "one", "two"};
const std::string labels[] = {"zero", "one", "two", "three"};
auto f = DataObjectFactory::create<Frame>(colMats, labels);

auto c0_ = f->getColumn<int64_t>("zero");
auto c1_ = f->getColumn<double>("one");
auto c2_ = f->getColumn<uint8_t>("two");
auto c3_ = f->getColumn<std::string>("three");

CHECK(*c0_ == *c0);
CHECK(*c1_ == *c1);
CHECK(*c2_ == *c2);
CHECK(*c3_ == *c3);

DataObjectFactory::destroy(f);
}
Expand All @@ -149,11 +163,12 @@ TEST_CASE("Frame column labels must be unique", TAG_DATASTRUCTURES) {

TEST_CASE("Frame sub-frame for empty source frame works properly", TAG_DATASTRUCTURES) {
const size_t numRowsOrig = 0;
const ValueTypeCode schemaOrig[] = {ValueTypeCode::SI8, ValueTypeCode::UI32, ValueTypeCode::F64};
const ValueTypeCode schemaOrig[] = {ValueTypeCode::SI8, ValueTypeCode::UI32, ValueTypeCode::F64,
ValueTypeCode::STR};
const size_t numColsOrig = sizeof(schemaOrig) / sizeof(ValueTypeCode);

Frame *fOrig = DataObjectFactory::create<Frame>(numRowsOrig, numColsOrig, schemaOrig, nullptr, true);
const size_t colIdxsSub[] = {2, 0};
const size_t colIdxsSub[] = {2, 0, 3};
const size_t numColsSub = sizeof(colIdxsSub) / sizeof(size_t);
Frame *fSub = DataObjectFactory::create<Frame>(fOrig, 0, 0, numColsSub, colIdxsSub);

Expand All @@ -164,14 +179,18 @@ TEST_CASE("Frame sub-frame for empty source frame works properly", TAG_DATASTRUC
// Sub-frame schema is as expected.
CHECK(fSub->getColumnType(0) == ValueTypeCode::F64);
CHECK(fSub->getColumnType(1) == ValueTypeCode::SI8);
CHECK(fSub->getColumnType(2) == ValueTypeCode::STR);

// Sub-frame shares data arrays with original.
int8_t *colOrig0 = fOrig->getColumn<int8_t>(0)->getValues();
double *colOrig2 = fOrig->getColumn<double>(2)->getValues();
std::string *colOrig3 = fOrig->getColumn<std::string>(3)->getValues();
double *colSub0 = fSub->getColumn<double>(0)->getValues();
int8_t *colSub1 = fSub->getColumn<int8_t>(1)->getValues();
std::string *colSub2 = fSub->getColumn<std::string>(2)->getValues();
CHECK(colSub0 == colOrig2);
CHECK(colSub1 == colOrig0);
CHECK(colSub2 == colOrig3);

// Freeing both frames does not result in double-free errors.
SECTION("Freeing the original frame first is fine") {
Expand Down
4 changes: 4 additions & 0 deletions test/runtime/local/io/ReadCsv5.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
1,-1,""
2,-2,
3,-3,"multi-line,"
4,-4,simple string
18 changes: 18 additions & 0 deletions test/runtime/local/io/ReadCsv5.csv.meta
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
{
"numRows": 4,
"numCols": 3,
"schema": [
{
"label": "a",
"valueType": "si8"
},
{
"label": "b",
"valueType": "ui8"
},
{
"label": "c",
"valueType": "str"
}
]
}
34 changes: 34 additions & 0 deletions test/runtime/local/io/ReadCsvTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,40 @@ TEST_CASE("ReadCsv, frame of uint8s", TAG_IO) {
DataObjectFactory::destroy(m);
}

// Reads a 4x3 CSV file (two numeric columns, one string column) into a frame
// with schema {UI8, UI8, STR} and checks every cell against its expected value.
TEST_CASE("ReadCsv, frame of uint8s and strings", TAG_IO) {
    ValueTypeCode schema[] = {ValueTypeCode::UI8, ValueTypeCode::UI8, ValueTypeCode::STR};
    size_t numRows = 4;
    size_t numCols = 3;
    char delim = ',';
    char filename[] = "./test/runtime/local/io/ReadCsv5.csv";

    Frame *m = nullptr;
    readCsv(m, filename, numRows, numCols, delim, schema);

    REQUIRE(m->getNumRows() == numRows);
    REQUIRE(m->getNumCols() == numCols);

    // Expected cell values, one table per column, indexed by row.
    const int expectedCol0[] = {1, 2, 3, 4};
    // The file stores -1..-4 in this column; read as uint8 these are expected
    // to show up as 255..252.
    const int expectedCol1[] = {255, 254, 253, 252};
    // Covers an explicitly quoted empty field, a missing field, a quoted field
    // ending in the delimiter, and an unquoted string containing a space.
    const char *const expectedCol2[] = {"", "", "multi-line,", "simple string"};

    for (size_t r = 0; r < 4; r++) {
        CHECK(m->getColumn<uint8_t>(0)->get(r, 0) == expectedCol0[r]);
    }

    for (size_t r = 0; r < 4; r++) {
        CHECK(m->getColumn<uint8_t>(1)->get(r, 0) == expectedCol1[r]);
    }

    for (size_t r = 0; r < 4; r++) {
        CHECK(m->getColumn<std::string>(2)->get(r, 0) == expectedCol2[r]);
    }

    DataObjectFactory::destroy(m);
}

TEST_CASE("ReadCsv, col + row ignore", TAG_IO) {
ValueTypeCode schema[] = {ValueTypeCode::UI8, ValueTypeCode::UI8};
Frame *m = NULL;
Expand Down

0 comments on commit 649b2da

Please sign in to comment.