From 7304013c8627f3d692d9d831a9427f8a31417cfe Mon Sep 17 00:00:00 2001 From: Sven Obser Date: Sat, 7 Nov 2020 16:42:40 +0100 Subject: [PATCH] Support `ignoreUnknownColumns` Allow unknown columns to be ignored and skipped. Closes #9 --- CHANGELOG.md | 2 + README.md | 1 + .../serialization/csv/CsvConfiguration.kt | 4 +- .../kotlinx/serialization/csv/CsvException.kt | 43 +++++++ .../csv/decode/ClassCsvDecoder.kt | 34 +++++- .../serialization/csv/decode/CsvDecoder.kt | 27 +++-- .../serialization/csv/decode/CsvReader.kt | 3 + .../serialization/csv/decode/Source.kt | 5 + .../serialization/csv/decode/StringSource.kt | 3 + .../csv/config/CsvIgnoreUnknownKeysTest.kt | 106 ++++++++++++++++++ .../serialization/test/TestingFramework.kt | 13 +++ 11 files changed, 230 insertions(+), 11 deletions(-) create mode 100644 library/src/main/kotlin/kotlinx/serialization/csv/CsvException.kt create mode 100644 library/src/test/kotlin/kotlinx/serialization/csv/config/CsvIgnoreUnknownKeysTest.kt diff --git a/CHANGELOG.md b/CHANGELOG.md index b0f7f98..1dd64e7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [Unreleased] +### Added +- Support `ignoreUnknownColumns`. ## [1.0.2] - 2020-10-11 ### Changed diff --git a/README.md b/README.md index a772f80..4ba6150 100644 --- a/README.md +++ b/README.md @@ -94,6 +94,7 @@ CSV serialization and parsing options can be changed by providing a custom `CsvC | `ignoreEmptyLines` | `true` | Ignore empty lines during parsing. | | `hasHeaderRecord` | `false` | First line is header record. | | `headerSeparator` | `.` | Character that is used to separate hierarchical header names. | +| `ignoreUnknownColumns` | `false` | Ignore unknown columns (only has effect when `hasHeaderRecord` is enabled). | | `hasTrailingDelimiter` | `false` | If records end with a trailing `delimiter`. 
| ## Requirements diff --git a/library/src/main/kotlin/kotlinx/serialization/csv/CsvConfiguration.kt b/library/src/main/kotlin/kotlinx/serialization/csv/CsvConfiguration.kt index 02c0fc7..9fa9b17 100644 --- a/library/src/main/kotlin/kotlinx/serialization/csv/CsvConfiguration.kt +++ b/library/src/main/kotlin/kotlinx/serialization/csv/CsvConfiguration.kt @@ -12,6 +12,7 @@ package kotlinx.serialization.csv * @param ignoreEmptyLines Ignore empty lines during parsing (default: `true`). * @param hasHeaderRecord First line is header record (default: `false`). * @param headerSeparator Character that is used to separate hierarchical header names (default: `.`). + * @param ignoreUnknownColumns Ignore unknown columns when `hasHeaderRecord` is enabled (default: `false`). * @param hasTrailingDelimiter If records end with a trailing [delimiter] (default: `false`). */ data class CsvConfiguration( @@ -24,7 +25,8 @@ data class CsvConfiguration( val ignoreEmptyLines: Boolean = true, val hasHeaderRecord: Boolean = false, val headerSeparator: Char = '.', - val hasTrailingDelimiter: Boolean = false + val ignoreUnknownColumns: Boolean = false, + val hasTrailingDelimiter: Boolean = false, ) { init { diff --git a/library/src/main/kotlin/kotlinx/serialization/csv/CsvException.kt b/library/src/main/kotlin/kotlinx/serialization/csv/CsvException.kt new file mode 100644 index 0000000..cd7e1eb --- /dev/null +++ b/library/src/main/kotlin/kotlinx/serialization/csv/CsvException.kt @@ -0,0 +1,43 @@ +@file:Suppress("FunctionName") + +package kotlinx.serialization.csv + +import kotlinx.serialization.ExperimentalSerializationApi +import kotlinx.serialization.SerializationException +import kotlinx.serialization.descriptors.SerialDescriptor + +/** + * Generic exception indicating a problem with CSV serialization and deserialization. 
+ */ +internal open class CsvException(message: String) : SerializationException(message) + +/** + * Thrown when [Csv] has failed to create a CSV string from the given value. + */ +internal class CsvEncodingException(message: String) : CsvException(message) + +@OptIn(ExperimentalSerializationApi::class) +internal fun UnsupportedSerialDescriptorException(descriptor: SerialDescriptor) = CsvEncodingException ( + "CSV does not support '${descriptor.kind}'." +) + +@OptIn(ExperimentalSerializationApi::class) +internal fun HeadersNotSupportedForSerialDescriptorException(descriptor: SerialDescriptor) = CsvEncodingException ( + "CSV headers are not supported for variable sized type '${descriptor.kind}'." +) + +/** + * Thrown when [Csv] has failed to parse the given CSV string or deserialize it to a target class. + */ +internal class CsvDecodingException(message: String) : CsvException(message) + +internal fun CsvDecodingException(offset: Int?, message: String) = + CsvDecodingException(if (offset != null) "Unexpected CSV token at offset $offset: $message" else message) + +internal fun UnknownColumnHeaderException(offset: Int, header: String) = CsvDecodingException( + offset, + """ + |Encountered unknown column header '$header'. + |Use 'ignoreUnknownColumns = true' in 'Csv {}' builder to ignore unknown columns. 
+ |""".trimMargin() +) diff --git a/library/src/main/kotlin/kotlinx/serialization/csv/decode/ClassCsvDecoder.kt b/library/src/main/kotlin/kotlinx/serialization/csv/decode/ClassCsvDecoder.kt index 5efa82c..3a660f8 100644 --- a/library/src/main/kotlin/kotlinx/serialization/csv/decode/ClassCsvDecoder.kt +++ b/library/src/main/kotlin/kotlinx/serialization/csv/decode/ClassCsvDecoder.kt @@ -20,10 +20,23 @@ internal class ClassCsvDecoder( ) : CsvDecoder(csv, reader, parent) { private var elementIndex = 0 + private var columnIndex = 0 override fun decodeElementIndex(descriptor: SerialDescriptor): Int = when { - reader.isDone || elementIndex >= descriptor.elementsCount -> CompositeDecoder.DECODE_DONE - classHeaders != null -> classHeaders[elementIndex] + reader.isDone -> CompositeDecoder.DECODE_DONE + elementIndex >= descriptor.elementsCount -> CompositeDecoder.DECODE_DONE + classHeaders != null && columnIndex >= classHeaders.size -> CompositeDecoder.DECODE_DONE + + classHeaders != null -> + when (val result = classHeaders[columnIndex]) { + CompositeDecoder.UNKNOWN_NAME -> { + ignoreColumn() + decodeElementIndex(descriptor) + } + null -> CompositeDecoder.UNKNOWN_NAME + else -> result + } + else -> elementIndex } @@ -42,14 +55,31 @@ internal class ClassCsvDecoder( } } + override fun endStructure(descriptor: SerialDescriptor) { + super.endStructure(descriptor) + + if (classHeaders != null && csv.configuration.ignoreUnknownColumns) { + while (columnIndex < classHeaders.size) { + ignoreColumn() + } + } + } + override fun endChildStructure(descriptor: SerialDescriptor) { super.endChildStructure(descriptor) elementIndex++ + columnIndex++ } override fun decodeColumn(): String { val value = super.decodeColumn() elementIndex++ + columnIndex++ return value } + + private fun ignoreColumn() { + reader.readColumn() + columnIndex++ + } } diff --git a/library/src/main/kotlin/kotlinx/serialization/csv/decode/CsvDecoder.kt 
b/library/src/main/kotlin/kotlinx/serialization/csv/decode/CsvDecoder.kt index 32738a2..7edd760 100644 --- a/library/src/main/kotlin/kotlinx/serialization/csv/decode/CsvDecoder.kt +++ b/library/src/main/kotlin/kotlinx/serialization/csv/decode/CsvDecoder.kt @@ -3,6 +3,8 @@ package kotlinx.serialization.csv.decode import kotlinx.serialization.ExperimentalSerializationApi import kotlinx.serialization.csv.Csv import kotlinx.serialization.csv.CsvConfiguration +import kotlinx.serialization.csv.UnknownColumnHeaderException +import kotlinx.serialization.csv.UnsupportedSerialDescriptorException import kotlinx.serialization.descriptors.PolymorphicKind import kotlinx.serialization.descriptors.SerialDescriptor import kotlinx.serialization.descriptors.StructureKind @@ -24,10 +26,10 @@ internal abstract class CsvDecoder( override val serializersModule: SerializersModule get() = csv.serializersModule - protected val configuration: CsvConfiguration + private val configuration: CsvConfiguration get() = csv.configuration - protected var headers: Headers? = null + private var headers: Headers? 
= null override fun beginStructure(descriptor: SerialDescriptor): CompositeDecoder { return when (descriptor.kind) { @@ -47,8 +49,7 @@ internal abstract class CsvDecoder( PolymorphicKind.OPEN -> ClassCsvDecoder(csv, reader, this, headers) - else -> - error("CSV does not support '${descriptor.kind}'.") + else -> throw UnsupportedSerialDescriptorException(descriptor) } } @@ -124,9 +125,11 @@ internal abstract class CsvDecoder( private fun readHeaders(desc: SerialDescriptor, prefix: String): Headers { val headers = Headers() var position = 0 - while (reader.isFirstRecord) { - // Read header value and check if it (still) starts with required prefix + while (!reader.isDone && reader.isFirstRecord) { + val offset = reader.offset reader.mark() + + // Read header value and check if it (still) starts with required prefix val value = reader.readColumn() if (!value.startsWith(prefix)) { reader.reset() @@ -151,8 +154,13 @@ internal abstract class CsvDecoder( } else { reader.unmark() } - } else { + } else if (csv.configuration.ignoreUnknownColumns) { + headers[position] = CompositeDecoder.UNKNOWN_NAME reader.unmark() + } else if (value == "" && !reader.isFirstRecord && configuration.hasTrailingDelimiter) { + reader.unmark() + } else { + throw UnknownColumnHeaderException(offset, value) } } position++ @@ -176,8 +184,11 @@ internal abstract class CsvDecoder( private val map = mutableMapOf() private val subHeaders = mutableMapOf() + val size + get() = map.size + operator fun get(position: Int) = - map.getOrElse(position) { CompositeDecoder.UNKNOWN_NAME } + map[position] operator fun set(key: Int, value: Int) { map[key] = value diff --git a/library/src/main/kotlin/kotlinx/serialization/csv/decode/CsvReader.kt b/library/src/main/kotlin/kotlinx/serialization/csv/decode/CsvReader.kt index bd57777..6c7bae0 100644 --- a/library/src/main/kotlin/kotlinx/serialization/csv/decode/CsvReader.kt +++ b/library/src/main/kotlin/kotlinx/serialization/csv/decode/CsvReader.kt @@ -7,6 +7,9 @@ import 
kotlinx.serialization.csv.CsvConfiguration */ internal class CsvReader(private val source: Source, private val configuration: CsvConfiguration) { + val offset + get() = source.offset + var recordNo = 0 private set diff --git a/library/src/main/kotlin/kotlinx/serialization/csv/decode/Source.kt b/library/src/main/kotlin/kotlinx/serialization/csv/decode/Source.kt index b460737..7edd749 100644 --- a/library/src/main/kotlin/kotlinx/serialization/csv/decode/Source.kt +++ b/library/src/main/kotlin/kotlinx/serialization/csv/decode/Source.kt @@ -5,6 +5,11 @@ package kotlinx.serialization.csv.decode */ interface Source { + /** + * Current read offset in the source. + */ + val offset: Int + /** * Check if there are more characters to read. * @return True if EOF has not been read, yet; false if EOF has already been read. diff --git a/library/src/main/kotlin/kotlinx/serialization/csv/decode/StringSource.kt b/library/src/main/kotlin/kotlinx/serialization/csv/decode/StringSource.kt index 3b6574a..9974a2a 100644 --- a/library/src/main/kotlin/kotlinx/serialization/csv/decode/StringSource.kt +++ b/library/src/main/kotlin/kotlinx/serialization/csv/decode/StringSource.kt @@ -7,6 +7,9 @@ private const val EOF = -1 */ internal class StringSource(private val content: String) : Source { + override val offset: Int + get() = position + private var position = 0 private var marks = arrayListOf() diff --git a/library/src/test/kotlin/kotlinx/serialization/csv/config/CsvIgnoreUnknownKeysTest.kt b/library/src/test/kotlin/kotlinx/serialization/csv/config/CsvIgnoreUnknownKeysTest.kt new file mode 100644 index 0000000..75a92f6 --- /dev/null +++ b/library/src/test/kotlin/kotlinx/serialization/csv/config/CsvIgnoreUnknownKeysTest.kt @@ -0,0 +1,106 @@ +package kotlinx.serialization.csv.config + +import kotlinx.serialization.ExperimentalSerializationApi +import kotlinx.serialization.builtins.ListSerializer +import kotlinx.serialization.csv.Csv +import kotlinx.serialization.csv.CsvConfiguration +import 
kotlinx.serialization.csv.records.Data +import kotlinx.serialization.csv.records.IntStringRecord +import kotlinx.serialization.csv.records.Location +import kotlinx.serialization.csv.records.NestedRecord +import kotlinx.serialization.test.assertParse +import kotlinx.serialization.test.assertParseFails +import kotlin.test.Test + +@OptIn(ExperimentalSerializationApi::class) +internal class CsvIgnoreUnknownKeysTest { + + @Test + fun testMultipleColumns() = assertParse( + "a,b,IGNORED\r\n1,testing,ignored", + IntStringRecord(1, "testing"), + IntStringRecord.serializer(), + Csv( + CsvConfiguration( + hasHeaderRecord = true, + ignoreUnknownColumns = true + ) + ) + ) + + @Test + fun testMultipleColumns_failure() = assertParseFails( + "a,b,IGNORED\r\n1,testing,ignored", + IntStringRecord.serializer(), + Csv( + CsvConfiguration( + hasHeaderRecord = true + ) + ) + ) + + @Test + fun testMultipleColumnsReordered() = assertParse( + "IGNORED,b,a\r\nignored,testing,1", + IntStringRecord(1, "testing"), + IntStringRecord.serializer(), + Csv( + CsvConfiguration( + hasHeaderRecord = true, + ignoreUnknownColumns = true + ) + ) + ) + + @Test + fun testMultipleColumnsReordered_failure() = assertParseFails( + "IGNORED,b,a\r\nignored,testing,1", + IntStringRecord.serializer(), + Csv( + CsvConfiguration( + hasHeaderRecord = true + ) + ) + ) + + @Test + fun testNestedRecordListWithHeaderReordered() = assertParse( + """IGNORED,time,name,data.location.lon,data.location.IGNORED,data.location.lat,data.speed,data.info,IGNORED + |IGNORED,0,Alice,1.0,IGNORED,0.0,100,info,IGNORED + |IGNORED,1,Bob,20.0,IGNORED,10.0,50,info2,IGNORED + |""".trimMargin().replace("\n", "\r\n"), + listOf( + NestedRecord( + time = 0, + name = "Alice", + data = Data( + location = Location( + lat = 0.0, + lon = 1.0 + ), + speed = 100, + info = "info" + ) + ), + NestedRecord( + time = 1, + name = "Bob", + data = Data( + location = Location( + lat = 10.0, + lon = 20.0 + ), + speed = 50, + info = "info2" + ) + ) + ), + 
ListSerializer(NestedRecord.serializer()), + Csv( + CsvConfiguration( + hasHeaderRecord = true, + ignoreUnknownColumns = true + ) + ) + ) +} diff --git a/library/src/test/kotlin/kotlinx/serialization/test/TestingFramework.kt b/library/src/test/kotlin/kotlinx/serialization/test/TestingFramework.kt index adc6451..74163e9 100644 --- a/library/src/test/kotlin/kotlinx/serialization/test/TestingFramework.kt +++ b/library/src/test/kotlin/kotlinx/serialization/test/TestingFramework.kt @@ -7,7 +7,9 @@ package kotlinx.serialization.test import kotlinx.serialization.ExperimentalSerializationApi import kotlinx.serialization.KSerializer import kotlinx.serialization.StringFormat +import kotlinx.serialization.csv.CsvDecodingException import kotlin.test.assertEquals +import kotlin.test.assertFailsWith @OptIn(ExperimentalSerializationApi::class) inline fun assertStringFormAndRestored( @@ -38,6 +40,17 @@ inline fun assertParse( assertEquals(expected, restored) } +@OptIn(ExperimentalSerializationApi::class) +inline fun assertParseFails( + input: String, + serializer: KSerializer, + format: StringFormat +) { + assertFailsWith { + format.decodeFromString(serializer, input) + } +} + @OptIn(ExperimentalSerializationApi::class) inline fun StringFormat.assertStringFormAndRestored( expected: String,