From f6fc9e56dbbc9d8e5f75130af384f5591db906d1 Mon Sep 17 00:00:00 2001
From: jimeng
Date: Fri, 4 Oct 2024 22:56:59 +0800
Subject: [PATCH] fix comments, rename SimdjsonParser2 to SimdjsonParserWithFixPath

---
 build.gradle                                  |  10 +-
 .../ParseAndSelectFixPathBenchMark.java       |  57 ++++
 src/main/java/org/simdjson/BitIndexes.java    |   7 +-
 .../java/org/simdjson/SimdJsonParser.java     |  12 +
 .../simdjson/SimdJsonParserWithFixPath.java   | 261 ++++++++++++++++++
 .../simdjson/JsonMultiValueParsingTest.java   |  57 ++++
 6 files changed, 400 insertions(+), 4 deletions(-)
 create mode 100644 src/jmh/java/org/simdjson/ParseAndSelectFixPathBenchMark.java
 create mode 100644 src/main/java/org/simdjson/SimdJsonParserWithFixPath.java
 create mode 100644 src/test/java/org/simdjson/JsonMultiValueParsingTest.java

diff --git a/build.gradle b/build.gradle
index 60e5f4b..1d9ace0 100644
--- a/build.gradle
+++ b/build.gradle
@@ -26,6 +26,7 @@ group = 'org.simdjson'
 version = scmVersion.version
 
 repositories {
+    mavenLocal()
     mavenCentral()
 }
 
@@ -45,6 +46,7 @@ java {
 ext {
     junitVersion = '5.10.2'
     jsoniterScalaVersion = '2.28.4'
+    lombokVersion = '1.18.34'
 }
 
 dependencies {
@@ -53,6 +55,10 @@ dependencies {
     jmhImplementation group: 'com.github.plokhotnyuk.jsoniter-scala', name: 'jsoniter-scala-core_2.13', version: jsoniterScalaVersion
     jmhImplementation group: 'com.google.guava', name: 'guava', version: '32.1.2-jre'
     compileOnly group: 'com.github.plokhotnyuk.jsoniter-scala', name: 'jsoniter-scala-macros_2.13', version: jsoniterScalaVersion
+    compileOnly group: 'org.projectlombok', name: 'lombok', version: lombokVersion
+    annotationProcessor group: 'org.projectlombok', name: 'lombok', version: lombokVersion
+    testCompileOnly group: 'org.projectlombok', name: 'lombok', version: lombokVersion
+    testAnnotationProcessor group: 'org.projectlombok', name: 'lombok', version: lombokVersion
 
     testImplementation group: 'org.assertj', name: 'assertj-core', version: '3.24.2'
     testImplementation group: 'org.apache.commons', name: 'commons-text', version: '1.10.0'
@@ -160,7 +166,9 @@ publishing {
     publications {
         mavenJava(MavenPublication) {
             from(components.java)
-
+            groupId = 'org.simdjson'
+            artifactId = 'simdjson-java'
+            version = scmVersion.version
             pom {
                 name = project.name
                 description = 'A Java version of simdjson, a high-performance JSON parser utilizing SIMD instructions.'
diff --git a/src/jmh/java/org/simdjson/ParseAndSelectFixPathBenchMark.java b/src/jmh/java/org/simdjson/ParseAndSelectFixPathBenchMark.java
new file mode 100644
index 0000000..18de638
--- /dev/null
+++ b/src/jmh/java/org/simdjson/ParseAndSelectFixPathBenchMark.java
@@ -0,0 +1,57 @@
+package org.simdjson;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.concurrent.TimeUnit;
+
+import org.openjdk.jmh.annotations.*;
+
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+/**
+ * Compares extracting a fixed set of paths with {@link SimdJsonParserWithFixPath} against
+ * parsing the whole document with {@link SimdJsonParser} and with Jackson.
+ */
+@State(Scope.Benchmark)
+@BenchmarkMode(Mode.Throughput)
+@OutputTimeUnit(TimeUnit.SECONDS)
+public class ParseAndSelectFixPathBenchMark {
+    @Param({"/twitter.json"})
+    String fileName;
+    private byte[] buffer;
+    private final SimdJsonParser simdJsonParser = new SimdJsonParser();
+    private final ObjectMapper jacksonObjectMapper = new ObjectMapper();
+    private final SimdJsonParserWithFixPath simdJsonParserWithFixPath = new SimdJsonParserWithFixPath(
+            "statuses.0.user.default_profile", "statuses.0.user.screen_name",
+            "statuses.0.user.name", "statuses.0.user.id", "statuses.0.user.description",
+            "statuses.1.user.default_profile", "statuses.1.user.screen_name",
+            "statuses.1.user.name", "statuses.1.user.id", "statuses.1.user.description");
+
+    @Setup(Level.Trial)
+    public void setup() throws IOException {
+        // Load the file named by the JMH parameter instead of a hard-coded resource.
+        try (InputStream is = ParseAndSelectFixPathBenchMark.class.getResourceAsStream(fileName)) {
+            if (is == null) {
+                throw new IOException("benchmark resource not found: " + fileName);
+            }
+            buffer = is.readAllBytes();
+        }
+        System.out.println("VectorSpecies = " + VectorUtils.BYTE_SPECIES);
+    }
+
+    @Benchmark
+    public JsonValue parseMultiValuesForFixPaths_SimdJson() {
+        return simdJsonParser.parse(buffer, buffer.length);
+    }
+
+    @Benchmark
+    public String[] parseMultiValuesForFixPaths_SimdJsonParserWithFixPath() {
+        return simdJsonParserWithFixPath.parse(buffer, buffer.length);
+    }
+
+    @Benchmark
+    public JsonNode parseMultiValuesForFixPaths_Jackson() throws IOException {
+        return jacksonObjectMapper.readTree(buffer);
+    }
+}
diff --git a/src/main/java/org/simdjson/BitIndexes.java b/src/main/java/org/simdjson/BitIndexes.java
index 59c0dc3..637eac9 100644
--- a/src/main/java/org/simdjson/BitIndexes.java
+++ b/src/main/java/org/simdjson/BitIndexes.java
@@ -1,6 +1,6 @@
 package org.simdjson;
 
-class BitIndexes {
+public class BitIndexes {
 
     private final int[] indexes;
 
@@ -44,8 +44,9 @@ private long clearLowestBit(long bits) {
         return bits & (bits - 1);
     }
 
-    void advance() {
-        readIdx++;
+    /** Returns the index stored at the read cursor, then moves the cursor forward by one. */
+    int advance() {
+        return indexes[readIdx++];
     }
 
     int getAndAdvance() {
diff --git a/src/main/java/org/simdjson/SimdJsonParser.java b/src/main/java/org/simdjson/SimdJsonParser.java
index 707124c..3825185 100644
--- a/src/main/java/org/simdjson/SimdJsonParser.java
+++ b/src/main/java/org/simdjson/SimdJsonParser.java
@@ -25,6 +25,18 @@ public SimdJsonParser(int capacity, int maxDepth) {
         indexer = new StructuralIndexer(bitIndexes);
     }
 
+    /**
+     * Pads the input if needed, resets the parser and runs {@code stage1} only.
+     *
+     * @return this parser's own {@code bitIndexes} field, not a copy
+     */
+    BitIndexes buildBitIndex(byte[] buffer, int len) {
+        byte[] padded = padIfNeeded(buffer, len);
+        reset();
+        stage1(padded, len);
+        return bitIndexes;
+    }
+
     public <T> T parse(byte[] buffer, int len, Class<T> expectedType) {
         byte[] padded = padIfNeeded(buffer, len);
         reset();
diff --git a/src/main/java/org/simdjson/SimdJsonParserWithFixPath.java b/src/main/java/org/simdjson/SimdJsonParserWithFixPath.java
new file mode 100644
index 0000000..5866d4e
--- /dev/null
+++ b/src/main/java/org/simdjson/SimdJsonParserWithFixPath.java
@@ -0,0 +1,261 @@
+package org.simdjson;
+
+import java.nio.charset.StandardCharsets;
+import java.util.HashMap;
+import java.util.Map;
+
+import lombok.Getter;
+import lombok.RequiredArgsConstructor;
+import lombok.Setter;
+
+/**
+ * Extracts the values at a fixed set of dot-separated paths (for example
+ * {@code "statuses.0.user.name"}) from a JSON document without building a document tree.
+ *
+ * <p>Object keys are compared with the raw text between their quotes (escape sequences are not
+ * decoded) and array elements are addressed by their decimal position. Scalars are returned as
+ * text with the surrounding quotes removed, objects and arrays as their raw JSON text, and
+ * {@code null} literals or paths missing from the document as {@code null}.
+ *
+ * <p>An instance keeps per-parse state in its fields and reuses the array it returns, so it must
+ * not be shared between threads and a result must be read before the next {@link #parse} call.
+ */
+public class SimdJsonParserWithFixPath {
+
+    private static final String INVALID_JSON = "invalid json format";
+
+    /**
+     * One segment of a registered path. {@code @Data} is deliberately avoided: the
+     * {@code equals}/{@code hashCode}/{@code toString} it generates would recurse endlessly
+     * through the parent/children cycle.
+     */
+    @Getter
+    @Setter
+    @RequiredArgsConstructor
+    static class JsonNode {
+        // value is only meaningful while version equals the parser's currentVersion
+        private long version = 0;
+        private boolean isLeaf = false;
+        private final String name;
+        private String value = null;
+        private JsonNode parent = null;
+        private final Map<String, JsonNode> children = new HashMap<>();
+    }
+
+    private final SimdJsonParser parser;
+    private BitIndexes bitIndexes;
+    private final JsonNode root = new JsonNode(null);
+    private final JsonNode[] row;
+    private final String[] result;
+    private final String[] emptyResult;
+    private JsonNode ptr;
+    private byte[] buffer;
+    private final int expectParseCols;
+    // every time json string is processed, currentVersion will be incremented by 1
+    private long currentVersion = 0;
+
+    /**
+     * @param args paths to extract, with segments separated by {@code '.'}; entry {@code i} of the
+     *             array returned by {@link #parse} belongs to {@code args[i]}
+     */
+    public SimdJsonParserWithFixPath(String... args) {
+        parser = new SimdJsonParser();
+        expectParseCols = args.length;
+        row = new JsonNode[expectParseCols];
+        result = new String[expectParseCols];
+        // never written by this class, so every slot stays null
+        emptyResult = new String[expectParseCols];
+        for (int i = 0; i < expectParseCols; i++) {
+            JsonNode cur = root;
+            for (String segment : args[i].split("\\.")) {
+                JsonNode child = cur.getChildren().get(segment);
+                if (child == null) {
+                    child = new JsonNode(segment);
+                    child.setParent(cur);
+                    cur.getChildren().put(segment, child);
+                }
+                cur = child;
+            }
+            cur.setLeaf(true);
+            row[i] = cur;
+        }
+    }
+
+    /**
+     * Parses one document and returns the values of the registered paths.
+     *
+     * @param buffer UTF-8 encoded JSON whose root is an object or an array
+     * @param len number of bytes of {@code buffer} to parse
+     * @return one entry per registered path, in constructor order; the array is reused by later
+     *         calls and must not be modified by the caller
+     * @throws RuntimeException if the root is neither an object nor an array, or the structural
+     *                          index runs out before the document is closed
+     */
+    public String[] parse(byte[] buffer, int len) {
+        // Reject null/empty input before indexing it, not after.
+        if (buffer == null || buffer.length == 0 || len == 0) {
+            return emptyResult;
+        }
+        this.bitIndexes = parser.buildBitIndex(buffer, len);
+        this.currentVersion++;
+        this.ptr = root;
+        this.buffer = buffer;
+
+        switch (buffer[peekOrFail()]) {
+            case '{' -> parseMap();
+            case '[' -> parseList();
+            default -> throw new RuntimeException(INVALID_JSON);
+        }
+        return getResult();
+    }
+
+    /** Returns the next structural index without consuming it; fails if none is left. */
+    private int peekOrFail() {
+        if (!bitIndexes.hasNext()) {
+            throw new RuntimeException(INVALID_JSON);
+        }
+        return bitIndexes.peek();
+    }
+
+    /**
+     * Consumes one key or scalar and returns its text, trimmed and with surrounding quotes
+     * removed; a {@code null} literal is returned as {@code null}.
+     */
+    private String parseValue() {
+        int start = bitIndexes.advance();
+        // the token ends where the next structural character begins
+        int next = peekOrFail();
+        String field = new String(buffer, start, next - start, StandardCharsets.UTF_8).trim();
+        if ("null".equalsIgnoreCase(field)) {
+            return null;
+        }
+        // field type is string or type is decimal
+        if (field.startsWith("\"")) {
+            field = field.substring(1, field.length() - 1);
+        }
+        return field;
+    }
+
+    /**
+     * Handles one member of the container {@code ptr} points at.
+     *
+     * @param expectFieldName the array position as text, or {@code null} inside an object, in
+     *                        which case the key is read from the input
+     */
+    private void parseElement(String expectFieldName) {
+        if (expectFieldName == null) {
+            expectFieldName = parseValue();
+            bitIndexes.advance(); // skip :
+        }
+        JsonNode child = ptr.getChildren().get(expectFieldName);
+        if (child == null) {
+            // no registered path goes through this member
+            skip(false);
+            return;
+        }
+        ptr = child;
+        switch (buffer[peekOrFail()]) {
+            case '{' -> parseMap();
+            case '[' -> parseList();
+            default -> {
+                ptr.setValue(skip(true));
+                ptr.setVersion(currentVersion);
+            }
+        }
+        ptr = ptr.getParent();
+    }
+
+    private void parseMap() {
+        parseContainer('}', false);
+    }
+
+    private void parseList() {
+        parseContainer(']', true);
+    }
+
+    /**
+     * Walks the object or array at the cursor, descending only into members that lie on a
+     * registered path.
+     *
+     * @param close the bracket that ends this container
+     * @param indexed {@code true} for arrays, whose members are looked up by position
+     */
+    private void parseContainer(char close, boolean indexed) {
+        if (ptr.getChildren().isEmpty()) {
+            // nothing below this node was requested, so capture its raw text in one pass
+            ptr.setValue(skip(true));
+            ptr.setVersion(currentVersion);
+            return;
+        }
+        int start = bitIndexes.advance();
+        int position = 0;
+        while (buffer[peekOrFail()] != close) {
+            parseElement(indexed ? Integer.toString(position++) : null);
+            if (buffer[peekOrFail()] == ',') {
+                bitIndexes.advance();
+            }
+        }
+        int end = bitIndexes.advance();
+        if (ptr.isLeaf()) {
+            ptr.setValue(new String(buffer, start, end - start + 1, StandardCharsets.UTF_8));
+            ptr.setVersion(currentVersion);
+        }
+    }
+
+    /**
+     * Consumes the value at the cursor together with everything nested inside it.
+     *
+     * @param retainValue whether the text of the value is needed
+     * @return the text of the value, or {@code null} if it was not retained or is a null literal
+     */
+    private String skip(boolean retainValue) {
+        switch (buffer[peekOrFail()]) {
+            case '{' -> {
+                return skipContainer('{', '}', retainValue);
+            }
+            case '[' -> {
+                return skipContainer('[', ']', retainValue);
+            }
+            default -> {
+                if (retainValue) {
+                    return parseValue();
+                }
+                // a scalar is a single structural entry; no need to build its text
+                bitIndexes.advance();
+                return null;
+            }
+        }
+    }
+
+    /**
+     * Consumes a whole object or array. Only brackets of the same kind as the opening one are
+     * counted; this assumes the structural index holds no positions inside string literals.
+     */
+    private String skipContainer(char open, char close, boolean retainValue) {
+        int start = bitIndexes.advance();
+        int end = start;
+        int depth = 1;
+        while (depth > 0) {
+            end = peekOrFail();
+            bitIndexes.advance();
+            if (buffer[end] == open) {
+                depth++;
+            } else if (buffer[end] == close) {
+                depth--;
+            }
+        }
+        if (!retainValue) {
+            return null;
+        }
+        return new String(buffer, start, end - start + 1, StandardCharsets.UTF_8);
+    }
+
+    /** Copies the values found by the current parse into the shared result array. */
+    private String[] getResult() {
+        for (int i = 0; i < expectParseCols; i++) {
+            // a node with an older version was not seen in this document
+            result[i] = row[i].getVersion() < currentVersion ? null : row[i].getValue();
+        }
+        return result;
+    }
+}
diff --git a/src/test/java/org/simdjson/JsonMultiValueParsingTest.java b/src/test/java/org/simdjson/JsonMultiValueParsingTest.java
new file mode 100644
index 0000000..b0100cf
--- /dev/null
+++ b/src/test/java/org/simdjson/JsonMultiValueParsingTest.java
@@ -0,0 +1,57 @@
+package org.simdjson;
+
+import static org.simdjson.testutils.SimdJsonAssertions.assertThat;
+import static org.simdjson.testutils.TestUtils.toUtf8;
+
+import org.junit.jupiter.api.Test;
+
+public class JsonMultiValueParsingTest {
+    @Test
+    public void testParseMultiValue() {
+        byte[] json = toUtf8("{\"field1\":{\"field2\":\"value2\",\"field3\":3},\"field4\":[\"value4\",\"value5\"],\"field5\":null}");
+        SimdJsonParserWithFixPath parser = new SimdJsonParserWithFixPath("field1.field2", "field1.field3", "field4", "field4.0", "field5");
+        String[] result = parser.parse(json, json.length);
+        assertThat(result[0]).isEqualTo("value2");
+        assertThat(result[1]).isEqualTo("3");
+        assertThat(result[2]).isEqualTo("[\"value4\",\"value5\"]");
+        assertThat(result[3]).isEqualTo("value4");
+        assertThat(result[4]).isNull();
+    }
+
+    @Test
+    public void testNonAsciiCharacters() {
+        byte[] json = toUtf8("{\"ąćśńźż\": 1, \"\\u20A9\\u0E3F\": 2, \"αβγ\": 3, \"😀abc😀\": 4}");
+        SimdJsonParserWithFixPath parser = new SimdJsonParserWithFixPath("ąćśńźż", "\\u20A9\\u0E3F", "αβγ", "😀abc😀");
+        // when
+        String[] result = parser.parse(json, json.length);
+        // then
+        assertThat(result[0]).isEqualTo("1");
+        assertThat(result[1]).isEqualTo("2");
+        assertThat(result[2]).isEqualTo("3");
+        assertThat(result[3]).isEqualTo("4");
+    }
+
+    @Test
+    public void testNullOrEmptyInputYieldsNullForEveryPath() {
+        SimdJsonParserWithFixPath parser = new SimdJsonParserWithFixPath("field1", "field2.field3");
+        String[] fromNull = parser.parse(null, 0);
+        assertThat(fromNull[0]).isNull();
+        assertThat(fromNull[1]).isNull();
+        String[] fromEmpty = parser.parse(new byte[0], 0);
+        assertThat(fromEmpty[0]).isNull();
+        assertThat(fromEmpty[1]).isNull();
+    }
+
+    @Test
+    public void testValuesOfPreviousDocumentAreNotReturnedAgain() {
+        SimdJsonParserWithFixPath parser = new SimdJsonParserWithFixPath("a", "b");
+        byte[] first = toUtf8("{\"a\":1,\"b\":{\"c\":[1,2]}}");
+        byte[] second = toUtf8("{\"a\":2}");
+        String[] result = parser.parse(first, first.length);
+        assertThat(result[0]).isEqualTo("1");
+        assertThat(result[1]).isEqualTo("{\"c\":[1,2]}");
+        result = parser.parse(second, second.length);
+        assertThat(result[0]).isEqualTo("2");
+        assertThat(result[1]).isNull();
+    }
+}