Skip to content

Commit

Permalink
Add schema-based parsing (#43)
Browse files Browse the repository at this point in the history
  • Loading branch information
piotrrzysko authored Apr 28, 2024
1 parent 84736d5 commit 394e76c
Show file tree
Hide file tree
Showing 111 changed files with 10,911 additions and 1,086 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@
build
profilers
testdata
hotspot_*.log
36 changes: 24 additions & 12 deletions build.gradle
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import me.champeau.jmh.JmhBytecodeGeneratorTask
import org.gradle.internal.os.OperatingSystem
import org.ajoberstar.grgit.Grgit
import org.gradle.internal.os.OperatingSystem

import java.time.Duration

plugins {
Expand Down Expand Up @@ -42,20 +43,20 @@ java {
}

ext {
junitVersion = '5.9.1'
jsoniterScalaVersion = '2.24.4'
junitVersion = '5.10.2'
jsoniterScalaVersion = '2.28.4'
}

dependencies {
jmhImplementation group: 'com.fasterxml.jackson.core', name: 'jackson-databind', version: '2.16.0'
jmhImplementation group: 'com.alibaba.fastjson2', name: 'fastjson2', version: '2.0.42'
jmhImplementation group: 'com.jsoniter', name: 'jsoniter', version: '0.9.23'
jmhImplementation group: 'com.fasterxml.jackson.core', name: 'jackson-databind', version: '2.17.0'
jmhImplementation group: 'com.alibaba.fastjson2', name: 'fastjson2', version: '2.0.49'
jmhImplementation group: 'com.github.plokhotnyuk.jsoniter-scala', name: 'jsoniter-scala-core_2.13', version: jsoniterScalaVersion
jmhImplementation group: 'com.google.guava', name: 'guava', version: '32.1.2-jre'
compileOnly group: 'com.github.plokhotnyuk.jsoniter-scala', name: 'jsoniter-scala-macros_2.13', version: jsoniterScalaVersion

testImplementation group: 'org.assertj', name: 'assertj-core', version: '3.24.2'
testImplementation group: 'org.apache.commons', name: 'commons-text', version: '1.10.0'
testImplementation group: 'org.junit-pioneer', name: 'junit-pioneer', version: '2.2.0'
testImplementation group: 'org.junit.jupiter', name: 'junit-jupiter-api', version: junitVersion
testImplementation group: 'org.junit.jupiter', name: 'junit-jupiter-params', version: junitVersion
testRuntimeOnly group: 'org.junit.jupiter', name: 'junit-jupiter-engine', version: junitVersion
Expand Down Expand Up @@ -136,15 +137,21 @@ jmh {
'--add-modules=jdk.incubator.vector'
]
if (getBooleanProperty('jmh.profilersEnabled', false)) {
createDirIfDoesNotExist('./profilers')
if (OperatingSystem.current().isLinux()) {
profilers = [
'perf',
'perfasm:intelSyntax=true',
'async:verbose=true;output=flamegraph;event=cpu;dir=./profilers/async;libPath=' + getAsyncProfilerLibPath('LD_LIBRARY_PATH')
def profilerList = [
'async:verbose=true;output=flamegraph;event=cpu;dir=./profilers/async;libPath=' + getLibPath('LD_LIBRARY_PATH')
]
if (getBooleanProperty('jmh.jitLogEnabled', false)) {
createDirIfDoesNotExist('./profilers/perfasm')
profilerList += [
'perfasm:intelSyntax=true;saveLog=true;saveLogTo=./profilers/perfasm'
]
}
profilers = profilerList
} else if (OperatingSystem.current().isMacOsX()) {
profilers = [
'async:verbose=true;output=flamegraph;event=cpu;dir=./profilers/async;libPath=' + getAsyncProfilerLibPath('DYLD_LIBRARY_PATH')
'async:verbose=true;output=flamegraph;event=cpu;dir=./profilers/async;libPath=' + getLibPath('DYLD_LIBRARY_PATH')
]
}
}
Expand Down Expand Up @@ -218,6 +225,11 @@ def getBooleanProperty(String name, boolean defaultValue) {
Boolean.valueOf((project.findProperty(name) ?: defaultValue) as String)
}

static def getAsyncProfilerLibPath(String envVarName) {
// Returns the value of the given environment variable or, when that value is
// unset or empty (Groovy-falsy), the JVM's 'java.library.path' system property.
static def getLibPath(String envVarName) {
    def fromEnvironment = System.getenv(envVarName)
    return fromEnvironment ?: System.getProperty('java.library.path')
}

// Creates the given directory together with any missing parent directories.
// The result of File.mkdirs() is the (implicit) return value; it is false when
// nothing was created, which includes the case where the directory already exists.
static createDirIfDoesNotExist(String dir) {
    new File(dir).mkdirs()
}
4 changes: 2 additions & 2 deletions src/jmh/java/org/simdjson/NumberParserBenchmark.java
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
public class NumberParserBenchmark {

private final Tape tape = new Tape(100);
private final NumberParser numberParser = new NumberParser(tape);
private final NumberParser numberParser = new NumberParser();

@Param({
"2.2250738585072013e-308", // fast path
Expand All @@ -43,7 +43,7 @@ public double baseline() {
@Benchmark
public double simdjson() {
tape.reset();
numberParser.parseNumber(numberUtf8Bytes, 0);
numberParser.parseNumber(numberUtf8Bytes, 0, tape);
return tape.getDouble(0);
}
}
31 changes: 1 addition & 30 deletions src/jmh/java/org/simdjson/ParseAndSelectBenchmark.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,6 @@
import com.alibaba.fastjson2.JSONObject;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.github.plokhotnyuk.jsoniter_scala.core.ReaderConfig$;
import com.github.plokhotnyuk.jsoniter_scala.core.package$;
import com.jsoniter.JsonIterator;
import com.jsoniter.any.Any;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Level;
Expand Down Expand Up @@ -43,19 +39,7 @@ public void setup() throws IOException {
buffer = is.readAllBytes();
bufferPadded = padded(buffer);
}
}

@Benchmark
public int countUniqueUsersWithDefaultProfile_jsoniter_scala() throws IOException {
Twitter twitter = package$.MODULE$.readFromArray(buffer, ReaderConfig$.MODULE$, Twitter$.MODULE$.codec());
Set<String> defaultUsers = new HashSet<>();
for (Status tweet: twitter.statuses()) {
User user = tweet.user();
if (user.default_profile()) {
defaultUsers.add(user.screen_name());
}
}
return defaultUsers.size();
System.out.println("VectorSpecies = " + StructuralIndexer.BYTE_SPECIES);
}

@Benchmark
Expand Down Expand Up @@ -88,19 +72,6 @@ public int countUniqueUsersWithDefaultProfile_fastjson() {
return defaultUsers.size();
}

@Benchmark
public int countUniqueUsersWithDefaultProfile_jsoniter() {
Any json = JsonIterator.deserialize(buffer);
Set<String> defaultUsers = new HashSet<>();
for (Any tweet : json.get("statuses")) {
Any user = tweet.get("user");
if (user.get("default_profile").toBoolean()) {
defaultUsers.add(user.get("screen_name").toString());
}
}
return defaultUsers.size();
}

@Benchmark
public int countUniqueUsersWithDefaultProfile_simdjson() {
JsonValue simdJsonValue = simdJsonParser.parse(buffer, buffer.length);
Expand Down
123 changes: 123 additions & 0 deletions src/jmh/java/org/simdjson/SchemaBasedParseAndSelectBenchmark.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
package org.simdjson;

import com.alibaba.fastjson2.JSON;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.github.plokhotnyuk.jsoniter_scala.core.ReaderConfig$;
import com.github.plokhotnyuk.jsoniter_scala.core.package$;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Level;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;

import java.io.IOException;
import java.io.InputStream;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.TimeUnit;

import static org.simdjson.SimdJsonPaddingUtil.padded;

/**
 * JMH throughput benchmark that deserializes the {@code /twitter.json} resource into typed objects
 * with several JSON libraries and counts the distinct screen names of users whose
 * {@code default_profile} flag is set.
 *
 * <p>The simdjson, Jackson and fastjson variants all bind to the same nested records declared at the
 * bottom of this class; the jsoniter-scala variant binds to the {@code Twitter}/{@code Status}/{@code User}
 * types through the codec exposed by {@code Twitter$}.
 *
 * <p>The loop is intentionally repeated in every benchmark method instead of being shared, so each
 * measured method stays self-contained.
 */
@State(Scope.Benchmark)
@BenchmarkMode(Mode.Throughput)
@OutputTimeUnit(TimeUnit.SECONDS)
public class SchemaBasedParseAndSelectBenchmark {

    /** Classpath location of the benchmark input document. */
    private static final String TWITTER_RESOURCE = "/twitter.json";

    private final SimdJsonParser simdJsonParser = new SimdJsonParser();
    // Unknown properties must be ignored because the target records declare only a subset of the fields.
    private final ObjectMapper objectMapper = new ObjectMapper()
            .configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);

    /** Raw bytes of the input document. */
    private byte[] buffer;
    /** Result of {@code SimdJsonPaddingUtil.padded(buffer)}; used by the "padded" simdjson variant. */
    private byte[] bufferPadded;

    /**
     * Loads the input document once per trial and prints the vector species in use.
     *
     * @throws IOException if the resource cannot be found on the classpath or cannot be read
     */
    @Setup(Level.Trial)
    public void setup() throws IOException {
        try (InputStream is = ParseBenchmark.class.getResourceAsStream(TWITTER_RESOURCE)) {
            // getResourceAsStream returns null for a missing resource; fail with a message that names it
            // instead of an anonymous NullPointerException from readAllBytes().
            if (is == null) {
                throw new IOException("Benchmark resource not found on classpath: " + TWITTER_RESOURCE);
            }
            buffer = is.readAllBytes();
            bufferPadded = padded(buffer);
        }
        System.out.println("VectorSpecies = " + StructuralIndexer.BYTE_SPECIES);
    }

    /**
     * simdjson schema-based parsing of the unpadded buffer.
     *
     * @return number of distinct screen names among users with {@code default_profile == true}
     */
    @Benchmark
    public int countUniqueUsersWithDefaultProfile_simdjson() {
        Set<String> defaultUsers = new HashSet<>();
        SimdJsonTwitter twitter = simdJsonParser.parse(buffer, buffer.length, SimdJsonTwitter.class);
        for (SimdJsonStatus status : twitter.statuses()) {
            SimdJsonUser user = status.user();
            if (user.default_profile()) {
                defaultUsers.add(user.screen_name());
            }
        }
        return defaultUsers.size();
    }

    /**
     * simdjson schema-based parsing of the padded buffer.
     *
     * @return number of distinct screen names among users with {@code default_profile == true}
     */
    @Benchmark
    public int countUniqueUsersWithDefaultProfile_simdjsonPadded() {
        Set<String> defaultUsers = new HashSet<>();
        // The length passed is that of the unpadded document, not of the padded array.
        SimdJsonTwitter twitter = simdJsonParser.parse(bufferPadded, buffer.length, SimdJsonTwitter.class);
        for (SimdJsonStatus status : twitter.statuses()) {
            SimdJsonUser user = status.user();
            if (user.default_profile()) {
                defaultUsers.add(user.screen_name());
            }
        }
        return defaultUsers.size();
    }

    /**
     * Jackson data binding into the same records.
     *
     * @return number of distinct screen names among users with {@code default_profile == true}
     * @throws IOException if Jackson fails to read or bind the document
     */
    @Benchmark
    public int countUniqueUsersWithDefaultProfile_jackson() throws IOException {
        Set<String> defaultUsers = new HashSet<>();
        SimdJsonTwitter twitter = objectMapper.readValue(buffer, SimdJsonTwitter.class);
        for (SimdJsonStatus status : twitter.statuses()) {
            SimdJsonUser user = status.user();
            if (user.default_profile()) {
                defaultUsers.add(user.screen_name());
            }
        }
        return defaultUsers.size();
    }

    /**
     * jsoniter-scala parsing through the codec provided by {@code Twitter$}.
     *
     * @return number of distinct screen names among users with {@code default_profile == true}
     */
    @Benchmark
    public int countUniqueUsersWithDefaultProfile_jsoniter_scala() {
        Twitter twitter = package$.MODULE$.readFromArray(buffer, ReaderConfig$.MODULE$, Twitter$.MODULE$.codec());
        Set<String> defaultUsers = new HashSet<>();
        for (Status tweet : twitter.statuses()) {
            User user = tweet.user();
            if (user.default_profile()) {
                defaultUsers.add(user.screen_name());
            }
        }
        return defaultUsers.size();
    }

    /**
     * fastjson2 data binding into the same records.
     *
     * @return number of distinct screen names among users with {@code default_profile == true}
     */
    @Benchmark
    public int countUniqueUsersWithDefaultProfile_fastjson() {
        Set<String> defaultUsers = new HashSet<>();
        SimdJsonTwitter twitter = JSON.parseObject(buffer, SimdJsonTwitter.class);
        for (SimdJsonStatus status : twitter.statuses()) {
            SimdJsonUser user = status.user();
            if (user.default_profile()) {
                defaultUsers.add(user.screen_name());
            }
        }
        return defaultUsers.size();
    }

    /** Subset of a user object; component names mirror the JSON field names. */
    record SimdJsonUser(boolean default_profile, String screen_name) {

    }

    /** Subset of a status (tweet) object: only the nested user is bound. */
    record SimdJsonStatus(SimdJsonUser user) {

    }

    /** Root of the document: the list of statuses. */
    record SimdJsonTwitter(List<SimdJsonStatus> statuses) {

    }
}
37 changes: 36 additions & 1 deletion src/main/java/org/simdjson/BitIndexes.java
Original file line number Diff line number Diff line change
Expand Up @@ -44,11 +44,26 @@ private long clearLowestBit(long bits) {
return bits & (bits - 1);
}

int advance() {
/** Moves the read cursor forward by one position without returning anything. */
void advance() {
    readIdx += 1;
}

/**
 * Returns the index stored at the current read position and then moves the read cursor forward by one.
 *
 * @return the value of {@code indexes} at the read position before the increment
 */
int getAndAdvance() {
    assert readIdx <= writeIdx;
    // The cursor is bumped before the array access, exactly as in indexes[readIdx++].
    int position = readIdx++;
    return indexes[position];
}

/**
 * Returns the entry just before the write position, without touching the read cursor.
 *
 * @return {@code indexes[writeIdx - 1]}
 */
int getLast() {
    int lastWritten = writeIdx - 1;
    return indexes[lastWritten];
}

/**
 * Moves the read cursor forward by one and returns the index stored at the new position.
 *
 * @return the value of {@code indexes} at the read position after the increment
 */
int advanceAndGet() {
    assert readIdx + 1 <= writeIdx;
    readIdx++;
    return indexes[readIdx];
}

/**
 * Returns the index stored at the current read position without moving the read cursor.
 *
 * @return {@code indexes[readIdx]}
 */
int peek() {
    assert readIdx <= writeIdx;
    int current = indexes[readIdx];
    return current;
}

Expand All @@ -60,6 +75,26 @@ boolean isEnd() {
return writeIdx == readIdx;
}

/**
 * Tells whether the read cursor has moved beyond the write position.
 *
 * @return {@code true} when {@code readIdx} is strictly greater than {@code writeIdx}
 */
boolean isPastEnd() {
    return writeIdx < readIdx;
}

/**
 * Stores {@code 0} in the slot at the write position ({@code indexes[writeIdx]}), i.e. one past the last
 * recorded structural index. The rationale is given in the comment below.
 */
void finish() {
// If we go past the end of the detected structural indexes, we are dealing with invalid JSON.
// Thus, we need to stop processing immediately and throw an exception. To avoid checking after every increment
// of readIdx whether this has happened, we jump to the first structural element. This should produce the
// desired outcome, i.e., an iterator should detect invalid JSON. To understand how this works, let's first
// exclude primitive values (numbers, strings, booleans, nulls) from the scope of possible JSON documents. We
// can do this because, when these values are parsed, the length of the input buffer is verified, ensuring we
// never go past its end. Therefore, we can focus solely on objects and arrays. Since we always check that if
// the first character is '{', the last one must be '}', and if the first character is '[', the last one must
// be ']', we know that if we've reached beyond the buffer without crashing, the input is either '{...}' or '[...]'.
// Thus, if we jump to the first structural element, we will generate either '{...}{' or '[...]['. Both of these
// are invalid sequences and will be detected by the iterator, which will then stop processing and throw an
// exception reporting the invalid JSON.
indexes[writeIdx] = 0;
}

void reset() {
writeIdx = 0;
readIdx = 0;
Expand Down
24 changes: 24 additions & 0 deletions src/main/java/org/simdjson/ClassResolver.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
package org.simdjson;

import java.lang.reflect.Type;
import java.util.HashMap;
import java.util.Map;

/**
 * Caches {@link ResolvedClass} instances per {@link Type}, so each type is resolved at most once
 * between calls to {@link #reset()}.
 *
 * <p>The cache is a plain {@link HashMap} and no synchronization is performed here.
 */
class ClassResolver {

    private final Map<Type, ResolvedClass> classCache = new HashMap<>();

    /**
     * Returns the cached resolution for {@code type}, creating and caching it on first request.
     *
     * @param type the type to resolve
     * @return the cached or newly created {@link ResolvedClass}
     */
    ResolvedClass resolveClass(Type type) {
        ResolvedClass cached = classCache.get(type);
        if (cached == null) {
            // NOTE(review): the new ResolvedClass is handed this resolver, so its constructor presumably
            // calls back into resolveClass for nested types. If so, Map.computeIfAbsent must not be used
            // here, because HashMap does not allow the mapping function to modify the map - confirm
            // before "simplifying" this into computeIfAbsent.
            cached = new ResolvedClass(type, this);
            classCache.put(type, cached);
        }
        return cached;
    }

    /** Drops every cached resolution. */
    void reset() {
        classCache.clear();
    }
}
4 changes: 4 additions & 0 deletions src/main/java/org/simdjson/ConstructorArgument.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
package org.simdjson;

/**
 * Pairs an integer index with the {@link ResolvedClass} describing a constructor argument.
 *
 * @param idx           index associated with the argument - presumably its position in the constructor's
 *                      parameter list; TODO confirm against the code that creates these records
 * @param resolvedClass resolved class information for the argument
 */
record ConstructorArgument(int idx, ResolvedClass resolvedClass) {
}
Loading

0 comments on commit 394e76c

Please sign in to comment.