From 77105a61c085a318a9b901d8d9801a9d3eb4c3d3 Mon Sep 17 00:00:00 2001
From: Justin Tay <49700559+justin-tay@users.noreply.github.com>
Date: Sat, 15 Jun 2024 09:41:53 +0800
Subject: [PATCH] Improve documentation on regular expressions
---
README.md | 38 +++++-----
doc/compatibility.md | 17 ++++-
doc/ecma-262.md | 71 ++++++++++++++-----
.../schema/regex/JDKRegularExpression.java | 3 +
.../regex/GraalJSRegularExpressionTest.java | 17 +++++
.../regex/JDKRegularExpressionTest.java | 19 +++++
.../regex/JoniRegularExpressionTest.java | 19 +++++
7 files changed, 144 insertions(+), 40 deletions(-)
diff --git a/README.md b/README.md
index 7ac24f86e..eea5814c6 100644
--- a/README.md
+++ b/README.md
@@ -98,7 +98,7 @@ This implementation is tested against the [JSON Schema Test Suite](https://githu
|-----------------|-------------------------------------------------------------------------|-------------------------------------------------------------------|---------------------------------------------------------------------|--------------------------------------------------------------------|------------------------------------------------------------------------|----------------------------------------------------------------------|------------------------------------------------------------------------|
| NetworkNt | pass: r:4703 (100.0%) o:2369 (100.0%)
fail: r:0 (0.0%) o:1 (0.0%) | | pass: r:600 (100.0%) o:251 (100.0%)
fail: r:0 (0.0%) o:0 (0.0%) | pass: r:796 (100.0%) o:318 (100.0%)
fail: r:0 (0.0%) o:0 (0.0%) | pass: r:880 (100.0%) o:541 (100.0%)
fail: r:0 (0.0%) o:0 (0.0%) | pass: r:1201 (100.0%) o:625 (100.0%)
fail: r:0 (0.0%) o:0 (0.0%) | pass: r:1226 (100.0%) o:634 (99.8%)
fail: r:0 (0.0%) o:1 (0.2%) |
-* Note that this uses the ECMA 262 Validator option turned on for the `pattern` tests.
+* Note that this uses the `JoniRegularExpressionFactory` for the `pattern` and `format` `regex` tests.
#### Jackson Parser
@@ -157,17 +157,6 @@ The following are the optional dependencies that may be required for certain opt
These are not automatically included and setting the relevant option without adding the library will result in a `ClassNotFoundException`.
```xml
-
-
-
-
-
- org.jruby.joni
- joni
- ${version.joni}
- true
-
-
@@ -175,7 +164,15 @@ These are not automatically included and setting the relevant option without add
org.graalvm.js
js
${version.graaljs}
- true
+
+
+
+
+
+
+ org.jruby.joni
+ joni
+ ${version.joni}
```
@@ -270,8 +267,9 @@ SchemaValidatorsConfig config = new SchemaValidatorsConfig();
// By default JSON Path is used for reporting the instance location and evaluation path
config.setPathType(PathType.JSON_POINTER);
// By default the JDK regular expression implementation which is not ECMA 262 compliant is used
-// Note that setting this to true requires including the optional joni or graaljs dependency
-// config.setEcma262Validator(true);
+// Note that setting this requires including optional dependencies
+// config.setRegularExpressionFactory(GraalJSRegularExpressionFactory.getInstance());
+// config.setRegularExpressionFactory(JoniRegularExpressionFactory.getInstance());
// Due to the mapping the schema will be retrieved from the classpath at classpath:schema/example-main.json.
// If the schema data does not specify an $id the absolute IRI of the schema location will be used as the $id.
@@ -305,8 +303,9 @@ SchemaValidatorsConfig config = new SchemaValidatorsConfig();
// By default JSON Path is used for reporting the instance location and evaluation path
config.setPathType(PathType.JSON_POINTER);
// By default the JDK regular expression implementation which is not ECMA 262 compliant is used
-// Note that setting this to true requires including the optional joni or graaljs dependency
-// config.setEcma262Validator(true);
+// Note that setting this requires including optional dependencies
+// config.setRegularExpressionFactory(GraalJSRegularExpressionFactory.getInstance());
+// config.setRegularExpressionFactory(JoniRegularExpressionFactory.getInstance());
// Due to the mapping the meta-schema will be retrieved from the classpath at classpath:draft/2020-12/schema.
JsonSchema schema = jsonSchemaFactory.getSchema(SchemaLocation.of(SchemaId.V202012), config);
@@ -529,7 +528,6 @@ The following is sample output from the Hierarchical format.
| Name | Description | Default Value
|---------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------
| `pathType` | The path type to use for reporting the instance location and evaluation path. Set to `PathType.JSON_POINTER` to use JSON Pointer. | `PathType.DEFAULT`
-| `ecma262Validator` | Whether to use the ECMA 262 `joni` or `graaljs` library to validate the `pattern` keyword. This requires the dependency to be manually added to the project or a `ClassNotFoundException` will be thrown. | `false`
| `executionContextCustomizer` | This can be used to customize the `ExecutionContext` generated by the `JsonSchema` for each validation run. | `null`
| `schemaIdValidator` | This is used to customize how the `$id` values are validated. Note that the default implementation allows non-empty fragments where no base IRI is specified and also allows non-absolute IRI `$id` values in the root schema. | `JsonSchemaIdValidator.DEFAULT`
| `messageSource` | This is used to retrieve the locale specific messages. | `DefaultMessageSource.getInstance()`
@@ -539,7 +537,7 @@ The following is sample output from the Hierarchical format.
| `locale` | The locale to use for generating messages in the `ValidationMessage`. | `Locale.getDefault()`
| `failFast` | Whether to return failure immediately when an assertion is generated. | `false`
| `formatAssertionsEnabled` | The default is to generate format assertions from Draft 4 to Draft 7 and to only generate annotations from Draft 2019-09. Setting to `true` or `false` will override the default behavior. | `null`
-| `regularExpressionFactory` | The factory to use to create regular expressions for instance `JoniRegularExpressionFactory` or `GraalJSRegularExpressionFactory`. | `JDKRegularExpressionFactory.getInstance()`
+| `regularExpressionFactory` | The factory to use to create regular expressions for instance `JoniRegularExpressionFactory` or `GraalJSRegularExpressionFactory`. This requires the dependency to be manually added to the project or a `ClassNotFoundException` will be thrown. | `JDKRegularExpressionFactory.getInstance()`
## Performance Considerations
@@ -576,7 +574,7 @@ This does not mean that using a schema with a later draft specification will aut
## [JSON Schema Walkers and WalkListeners](doc/walkers.md)
-## [ECMA-262 Regex](doc/ecma-262.md)
+## [Regular Expressions](doc/ecma-262.md)
## [Custom Message](doc/cust-msg.md)
diff --git a/doc/compatibility.md b/doc/compatibility.md
index 09ad27a37..410f894a8 100644
--- a/doc/compatibility.md
+++ b/doc/compatibility.md
@@ -7,7 +7,7 @@
[![Draft 6](https://img.shields.io/endpoint?url=https%3A%2F%2Fbowtie.report%2Fbadges%2Fjava-com.networknt-json-schema-validator%2Fcompliance%2Fdraft6.json)](https://bowtie.report/#/dialects/draft6)
[![Draft 4](https://img.shields.io/endpoint?url=https%3A%2F%2Fbowtie.report%2Fbadges%2Fjava-com.networknt-json-schema-validator%2Fcompliance%2Fdraft4.json)](https://bowtie.report/#/dialects/draft4)
-The `pattern` validator by default uses the JDK regular expression implementation which is not ECMA-262 compliant and is thus not compliant with the JSON Schema specification. The library can however be configured to use a ECMA-262 compliant regular expression implementation.
+The `pattern` and `format` `regex` validators by default use the JDK regular expression implementation, which is not ECMA-262 compliant and is thus not compliant with the JSON Schema specification. The library can however be configured to use an ECMA-262 compliant regular expression implementation such as `GraalJS` or `Joni`.
Annotation processing and reporting are implemented. Note that the collection of annotations will have an adverse performance impact.
@@ -113,13 +113,24 @@ By default the `pattern` keyword uses the JDK regular expression implementation
This is not ECMA-262 compliant and is thus not compliant with the JSON Schema specification. This is however the more likely desired behavior as other logic will most likely be using the default JDK regular expression implementation to perform downstream processing.
-The library can be configured to use a ECMA-262 compliant regular expression validator which is implemented using [joni](https://github.com/jruby/joni). This can be configured by setting `setEcma262Validator` to `true`.
+The library can be configured to use an ECMA-262 compliant regular expression validator which is implemented using [GraalJS](https://github.com/oracle/graaljs) or [Joni](https://github.com/jruby/joni). This can be configured by setting `setRegularExpressionFactory` to the respective `GraalJSRegularExpressionFactory` or `JoniRegularExpressionFactory` instances.
-This also requires adding the `joni` dependency.
+This also requires adding the `org.graalvm.js:js` or `org.jruby.joni:joni` dependency.
```xml
+
+
+ org.graalvm.js
+ js
+ ${version.graaljs}
+
+
+
+
+
+
org.jruby.joni
joni
${version.joni}
diff --git a/doc/ecma-262.md b/doc/ecma-262.md
index b7869a2ae..418686ebc 100644
--- a/doc/ecma-262.md
+++ b/doc/ecma-262.md
@@ -1,28 +1,65 @@
-For the pattern validator, we now have two options for regex in the library. The default one is `java.util.regex`; however, you can use the ECMA-262 standard library `org.jruby.joni` by configuration.
+# Regular Expressions
-As we know, the JSON schema is designed based on the Javascript language and its regex. The Java internal implementation has some differences which don't comply with the standard. For most users, these edge cases are not the issue as they are not using them anyway. Even when they are using it, they are expecting the Java regex result as the application is built on the Java platform. For users who want to ensure that they are using 100% standard patter validator, we have provided an option to override the default regex library with `org.jruby.joni` that is complying with the ECMA-262 standard.
+For the `pattern` and `format` `regex` validators there are three built-in options in the library.
-### Which one to choose?
+A custom implementation can be made by implementing `com.networknt.schema.regex.RegularExpressionFactory` to return a custom implementation of `com.networknt.schema.regex.RegularExpression`.
-If you want a faster regex lib and don't care about the slight difference between Java and Javascript regex, then you don't need to do anything. The default regex lib is the `java.util.regex`.
+| Regular Expression Factory | Description |
+|--------------------------------------------------|----------------------------------------------------|
+| `JDKRegularExpressionFactory` | Uses Java's standard `java.util.regex` and calls the `find()` method. Note that `matches()` is not called as that attempts to match the entire string, implicitly adding anchors. This is the default implementation and does not require any additional libraries. |
+| `JoniRegularExpressionFactory` | Uses `org.joni.Regex` with `Syntax.ECMAScript`. This requires adding the `org.jruby.joni:joni` dependency which will require about 2MB. |
+| `GraalJSRegularExpressionFactory` | Uses GraalJS with `new RegExp(pattern, 'u')`. This requires adding the `org.graalvm.js:js` dependency which will require about 50MB. |
-If you want to ensure full compliance, use the `org.jruby.joni`. It is 1.5 times slower then `java.util.regex`. Depending on your use case, it might not be an issue.
+## Specification
-### How to switch?
+The use of Regular Expressions is specified in JSON Schema at https://json-schema.org/draft/2020-12/json-schema-core#name-regular-expressions.
-Here is the test case that shows how to pass a config object to use the ECMA-262 library.
+```
+Keywords MAY use regular expressions to express constraints, or constrain the instance value to be a regular expression. These regular expressions SHOULD be valid according to the regular expression dialect described in ECMA-262, section 21.2.1 [ecma262].
+
+Regular expressions SHOULD be built with the "u" flag (or equivalent) to provide Unicode support, or processed in such a way which provides Unicode support as defined by ECMA-262.
+
+Furthermore, given the high disparity in regular expression constructs support, schema authors SHOULD limit themselves to the following regular expression tokens:
+
+individual Unicode characters, as defined by the JSON specification [RFC8259];
+simple character classes ([abc]), range character classes ([a-z]);
+complemented character classes ([^abc], [^a-z]);
+simple quantifiers: "+" (one or more), "*" (zero or more), "?" (zero or one), and their lazy versions ("+?", "*?", "??");
+range quantifiers: "{x}" (exactly x occurrences), "{x,y}" (at least x, at most y, occurrences), {x,} (x occurrences or more), and their lazy versions;
+the beginning-of-input ("^") and end-of-input ("$") anchors;
+simple grouping ("(...)") and alternation ("|").
+Finally, implementations MUST NOT take regular expressions to be anchored, neither at the beginning nor at the end. This means, for instance, the pattern "es" matches "expression".
+```
+
+## Considerations when selecting implementation
+
+If strict compliance with the regular expression dialect described in ECMA-262 is required, then only the `GraalJS` implementation meets that criterion.
+
+The `Joni` implementation is configured to attempt to match the ECMA-262 regular expression dialect. However, this dialect isn't directly maintained by Joni's maintainers as it doesn't come from its upstream `Oniguruma`. The current implementation has known issues matching inputs with newlines and not respecting `^` and `$` anchors.
+
+The `JDK` implementation is the default and uses `java.util.regex` with the `find()` method.
+
+As the implementations are used when validating regular expressions, using `format` `regex`, one consideration is how the regular expression is used. For instance if the system that consumes the input is implemented in Javascript then the `GraalJS` implementation will ensure that this regular expression will work. If the system that consumes the input is implemented in Java then the `JDK` implementation may be better.
+
+## Configuration of implementation
+
+The following test case shows how to pass a config object to use the `GraalJS` factory.
```java
-@Test(expected = JsonSchemaException.class)
-public void testInvalidPatternPropertiesValidatorECMA262() throws Exception {
- SchemaValidatorsConfig config = new SchemaValidatorsConfig();
- config.setEcma262Validator(true);
- JsonSchemaFactory factory = JsonSchemaFactory.getInstance(SpecVersion.VersionFlag.V4);
- JsonSchema schema = factory.getSchema("{\"patternProperties\":6}", config);
-
- JsonNode node = getJsonNodeFromStringContent("");
- Set errors = schema.validate(node);
- Assert.assertEquals(errors.size(), 0);
+public class RegularExpressionTest {
+ @Test
+ public void testInvalidRegexValidatorECMA262() throws Exception {
+ SchemaValidatorsConfig config = new SchemaValidatorsConfig();
+ config.setRegularExpressionFactory(GraalJSRegularExpressionFactory.getInstance());
+ JsonSchemaFactory factory = JsonSchemaFactory.getInstance(VersionFlag.V202012);
+ JsonSchema schema = factory.getSchema("{\r\n"
+ + " \"format\": \"regex\"\r\n"
+ + "}", config);
+ Set<ValidationMessage> errors = schema.validate("\"\\\\a\"", InputFormat.JSON, executionContext -> {
+ executionContext.getExecutionConfig().setFormatAssertionsEnabled(true);
+ });
+ assertFalse(errors.isEmpty());
+ }
}
```
diff --git a/src/main/java/com/networknt/schema/regex/JDKRegularExpression.java b/src/main/java/com/networknt/schema/regex/JDKRegularExpression.java
index 408117f80..f3353cb27 100644
--- a/src/main/java/com/networknt/schema/regex/JDKRegularExpression.java
+++ b/src/main/java/com/networknt/schema/regex/JDKRegularExpression.java
@@ -14,6 +14,9 @@ class JDKRegularExpression implements RegularExpression {
@Override
public boolean matches(String value) {
+ /*
+ * find() is used instead of matches() because matches() requires the entire input to match, which would implicitly anchor the pattern
+ */
return this.pattern.matcher(value).find();
}
}
\ No newline at end of file
diff --git a/src/test/java/com/networknt/schema/regex/GraalJSRegularExpressionTest.java b/src/test/java/com/networknt/schema/regex/GraalJSRegularExpressionTest.java
index 1c1fd7d55..6d1531c87 100644
--- a/src/test/java/com/networknt/schema/regex/GraalJSRegularExpressionTest.java
+++ b/src/test/java/com/networknt/schema/regex/GraalJSRegularExpressionTest.java
@@ -157,6 +157,23 @@ void namedBackreference() {
assertTrue(regex.matches("title=\"Named capturing groups\\' advantages\""));
}
+ @Test
+ void anchorShouldNotMatchMultilineInput() {
+ RegularExpression regex = new GraalJSRegularExpression("^[a-z]{1,10}$", CONTEXT);
+ assertFalse(regex.matches("abc\n")); // input ending in a newline must not satisfy the $ anchor
+ }
+
+ /**
+ * Verifies that the pattern is not implicitly anchored, unlike the JDK regex
+ * matches function: a match anywhere within the input is sufficient.
+ */
+ @Test
+ void noImplicitAnchors() {
+ RegularExpression regex = new GraalJSRegularExpression("[a-z]{1,10}", CONTEXT);
+ assertTrue(regex.matches("1abc1"));
+ }
+
+
@Test
void concurrency() throws Exception {
RegularExpression regex = new GraalJSRegularExpression("\\d", CONTEXT);
diff --git a/src/test/java/com/networknt/schema/regex/JDKRegularExpressionTest.java b/src/test/java/com/networknt/schema/regex/JDKRegularExpressionTest.java
index 8cca8a79d..92b4dcc41 100644
--- a/src/test/java/com/networknt/schema/regex/JDKRegularExpressionTest.java
+++ b/src/test/java/com/networknt/schema/regex/JDKRegularExpressionTest.java
@@ -15,9 +15,11 @@
*/
package com.networknt.schema.regex;
+import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;
+import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
/**
@@ -40,4 +42,21 @@ void namedBackreference() {
RegularExpression regex = new JDKRegularExpression("title=(?[\"'])(.*?)\\k");
assertTrue(regex.matches("title=\"Named capturing groups\\' advantages\""));
}
+
+ @Test
+ @Disabled // NOTE(review): no reason recorded; presumably fails because java.util.regex lets $ match before a final line terminator - confirm
+ void anchorShouldNotMatchMultilineInput() {
+ RegularExpression regex = new JDKRegularExpression("^[a-z]{1,10}$");
+ assertFalse(regex.matches("abc\n"));
+ }
+
+ /**
+ * Verifies that the pattern is not implicitly anchored, unlike the JDK regex
+ * matches function: a match anywhere within the input is sufficient.
+ */
+ @Test
+ void noImplicitAnchors() {
+ RegularExpression regex = new JDKRegularExpression("[a-z]{1,10}");
+ assertTrue(regex.matches("1abc1"));
+ }
}
diff --git a/src/test/java/com/networknt/schema/regex/JoniRegularExpressionTest.java b/src/test/java/com/networknt/schema/regex/JoniRegularExpressionTest.java
index 9a1d0cda2..83279ba06 100644
--- a/src/test/java/com/networknt/schema/regex/JoniRegularExpressionTest.java
+++ b/src/test/java/com/networknt/schema/regex/JoniRegularExpressionTest.java
@@ -17,10 +17,12 @@
import static org.junit.jupiter.api.Assertions.assertDoesNotThrow;
import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;
import org.joni.exception.SyntaxException;
+import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.EnumSource;
@@ -153,4 +155,21 @@ void namedBackreference() {
RegularExpression regex = new JoniRegularExpression("title=(?[\"'])(.*?)\\k");
assertTrue(regex.matches("title=\"Named capturing groups\\' advantages\""));
}
+
+ @Test
+ @Disabled // This test should pass but currently doesn't; see issue #495
+ void anchorShouldNotMatchMultilineInput() {
+ RegularExpression regex = new JoniRegularExpression("^[a-z]{1,10}$");
+ assertFalse(regex.matches("abc\n"));
+ }
+
+ /**
+ * Verifies that the pattern is not implicitly anchored, unlike the JDK regex
+ * matches function: a match anywhere within the input is sufficient.
+ */
+ @Test
+ void noImplicitAnchors() {
+ RegularExpression regex = new JoniRegularExpression("[a-z]{1,10}");
+ assertTrue(regex.matches("1abc1"));
+ }
}