SWAT-engineering · sungshik · Jul 10, 2024 · Jul 8, 2024 · Jul 9, 2024 · Jul 9, 2024
diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
@@ -23,6 +23,10 @@ jobs:
           java-version: 11
           distribution: 'temurin'
           cache: 'maven'
+      - uses: actions/setup-node@v4
+      - name: install-tokenizer
+        working-directory: rascal-textmate-core
+        run: npm install vscode-tmgrammar-test
       - name: run-tests
         working-directory: rascal-textmate-core
         run: mvn test -B -Drascal.compile.skip -Drascal.tutor.skip

diff --git a/README.md b/README.md
@@ -16,5 +16,18 @@ TextMate grammars, this project applies partial conversion. Alternatively, a
 previous [project](https://github.com/TarVK/syntax-highlighter) by
 [@TarVK](https://github.com/TarVK) applies total conversion.
 
-The [walkthrough](src/main/rascal/lang/textmate/conversiontests/Walkthrough.rsc)
+## Documentation
+
+The [walkthrough](rascal-textmate-core/src/main/rascal/lang/textmate/conversiontests/Walkthrough.rsc)
 explains the main ideas behind the conversion algorithm in this project.
+
+## Tests
+
+Tokenization is tested (as part of the conversion
+[tests](rascal-textmate-core/src/main/rascal/lang/textmate/conversiontests))
+with the [`vscode-tmgrammar-test`](https://github.com/PanAeon/vscode-tmgrammar-test)
+tool, which requires Node.js; without it, tokenization tests are skipped. Install it locally in directory `rascal-textmate-core` as follows:
+
+```
+npm install vscode-tmgrammar-test
+```
diff --git a/rascal-textmate-core/.gitignore b/rascal-textmate-core/.gitignore
@@ -1 +1,6 @@
-target
+target
+
+# Only used to run tests:
+node_modules
+package-lock.json
+package.json
diff --git a/rascal-textmate-core/src/main/rascal/lang/oniguruma/Conversion.rsc b/rascal-textmate-core/src/main/rascal/lang/oniguruma/Conversion.rsc
@@ -158,7 +158,9 @@ default RegExp toRegExp(Grammar _, Condition c) {
     Converts a character range to a regular expression.
 }
 
-RegExp toRegExp(Grammar _, range(begin, end))
+RegExp toRegExp(Grammar _, range(char, char))
+    = regExp("<encode(char)>", []);
+default RegExp toRegExp(Grammar _, range(begin, end))
     = regExp("[<encode(begin)>-<encode(end)>]", []);
 
 @synopsis{

diff --git a/rascal-textmate-core/src/main/rascal/lang/textmate/ConversionTests.rsc b/rascal-textmate-core/src/main/rascal/lang/textmate/ConversionTests.rsc
@@ -5,16 +5,24 @@
 module lang::textmate::ConversionTests
 
 import Grammar;
+import IO;
 import List;
 import Map;
 import ParseTree;
+import String;
+import util::ShellExec;
+import util::SystemAPI;
 
 import lang::textmate::Conversion;
 import lang::textmate::Grammar;
 
-bool doAnalyzeTest(RscGrammar rsc, list[ConversionUnit] expect) {
+bool doAnalyzeTest(RscGrammar rsc, list[ConversionUnit] expect, bool printActual = false) {
     list[ConversionUnit] actual = analyze(rsc);
 
+    for (printActual, u <- actual) {
+        println("unit(rsc, <u.prod>),");
+    }
+
     for (u <- actual) {
         assert u in expect : "Actual but not expected: <u.prod>";
     }
@@ -32,11 +40,13 @@ bool doAnalyzeTest(RscGrammar rsc, list[ConversionUnit] expect) {
     return true;
 }
 
-bool doTransformTest(list[ConversionUnit] units, RepositoryStats expect) {
-    TmGrammar tm = transform(units);
+bool doTransformTest(list[ConversionUnit] units, RepositoryStats expect, str name = "") {
+    TmGrammar tm = transform(units)[scopeName = "<name>"];
     Repository repo = tm.repository;
     list[TmRule] pats = tm.patterns;
 
+    // Test structural properties of the TextMate grammar
+
     RepositoryStats actual = <
         (0 | it + 1 | s <- repo, repo[s] is match),
         (0 | it + 1 | s <- repo, repo[s] is beginEnd),
@@ -59,10 +69,38 @@ bool doTransformTest(list[ConversionUnit] units, RepositoryStats expect) {
     assert size(pats) == size(repo) : "Actual patterns list size: <size(pats)>. Expected: <size(repo)>.";
     assert (true | it && r is include | r <- pats) : "Patterns list contains pattern(s) of unexpected kind";
     assert (true | it && s in repo | r <- pats, include(/#<s:.*>$/) := r) : "Patterns list contains pattern(s) outside repository";
+
+    // Test behavioral properties of the TextMate grammar
+
+    loc lProject = |project://rascal-textmate-core|;
+    loc lGrammar = lProject + "/target/generated-test-grammars/<name>.tmLanguage.json";
+    toJSON(tm, l = resolveLocation(lGrammar));
+
+    loc lTest = lProject + "/src/main/rascal/lang/textmate/conversiontests/<name>.test";
+    loc lTester = lProject + "/node_modules/vscode-tmgrammar-test";
+    if (!exists(lTest)) {
+        println("[LOG] No tokenization tests available for `<name>` (`<resolveLocation(lTest).path>` does not exist)");
+    } elseif (!exists(lTester)) {
+        println("[LOG] No tokenizer available (`<resolveLocation(lTester).path>` does not exist)");
+    } else {
+        bool windows = startsWith(getSystemProperty("os.name"), "Windows"); 
+        loc lExec = lProject + "/src/test/sh/lang/textmate/conversion-tests.<windows ? "bat" : "sh">";
+
+        if (<output, exitCode> := execWithCode(lExec, args = [name]) && exitCode != 0) {
+            println(output);
+            assert false : "Actual tokenization does not match expected tokenization (see output above for details)";
+        }
+    }
+
     return true;
 }
 
-alias RepositoryStats = tuple[int match, int beginEnd, int include];
+alias RepositoryStats = tuple[
+    int match,
+    int beginEnd,
+    int include];
 
 int sum(RepositoryStats stats)
-    = stats.match + stats.beginEnd + stats.include;
+    = stats.match
+    + stats.beginEnd
+    + stats.include;
diff --git a/rascal-textmate-core/src/main/rascal/lang/textmate/conversiontests/Emoji.rsc b/rascal-textmate-core/src/main/rascal/lang/textmate/conversiontests/Emoji.rsc
@@ -21,4 +21,4 @@ list[ConversionUnit] units = [
 ];
 
 test bool analyzeTest()   = doAnalyzeTest(rsc, units);
-test bool transformTest() = doTransformTest(units, <3, 0, 0>);
+test bool transformTest() = doTransformTest(units, <3, 0, 0>, name = "Emoji");
diff --git a/rascal-textmate-core/src/main/rascal/lang/textmate/conversiontests/Emoji.test b/rascal-textmate-core/src/main/rascal/lang/textmate/conversiontests/Emoji.test
@@ -0,0 +1,17 @@
+# SYNTAX TEST "Emoji"
+
+🌊
+# <-- constant.language
+
+foo 🌊 bar
+#  ^ -constant.language
+#   ^^ constant.language
+#     ^ -constant.language
+
+🙂🙁
+# <---- constant.language
+
+foo 🙂 bar 🙁
+#   ^^ constant.language
+#     ^^^^^ -constant.language
+#          ^^ constant.language
diff --git a/rascal-textmate-core/src/main/rascal/lang/textmate/conversiontests/Pico.rsc b/rascal-textmate-core/src/main/rascal/lang/textmate/conversiontests/Pico.rsc
@@ -15,4 +15,4 @@ list[ConversionUnit] units = [
 ];
 
 test bool analyzeTest()   = doAnalyzeTest(rsc, units);
-test bool transformTest() = doTransformTest(units, <3, 0, 0>);
+test bool transformTest() = doTransformTest(units, <3, 0, 0>, name = "Pico");
diff --git a/rascal-textmate-core/src/main/rascal/lang/textmate/conversiontests/Pico.test b/rascal-textmate-core/src/main/rascal/lang/textmate/conversiontests/Pico.test
@@ -0,0 +1,11 @@
+# SYNTAX TEST "Pico"
+
+%% foo bar
+# <-- Comment
+#  ^^^^^^^ Comment
+
+%% do
+#  ^^ Comment
+
+do
+# <-- keyword.control
diff --git a/rascal-textmate-core/src/main/rascal/lang/textmate/conversiontests/PicoWithCategories.rsc b/rascal-textmate-core/src/main/rascal/lang/textmate/conversiontests/PicoWithCategories.rsc
@@ -60,4 +60,4 @@ list[ConversionUnit] units = [
 ];
 
 test bool analyzeTest()   = doAnalyzeTest(rsc, units);
-test bool transformTest() = doTransformTest(units, <8, 0, 0>);
+test bool transformTest() = doTransformTest(units, <8, 0, 0>, name = "PicoWithCategories");
diff --git a/rascal-textmate-core/src/main/rascal/lang/textmate/conversiontests/PicoWithCategories.test b/rascal-textmate-core/src/main/rascal/lang/textmate/conversiontests/PicoWithCategories.test
@@ -0,0 +1,41 @@
+# SYNTAX TEST "PicoWithCategories"
+
+%% foo bar
+# <-- comment.line
+#  ^^^^^^^ comment.line
+
+%% do
+#  ^^ comment.line
+
+do
+# <-- variable.other
+# Note: Keywords can be identifiers in Pico
+
+123
+# <--- constant.numeric
+
+foo123
+# <------- variable.other
+
+foo_123
+# <--- variable.other
+#  ^ -variable.other
+#  ^ -constant.numeric
+#   ^^^ constant.numeric
+# Note: Underscores cannot occur in identifiers in Pico
+
+#begin
+## <----- keyword.control
+## TODO: `begin` outside `begin`-`end` can never be an identifier in Pico
+
+#"foo" "bar"
+## <----- string.quoted.double
+##    ^ -string.quoted.double
+##     ^^^^^ string.quoted.double
+## TODO: Support multiline productions
+
+#"foo\"bar"
+## <---- string.quoted.double
+##   ^^ string.quoted.double
+##     ^^^^ string.quoted.double
+## TODO: Support multiline productions with escaping
diff --git a/rascal-textmate-core/src/test/sh/lang/textmate/conversion-tests.bat b/rascal-textmate-core/src/test/sh/lang/textmate/conversion-tests.bat
@@ -0,0 +1,10 @@
+@echo off
+rem Runs the `vscode-tmgrammar-test` tokenizer on one conversion test.
+rem
+rem Usage: conversion-tests.bat NAME
+rem
+rem All paths are relative, so the working directory must be the directory that
+rem contains `target` and `src` (i.e., `rascal-textmate-core`).
+npx vscode-tmgrammar-test ^
+    --grammar "target/generated-test-grammars/%~1.tmLanguage.json" ^
+    "./src/main/rascal/lang/textmate/conversiontests/%~1.test"
diff --git a/rascal-textmate-core/src/test/sh/lang/textmate/conversion-tests.sh b/rascal-textmate-core/src/test/sh/lang/textmate/conversion-tests.sh
@@ -0,0 +1,11 @@
+#!/bin/sh
+# Runs the `vscode-tmgrammar-test` tokenizer on one conversion test.
+#
+# Usage: conversion-tests.sh <name>
+#
+# All paths are relative, so the working directory must be the directory that
+# contains `target` and `src` (i.e., `rascal-textmate-core`).
+name="${1:?usage: conversion-tests.sh <name>}"
+npx vscode-tmgrammar-test \
+    --grammar "target/generated-test-grammars/$name.tmLanguage.json" \
+    "./src/main/rascal/lang/textmate/conversiontests/$name.test"
diff --git a/vscode-extension/syntaxes/rascal.tmLanguage.json b/vscode-extension/syntaxes/rascal.tmLanguage.json
@@ -1,7 +1,7 @@
 {
   "repository": {
     "prod(lex(\"Comment\"),[lit(\"//\"),conditional(\\iter-star(\\char-class([range(1,9),range(11,1114111)])),{\\not-follow(\\char-class([range(9,9),range(13,13),range(32,32),range(160,160),range(5760,5760),range(8192,8202),range(8239,8239),range(8287,8287),range(12288,12288)])),\\end-of-line()})],{tag(\"category\"(\"comment\"))})": {
-      "match": "((?:\\x{2F}\\x{2F})(?:(?:[\\x{1}-\\x{9}]|[\\x{B}-\\x{10FFFF}])*?(?!(?:[\\x{9}-\\x{9}]|[\\x{D}-\\x{D}]|[\\x{20}-\\x{20}]|[\\x{A0}-\\x{A0}]|[\\x{1680}-\\x{1680}]|[\\x{2000}-\\x{200A}]|[\\x{202F}-\\x{202F}]|[\\x{205F}-\\x{205F}]|[\\x{3000}-\\x{3000}]))(?:$)))",
+      "match": "((?:\\x{2F}\\x{2F})(?:(?:[\\x{1}-\\x{9}]|[\\x{B}-\\x{10FFFF}])*?(?!(?:\\x{9}|\\x{D}|\\x{20}|\\x{A0}|\\x{1680}|[\\x{2000}-\\x{200A}]|\\x{202F}|\\x{205F}|\\x{3000}))(?:$)))",
       "name": "prod(lex(\"Comment\"),[lit(\"//\"),conditional(\\iter-star(\\char-class([range(1,9),range(11,1114111)])),{\\not-follow(\\char-class([range(9,9),range(13,13),range(32,32),range(160,160),range(5760,5760),range(8192,8202),range(8239,8239),range(8287,8287),range(12288,12288)])),\\end-of-line()})],{tag(\"category\"(\"comment\"))})",
       "captures": {
         "1": {
@@ -46,7 +46,7 @@
           }
         },
         {
-          "match": "((?:[\\x{1}-\\x{9}]|[\\x{B}-\\x{3B}]|[\\x{3D}-\\x{3D}]|[\\x{3F}-\\x{5B}]|[\\x{5D}-\\x{5F}]|[a-\\x{10FFFF}])+?(?!(?:[\\x{1}-\\x{9}]|[\\x{B}-\\x{3B}]|[\\x{3D}-\\x{3D}]|[\\x{3F}-\\x{5B}]|[\\x{5D}-\\x{5F}]|[a-\\x{10FFFF}])))",
+          "match": "((?:[\\x{1}-\\x{9}]|[\\x{B}-\\x{3B}]|\\x{3D}|[\\x{3F}-\\x{5B}]|[\\x{5D}-\\x{5F}]|[a-\\x{10FFFF}])+?(?!(?:[\\x{1}-\\x{9}]|[\\x{B}-\\x{3B}]|\\x{3D}|[\\x{3F}-\\x{5B}]|[\\x{5D}-\\x{5F}]|[a-\\x{10FFFF}])))",
           "name": "prod(label(\"text\",lex(\"ConcretePart\")),[conditional(iter(\\char-class([range(1,9),range(11,59),range(61,61),range(63,91),range(93,95),range(97,1114111)])),{\\not-follow(\\char-class([range(1,9),range(11,59),range(61,61),range(63,91),range(93,95),range(97,1114111)]))})],{tag(\"category\"(\"string\"))})",
           "captures": {
             "1": {
@@ -80,7 +80,7 @@
       "end": "(?:\\x{5D})",
       "patterns": [
         {
-          "match": "((?:\\x{5C})(?:[\\x{20}-\\x{20}]|[\\x{22}-\\x{22}]|[\\x{27}-\\x{27}]|[\\x{2D}-\\x{2D}]|[\\x{3C}-\\x{3C}]|[\\x{3E}-\\x{3E}]|[\\x{5B}-\\x{5D}]|[b-b]|[f-f]|[n-n]|[r-r]|[t-t]))",
+          "match": "((?:\\x{5C})(?:\\x{20}|\\x{22}|\\x{27}|\\x{2D}|\\x{3C}|\\x{3E}|[\\x{5B}-\\x{5D}]|b|f|n|r|t))",
           "name": "prod(lex(\"Char\"),[lit(\"\\\\\"),\\char-class([range(32,32),range(34,34),range(39,39),range(45,45),range(60,60),range(62,62),range(91,93),range(98,98),range(102,102),range(110,110),range(114,114),range(116,116)])],{tag(\"category\"(\"constant\"))})",
           "captures": {
             "1": {
@@ -89,7 +89,7 @@
           }
         },
         {
-          "match": "((?:(?:\\x{5C})[U-U](?:(?:\\b10\\b)|(?:(?:\\b0\\b)(?:[0-9]|[A-F]|[a-f])))(?:[0-9]|[A-F]|[a-f])(?:[0-9]|[A-F]|[a-f])(?:[0-9]|[A-F]|[a-f])(?:[0-9]|[A-F]|[a-f]))|(?:(?:\\x{5C})[u-u](?:[0-9]|[A-F]|[a-f])(?:[0-9]|[A-F]|[a-f])(?:[0-9]|[A-F]|[a-f])(?:[0-9]|[A-F]|[a-f]))|(?:(?:\\x{5C})[a-a][0-7](?:[0-9]|[A-F]|[a-f])))",
+          "match": "((?:(?:\\x{5C})U(?:(?:\\b10\\b)|(?:(?:\\b0\\b)(?:[0-9]|[A-F]|[a-f])))(?:[0-9]|[A-F]|[a-f])(?:[0-9]|[A-F]|[a-f])(?:[0-9]|[A-F]|[a-f])(?:[0-9]|[A-F]|[a-f]))|(?:(?:\\x{5C})u(?:[0-9]|[A-F]|[a-f])(?:[0-9]|[A-F]|[a-f])(?:[0-9]|[A-F]|[a-f])(?:[0-9]|[A-F]|[a-f]))|(?:(?:\\x{5C})a[0-7](?:[0-9]|[A-F]|[a-f])))",
           "name": "prod(lex(\"Char\"),[lex(\"UnicodeEscape\")],{tag(\"category\"(\"constant\"))})",
           "captures": {
             "1": {