Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

No conversion when delimiter occurs elsewhere too #19

Merged
merged 15 commits into from
Sep 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
@synopsis{
Utility functions to work with grammars, productions, and symbols.
Utility functions to work with grammars, productions, and symbols
}

module lang::rascal::grammar::Util
Expand All @@ -9,6 +9,8 @@ import Grammar;
import ParseTree;
import String;

import util::ListUtil;

@synopsis{
Utility functions for grammars
}
Expand Down Expand Up @@ -47,6 +49,62 @@ bool isRecursive(Grammar g, Symbol s) {
return check({}, s);
}

@synopsis{
Representation of a pointer to a symbol in (the list of symbols of) a
production. This is useful to distinguish between different occurrences of
the same symbol in a grammar (i.e., they have different pointers).
}

alias Pointer = tuple[Production p, int index];

@synopsis{
Finds the list of pointers -- a *trace* -- to the first occurrence of symbol
`s`, if any, starting from production `p`, optionally in a particular
direction (default: `forward()`). That is: if `<p1,i>` is followed by
`<p2,_>` in the returned list, then `p1.symbols[i]` is a non-terminal and
`p2` is one of its productions.
}

@description{
For instance, consider the following grammar:

```
lexical X = Y;
lexical Y = alt1: "[" "[" "[" Z1 "]" "]" "]" | alt2: "<" Z2 ">";
lexical Z1 = "foo" "bar";
lexical Z2 = "baz";
```

The list of pointers to `"bar"`, starting from `X`, is:

- `<X,0>`
- `<Y.alt1,3>`
- `<Z1,1>`

The list of pointers to `"qux"` is just empty.
}

// Searches for the first occurrence of `s` reachable from production `p` and
// returns the trace of pointers leading to it (the empty list if there is
// none). `dir` is passed to `reorder` to determine the order in which the
// symbols of each production are visited.
list[Pointer] find(Grammar g, Production p, Symbol s, Direction dir = forward()) {

// Recursive helper. `doing` is the set of productions on the current search
// path; a production that is already in `doing` is not searched again
// (presumably to guarantee termination on recursive grammars).
list[Pointer] doFind(set[Production] doing, Production haystack, Symbol needle) {
// Visit the symbol indices of `haystack` in the order produced by `reorder`.
// NOTE(review): `reorder` is not shown here -- presumably it comes from
// `util::ListUtil` and reverses the indices for `backward()`; confirm.
for (haystack notin doing, i <- reorder([0..size(haystack.symbols)], dir)) {
// Compare after `delabel` (presumably strips a label from the symbol)
Symbol ith = delabel(haystack.symbols[i]);
if (ith == needle) {
// Direct hit: the trace ends at this position
return [<haystack, i>];
}
// Otherwise, if `ith` is a non-terminal, search each of its productions
for (isNonTerminalType(ith), child <- lookup(g, ith)) {
// A non-empty result means the needle was found below `child`: prepend
// the current position to that trace. (`s` is the same symbol as
// `needle`: every call of `doFind` passes `s` as the needle.)
if (list[Pointer] l: [_, *_] := doFind(doing + haystack, child, s)) {
return [<haystack, i>] + l;
}
}
}

// Not found in (or below) `haystack`, or `haystack` was already in `doing`
return [];
}

// Start at `p` with an empty search path
return doFind({}, p, s);
sungshik marked this conversation as resolved.
Show resolved Hide resolved
}

@synopsis{
Looks up a list of productions for symbol `s` in grammar `g`, replacing
formal parameters with actual parameters when needed
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -193,9 +193,9 @@ default Maybe[Symbol] unique(set[Maybe[Symbol]] _) = nothing();
}

bool isDelimiter(lit(string))
= /^\w+$/ !:= string;
= /^\W+$/ := string;
bool isDelimiter(cilit(string))
sungshik marked this conversation as resolved.
Show resolved Hide resolved
= /^\w+$/ !:= string;
= isDelimiter(lit(string));

default bool isDelimiter(Symbol _)
= false;
Expand All @@ -205,9 +205,9 @@ default bool isDelimiter(Symbol _)
}

bool isKeyword(lit(string))
= /^\w+$/ := string;
= /^\w.*$/ := string;
bool isKeyword(cilit(string))
sungshik marked this conversation as resolved.
Show resolved Hide resolved
= /^\w+$/ := string;
= isKeyword(lit(string));

default bool isKeyword(Symbol _)
= false;
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,16 @@ private list[ConversionUnit] addInnerRules(list[ConversionUnit] units) {
// Convert all units in the group to match patterns (including,
// optimistically, multi-line units as-if they are single-line)
for (u <- group, !u.recursive) {
TmRule r = toTmRule(toRegExp(u.rsc, u.prod, guard = true))

// Add the guard (i.e., look-behind condition to match layout) only
// when the units in the group don't begin with a delimiter. Why is
// this? We *don't* want `32` to be highlighted as a number in
// `int aer32 = 34`. However, we *do* want `>bar"` to be highlighted
// as a string in `"foo<x==5>bar"`. As a heuristic, if the token
// starts with a delimiter (e.g., `>`), then it should be allowed
// for its occurrence to not be preceded by layout.
bool guard = nothing() := u.innerDelimiters.begin;
TmRule r = toTmRule(toRegExp(u.rsc, u.prod, guard = guard))
[name = "/inner/single/<u.name>"];

rules = insertIn(rules, (u: r));
Expand All @@ -217,6 +226,25 @@ private list[ConversionUnit] addInnerRules(list[ConversionUnit] units) {

// Simple case: each unit does have an `end` inner delimiter
if (_ <- group && all(u <- group, just(_) := u.innerDelimiters.end)) {

// Create a set of pointers to the first (resp. last) occurrence
// of `pivot` in each unit, when `pivot` is a `begin` delimiter
// (resp. an `end` delimiter) of the group. If `pivot` occurs
// elsewhere in the grammar as well, then skip the conversion
// of these multi-line units to a begin/end pattern. This is to
// avoid tokenization mistakes in which the other occurrences of
// `pivot` in the input are mistakenly interpreted as the
// beginning or ending of a unit in the group.

Symbol pivot = key.val;

set[Pointer] pointers = {};
pointers += pivot in begins ? {*find(rsc, u.prod, pivot, dir = forward()) [-1..] | u <- group} : {};
pointers += pivot in ends ? {*find(rsc, u.prod, pivot, dir = backward())[-1..] | u <- group} : {};

if (any(/p: prod(_, [*before, pivot, *_], _) := rsc.rules, <p, size(before)> notin pointers)) {
continue;
}

// Compute a set of segments that need to be consumed between
// the `begin` delimiter and the `end` delimiters. Each of these
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,6 @@ bool doTransformTest(list[ConversionUnit] units, RepositoryStats expect, str nam
assert actual.include == expect.include : "Actual number of top-level include patterns in repository: <actual.include>. Expected: <expect.include>.";

// Test behavioral properties of the TextMate grammar

loc lTest = lProject + "/src/main/rascal/lang/textmate/conversiontests/<name>.test";
loc lTester = lProject + "/node_modules/vscode-tmgrammar-test";
if (!exists(lTest)) {
Expand All @@ -103,7 +102,21 @@ bool doTransformTest(list[ConversionUnit] units, RepositoryStats expect, str nam
resolveLocation(lTest).path[(windows ? 1 : 0)..]
];

if (<output, exitCode> := execWithCode(lExec, args = args) && exitCode != 0) {
// TODO: The following function serves as a workaround for a race
// in (the Java-part of) the implementation of `execWithCode`. A fix is
// already available but not yet released. When it is, this function
// should be removed (and `execWithCode` called directly). See also:
// https://github.com/usethesource/rascal/commit/1ce9e59dfd7098327bbaf55a985c2a643ff52861
// Runs `execWithCode(lExec, args = args)` and returns its result (an
// <output, exit code> pair). If the call throws, the exception is logged and
// the call is retried, for at most `attempts` calls in total. Once the
// attempts are exhausted, the last exception is rethrown: without this bound,
// a persistent failure would recurse forever instead of failing the test.
tuple[str, int] execWithCodeUntilSuccess(int attempts = 10) {
    try {
        return execWithCode(lExec, args = args);
    } catch e: {
        if (attempts <= 1) {
            // No attempts left: surface the failure to the caller
            throw e;
        }
        println("[LOG] Retrying after unexpected exception: <e>");
        return execWithCodeUntilSuccess(attempts = attempts - 1);
    }
}

if (<output, exitCode> := execWithCodeUntilSuccess() && exitCode != 0) {
println(output);
assert false : "Actual tokenization does not match expected tokenization (see output above for details)";
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,10 @@ import lang::pico::\syntax::Main;
Grammar rsc = preprocess(grammar(#Program));

list[ConversionUnit] units = [
unit(rsc, prod(lex(DELIMITERS_PRODUCTION_NAME),[alt({lit("-"),lit(","),lit(")"),lit("("),lit("+"),lit("||"),lit(":="),lit("\""),lit(";"),lit("nil-type")})],{}), false, false, <nothing(),nothing()>, <nothing(),nothing()>),
unit(rsc, prod(lex(DELIMITERS_PRODUCTION_NAME),[alt({lit("-"),lit(","),lit(")"),lit("("),lit("+"),lit("||"),lit(":="),lit("\""),lit(";")})],{}), false, false, <nothing(),nothing()>, <nothing(),nothing()>),
unit(rsc, prod(lex("WhitespaceAndComment"),[lit("%%"),conditional(\iter-star(\char-class([range(1,9),range(11,1114111)])),{\end-of-line()})],{\tag("category"("Comment"))}), false, false, <nothing(),nothing()>, <just(lit("%%")),nothing()>),
unit(rsc, prod(lex("WhitespaceAndComment"),[lit("%"),iter(\char-class([range(1,36),range(38,1114111)])),lit("%")],{\tag("category"("Comment"))}), false, true, <nothing(),nothing()>, <just(lit("%")),just(lit("%"))>),
unit(rsc, prod(lex(KEYWORDS_PRODUCTION_NAME),[alt({lit("do"),lit("declare"),lit("fi"),lit("else"),lit("end"),lit("od"),lit("begin"),lit("natural"),lit("then"),lit("if"),lit("while"),lit("string")})],{\tag("category"("keyword.control"))}), false, false, <nothing(),nothing()>, <nothing(),nothing()>)
unit(rsc, prod(lex(KEYWORDS_PRODUCTION_NAME),[alt({lit("do"),lit("declare"),lit("fi"),lit("else"),lit("end"),lit("od"),lit("nil-type"),lit("begin"),lit("natural"),lit("then"),lit("if"),lit("while"),lit("string")})],{\tag("category"("keyword.control"))}), false, false, <nothing(),nothing()>, <nothing(),nothing()>)
];

test bool analyzeTest() = doAnalyzeTest(rsc, units);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -76,14 +76,14 @@ Grammar rsc = preprocess(grammar(#Program));
list[ConversionUnit] units = [
unit(rsc, prod(lex(DELIMITERS_PRODUCTION_NAME),[alt({lit("-"),lit(","),lit(")"),lit("("),lit("+"),lit("||"),lit(":="),lit("\\")})],{}), false, false, <nothing(),nothing()>, <nothing(),nothing()>),
unit(rsc, prod(label("natural",sort("Type")),[lit("natural")],{\tag("category"("storage.type"))}), false, false, <just(lit(":")),just(lit(";"))>, <nothing(),nothing()>),
unit(rsc, prod(label("nil",sort("Type")),[lit("nil-type")],{\tag("category"("storage.type"))}), false, false, <just(lit(":")),just(lit(";"))>, <just(lit("nil-type")),just(lit("nil-type"))>),
unit(rsc, prod(label("nil",sort("Type")),[lit("nil-type")],{\tag("category"("storage.type"))}), false, false, <just(lit(":")),just(lit(";"))>, <nothing(),nothing()>),
unit(rsc, prod(label("string",sort("Type")),[lit("string")],{\tag("category"("storage.type"))}), false, false, <just(lit(":")),just(lit(";"))>, <nothing(),nothing()>),
unit(rsc, prod(lex("WhitespaceAndComment"),[lit("%%"),conditional(\iter-star(\char-class([range(1,9),range(11,1114111)])),{\end-of-line()})],{\tag("category"("comment.line"))}), false, false, <nothing(),nothing()>, <just(lit("%%")),nothing()>),
unit(rsc, prod(lex("WhitespaceAndComment"),[lit("%"),iter(\char-class([range(1,36),range(38,1114111)])),lit("%")],{\tag("category"("comment.block"))}), false, true, <nothing(),nothing()>, <just(lit("%")),just(lit("%"))>),
unit(rsc, prod(label("strcon",sort("Expression")),[label("string",lex("String"))],{\tag("category"("string.quoted.double"))}), false, true, <nothing(),nothing()>, <just(lit("\"")),just(lit("\""))>),
unit(rsc, prod(label("id",sort("Expression")),[label("name",lex("Id"))],{\tag("category"("variable.other"))}), false, false, <nothing(),nothing()>, <nothing(),nothing()>),
unit(rsc, prod(label("natcon",sort("Expression")),[label("natcon",lex("Natural"))],{\tag("category"("constant.numeric"))}), false, false, <nothing(),nothing()>, <nothing(),nothing()>),
unit(rsc, prod(lex(KEYWORDS_PRODUCTION_NAME),[alt({lit("do"),lit("declare"),lit("fi"),lit("else"),lit("end"),lit("od"),lit("begin"),lit("natural"),lit("then"),lit("if"),lit("while"),lit("string")})],{\tag("category"("keyword.control"))}), false, false, <nothing(),nothing()>, <nothing(),nothing()>)
unit(rsc, prod(lex(KEYWORDS_PRODUCTION_NAME),[alt({lit("do"),lit("declare"),lit("fi"),lit("else"),lit("end"),lit("od"),lit("nil-type"),lit("begin"),lit("natural"),lit("then"),lit("if"),lit("while"),lit("string")})],{\tag("category"("keyword.control"))}), false, false, <nothing(),nothing()>, <nothing(),nothing()>)
];

test bool analyzeTest() = doAnalyzeTest(rsc, units);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ import lang::rascal::\syntax::Rascal;
Grammar rsc = preprocess(grammar(#Module));

list[ConversionUnit] units = [
unit(rsc, prod(lex(DELIMITERS_PRODUCTION_NAME),[alt({lit("bottom-up-break"),lit(")"),lit("("),lit("%"),lit("!:="),lit("\<==\>"),lit("\<\<="),lit("!="),lit("\>="),lit("://"),lit("non-assoc"),lit("&="),lit("\<-"),lit("*="),lit("+="),lit("top-down-break"),lit(","),lit("..."),lit("/="),lit("!\<\<"),lit("=\>"),lit("!\>\>"),lit("||"),lit("\>\>"),lit("::"),lit("&&"),lit(":="),lit("#"),lit("?="),lit("\<:"),lit("==\>"),lit("^"),lit(";"),lit("{"),lit("-="),lit("$T")})],{}), false, false, <nothing(),nothing()>, <nothing(),nothing()>),
unit(rsc, prod(lex(DELIMITERS_PRODUCTION_NAME),[alt({lit(","),lit(")"),lit("("),lit("%"),lit("\<==\>"),lit("\<\<="),lit("!="),lit("\>="),lit("://"),lit("&="),lit("\<-"),lit("-="),lit("*="),lit("+="),lit("..."),lit("/="),lit("!:="),lit("$"),lit("!\<\<"),lit("=\>"),lit("!\>\>"),lit("||"),lit("\>\>"),lit("::"),lit("&&"),lit(":="),lit("#"),lit("?="),lit("\<:"),lit("==\>"),lit("^"),lit(";"),lit("{")})],{}), false, false, <nothing(),nothing()>, <nothing(),nothing()>),
unit(rsc, prod(label("stderrOutput",lex("Output")),[conditional(lit("⚠"),{\begin-of-line()}),\iter-star(\char-class([range(1,9),range(11,12),range(14,1114111)])),lit("\n")],{\tag("category"("StdErr"))}), false, false, <nothing(),nothing()>, <just(lit("⚠")),just(lit("\n"))>),
unit(rsc, prod(label("stdoutOutput",lex("Output")),[conditional(lit("≫"),{\begin-of-line()}),\iter-star(\char-class([range(1,9),range(11,12),range(14,1114111)])),lit("\n")],{\tag("category"("StdOut"))}), false, false, <nothing(),nothing()>, <just(lit("≫")),just(lit("\n"))>),
unit(rsc, prod(label("resultOutput",lex("Output")),[lit("⇨"),\iter-star(\char-class([range(1,9),range(11,12),range(14,1114111)])),lit("\n")],{\tag("category"("Result"))}), false, false, <nothing(),nothing()>, <just(lit("⇨")),just(lit("\n"))>),
Expand All @@ -35,8 +35,8 @@ list[ConversionUnit] units = [
unit(rsc, prod(lex("CaseInsensitiveStringConstant"),[lit("\'"),label("chars",\iter-star(lex("StringCharacter"))),lit("\'")],{\tag("category"("Constant"))}), false, true, <nothing(),nothing()>, <just(lit("\'")),just(lit("\'"))>),
unit(rsc, prod(lex("PreStringChars"),[lit("\""),\iter-star(lex("StringCharacter")),lit("\<")],{\tag("category"("Constant"))}), false, true, <nothing(),nothing()>, <just(lit("\"")),just(lit("\<"))>),
unit(rsc, prod(lex("StringConstant"),[lit("\""),label("chars",\iter-star(lex("StringCharacter"))),lit("\"")],{\tag("category"("Constant"))}), false, true, <nothing(),nothing()>, <just(lit("\"")),just(lit("\""))>),
unit(rsc, prod(lex(KEYWORDS_PRODUCTION_NAME),[alt({lit("lexical"),lit("loc"),lit("if"),lit("assoc"),lit("test"),lit("lrel"),lit("throws"),lit("clear"),lit("module"),lit("any"),lit("int"),lit("quit"),lit("o"),lit("anno"),lit("true"),lit("public"),lit("keyword"),lit("for"),lit("tuple"),lit("bracket"),lit("bag"),lit("it"),lit("visit"),lit("do"),lit("data"),lit("layout"),lit("bool"),lit("edit"),lit("join"),lit("is"),lit("import"),lit("view"),lit("in"),lit("rat"),lit("modules"),lit("continue"),lit("left"),lit("num"),lit("assert"),lit("throw"),lit("one"),lit("help"),lit("default"),lit("all"),lit("global"),lit("syntax"),lit("false"),lit("finally"),lit("private"),lit("mod"),lit("java"),lit("node"),lit("start"),lit("set"),lit("right"),lit("variable"),lit("map"),lit("10"),lit("on"),lit("break"),lit("dynamic"),lit("solve"),lit("fail"),lit("unimport"),lit("outermost"),lit("real"),lit("list"),lit("insert"),lit("innermost"),lit("declarations"),lit("else"),lit("rel"),lit("function"),lit("notin"),lit("filter"),lit("datetime"),lit("catch"),lit("try"),lit("renaming"),lit("tag"),lit("has"),lit("Z"),lit("when"),lit("type"),lit("append"),lit("extend"),lit("switch"),lit("void"),lit("history"),lit("T"),lit("while"),lit("str"),lit("value"),lit("undeclare"),lit("case"),lit("alias"),lit("return"),lit("0")})],{\tag("category"("keyword.control"))}), false, false, <nothing(),nothing()>, <nothing(),nothing()>)
unit(rsc, prod(lex(KEYWORDS_PRODUCTION_NAME),[alt({lit("lexical"),lit("loc"),lit("test"),lit("lrel"),lit("throws"),lit("clear"),lit("top-down-break"),lit("module"),lit("any"),lit("int"),lit("quit"),lit("bottom-up-break"),lit("o"),lit("anno"),lit("true"),lit("public"),lit("keyword"),lit("for"),lit("tuple"),lit("bracket"),lit("bag"),lit("it"),lit("visit"),lit("do"),lit("data"),lit("layout"),lit("bool"),lit("edit"),lit("join"),lit("is"),lit("import"),lit("view"),lit("in"),lit("rat"),lit("modules"),lit("continue"),lit("left"),lit("num"),lit("assert"),lit("throw"),lit("one"),lit("help"),lit("default"),lit("all"),lit("global"),lit("syntax"),lit("false"),lit("finally"),lit("private"),lit("mod"),lit("java"),lit("node"),lit("start"),lit("set"),lit("if"),lit("bottom-up"),lit("right"),lit("variable"),lit("map"),lit("10"),lit("on"),lit("break"),lit("dynamic"),lit("solve"),lit("fail"),lit("unimport"),lit("outermost"),lit("real"),lit("list"),lit("insert"),lit("innermost"),lit("declarations"),lit("else"),lit("rel"),lit("function"),lit("notin"),lit("filter"),lit("datetime"),lit("catch"),lit("try"),lit("renaming"),lit("tag"),lit("has"),lit("top-down"),lit("Z"),lit("when"),lit("type"),lit("append"),lit("extend"),lit("non-assoc"),lit("assoc"),lit("switch"),lit("void"),lit("history"),lit("T"),lit("while"),lit("str"),lit("value"),lit("undeclare"),lit("case"),lit("alias"),lit("return"),lit("0")})],{\tag("category"("keyword.control"))}), false, false, <nothing(),nothing()>, <nothing(),nothing()>)
];

test bool analyzeTest() = doAnalyzeTest(rsc, units);
test bool transformTest() = doTransformTest(units, <20, 8, 0>);
test bool transformTest() = doTransformTest(units, <20, 4, 0>, name = "Rascal");
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# SYNTAX TEST "Rascal"

"foo bar"
# ^^^^^^^^^ Constant

"foo<x + 1>bar"
# ^^^^^ ^^^^^ Constant
# ^^^^^ -Constant

### TODO: The following test shows that, currently, multi-line strings are
### disabled. This is because the converter determines that:
### - `>` doesn't uniquely delineate interpolation (it could also be
### greater-than in expressions or prioritize-before in grammars);
### - `"` doesn't uniquely delineate strings (it could also be the end of
### interpolation).
### Therefore, to avoid excessive tokenization errors, the converter doesn't
### generate begin/end patterns that begin with `>` or `"`. This might be
### improved in the future.

"foo
# ^^^^ -Constant
bar"
# ^^^^ -Constant
Original file line number Diff line number Diff line change
Expand Up @@ -48,4 +48,4 @@ list[ConversionUnit] units = [
];

test bool analyzeTest() = doAnalyzeTest(rsc, units);
test bool transformTest() = doTransformTest(units, <5, 1, 0>);
test bool transformTest() = doTransformTest(units, <5, 1, 0>, name = "RascalClass");
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# SYNTAX TEST "RascalClass"
Original file line number Diff line number Diff line change
Expand Up @@ -40,4 +40,4 @@ list[ConversionUnit] units = [
];

test bool analyzeTest() = doAnalyzeTest(rsc, units);
test bool transformTest() = doTransformTest(units, <7, 1, 0>);
test bool transformTest() = doTransformTest(units, <7, 1, 0>, name = "RascalConcrete");
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# SYNTAX TEST "RascalConcrete"
Original file line number Diff line number Diff line change
Expand Up @@ -78,4 +78,4 @@ list[ConversionUnit] units = [
];

test bool analyzeTest() = doAnalyzeTest(rsc, units);
test bool transformTest() = doTransformTest(units, <6, 2, 0>, name = "RascalStringLiteral");
test bool transformTest() = doTransformTest(units, <6, 0, 0>, name = "RascalStringLiteral");
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,5 @@

"foo <5 > 6> bar"
# ^^^^^^ Constant
# ^^ -Constant
# ^^^^^^^^^ Constant
## TODO: Improve this? (Probably very hard to do with TextMate...)
# ^^^^^ -Constant
# ^^^^^^ Constant
Original file line number Diff line number Diff line change
Expand Up @@ -298,4 +298,4 @@ list[ConversionUnit] units = [
];

test bool analyzeTest() = doAnalyzeTest(rsc, units);
test bool transformTest() = doTransformTest(units, <7, 2, 0>);
test bool transformTest() = doTransformTest(units, <7, 2, 0>, name = "Walkthrough");
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# SYNTAX TEST "Walkthrough"
Loading