Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support nested categories #22

Merged
merged 16 commits into from
Oct 22, 2024
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -60,15 +60,15 @@ RegExp toRegExp(Grammar g, list[Symbol] symbols, set[Attr] attributes) {
RegExp toRegExp(Grammar g, \label(_, symbol))
= toRegExp(g, symbol);
RegExp toRegExp(Grammar g, \parameter(_, _)) {
throw "Presumably unreachable..."; } // Covered by `lookup` (which substitutes actuals for formals)
throw "Presumably unreachable..."; } // Covered by `prodsOf` (which substitutes actuals for formals)

// `ParseTree`: Start
RegExp toRegExp(Grammar g, \start(symbol))
= toRegExp(g, symbol);

// `ParseTree`: Non-terminals
RegExp toRegExp(Grammar g, Symbol s)
= infix("|", [toRegExp(g, p) | p <- lookup(g, s)]) when isNonTerminalType(s);
= infix("|", [toRegExp(g, p) | p <- prodsOf(g, s)]) when isNonTerminalType(s);

// `ParseTree`: Terminals
RegExp toRegExp(Grammar _, \lit(string))
Expand Down Expand Up @@ -103,7 +103,7 @@ RegExp toRegExp(Grammar g, \conditional(symbol, conditions)) {
prefixConditions = [c | c <- conditions, isPrefixCondition(c)];
suffixConditions = [c | c <- conditions, isSuffixCondition(c)];
deleteConditions = [c | c <- conditions, isDeleteCondition(c)];

// Convert except conditions (depends on previous conversion)
if (_ <- exceptConditions) {
if (/\choice(symbol, alternatives) := g) {
Expand All @@ -112,7 +112,7 @@ RegExp toRegExp(Grammar g, \conditional(symbol, conditions)) {
= \label(l, _) := def
? \except(l) notin exceptConditions
: true;

re = infix("|", toRegExps(g, {a | a <- alternatives, keep(a)}));
}
}
Expand All @@ -130,7 +130,7 @@ RegExp toRegExp(Grammar g, \conditional(symbol, conditions)) {
// Convert delete conditions (depends on previous conversions)
if (_ <- deleteConditions) {
RegExp delete = infix("|", [toRegExp(g, s) | \delete(s) <- deleteConditions]);

// TODO: Explain this complicated conversion...
str string = "(?=(?\<head\><re.string>)(?\<tail\>.*)$)(?!(?:<delete.string>)\\k\<tail\>$)\\k\<head\>";
list[str] categories = ["", *re.categories, "", *delete.categories];
Expand Down Expand Up @@ -196,7 +196,7 @@ str encode(int char) = preEncoded[char] ? "\\x{<toHex(char)>}";
private set[int] charRange(str from, str to) = {*[charAt(from, 0)..charAt(to, 0) + 1]};

private str toHex(int i)
= i < 16
= i < 16
? hex[i]
: toHex(i / 16) + toHex(i % 16);

Expand Down
43 changes: 26 additions & 17 deletions rascal-textmate-core/src/main/rascal/lang/rascal/grammar/Util.rsc
Original file line number Diff line number Diff line change
Expand Up @@ -37,18 +37,20 @@ bool tryParse(Grammar g, Symbol s, str input, bool allowAmbiguity = false) {
Checks if symbol `s` is recursive in grammar `g`
}

bool isRecursive(Grammar g, Symbol s) {
set[Symbol] getChildren(Symbol s)
= {s | p <- lookup(g, s), /Symbol s := p.symbols};
// Walks the productions reachable from `s`, remembering in `checking` the
// symbols already on the current path; running into one of them again means
// that a cycle exists.
bool isRecursive(Grammar g, Symbol s, set[Symbol] checking = {})
    = s in checking
    || any(rule <- prodsOf(g, delabel(s)),
           /Symbol nested := rule.symbols,
           isRecursive(g, nested, checking = checking + s));

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is just a refactoring to make the code simpler (no more nested functions)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice!

it can also be written as:

= s in checking || any(...)

bool check(set[Symbol] checking, Symbol s)
= s in checking
? true
: any(child <- getChildren(s), check(checking + s, child));

return check({}, s);
@synopsis{
Checks if production `p` is recursive in grammar `g`
}

bool isRecursive(Grammar g, Production p) {
    // `p` is recursive as soon as one symbol occurring anywhere in its list
    // of symbols (deep match) is recursive in `g`
    return any(/Symbol occurrence := p.symbols, isRecursive(g, occurrence));
}

@synopsis{
Representation of a pointer to a symbol in (the list of symbols of) a
production. This is useful to distinguish between different occurrences of
Expand All @@ -70,7 +72,7 @@ alias Pointer = tuple[Production p, int index];

```
lexical X = Y;
lexical Y = alt1: "[" "[" "[" Z1 "]" "]" "]" | alt2: "<" Z2 ">";
lexical Y = alt1: "[" "[" "[" Z1 "]" "]" "]" | alt2: "<" Z2 ">";
lexical Z1 = "foo" "bar";
lexical Z2 = "baz";
```
Expand All @@ -80,7 +82,7 @@ alias Pointer = tuple[Production p, int index];
- `<X,0>`
- `<Y.alt1,3>`
- `<Z1,1>`

The list of pointers to `"qux"` is just empty.
}

Expand All @@ -92,7 +94,7 @@ list[Pointer] find(Grammar g, Production p, Symbol s, Direction dir = forward())
if (ith == needle) {
return [<haystack, i>];
}
for (isNonTerminalType(ith), child <- lookup(g, ith)) {
for (isNonTerminalType(ith), child <- prodsOf(g, ith)) {
if (list[Pointer] l: [_, *_] := doFind(doing + haystack, child, s)) {
return [<haystack, i>] + l;
}
Expand All @@ -106,19 +108,26 @@ list[Pointer] find(Grammar g, Production p, Symbol s, Direction dir = forward())
}

@synopsis{
Lookups a list of productions for symbol `s` in grammar `g`, replacing
Gets the set of productions that contain symbol `s` in grammar `g`
}

set[Production] prodsWith(Grammar g, Symbol s) {
    // Deep-match every production in `g` whose second argument contains
    // symbol `s` at any depth, and collect the matching productions
    return {owner | /owner: prod(_, /Symbol _: s, _) := g};
}

@synopsis{
Gets the list of productions of symbol `s` in grammar `g`, replacing
formal parameters with actual parameters when needed
}

list[Production] lookup(Grammar g, s: \parameterized-sort(name, actual))
list[Production] prodsOf(Grammar g, s: \parameterized-sort(name, actual))
= [subst(p, formal, actual) | /p: prod(\parameterized-sort(name, formal), _, _) := g.rules[s] ? []]
+ [subst(p, formal, actual) | /p: prod(label(_, \parameterized-sort(name, formal)), _, _) := g.rules[s] ? []];

list[Production] lookup(Grammar g, s: \parameterized-lex(name, actual))
list[Production] prodsOf(Grammar g, s: \parameterized-lex(name, actual))
= [subst(p, formal, actual) | /p: prod(\parameterized-lex(name, formal), _, _) := g.rules[s] ? []]
+ [subst(p, formal, actual) | /p: prod(label(_, \parameterized-lex(name, formal)), _, _) := g.rules[s] ? []];

default list[Production] lookup(Grammar g, Symbol s)
default list[Production] prodsOf(Grammar g, Symbol s)
= [p | /p: prod(s, _, _) := g.rules[s] ? []]
+ [p | /p: prod(label(_, s), _, _) := g.rules[s] ? []];

Expand All @@ -130,7 +139,7 @@ default list[Production] lookup(Grammar g, Symbol s)
&T subst(&T t, list[Symbol] from, list[Symbol] to)
= subst(t, toMapUnique(zip2(from, to)))
when size(from) == size(to);

private &T subst(&T t, map[Symbol, Symbol] m)
= visit (t) { case Symbol s => m[s] when s in m };

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
module lang::rascal::grammar::analyze::Categories

import Grammar;
import ParseTree;

import lang::rascal::grammar::Util;

@synopsis{
    Special value (the empty string) to indicate that a production has no
    category
}

// Also used as the initial "parent" category when categories are propagated
// from the start productions in `getCategoriesByProduction`.
public str NO_CATEGORY = "";

@synopsis{
Gets a set of categories such that, for each category, there exists a string
with that category produced by production `p`, as part of a string produced
by a start production of grammar `g`
}

set[str] getCategories(Grammar g, Production p) {
    // The per-grammar table is memoized, so it is computed once per grammar
    // and each call afterwards is a map lookup
    map[Production, set[str]] byProduction = getCategoriesByProduction(g);
    return byProduction[p];
}

// Computes, for each production reachable from a start production of `g`,
// the set of categories under which it can occur. A production with its own
// `category` tag keeps exactly that category; otherwise it accumulates the
// categories of every parent that refers to it. Productions that are never
// reached keep the empty set.
@memo
private map[Production, set[str]] getCategoriesByProduction(Grammar g) {
    // Initially, no production that literally occurs in `g` has a category
    map[Production, set[str]] ret = (p: {} | /p: prod(_, _, _) := g);

    // Updates the categories of `p`, given the categories of one of its
    // parents, and recursively pushes any change down to the children of `p`
    void doGet(Production p, set[str] parentCategories) {
        set[str] categories = {c | /\tag("category"(str c)) := p};

        // `prodsOf` may return productions of parameterized non-terminals in
        // which formals have been replaced by actuals. Such productions do
        // not literally occur in `g`, so they need not be keys of `ret` yet.
        // Treat a missing entry as "no categories so far" instead of failing
        // on the lookup.
        set[str] old = ret[p] ? {};
        set[str] new = _ <- categories ? categories : old + parentCategories;
        ret[p] = new;

        // If the new categories of `p` are different from the old ones, then
        // propagate these changes to the children of `p`. (If they are the
        // same, then stop; this is what makes the traversal terminate on
        // recursive grammars.)
        for (old != new, /Symbol s := p.symbols, child <- prodsOf(g, delabel(s))) {
            doGet(child, new);
        }
    }

    // Propagate categories from the roots of the grammar
    for (root: prod(\start(_), _, _) <- ret) {
        doGet(root, {NO_CATEGORY});
    }

    return ret;
}
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ DelimiterPair getInnerDelimiterPair(Grammar g, Symbol s, bool getOnlyFirst = fal
```
lexical X = Y;
lexical Y = Y1 | Y2;
lexical Y1 = "[" Z "]";
lexical Y1 = "[" Z "]";
lexical Y2 = "[" Z ")" [a-z];
lexical Z = [a-z];
```
Expand Down Expand Up @@ -83,7 +83,7 @@ private map[Symbol, Maybe[Symbol]] getInnerDelimiterBySymbol(Grammar g, Directio
@memo
private map[Production, Maybe[Symbol]] getInnerDelimiterByProduction(Grammar g, Direction direction, bool getOnlyFirst = false) {
map[Production, Maybe[Symbol]] ret = (p: nothing() | /p: prod(_, _, _) := g);

solve (ret) {
for (p <- ret, ret[p] == nothing()) {
for (s <- reorder(p.symbols, direction)) {
Expand All @@ -108,7 +108,7 @@ private map[Production, Maybe[Symbol]] getInnerDelimiterByProduction(Grammar g,
}

private set[Production] getChildren(Grammar g, Symbol s)
= {*lookup(g, s)};
= {*prodsOf(g, s)};

@synopsis{
Gets the unique rightmost delimiter (`begin`) and the unique leftmost
Expand All @@ -122,7 +122,7 @@ private set[Production] getChildren(Grammar g, Symbol s)
```
lexical X = Y;
lexical Y = Y1 | Y2;
lexical Y1 = "[" Z "]";
lexical Y1 = "[" Z "]";
lexical Y2 = "[" Z ")" [a-z];
lexical Z = [a-z];
```
Expand Down Expand Up @@ -166,7 +166,7 @@ private map[Symbol, Maybe[Symbol]] getOuterDelimiterBySymbol(Grammar g, Directio
ret[s] = unique(delimiters);
}
}

return ret;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ private map[Production, Maybe[set[Segment]]] getSegmentsByProduction(Grammar g)
}

private Maybe[set[Segment]] getSegmentsWithEnvironment(
Grammar g, list[Symbol] symbols,
Grammar g, list[Symbol] symbols,
map[Production, Maybe[set[Segment]]] env) {

// General idea: Recursively traverse `symbols` from left to right, while
Expand All @@ -73,9 +73,9 @@ private Maybe[set[Segment]] getSegmentsWithEnvironment(
set[Symbol] nested = {s | /Symbol s := head};

Maybe[set[Segment]] finished = get(running, [], final = tail == []);

// If the head contains a non-terminal, then: (1) finish the running
// segment; (2) lookup the segments of the non-terminals in the
// segment; (2) look up the segments of the non-terminals in the
// environment, if any; (3) compute the segments of the tail. Return the
// union of 1-3.
if (any(s <- nested, isNonTerminalType(s))) {
Expand All @@ -85,15 +85,15 @@ private Maybe[set[Segment]] getSegmentsWithEnvironment(
sets += finished;

// (2)
sets += for (s <- nested, isNonTerminalType(s), p <- lookup(g, s)) {
sets += for (s <- nested, isNonTerminalType(s), p <- prodsOf(g, s)) {

bool isInitial(Segment seg)
= seg.initial && running.initial && running.symbols == [];
bool isFinal(Segment seg)
= seg.final && tail == [];
Segment update(Segment seg)
= seg[initial = isInitial(seg)][final = isFinal(seg)];

append just(segs) := env[p] ? just({update(seg) | seg <- segs}) : nothing();
}

Expand All @@ -103,21 +103,21 @@ private Maybe[set[Segment]] getSegmentsWithEnvironment(
// Return union
return (sets[0] | union(it, \set) | \set <- sets[1..]);
}

// If the head doesn't contain a non-terminal, but it has a newline,
// then: (1) finish the running segment; (2) compute the segments of the
// tail. Return the union of 1-2. Note: the head, as it has a newline,
// is ignored and won't be part of any segment.
else if (any(s <- nested, hasNewline(g, s))) {
return union(finished, get(segment([]), tail));
}

// If the head doesn't contain a non-terminal, and if it doesn't have a
// newline, then add the head to the running segment and proceed with
// the tail.
else {
Segment old = running;
Segment new = old[symbols = old.symbols + head];
Segment new = old[symbols = old.symbols + head];
return get(new, tail);
}
}
Expand All @@ -130,7 +130,7 @@ private Maybe[set[Segment]] getSegmentsWithEnvironment(
}

bool hasNewline(Grammar g, Symbol s) {
return any(p <- lookup(g, delabel(s)), hasNewline(g, p));
return any(p <- prodsOf(g, delabel(s)), hasNewline(g, p));
}

@synopsis{
Expand All @@ -149,7 +149,7 @@ private map[Production, bool] hasNewlineByProduction(Grammar g) {
for (p <- ret, !ret[p]) {
set[Symbol] nonTerminals = {s | /Symbol s := p.symbols, isNonTerminalType(s)};
ret[p] = ret[p] || any(/r: range(_, _) := p.symbols, hasNewline(r))
|| any(s <- nonTerminals, Production child <- lookup(g, s), ret[child]);
|| any(s <- nonTerminals, Production child <- prodsOf(g, s), ret[child]);
}
}

Expand All @@ -165,7 +165,7 @@ private map[Production, bool] hasNewlineByProduction(Grammar g) {

bool hasNewline(str s)
= LF in chars(s);

bool hasNewline(range(begin, end))
= begin <= LF && LF <= end;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ private map[Symbol, Maybe[set[Symbol]]] firstBySymbol(Grammar g, bool(Symbol) pr
for (s <- ret, nothing() == ret[s]) {
if (predicate(s)) {
ret[s] = just({s});
} else if (list[Production] prods: [_, *_] := lookup(g, s)) {
} else if (list[Production] prods: [_, *_] := prodsOf(g, s)) {
ret[s] = (just({}) | union(it, firstOf(reorder(p.symbols, dir))) | p <- prods);
} else {
ret[s] = just({\empty()});
Expand Down Expand Up @@ -84,7 +84,7 @@ set[Symbol] follow(Grammar g, Symbol s)
@memo
private map[Symbol, Maybe[set[Symbol]]] followBySymbol(Grammar g, bool(Symbol) predicate, Direction dir) {
map[Symbol, Maybe[set[Symbol]]] ret = (delabel(s): nothing() | s <- g.rules); // Non-terminals

Maybe[set[Symbol]] followOf(Symbol parent, [])
= ret[delabel(parent)];
Maybe[set[Symbol]] followOf(Symbol parent, [h, *t])
Expand Down Expand Up @@ -142,6 +142,8 @@ private default Maybe[int] max(Maybe[int] _, Maybe[int] _) = nothing();
Computes the length of a terminal symbol as a range
}

Range length(label(_, symbol)) = length(symbol);

Range length(\lit(string)) = <size(string), just(size(string))>;
Range length(\cilit(string)) = <size(string), just(size(string))>;
Range length(\char-class(_)) = <1, just(1)>;
Expand Down
Loading