Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

rewrote squeeze in Rascal with reified classes #2002

Merged
merged 17 commits into from
Nov 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 36 additions & 0 deletions src/org/rascalmpl/library/String.rsc
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ module String

extend Exception;
import List;
import ParseTree;

@synopsis{All functions in this module that have a charset parameter use this as default.}
private str DEFAULT_CHARSET = "UTF-8";
Expand Down Expand Up @@ -519,11 +520,46 @@ for the allowed syntax in `charSet`.
```rascal-shell
import String;
squeeze("hello", "el");
// the other squeeze function uses character class types instead:
squeeze("hello", "el") == squeeze("hello", #[el]);
```
}
@javaClass{org.rascalmpl.library.Prelude}
@deprecated{Use the other squeeze function that accepts Rascal character class syntax.}
public java str squeeze(str src, str charSet);

@synopsis{Squeeze repeated occurrences of characters.}
@description{
Repeated consecutive occurrences in `src` of a character are squeezed into a single occurrence, if that character is a member of `&CharClass`.

* `src` is any string
* `&CharClass` is a reified character class type such as `[a-z]` (a type that is a subtype of the class of all characters `![]`)
* To pass in a char-class type use the type reifier operator: `#[a-z]` or `#![]`
}
@benefits{
* to squeeze all characters use the universal character class: `#![]` (the negation of the empty class).
* this function is type-safe; you can only pass in correct reified character classes like `#[A-Za-z]`.
}
@pitfalls{
* `![]` excludes the 0'th unicode character, so we cannot squeeze the unicode codepoint `0` using this function.
If you really need to squeeze 0 then it's best to write your own:
```rascal
visit (x) {
case /<dot:.>+/ => "\a00" when dot == "\a00"
}
```
* Do not confuse the character `0` (codepoint 48) with the zero codepoint: `#[0] != #[\a00]`
}
@examples{
```rascal-shell
import String;
squeeze("hello", #[el]);
```
}
public str squeeze(str src, type[&CharClass] _:type[![]] _) = visit(src) {
// `<c:.><c>+` matches one character followed by one or more repetitions of that same character;
// the whole run is replaced by a single `c`, but only when the character (as a `char` tree built
// from its first code point) matches the requested character class type `&CharClass`.
case /<c:.><c>+/ => c
when &CharClass _ := Tree::char(charAt(c, 0))
};


@synopsis{Split a string into a list of strings based on a literal separator.}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ module lang::rascal::grammar::tests::CharactersTests

import lang::rascal::grammar::definition::Characters;
import ParseTree;
import String;

test bool testFlip() = \new-char-class([range(2,2), range(1,1)]) == \char-class([range(1,2)]);
test bool testMerge() = \new-char-class([range(3,4), range(2,2), range(1,1)]) == \char-class([range(1,4)]);
Expand All @@ -24,3 +25,57 @@ test bool testDiff1() = difference(\char-class([range(10,30)]), \char-class([ran
test bool testDiff2() = difference(\char-class([range(10,30), range(40,50)]), \char-class([range(25,45)])) ==\char-class( [range(10,24), range(46,50)]);


test bool asciiEscape() = \char-class([range(0,127)]) == #[\a00-\a7F].symbol;
test bool utf16Escape() = \char-class([range(0,65535)]) == #[\u0000-\uFFFF].symbol;
test bool utf32Escape() = \char-class([range(0,1114111)]) == #[\U000000-\U10FFFF].symbol;
test bool highLowSurrogateRange1() = \char-class([range(9312,12991)]) == #[①-㊿].symbol;
test bool highLowSurrogateRange2() = \char-class([range(127829,127829)]) == #[🍕].symbol;
test bool differentEscapesSameResult1() = #[\a00-\a7F] == #[\u0000-\u007F];
test bool differentEscapesSameResult2() = #[\a00-\a7F] == #[\U000000-\U00007F];

/* to avoid a known ambiguity */
alias NotAZ = ![A-Z];

// Checks pattern matching of a single non-ASCII character against character class types:
// a value bound via the range class [①-㊿] must also match the singleton class [⑭]
// and the NotAZ alias (`![A-Z]`).
test bool unicodeCharacterClassSubtype1() {
// build a `char` tree from the first code point of the string
Tree t = char(charAt("⑭", 0));

if ([①-㊿] circled := t) {
assert [⑭] _ := circled;
assert NotAZ _ := circled;
return true;
}

// the character did not match the range class at all
return false;
}

// Same check as unicodeCharacterClassSubtype1, but with the 🍕 character:
// a value bound via the singleton class [🍕] must also match the wider range [\a00-🍕]
// and the NotAZ alias (`![A-Z]`).
test bool unicodeCharacterClassSubtype2() {
// build a `char` tree from the first code point of the string
Tree t = char(charAt("🍕", 0));

if ([🍕] pizza := t) {
assert [\a00-🍕] _ := pizza;
assert NotAZ _ := pizza;
return true;
}

// the character did not match the singleton class at all
return false;
}

test bool literalAsciiEscape1() = lit("\n") == #"\a0A".symbol;
test bool literalAsciiEscape2() = lit("w") == #"\a77".symbol;
test bool literalAsciiEscape3() = lit("\f") == #"\a0C".symbol;
test bool literalAsciiEscape4() = lit("\n") == #"\n".symbol;
@ignore{vallang must re-introduce the \f notation}
test bool literalAsciiEscape5() = lit("\f") == #"\f".symbol;
test bool literalUtf16Escape() = lit("\n") == #"\u000A".symbol;
test bool literalUtf32Escape1() = lit("\n") == #"\U00000A".symbol;
test bool literalUtf32Escape2() = lit("🍕") == #"\U01F355".symbol;

test bool ciliteralAsciiEscape1() = cilit("\n") == #'\a0A'.symbol;
test bool ciliteralAsciiEscape2() = cilit("w") == #'\a77'.symbol;
test bool ciliteralAsciiEscape3() = cilit("\f") == #'\a0C'.symbol;
test bool ciliteralAsciiEscape4() = cilit("\n") == #'\n'.symbol;
@ignore{vallang must re-introduce the \f notation}
test bool ciliteralAsciiEscape5() = cilit("\f") == #'\f'.symbol;
test bool ciliteralUtf16Escape() = cilit("\n") == #'\u000A'.symbol;
test bool ciliteralUtf32Escape1() = cilit("\n") == #'\U00000A'.symbol;
test bool ciliteralUtf32Escape2() = cilit("🍕") == #'\U01F355'.symbol;
16 changes: 16 additions & 0 deletions src/org/rascalmpl/library/lang/rascal/tests/basic/Strings1.rsc
Original file line number Diff line number Diff line change
Expand Up @@ -264,6 +264,22 @@ test bool tstSqueezeCase3() = squeeze("aabcc", "a-c") == "abc";
test bool tstSqueezeCase4() = squeeze("aabbcc", "a-c") == "abc";
test bool tstSqueezeCase5() = squeeze("aaabc", "a-c") == "abc";

// second squeeze
test bool tstSqueeze1CC(str S) = /<c:[a-zA-Z]><c>/ !:= squeeze(S, #[a-zA-Z]);
test bool tstSqueeze2CC(str S) = squeeze(S, #[]) == S;
// Squeezing with a class that contains only digits must leave repeated letters alone:
// if S contains a doubled letter, the same doubled letter must still occur after squeeze(S, #[0-9]).
test bool tstSqueeze3CC(str S) {
if (/<c:[a-zA-Z]><c>/ := S) {
return /<c><c>/ := squeeze(S, #[0-9]);
}
// no doubled letter in S: nothing to check
return true;
}

test bool tstSqueezeUnicodeCC() = squeeze("Hi 🍝🍝World", #[🍝]) == "Hi 🍝World";
test bool tstSqueezeCase1CC() = squeeze("abc", #[a-c]) == "abc";
test bool tstSqueezeCase2CC() = squeeze("aabc", #[a-c]) == "abc";
test bool tstSqueezeCase3CC() = squeeze("aabcc", #[a-c]) == "abc";
test bool tstSqueezeCase4CC() = squeeze("aabbcc", #[a-c]) == "abc";
test bool tstSqueezeCase5CC() = squeeze("aaabc", #[a-c]) == "abc";

test bool tstStartsWith(str S1, str S2) = startsWith(S1+S2, S1);

Expand Down
167 changes: 55 additions & 112 deletions src/org/rascalmpl/values/parsetrees/SymbolFactory.java
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,11 @@
*******************************************************************************/
package org.rascalmpl.values.parsetrees;

import java.io.IOException;
import java.io.StringReader;
import java.util.List;

import org.apache.commons.lang3.ArrayUtils;
import org.rascalmpl.ast.CaseInsensitiveStringConstant;
import org.rascalmpl.ast.Char;
import org.rascalmpl.ast.Class;
Expand All @@ -34,6 +37,8 @@
import io.usethesource.vallang.IString;
import io.usethesource.vallang.IValue;
import io.usethesource.vallang.IValueFactory;
import io.usethesource.vallang.exceptions.FactTypeUseException;
import io.usethesource.vallang.io.StandardTextReader;

import org.rascalmpl.values.RascalValueFactory;
import org.rascalmpl.values.ValueFactoryFactory;
Expand Down Expand Up @@ -72,7 +77,7 @@
boolean noExpand = lex || layout == null;

if (symbol.isCaseInsensitiveLiteral()) {
return factory.constructor(RascalValueFactory.Symbol_Cilit, ciliteral2Symbol(symbol.getCistring()));
return ciliteral2Symbol(symbol.getCistring());

Check warning on line 80 in src/org/rascalmpl/values/parsetrees/SymbolFactory.java

View check run for this annotation

Codecov / codecov/patch

src/org/rascalmpl/values/parsetrees/SymbolFactory.java#L80

Added line #L80 was not covered by tests
}
if (symbol.isCharacterClass()) {
Class cc = symbol.getCharClass();
Expand Down Expand Up @@ -197,101 +202,34 @@
}

private static IValue literal2Symbol(StringConstant sep) {
String lit = ((StringConstant.Lexical) sep).getString();
StringBuilder builder = new StringBuilder(lit.length());

// TODO: did we deal with all escapes here? probably not!
for (int i = 1; i < lit.length() - 1; i++) {
if (lit.charAt(i) == '\\') {
i++;
switch (lit.charAt(i)) {
case 'b':
builder.append('\b');
break;
case 'f':
builder.append('\f');
break;
case 'n':
builder.append('\n');
break;
case 't':
builder.append('\t');
break;
case 'r':
builder.append('\r');
break;
case '\\':
builder.append('\\');
break;
case '\"':
builder.append('\"');
break;
case '>':
builder.append('>');
break;
case '<':
builder.append('<');
break;
case '\'':
builder.append('\'');
break;
case 'u':
while (lit.charAt(i++) == 'u');
builder.append((char) Integer.decode("0x" + lit.substring(i, i+4)).intValue());
i+=4;
break;
default:
// octal escape
int a = lit.charAt(i++);
int b = lit.charAt(i++);
int c = lit.charAt(i);
builder.append( (char) (100 * a + 10 * b + c));
}
}
else {
builder.append(lit.charAt(i));
}
try {
String lit = ((StringConstant.Lexical) sep).getString();
// this should be the exact notation for string literals in vallang
IValue string = new StandardTextReader().read(factory, new StringReader(lit));

return factory.constructor(RascalValueFactory.Symbol_Lit, string);
}
catch (FactTypeUseException | IOException e) {

Check warning on line 212 in src/org/rascalmpl/values/parsetrees/SymbolFactory.java

View check run for this annotation

Codecov / codecov/patch

src/org/rascalmpl/values/parsetrees/SymbolFactory.java#L212

Added line #L212 was not covered by tests
// this would mean Rascal's syntax definition for string constants is not aligned with vallang's string notation
throw new RuntimeException("Internal error: parsed stringconstant notation does not coincide with vallang stringconstant notation");

Check warning on line 214 in src/org/rascalmpl/values/parsetrees/SymbolFactory.java

View check run for this annotation

Codecov / codecov/patch

src/org/rascalmpl/values/parsetrees/SymbolFactory.java#L214

Added line #L214 was not covered by tests
}

return factory.constructor(RascalValueFactory.Symbol_Lit, factory.string(builder.toString()));
}

private static IValue ciliteral2Symbol(CaseInsensitiveStringConstant constant) {
String lit = ((CaseInsensitiveStringConstant.Lexical) constant).getString();
StringBuilder builder = new StringBuilder(lit.length());

for (int i = 1; i < lit.length() - 1; i++) {
if (lit.charAt(i) == '\\') {
i++;
switch (lit.charAt(i)) {
case 'n':
builder.append('\n');
break;
case 't':
builder.append('\t');
break;
case 'r':
builder.append('\r');
break;
case '\\':
builder.append('\\');
break;
case '\"':
builder.append('\'');
break;
default:
int a = lit.charAt(i++);
int b = lit.charAt(i++);
int c = lit.charAt(i);
builder.append( (char) (100 * a + 10 * b + c));
}
}
else {
builder.append(lit.charAt(i));
}
try {
String lit = ((CaseInsensitiveStringConstant.Lexical) constant).getString();

Check warning on line 220 in src/org/rascalmpl/values/parsetrees/SymbolFactory.java

View check run for this annotation

Codecov / codecov/patch

src/org/rascalmpl/values/parsetrees/SymbolFactory.java#L220

Added line #L220 was not covered by tests
// replace single quotes by double quotes first
lit = "\"" + lit.substring(1, lit.length() - 1) + "\"";

Check warning on line 222 in src/org/rascalmpl/values/parsetrees/SymbolFactory.java

View check run for this annotation

Codecov / codecov/patch

src/org/rascalmpl/values/parsetrees/SymbolFactory.java#L222

Added line #L222 was not covered by tests

// this should be the exact notation for string literals in vallang
IValue string = new StandardTextReader().read(factory, new StringReader(lit));

Check warning on line 225 in src/org/rascalmpl/values/parsetrees/SymbolFactory.java

View check run for this annotation

Codecov / codecov/patch

src/org/rascalmpl/values/parsetrees/SymbolFactory.java#L225

Added line #L225 was not covered by tests

return factory.constructor(RascalValueFactory.Symbol_Cilit, string);

Check warning on line 227 in src/org/rascalmpl/values/parsetrees/SymbolFactory.java

View check run for this annotation

Codecov / codecov/patch

src/org/rascalmpl/values/parsetrees/SymbolFactory.java#L227

Added line #L227 was not covered by tests
}
catch (FactTypeUseException | IOException e) {

Check warning on line 229 in src/org/rascalmpl/values/parsetrees/SymbolFactory.java

View check run for this annotation

Codecov / codecov/patch

src/org/rascalmpl/values/parsetrees/SymbolFactory.java#L229

Added line #L229 was not covered by tests
// this would mean Rascal's syntax definition for string constants is not aligned with vallang's string notation
throw new RuntimeException("Internal error: parsed stringconstant notation does not coincide with vallang stringconstant notation");

Check warning on line 231 in src/org/rascalmpl/values/parsetrees/SymbolFactory.java

View check run for this annotation

Codecov / codecov/patch

src/org/rascalmpl/values/parsetrees/SymbolFactory.java#L231

Added line #L231 was not covered by tests
}

return factory.constructor(RascalValueFactory.Symbol_Lit, factory.string(builder.toString()));
}

private static IConstructor charclass2Symbol(Class cc) {
Expand Down Expand Up @@ -338,30 +276,35 @@
private static IValue char2int(Char character) {
String s = ((Char.Lexical) character).getString();
if (s.startsWith("\\")) {
if (s.length() > 1 && java.lang.Character.isDigit(s.charAt(1))) { // octal escape
// TODO
throw new NotYetImplemented("octal escape sequence in character class types");
}
if (s.length() > 1 && s.charAt(1) == 'u') { // octal escape
// TODO
throw new NotYetImplemented("unicode escape sequence in character class types");
if (ArrayUtils.contains(new int[] { 'a', 'u', 'U'}, s.charAt(1))) {
// lexical UnicodeEscape
// = utf16: "\\" [u] [0-9 A-F a-f] [0-9 A-F a-f] [0-9 A-F a-f] [0-9 A-F a-f]
// | utf32: "\\" [U] (("0" [0-9 A-F a-f]) | "10") [0-9 A-F a-f] [0-9 A-F a-f] [0-9 A-F a-f] [0-9 A-F a-f] // 24 bits
// | ascii: "\\" [a] [0-7] [0-9A-Fa-f]
// ;
return factory.integer(Integer.parseUnsignedInt(s.substring(2), 16));

Check warning on line 285 in src/org/rascalmpl/values/parsetrees/SymbolFactory.java

View check run for this annotation

Codecov / codecov/patch

src/org/rascalmpl/values/parsetrees/SymbolFactory.java#L285

Added line #L285 was not covered by tests
}
char cha = s.charAt(1);
switch (cha) {
case 't': return factory.integer('\t');
case 'n': return factory.integer('\n');
case 'r': return factory.integer('\r');
case '\"' : return factory.integer('\"');
case '\'' : return factory.integer('\'');
case '-' : return factory.integer('-');
case '<' : return factory.integer('<');
case '>' : return factory.integer('>');
case '\\' : return factory.integer('\\');
else {
int cha = s.codePointAt(1);

Check warning on line 288 in src/org/rascalmpl/values/parsetrees/SymbolFactory.java

View check run for this annotation

Codecov / codecov/patch

src/org/rascalmpl/values/parsetrees/SymbolFactory.java#L288

Added line #L288 was not covered by tests
switch (cha) {
case 't': return factory.integer('\t');
case 'n': return factory.integer('\n');
case 'r': return factory.integer('\r');
case '\"' : return factory.integer('\"');
case '\'' : return factory.integer('\'');
case '-' : return factory.integer('-');
case '<' : return factory.integer('<');
case '>' : return factory.integer('>');
case '\\' : return factory.integer('\\');

Check warning on line 298 in src/org/rascalmpl/values/parsetrees/SymbolFactory.java

View check run for this annotation

Codecov / codecov/patch

src/org/rascalmpl/values/parsetrees/SymbolFactory.java#L290-L298

Added lines #L290 - L298 were not covered by tests
default:
return factory.integer(cha);

Check warning on line 300 in src/org/rascalmpl/values/parsetrees/SymbolFactory.java

View check run for this annotation

Codecov / codecov/patch

src/org/rascalmpl/values/parsetrees/SymbolFactory.java#L300

Added line #L300 was not covered by tests
}
}
s = s.substring(1);
}
char cha = s.charAt(0);
return factory.integer(cha);
else {
int cha = s.codePointAt(0);
return factory.integer(cha);
}
}

public static IConstructor charClass(int ch) {
Expand Down
Loading