From a26e55677b34f3d6b033bfd05a29cae2f975178b Mon Sep 17 00:00:00 2001 From: zufuliu Date: Wed, 8 Jan 2025 18:27:59 +0800 Subject: [PATCH] Simplify Ruby lexer. --- readme.md | 2 +- scintilla/lexers/LexRuby.cxx | 142 ++++++++++------------------------- tools/lang/Ruby.rb | 15 ++-- 3 files changed, 50 insertions(+), 109 deletions(-) diff --git a/readme.md b/readme.md index c1ba84d98b..edd891a21c 100644 --- a/readme.md +++ b/readme.md @@ -118,7 +118,7 @@ Latest development builds (artifacts in Release configuration for each compiler * Windows Rescouce Script * [R](tools/lang/R.r), up to R 4.2. * [REBOL](tools/lang/Rebol.r) 3 and [Red](tools/lang/Red.red) - * [Ruby](tools/lang/Ruby.rb), up to Ruby 3.3. [Screenshots](https://github.com/zufuliu/notepad4/wiki/Screenshots#ruby) + * [Ruby](tools/lang/Ruby.rb), up to Ruby 3.4. [Screenshots](https://github.com/zufuliu/notepad4/wiki/Screenshots#ruby) * [Rust](tools/lang/Rust.rs), up to Rust 1.38. [Screenshots](https://github.com/zufuliu/notepad4/wiki/Screenshots#rust) * [SAS](tools/lang/SAS.sas), up to SAS Viya 2024.06. * [Scala](tools/lang/Scala.scala), up to Scala 3.2. diff --git a/scintilla/lexers/LexRuby.cxx b/scintilla/lexers/LexRuby.cxx index 608a8ec97b..6f4864c322 100644 --- a/scintilla/lexers/LexRuby.cxx +++ b/scintilla/lexers/LexRuby.cxx @@ -85,7 +85,7 @@ constexpr bool isQestionMarkChar(char chNext, char chNext2) noexcept { return !IsASpace(chNext); } -#define MAX_KEYWORD_LENGTH (MaxKeywordSize - 1) +#define MAX_KEYWORD_LENGTH 7 // module bool followsDot(Sci_PositionU pos, LexAccessor &styler) { while (pos > 1) { @@ -106,8 +106,6 @@ bool followsDot(Sci_PositionU pos, LexAccessor &styler) { return false; } -// Forward declarations -bool keywordDoStartsLoop(Sci_Position pos, LexAccessor &styler); bool keywordIsModifier(const char *word, Sci_Position pos, LexAccessor &styler); constexpr bool IsIdentifierStyle(int style) noexcept { @@ -118,8 +116,8 @@ constexpr bool IsIdentifierStyle(int style) noexcept { || style == SCE_RB_BUILTIN_FUNCTION; } -int ClassifyWordRb(Sci_PositionU end, char ch, char chNext, LexerWordList keywordLists, LexAccessor &styler, char *prevWord) { - char s[MAX_KEYWORD_LENGTH + 1]; +int ClassifyWordRb(Sci_PositionU end, char ch, char chNext, LexerWordList keywordLists, LexAccessor &styler, bool &modifierDo, char *prevWord) { + char s[MaxKeywordSize]; const Sci_PositionU start = styler.GetStartSegment(); styler.GetRange(start, end, s, sizeof(s)); int chAttr = SCE_RB_IDENTIFIER; @@ -142,7 +140,7 @@ int ClassifyWordRb(Sci_PositionU end, char ch, char chNext, LexerWordList keywor } else if (keywordLists[KeywordIndex_Keyword].InList(s) && ((start < 2) || !followsDot(start, styler))) { // Order from most likely used to least likely // Lots of ways to do a loop in Ruby besides 'while/until' - if ((StrEqual(s, "do") && keywordDoStartsLoop(start, styler)) + if ((modifierDo && StrEqual(s, "do")) || (StrEqualsAny(s, "if", "while", "unless", "until") && keywordIsModifier(s, start, styler))) { // Demoted keywords are colored as keywords, @@ -156,9 +154,12 @@ int ClassifyWordRb(Sci_PositionU end, char ch, char chNext, LexerWordList keywor chAttr = SCE_RB_WORD_DEMOTED; } else { + if (StrEqualsAny(s, "while", "until", "for")) { + modifierDo = true; + } chAttr = SCE_RB_WORD; style = SCE_RB_WORD; - strcpy(prevWord, s); + memcpy(prevWord, s, MAX_KEYWORD_LENGTH + 1); } } else { if (IsUpperCase(s[0])) { @@ -342,39 +343,38 @@ void InterpolateVariable(LexAccessor &styler, int state, Sci_Position &i, char & // // iPrev points to the start of << -bool sureThisIsHeredoc(Sci_Position iPrev, LexAccessor &styler, char *prevWord) { +bool sureThisIsHeredoc(Sci_Position iPrev, LexAccessor &styler) { // Not so fast, since Ruby's so dynamic. Check the context // to make sure we're OK. const Sci_Line lineStart = styler.GetLine(iPrev); const Sci_Position lineStartPosn = styler.LineStart(lineStart); - styler.Flush(); // Find the first word after some whitespace const Sci_Position firstWordPosn = LexSkipSpaceTab(styler, lineStartPosn, iPrev); - if (firstWordPosn >= iPrev) { + if (firstWordPosn + 3 > iPrev) { // Have something like {^ <<} //XXX Look at the first previous non-comment non-white line // to establish the context. Not too likely though. return true; } + + styler.Flush(); const int prevStyle = styler.StyleAt(firstWordPosn); - switch (prevStyle) { - case SCE_RB_WORD: - case SCE_RB_WORD_DEMOTED: - //case SCE_RB_IDENTIFIER: - break; - default: + if (prevStyle != SCE_RB_WORD) { return true; } + + char prevWord[MAX_KEYWORD_LENGTH + 1]; + unsigned wordLen = 0; Sci_Position firstWordEndPosn = firstWordPosn; - char *dst = prevWord; for (;;) { - if (firstWordEndPosn >= iPrev || - styler.StyleAt(firstWordEndPosn) != prevStyle) { - *dst = 0; + if (wordLen == MAX_KEYWORD_LENGTH || firstWordEndPosn >= iPrev || + styler.StyleAt(firstWordEndPosn) != SCE_RB_WORD) { + prevWord[wordLen] = '\0'; break; } - *dst++ = styler[firstWordEndPosn]; + prevWord[wordLen] = styler[firstWordEndPosn]; + wordLen += 1; firstWordEndPosn += 1; } //XXX Write a style-aware thing to regex scintilla buffer objects @@ -683,12 +683,13 @@ void ColouriseRbDoc(Sci_PositionU startPos, Sci_Position length, int initStyle, bool preferRE = true; bool afterDef = false; + bool is_real_number = true; // Differentiate between constants and ?-sequences. + bool modifierDo = false; int state = initStyle; - char prevWord[MAX_KEYWORD_LENGTH + 1] {}; // 1 byte for zero + char prevWord[MAX_KEYWORD_LENGTH + 1]{}; char chPrev = styler.SafeGetCharAt(startPos - 1); char chNext = styler.SafeGetCharAt(startPos); - bool is_real_number = true; // Differentiate between constants and ?-sequences. styler.StartAt(startPos); styler.StartSegment(startPos); @@ -780,7 +781,7 @@ void ColouriseRbDoc(Sci_PositionU startPos, Sci_Position length, int initStyle, // Begin of here-doc (the line after the here-doc delimiter): HereDoc.State = 2; if (state == SCE_RB_WORD) { - ClassifyWordRb(i, ch, chNext, keywordLists, styler, prevWord); + ClassifyWordRb(i, ch, chNext, keywordLists, styler, modifierDo, prevWord); } else { styler.ColorTo(i, state); } @@ -876,7 +877,7 @@ void ColouriseRbDoc(Sci_PositionU startPos, Sci_Position length, int initStyle, // heredoc_identifier routine. // Nothing else to do. } else if (preferRE) { - if (sureThisIsHeredoc(i - 1, styler, prevWord)) { + if (sureThisIsHeredoc(i - 1, styler)) { state = SCE_RB_HERE_DELIM; HereDoc.State = 0; } @@ -1143,12 +1144,12 @@ void ColouriseRbDoc(Sci_PositionU startPos, Sci_Position length, int initStyle, // No need to handle this state -- we'll just move to the end preferRE = false; } else { - const int word_style = ClassifyWordRb(i, ch, chNext, keywordLists, styler, prevWord); + const int word_style = ClassifyWordRb(i, ch, chNext, keywordLists, styler, modifierDo, prevWord); preferRE = false; switch (word_style) { case SCE_RB_WORD: afterDef = StrEqual(prevWord, "def"); - preferRE = !IsLowerCase(prevWord[0]) || keywordLists[KeywordIndex_Regex].InList(prevWord); + preferRE = keywordLists[KeywordIndex_Regex].InList(prevWord); break; case SCE_RB_WORD_DEMOTED: @@ -1474,11 +1475,14 @@ void ColouriseRbDoc(Sci_PositionU startPos, Sci_Position length, int initStyle, break; } chPrev = ch; + if (modifierDo && IsEOLChar(ch)) { + modifierDo = false; + } } if (state == SCE_RB_WORD) { // We've ended on a word, possibly at EOF, and need to // classify it. - ClassifyWordRb(lengthDoc, '\0', '\0', keywordLists, styler, prevWord); + ClassifyWordRb(lengthDoc, '\0', '\0', keywordLists, styler, modifierDo, prevWord); } else { styler.ColorTo(lengthDoc, state); } @@ -1548,17 +1552,14 @@ bool keywordIsModifier(const char *word, Sci_Position pos, LexAccessor &styler) return false; } // First things where the action is unambiguous - switch (style) { - case SCE_RB_DEFAULT: - case SCE_RB_COMMENTLINE: - case SCE_RB_POD: - case SCE_RB_CLASS_NAME: - case SCE_RB_DEF_NAME: - case SCE_RB_MODULE_NAME: - return false; - case SCE_RB_OPERATOR: - break; - case SCE_RB_WORD: + if (style == SCE_RB_OPERATOR) { + // Assume that if the keyword follows an operator, + // usually it's a block assignment, like + // a << if x then y else z + const char ch = styler[pos]; + return AnyOf(ch, ')', ']', '}'); + } + if (style == SCE_RB_WORD) { // Watch out for uses of 'else if' //XXX: Make a list of other keywords where 'if' isn't a modifier // and can appear legitimately @@ -1569,69 +1570,8 @@ bool keywordIsModifier(const char *word, Sci_Position pos, LexAccessor &styler) return !StrEqual(prevWord, "else"); } return true; - default: - return true; - } - // Assume that if the keyword follows an operator, - // usually it's a block assignment, like - // a << if x then y else z - - const char ch = styler[pos]; - return AnyOf(ch, ')', ']', '}'); -} - -#define WHILE_BACKWARDS "elihw" -#define UNTIL_BACKWARDS "litnu" -#define FOR_BACKWARDS "rof" - -// Nothing fancy -- look to see if we follow a while/until somewhere -// on the current line - -bool keywordDoStartsLoop(Sci_Position pos, LexAccessor &styler) { - const Sci_Line lineStart = styler.GetLine(pos); - const Sci_Position lineStartPosn = styler.LineStart(lineStart); - styler.Flush(); - while (--pos >= lineStartPosn) { - const int style = styler.StyleAt(pos); - if (style == SCE_RB_DEFAULT) { - const char ch = styler[pos]; - if (ch == '\r' || ch == '\n') { - // Scintilla's LineStart() and GetLine() routines aren't - // platform-independent, so if we have text prepared with - // a different system we can't rely on it. - return false; - } - } else if (style == SCE_RB_WORD) { - // Check for while or until, but write the word in backwards - char prevWord[MAX_KEYWORD_LENGTH + 1]; // 1 byte for zero - char *dst = prevWord; - int wordLen = 0; - Sci_Position start_word; - for (start_word = pos; - start_word >= lineStartPosn && styler.StyleAt(start_word) == SCE_RB_WORD; - start_word--) { - if (++wordLen < MAX_KEYWORD_LENGTH) { - *dst++ = styler[start_word]; - } - } - *dst = 0; - // Did we see our keyword? - if (StrEqualsAny(prevWord, WHILE_BACKWARDS, UNTIL_BACKWARDS, FOR_BACKWARDS)) { - return true; - } - // We can move pos to the beginning of the keyword, and then - // accept another decrement, as we can never have two contiguous - // keywords: - // word1 word2 - // ^ - // <- move to start_word - // ^ - // <- loop decrement - // ^ # pointing to end of word1 is fine - pos = start_word; - } } - return false; + return !AnyOf(style, SCE_RB_DEFAULT, SCE_RB_COMMENTLINE, SCE_RB_POD, SCE_RB_CLASS_NAME, SCE_RB_DEF_NAME, SCE_RB_MODULE_NAME); } /* diff --git a/tools/lang/Ruby.rb b/tools/lang/Ruby.rb index 8b7c6c8c86..921b3c6e4e 100644 --- a/tools/lang/Ruby.rb +++ b/tools/lang/Ruby.rb @@ -1,10 +1,10 @@ -# Ruby 3.3 https://www.ruby-lang.org/en/documentation/ +# Ruby 3.4 https://www.ruby-lang.org/en/documentation/ # https://docs.ruby-lang.org/en/ # https://ruby-doc.org/ # https://rubyreferences.github.io/ #! Keywords =========================================================== -# keywords_rdoc.html +# https://docs.ruby-lang.org/en/master/syntax/keywords_rdoc.html __ENCODING__ __LINE__ __FILE__ @@ -29,7 +29,7 @@ it #! Pre-defined constants =================================================== -# globals_rdoc.html#label-Pre-defined+global+constants +# https://docs.ruby-lang.org/en/master/globals_rdoc.html#label-Pre-Defined+Global+Constants TRUE FALSE NIL @@ -52,6 +52,7 @@ SCRIPT_LINES__ #! code folding =========================================================== +# https://docs.ruby-lang.org/en/master/syntax/control_expressions_rdoc.html begin end case @@ -62,7 +63,7 @@ def end end do end -for +for do end if end @@ -70,13 +71,13 @@ module end end unless end -until +until do end -while +while do end #! Pre-defined variables =================================================== -# globals_rdoc.html +# https://docs.ruby-lang.org/en/master/globals_rdoc.html#label-Pre-Defined+Global+Variables $DEBUG $LOAD_PATH $LOADED_FEATURES