From b3f05be54b113bc0cc69c3b87f3f8b4deb18dfb7 Mon Sep 17 00:00:00 2001 From: Dan Balasescu Date: Thu, 1 Aug 2024 20:17:02 +0900 Subject: [PATCH 1/3] Add failing tests --- .../PhpStringLiteralSyntaxNodeTest.cs | 56 ++++++++++++++++++- 1 file changed, 55 insertions(+), 1 deletion(-) diff --git a/LocalisationAnalyser.Tools.Tests/PhpStringLiteralSyntaxNodeTest.cs b/LocalisationAnalyser.Tools.Tests/PhpStringLiteralSyntaxNodeTest.cs index 7baaae4..236bfeb 100644 --- a/LocalisationAnalyser.Tools.Tests/PhpStringLiteralSyntaxNodeTest.cs +++ b/LocalisationAnalyser.Tools.Tests/PhpStringLiteralSyntaxNodeTest.cs @@ -27,14 +27,68 @@ public void TestEmptyString(string input) [InlineData("😊😊", "\"😊😊\"")] [InlineData("hello\nworld", "'hello\nworld'")] [InlineData("hello\nworld", "\"hello\nworld\"")] + [InlineData("hello\nworld", "\"hello\\nworld\"")] public void TestBasicString(string expected, string input) { Assert.Equal(expected, parse(input)); } [Theory] + // \' + [InlineData("\\'", "\"\\'\"")] + [InlineData("'", "'\\''")] + // \\ + [InlineData("\\", "\"\\\\\"")] + [InlineData("\\", "'\\\\'")] + // \n + [InlineData("\n", "\"\\n\"")] + [InlineData("\\n", "'\\n'")] + // \r + [InlineData("\r", "\"\\r\"")] + [InlineData("\\r", "'\\r'")] + // \t + [InlineData("\t", "\"\\t\"")] + [InlineData("\\t", "'\\t'")] + // \v + [InlineData("\v", "\"\\v\"")] + [InlineData("\\v", "'\\v'")] + // \e + [InlineData("\x1B", "\"\\e\"")] + [InlineData("\\e", "'\\e'")] + // \f + [InlineData("\f", "\"\\f\"")] + [InlineData("\\f", "'\\f'")] + // \$ + [InlineData("$", "\"\\$\"")] + [InlineData("\\$", "'\\$'")] + // \" + [InlineData("\"", "\"\\\"\"")] + [InlineData("\\\"", "'\\\"'")] + // \[0-7]{1,3} + [InlineData("A", "\"\\101\"")] + [InlineData("\\101", "'\\101'")] + [InlineData("AB", "\"\\101\\102\"")] + [InlineData("\\101\\102", "'\\101\\102'")] + [InlineData("\0", "\"\\400\"")] + [InlineData("\\400", "'\\400'")] + [InlineData("\\800", "\"\\800\"")] + [InlineData("\\800", "'\\800'")] + // \x[0-9A-Fa-f]{1,2} + [InlineData("A", "\"\\x41\"")] + [InlineData("\\x41", "'\\x41'")] + [InlineData("AB", "\"\\x41\\x42\"")] + [InlineData("\\x41\\x42", "'\\x41\\x42'")] + // \u{[0-9A-Fa-f]+} + [InlineData("A", "\"\\u{41}\"")] + [InlineData("\\u{41}", "'\\u{41}'")] + [InlineData("AB", "\"\\u{41}\\u{42}\"")] + [InlineData("\\u{41}\\u{42}", "'\\u{41}\\u{42}'")] + // Invalid escape sequence + [InlineData("\\g", "\"\\g\"")] + [InlineData("\\g", "'\\g'")] + // Other escaped strings [InlineData(":username's data", "':username\\'s data'")] - [InlineData(":username's data", "\":username\\'s data\"")] + [InlineData(":username\\'s data", "\":username\\'s data\"")] [InlineData("\"escaped\" quotes", "\"\\\"escaped\\\" quotes\"")] public void TestEscapedString(string expected, string input) { From d3a00b797a4b38927f584bc4d828e57ffcc99b80 Mon Sep 17 00:00:00 2001 From: Dan Balasescu Date: Thu, 1 Aug 2024 20:17:19 +0900 Subject: [PATCH 2/3] Implement PHP string escaping semantics --- .../Php/PhpStringLiteralSyntaxNode.cs | 115 ++++++++++++++++-- .../Php/PhpTokeniser.cs | 23 ++++ 2 files changed, 129 insertions(+), 9 deletions(-) diff --git a/LocalisationAnalyser.Tools/Php/PhpStringLiteralSyntaxNode.cs b/LocalisationAnalyser.Tools/Php/PhpStringLiteralSyntaxNode.cs index 646a3d6..aaedea5 100644 --- a/LocalisationAnalyser.Tools/Php/PhpStringLiteralSyntaxNode.cs +++ b/LocalisationAnalyser.Tools/Php/PhpStringLiteralSyntaxNode.cs @@ -1,7 +1,10 @@ // Copyright (c) ppy Pty Ltd . Licensed under the MIT Licence. // See the LICENCE file in the repository root for full licence text. +using System; +using System.Globalization; using System.Text; +using System.Text.RegularExpressions; namespace LocalisationAnalyser.Tools.Php { @@ -10,6 +13,10 @@ namespace LocalisationAnalyser.Tools.Php /// public class PhpStringLiteralSyntaxNode : PhpLiteralSyntaxNode { + private static readonly Regex oct_pattern = new Regex("^([0-7]{1,3})", RegexOptions.Compiled); + private static readonly Regex hex_pattern = new Regex("^(x[0-9A-Fa-f]{1,2})", RegexOptions.Compiled); + private static readonly Regex uni_pattern = new Regex("^(u{[0-9A-Fa-f]+})", RegexOptions.Compiled); + public readonly string Text; public PhpStringLiteralSyntaxNode(string text) @@ -21,27 +28,23 @@ public static PhpStringLiteralSyntaxNode Parse(PhpTokeniser tokeniser) { tokeniser.SkipWhitespace(); - char trivia = tokeniser.GetTrivia(); - - // Skip leading trivia. + char leader = tokeniser.GetTrivia(); tokeniser.Advance(); var stringBuilder = new StringBuilder(); - bool isEscaping = false; - while (isEscaping || tokeniser.GetTrivia() != trivia) + while (tokeniser.GetTrivia() != leader) { - var token = tokeniser.GetTrivia(); + char token = tokeniser.GetTrivia(); tokeniser.Advance(); - if (token == '\\' && !isEscaping) + if (token == '\\') { - isEscaping = true; + stringBuilder.Append(processEscapeSequence(leader, tokeniser)); continue; } stringBuilder.Append(token); - isEscaping = false; } // Skip trailing trivia. @@ -50,5 +53,99 @@ public static PhpStringLiteralSyntaxNode Parse(PhpTokeniser tokeniser) return new PhpStringLiteralSyntaxNode(stringBuilder.ToString()); } + + private static string processEscapeSequence(char leader, PhpTokeniser tokeniser) + { + char trivia = tokeniser.GetTrivia(); + + // Base cases for \{leader} and \\, supported by both single- and double-quoted strings. + if (trivia == leader || trivia == '\\') + { + tokeniser.Advance(); + return trivia.ToString(); + } + + // No other escape sequences are supported for single-quoted strings. + if (leader == '\'') + return @"\"; + + // Double-quoted strings have a few more cases... + switch (trivia) + { + case 'n': + tokeniser.Advance(); + return "\n"; + + case 'r': + tokeniser.Advance(); + return "\r"; + + case 't': + tokeniser.Advance(); + return "\t"; + + case 'v': + tokeniser.Advance(); + return "\v"; + + case 'e': + tokeniser.Advance(); + return "\x1B"; + + case 'f': + tokeniser.Advance(); + return "\f"; + + case '$': + tokeniser.Advance(); + return "$"; + + case >= '0' and <= '7': + { + Match match = oct_pattern.Match($"{trivia}{tokeniser.PeekNext(2)}"); + + if (match.Success) + { + tokeniser.Advance(match.Length); + + unchecked + { + byte octValue = (byte)Convert.ToInt32(match.Value, 8); + return ((char)octValue).ToString(); + } + } + + break; + } + + case 'x': + { + Match match = hex_pattern.Match($"{trivia}{tokeniser.PeekNext(2)}"); + + if (match.Success) + { + tokeniser.Advance(match.Length); + return ((char)byte.Parse(match.Value[1..], NumberStyles.HexNumber, CultureInfo.InvariantCulture)).ToString(); + } + + break; + } + + case 'u': + { + Match match = uni_pattern.Match($"{trivia}{tokeniser.PeekNext(16)}"); + + if (match.Success) + { + tokeniser.Advance(match.Length); + return char.ConvertFromUtf32(Convert.ToInt32(match.Value[2..^1], 16)); + } + + break; + } + } + + return @"\"; + } } } diff --git a/LocalisationAnalyser.Tools/Php/PhpTokeniser.cs b/LocalisationAnalyser.Tools/Php/PhpTokeniser.cs index dc3cc71..4aab7c5 100644 --- a/LocalisationAnalyser.Tools/Php/PhpTokeniser.cs +++ b/LocalisationAnalyser.Tools/Php/PhpTokeniser.cs @@ -67,6 +67,16 @@ public void TryAdvance() Advance(); } + /// + /// Advances by a number of trivia. + /// + /// The number of trivia to advance by. + public void Advance(int length) + { + for (int i = 0; i < length; i++) + Advance(); + } + /// /// Advances to the next trivia. /// @@ -104,6 +114,19 @@ public bool TryPeekNext(out char trivia) return true; } + /// + /// Peeks a number of future trivia. + /// + /// The length of trivia to peek. + /// The trivia. + public string PeekNext(int length) + { + int startIndex = Math.Min(content.Length, currentIndex + 1); + int endIndex = Math.Min(content.Length, startIndex + length); + + return content.AsSpan()[startIndex..endIndex].ToString(); + } + /// /// Skips all current whitespace and comments. /// From eb1cd1f208026bd4e18d2c69952e61eb4f044dfa Mon Sep 17 00:00:00 2001 From: Dan Balasescu Date: Thu, 1 Aug 2024 23:10:42 +0900 Subject: [PATCH 3/3] Add another test --- .../PhpStringLiteralSyntaxNodeTest.cs | 1 + 1 file changed, 1 insertion(+) diff --git a/LocalisationAnalyser.Tools.Tests/PhpStringLiteralSyntaxNodeTest.cs b/LocalisationAnalyser.Tools.Tests/PhpStringLiteralSyntaxNodeTest.cs index 236bfeb..506dca0 100644 --- a/LocalisationAnalyser.Tools.Tests/PhpStringLiteralSyntaxNodeTest.cs +++ b/LocalisationAnalyser.Tools.Tests/PhpStringLiteralSyntaxNodeTest.cs @@ -27,6 +27,7 @@ public void TestEmptyString(string input) [InlineData("😊😊", "\"😊😊\"")] [InlineData("hello\nworld", "'hello\nworld'")] [InlineData("hello\nworld", "\"hello\nworld\"")] + [InlineData("hello\\nworld", "'hello\\nworld'")] [InlineData("hello\nworld", "\"hello\\nworld\"")] public void TestBasicString(string expected, string input) {