Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement PHP string escaping semantics #64

Merged
merged 3 commits into from
Aug 1, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -27,14 +27,69 @@ public void TestEmptyString(string input)
[InlineData("😊😊", "\"😊😊\"")]
[InlineData("hello\nworld", "'hello\nworld'")]
[InlineData("hello\nworld", "\"hello\nworld\"")]
[InlineData("hello\\nworld", "'hello\\nworld'")]
[InlineData("hello\nworld", "\"hello\\nworld\"")]
public void TestBasicString(string expected, string input)
{
    // Run the PHP literal through the parser first, then compare the decoded text with the expectation.
    string actual = parse(input);

    Assert.Equal(expected, actual);
}

[Theory]
// \'
[InlineData("\\'", "\"\\'\"")]
[InlineData("'", "'\\''")]
// \\
[InlineData("\\", "\"\\\\\"")]
[InlineData("\\", "'\\\\'")]
// \n
[InlineData("\n", "\"\\n\"")]
[InlineData("\\n", "'\\n'")]
// \r
[InlineData("\r", "\"\\r\"")]
[InlineData("\\r", "'\\r'")]
// \t
[InlineData("\t", "\"\\t\"")]
[InlineData("\\t", "'\\t'")]
// \v
[InlineData("\v", "\"\\v\"")]
[InlineData("\\v", "'\\v'")]
// \e
[InlineData("\x1B", "\"\\e\"")]
[InlineData("\\e", "'\\e'")]
// \f
[InlineData("\f", "\"\\f\"")]
[InlineData("\\f", "'\\f'")]
// \$
[InlineData("$", "\"\\$\"")]
[InlineData("\\$", "'\\$'")]
// \"
[InlineData("\"", "\"\\\"\"")]
[InlineData("\\\"", "'\\\"'")]
// \[0-7]{1,3}
[InlineData("A", "\"\\101\"")]
[InlineData("\\101", "'\\101'")]
[InlineData("AB", "\"\\101\\102\"")]
[InlineData("\\101\\102", "'\\101\\102'")]
[InlineData("\0", "\"\\400\"")]
[InlineData("\\400", "'\\400'")]
[InlineData("\\800", "\"\\800\"")]
[InlineData("\\800", "'\\800'")]
// \x[0-9A-Fa-f]{1,2}
[InlineData("A", "\"\\x41\"")]
[InlineData("\\x41", "'\\x41'")]
[InlineData("AB", "\"\\x41\\x42\"")]
[InlineData("\\x41\\x42", "'\\x41\\x42'")]
// \u{[0-9A-Fa-f]+}
[InlineData("A", "\"\\u{41}\"")]
[InlineData("\\u{41}", "'\\u{41}'")]
[InlineData("AB", "\"\\u{41}\\u{42}\"")]
[InlineData("\\u{41}\\u{42}", "'\\u{41}\\u{42}'")]
// Invalid escape sequence
[InlineData("\\g", "\"\\g\"")]
[InlineData("\\g", "'\\g'")]
// Other escaped strings
[InlineData(":username's data", "':username\\'s data'")]
[InlineData(":username's data", "\":username\\'s data\"")]
[InlineData(":username\\'s data", "\":username\\'s data\"")]
[InlineData("\"escaped\" quotes", "\"\\\"escaped\\\" quotes\"")]
public void TestEscapedString(string expected, string input)
{
Expand Down
115 changes: 106 additions & 9 deletions LocalisationAnalyser.Tools/Php/PhpStringLiteralSyntaxNode.cs
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
// Copyright (c) ppy Pty Ltd <contact@ppy.sh>. Licensed under the MIT Licence.
// See the LICENCE file in the repository root for full licence text.

using System;
using System.Globalization;
using System.Text;
using System.Text.RegularExpressions;

namespace LocalisationAnalyser.Tools.Php
{
Expand All @@ -10,6 +13,10 @@ namespace LocalisationAnalyser.Tools.Php
/// </summary>
public class PhpStringLiteralSyntaxNode : PhpLiteralSyntaxNode
{
// All three patterns are anchored to the start of the input and describe the part of an escape sequence
// which follows the backslash (the backslash itself is not part of the match).

// Octal escape body: one to three digits in the range 0-7.
private static readonly Regex oct_pattern = new Regex("^([0-7]{1,3})", RegexOptions.Compiled);
// Hexadecimal escape body: the letter 'x' followed by one or two hexadecimal digits.
private static readonly Regex hex_pattern = new Regex("^(x[0-9A-Fa-f]{1,2})", RegexOptions.Compiled);
// Unicode codepoint escape body: "u{", one or more hexadecimal digits, then "}".
// The braces here do not form a quantifier, so they are matched as literal characters.
private static readonly Regex uni_pattern = new Regex("^(u{[0-9A-Fa-f]+})", RegexOptions.Compiled);

public readonly string Text;

public PhpStringLiteralSyntaxNode(string text)
Expand All @@ -21,27 +28,23 @@ public static PhpStringLiteralSyntaxNode Parse(PhpTokeniser tokeniser)
{
tokeniser.SkipWhitespace();

char trivia = tokeniser.GetTrivia();

// Skip leading trivia.
char leader = tokeniser.GetTrivia();
tokeniser.Advance();

var stringBuilder = new StringBuilder();
bool isEscaping = false;

while (isEscaping || tokeniser.GetTrivia() != trivia)
while (tokeniser.GetTrivia() != leader)
{
var token = tokeniser.GetTrivia();
char token = tokeniser.GetTrivia();
tokeniser.Advance();

if (token == '\\' && !isEscaping)
if (token == '\\')
{
isEscaping = true;
stringBuilder.Append(processEscapeSequence(leader, tokeniser));
continue;
}

stringBuilder.Append(token);
isEscaping = false;
}

// Skip trailing trivia.
Expand All @@ -50,5 +53,99 @@ public static PhpStringLiteralSyntaxNode Parse(PhpTokeniser tokeniser)

return new PhpStringLiteralSyntaxNode(stringBuilder.ToString());
}

/// <summary>
/// Resolves the escape sequence which follows a backslash inside a PHP string literal.
/// </summary>
/// <remarks>
/// The tokeniser is expected to be positioned on the trivia directly after the backslash.
/// When a sequence is recognised its trivia are consumed and the decoded text is returned.
/// Otherwise nothing is consumed and a lone backslash is returned, leaving the following trivia for the caller to handle.
/// </remarks>
/// <param name="leader">The quote character which opened the string literal.</param>
/// <param name="tokeniser">The tokeniser to read the sequence from.</param>
/// <returns>The text to emit in place of the backslash (and any consumed trivia).</returns>
private static string processEscapeSequence(char leader, PhpTokeniser tokeniser)
{
    const string literal_backslash = @"\";

    char next = tokeniser.GetTrivia();

    // Consumes the current trivia and hands back its replacement text.
    string consume(string replacement)
    {
        tokeniser.Advance();
        return replacement;
    }

    // The current trivia followed by up to `lookahead` further trivia, used as regex input.
    string window(int lookahead) => next + tokeniser.PeekNext(lookahead);

    // An escaped backslash or an escaped quote of the enclosing kind is valid in both string flavours.
    if (next == '\\' || next == leader)
        return consume(next.ToString());

    // Single-quoted strings understand nothing else, so the backslash is kept verbatim.
    if (leader == '\'')
        return literal_backslash;

    // Single-character escapes which only exist in double-quoted strings.
    switch (next)
    {
        case 'n': return consume("\n");
        case 'r': return consume("\r");
        case 't': return consume("\t");
        case 'v': return consume("\v");
        case 'e': return consume("\x1B");
        case 'f': return consume("\f");
        case '$': return consume("$");
    }

    if (next is >= '0' and <= '7')
    {
        Match octal = oct_pattern.Match(window(2));

        if (octal.Success)
        {
            tokeniser.Advance(octal.Length);

            // Values above 0xFF are deliberately truncated to a single byte rather than rejected.
            int value = Convert.ToInt32(octal.Value, 8);
            return ((char)unchecked((byte)value)).ToString();
        }
    }
    else if (next == 'x')
    {
        Match hex = hex_pattern.Match(window(2));

        if (hex.Success)
        {
            tokeniser.Advance(hex.Length);

            // Drop the leading 'x' and decode the remaining digits.
            byte code = byte.Parse(hex.Value.Substring(1), NumberStyles.HexNumber, CultureInfo.InvariantCulture);
            return ((char)code).ToString();
        }
    }
    else if (next == 'u')
    {
        Match unicode = uni_pattern.Match(window(16));

        if (unicode.Success)
        {
            tokeniser.Advance(unicode.Length);

            // Strip the leading "u{" and the trailing "}" to leave only the codepoint digits.
            string digits = unicode.Value.Substring(2, unicode.Length - 3);
            return char.ConvertFromUtf32(Convert.ToInt32(digits, 16));
        }
    }

    // Anything else is not an escape sequence; the backslash is an ordinary character.
    return literal_backslash;
}
}
}
23 changes: 23 additions & 0 deletions LocalisationAnalyser.Tools/Php/PhpTokeniser.cs
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,16 @@ public void TryAdvance()
Advance();
}

/// <summary>
/// Advances past several trivia by repeatedly calling <see cref="Advance()"/>.
/// </summary>
/// <param name="length">The number of trivia to advance by. Values of zero or less do not advance.</param>
public void Advance(int length)
{
    int remaining = length;

    while (remaining > 0)
    {
        Advance();
        remaining--;
    }
}

/// <summary>
/// Advances to the next trivia.
/// </summary>
Expand Down Expand Up @@ -104,6 +114,19 @@ public bool TryPeekNext(out char trivia)
return true;
}

/// <summary>
/// Peeks up to <paramref name="length"/> trivia starting one past the current index, without advancing.
/// </summary>
/// <param name="length">The maximum number of trivia to peek.</param>
/// <returns>The peeked trivia. This is shorter than <paramref name="length"/> (possibly empty) when the content ends first.</returns>
public string PeekNext(int length)
{
    // Both bounds are clamped to the end of the content so peeking near the end yields a shorter result.
    int from = Math.Min(currentIndex + 1, content.Length);
    int count = Math.Min(from + length, content.Length) - from;

    return content.AsSpan().Slice(from, count).ToString();
}

/// <summary>
/// Skips all current whitespace and comments.
/// </summary>
Expand Down
Loading