diff --git a/README.md b/README.md
index b61c221..5865cc9 100644
--- a/README.md
+++ b/README.md
@@ -53,6 +53,7 @@ There is also the possibility to extend this library to support protocols other
| RSS 2.0 feeds | ❌ | 0.8 |
| Atom 0.3/1.0 feeds | ❌ | 0.8 |
| Simple text sitemaps | ❌ | 0.5 |
+| Memory management (500 KiB parsing limit) | ✔️ | |
| Caching support | ❌ | 0.3 |
# Usage
diff --git a/src/Robots.Txt.Parser/UrlRule.cs b/src/Robots.Txt.Parser/UrlRule.cs
index dbc0178..5b3f313 100644
--- a/src/Robots.Txt.Parser/UrlRule.cs
+++ b/src/Robots.Txt.Parser/UrlRule.cs
@@ -1,5 +1,7 @@
+using System;
+using System.Collections.Generic;
using System.Linq;
-using System.Web;
+using System.Text;
namespace Robots.Txt.Parser;
@@ -10,6 +12,14 @@ namespace Robots.Txt.Parser;
/// URL path pattern
public record UrlRule(RuleType Type, UrlPathPattern Pattern);
+/// <summary>
+/// Robots.txt rule type
+/// </summary>
+public enum RuleType
+{
+ Allow, Disallow
+}
+
public class UrlPathPattern
{
private readonly bool _matchSubPaths;
@@ -20,8 +30,8 @@ private UrlPathPattern(string value)
Length = value.Length;
if (value.EndsWith('$')) value = value[..^1];
else _matchSubPaths = true;
- _patternParts = value.Split('*', System.StringSplitOptions.None)
- .Select(part => HttpUtility.UrlDecode(part.Replace("%2F", "%252F")))
+ _patternParts = value.Split('*', StringSplitOptions.None)
+ .Select(PathHelpers.PreparePathForComparison)
.ToArray();
}
@@ -52,20 +62,118 @@ public class UrlPath
{
internal readonly string _value;
- private UrlPath(string value)
- {
- _value = HttpUtility.UrlDecode(value.Replace("%2F", "%252F"));
- }
+ private UrlPath(string value) => _value = PathHelpers.PreparePathForComparison(value);
public int Length => _value.Length;
public static implicit operator UrlPath(string value) => new(value);
}
-///
-/// Robots.txt rule type
-///
-public enum RuleType
+static class PathHelpers
{
- Allow, Disallow
-}
\ No newline at end of file
+ // RFC 3986 reserved characters: gen-delims on the first row, sub-delims on the second.
+ // EncodeUrlPath percent-encodes every member of this set (except '/' inside the path component).
+ private static readonly HashSet<char> _reservedChars = new()
+ {
+     ':', '/', '?', '#', '[', ']', '@',
+     '!', '$', '&', '\'', '(', ')', '*', '+', ',', ';', '='
+ };
+
+ // Percent-encoded spelling ("%41" for 'A', "%7E" for '~', ...) of every unreserved character,
+ // listed in this order: A-Z, a-z, 0-9, then '-', '_', '.', '~'.
+ // Hex digits are upper case; lookups against this table are done case-insensitively by the decoder.
+ private static readonly string[] _unreservedCharactersPercentEncoded =
+     "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_.~"
+         .Select(character => $"%{(int)character:X2}")
+         .ToArray();
+
+ /// <summary>
+ /// Brings a URL path (or one fragment of a rule pattern) into the form used for rule matching.
+ /// </summary>
+ /// <remarks>
+ /// Two steps are applied, in this order:
+ /// 1. Encode - octets in the URI and robots.txt paths outside the range of the ASCII coded character set,
+ ///    and those in the reserved range defined by [RFC3986], MUST be percent-encoded as defined by
+ ///    [RFC3986] prior to comparison.
+ /// 2. Decode - if a percent-encoded ASCII octet is encountered in the URI, it MUST be unencoded prior to
+ ///    comparison, unless it is a reserved character in the URI as defined by [RFC3986] or the character
+ ///    is outside the unreserved character range.
+ /// </remarks>
+ /// <param name="value">Raw path or pattern fragment.</param>
+ /// <returns>The encoded-then-selectively-decoded string.</returns>
+ internal static string PreparePathForComparison(string value) =>
+     DecodePercentEncodedUnreservedCharacters(EncodeUrlPath(value));
+
+ /// <summary>
+ /// Percent-encodes a path and its optional query string so it can be compared against robots.txt rules.
+ /// </summary>
+ /// <remarks>
+ /// The input is split at the first '?'. In the path component '/' is kept literal; in the query it is
+ /// encoded like any other reserved character. Anything after the first '#' of the query is dropped.
+ /// Existing "%XX" escapes are kept, with their hex digits upper-cased in both components so that
+ /// "%3a" and "%3A" compare equal. Non-ASCII characters are written as the percent-encoded octets of
+ /// their UTF-8 encoding; a lone surrogate is written as U+FFFD.
+ /// </remarks>
+ /// <param name="value">Raw path, optionally followed by a query string and fragment.</param>
+ /// <returns>The encoded path, followed by '?' and the encoded query when a '?' was present.</returns>
+ private static string EncodeUrlPath(string value)
+ {
+     var pathAndTheRest = value.Split('?', 2);
+     var encodedUrlPathBuilder = new StringBuilder(value.Length);
+
+     AppendPercentEncoded(encodedUrlPathBuilder, pathAndTheRest[0], keepForwardSlash: true);
+
+     if (pathAndTheRest.Length == 1) return encodedUrlPathBuilder.ToString();
+
+     // fragment can be discarded for path rule matching
+     var query = pathAndTheRest[1].Split('#', 2)[0];
+     encodedUrlPathBuilder.Append('?');
+     AppendPercentEncoded(encodedUrlPathBuilder, query, keepForwardSlash: false);
+
+     return encodedUrlPathBuilder.ToString();
+ }
+
+ // Appends one URL component to the builder, percent-encoding reserved and non-ASCII characters.
+ private static void AppendPercentEncoded(StringBuilder builder, string component, bool keepForwardSlash)
+ {
+     const string hexDigits = "0123456789ABCDEF";
+     Span<byte> utf8Buffer = stackalloc byte[4]; // a single rune never needs more than 4 UTF-8 bytes
+
+     for (var i = 0; i < component.Length; i++)
+     {
+         var character = component[i];
+
+         // keep chars that are already % encoded, normalizing the casing of the hex digits
+         if (character == '%'
+             && i < component.Length - 2
+             && char.IsAsciiHexDigit(component[i + 1])
+             && char.IsAsciiHexDigit(component[i + 2]))
+         {
+             builder.Append('%');
+             builder.Append(char.ToUpperInvariant(component[i + 1]));
+             builder.Append(char.ToUpperInvariant(component[i + 2]));
+             i += 2;
+             continue;
+         }
+
+         if ((keepForwardSlash && character == '/')
+             || (char.IsAscii(character) && !_reservedChars.Contains(character)))
+         {
+             builder.Append(character);
+             continue;
+         }
+
+         // Uri.HexEscape only accepts chars up to U+00FF and writes a single octet, so the
+         // character is encoded as UTF-8 here instead.
+         if (Rune.TryGetRuneAt(component, i, out var rune))
+         {
+             // a surrogate pair occupies two UTF-16 chars; step over the low surrogate as well
+             i += rune.Utf16SequenceLength - 1;
+         }
+         else
+         {
+             rune = Rune.ReplacementChar;
+         }
+
+         var byteCount = rune.EncodeToUtf8(utf8Buffer);
+         for (var byteIndex = 0; byteIndex < byteCount; byteIndex++)
+         {
+             var octet = utf8Buffer[byteIndex];
+             builder.Append('%');
+             builder.Append(hexDigits[octet >> 4]);
+             builder.Append(hexDigits[octet & 0xF]);
+         }
+     }
+ }
+
+ /// <summary>
+ /// Replaces every percent-encoded unreserved character (A-Z, a-z, 0-9, '-', '_', '.', '~') with the
+ /// character itself; all other "%XX" sequences are left exactly as they are.
+ /// </summary>
+ /// <remarks>
+ /// The input is scanned once, left to right, with ordinal character checks. Text produced by decoding is
+ /// never re-examined, so an input such as "%2%44" becomes "%2D" and is not decoded a second time to "-".
+ /// </remarks>
+ /// <param name="value">String that may contain percent-encoded octets (hex digits in either case).</param>
+ /// <returns>The string with unreserved characters decoded.</returns>
+ private static string DecodePercentEncodedUnreservedCharacters(string value)
+ {
+     // nothing to decode without a '%'
+     if (!value.Contains('%')) return value;
+
+     var decoded = new StringBuilder(value.Length);
+
+     for (var i = 0; i < value.Length; i++)
+     {
+         var character = value[i];
+
+         if (character == '%'
+             && i < value.Length - 2
+             && char.IsAsciiHexDigit(value[i + 1])
+             && char.IsAsciiHexDigit(value[i + 2]))
+         {
+             var octet = (char)Convert.ToInt32(value.Substring(i + 1, 2), 16);
+
+             if (char.IsAsciiLetterOrDigit(octet) || octet is '-' or '_' or '.' or '~')
+             {
+                 decoded.Append(octet);
+             }
+             else
+             {
+                 // reserved or otherwise not unreserved: keep the escape untouched
+                 decoded.Append(value, i, 3);
+             }
+
+             i += 2;
+             continue;
+         }
+
+         decoded.Append(character);
+     }
+
+     return decoded.ToString();
+ }
+}
diff --git a/tests/Robots.Txt.Parser.Tests.Unit/UrlRuleTests.cs b/tests/Robots.Txt.Parser.Tests.Unit/UrlRuleTests.cs
index d4d7e20..14c6987 100644
--- a/tests/Robots.Txt.Parser.Tests.Unit/UrlRuleTests.cs
+++ b/tests/Robots.Txt.Parser.Tests.Unit/UrlRuleTests.cs
@@ -84,7 +84,7 @@ public void Matches_SubdirectoryMatch_ReturnTrue()
}
[Fact]
- public void Matches_OctectBothLowercase_ReturnTrue()
+ public void Matches_PercentEncodedCharacterBothLowercase_ReturnTrue()
{
// Arrange
var urlRule = new UrlRule(RuleType.Disallow, "/some/path%3c");
@@ -97,7 +97,7 @@ public void Matches_OctectBothLowercase_ReturnTrue()
}
[Fact]
- public void Matches_OctectBothUppercase_ReturnTrue()
+ public void Matches_PercentEncodedCharacterBothUppercase_ReturnTrue()
{
// Arrange
var urlRule = new UrlRule(RuleType.Disallow, "/some/path%3C");
@@ -110,7 +110,7 @@ public void Matches_OctectBothUppercase_ReturnTrue()
}
[Fact]
- public void Matches_OctectRuleLowercasePathUppercase_ReturnTrue()
+ public void Matches_PercentEncodedCharacterRuleLowercasePathUppercase_ReturnTrue()
{
// Arrange
var urlRule = new UrlRule(RuleType.Disallow, "/some/path%3c");
@@ -123,7 +123,7 @@ public void Matches_OctectRuleLowercasePathUppercase_ReturnTrue()
}
[Fact]
- public void Matches_OctectRuleUppercasePathLowercase_ReturnTrue()
+ public void Matches_PercentEncodedCharacterRuleUppercasePathLowercase_ReturnTrue()
{
// Arrange
var urlRule = new UrlRule(RuleType.Disallow, "/some/path%3C");
@@ -136,7 +136,7 @@ public void Matches_OctectRuleUppercasePathLowercase_ReturnTrue()
}
[Fact]
- public void Matches_OctectForwardSlashBothUrl_ReturnTrue()
+ public void Matches_PercentEncodedCharacterForwardSlashBothUrl_ReturnTrue()
{
// Arrange
var urlRule = new UrlRule(RuleType.Disallow, "/some/path%2F");
@@ -149,7 +149,7 @@ public void Matches_OctectForwardSlashBothUrl_ReturnTrue()
}
[Fact]
- public void Matches_OctectForwardSlashOnlyInRule_ReturnFalse()
+ public void Matches_PercentEncodedCharacterForwardSlashOnlyInRule_ReturnFalse()
{
// Arrange
var urlRule = new UrlRule(RuleType.Disallow, "/some/path%2F");
@@ -162,7 +162,7 @@ public void Matches_OctectForwardSlashOnlyInRule_ReturnFalse()
}
[Fact]
- public void Matches_OctectForwardSlashOnlyInPath_ReturnFalse()
+ public void Matches_PercentEncodedCharacterForwardSlashOnlyInPath_ReturnFalse()
{
// Arrange
var urlRule = new UrlRule(RuleType.Disallow, "/some/path/");
@@ -175,7 +175,85 @@ public void Matches_OctectForwardSlashOnlyInPath_ReturnFalse()
}
[Fact]
- public void Matches_OctectNotForwardSlashLowercaseOnlyInRule_ReturnTrue()
+ public void Matches_PercentEncodedCharacterAsteriskBothUrl_ReturnTrue()
+ {
+     // Arrange: rule and path both carry the asterisk as %2A
+     const string encodedAsteriskPath = "/some%2Apath";
+     var rule = new UrlRule(RuleType.Disallow, encodedAsteriskPath);
+
+     // Act
+     var isMatch = rule.Pattern.Matches(encodedAsteriskPath);
+
+     // Assert
+     isMatch.Should().Be(true);
+ }
+
+ [Fact]
+ public void Matches_PercentEncodedCharacterAsteriskOnlyInRule_ReturnTrue()
+ {
+     // Arrange: only the rule spells the asterisk as %2A
+     var rule = new UrlRule(RuleType.Disallow, "/some%2Apath");
+     const string requestedPath = "/some*path";
+
+     // Act
+     var isMatch = rule.Pattern.Matches(requestedPath);
+
+     // Assert
+     isMatch.Should().Be(true);
+ }
+
+ [Fact]
+ public void Matches_PercentEncodedCharacterAsteriskOnlyInPath_ReturnTrue()
+ {
+     // Arrange: only the requested path spells the asterisk as %2A
+     var rule = new UrlRule(RuleType.Disallow, "/some*path");
+     const string requestedPath = "/some%2Apath";
+
+     // Act
+     var isMatch = rule.Pattern.Matches(requestedPath);
+
+     // Assert
+     isMatch.Should().Be(true);
+ }
+
+ [Fact]
+ public void Matches_PercentEncodedCharacterReservedBothUrl_ReturnTrue()
+ {
+     // Arrange: rule and path both carry the dollar sign as %24
+     const string encodedDollarPath = "/some%24path";
+     var rule = new UrlRule(RuleType.Disallow, encodedDollarPath);
+
+     // Act
+     var isMatch = rule.Pattern.Matches(encodedDollarPath);
+
+     // Assert
+     isMatch.Should().Be(true);
+ }
+
+ [Fact]
+ public void Matches_PercentEncodedCharacterReservedOnlyInRule_ReturnTrue()
+ {
+     // Arrange: only the rule spells the dollar sign as %24
+     var rule = new UrlRule(RuleType.Disallow, "/some%24path");
+     const string requestedPath = "/some$path";
+
+     // Act
+     var isMatch = rule.Pattern.Matches(requestedPath);
+
+     // Assert
+     isMatch.Should().Be(true);
+ }
+
+ [Fact]
+ public void Matches_PercentEncodedCharacterReservedOnlyInPath_ReturnTrue()
+ {
+     // Arrange: only the requested path spells the dollar sign as %24
+     var rule = new UrlRule(RuleType.Disallow, "/some$path");
+     const string requestedPath = "/some%24path";
+
+     // Act
+     var isMatch = rule.Pattern.Matches(requestedPath);
+
+     // Assert
+     isMatch.Should().Be(true);
+ }
+
+ [Fact]
+ public void Matches_PercentEncodedCharacterNotSpecialLowercaseOnlyInRule_ReturnTrue()
{
// Arrange
var urlRule = new UrlRule(RuleType.Disallow, "/some/path%7e");
@@ -188,7 +266,7 @@ public void Matches_OctectNotForwardSlashLowercaseOnlyInRule_ReturnTrue()
}
[Fact]
- public void Matches_OctectNotForwardSlashLowercaseOnlyInPath_ReturnTrue()
+ public void Matches_PercentEncodedCharacterNotSpecialLowercaseOnlyInPath_ReturnTrue()
{
// Arrange
var urlRule = new UrlRule(RuleType.Disallow, "/some/path~");
@@ -201,7 +279,7 @@ public void Matches_OctectNotForwardSlashLowercaseOnlyInPath_ReturnTrue()
}
[Fact]
- public void Matches_OctectNotForwardSlashUppercaseOnlyInRule_ReturnTrue()
+ public void Matches_PercentEncodedCharacterNotSpecialUppercaseOnlyInRule_ReturnTrue()
{
// Arrange
var urlRule = new UrlRule(RuleType.Disallow, "/some/path%7E");
@@ -214,7 +292,7 @@ public void Matches_OctectNotForwardSlashUppercaseOnlyInRule_ReturnTrue()
}
[Fact]
- public void Matches_OctectNotForwardSlashUppercaseOnlyInPath_ReturnTrue()
+ public void Matches_PercentEncodedCharacterNotSpecialUppercaseOnlyInPath_ReturnTrue()
{
// Arrange
var urlRule = new UrlRule(RuleType.Disallow, "/some/path~");
@@ -225,4 +303,43 @@ public void Matches_OctectNotForwardSlashUppercaseOnlyInPath_ReturnTrue()
// Assert
matches.Should().Be(true);
}
+
+ [Fact]
+ public void Matches_UnescapedQueryStringInRuleAndPath_ReturnTrue()
+ {
+     // Arrange: the same unescaped query string is used on both sides
+     const string unescapedUrl = "/foo/bar?baz=https://foo.bar";
+     var rule = new UrlRule(RuleType.Disallow, unescapedUrl);
+
+     // Act
+     var isMatch = rule.Pattern.Matches(unescapedUrl);
+
+     // Assert
+     isMatch.Should().Be(true);
+ }
+
+ [Fact]
+ public void Matches_UnescapedQueryStringInRuleButPathEscaped_ReturnTrue()
+ {
+     // Arrange: rule has the raw query value, the requested path has it percent-encoded
+     var rule = new UrlRule(RuleType.Disallow, "/foo/bar?baz=https://foo.bar");
+     const string requestedPath = "/foo/bar?baz=https%3A%2F%2Ffoo.bar";
+
+     // Act
+     var isMatch = rule.Pattern.Matches(requestedPath);
+
+     // Assert
+     isMatch.Should().Be(true);
+ }
+
+ [Fact]
+ public void Matches_UnescapedQueryStringInPathButRuleEscaped_ReturnTrue()
+ {
+     // Arrange: rule has the query value percent-encoded, the requested path has it raw
+     var rule = new UrlRule(RuleType.Disallow, "/foo/bar?baz=https%3A%2F%2Ffoo.bar");
+     const string requestedPath = "/foo/bar?baz=https://foo.bar";
+
+     // Act
+     var isMatch = rule.Pattern.Matches(requestedPath);
+
+     // Assert
+     isMatch.Should().Be(true);
+ }
}