Skip to content

Commit

Permalink
Support wildcard (*) and end-of-match ($) paths
Browse files Browse the repository at this point in the history
  • Loading branch information
drmathias committed Aug 26, 2023
1 parent 1a96bbe commit 20cd6c2
Show file tree
Hide file tree
Showing 5 changed files with 206 additions and 38 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ There is also the possibility to extend this library to support protocols other
| FTP/FTPS || 0.1 |
| Wildcard (`*`) User-agent | ✔️ | |
| Allow & disallow rules | ✔️ | |
| End-of-match (`$`) and wildcard (`*`) paths | | 1.0 |
| End-of-match (`$`) and wildcard (`*`) paths | ✔️ | |
| Sitemap entries | ✔️ | |
| Host directive | ✔️ | |
| Crawl-delay directive | ✔️ | |
Expand Down
4 changes: 2 additions & 2 deletions src/Robots.Txt.Parser/RobotRuleChecker.cs
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,8 @@ public bool IsAllowed(string path)
"The /robots.txt URL is always allowed"
*/
if (_rules.Count == 0 || path == "/robots.txt") return true;
var ruleMatch = _rules.Where(rule => rule.Matches(path))
.OrderByDescending(rule => rule.Path.Length)
var ruleMatch = _rules.Where(rule => rule.Pattern.Matches(path))
.OrderByDescending(rule => rule.Pattern.Length)
.ThenBy(rule => rule.Type, new RuleTypeComparer())
.FirstOrDefault();
return ruleMatch is null || ruleMatch.Type == RuleType.Allow;
Expand Down
51 changes: 33 additions & 18 deletions src/Robots.Txt.Parser/UrlRule.cs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
using System.Linq;
using System.Web;

namespace Robots.Txt.Parser;
Expand All @@ -6,44 +7,58 @@ namespace Robots.Txt.Parser;
/// Describes a robots.txt rule for a URL
/// </summary>
/// <param name="Type">Rule type; either <see cref="RuleType.Allow"/> or <see cref="RuleType.Disallow"/></param>
/// <param name="Path">URL path</param>
public record UrlRule(RuleType Type, UrlPathPattern Path)
/// <param name="Pattern">URL path pattern</param>
public record UrlRule(RuleType Type, UrlPathPattern Pattern);

public class UrlPathPattern
{
private readonly bool _matchSubPaths;
private readonly string[] _patternParts;

// Builds a pattern from the raw rule text.
//  - Length records the size of the raw text, before '$' is stripped or anything is decoded.
//  - A trailing '$' is removed and turns sub-path matching off; without it sub-paths match.
//  - The remaining text is split on '*' into the literal parts that Matches looks for in order.
private UrlPathPattern(string value)
{
    Length = value.Length;

    var anchored = value.EndsWith('$');
    _matchSubPaths = !anchored;
    var body = anchored ? value[..^1] : value;

    var segments = body.Split('*', System.StringSplitOptions.None);
    var decoded = new string[segments.Length];
    for (var i = 0; i < segments.Length; i++)
    {
        // "%2F" is rewritten to "%252F" first, so after decoding it is the literal
        // text "%2F" rather than a '/' character.
        var protectedSlash = segments[i].Replace("%2F", "%252F");
        decoded[i] = HttpUtility.UrlDecode(protectedSlash);
    }
    _patternParts = decoded;
}

public int Length { get; }

/// <summary>
/// Checks if a path matches the URL rule
/// </summary>
/// <param name="path">The URL path</param>
/// <returns>True if the path matches or is a sub-path; otherwise false</returns>
public bool Matches(UrlPath path) => !Path.IsEmpty && path.StartsWith(Path);
}

public class UrlPathPattern : UrlPath
{
private UrlPathPattern(string value, bool exactMatch) : base(value)
public bool Matches(UrlPath path)
{
    // An empty pattern never matches anything.
    if (Length == 0) return false;

    var candidate = path._value;

    // The literal text before the first '*' is anchored to the start of the path.
    // Ordinal comparison: URL paths are not linguistic text, and culture-sensitive
    // searches can treat some characters as ignorable.
    var prefix = _patternParts[0];
    if (!candidate.StartsWith(prefix, System.StringComparison.Ordinal)) return false;

    var currentIndex = prefix.Length;
    var lastPart = _patternParts.Length - 1;
    for (var x = 1; x <= lastPart; x++)
    {
        var part = _patternParts[x];

        if (x == lastPart && !_matchSubPaths)
        {
            // End-of-match ('$') pattern: the final literal has to be the END of the path.
            // Taking the leftmost occurrence here would wrongly reject paths in which the
            // literal occurs more than once (e.g. "/*.php$" against "/a.php.php"), because
            // the preceding '*' is allowed to absorb the earlier occurrences.
            // The length check stops the suffix from overlapping text already consumed.
            return candidate.Length - part.Length >= currentIndex
                && candidate.EndsWith(part, System.StringComparison.Ordinal);
        }

        // Intermediate literals: the leftmost occurrence leaves the most room for the rest.
        var matchIndex = candidate.IndexOf(part, currentIndex, System.StringComparison.Ordinal);
        if (matchIndex == -1) return false;
        currentIndex = matchIndex + part.Length;
    }

    // Reached when the pattern has no '*' (single part) or when sub-paths are allowed.
    return _matchSubPaths || currentIndex == candidate.Length;
}

public bool ExactPattern { get; }

public static implicit operator UrlPathPattern(string value) => !value.EndsWith('$') ? new(value, false) : new(value[..^1], true);
public static implicit operator UrlPathPattern(string value) => new(value);
}

public class UrlPath
{
private readonly string _value;
internal readonly string _value;

protected UrlPath(string value)
// Stores the URL-decoded form of the path.
// "%2F" is first rewritten to "%252F", so that after decoding it is the literal text "%2F"
// and not a '/' character.
// NOTE(review): the replacement is case-sensitive, so a lower-case "%2f" is decoded to '/'.
private UrlPath(string value)
{
_value = HttpUtility.UrlDecode(value.Replace("%2F", "%252F"));
}

public int Length => _value.Length;

public bool IsEmpty => _value == "";

public bool StartsWith(UrlPath path) => _value.StartsWith(path._value);

public static implicit operator UrlPath(string value) => new(value);
}

Expand Down
153 changes: 153 additions & 0 deletions tests/Robots.Txt.Parser.Tests.Unit/RobotTxtRuleCheckerTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,139 @@ public async Task UserAgentWildcard_DisallowPath_DisallowOnMatch()
ruleChecker.IsAllowed("/some/path").Should().Be(false);
}

[Fact]
public async Task UserAgentWildcard_DisallowWildcardPath_DisallowOnMatch()
{
    // A single '*' in the rule has to cover "other/sub", i.e. it spans a '/' in the checked path.

    // Arrange
    var file =
@"User-agent: *
Disallow: /some/*/path
";
    var stream = new MemoryStream(Encoding.UTF8.GetBytes(file));

    // Act
    var robotsTxt = await _parser.ReadFromStreamAsync(stream);

    // Assert
    // Null check comes first: once robotsTxt has been dereferenced the check proves nothing.
    robotsTxt.Should().NotBe(null);
    robotsTxt.TryGetRules("SomeBot", out var ruleChecker);
    ruleChecker.IsAllowed("/some/other/sub/path").Should().Be(false);
}

[Fact]
public async Task UserAgentWildcard_DisallowDoubleWildcardPath_DisallowOnMatch()
{
    // Two consecutive '*' characters are expected to match just like a single wildcard.

    // Arrange
    var file =
@"User-agent: *
Disallow: /some/**/path
";
    var stream = new MemoryStream(Encoding.UTF8.GetBytes(file));

    // Act
    var robotsTxt = await _parser.ReadFromStreamAsync(stream);

    // Assert
    // Null check comes first: once robotsTxt has been dereferenced the check proves nothing.
    robotsTxt.Should().NotBe(null);
    robotsTxt.TryGetRules("SomeBot", out var ruleChecker);
    ruleChecker.IsAllowed("/some/other/sub/path").Should().Be(false);
}

[Fact]
public async Task UserAgentWildcard_TwoPartWildcardPath_DisallowOnMatch()
{
    // Two separate wildcards, each standing in for one segment of the checked path.

    // Arrange
    var file =
@"User-agent: *
Disallow: /some/*/*/path
";
    var stream = new MemoryStream(Encoding.UTF8.GetBytes(file));

    // Act
    var robotsTxt = await _parser.ReadFromStreamAsync(stream);

    // Assert
    // Null check comes first: once robotsTxt has been dereferenced the check proves nothing.
    robotsTxt.Should().NotBe(null);
    robotsTxt.TryGetRules("SomeBot", out var ruleChecker);
    ruleChecker.IsAllowed("/some/other/sub/path").Should().Be(false);
}

[Fact]
public async Task UserAgentWildcard_TwoPartWildcardPath_DisallowSubpathMatch()
{
    // Without a trailing '$' the wildcard rule also covers paths below the matched one ("/end").

    // Arrange
    var file =
@"User-agent: *
Disallow: /some/*/*/path
";
    var stream = new MemoryStream(Encoding.UTF8.GetBytes(file));

    // Act
    var robotsTxt = await _parser.ReadFromStreamAsync(stream);

    // Assert
    // Null check comes first: once robotsTxt has been dereferenced the check proves nothing.
    robotsTxt.Should().NotBe(null);
    robotsTxt.TryGetRules("SomeBot", out var ruleChecker);
    ruleChecker.IsAllowed("/some/other/sub/path/end").Should().Be(false);
}

[Fact]
public async Task UserAgentWildcard_WildcardPathWithEndOfMatch_AllowSubpathMatch()
{
    // A trailing '$' pins the rule to the end of the path, so the longer "/…/path/end" stays allowed.

    // Arrange
    var file =
@"User-agent: *
Disallow: /some/*/*/path$
";
    var stream = new MemoryStream(Encoding.UTF8.GetBytes(file));

    // Act
    var robotsTxt = await _parser.ReadFromStreamAsync(stream);

    // Assert
    // Null check comes first: once robotsTxt has been dereferenced the check proves nothing.
    robotsTxt.Should().NotBe(null);
    robotsTxt.TryGetRules("SomeBot", out var ruleChecker);
    ruleChecker.IsAllowed("/some/other/sub/path/end").Should().Be(true);
}

[Fact]
public async Task UserAgentWildcard_DisallowEndOfMatchPath_DisallowOnExactMatch()
{
    // A '$'-terminated rule without wildcards disallows exactly the path it spells out.

    // Arrange
    var file =
@"User-agent: *
Disallow: /some/path$
";
    var stream = new MemoryStream(Encoding.UTF8.GetBytes(file));

    // Act
    var robotsTxt = await _parser.ReadFromStreamAsync(stream);

    // Assert
    // Null check comes first: once robotsTxt has been dereferenced the check proves nothing.
    robotsTxt.Should().NotBe(null);
    robotsTxt.TryGetRules("SomeBot", out var ruleChecker);
    ruleChecker.IsAllowed("/some/path").Should().Be(false);
}

[Fact]
public async Task UserAgentWildcard_DisallowEndOfMatchPath_AllowOnSubPathMatch()
{
    // A '$'-terminated rule must not spill over onto sub-paths of the path it names.

    // Arrange
    var file =
@"User-agent: *
Disallow: /some/path$
";
    var stream = new MemoryStream(Encoding.UTF8.GetBytes(file));

    // Act
    var robotsTxt = await _parser.ReadFromStreamAsync(stream);

    // Assert
    // Null check comes first: once robotsTxt has been dereferenced the check proves nothing.
    robotsTxt.Should().NotBe(null);
    robotsTxt.TryGetRules("SomeBot", out var ruleChecker);
    ruleChecker.IsAllowed("/some/path/subdirectory").Should().Be(true);
}

[Fact]
public async Task UserAgentWildcard_DisallowPath_DisallowOnSubpath()
{
Expand Down Expand Up @@ -406,6 +539,26 @@ public async Task WildcardUserAgent_DisallowAllAndAllowPath_AllowPathMatch()
ruleChecker.IsAllowed("/some/path").Should().Be(true);
}

[Fact]
public async Task UserAgentWildcard_DisallowAllAndAllowWildcardPath_AllowWildcardPathMatch()
{
    // Both rules match the checked path; the test expects the Allow wildcard rule to win over "Disallow: /".

    // Arrange
    var file =
@"User-agent: *
Disallow: /
Allow: /some/*/path
";
    var stream = new MemoryStream(Encoding.UTF8.GetBytes(file));

    // Act
    var robotsTxt = await _parser.ReadFromStreamAsync(stream);

    // Assert
    // Null check comes first: once robotsTxt has been dereferenced the check proves nothing.
    robotsTxt.Should().NotBe(null);
    robotsTxt.TryGetRules("SomeBot", out var ruleChecker);
    ruleChecker.IsAllowed("/some/other/sub/path").Should().Be(true);
}

[Fact]
public async Task WildcardUserAgentRuleMatch_DisallowAllAndAllowPath_AllowSubpathMatch()
{
Expand Down
Loading

0 comments on commit 20cd6c2

Please sign in to comment.