Skip to content

Commit

Permalink
Merge pull request #23 from InfiniteEnergy/synonyms
Browse files Browse the repository at this point in the history
add support for synonyms
  • Loading branch information
ryepup authored Apr 27, 2021
2 parents 3a2a1bc + 1b5a4db commit 61199c8
Show file tree
Hide file tree
Showing 3 changed files with 78 additions and 2 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,7 @@ This config will:
|`matchBy`|how to compare the two values, see `matchBy` below|`string`|
|`remove`|if present: before comparison, remove this string from both values|`null`|
|`zeroRepresentsEmpty`|if true: before comparison, convert any zero values (e.g. `0`, `0.0`) to empty string|`false`|
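|`synonyms`|if present: a nested list of terms that should be treated as identical. Each inner list is one group of interchangeable single-word terms, compared case-insensitively (multi-word terms are not supported). For example, with the synonyms `[ ["street", "st"], ["n", "north"] ]`, `123 North Main St` will match `123 N Main Street`|`null`|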

### `matchBy` options

Expand Down
19 changes: 19 additions & 0 deletions src/XlsxCompare.Tests/AssertionTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,20 @@ public static IEnumerable<object[]> AssertionsThatMatch()
"",
"0.00"
};
yield return new object[]{
new Assertion("leftCol", "rightCol",
MatchBy: MatchBy.Tokens,
Synonyms: new[]{new HashSet<string>(new[]{"rd", "Road"})}),
"1234\tFOO\tROAD\tS\t# 123A",
"1234 S Foo Rd # 123A"
};

yield return new object[]{
new Assertion("leftCol", "rightCol",
Synonyms: new[]{new HashSet<string>(new[]{"suite", " ste"})}),
"Suite 2",
"Ste 2"
};
}

[TestMethod]
Expand Down Expand Up @@ -60,6 +74,11 @@ public static IEnumerable<object[]> AssertionsThatDoNotMatch()
"1",
"0"
};
yield return new object[]{
new Assertion("leftCol", "rightCol", Synonyms: new[]{new HashSet<string>(new[]{"rd", "road"})}),
"foord",
"fooroad"
};
}
[TestMethod]
[DynamicData(nameof(AssertionsThatDoNotMatch), DynamicDataSourceType.Method)]
Expand Down
60 changes: 58 additions & 2 deletions src/XlsxCompare/Assertion.cs
Original file line number Diff line number Diff line change
@@ -1,13 +1,28 @@
using System;
using System.Collections.Generic;
using System.Linq;

namespace XlsxCompare
{

public record Assertion(
string LeftColumnName,
string RightColumnName,
MatchBy? MatchBy = null,
string? Remove = null,
bool ZeroRepresentsEmpty = false
bool ZeroRepresentsEmpty = false,
IReadOnlyCollection<ISet<string>>? Synonyms = null
)
{
/// <summary>
/// True when at least one synonym set has been configured.
/// </summary>
private bool HasSynonyms => Synonyms is not null && Synonyms.Any();

// Backing store for SynonymMap; stays null until the map is first requested.
// NOTE(review): this is an instance field on a record, so the compiler-generated
// equality is expected to take it into account - confirm nothing relies on
// comparing Assertion instances after one of them has built its map.
private IReadOnlyDictionary<string, string>? _synonymMapCache;

/// <summary>
/// Lookup from each synonym to its canonical form. The map is built on
/// first access and the same instance is returned afterwards.
/// </summary>
private IReadOnlyDictionary<string, string> SynonymMap
{
    get
    {
        if (_synonymMapCache is null)
        {
            _synonymMapCache = CreateSynonymMap();
        }

        return _synonymMapCache;
    }
}

public bool IsMatch(string left, string right)
{
if (Remove != null)
Expand All @@ -20,12 +35,53 @@ public bool IsMatch(string left, string right)
left = NormalizeZeroToEmpty(left);
right = NormalizeZeroToEmpty(right);
}
if (HasSynonyms)
{
left = NormalizeSynonyms(left);
right = NormalizeSynonyms(right);
}

return MatchBy.IsMatch(left, right);
}

/// <summary>
/// Collapses any value that parses as a numeric zero (e.g. "0") into the
/// empty string so that "zero" and "blank" compare as equal.
/// </summary>
/// <param name="input">the raw value to inspect</param>
/// <returns>
/// "" when <paramref name="input"/> parses as a decimal equal to zero;
/// otherwise <paramref name="input"/> unchanged.
/// </returns>
private static string NormalizeZeroToEmpty(string input)
{
    // NOTE(review): this TryParse overload takes no culture argument, so
    // separators are presumably interpreted per the current culture - confirm
    // that is acceptable for the data being compared.
    var isZero = decimal.TryParse(input, out var number) && number == decimal.Zero;
    if (isZero)
    {
        return "";
    }

    return input;
}
};

/// <summary>
/// Rewrites <paramref name="input"/> so that every whitespace-separated word
/// that has a configured synonym is replaced by its canonical form.
/// </summary>
/// <param name="input">the raw value to normalize</param>
/// <returns>
/// the normalized words joined by single spaces; runs of spaces, tabs and
/// line breaks in the input collapse to one separator.
/// </returns>
private string NormalizeSynonyms(string input)
{
    const StringSplitOptions splitOptions =
        StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries;
    char[] separators = { ' ', '\t', '\n', '\r' };

    var words = input.Split(separators, splitOptions);
    for (var i = 0; i < words.Length; i++)
    {
        words[i] = NormalizeSynonym(words[i]);
    }

    return string.Join(' ', words);
}

/// <summary>
/// Looks up a single word in the synonym map.
/// </summary>
/// <param name="token">one whitespace-free word</param>
/// <returns>
/// the canonical form when <paramref name="token"/> is a known synonym;
/// otherwise <paramref name="token"/> itself.
/// </returns>
private string NormalizeSynonym(string token)
{
    if (SynonymMap.TryGetValue(token, out var canonical))
    {
        return canonical;
    }

    return token;
}

/// <summary>
/// Builds the lookup used to normalize synonyms: every term of each synonym
/// set is mapped to the first term of that set (its canonical form).
/// </summary>
/// <remarks>
/// Terms are trimmed before being stored and keys are compared
/// case-insensitively. Empty sets are skipped. A term that repeats with the
/// same canonical form (for example "rd" and "Rd", or "ste" and " ste") is
/// ignored instead of failing on the duplicate key.
/// </remarks>
/// <returns>a case-insensitive map from synonym to canonical form</returns>
/// <exception cref="NotSupportedException">
/// a term still contains whitespace after trimming; input is tokenized on
/// whitespace, so such a term could never match.
/// </exception>
/// <exception cref="ArgumentException">
/// the same term is assigned two different canonical forms.
/// </exception>
private IReadOnlyDictionary<string, string> CreateSynonymMap()
{
    // don't want to care about capitalization
    var mapping = new Dictionary<string, string>(StringComparer.OrdinalIgnoreCase);

    foreach (var set in Synonyms ?? Enumerable.Empty<ISet<string>>())
    {
        // don't want to care about whitespace
        var terms = set.Select(x => x.Trim()).ToList();
        if (terms.Count == 0)
        {
            // nothing to map; First() on an empty set would throw
            continue;
        }

        var winner = terms[0];
        foreach (var synonym in terms)
        {
            // the tokenizer splits on tabs and line breaks as well as spaces,
            // so any embedded whitespace makes the term multi-word
            if (synonym.Any(char.IsWhiteSpace))
            {
                throw new NotSupportedException($"multi-word synonyms are not supported: '{synonym}'");
            }

            if (mapping.TryGetValue(synonym, out var existing))
            {
                if (!string.Equals(existing, winner, StringComparison.OrdinalIgnoreCase))
                {
                    throw new ArgumentException(
                        $"synonym '{synonym}' is mapped to both '{existing}' and '{winner}'");
                }

                // same term, same canonical form: harmless duplicate
                continue;
            }

            mapping.Add(synonym, winner);
        }
    }

    return mapping;
}

}
}

0 comments on commit 61199c8

Please sign in to comment.