From 1b5a4dbe53d9f2aa9e167d7d1916630a8e6fa6b9 Mon Sep 17 00:00:00 2001 From: Ryan Davis Date: Tue, 27 Apr 2021 17:30:57 -0400 Subject: [PATCH] add support for synonyms Many words are equivalent, especially in addresses. Support per-assertion lists of synonyms. Considered a global synonym list, but felt like that assumes too much about the data fields. Some synonyms might be invalid for some columns. Copy/pasting config seems like the lesser evil. --- README.md | 1 + src/XlsxCompare.Tests/AssertionTests.cs | 19 ++++++++ src/XlsxCompare/Assertion.cs | 60 ++++++++++++++++++++++++- 3 files changed, 78 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index bfc20bd..cdacf1f 100644 --- a/README.md +++ b/README.md @@ -115,6 +115,7 @@ This config will: |`matchBy`|how to compare the two values, see `matchBy` below|`string`| |`remove`|if present: before comparison, remove this string from both values|`null`| |`zeroRepresentsEmpty`|if true: before comparison, convert any zero values (e.g. `0`, `0.0`) to empty string|`false`| +|`synonyms`|if present: this is a nested list of terms that should be treated as identical. 
For example, with these synonyms: `[ ["street", "st"], ["n", "north"] ]`, then `123 North Main St` will match `123 N Main Street`|`null`| ### `matchBy` options diff --git a/src/XlsxCompare.Tests/AssertionTests.cs b/src/XlsxCompare.Tests/AssertionTests.cs index 8009f31..38142ec 100644 --- a/src/XlsxCompare.Tests/AssertionTests.cs +++ b/src/XlsxCompare.Tests/AssertionTests.cs @@ -29,6 +29,20 @@ public static IEnumerable AssertionsThatMatch() "", "0.00" }; + yield return new object[]{ + new Assertion("leftCol", "rightCol", + MatchBy: MatchBy.Tokens, + Synonyms: new[]{new HashSet(new[]{"rd", "Road"})}), + "1234\tFOO\tROAD\tS\t# 123A", + "1234 S Foo Rd # 123A" + }; + + yield return new object[]{ + new Assertion("leftCol", "rightCol", + Synonyms: new[]{new HashSet(new[]{"suite", " ste"})}), + "Suite 2", + "Ste 2" + }; } [TestMethod] @@ -60,6 +74,11 @@ public static IEnumerable AssertionsThatDoNotMatch() "1", "0" }; + yield return new object[]{ + new Assertion("leftCol", "rightCol", Synonyms: new[]{new HashSet(new[]{"rd", "road"})}), + "foord", + "fooroad" + }; } [TestMethod] [DynamicData(nameof(AssertionsThatDoNotMatch), DynamicDataSourceType.Method)] diff --git a/src/XlsxCompare/Assertion.cs b/src/XlsxCompare/Assertion.cs index 90ea0d5..4f2e793 100644 --- a/src/XlsxCompare/Assertion.cs +++ b/src/XlsxCompare/Assertion.cs @@ -1,13 +1,28 @@ +using System; +using System.Collections.Generic; +using System.Linq; + namespace XlsxCompare { + public record Assertion( string LeftColumnName, string RightColumnName, MatchBy? MatchBy = null, string? Remove = null, - bool ZeroRepresentsEmpty = false + bool ZeroRepresentsEmpty = false, + IReadOnlyCollection>? Synonyms = null ) { + private bool HasSynonyms => Synonyms?.Any() == true; + + /// + /// map from a synonym to its canonical form + /// + private IReadOnlyDictionary SynonymMap + => _synonymMapCache ??= CreateSynonymMap(); + private IReadOnlyDictionary? 
_synonymMapCache; + public bool IsMatch(string left, string right) { if (Remove != null) @@ -20,6 +35,12 @@ public bool IsMatch(string left, string right) left = NormalizeZeroToEmpty(left); right = NormalizeZeroToEmpty(right); } + if (HasSynonyms) + { + left = NormalizeSynonyms(left); + right = NormalizeSynonyms(right); + } + return MatchBy.IsMatch(left, right); } @@ -27,5 +48,40 @@ private static string NormalizeZeroToEmpty(string input) => decimal.TryParse(input, out var parsed) && parsed == 0 ? "" : input; - }; + + private string NormalizeSynonyms(string input) + { + var tokens = input.Split(new[] { ' ', '\t', '\n', '\r' }, StringSplitOptions.TrimEntries | StringSplitOptions.RemoveEmptyEntries) + .Select(NormalizeSynonym); + return string.Join(' ', tokens); + } + + private string NormalizeSynonym(string token) + => SynonymMap.TryGetValue(token, out var replacement) + ? replacement + : token; + + private IReadOnlyDictionary CreateSynonymMap() + { + // don't want to care about capitalization + var mapping = new Dictionary(StringComparer.OrdinalIgnoreCase); + + foreach (var set in Synonyms ?? Enumerable.Empty>()) + { + // don't want to care about whitespace + var winner = set.First().Trim(); + foreach (var synonym in set.Select(x => x.Trim())) + { + if (synonym.Contains(' ')) + { + throw new NotSupportedException($"multi-word synonyms are not supported: '{synonym}'"); + } + mapping.Add(synonym, winner); + }; + } + + return mapping; + } + + } }