forked from TheAlgorithms/C-Sharp
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathJaccardSimilarity.cs
98 lines (88 loc) · 3.48 KB
/
JaccardSimilarity.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
using System;
using System.Collections.Generic;
namespace Algorithms.Strings.Similarity;
/// <summary>
/// <para>
/// Jaccard similarity is a statistic that measures how similar two sets of data are. It is calculated by dividing
/// the number of common elements in both sets by the number of elements in either set. More formally, it is the
/// quotient of the division of the size of the size of the intersection divided by the size of the union of two sets.
/// </para>
/// <para>
/// The result is a value between 0 and 1, where 0 means no similarity and 1 means perfect similarity.
/// </para>
/// <para>
/// For example, suppose we have two sets of words:
/// <list type="bullet">
/// <item>
/// A = {apple, banana, cherry, date}
/// </item>
/// <item>
/// B = {banana, cherry, elderberry, fig}
/// </item>
/// </list>
/// </para>
/// <para>
/// The number of common elements in both sets is 2 (banana and cherry). The number of elements in either set is 6
/// (apple, banana, cherry, date, elderberry, fig).
/// </para>
/// <para>
/// The Jaccard similarity coefficient is 2 / 6 = 0.333333 or 33.333% similarity.
/// </para>
/// </summary>
public class JaccardSimilarity
{
/// <summary>
/// Calculates the Jaccard similarity coefficient between two strings.
/// </summary>
/// <param name="left">The first string to compare.</param>
/// <param name="right">The second string to compare.</param>
/// <returns>A double value between 0 and 1 that represents the similarity of the two strings.</returns>
/// <exception cref="ArgumentNullException">Thrown when either the input is null.</exception>
/// <remarks>
/// This method uses a HashSet to represent the sets of characters in the input strings.
/// </remarks>
public double Calculate(string left, string right)
{
// Validate the input strings before proceeding.
ValidateInput(left, right);
// Get the lengths of the input strings.
var leftLength = left.Length;
var rightLength = right.Length;
// If both strings are empty, return 1.0 as the similarity coefficient.
if (leftLength == 0 && rightLength == 0)
{
return 1.0d;
}
// If either string is empty, return 0.0 as the similarity coefficient.
if (leftLength == 0 || rightLength == 0)
{
return 0.0d;
}
// Get the unique characters in each string.
var leftSet = new HashSet<char>(left);
var rightSet = new HashSet<char>(right);
// Get the union of the two strings.
var unionSet = new HashSet<char>(leftSet);
foreach (var c in rightSet)
{
unionSet.Add(c);
}
// Calculate the intersection size of the two strings.
var intersectionSize = leftSet.Count + rightSet.Count - unionSet.Count;
// Return the Jaccard similarity coefficient as the ratio of intersection to union.
return 1.0d * intersectionSize / unionSet.Count;
}
/// <summary>
/// Validates the input strings and throws an exception if either is null.
/// </summary>
/// <param name="left">The first string to validate.</param>
/// <param name="right">The second string to validate.</param>
private void ValidateInput(string left, string right)
{
if (left == null || right == null)
{
var paramName = left == null ? nameof(left) : nameof(right);
throw new ArgumentNullException(paramName, "Input cannot be null");
}
}
}