using System; using System.Collections.Generic; using System.Linq; using System.Text; namespace Semmle.Util { /// /// A dictionary from strings to elements of type T. /// /// /// /// This data structure is able to locate items based on an "approximate match" /// of the key. This is used for example when attempting to identify two terms /// in different trap files which are similar but not identical. /// /// The algorithm locates the closest match to a string based on a "distance function". /// /// Whilst many distance functions are possible, a bespoke algorithm is used here, /// for efficiency and suitablility for the domain. /// /// The distance is defined as the Hamming Distance of the numbers in the string. /// Each string is split into the base "form" (stripped of numbers) and a vector of /// numbers. (Numbers are non-negative integers in this context). /// /// Strings with a different "form" are considered different and have a distance /// of infinity. /// /// This distance function is reflexive, symmetric and obeys the triangle inequality. /// /// E.g. foo(bar,1,2) has form "foo(bar,,)" and integers {1,2} /// /// distance(foo(bar,1,2), foo(bar,1,2)) = 0 /// distance(foo(bar,1,2), foo(bar,1,3)) = 1 /// distance(foo(bar,2,1), foo(bar,1,2)) = 2 /// distance(foo(bar,1,2), foo(baz,1,2)) = infinity /// /// /// The value type. public class FuzzyDictionary { // All data items indexed by the "base string" (stripped of numbers) readonly Dictionary>> index = new Dictionary>>(); /// /// Stores a new KeyValuePair in the data structure. /// /// The key. /// The value. public void Add(string k, T v) { var kv = new KeyValuePair(k, v); string root = StripDigits(k); index.AddAnother(root, kv); } /// /// Computes the Hamming Distance between two sequences of the same length. /// /// Vector 1 /// Vector 2 /// The Hamming Distance. static int HammingDistance(IEnumerable v1, IEnumerable v2) { return v1.Zip(v2, (x, y) => x.Equals(y) ? 0 : 1).Sum(); } /// /// Locates the value with the smallest Hamming Distance from the query. /// /// The query string. /// The distance between the query string and the stored string. /// The best match, or null (default). public T FindMatch(string query, out int distance) { string root = StripDigits(query); List> list; if (!index.TryGetValue(root, out list)) { distance = 0; return default(T); } return BestMatch(query, list, (a, b) => HammingDistance(ExtractIntegers(a), ExtractIntegers(b)), out distance); } /// /// Returns the best match (with the smallest distance) for a query. /// /// The query string. /// The list of candidate matches. /// The distance function. /// The distance between the query and the stored string. /// The stored value. static T BestMatch(string query, IEnumerable> candidates, Func distance, out int bestDistance) { T bestMatch = default(T); bestDistance = 0; bool first = true; foreach (var candidate in candidates) { int d = distance(query, candidate.Key); if (d == 0) return candidate.Value; if (first || d < bestDistance) { bestDistance = d; bestMatch = candidate.Value; first = false; } } return bestMatch; } /// /// Removes all digits from a string. /// /// The input string. /// String with digits removed. static string StripDigits(string input) { StringBuilder result = new StringBuilder(); foreach (char c in input.Where(c => !char.IsDigit(c))) result.Append(c); return result.ToString(); } /// /// Extracts and enumerates all non-negative integers in a string. /// /// The string to enumerate. /// The sequence of integers. public static IEnumerable ExtractIntegers(string input) { bool inNumber = false; int value = 0; foreach (char c in input) { if (char.IsDigit(c)) { if (inNumber) { value = value * 10 + (c - '0'); } else { inNumber = true; value = c - '0'; } } else { if (inNumber) { yield return value; inNumber = false; } } } } } }