using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
namespace Semmle.Util
{
///
/// A dictionary from strings to elements of type T.
///
///
///
/// This data structure is able to locate items based on an "approximate match"
/// of the key. This is used for example when attempting to identify two terms
/// in different trap files which are similar but not identical.
///
/// The algorithm locates the closest match to a string based on a "distance function".
///
/// Whilst many distance functions are possible, a bespoke algorithm is used here,
/// for efficiency and suitablility for the domain.
///
/// The distance is defined as the Hamming Distance of the numbers in the string.
/// Each string is split into the base "form" (stripped of numbers) and a vector of
/// numbers. (Numbers are non-negative integers in this context).
///
/// Strings with a different "form" are considered different and have a distance
/// of infinity.
///
/// This distance function is reflexive, symmetric and obeys the triangle inequality.
///
/// E.g. foo(bar,1,2) has form "foo(bar,,)" and integers {1,2}
///
/// distance(foo(bar,1,2), foo(bar,1,2)) = 0
/// distance(foo(bar,1,2), foo(bar,1,3)) = 1
/// distance(foo(bar,2,1), foo(bar,1,2)) = 2
/// distance(foo(bar,1,2), foo(baz,1,2)) = infinity
///
///
/// The value type.
public class FuzzyDictionary
{
// All data items indexed by the "base string" (stripped of numbers)
readonly Dictionary>> index = new Dictionary>>();
///
/// Stores a new KeyValuePair in the data structure.
///
/// The key.
/// The value.
public void Add(string k, T v)
{
var kv = new KeyValuePair(k, v);
string root = StripDigits(k);
index.AddAnother(root, kv);
}
///
/// Computes the Hamming Distance between two sequences of the same length.
///
/// Vector 1
/// Vector 2
/// The Hamming Distance.
static int HammingDistance(IEnumerable v1, IEnumerable v2)
{
return v1.Zip(v2, (x, y) => x.Equals(y) ? 0 : 1).Sum();
}
///
/// Locates the value with the smallest Hamming Distance from the query.
///
/// The query string.
/// The distance between the query string and the stored string.
/// The best match, or null (default).
public T FindMatch(string query, out int distance)
{
string root = StripDigits(query);
List> list;
if (!index.TryGetValue(root, out list))
{
distance = 0;
return default(T);
}
return BestMatch(query, list, (a, b) => HammingDistance(ExtractIntegers(a), ExtractIntegers(b)), out distance);
}
///
/// Returns the best match (with the smallest distance) for a query.
///
/// The query string.
/// The list of candidate matches.
/// The distance function.
/// The distance between the query and the stored string.
/// The stored value.
static T BestMatch(string query, IEnumerable> candidates, Func distance, out int bestDistance)
{
T bestMatch = default(T);
bestDistance = 0;
bool first = true;
foreach (var candidate in candidates)
{
int d = distance(query, candidate.Key);
if (d == 0) return candidate.Value;
if (first || d < bestDistance)
{
bestDistance = d;
bestMatch = candidate.Value;
first = false;
}
}
return bestMatch;
}
///
/// Removes all digits from a string.
///
/// The input string.
/// String with digits removed.
static string StripDigits(string input)
{
StringBuilder result = new StringBuilder();
foreach (char c in input.Where(c => !char.IsDigit(c)))
result.Append(c);
return result.ToString();
}
///
/// Extracts and enumerates all non-negative integers in a string.
///
/// The string to enumerate.
/// The sequence of integers.
public static IEnumerable ExtractIntegers(string input)
{
bool inNumber = false;
int value = 0;
foreach (char c in input)
{
if (char.IsDigit(c))
{
if (inNumber)
{
value = value * 10 + (c - '0');
}
else
{
inNumber = true;
value = c - '0';
}
}
else
{
if (inNumber)
{
yield return value;
inNumber = false;
}
}
}
}
}
}