## A module for fast computation of ## ## - Levenshtein (edit) distance and edit sequence manipulation ## - string similarity ## - approximate median strings, and generally string averaging ## - string sequence and set similarity ## ## This module uses the standalone version C API of the python-Levenshtein library. So no Python ## required. ## ## The API reflects the API of the python-Levenshtein library. If you want to have a lower level ## interface, you can import ``nimlevenshtein/wrapper``, which gives you nimified wrapper functions ## over the C API functions. And if you want to go more lower level, you can use ## ``nimlevenshtein/binding``. import nimlevenshtein/wrapper export EditOp export OpCode export EditType export MatchingBlock proc distance*(a, b: string): int = ## Compute absolute Levenshtein distance of two strings. runnableExamples: assert distance("Levenshtein", "Lenvinsten") == 4 assert distance("Levenshtein", "Levensthein") == 2 assert distance("Levenshtein", "Levenshten") == 1 assert distance("Levenshtein", "Levenshtein") == 0 distance(a, b, 0) proc ratio*(a, b: string): float = ## Compute similarity of two strings. ## ## The similarity is a number between 0 and 1, it's usually equal or somewhat higher than ## Python's ``difflib.SequenceMatcher.ratio()``, because it's based on real minimal edit ## distance. runnableExamples: from math import round assert round(ratio("Hello world!", "Holly grail!"), 4) == 0.5833 assert ratio("Brian", "Jesus") == 0.0 let lensum = len(a) + len(b) if lensum == 0: return 1.0 let ldist = distance(a, b, 1) result = (lensum - ldist).float / lensum.float proc hamming*(a, b: string): int = ## Compute Hamming distance of two strings. ## ## The Hamming distance is simply the number of differing characters. That means the length of ## the strings must be the same. runnableExamples: assert hamming("Hello world!", "Holly grail!") == 7 assert hamming("Brian", "Jesus") == 5 if len(a) != len(b): raise newException(Exception, "expected two strings of the same length") hammingDistance(a, b) proc jaro*(a, b: string): float = ## Compute Jaro string similarity metric of two strings. ## ## The Jaro string similarity metric is intended for short strings like personal last names. It ## is 0 for completely different strings and 1 for identical strings. runnableExamples: from math import round assert jaro("Brian", "Jesus") == 0.0 assert round(jaro("Thorkel", "Thorgier"), 4) == 0.7798 assert round(jaro("Dinsdale", "D"), 4) == 0.7083 jaroRatio(a, b) proc jaroWinkler*(a, b: string, prefixWeight: float = 0.1): float = ## Compute Jaro string similarity metric of two strings. ## ## The Jaro-Winkler string similarity metric is a modification of Jaro metric giving more weight ## to common prefix, as spelling mistakes are more likely to occur near ends of words. ## ## The prefix weight is inverse value of common prefix length sufficient to consider the strings ## *identical*. If no prefix weight is specified, 1/10 is used. runnableExamples: from math import round assert jaroWinkler("Brian", "Jesus") == 0.0 assert round(jaroWinkler("Thorkel", "Thorgier"), 4) == 0.8679 assert round(jaroWinkler("Dinsdale", "D"), 4) == 0.7375 assert jaroWinkler("Thorkel", "Thorgier", 0.25) == 1.0 if prefixWeight < 0: raise newException(Exception, "negative prefix weight") jaroWinklerRatio(a, b, prefixWeight) template medianCommon(f: typed, strings: seq[string], weights: seq[float] = @[]): string = if len(strings) == 0: return "" var wl = weights if len(wl) == 0: setLen(wl, len(strings)) for i in 0..