From 5f94819322df6fc54d35388736be4caaccb68856 Mon Sep 17 00:00:00 2001 From: Oskari Timperi Date: Mon, 7 Oct 2019 22:15:50 +0300 Subject: Initial commit --- src/nimlevenshtein.nim | 364 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 364 insertions(+) create mode 100644 src/nimlevenshtein.nim (limited to 'src/nimlevenshtein.nim') diff --git a/src/nimlevenshtein.nim b/src/nimlevenshtein.nim new file mode 100644 index 0000000..37f715e --- /dev/null +++ b/src/nimlevenshtein.nim @@ -0,0 +1,364 @@ +## A module for fast computation of +## +## - Levenshtein (edit) distance and edit sequence manipulation +## - string similarity +## - approximate median strings, and generally string averaging +## - string sequence and set similarity +## +## This module uses the standalone version C API of the python-Levenshtein library. So no Python +## required. +## +## The API reflects the API of the python-Levenshtein library. If you want to have a lower level +## interface, you can import ``nimlevenshtein/wrapper``, which gives you nimified wrapper functions +## over the C API functions. And if you want to go more lower level, you can use +## ``nimlevenshtein/binding``. + +import nimlevenshtein/wrapper + +export EditOp +export OpCode +export EditType +export MatchingBlock + +proc distance*(a, b: string): int = + ## Compute absolute Levenshtein distance of two strings. + runnableExamples: + assert distance("Levenshtein", "Lenvinsten") == 4 + assert distance("Levenshtein", "Levensthein") == 2 + assert distance("Levenshtein", "Levenshten") == 1 + assert distance("Levenshtein", "Levenshtein") == 0 + distance(a, b, 0) + +proc ratio*(a, b: string): float = + ## Compute similarity of two strings. + ## + ## The similarity is a number between 0 and 1, it's usually equal or somewhat higher than + ## Python's ``difflib.SequenceMatcher.ratio()``, because it's based on real minimal edit + ## distance. + runnableExamples: + from math import round + assert round(ratio("Hello world!", "Holly grail!"), 4) == 0.5833 + assert ratio("Brian", "Jesus") == 0.0 + let lensum = len(a) + len(b) + if lensum == 0: + return 1.0 + let ldist = distance(a, b, 1) + result = (lensum - ldist).float / lensum.float + +proc hamming*(a, b: string): int = + ## Compute Hamming distance of two strings. + ## + ## The Hamming distance is simply the number of differing characters. That means the length of + ## the strings must be the same. + runnableExamples: + assert hamming("Hello world!", "Holly grail!") == 7 + assert hamming("Brian", "Jesus") == 5 + if len(a) != len(b): + raise newException(Exception, "expected two strings of the same length") + hammingDistance(a, b) + +proc jaro*(a, b: string): float = + ## Compute Jaro string similarity metric of two strings. + ## + ## The Jaro string similarity metric is intended for short strings like personal last names. It + ## is 0 for completely different strings and 1 for identical strings. + runnableExamples: + from math import round + assert jaro("Brian", "Jesus") == 0.0 + assert round(jaro("Thorkel", "Thorgier"), 4) == 0.7798 + assert round(jaro("Dinsdale", "D"), 4) == 0.7083 + jaroRatio(a, b) + +proc jaroWinkler*(a, b: string, prefixWeight: float = 0.1): float = + ## Compute Jaro string similarity metric of two strings. + ## + ## The Jaro-Winkler string similarity metric is a modification of Jaro metric giving more weight + ## to common prefix, as spelling mistakes are more likely to occur near ends of words. + ## + ## The prefix weight is inverse value of common prefix length sufficient to consider the strings + ## *identical*. If no prefix weight is specified, 1/10 is used. + runnableExamples: + from math import round + assert jaroWinkler("Brian", "Jesus") == 0.0 + assert round(jaroWinkler("Thorkel", "Thorgier"), 4) == 0.8679 + assert round(jaroWinkler("Dinsdale", "D"), 4) == 0.7375 + assert jaroWinkler("Thorkel", "Thorgier", 0.25) == 1.0 + if prefixWeight < 0: + raise newException(Exception, "negative prefix weight") + jaroWinklerRatio(a, b, prefixWeight) + +template medianCommon(f: typed, strings: seq[string], weights: seq[float] = @[]): string = + if len(strings) == 0: + return "" + + var wl = weights + if len(wl) == 0: + setLen(wl, len(strings)) + for i in 0..