diff options
Diffstat (limited to 'src/nimlevenshtein.nim')
| -rw-r--r-- | src/nimlevenshtein.nim | 364 |
1 files changed, 364 insertions, 0 deletions
diff --git a/src/nimlevenshtein.nim b/src/nimlevenshtein.nim new file mode 100644 index 0000000..37f715e --- /dev/null +++ b/src/nimlevenshtein.nim @@ -0,0 +1,364 @@ +## A module for fast computation of +## +## - Levenshtein (edit) distance and edit sequence manipulation +## - string similarity +## - approximate median strings, and generally string averaging +## - string sequence and set similarity +## +## This module uses the standalone version C API of the python-Levenshtein library. So no Python +## required. +## +## The API reflects the API of the python-Levenshtein library. If you want to have a lower level +## interface, you can import ``nimlevenshtein/wrapper``, which gives you nimified wrapper functions +## over the C API functions. And if you want to go more lower level, you can use +## ``nimlevenshtein/binding``. + +import nimlevenshtein/wrapper + +export EditOp +export OpCode +export EditType +export MatchingBlock + +proc distance*(a, b: string): int = + ## Compute absolute Levenshtein distance of two strings. + runnableExamples: + assert distance("Levenshtein", "Lenvinsten") == 4 + assert distance("Levenshtein", "Levensthein") == 2 + assert distance("Levenshtein", "Levenshten") == 1 + assert distance("Levenshtein", "Levenshtein") == 0 + distance(a, b, 0) + +proc ratio*(a, b: string): float = + ## Compute similarity of two strings. + ## + ## The similarity is a number between 0 and 1, it's usually equal or somewhat higher than + ## Python's ``difflib.SequenceMatcher.ratio()``, because it's based on real minimal edit + ## distance. + runnableExamples: + from math import round + assert round(ratio("Hello world!", "Holly grail!"), 4) == 0.5833 + assert ratio("Brian", "Jesus") == 0.0 + let lensum = len(a) + len(b) + if lensum == 0: + return 1.0 + let ldist = distance(a, b, 1) + result = (lensum - ldist).float / lensum.float + +proc hamming*(a, b: string): int = + ## Compute Hamming distance of two strings. + ## + ## The Hamming distance is simply the number of differing characters. That means the length of + ## the strings must be the same. + runnableExamples: + assert hamming("Hello world!", "Holly grail!") == 7 + assert hamming("Brian", "Jesus") == 5 + if len(a) != len(b): + raise newException(Exception, "expected two strings of the same length") + hammingDistance(a, b) + +proc jaro*(a, b: string): float = + ## Compute Jaro string similarity metric of two strings. + ## + ## The Jaro string similarity metric is intended for short strings like personal last names. It + ## is 0 for completely different strings and 1 for identical strings. + runnableExamples: + from math import round + assert jaro("Brian", "Jesus") == 0.0 + assert round(jaro("Thorkel", "Thorgier"), 4) == 0.7798 + assert round(jaro("Dinsdale", "D"), 4) == 0.7083 + jaroRatio(a, b) + +proc jaroWinkler*(a, b: string, prefixWeight: float = 0.1): float = + ## Compute Jaro string similarity metric of two strings. + ## + ## The Jaro-Winkler string similarity metric is a modification of Jaro metric giving more weight + ## to common prefix, as spelling mistakes are more likely to occur near ends of words. + ## + ## The prefix weight is inverse value of common prefix length sufficient to consider the strings + ## *identical*. If no prefix weight is specified, 1/10 is used. + runnableExamples: + from math import round + assert jaroWinkler("Brian", "Jesus") == 0.0 + assert round(jaroWinkler("Thorkel", "Thorgier"), 4) == 0.8679 + assert round(jaroWinkler("Dinsdale", "D"), 4) == 0.7375 + assert jaroWinkler("Thorkel", "Thorgier", 0.25) == 1.0 + if prefixWeight < 0: + raise newException(Exception, "negative prefix weight") + jaroWinklerRatio(a, b, prefixWeight) + +template medianCommon(f: typed, strings: seq[string], weights: seq[float] = @[]): string = + if len(strings) == 0: + return "" + + var wl = weights + if len(wl) == 0: + setLen(wl, len(strings)) + for i in 0..<len(wl): + wl[i] = 1.0 + else: + if len(strings) != len(weights): + raise newException(Exception, "expected same amount of strings and weights") + + f(strings, wl) + +proc median*(strings: seq[string], weights: seq[float] = @[]): string = + ## Find an approximate generalized median string using greedy algorithm. + ## + ## You can optionally pass a weight for each string as the second argument. The weights are + ## interpreted as item multiplicities, although any non-negative real numbers are accepted. Use + ## them to improve computation speed when strings often appear multiple times in the sequence. + runnableExamples: + assert median(@["SpSm", "mpamm", "Spam", "Spa", "Sua", "hSam"]) == "Spam" + let fixme = @["Levnhtein", "Leveshein", "Leenshten", + "Leveshtei", "Lenshtein", "Lvenstein", + "Levenhtin", "evenshtei"] + assert median(fixme) == "Levenshtein" + medianCommon(greedyMedian, strings, weights) + +proc medianImprove*(s: string, strings: seq[string], weights: seq[float] = @[]): string = + ## Improve an approximate generalized median string by perturbations. + ## + ## The first argument is the estimated generalized median string you want to improve, the others + ## are the same as in `median()`. It returns a string with total distance less or equal to that + ## of the given string. + ## + ## Note this is much slower than `median()`. Also note it performs only one improvement step, + ## calling `medianImprove()` again on the result may improve it further, though this is unlikely + ## to happen unless the given string was not very similar to the actual generalized median. + runnableExamples: + let fixme = @["Levnhtein", "Leveshein", "Leenshten", + "Leveshtei", "Lenshtein", "Lvenstein", + "Levenhtin", "evenshtei"] + assert medianImprove("spam", fixme) == "enhtein" + assert medianImprove(medianImprove("spam", fixme), fixme) == "Levenshtein" + if len(strings) == 0: + return "" + + var wl = weights + if len(wl) == 0: + setLen(wl, len(strings)) + for i in 0..<len(wl): + wl[i] = 1.0 + else: + if len(strings) != len(weights): + raise newException(Exception, "expected same amount of strings and weights") + + wrapper.medianImprove(s, strings, wl) + +proc quickmedian*(strings: seq[string], weights: seq[float] = @[]): string = + ## Find a very approximate generalized median string, but fast. + ## + ## See `median()` for argument description. + ## + ## This method is somewhere between `setmedian()` and picking a random string from the set; both + ## speedwise and quality-wise. + runnableExamples: + let fixme = @["Levnhtein", "Leveshein", "Leenshten", + "Leveshtei", "Lenshtein", "Lvenstein", + "Levenhtin", "evenshtei"] + assert quickmedian(fixme) == "Levnshein" + medianCommon(wrapper.quickMedian, strings, weights) + +proc setmedian*(strings: seq[string], weights: seq[float] = @[]): string = + ## Find set median of a string set (passed as a sequence). + ## + ## See `median()` for argument description. + ## + ## The returned string is always one of the strings in the sequence. + runnableExamples: + assert setmedian(@["ehee", "cceaes", "chees", "chreesc", + "chees", "cheesee", "cseese", "chetese"]) == "chees" + medianCommon(wrapper.setMedian, strings, weights) + +template setSeqCommon(f: typed, a: seq[string], b: seq[string]): float = + let lensum = len(a) + len(b) + + if lensum == 0: + return 1.0 + + if len(a) == 0 or len(b) == 0: + return 0.0 + + let r = f(a, b) + + if r < 0.0: + raise newException(Exception, "failed") + + (lensum.float - r) / lensum.float + +proc seqratio*(a: seq[string], b: seq[string]): float = + ## Compute similarity ratio of two sequences of strings. + ## + ## This is like `ratio()`, but for string sequences. A kind of `ratio()` is used to to measure + ## the cost of item change operation for the strings. + runnableExamples: + from math import round + assert round(seqratio(@["newspaper", "litter bin", "tinny", "antelope"], + @["caribou", "sausage", "gorn", "woody"]), 4) == 0.2152 + assert seqratio(@[], @["foobar"]) == 0.0 + assert seqratio(@["foobar"], @[]) == 0.0 + setSeqCommon(editSeqDistance, a, b) + +proc setratio*(a: seq[string], b: seq[string]): float = + ## Compute similarity ratio of two strings sets (passed as sequences). + ## + ## The best match between any strings in the first set and the second set (passed as sequences) + ## is attempted. I.e., the order doesn't matter here. + runnableExamples: + from math import round + assert round(setratio(@["newspaper", "litter bin", "tinny", "antelope"], + @["caribou", "sausage", "gorn", "woody"]), 4) == 0.2818 + assert setratio(@[], @["foobar"]) == 0.0 + assert setratio(@["foobar"], @[]) == 0.0 + setSeqCommon(setDistance, a, b) + +proc editops*(string1: string, string2: string): seq[EditOp] = + ## Find sequence of edit operations transforming one string to another. + ## + ## The result is a sequence of `EditOp` objects. These are operations on single characters. + ## In fact the returned sequence doesn't contain the *equal*, but all the related functions + ## accept both lists with and without *equals*. + runnableExamples: + let ops = editops("spam", "park") + assert ops[0] == EditOp(`type`: EditType.Delete, spos: 0, dpos: 0) + assert ops[1] == EditOp(`type`: EditType.Insert, spos: 3, dpos: 2) + assert ops[2] == EditOp(`type`: EditType.Replace, spos: 3, dpos: 3) + wrapper.editops(string1, string2) + +proc editops*(ops: seq[OpCode], aLen: int, bLen: int): seq[EditOp] = + ## Can be used for conversion from `OpCode` to `EditOp`. You can either pass in strings or + ## their lengths, the result is the same. + checkErrors(aLen, bLen, ops) + toEditOps(ops, false) + +proc editops*(ops: seq[OpCode], a: string, b: string): seq[EditOp] = + ## Can be used for conversion from `OpCode` to `EditOp`. You can either pass in strings or + ## their lengths, the result is the same. + editops(ops, len(a), len(b)) + +proc opcodes*(a, b: string): seq[OpCode] = + ## Find sequence of edit operations transforming one string to another. + ## + ## The result is a sequence of `OpCode` objects. + runnableExamples: + let ops = opcodes("spam", "park") + assert ops[0] == OpCode(`type`: EditType.Delete, sbeg: 0, send: 1, dbeg: 0, dend: 0) + assert ops[1] == OpCode(`type`: EditType.Keep, sbeg: 1, send: 3, dbeg: 0, dend: 2) + assert ops[2] == OpCode(`type`: EditType.Insert, sbeg: 3, send: 3, dbeg: 2, dend: 3) + assert ops[3] == OpCode(`type`: EditType.Replace, sbeg: 3, send: 4, dbeg: 3, dend: 4) + editops(a, b).toOpCodes(len(a), len(b)) + +proc opcodes*(ops: seq[EditOp], aLen: int, bLen: int): seq[OpCode] = + ## Can be used for conversion from `EditOp` to `OpCode`. You can either pass in strings or + ## their lengths, the result is the same. + checkErrors(aLen, bLen, ops) + ops.toOpCodes(aLen, bLen) + +proc opcodes*(ops: seq[EditOp], a: string, b: string): seq[OpCode] = + ## Can be used for conversion from `EditOp` to `OpCode`. You can either pass in strings or + ## their lengths, the result is the same. + opcodes(ops, len(a), len(b)) + +proc inverse*(ops: seq[EditOp]): seq[EditOp] = + ## Invert the sense of an edit operation sequence. + ## + ## In other words, it returns a sequence of edit operations transforming the second + ## (destination) string to the first (source). It can be used with both editops and opcodes. + runnableExamples: + let inv = inverse(editops("spam", "park")) + assert inv[0] == EditOp(`type`: EditType.Insert, spos: 0, dpos: 0) + assert inv[1] == EditOp(`type`: EditType.Delete, spos: 2, dpos: 3) + assert inv[2] == EditOp(`type`: EditType.Replace, spos: 3, dpos: 3) + assert editops("park", "spam") == inv + result = ops + result.invert() + +proc inverse*(ops: seq[OpCode]): seq[OpCode] = + ## Invert the sense of an edit operation sequence. + ## + ## In other words, it returns a sequence of edit operations transforming the second + ## (destination) string to the first (source). It can be used with both editops and opcodes. + result = ops + result.invert() + +proc applyEdit*(ops: seq[EditOp], a: string, b:string): string = + ## Apply a sequence of edit operations to a string. + ## + ## In the case of editops, the sequence can be arbitrary ordered subset of the edit sequence + ## transforming source string to destination string. + runnableExamples: + let ops = editops("man", "scotsman") + assert applyEdit(ops, "man", "scotsman") == "scotsman" + assert applyEdit(ops[0..2], "man", "scotsman") == "scoman" + if len(ops) == 0: + return a + checkErrors(len(a), len(b), ops) + apply(a, b, ops) + +proc applyEdit*(ops: seq[OpCode], a: string, b:string): string = + if len(ops) == 0: + return a + checkErrors(len(a), len(b), ops) + apply(a, b, ops) + +template matchingBlocksCommon(ops: typed, aLen: int, bLen: int) = + checkErrors(aLen, bLen, ops) + result = matchingBlocks(aLen, bLen, ops) + result.add(MatchingBlock(spos: aLen, dpos: bLen, len: 0)) + +proc matchingBlocks*(ops: seq[EditOp], aLen: int, bLen: int): seq[MatchingBlock] = + ## Find identical blocks in two strings. + ## + ## The result is a sequence of `MatchingBlock` objects. It can be used with both editops and + ## opcodes. The second and third arguments don't have to be actually strings, their lengths are + ## enough. + runnableExamples: + let (a, b) = ("spam", "park") + let mb = matchingBlocks(editops(a, b), a, b) + assert mb[0] == MatchingBlock(spos: 1, dpos: 0, `len`: 2) + ## The last zero-length block is not an error, but it's there for compatibility with Pythons + ## `difflib` which always emits it. + assert mb[1] == MatchingBlock(spos: 4, dpos: 4, `len`: 0) + assert mb == matchingBlocks(editops(a, b), len(a), len(b)) + runnableExamples: + ## One can join the matching blocks to get two identical strings: + from sequtils import map + from strutils import join + let (a, b) = ("dog kennels", "mattresses") + let mb = matchingBlocks(editops(a, b), a, b) + assert join(map(mb, proc (x: MatchingBlock): string = + a[x.spos ..< x.spos + x.len])) == "ees" + assert join(map(mb, proc (x: MatchingBlock): string = + b[x.dpos ..< x.dpos + x.len])) == "ees" + matchingBlocksCommon(ops, aLen, bLen) + +proc matchingBlocks*(ops: seq[EditOp], a: string, b: string): seq[MatchingBlock] = + matchingBlocksCommon(ops, len(a), len(b)) + +proc matchingBlocks*(ops: seq[OpCode], aLen: int, bLen: int): seq[MatchingBlock] = + matchingBlocksCommon(ops, aLen, bLen) + +proc matchingBlocks*(ops: seq[OpCode], a: string, b: string): seq[MatchingBlock] = + matchingBlocksCommon(ops, len(a), len(b)) + +proc subtractEdit*(ops: seq[EditOp], subsequence: seq[EditOp]): seq[EditOp] = + ## Subtract an edit subsequence from a sequence. + ## + ## The result is equivalent to ``editops(applyEdit(subsequence, s1, s2), s2)``, except that is + ## constructed directly from the edit operations. That is, if you apply it to the result of + ## subsequence application, you get the same final string as from application of complete `ops`. + ## It may be not identical, though (in ambiguous cases, like insertion of a character next to + ## the same character). + ## + ## The subtracted subsequence must be an ordered subset of `ops`. + ## + ## Note this function does not accept difflib-style opcodes as no one in his right mind wants to + ## create subsequences from them. + runnableExamples: + let e = editops("man", "scotsman") + let e1 = e[0..2] + let bastard = applyEdit(e1, "man", "scotsman") + assert bastard == "scoman" + assert applyEdit(subtractEdit(e, e1), bastard, "scotsman") == "scotsman" + subtract(ops, subsequence) |
