src/nimlevenshtein.nim


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364

## A module for fast computation of
##
## - Levenshtein (edit) distance and edit sequence manipulation
## - string similarity
## - approximate median strings, and generally string averaging
## - string sequence and set similarity
##
## This module uses the standalone version C API of the python-Levenshtein library. So no Python
## required.
##
## The API reflects the API of the python-Levenshtein library. If you want to have a lower level
## interface, you can import ``nimlevenshtein/wrapper``, which gives you nimified wrapper functions
## over the C API functions. And if you want to go more lower level, you can use
## ``nimlevenshtein/binding``.

import nimlevenshtein/wrapper

export EditOp
export OpCode
export EditType
export MatchingBlock

proc distance*(a, b: string): int =
    ## Compute absolute Levenshtein distance of two strings.
    runnableExamples:
        assert distance("Levenshtein", "Lenvinsten") == 4
        assert distance("Levenshtein", "Levensthein") == 2
        assert distance("Levenshtein", "Levenshten") == 1
        assert distance("Levenshtein", "Levenshtein") == 0
    distance(a, b, 0)

proc ratio*(a, b: string): float =
    ## Compute similarity of two strings.
    ##
    ## The similarity is a number between 0 and 1, it's usually equal or somewhat higher than
    ## Python's ``difflib.SequenceMatcher.ratio()``, because it's based on real minimal edit
    ## distance.
    runnableExamples:
        from math import round
        assert round(ratio("Hello world!", "Holly grail!"), 4) == 0.5833
        assert ratio("Brian", "Jesus") == 0.0
    let lensum = len(a) + len(b)
    if lensum == 0:
        return 1.0
    let ldist = distance(a, b, 1)
    result = (lensum - ldist).float / lensum.float

proc hamming*(a, b: string): int =
    ## Compute Hamming distance of two strings.
    ##
    ## The Hamming distance is simply the number of differing characters. That means the length of
    ## the strings must be the same.
    runnableExamples:
        assert hamming("Hello world!", "Holly grail!") == 7
        assert hamming("Brian", "Jesus") == 5
    if len(a) != len(b):
        raise newException(Exception, "expected two strings of the same length")
    hammingDistance(a, b)

proc jaro*(a, b: string): float =
    ## Compute Jaro string similarity metric of two strings.
    ##
    ## The Jaro string similarity metric is intended for short strings like personal last names. It
    ## is 0 for completely different strings and 1 for identical strings.
    runnableExamples:
        from math import round
        assert jaro("Brian", "Jesus") == 0.0
        assert round(jaro("Thorkel", "Thorgier"), 4) == 0.7798
        assert round(jaro("Dinsdale", "D"), 4) == 0.7083
    jaroRatio(a, b)

proc jaroWinkler*(a, b: string, prefixWeight: float = 0.1): float =
    ## Compute Jaro string similarity metric of two strings.
    ##
    ## The Jaro-Winkler string similarity metric is a modification of Jaro metric giving more weight
    ## to common prefix, as spelling mistakes are more likely to occur near ends of words.
    ##
    ## The prefix weight is inverse value of common prefix length sufficient to consider the strings
    ## *identical*. If no prefix weight is specified, 1/10 is used.
    runnableExamples:
        from math import round
        assert jaroWinkler("Brian", "Jesus") == 0.0
        assert round(jaroWinkler("Thorkel", "Thorgier"), 4) == 0.8679
        assert round(jaroWinkler("Dinsdale", "D"), 4) == 0.7375
        assert jaroWinkler("Thorkel", "Thorgier", 0.25) == 1.0
    if prefixWeight < 0:
        raise newException(Exception, "negative prefix weight")
    jaroWinklerRatio(a, b, prefixWeight)

template medianCommon(f: typed, strings: seq[string], weights: seq[float] = @[]): string =
    if len(strings) == 0:
        return ""

    var wl = weights
    if len(wl) == 0:
        setLen(wl, len(strings))
        for i in 0..<len(wl):
            wl[i] = 1.0
    else:
        if len(strings) != len(weights):
            raise newException(Exception, "expected same amount of strings and weights")

    f(strings, wl)

proc median*(strings: seq[string], weights: seq[float] = @[]): string =
    ## Find an approximate generalized median string using greedy algorithm.
    ##
    ## You can optionally pass a weight for each string as the second argument. The weights are
    ## interpreted as item multiplicities, although any non-negative real numbers are accepted. Use
    ## them to improve computation speed when strings often appear multiple times in the sequence.
    runnableExamples:
        assert median(@["SpSm", "mpamm", "Spam", "Spa", "Sua", "hSam"]) == "Spam"
        let fixme = @["Levnhtein", "Leveshein", "Leenshten",
                      "Leveshtei", "Lenshtein", "Lvenstein",
                      "Levenhtin", "evenshtei"]
        assert median(fixme) == "Levenshtein"
    medianCommon(greedyMedian, strings, weights)

proc medianImprove*(s: string, strings: seq[string], weights: seq[float] = @[]): string =
    ## Improve an approximate generalized median string by perturbations.
    ##
    ## The first argument is the estimated generalized median string you want to improve, the others
    ## are the same as in `median()`. It returns a string with total distance less or equal to that
    ## of the given string.
    ##
    ## Note this is much slower than `median()`. Also note it performs only one improvement step,
    ## calling `medianImprove()` again on the result may improve it further, though this is unlikely
    ## to happen unless the given string was not very similar to the actual generalized median.
    runnableExamples:
        let fixme = @["Levnhtein", "Leveshein", "Leenshten",
                      "Leveshtei", "Lenshtein", "Lvenstein",
                      "Levenhtin", "evenshtei"]
        assert medianImprove("spam", fixme) == "enhtein"
        assert medianImprove(medianImprove("spam", fixme), fixme) == "Levenshtein"
    if len(strings) == 0:
        return ""

    var wl = weights
    if len(wl) == 0:
        setLen(wl, len(strings))
        for i in 0..<len(wl):
            wl[i] = 1.0
    else:
        if len(strings) != len(weights):
            raise newException(Exception, "expected same amount of strings and weights")

    wrapper.medianImprove(s, strings, wl)

proc quickmedian*(strings: seq[string], weights: seq[float] = @[]): string =
    ## Find a very approximate generalized median string, but fast.
    ##
    ## See `median()` for argument description.
    ##
    ## This method is somewhere between `setmedian()` and picking a random string from the set; both
    ## speedwise and quality-wise.
    runnableExamples:
        let fixme = @["Levnhtein", "Leveshein", "Leenshten",
                      "Leveshtei", "Lenshtein", "Lvenstein",
                      "Levenhtin", "evenshtei"]
        assert quickmedian(fixme) == "Levnshein"
    medianCommon(wrapper.quickMedian, strings, weights)

proc setmedian*(strings: seq[string], weights: seq[float] = @[]): string =
    ## Find set median of a string set (passed as a sequence).
    ##
    ## See `median()` for argument description.
    ##
    ## The returned string is always one of the strings in the sequence.
    runnableExamples:
        assert setmedian(@["ehee", "cceaes", "chees", "chreesc",
                           "chees", "cheesee", "cseese", "chetese"]) == "chees"
    medianCommon(wrapper.setMedian, strings, weights)

template setSeqCommon(f: typed, a: seq[string], b: seq[string]): float =
    let lensum = len(a) + len(b)

    if lensum == 0:
        return 1.0

    if len(a) == 0 or len(b) == 0:
        return 0.0

    let r = f(a, b)

    if r < 0.0:
        raise newException(Exception, "failed")

    (lensum.float - r) / lensum.float

proc seqratio*(a: seq[string], b: seq[string]): float =
    ## Compute similarity ratio of two sequences of strings.
    ##
    ## This is like `ratio()`, but for string sequences. A kind of `ratio()` is used to to measure
    ## the cost of item change operation for the strings.
    runnableExamples:
        from math import round
        assert round(seqratio(@["newspaper", "litter bin", "tinny", "antelope"],
            @["caribou", "sausage", "gorn", "woody"]), 4) == 0.2152
        assert seqratio(@[], @["foobar"]) == 0.0
        assert seqratio(@["foobar"], @[]) == 0.0
    setSeqCommon(editSeqDistance, a, b)

proc setratio*(a: seq[string], b: seq[string]): float =
    ## Compute similarity ratio of two strings sets (passed as sequences).
    ##
    ## The best match between any strings in the first set and the second set (passed as sequences)
    ## is attempted. I.e., the order doesn't matter here.
    runnableExamples:
        from math import round
        assert round(setratio(@["newspaper", "litter bin", "tinny", "antelope"],
            @["caribou", "sausage", "gorn", "woody"]), 4) == 0.2818
        assert setratio(@[], @["foobar"]) == 0.0
        assert setratio(@["foobar"], @[]) == 0.0
    setSeqCommon(setDistance, a, b)

proc editops*(string1: string, string2: string): seq[EditOp] =
    ## Find sequence of edit operations transforming one string to another.
    ##
    ## The result is a sequence of `EditOp` objects. These are operations on single characters.
    ## In fact the returned sequence doesn't contain the *equal*, but all the related functions
    ## accept both lists with and without *equals*.
    runnableExamples:
        let ops = editops("spam", "park")
        assert ops[0] == EditOp(`type`: EditType.Delete, spos: 0, dpos: 0)
        assert ops[1] == EditOp(`type`: EditType.Insert, spos: 3, dpos: 2)
        assert ops[2] == EditOp(`type`: EditType.Replace, spos: 3, dpos: 3)
    wrapper.editops(string1, string2)

proc editops*(ops: seq[OpCode], aLen: int, bLen: int): seq[EditOp] =
    ## Can be used for conversion from `OpCode` to `EditOp`. You can either pass in strings or
    ## their lengths, the result is the same.
    checkErrors(aLen, bLen, ops)
    toEditOps(ops, false)

proc editops*(ops: seq[OpCode], a: string, b: string): seq[EditOp] =
    ## Can be used for conversion from `OpCode` to `EditOp`. You can either pass in strings or
    ## their lengths, the result is the same.
    editops(ops, len(a), len(b))

proc opcodes*(a, b: string): seq[OpCode] =
    ## Find sequence of edit operations transforming one string to another.
    ##
    ## The result is a sequence of `OpCode` objects.
    runnableExamples:
        let ops = opcodes("spam", "park")
        assert ops[0] == OpCode(`type`: EditType.Delete, sbeg: 0, send: 1, dbeg: 0, dend: 0)
        assert ops[1] == OpCode(`type`: EditType.Keep, sbeg: 1, send: 3, dbeg: 0, dend: 2)
        assert ops[2] == OpCode(`type`: EditType.Insert, sbeg: 3, send: 3, dbeg: 2, dend: 3)
        assert ops[3] == OpCode(`type`: EditType.Replace, sbeg: 3, send: 4, dbeg: 3, dend: 4)
    editops(a, b).toOpCodes(len(a), len(b))

proc opcodes*(ops: seq[EditOp], aLen: int, bLen: int): seq[OpCode] =
    ## Can be used for conversion from `EditOp` to `OpCode`. You can either pass in strings or
    ## their lengths, the result is the same.
    checkErrors(aLen, bLen, ops)
    ops.toOpCodes(aLen, bLen)

proc opcodes*(ops: seq[EditOp], a: string, b: string): seq[OpCode] =
    ## Can be used for conversion from `EditOp` to `OpCode`. You can either pass in strings or
    ## their lengths, the result is the same.
    opcodes(ops, len(a), len(b))

proc inverse*(ops: seq[EditOp]): seq[EditOp] =
    ## Invert the sense of an edit operation sequence.
    ##
    ## In other words, it returns a sequence of edit operations transforming the second
    ## (destination) string to the first (source). It can be used with both editops and opcodes.
    runnableExamples:
        let inv = inverse(editops("spam", "park"))
        assert inv[0] == EditOp(`type`: EditType.Insert, spos: 0, dpos: 0)
        assert inv[1] == EditOp(`type`: EditType.Delete, spos: 2, dpos: 3)
        assert inv[2] == EditOp(`type`: EditType.Replace, spos: 3, dpos: 3)
        assert editops("park", "spam") == inv
    result = ops
    result.invert()

proc inverse*(ops: seq[OpCode]): seq[OpCode] =
    ## Invert the sense of an edit operation sequence.
    ##
    ## In other words, it returns a sequence of edit operations transforming the second
    ## (destination) string to the first (source). It can be used with both editops and opcodes.
    result = ops
    result.invert()

proc applyEdit*(ops: seq[EditOp], a: string, b:string): string =
    ## Apply a sequence of edit operations to a string.
    ##
    ## In the case of editops, the sequence can be arbitrary ordered subset of the edit sequence
    ## transforming source string to destination string.
    runnableExamples:
        let ops = editops("man", "scotsman")
        assert applyEdit(ops, "man", "scotsman") == "scotsman"
        assert applyEdit(ops[0..2], "man", "scotsman") == "scoman"
    if len(ops) == 0:
        return a
    checkErrors(len(a), len(b), ops)
    apply(a, b, ops)

proc applyEdit*(ops: seq[OpCode], a: string, b:string): string =
    if len(ops) == 0:
        return a
    checkErrors(len(a), len(b), ops)
    apply(a, b, ops)

template matchingBlocksCommon(ops: typed, aLen: int, bLen: int) =
    checkErrors(aLen, bLen, ops)
    result = matchingBlocks(aLen, bLen, ops)
    result.add(MatchingBlock(spos: aLen, dpos: bLen, len: 0))

proc matchingBlocks*(ops: seq[EditOp], aLen: int, bLen: int): seq[MatchingBlock] =
    ## Find identical blocks in two strings.
    ##
    ## The result is a sequence of `MatchingBlock` objects. It can be used with both editops and
    ## opcodes. The second and third arguments don't have to be actually strings, their lengths are
    ## enough.
    runnableExamples:
        let (a, b) = ("spam", "park")
        let mb = matchingBlocks(editops(a, b), a, b)
        assert mb[0] == MatchingBlock(spos: 1, dpos: 0, `len`: 2)
        ## The last zero-length block is not an error, but it's there for compatibility with Pythons
        ## `difflib` which always emits it.
        assert mb[1] == MatchingBlock(spos: 4, dpos: 4, `len`: 0)
        assert mb == matchingBlocks(editops(a, b), len(a), len(b))
    runnableExamples:
        ## One can join the matching blocks to get two identical strings:
        from sequtils import map
        from strutils import join
        let (a, b) = ("dog kennels", "mattresses")
        let mb = matchingBlocks(editops(a, b), a, b)
        assert join(map(mb, proc (x: MatchingBlock): string =
            a[x.spos ..< x.spos + x.len])) == "ees"
        assert join(map(mb, proc (x: MatchingBlock): string =
            b[x.dpos ..< x.dpos + x.len])) == "ees"
    matchingBlocksCommon(ops, aLen, bLen)

proc matchingBlocks*(ops: seq[EditOp], a: string, b: string): seq[MatchingBlock] =
    matchingBlocksCommon(ops, len(a), len(b))

proc matchingBlocks*(ops: seq[OpCode], aLen: int, bLen: int): seq[MatchingBlock] =
    matchingBlocksCommon(ops, aLen, bLen)

proc matchingBlocks*(ops: seq[OpCode], a: string, b: string): seq[MatchingBlock] =
    matchingBlocksCommon(ops, len(a), len(b))

proc subtractEdit*(ops: seq[EditOp], subsequence: seq[EditOp]): seq[EditOp] =
    ## Subtract an edit subsequence from a sequence.
    ##
    ## The result is equivalent to ``editops(applyEdit(subsequence, s1, s2), s2)``, except that is
    ## constructed directly from the edit operations. That is, if you apply it to the result of
    ## subsequence application, you get the same final string as from application of complete `ops`.
    ## It may be not identical, though (in ambiguous cases, like insertion of a character next to
    ## the same character).
    ##
    ## The subtracted subsequence must be an ordered subset of `ops`.
    ##
    ## Note this function does not accept difflib-style opcodes as no one in his right mind wants to
    ## create subsequences from them.
    runnableExamples:
        let e = editops("man", "scotsman")
        let e1 = e[0..2]
        let bastard = applyEdit(e1, "man", "scotsman")
        assert bastard == "scoman"
        assert applyEdit(subtractEdit(e, e1), bastard, "scotsman") == "scotsman"
    subtract(ops, subsequence)