aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorOskari Timperi <oskari.timperi@iki.fi>2019-10-07 22:15:50 +0300
committerOskari Timperi <oskari.timperi@iki.fi>2019-10-07 22:15:50 +0300
commit5f94819322df6fc54d35388736be4caaccb68856 (patch)
treed94ee0be7dc2f9274d8e50d2a8472684921ff65a
downloadnimlevenshtein-5f94819322df6fc54d35388736be4caaccb68856.tar.gz
nimlevenshtein-5f94819322df6fc54d35388736be4caaccb68856.zip
Initial commit
-rw-r--r--nimlevenshtein.nimble11
-rw-r--r--src/nimlevenshtein.nim364
-rw-r--r--src/nimlevenshtein/_levenshtein.c6833
-rw-r--r--src/nimlevenshtein/_levenshtein.h394
-rw-r--r--src/nimlevenshtein/binding.nim150
-rw-r--r--src/nimlevenshtein/wrapper.nim286
6 files changed, 8038 insertions, 0 deletions
diff --git a/nimlevenshtein.nimble b/nimlevenshtein.nimble
new file mode 100644
index 0000000..33145ae
--- /dev/null
+++ b/nimlevenshtein.nimble
@@ -0,0 +1,11 @@
+# Package
+
+version = "0.1.0"
+author = "Oskari Timperi"
+description = "A wrapper for algorithms from python-Levenshtein package."
+license = "GPL-2.0"
+srcDir = "src"
+
+# Dependencies
+
+requires "nim >= 1.0.0"
diff --git a/src/nimlevenshtein.nim b/src/nimlevenshtein.nim
new file mode 100644
index 0000000..37f715e
--- /dev/null
+++ b/src/nimlevenshtein.nim
@@ -0,0 +1,364 @@
+## A module for fast computation of
+##
+## - Levenshtein (edit) distance and edit sequence manipulation
+## - string similarity
+## - approximate median strings, and generally string averaging
+## - string sequence and set similarity
+##
+## This module uses the C API of the standalone version of the python-Levenshtein library, so no
+## Python is required.
+##
+## The API reflects the API of the python-Levenshtein library. If you want to have a lower level
+## interface, you can import ``nimlevenshtein/wrapper``, which gives you nimified wrapper functions
+## over the C API functions. And if you want to go to an even lower level, you can use
+## ``nimlevenshtein/binding``.
+
+import nimlevenshtein/wrapper
+
+export EditOp
+export OpCode
+export EditType
+export MatchingBlock
+
+proc distance*(a, b: string): int =
+ ## Compute absolute Levenshtein distance of two strings.
+ runnableExamples:
+ assert distance("Levenshtein", "Lenvinsten") == 4
+ assert distance("Levenshtein", "Levensthein") == 2
+ assert distance("Levenshtein", "Levenshten") == 1
+ assert distance("Levenshtein", "Levenshtein") == 0
+ distance(a, b, 0)
+
+proc ratio*(a, b: string): float =
+ ## Compute similarity of two strings.
+ ##
+ ## The similarity is a number between 0 and 1, it's usually equal or somewhat higher than
+ ## Python's ``difflib.SequenceMatcher.ratio()``, because it's based on real minimal edit
+ ## distance.
+ runnableExamples:
+ from math import round
+ assert round(ratio("Hello world!", "Holly grail!"), 4) == 0.5833
+ assert ratio("Brian", "Jesus") == 0.0
+ let lensum = len(a) + len(b)
+ if lensum == 0:
+ return 1.0
+ let ldist = distance(a, b, 1)
+ result = (lensum - ldist).float / lensum.float
+
+proc hamming*(a, b: string): int =
+ ## Compute Hamming distance of two strings.
+ ##
+ ## The Hamming distance is simply the number of differing characters. That means the length of
+ ## the strings must be the same.
+ runnableExamples:
+ assert hamming("Hello world!", "Holly grail!") == 7
+ assert hamming("Brian", "Jesus") == 5
+ if len(a) != len(b):
+ raise newException(Exception, "expected two strings of the same length")
+ hammingDistance(a, b)
+
+proc jaro*(a, b: string): float =
+ ## Compute Jaro string similarity metric of two strings.
+ ##
+ ## The Jaro string similarity metric is intended for short strings like personal last names. It
+ ## is 0 for completely different strings and 1 for identical strings.
+ runnableExamples:
+ from math import round
+ assert jaro("Brian", "Jesus") == 0.0
+ assert round(jaro("Thorkel", "Thorgier"), 4) == 0.7798
+ assert round(jaro("Dinsdale", "D"), 4) == 0.7083
+ jaroRatio(a, b)
+
+proc jaroWinkler*(a, b: string, prefixWeight: float = 0.1): float =
+  ## Compute Jaro-Winkler string similarity metric of two strings.
+ ##
+ ## The Jaro-Winkler string similarity metric is a modification of Jaro metric giving more weight
+ ## to common prefix, as spelling mistakes are more likely to occur near ends of words.
+ ##
+ ## The prefix weight is inverse value of common prefix length sufficient to consider the strings
+ ## *identical*. If no prefix weight is specified, 1/10 is used.
+ runnableExamples:
+ from math import round
+ assert jaroWinkler("Brian", "Jesus") == 0.0
+ assert round(jaroWinkler("Thorkel", "Thorgier"), 4) == 0.8679
+ assert round(jaroWinkler("Dinsdale", "D"), 4) == 0.7375
+ assert jaroWinkler("Thorkel", "Thorgier", 0.25) == 1.0
+ if prefixWeight < 0:
+ raise newException(Exception, "negative prefix weight")
+ jaroWinklerRatio(a, b, prefixWeight)
+
+template medianCommon(f: typed, strings: seq[string], weights: seq[float] = @[]): string =
+ if len(strings) == 0:
+ return ""
+
+ var wl = weights
+ if len(wl) == 0:
+ setLen(wl, len(strings))
+ for i in 0..<len(wl):
+ wl[i] = 1.0
+ else:
+ if len(strings) != len(weights):
+ raise newException(Exception, "expected same amount of strings and weights")
+
+ f(strings, wl)
+
+proc median*(strings: seq[string], weights: seq[float] = @[]): string =
+ ## Find an approximate generalized median string using greedy algorithm.
+ ##
+ ## You can optionally pass a weight for each string as the second argument. The weights are
+ ## interpreted as item multiplicities, although any non-negative real numbers are accepted. Use
+ ## them to improve computation speed when strings often appear multiple times in the sequence.
+ runnableExamples:
+ assert median(@["SpSm", "mpamm", "Spam", "Spa", "Sua", "hSam"]) == "Spam"
+ let fixme = @["Levnhtein", "Leveshein", "Leenshten",
+ "Leveshtei", "Lenshtein", "Lvenstein",
+ "Levenhtin", "evenshtei"]
+ assert median(fixme) == "Levenshtein"
+ medianCommon(greedyMedian, strings, weights)
+
+proc medianImprove*(s: string, strings: seq[string], weights: seq[float] = @[]): string =
+ ## Improve an approximate generalized median string by perturbations.
+ ##
+ ## The first argument is the estimated generalized median string you want to improve, the others
+ ## are the same as in `median()`. It returns a string with total distance less or equal to that
+ ## of the given string.
+ ##
+ ## Note this is much slower than `median()`. Also note it performs only one improvement step,
+ ## calling `medianImprove()` again on the result may improve it further, though this is unlikely
+ ## to happen unless the given string was not very similar to the actual generalized median.
+ runnableExamples:
+ let fixme = @["Levnhtein", "Leveshein", "Leenshten",
+ "Leveshtei", "Lenshtein", "Lvenstein",
+ "Levenhtin", "evenshtei"]
+ assert medianImprove("spam", fixme) == "enhtein"
+ assert medianImprove(medianImprove("spam", fixme), fixme) == "Levenshtein"
+ if len(strings) == 0:
+ return ""
+
+ var wl = weights
+ if len(wl) == 0:
+ setLen(wl, len(strings))
+ for i in 0..<len(wl):
+ wl[i] = 1.0
+ else:
+ if len(strings) != len(weights):
+ raise newException(Exception, "expected same amount of strings and weights")
+
+ wrapper.medianImprove(s, strings, wl)
+
+proc quickmedian*(strings: seq[string], weights: seq[float] = @[]): string =
+ ## Find a very approximate generalized median string, but fast.
+ ##
+ ## See `median()` for argument description.
+ ##
+ ## This method is somewhere between `setmedian()` and picking a random string from the set; both
+ ## speedwise and quality-wise.
+ runnableExamples:
+ let fixme = @["Levnhtein", "Leveshein", "Leenshten",
+ "Leveshtei", "Lenshtein", "Lvenstein",
+ "Levenhtin", "evenshtei"]
+ assert quickmedian(fixme) == "Levnshein"
+ medianCommon(wrapper.quickMedian, strings, weights)
+
+proc setmedian*(strings: seq[string], weights: seq[float] = @[]): string =
+ ## Find set median of a string set (passed as a sequence).
+ ##
+ ## See `median()` for argument description.
+ ##
+ ## The returned string is always one of the strings in the sequence.
+ runnableExamples:
+ assert setmedian(@["ehee", "cceaes", "chees", "chreesc",
+ "chees", "cheesee", "cseese", "chetese"]) == "chees"
+ medianCommon(wrapper.setMedian, strings, weights)
+
+template setSeqCommon(f: typed, a: seq[string], b: seq[string]): float =
+ let lensum = len(a) + len(b)
+
+ if lensum == 0:
+ return 1.0
+
+ if len(a) == 0 or len(b) == 0:
+ return 0.0
+
+ let r = f(a, b)
+
+ if r < 0.0:
+ raise newException(Exception, "failed")
+
+ (lensum.float - r) / lensum.float
+
+proc seqratio*(a: seq[string], b: seq[string]): float =
+ ## Compute similarity ratio of two sequences of strings.
+ ##
+  ## This is like `ratio()`, but for string sequences. A kind of `ratio()` is used to measure
+ ## the cost of item change operation for the strings.
+ runnableExamples:
+ from math import round
+ assert round(seqratio(@["newspaper", "litter bin", "tinny", "antelope"],
+ @["caribou", "sausage", "gorn", "woody"]), 4) == 0.2152
+ assert seqratio(@[], @["foobar"]) == 0.0
+ assert seqratio(@["foobar"], @[]) == 0.0
+ setSeqCommon(editSeqDistance, a, b)
+
+proc setratio*(a: seq[string], b: seq[string]): float =
+ ## Compute similarity ratio of two strings sets (passed as sequences).
+ ##
+ ## The best match between any strings in the first set and the second set (passed as sequences)
+ ## is attempted. I.e., the order doesn't matter here.
+ runnableExamples:
+ from math import round
+ assert round(setratio(@["newspaper", "litter bin", "tinny", "antelope"],
+ @["caribou", "sausage", "gorn", "woody"]), 4) == 0.2818
+ assert setratio(@[], @["foobar"]) == 0.0
+ assert setratio(@["foobar"], @[]) == 0.0
+ setSeqCommon(setDistance, a, b)
+
+proc editops*(string1: string, string2: string): seq[EditOp] =
+ ## Find sequence of edit operations transforming one string to another.
+ ##
+ ## The result is a sequence of `EditOp` objects. These are operations on single characters.
+ ## In fact the returned sequence doesn't contain the *equal*, but all the related functions
+ ## accept both lists with and without *equals*.
+ runnableExamples:
+ let ops = editops("spam", "park")
+ assert ops[0] == EditOp(`type`: EditType.Delete, spos: 0, dpos: 0)
+ assert ops[1] == EditOp(`type`: EditType.Insert, spos: 3, dpos: 2)
+ assert ops[2] == EditOp(`type`: EditType.Replace, spos: 3, dpos: 3)
+ wrapper.editops(string1, string2)
+
+proc editops*(ops: seq[OpCode], aLen: int, bLen: int): seq[EditOp] =
+ ## Can be used for conversion from `OpCode` to `EditOp`. You can either pass in strings or
+ ## their lengths, the result is the same.
+ checkErrors(aLen, bLen, ops)
+ toEditOps(ops, false)
+
+proc editops*(ops: seq[OpCode], a: string, b: string): seq[EditOp] =
+ ## Can be used for conversion from `OpCode` to `EditOp`. You can either pass in strings or
+ ## their lengths, the result is the same.
+ editops(ops, len(a), len(b))
+
+proc opcodes*(a, b: string): seq[OpCode] =
+ ## Find sequence of edit operations transforming one string to another.
+ ##
+ ## The result is a sequence of `OpCode` objects.
+ runnableExamples:
+ let ops = opcodes("spam", "park")
+ assert ops[0] == OpCode(`type`: EditType.Delete, sbeg: 0, send: 1, dbeg: 0, dend: 0)
+ assert ops[1] == OpCode(`type`: EditType.Keep, sbeg: 1, send: 3, dbeg: 0, dend: 2)
+ assert ops[2] == OpCode(`type`: EditType.Insert, sbeg: 3, send: 3, dbeg: 2, dend: 3)
+ assert ops[3] == OpCode(`type`: EditType.Replace, sbeg: 3, send: 4, dbeg: 3, dend: 4)
+ editops(a, b).toOpCodes(len(a), len(b))
+
+proc opcodes*(ops: seq[EditOp], aLen: int, bLen: int): seq[OpCode] =
+ ## Can be used for conversion from `EditOp` to `OpCode`. You can either pass in strings or
+ ## their lengths, the result is the same.
+ checkErrors(aLen, bLen, ops)
+ ops.toOpCodes(aLen, bLen)
+
+proc opcodes*(ops: seq[EditOp], a: string, b: string): seq[OpCode] =
+ ## Can be used for conversion from `EditOp` to `OpCode`. You can either pass in strings or
+ ## their lengths, the result is the same.
+ opcodes(ops, len(a), len(b))
+
+proc inverse*(ops: seq[EditOp]): seq[EditOp] =
+ ## Invert the sense of an edit operation sequence.
+ ##
+ ## In other words, it returns a sequence of edit operations transforming the second
+ ## (destination) string to the first (source). It can be used with both editops and opcodes.
+ runnableExamples:
+ let inv = inverse(editops("spam", "park"))
+ assert inv[0] == EditOp(`type`: EditType.Insert, spos: 0, dpos: 0)
+ assert inv[1] == EditOp(`type`: EditType.Delete, spos: 2, dpos: 3)
+ assert inv[2] == EditOp(`type`: EditType.Replace, spos: 3, dpos: 3)
+ assert editops("park", "spam") == inv
+ result = ops
+ result.invert()
+
+proc inverse*(ops: seq[OpCode]): seq[OpCode] =
+ ## Invert the sense of an edit operation sequence.
+ ##
+ ## In other words, it returns a sequence of edit operations transforming the second
+ ## (destination) string to the first (source). It can be used with both editops and opcodes.
+ result = ops
+ result.invert()
+
+proc applyEdit*(ops: seq[EditOp], a: string, b:string): string =
+ ## Apply a sequence of edit operations to a string.
+ ##
+ ## In the case of editops, the sequence can be arbitrary ordered subset of the edit sequence
+ ## transforming source string to destination string.
+ runnableExamples:
+ let ops = editops("man", "scotsman")
+ assert applyEdit(ops, "man", "scotsman") == "scotsman"
+ assert applyEdit(ops[0..2], "man", "scotsman") == "scoman"
+ if len(ops) == 0:
+ return a
+ checkErrors(len(a), len(b), ops)
+ apply(a, b, ops)
+
+proc applyEdit*(ops: seq[OpCode], a: string, b:string): string =
+ if len(ops) == 0:
+ return a
+ checkErrors(len(a), len(b), ops)
+ apply(a, b, ops)
+
+template matchingBlocksCommon(ops: typed, aLen: int, bLen: int) =
+ checkErrors(aLen, bLen, ops)
+ result = matchingBlocks(aLen, bLen, ops)
+ result.add(MatchingBlock(spos: aLen, dpos: bLen, len: 0))
+
+proc matchingBlocks*(ops: seq[EditOp], aLen: int, bLen: int): seq[MatchingBlock] =
+ ## Find identical blocks in two strings.
+ ##
+ ## The result is a sequence of `MatchingBlock` objects. It can be used with both editops and
+ ## opcodes. The second and third arguments don't have to be actually strings, their lengths are
+ ## enough.
+ runnableExamples:
+ let (a, b) = ("spam", "park")
+ let mb = matchingBlocks(editops(a, b), a, b)
+ assert mb[0] == MatchingBlock(spos: 1, dpos: 0, `len`: 2)
+ ## The last zero-length block is not an error, but it's there for compatibility with Pythons
+ ## `difflib` which always emits it.
+ assert mb[1] == MatchingBlock(spos: 4, dpos: 4, `len`: 0)
+ assert mb == matchingBlocks(editops(a, b), len(a), len(b))
+ runnableExamples:
+ ## One can join the matching blocks to get two identical strings:
+ from sequtils import map
+ from strutils import join
+ let (a, b) = ("dog kennels", "mattresses")
+ let mb = matchingBlocks(editops(a, b), a, b)
+ assert join(map(mb, proc (x: MatchingBlock): string =
+ a[x.spos ..< x.spos + x.len])) == "ees"
+ assert join(map(mb, proc (x: MatchingBlock): string =
+ b[x.dpos ..< x.dpos + x.len])) == "ees"
+ matchingBlocksCommon(ops, aLen, bLen)
+
+proc matchingBlocks*(ops: seq[EditOp], a: string, b: string): seq[MatchingBlock] =
+ matchingBlocksCommon(ops, len(a), len(b))
+
+proc matchingBlocks*(ops: seq[OpCode], aLen: int, bLen: int): seq[MatchingBlock] =
+ matchingBlocksCommon(ops, aLen, bLen)
+
+proc matchingBlocks*(ops: seq[OpCode], a: string, b: string): seq[MatchingBlock] =
+ matchingBlocksCommon(ops, len(a), len(b))
+
+proc subtractEdit*(ops: seq[EditOp], subsequence: seq[EditOp]): seq[EditOp] =
+ ## Subtract an edit subsequence from a sequence.
+ ##
+ ## The result is equivalent to ``editops(applyEdit(subsequence, s1, s2), s2)``, except that is
+ ## constructed directly from the edit operations. That is, if you apply it to the result of
+ ## subsequence application, you get the same final string as from application of complete `ops`.
+ ## It may be not identical, though (in ambiguous cases, like insertion of a character next to
+ ## the same character).
+ ##
+ ## The subtracted subsequence must be an ordered subset of `ops`.
+ ##
+ ## Note this function does not accept difflib-style opcodes as no one in his right mind wants to
+ ## create subsequences from them.
+ runnableExamples:
+ let e = editops("man", "scotsman")
+ let e1 = e[0..2]
+ let bastard = applyEdit(e1, "man", "scotsman")
+ assert bastard == "scoman"
+ assert applyEdit(subtractEdit(e, e1), bastard, "scotsman") == "scotsman"
+ subtract(ops, subsequence)
diff --git a/src/nimlevenshtein/_levenshtein.c b/src/nimlevenshtein/_levenshtein.c
new file mode 100644
index 0000000..acf4858
--- /dev/null
+++ b/src/nimlevenshtein/_levenshtein.c
@@ -0,0 +1,6833 @@
+/*
+ * Levenshtein.c
+ * @(#) $Id: Levenshtein.c,v 1.41 2005/01/13 20:05:36 yeti Exp $
+ * Python extension computing Levenshtein distances, string similarities,
+ * median strings and other goodies.
+ *
+ * Copyright (C) 2002-2003 David Necas (Yeti) <yeti@physics.muni.cz>.
+ *
+ * The Taus113 random generator:
+ * Copyright (C) 2002 Atakan Gurkan
+ * Copyright (C) 1996, 1997, 1998, 1999, 2000 James Theiler, Brian Gough
+ * (see below for more)
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+ **/
+
+/**
+ * TODO:
+ *
+ * - Implement weighted string averaging, see:
+ * H. Bunke et. al.: On the Weighted Mean of a Pair of Strings,
+ * Pattern Analysis and Applications 2002, 5(1): 23-30.
+ * X. Jiang et. al.: Dynamic Computations of Generalized Median Strings,
+ * Pattern Analysis and Applications 2002, ???.
+ * The latter also contains an interesting median-search algorithm.
+ *
+ * - Deal with stray symbols in greedy median() and median_improve().
+ * There are two possibilities:
+ * (i) Remember which strings contain which symbols. This allows certain
+ * small optimizations when processing them.
+ * (ii) Use some overall heuristics to find symbols which don't worth
+ * trying. This is very appealing, but hard to do properly
+ * (requires some inequality strong enough to allow practical exclusion
+ * of certain symbols -- at certain positions)
+ *
+ * - Editops should be an object that only *looks* like a list (which means
+ * it is a list in duck typing) to avoid never-ending conversions from
+ * Python lists to LevEditOp arrays and back
+ *
+ * - Optimize munkers_blackman(), it's pretty dumb (no memory of visited
+ * columns/rows)
+ *
+ * - Make it really usable as a C library (needs some wrappers, headers, ...,
+ * and maybe even documentation ;-)
+ *
+ * - Add interface to various interesting auxiliary results, namely
+ * set and sequence distance (only ratio is exported), the map from
+ * munkers_blackman() itself, ...
+ *
+ * - Generalizations:
+ * - character weight matrix/function
+ * - arbitrary edit operation costs, decomposable edit operations
+ *
+ * - Create a test suite
+ *
+ * - Add more interesting algorithms ;-)
+ *
+ * Postponed TODO (investigated, and a big `but' was found):
+ *
+ * - A linear approximate set median algorithm:
+ * P. Indyk: Sublinear time algorithms for metric space problems,
+ * STOC 1999, http://citeseer.nj.nec.com/indyk00sublinear.html.
+ * BUT: The algorithm seems to be advantageous only in the case of very
+ * large sets -- if my estimates are correct (the article itself is quite
+ * `asymptotic'), say 10^5 at least. On smaller sets either one would get
+ *   only an extremely rough median estimate, or the number of distance
+ * computations would be in fact higher than in the dumb O(n^2) algorithm.
+ *
+ * - Improve setmedian() speed with triangular inequality, see:
+ * Juan, A., E. Vidal: An Algorithm for Fast Median Search,
+ * 1997, http://citeseer.nj.nec.com/article/juan97algorithm.html
+ * BUT: It doesn't seem to help much in spaces of high dimension (see the
+ * discussion and graphs in the article itself), a few percents at most,
+ * and strings behave like a space with a very high dimension (locally), so
+ * who knows, it probably wouldn't help much.
+ *
+ **/
+#ifdef NO_PYTHON
+#define _GNU_SOURCE
+#include <string.h>
+#include <math.h>
+/* for debugging */
+#include <stdio.h>
+#else /* NO_PYTHON */
+#define _LEV_STATIC_PY static
+#define lev_wchar Py_UNICODE
+#include <Python.h>
+#endif /* NO_PYTHON */
+
+#if PY_MAJOR_VERSION >= 3
+#define LEV_PYTHON3
+#define PyString_Type PyBytes_Type
+#define PyString_GET_SIZE PyBytes_GET_SIZE
+#define PyString_AS_STRING PyBytes_AS_STRING
+#define PyString_Check PyBytes_Check
+#define PyString_FromStringAndSize PyBytes_FromStringAndSize
+#define PyString_InternFromString PyUnicode_InternFromString
+#define PyInt_AS_LONG PyLong_AsLong
+#define PyInt_FromLong PyLong_FromLong
+#define PyInt_Check PyLong_Check
+#define PY_INIT_MOD(module, name, doc, methods) \
+ static struct PyModuleDef moduledef = { \
+ PyModuleDef_HEAD_INIT, name, doc, -1, methods, }; \
+ module = PyModule_Create(&moduledef);
+ #define PY_MOD_INIT_FUNC_DEF(name) PyObject* PyInit_##name(void)
+#else
+ #define PY_INIT_MOD(module, name, doc, methods) \
+ Py_InitModule3(name, methods, doc);
+ #define PY_MOD_INIT_FUNC_DEF(name) void init##name(void)
+#endif /* PY_MAJOR_VERSION */
+
+#include <assert.h>
+#include "_levenshtein.h"
+
+/* FIXME: inline availability should be solved in setup.py, somehow, or
+ * even better in Python.h, like const is...
+ * this should inline at least with gcc and msvc */
+#ifndef __GNUC__
+# ifdef _MSC_VER
+# define inline __inline
+# else
+# define inline /* */
+# endif
+# define __attribute__(x) /* */
+#endif
+
+#define LEV_EPSILON 1e-14
+#define LEV_INFINITY 1e100
+
+/* Me thinks the second argument of PyArg_UnpackTuple() should be const.
+ * Anyway I habitually pass a constant string.
+ * A cast to placate the compiler. */
+#define PYARGCFIX(x) (char*)(x)
+
+/* local functions */
+static size_t*
+munkers_blackman(size_t n1,
+ size_t n2,
+ double *dists);
+
+#define TAUS113_LCG(n) ((69069UL * n) & 0xffffffffUL)
+#define TAUS113_MASK 0xffffffffUL
+
+typedef struct {
+ unsigned long int z1, z2, z3, z4;
+} taus113_state_t;
+
+static inline unsigned long int
+taus113_get(taus113_state_t *state);
+
+static void
+taus113_set(taus113_state_t *state,
+ unsigned long int s);
+
+#ifndef NO_PYTHON
+/* python interface and wrappers */
+/* declarations and docstrings {{{ */
+static PyObject* distance_py(PyObject *self, PyObject *args);
+static PyObject* ratio_py(PyObject *self, PyObject *args);
+static PyObject* hamming_py(PyObject *self, PyObject *args);
+static PyObject* jaro_py(PyObject *self, PyObject *args);
+static PyObject* jaro_winkler_py(PyObject *self, PyObject *args);
+static PyObject* median_py(PyObject *self, PyObject *args);
+static PyObject* median_improve_py(PyObject *self, PyObject *args);
+static PyObject* quickmedian_py(PyObject *self, PyObject *args);
+static PyObject* setmedian_py(PyObject *self, PyObject *args);
+static PyObject* seqratio_py(PyObject *self, PyObject *args);
+static PyObject* setratio_py(PyObject *self, PyObject *args);
+static PyObject* editops_py(PyObject *self, PyObject *args);
+static PyObject* opcodes_py(PyObject *self, PyObject *args);
+static PyObject* inverse_py(PyObject *self, PyObject *args);
+static PyObject* apply_edit_py(PyObject *self, PyObject *args);
+static PyObject* matching_blocks_py(PyObject *self, PyObject *args);
+static PyObject* subtract_edit_py(PyObject *self, PyObject *args);
+
+#define Levenshtein_DESC \
+ "A C extension module for fast computation of:\n" \
+ "- Levenshtein (edit) distance and edit sequence manipulation\n" \
+ "- string similarity\n" \
+ "- approximate median strings, and generally string averaging\n" \
+ "- string sequence and set similarity\n" \
+ "\n" \
+ "Levenshtein has a some overlap with difflib (SequenceMatcher). It\n" \
+ "supports only strings, not arbitrary sequence types, but on the\n" \
+ "other hand it's much faster.\n" \
+ "\n" \
+ "It supports both normal and Unicode strings, but can't mix them, all\n" \
+ "arguments to a function (method) have to be of the same type (or its\n" \
+ "subclasses).\n"
+
+#define distance_DESC \
+ "Compute absolute Levenshtein distance of two strings.\n" \
+ "\n" \
+ "distance(string1, string2)\n" \
+ "\n" \
+ "Examples (it's hard to spell Levenshtein correctly):\n" \
+ "\n" \
+ ">>> distance('Levenshtein', 'Lenvinsten')\n" \
+ "4\n" \
+ ">>> distance('Levenshtein', 'Levensthein')\n" \
+ "2\n" \
+ ">>> distance('Levenshtein', 'Levenshten')\n" \
+ "1\n" \
+ ">>> distance('Levenshtein', 'Levenshtein')\n" \
+ "0\n" \
+ "\n" \
+ "Yeah, we've managed it at last.\n"
+
+#define ratio_DESC \
+ "Compute similarity of two strings.\n" \
+ "\n" \
+ "ratio(string1, string2)\n" \
+ "\n" \
+ "The similarity is a number between 0 and 1, it's usually equal or\n" \
+ "somewhat higher than difflib.SequenceMatcher.ratio(), because it's\n" \
+ "based on real minimal edit distance.\n" \
+ "\n" \
+ "Examples:\n" \
+ "\n" \
+ ">>> ratio('Hello world!', 'Holly grail!') # doctest: +ELLIPSIS\n" \
+ "0.583333...\n" \
+ ">>> ratio('Brian', 'Jesus')\n" \
+ "0.0\n" \
+ "\n" \
+ "Really? I thought there was some similarity.\n"
+
+#define hamming_DESC \
+ "Compute Hamming distance of two strings.\n" \
+ "\n" \
+ "hamming(string1, string2)\n" \
+ "\n" \
+ "The Hamming distance is simply the number of differing characters.\n" \
+ "That means the length of the strings must be the same.\n" \
+ "\n" \
+ "Examples:\n" \
+ ">>> hamming('Hello world!', 'Holly grail!')\n" \
+ "7\n" \
+ ">>> hamming('Brian', 'Jesus')\n" \
+ "5\n"
+
+#define jaro_DESC \
+ "Compute Jaro string similarity metric of two strings.\n" \
+ "\n" \
+ "jaro(string1, string2)\n" \
+ "\n" \
+ "The Jaro string similarity metric is intended for short strings like\n" \
+ "personal last names. It is 0 for completely different strings and\n" \
+ "1 for identical strings.\n" \
+ "\n" \
+ "Examples:\n" \
+ ">>> jaro('Brian', 'Jesus')\n" \
+ "0.0\n" \
+ ">>> jaro('Thorkel', 'Thorgier') # doctest: +ELLIPSIS\n" \
+ "0.779761...\n" \
+ ">>> jaro('Dinsdale', 'D') # doctest: +ELLIPSIS\n" \
+ "0.708333...\n"
+
+#define jaro_winkler_DESC \
+ "Compute Jaro string similarity metric of two strings.\n" \
+ "\n" \
+ "jaro_winkler(string1, string2[, prefix_weight])\n" \
+ "\n" \
+ "The Jaro-Winkler string similarity metric is a modification of Jaro\n" \
+ "metric giving more weight to common prefix, as spelling mistakes are\n" \
+ "more likely to occur near ends of words.\n" \
+ "\n" \
+ "The prefix weight is inverse value of common prefix length sufficient\n" \
+ "to consider the strings *identical*. If no prefix weight is\n" \
+ "specified, 1/10 is used.\n" \
+ "\n" \
+ "Examples:\n" \
+ "\n" \
+ ">>> jaro_winkler('Brian', 'Jesus')\n" \
+ "0.0\n" \
+ ">>> jaro_winkler('Thorkel', 'Thorgier') # doctest: +ELLIPSIS\n" \
+ "0.867857...\n" \
+ ">>> jaro_winkler('Dinsdale', 'D') # doctest: +ELLIPSIS\n" \
+ "0.7375...\n" \
+ ">>> jaro_winkler('Thorkel', 'Thorgier', 0.25)\n" \
+ "1.0\n"
+
+#define median_DESC \
+ "Find an approximate generalized median string using greedy algorithm.\n" \
+ "\n" \
+ "median(string_sequence[, weight_sequence])\n" \
+ "\n" \
+ "You can optionally pass a weight for each string as the second\n" \
+ "argument. The weights are interpreted as item multiplicities,\n" \
+ "although any non-negative real numbers are accepted. Use them to\n" \
+ "improve computation speed when strings often appear multiple times\n" \
+ "in the sequence.\n" \
+ "\n" \
+ "Examples:\n" \
+ "\n" \
+ ">>> median(['SpSm', 'mpamm', 'Spam', 'Spa', 'Sua', 'hSam'])\n" \
+ "'Spam'\n" \
+ ">>> fixme = ['Levnhtein', 'Leveshein', 'Leenshten',\n" \
+ "... 'Leveshtei', 'Lenshtein', 'Lvenstein',\n" \
+ "... 'Levenhtin', 'evenshtei']\n" \
+ ">>> median(fixme)\n" \
+ "'Levenshtein'\n" \
+ "\n" \
+ "Hm. Even a computer program can spell Levenshtein better than me.\n"
+
+#define median_improve_DESC \
+ "Improve an approximate generalized median string by perturbations.\n" \
+ "\n" \
+ "median_improve(string, string_sequence[, weight_sequence])\n" \
+ "\n" \
+ "The first argument is the estimated generalized median string you\n" \
+ "want to improve, the others are the same as in median(). It returns\n" \
+ "a string with total distance less or equal to that of the given string.\n" \
+ "\n" \
+ "Note this is much slower than median(). Also note it performs only\n" \
+ "one improvement step, calling median_improve() again on the result\n" \
+ "may improve it further, though this is unlikely to happen unless the\n" \
+ "given string was not very similar to the actual generalized median.\n" \
+ "\n" \
+ "Examples:\n" \
+ "\n" \
+ ">>> fixme = ['Levnhtein', 'Leveshein', 'Leenshten',\n" \
+ "... 'Leveshtei', 'Lenshtein', 'Lvenstein',\n" \
+ "... 'Levenhtin', 'evenshtei']\n" \
+ ">>> median_improve('spam', fixme)\n" \
+ "'enhtein'\n" \
+ ">>> median_improve(median_improve('spam', fixme), fixme)\n" \
+ "'Levenshtein'\n" \
+ "\n" \
+ "It takes some work to change spam to Levenshtein.\n"
+
+#define quickmedian_DESC \
+ "Find a very approximate generalized median string, but fast.\n" \
+ "\n" \
+ "quickmedian(string[, weight_sequence])\n" \
+ "\n" \
+ "See median() for argument description.\n" \
+ "\n" \
+ "This method is somewhere between setmedian() and picking a random\n" \
+ "string from the set; both speedwise and quality-wise.\n" \
+ "\n" \
+ "Examples:\n" \
+ "\n" \
+ ">>> fixme = ['Levnhtein', 'Leveshein', 'Leenshten',\n" \
+ "... 'Leveshtei', 'Lenshtein', 'Lvenstein',\n" \
+ "... 'Levenhtin', 'evenshtei']\n" \
+ ">>> quickmedian(fixme)\n" \
+ "'Levnshein'\n"
+
+#define setmedian_DESC \
+ "Find set median of a string set (passed as a sequence).\n" \
+ "\n" \
+ "setmedian(string[, weight_sequence])\n" \
+ "\n" \
+ "See median() for argument description.\n" \
+ "\n" \
+ "The returned string is always one of the strings in the sequence.\n" \
+ "\n" \
+ "Examples:\n" \
+ "\n" \
+ ">>> setmedian(['ehee', 'cceaes', 'chees', 'chreesc',\n" \
+ "... 'chees', 'cheesee', 'cseese', 'chetese'])\n" \
+ "'chees'\n" \
+ "\n" \
+ "You haven't asked me about Limburger, sir.\n"
+
+#define seqratio_DESC \
+ "Compute similarity ratio of two sequences of strings.\n" \
+ "\n" \
+ "seqratio(string_sequence1, string_sequence2)\n" \
+ "\n" \
+ "This is like ratio(), but for string sequences. A kind of ratio()\n" \
+ "is used to to measure the cost of item change operation for the\n" \
+ "strings.\n" \
+ "\n" \
+ "Examples:\n" \
+ "\n" \
+ ">>> seqratio(['newspaper', 'litter bin', 'tinny', 'antelope'],\n" \
+ "... ['caribou', 'sausage', 'gorn', 'woody'])\n" \
+ "0.21517857142857144\n"
+
+#define setratio_DESC \
+ "Compute similarity ratio of two strings sets (passed as sequences).\n" \
+ "\n" \
+ "setratio(string_sequence1, string_sequence2)\n" \
+ "\n" \
+ "The best match between any strings in the first set and the second\n" \
+ "set (passed as sequences) is attempted. I.e., the order doesn't\n" \
+ "matter here.\n" \
+ "\n" \
+ "Examples:\n" \
+ "\n" \
+ ">>> setratio(['newspaper', 'litter bin', 'tinny', 'antelope'],\n" \
+ "... ['caribou', 'sausage', 'gorn', 'woody']) # doctest: +ELLIPSIS\n" \
+ "0.281845...\n" \
+ "\n" \
+ "No, even reordering doesn't help the tinny words to match the\n" \
+ "woody ones.\n"
+
+#define editops_DESC \
+ "Find sequence of edit operations transforming one string to another.\n" \
+ "\n" \
+ "editops(source_string, destination_string)\n" \
+ "editops(edit_operations, source_length, destination_length)\n" \
+ "\n" \
+ "The result is a list of triples (operation, spos, dpos), where\n" \
+ "operation is one of 'equal', 'replace', 'insert', or 'delete'; spos\n" \
+ "and dpos are position of characters in the first (source) and the\n" \
+ "second (destination) strings. These are operations on signle\n" \
+ "characters. In fact the returned list doesn't contain the 'equal',\n" \
+ "but all the related functions accept both lists with and without\n" \
+ "'equal's.\n" \
+ "\n" \
+ "Examples:\n" \
+ "\n" \
+ ">>> editops('spam', 'park')\n" \
+ "[('delete', 0, 0), ('insert', 3, 2), ('replace', 3, 3)]\n" \
+ "\n" \
+ "The alternate form editops(opcodes, source_string, destination_string)\n" \
+ "can be used for conversion from opcodes (5-tuples) to editops (you can\n" \
+ "pass strings or their lengths, it doesn't matter).\n"
+
+#define opcodes_DESC \
+ "Find sequence of edit operations transforming one string to another.\n" \
+ "\n" \
+ "opcodes(source_string, destination_string)\n" \
+ "opcodes(edit_operations, source_length, destination_length)\n" \
+ "\n" \
+ "The result is a list of 5-tuples with the same meaning as in\n" \
+ "SequenceMatcher's get_opcodes() output. But since the algorithms\n" \
+ "differ, the actual sequences from Levenshtein and SequenceMatcher\n" \
+ "may differ too.\n" \
+ "\n" \
+ "Examples:\n" \
+ "\n" \
+ ">>> for x in opcodes('spam', 'park'):\n" \
+ "... print(x)\n" \
+ "...\n" \
+ "('delete', 0, 1, 0, 0)\n" \
+ "('equal', 1, 3, 0, 2)\n" \
+ "('insert', 3, 3, 2, 3)\n" \
+ "('replace', 3, 4, 3, 4)\n" \
+ "\n" \
+ "The alternate form opcodes(editops, source_string, destination_string)\n" \
+ "can be used for conversion from editops (triples) to opcodes (you can\n" \
+ "pass strings or their lengths, it doesn't matter).\n"
+
+#define inverse_DESC \
+ "Invert the sense of an edit operation sequence.\n" \
+ "\n" \
+ "inverse(edit_operations)\n" \
+ "\n" \
+ "In other words, it returns a list of edit operations transforming the\n" \
+ "second (destination) string to the first (source). It can be used\n" \
+ "with both editops and opcodes.\n" \
+ "\n" \
+ "Examples:\n" \
+ "\n" \
+ ">>> inverse(editops('spam', 'park'))\n" \
+ "[('insert', 0, 0), ('delete', 2, 3), ('replace', 3, 3)]\n" \
+ ">>> editops('park', 'spam')\n" \
+ "[('insert', 0, 0), ('delete', 2, 3), ('replace', 3, 3)]\n"
+
+#define apply_edit_DESC \
+ "Apply a sequence of edit operations to a string.\n" \
+ "\n" \
+ "apply_edit(edit_operations, source_string, destination_string)\n" \
+ "\n" \
+ "In the case of editops, the sequence can be arbitrary ordered subset\n" \
+ "of the edit sequence transforming source_string to destination_string.\n" \
+ "\n" \
+ "Examples:\n" \
+ "\n" \
+ ">>> e = editops('man', 'scotsman')\n" \
+ ">>> apply_edit(e, 'man', 'scotsman')\n" \
+ "'scotsman'\n" \
+ ">>> apply_edit(e[:3], 'man', 'scotsman')\n" \
+ "'scoman'\n" \
+ "\n" \
+ "The other form of edit operations, opcodes, is not very suitable for\n" \
+ "such a tricks, because it has to always span over complete strings,\n" \
+ "subsets can be created by carefully replacing blocks with 'equal'\n" \
+ "blocks, or by enlarging 'equal' block at the expense of other blocks\n" \
+ "and adjusting the other blocks accordingly.\n" \
+ "\n" \
+ "Examples:\n" \
+ ">>> a, b = 'spam and eggs', 'foo and bar'\n" \
+ ">>> e = opcodes(a, b)\n" \
+ ">>> apply_edit(inverse(e), b, a)\n" \
+ "'spam and eggs'\n" \
+ ">>> e[4] = ('equal', 10, 13, 8, 11)\n" \
+ ">>> a, b, e\n" \
+ ">>> apply_edit(e, a, b)\n" \
+ "'foo and ggs'\n"
+
+#define matching_blocks_DESC \
+ "Find identical blocks in two strings.\n" \
+ "\n" \
+ "matching_blocks(edit_operations, source_length, destination_length)\n" \
+ "\n" \
+ "The result is a list of triples with the same meaning as in\n" \
+ "SequenceMatcher's get_matching_blocks() output. It can be used with\n" \
+ "both editops and opcodes. The second and third arguments don't\n" \
+ "have to be actually strings, their lengths are enough.\n" \
+ "\n" \
+ "Examples:\n" \
+ "\n" \
+ ">>> a, b = 'spam', 'park'\n" \
+ ">>> matching_blocks(editops(a, b), a, b)\n" \
+ "[(1, 0, 2), (4, 4, 0)]\n" \
+ ">>> matching_blocks(editops(a, b), len(a), len(b))\n" \
+ "[(1, 0, 2), (4, 4, 0)]\n" \
+ "\n" \
+ "The last zero-length block is not an error, but it's there for\n" \
+ "compatibility with difflib which always emits it.\n" \
+ "\n" \
+ "One can join the matching blocks to get two identical strings:\n" \
+ ">>> a, b = 'dog kennels', 'mattresses'\n" \
+ ">>> mb = matching_blocks(editops(a,b), a, b)\n" \
+ ">>> ''.join([a[x[0]:x[0]+x[2]] for x in mb])\n" \
+ "'ees'\n" \
+ ">>> ''.join([b[x[1]:x[1]+x[2]] for x in mb])\n" \
+ "'ees'\n"
+
+#define subtract_edit_DESC \
+ "Subtract an edit subsequence from a sequence.\n" \
+ "\n" \
+ "subtract_edit(edit_operations, subsequence)\n" \
+ "\n" \
+ "The result is equivalent to\n" \
+ "editops(apply_edit(subsequence, s1, s2), s2), except that is\n" \
+ "constructed directly from the edit operations. That is, if you apply\n" \
+ "it to the result of subsequence application, you get the same final\n" \
+ "string as from application complete edit_operations. It may be not\n" \
+ "identical, though (in amibuous cases, like insertion of a character\n" \
+ "next to the same character).\n" \
+ "\n" \
+ "The subtracted subsequence must be an ordered subset of\n" \
+ "edit_operations.\n" \
+ "\n" \
+ "Note this function does not accept difflib-style opcodes as no one in\n" \
+ "his right mind wants to create subsequences from them.\n" \
+ "\n" \
+ "Examples:\n" \
+ "\n" \
+ ">>> e = editops('man', 'scotsman')\n" \
+ ">>> e1 = e[:3]\n" \
+ ">>> bastard = apply_edit(e1, 'man', 'scotsman')\n" \
+ ">>> bastard\n" \
+ "'scoman'\n" \
+ ">>> apply_edit(subtract_edit(e, e1), bastard, 'scotsman')\n" \
+ "'scotsman'\n" \
+
+#define METHODS_ITEM(x) { #x, x##_py, METH_VARARGS, x##_DESC }
+static PyMethodDef methods[] = {
+ METHODS_ITEM(distance),
+ METHODS_ITEM(ratio),
+ METHODS_ITEM(hamming),
+ METHODS_ITEM(jaro),
+ METHODS_ITEM(jaro_winkler),
+ METHODS_ITEM(median),
+ METHODS_ITEM(median_improve),
+ METHODS_ITEM(quickmedian),
+ METHODS_ITEM(setmedian),
+ METHODS_ITEM(seqratio),
+ METHODS_ITEM(setratio),
+ METHODS_ITEM(editops),
+ METHODS_ITEM(opcodes),
+ METHODS_ITEM(inverse),
+ METHODS_ITEM(apply_edit),
+ METHODS_ITEM(matching_blocks),
+ METHODS_ITEM(subtract_edit),
+ { NULL, NULL, 0, NULL },
+};
+
+/* opcode names, these are to be initialized in the init func,
+ * indexed by LevEditType values */
+struct {
+ PyObject* pystring;
+ const char *cstring;
+ size_t len;
+}
+static opcode_names[] = {
+ { NULL, "equal", 0 },
+ { NULL, "replace", 0 },
+ { NULL, "insert", 0 },
+ { NULL, "delete", 0 },
+};
+#define N_OPCODE_NAMES ((sizeof(opcode_names)/sizeof(opcode_names[0])))
+
+typedef lev_byte *(*MedianFuncString)(size_t n,
+ const size_t *lengths,
+ const lev_byte *strings[],
+ const double *weights,
+ size_t *medlength);
+typedef Py_UNICODE *(*MedianFuncUnicode)(size_t n,
+ const size_t *lengths,
+ const Py_UNICODE *strings[],
+ const double *weights,
+ size_t *medlength);
+typedef struct {
+ MedianFuncString s;
+ MedianFuncUnicode u;
+} MedianFuncs;
+
+typedef lev_byte *(*MedianImproveFuncString)(size_t len, const lev_byte *s,
+ size_t n,
+ const size_t *lengths,
+ const lev_byte *strings[],
+ const double *weights,
+ size_t *medlength);
+typedef Py_UNICODE *(*MedianImproveFuncUnicode)(size_t len, const Py_UNICODE *s,
+ size_t n,
+ const size_t *lengths,
+ const Py_UNICODE *strings[],
+ const double *weights,
+ size_t *medlength);
+typedef struct {
+ MedianImproveFuncString s;
+ MedianImproveFuncUnicode u;
+} MedianImproveFuncs;
+
+typedef double (*SetSeqFuncString)(size_t n1,
+ const size_t *lengths1,
+ const lev_byte *strings1[],
+ size_t n2,
+ const size_t *lengths2,
+ const lev_byte *strings2[]);
+typedef double (*SetSeqFuncUnicode)(size_t n1,
+ const size_t *lengths1,
+ const Py_UNICODE *strings1[],
+ size_t n2,
+ const size_t *lengths2,
+ const Py_UNICODE *strings2[]);
+
+typedef struct {
+ SetSeqFuncString s;
+ SetSeqFuncUnicode u;
+} SetSeqFuncs;
+
+static long int
+levenshtein_common(PyObject *args,
+ const char *name,
+ size_t xcost,
+ size_t *lensum);
+
+static int
+extract_stringlist(PyObject *list,
+ const char *name,
+ size_t n,
+ size_t **sizelist,
+ void *strlist);
+
+static double*
+extract_weightlist(PyObject *wlist,
+ const char *name,
+ size_t n);
+
+static PyObject*
+median_common(PyObject *args,
+ const char *name,
+ MedianFuncs foo);
+
+static PyObject*
+median_improve_common(PyObject *args,
+ const char *name,
+ MedianImproveFuncs foo);
+
+static double
+setseq_common(PyObject *args,
+ const char *name,
+ SetSeqFuncs foo,
+ size_t *lensum);
+/* }}} */
+
+/****************************************************************************
+ *
+ * Python interface and subroutines
+ *
+ ****************************************************************************/
+/* {{{ */
+
+static long int
+levenshtein_common(PyObject *args, const char *name, size_t xcost,
+ size_t *lensum)
+{
+ PyObject *arg1, *arg2;
+ size_t len1, len2;
+
+ if (!PyArg_UnpackTuple(args, PYARGCFIX(name), 2, 2, &arg1, &arg2))
+ return -1;
+
+ if (PyObject_TypeCheck(arg1, &PyString_Type)
+ && PyObject_TypeCheck(arg2, &PyString_Type)) {
+ lev_byte *string1, *string2;
+
+ len1 = PyString_GET_SIZE(arg1);
+ len2 = PyString_GET_SIZE(arg2);
+ *lensum = len1 + len2;
+ string1 = PyString_AS_STRING(arg1);
+ string2 = PyString_AS_STRING(arg2);
+ {
+ size_t d = lev_edit_distance(len1, string1, len2, string2, xcost);
+ if (d == (size_t)(-1)) {
+ PyErr_NoMemory();
+ return -1;
+ }
+ return d;
+ }
+ }
+ else if (PyObject_TypeCheck(arg1, &PyUnicode_Type)
+ && PyObject_TypeCheck(arg2, &PyUnicode_Type)) {
+ Py_UNICODE *string1, *string2;
+
+ len1 = PyUnicode_GET_SIZE(arg1);
+ len2 = PyUnicode_GET_SIZE(arg2);
+ *lensum = len1 + len2;
+ string1 = PyUnicode_AS_UNICODE(arg1);
+ string2 = PyUnicode_AS_UNICODE(arg2);
+ {
+ size_t d = lev_u_edit_distance(len1, string1, len2, string2, xcost);
+ if (d == (size_t)(-1)) {
+ PyErr_NoMemory();
+ return -1;
+ }
+ return d;
+ }
+ }
+ else {
+ PyErr_Format(PyExc_TypeError,
+ "%s expected two Strings or two Unicodes", name);
+ return -1;
+ }
+}
+
+static PyObject*
+distance_py(PyObject *self, PyObject *args)
+{
+ size_t lensum;
+ long int ldist;
+
+ if ((ldist = levenshtein_common(args, "distance", 0, &lensum)) < 0)
+ return NULL;
+
+ return PyInt_FromLong((long)ldist);
+}
+
+static PyObject*
+ratio_py(PyObject *self, PyObject *args)
+{
+ size_t lensum;
+ long int ldist;
+
+ if ((ldist = levenshtein_common(args, "ratio", 1, &lensum)) < 0)
+ return NULL;
+
+ if (lensum == 0)
+ return PyFloat_FromDouble(1.0);
+
+ return PyFloat_FromDouble((double)(lensum - ldist)/(lensum));
+}
+
+static PyObject*
+hamming_py(PyObject *self, PyObject *args)
+{
+ PyObject *arg1, *arg2;
+ const char *name = "hamming";
+ size_t len1, len2;
+ long int dist;
+
+ if (!PyArg_UnpackTuple(args, PYARGCFIX(name), 2, 2, &arg1, &arg2))
+ return NULL;
+
+ if (PyObject_TypeCheck(arg1, &PyString_Type)
+ && PyObject_TypeCheck(arg2, &PyString_Type)) {
+ lev_byte *string1, *string2;
+
+ len1 = PyString_GET_SIZE(arg1);
+ len2 = PyString_GET_SIZE(arg2);
+ if (len1 != len2) {
+ PyErr_Format(PyExc_ValueError,
+ "%s expected two strings of the same length", name);
+ return NULL;
+ }
+ string1 = PyString_AS_STRING(arg1);
+ string2 = PyString_AS_STRING(arg2);
+ dist = lev_hamming_distance(len1, string1, string2);
+ return PyInt_FromLong(dist);
+ }
+ else if (PyObject_TypeCheck(arg1, &PyUnicode_Type)
+ && PyObject_TypeCheck(arg2, &PyUnicode_Type)) {
+ Py_UNICODE *string1, *string2;
+
+ len1 = PyUnicode_GET_SIZE(arg1);
+ len2 = PyUnicode_GET_SIZE(arg2);
+ if (len1 != len2) {
+ PyErr_Format(PyExc_ValueError,
+ "%s expected two unicodes of the same length", name);
+ return NULL;
+ }
+ string1 = PyUnicode_AS_UNICODE(arg1);
+ string2 = PyUnicode_AS_UNICODE(arg2);
+ dist = lev_u_hamming_distance(len1, string1, string2);
+ return PyInt_FromLong(dist);
+ }
+ else {
+ PyErr_Format(PyExc_TypeError,
+ "%s expected two Strings or two Unicodes", name);
+ return NULL;
+ }
+}
+
+static PyObject*
+jaro_py(PyObject *self, PyObject *args)
+{
+ PyObject *arg1, *arg2;
+ const char *name = "jaro";
+ size_t len1, len2;
+
+ if (!PyArg_UnpackTuple(args, PYARGCFIX(name), 2, 2, &arg1, &arg2))
+ return NULL;
+
+ if (PyObject_TypeCheck(arg1, &PyString_Type)
+ && PyObject_TypeCheck(arg2, &PyString_Type)) {
+ lev_byte *string1, *string2;
+
+ len1 = PyString_GET_SIZE(arg1);
+ len2 = PyString_GET_SIZE(arg2);
+ string1 = PyString_AS_STRING(arg1);
+ string2 = PyString_AS_STRING(arg2);
+ return PyFloat_FromDouble(lev_jaro_ratio(len1, string1, len2, string2));
+ }
+ else if (PyObject_TypeCheck(arg1, &PyUnicode_Type)
+ && PyObject_TypeCheck(arg2, &PyUnicode_Type)) {
+ Py_UNICODE *string1, *string2;
+
+ len1 = PyUnicode_GET_SIZE(arg1);
+ len2 = PyUnicode_GET_SIZE(arg2);
+ string1 = PyUnicode_AS_UNICODE(arg1);
+ string2 = PyUnicode_AS_UNICODE(arg2);
+ return PyFloat_FromDouble(lev_u_jaro_ratio(len1, string1, len2, string2));
+ }
+ else {
+ PyErr_Format(PyExc_TypeError,
+ "%s expected two Strings or two Unicodes", name);
+ return NULL;
+ }
+}
+
+static PyObject*
+jaro_winkler_py(PyObject *self, PyObject *args)
+{
+ PyObject *arg1, *arg2, *arg3 = NULL;
+ double pfweight = 0.1;
+ const char *name = "jaro_winkler";
+ size_t len1, len2;
+
+ if (!PyArg_UnpackTuple(args, PYARGCFIX(name), 2, 3, &arg1, &arg2, &arg3))
+ return NULL;
+
+ if (arg3) {
+ if (!PyObject_TypeCheck(arg3, &PyFloat_Type)) {
+ PyErr_Format(PyExc_TypeError, "%s third argument must be a Float", name);
+ return NULL;
+ }
+ pfweight = PyFloat_AS_DOUBLE(arg3);
+ if (pfweight < 0.0) {
+ PyErr_Format(PyExc_ValueError, "%s negative prefix weight", name);
+ return NULL;
+ }
+ }
+
+ if (PyObject_TypeCheck(arg1, &PyString_Type)
+ && PyObject_TypeCheck(arg2, &PyString_Type)) {
+ lev_byte *string1, *string2;
+
+ len1 = PyString_GET_SIZE(arg1);
+ len2 = PyString_GET_SIZE(arg2);
+ string1 = PyString_AS_STRING(arg1);
+ string2 = PyString_AS_STRING(arg2);
+ return PyFloat_FromDouble(lev_jaro_winkler_ratio(len1, string1,
+ len2, string2,
+ pfweight));
+ }
+ else if (PyObject_TypeCheck(arg1, &PyUnicode_Type)
+ && PyObject_TypeCheck(arg2, &PyUnicode_Type)) {
+ Py_UNICODE *string1, *string2;
+
+ len1 = PyUnicode_GET_SIZE(arg1);
+ len2 = PyUnicode_GET_SIZE(arg2);
+ string1 = PyUnicode_AS_UNICODE(arg1);
+ string2 = PyUnicode_AS_UNICODE(arg2);
+ return PyFloat_FromDouble(lev_u_jaro_winkler_ratio(len1, string1,
+ len2, string2,
+ pfweight));
+ }
+ else {
+ PyErr_Format(PyExc_TypeError,
+ "%s expected two Strings or two Unicodes", name);
+ return NULL;
+ }
+}
+
+static PyObject*
+median_py(PyObject *self, PyObject *args)
+{
+ MedianFuncs engines = { lev_greedy_median, lev_u_greedy_median };
+ return median_common(args, "median", engines);
+}
+
+static PyObject*
+median_improve_py(PyObject *self, PyObject *args)
+{
+ MedianImproveFuncs engines = { lev_median_improve, lev_u_median_improve };
+ return median_improve_common(args, "median_improve", engines);
+}
+
+static PyObject*
+quickmedian_py(PyObject *self, PyObject *args)
+{
+ MedianFuncs engines = { lev_quick_median, lev_u_quick_median };
+ return median_common(args, "quickmedian", engines);
+}
+
+static PyObject*
+setmedian_py(PyObject *self, PyObject *args)
+{
+ MedianFuncs engines = { lev_set_median, lev_u_set_median };
+ return median_common(args, "setmedian", engines);
+}
+
+static PyObject*
+median_common(PyObject *args, const char *name, MedianFuncs foo)
+{
+ size_t n, len;
+ void *strings = NULL;
+ size_t *sizes = NULL;
+ PyObject *strlist = NULL;
+ PyObject *wlist = NULL;
+ PyObject *strseq = NULL;
+ double *weights;
+ int stringtype;
+ PyObject *result = NULL;
+
+ if (!PyArg_UnpackTuple(args, PYARGCFIX(name), 1, 2, &strlist, &wlist))
+ return NULL;
+
+ if (!PySequence_Check(strlist)) {
+ PyErr_Format(PyExc_TypeError,
+ "%s first argument must be a Sequence", name);
+ return NULL;
+ }
+ strseq = PySequence_Fast(strlist, name);
+
+ n = PySequence_Fast_GET_SIZE(strseq);
+ if (n == 0) {
+ Py_INCREF(Py_None);
+ Py_DECREF(strseq);
+ return Py_None;
+ }
+
+ /* get (optional) weights, use 1 if none specified. */
+ weights = extract_weightlist(wlist, name, n);
+ if (!weights) {
+ Py_DECREF(strseq);
+ return NULL;
+ }
+
+ stringtype = extract_stringlist(strseq, name, n, &sizes, &strings);
+ Py_DECREF(strseq);
+ if (stringtype < 0) {
+ free(weights);
+ return NULL;
+ }
+
+ if (stringtype == 0) {
+ lev_byte *medstr = foo.s(n, sizes, strings, weights, &len);
+ if (!medstr && len)
+ result = PyErr_NoMemory();
+ else {
+ result = PyString_FromStringAndSize(medstr, len);
+ free(medstr);
+ }
+ }
+ else if (stringtype == 1) {
+ Py_UNICODE *medstr = foo.u(n, sizes, strings, weights, &len);
+ if (!medstr && len)
+ result = PyErr_NoMemory();
+ else {
+ result = PyUnicode_FromUnicode(medstr, len);
+ free(medstr);
+ }
+ }
+ else
+ PyErr_Format(PyExc_SystemError, "%s internal error", name);
+
+ free(strings);
+ free(weights);
+ free(sizes);
+ return result;
+}
+
+static PyObject*
+median_improve_common(PyObject *args, const char *name, MedianImproveFuncs foo)
+{
+ size_t n, len;
+ void *strings = NULL;
+ size_t *sizes = NULL;
+ PyObject *arg1 = NULL;
+ PyObject *strlist = NULL;
+ PyObject *wlist = NULL;
+ PyObject *strseq = NULL;
+ double *weights;
+ int stringtype;
+ PyObject *result = NULL;
+
+ if (!PyArg_UnpackTuple(args, PYARGCFIX(name), 2, 3, &arg1, &strlist, &wlist))
+ return NULL;
+
+ if (PyObject_TypeCheck(arg1, &PyString_Type))
+ stringtype = 0;
+ else if (PyObject_TypeCheck(arg1, &PyUnicode_Type))
+ stringtype = 1;
+ else {
+ PyErr_Format(PyExc_TypeError,
+ "%s first argument must be a String or Unicode", name);
+ return NULL;
+ }
+
+ if (!PySequence_Check(strlist)) {
+ PyErr_Format(PyExc_TypeError,
+ "%s second argument must be a Sequence", name);
+ return NULL;
+ }
+ strseq = PySequence_Fast(strlist, name);
+
+ n = PySequence_Fast_GET_SIZE(strseq);
+ if (n == 0) {
+ Py_INCREF(Py_None);
+ Py_DECREF(strseq);
+ return Py_None;
+ }
+
+ /* get (optional) weights, use 1 if none specified. */
+ weights = extract_weightlist(wlist, name, n);
+ if (!weights) {
+ Py_DECREF(strseq);
+ return NULL;
+ }
+
+ if (extract_stringlist(strseq, name, n, &sizes, &strings) != stringtype) {
+ PyErr_Format(PyExc_TypeError,
+ "%s argument types don't match", name);
+ free(weights);
+ return NULL;
+ }
+
+ Py_DECREF(strseq);
+ if (stringtype == 0) {
+ lev_byte *s = PyString_AS_STRING(arg1);
+ size_t l = PyString_GET_SIZE(arg1);
+ lev_byte *medstr = foo.s(l, s, n, sizes, strings, weights, &len);
+ if (!medstr && len)
+ result = PyErr_NoMemory();
+ else {
+ result = PyString_FromStringAndSize(medstr, len);
+ free(medstr);
+ }
+ }
+ else if (stringtype == 1) {
+ Py_UNICODE *s = PyUnicode_AS_UNICODE(arg1);
+ size_t l = PyUnicode_GET_SIZE(arg1);
+ Py_UNICODE *medstr = foo.u(l, s, n, sizes, strings, weights, &len);
+ if (!medstr && len)
+ result = PyErr_NoMemory();
+ else {
+ result = PyUnicode_FromUnicode(medstr, len);
+ free(medstr);
+ }
+ }
+ else
+ PyErr_Format(PyExc_SystemError, "%s internal error", name);
+
+ free(strings);
+ free(weights);
+ free(sizes);
+ return result;
+}
+
+static double*
+extract_weightlist(PyObject *wlist, const char *name, size_t n)
+{
+ size_t i;
+ double *weights = NULL;
+ PyObject *seq;
+
+ if (wlist) {
+ if (!PySequence_Check(wlist)) {
+ PyErr_Format(PyExc_TypeError,
+ "%s second argument must be a Sequence", name);
+ return NULL;
+ }
+ seq = PySequence_Fast(wlist, name);
+ if (PySequence_Fast_GET_SIZE(wlist) != n) {
+ PyErr_Format(PyExc_ValueError,
+ "%s got %i strings but %i weights",
+ name, n, PyList_GET_SIZE(wlist));
+ Py_DECREF(seq);
+ return NULL;
+ }
+ weights = (double*)malloc(n*sizeof(double));
+ if (!weights)
+ return (double*)PyErr_NoMemory();
+ for (i = 0; i < n; i++) {
+ PyObject *item = PySequence_Fast_GET_ITEM(wlist, i);
+ PyObject *number = PyNumber_Float(item);
+
+ if (!number) {
+ free(weights);
+ PyErr_Format(PyExc_TypeError,
+ "%s weight #%i is not a Number", name, i);
+ Py_DECREF(seq);
+ return NULL;
+ }
+ weights[i] = PyFloat_AS_DOUBLE(number);
+ Py_DECREF(number);
+ if (weights[i] < 0) {
+ free(weights);
+ PyErr_Format(PyExc_ValueError,
+ "%s weight #%i is negative", name, i);
+ Py_DECREF(seq);
+ return NULL;
+ }
+ }
+ Py_DECREF(seq);
+ }
+ else {
+ weights = (double*)malloc(n*sizeof(double));
+ if (!weights)
+ return (double*)PyErr_NoMemory();
+ for (i = 0; i < n; i++)
+ weights[i] = 1.0;
+ }
+
+ return weights;
+}
+
+/* extract a list of strings or unicode strings, returns
+ * 0 -- strings
+ * 1 -- unicode strings
+ * <0 -- failure
+ */
+static int
+extract_stringlist(PyObject *list, const char *name,
+ size_t n, size_t **sizelist, void *strlist)
+{
+ size_t i;
+ PyObject *first;
+
+ /* check first object type. when it's a string then all others must be
+ * strings too; when it's a unicode string then all others must be unicode
+ * strings too. */
+ first = PySequence_Fast_GET_ITEM(list, 0);
+ /* i don't exactly understand why the problem doesn't exhibit itself earlier
+ * but a queer error message is better than a segfault :o) */
+ if (first == (PyObject*)-1) {
+ PyErr_Format(PyExc_TypeError,
+ "%s undecomposable Sequence???", name);
+ return -1;
+ }
+
+ if (PyObject_TypeCheck(first, &PyString_Type)) {
+ lev_byte **strings;
+ size_t *sizes;
+
+ strings = (lev_byte**)malloc(n*sizeof(lev_byte*));
+ if (!strings) {
+ PyErr_Format(PyExc_MemoryError,
+ "%s cannot allocate memory", name);
+ return -1;
+ }
+ sizes = (size_t*)malloc(n*sizeof(size_t));
+ if (!sizes) {
+ free(strings);
+ PyErr_Format(PyExc_MemoryError,
+ "%s cannot allocate memory", name);
+ return -1;
+ }
+
+ strings[0] = PyString_AS_STRING(first);
+ sizes[0] = PyString_GET_SIZE(first);
+ for (i = 1; i < n; i++) {
+ PyObject *item = PySequence_Fast_GET_ITEM(list, i);
+
+ if (!PyObject_TypeCheck(item, &PyString_Type)) {
+ free(strings);
+ free(sizes);
+ PyErr_Format(PyExc_TypeError,
+ "%s item #%i is not a String", name, i);
+ return -1;
+ }
+ strings[i] = PyString_AS_STRING(item);
+ sizes[i] = PyString_GET_SIZE(item);
+ }
+
+ *(lev_byte***)strlist = strings;
+ *sizelist = sizes;
+ return 0;
+ }
+ if (PyObject_TypeCheck(first, &PyUnicode_Type)) {
+ Py_UNICODE **strings;
+ size_t *sizes;
+
+ strings = (Py_UNICODE**)malloc(n*sizeof(Py_UNICODE*));
+ if (!strings) {
+ PyErr_NoMemory();
+ return -1;
+ }
+ sizes = (size_t*)malloc(n*sizeof(size_t));
+ if (!sizes) {
+ free(strings);
+ PyErr_NoMemory();
+ return -1;
+ }
+
+ strings[0] = PyUnicode_AS_UNICODE(first);
+ sizes[0] = PyUnicode_GET_SIZE(first);
+ for (i = 1; i < n; i++) {
+ PyObject *item = PySequence_Fast_GET_ITEM(list, i);
+
+ if (!PyObject_TypeCheck(item, &PyUnicode_Type)) {
+ free(strings);
+ free(sizes);
+ PyErr_Format(PyExc_TypeError,
+ "%s item #%i is not a Unicode", name, i);
+ return -1;
+ }
+ strings[i] = PyUnicode_AS_UNICODE(item);
+ sizes[i] = PyUnicode_GET_SIZE(item);
+ }
+
+ *(Py_UNICODE***)strlist = strings;
+ *sizelist = sizes;
+ return 1;
+ }
+
+ PyErr_Format(PyExc_TypeError,
+ "%s expected list of Strings or Unicodes", name);
+ return -1;
+}
+
+static PyObject*
+seqratio_py(PyObject *self, PyObject *args)
+{
+ SetSeqFuncs engines = { lev_edit_seq_distance, lev_u_edit_seq_distance };
+ size_t lensum;
+ double r = setseq_common(args, "seqratio", engines, &lensum);
+ if (r < 0)
+ return NULL;
+ if (lensum == 0)
+ return PyFloat_FromDouble(1.0);
+ return PyFloat_FromDouble((lensum - r)/lensum);
+}
+
+static PyObject*
+setratio_py(PyObject *self, PyObject *args)
+{
+ SetSeqFuncs engines = { lev_set_distance, lev_u_set_distance };
+ size_t lensum;
+ double r = setseq_common(args, "setratio", engines, &lensum);
+ if (r < 0)
+ return NULL;
+ if (lensum == 0)
+ return PyFloat_FromDouble(1.0);
+ return PyFloat_FromDouble((lensum - r)/lensum);
+}
+
+static double
+setseq_common(PyObject *args, const char *name, SetSeqFuncs foo,
+ size_t *lensum)
+{
+ size_t n1, n2;
+ void *strings1 = NULL;
+ void *strings2 = NULL;
+ size_t *sizes1 = NULL;
+ size_t *sizes2 = NULL;
+ PyObject *strlist1;
+ PyObject *strlist2;
+ PyObject *strseq1;
+ PyObject *strseq2;
+ int stringtype1, stringtype2;
+ double r = -1.0;
+
+ if (!PyArg_UnpackTuple(args, PYARGCFIX(name), 2, 2, &strlist1, &strlist2))
+ return r;
+
+ if (!PySequence_Check(strlist1)) {
+ PyErr_Format(PyExc_TypeError,
+ "%s first argument must be a Sequence", name);
+ return r;
+ }
+ if (!PySequence_Check(strlist2)) {
+ PyErr_Format(PyExc_TypeError,
+ "%s second argument must be a Sequence", name);
+ return r;
+ }
+
+ strseq1 = PySequence_Fast(strlist1, name);
+ strseq2 = PySequence_Fast(strlist2, name);
+
+ n1 = PySequence_Fast_GET_SIZE(strseq1);
+ n2 = PySequence_Fast_GET_SIZE(strseq2);
+ *lensum = n1 + n2;
+ if (n1 == 0) {
+ Py_DECREF(strseq1);
+ Py_DECREF(strseq2);
+ return (double)n2;
+ }
+ if (n2 == 0) {
+ Py_DECREF(strseq1);
+ Py_DECREF(strseq2);
+ return (double)n1;
+ }
+
+ stringtype1 = extract_stringlist(strseq1, name, n1, &sizes1, &strings1);
+ Py_DECREF(strseq1);
+ if (stringtype1 < 0) {
+ Py_DECREF(strseq2);
+ return r;
+ }
+ stringtype2 = extract_stringlist(strseq2, name, n2, &sizes2, &strings2);
+ Py_DECREF(strseq2);
+ if (stringtype2 < 0) {
+ free(sizes1);
+ free(strings1);
+ return r;
+ }
+
+ if (stringtype1 != stringtype2) {
+ PyErr_Format(PyExc_TypeError,
+ "%s both sequences must consist of items of the same type",
+ name);
+ }
+ else if (stringtype1 == 0) {
+ r = foo.s(n1, sizes1, strings1, n2, sizes2, strings2);
+ if (r < 0.0)
+ PyErr_NoMemory();
+ }
+ else if (stringtype1 == 1) {
+ r = foo.u(n1, sizes1, strings1, n2, sizes2, strings2);
+ if (r < 0.0)
+ PyErr_NoMemory();
+ }
+ else
+ PyErr_Format(PyExc_SystemError, "%s internal error", name);
+
+ free(strings1);
+ free(strings2);
+ free(sizes1);
+ free(sizes2);
+ return r;
+}
+
+static inline LevEditType
+string_to_edittype(PyObject *string)
+{
+ const char *s;
+ size_t i, len;
+
+ for (i = 0; i < N_OPCODE_NAMES; i++) {
+ if (string == opcode_names[i].pystring)
+ return i;
+ }
+
+ /* With Python >= 2.2, we shouldn't get here, except when the strings are
+ * not Strings but subtypes. */
+#ifdef LEV_PYTHON3
+ /* For Python 3, the string is an unicode object; use CompareWithAsciiString */
+ if (!PyUnicode_Check(string)) {
+ return LEV_EDIT_LAST;
+ }
+
+ for (i = 0; i < N_OPCODE_NAMES; i++) {
+ if (PyUnicode_CompareWithASCIIString(string, opcode_names[i].cstring) == 0) {
+ return i;
+ }
+ }
+
+#else
+
+ if (!PyString_Check(string))
+ return LEV_EDIT_LAST;
+
+ s = PyString_AS_STRING(string);
+ len = PyString_GET_SIZE(string);
+ for (i = 0; i < N_OPCODE_NAMES; i++) {
+ if (len == opcode_names[i].len
+ && memcmp(s, opcode_names[i].cstring, len) == 0) {
+ return i;
+ }
+ }
+#endif
+
+ return LEV_EDIT_LAST;
+}
+
+static LevEditOp*
+extract_editops(PyObject *list)
+{
+ LevEditOp *ops;
+ size_t i;
+ LevEditType type;
+ size_t n = PyList_GET_SIZE(list);
+
+ ops = (LevEditOp*)malloc(n*sizeof(LevEditOp));
+ if (!ops)
+ return (LevEditOp*)PyErr_NoMemory();
+ for (i = 0; i < n; i++) {
+ PyObject *item;
+ PyObject *tuple = PyList_GET_ITEM(list, i);
+
+ if (!PyTuple_Check(tuple) || PyTuple_GET_SIZE(tuple) != 3) {
+ free(ops);
+ return NULL;
+ }
+ item = PyTuple_GET_ITEM(tuple, 0);
+ if ((type = string_to_edittype(item)) == LEV_EDIT_LAST) {
+ free(ops);
+ return NULL;
+ }
+ ops[i].type = type;
+ item = PyTuple_GET_ITEM(tuple, 1);
+ if (!PyInt_Check(item)) {
+ free(ops);
+ return NULL;
+ }
+ ops[i].spos = (size_t)PyInt_AS_LONG(item);
+ item = PyTuple_GET_ITEM(tuple, 2);
+ if (!PyInt_Check(item)) {
+ free(ops);
+ return NULL;
+ }
+ ops[i].dpos = (size_t)PyInt_AS_LONG(item);
+ }
+ return ops;
+}
+
+static LevOpCode*
+extract_opcodes(PyObject *list)
+{
+ LevOpCode *bops;
+ size_t i;
+ LevEditType type;
+ size_t nb = PyList_GET_SIZE(list);
+
+ bops = (LevOpCode*)malloc(nb*sizeof(LevOpCode));
+ if (!bops)
+ return (LevOpCode*)PyErr_NoMemory();
+ for (i = 0; i < nb; i++) {
+ PyObject *item;
+ PyObject *tuple = PyList_GET_ITEM(list, i);
+
+ if (!PyTuple_Check(tuple) || PyTuple_GET_SIZE(tuple) != 5) {
+ free(bops);
+ return NULL;
+ }
+
+ item = PyTuple_GET_ITEM(tuple, 0);
+ if ((type = string_to_edittype(item)) == LEV_EDIT_LAST) {
+ free(bops);
+ return NULL;
+ }
+ bops[i].type = type;
+
+ item = PyTuple_GET_ITEM(tuple, 1);
+ if (!PyInt_Check(item)) {
+ free(bops);
+ return NULL;
+ }
+ bops[i].sbeg = (size_t)PyInt_AS_LONG(item);
+
+ item = PyTuple_GET_ITEM(tuple, 2);
+ if (!PyInt_Check(item)) {
+ free(bops);
+ return NULL;
+ }
+ bops[i].send = (size_t)PyInt_AS_LONG(item);
+
+ item = PyTuple_GET_ITEM(tuple, 3);
+ if (!PyInt_Check(item)) {
+ free(bops);
+ return NULL;
+ }
+ bops[i].dbeg = (size_t)PyInt_AS_LONG(item);
+
+ item = PyTuple_GET_ITEM(tuple, 4);
+ if (!PyInt_Check(item)) {
+ free(bops);
+ return NULL;
+ }
+ bops[i].dend = (size_t)PyInt_AS_LONG(item);
+ }
+ return bops;
+}
+
+static PyObject*
+editops_to_tuple_list(size_t n, LevEditOp *ops)
+{
+ PyObject *list;
+ size_t i;
+
+ list = PyList_New(n);
+ for (i = 0; i < n; i++, ops++) {
+ PyObject *tuple = PyTuple_New(3);
+ PyObject *is = opcode_names[ops->type].pystring;
+ Py_INCREF(is);
+ PyTuple_SET_ITEM(tuple, 0, is);
+ PyTuple_SET_ITEM(tuple, 1, PyInt_FromLong((long)ops->spos));
+ PyTuple_SET_ITEM(tuple, 2, PyInt_FromLong((long)ops->dpos));
+ PyList_SET_ITEM(list, i, tuple);
+ }
+
+ return list;
+}
+
+static PyObject*
+matching_blocks_to_tuple_list(size_t len1, size_t len2,
+ size_t nmb, LevMatchingBlock *mblocks)
+{
+ PyObject *list, *tuple;
+ size_t i;
+
+ list = PyList_New(nmb + 1);
+ for (i = 0; i < nmb; i++, mblocks++) {
+ tuple = PyTuple_New(3);
+ PyTuple_SET_ITEM(tuple, 0, PyInt_FromLong((long)mblocks->spos));
+ PyTuple_SET_ITEM(tuple, 1, PyInt_FromLong((long)mblocks->dpos));
+ PyTuple_SET_ITEM(tuple, 2, PyInt_FromLong((long)mblocks->len));
+ PyList_SET_ITEM(list, i, tuple);
+ }
+ tuple = PyTuple_New(3);
+ PyTuple_SET_ITEM(tuple, 0, PyInt_FromLong((long)len1));
+ PyTuple_SET_ITEM(tuple, 1, PyInt_FromLong((long)len2));
+ PyTuple_SET_ITEM(tuple, 2, PyInt_FromLong((long)0));
+ PyList_SET_ITEM(list, nmb, tuple);
+
+ return list;
+}
+
+static size_t
+get_length_of_anything(PyObject *object)
+{
+ if (PyInt_Check(object)) {
+ long int len = PyInt_AS_LONG(object);
+ if (len < 0)
+ len = -1;
+ return (size_t)len;
+ }
+ if (PySequence_Check(object))
+ return PySequence_Length(object);
+ return (size_t)-1;
+}
+
+/*
+ * editops(source_string, destination_string)
+ * editops(edit_operations, source_length, destination_length)
+ *
+ * Two-argument form: find the sequence of edit operations transforming the
+ * first string into the second (both byte strings or both Unicode strings).
+ * Three-argument form: convert a list of opcodes to editops; a list that is
+ * already editops is returned unchanged (identity).
+ * Returns a new list of tuples built by editops_to_tuple_list().
+ */
+static PyObject*
+editops_py(PyObject *self, PyObject *args)
+{
+  PyObject *arg1, *arg2, *arg3 = NULL;
+  PyObject *oplist;
+  size_t len1, len2, n;
+  LevEditOp *ops;
+  LevOpCode *bops;
+
+  if (!PyArg_UnpackTuple(args, PYARGCFIX("editops"), 2, 3,
+                         &arg1, &arg2, &arg3)) {
+    return NULL;
+  }
+
+  /* convert: we were called (bops, s1, s2) */
+  if (arg3) {
+    if (!PyList_Check(arg1)) {
+      PyErr_Format(PyExc_ValueError,
+                   "editops first argument must be a List of edit operations");
+      return NULL;
+    }
+    n = PyList_GET_SIZE(arg1);
+    /* an empty list converts to itself */
+    if (!n) {
+      Py_INCREF(arg1);
+      return arg1;
+    }
+    /* the 2nd/3rd arguments may be either the strings themselves or
+     * plain integer lengths -- see get_length_of_anything() */
+    len1 = get_length_of_anything(arg2);
+    len2 = get_length_of_anything(arg3);
+    if (len1 == (size_t)-1 || len2 == (size_t)-1) {
+      PyErr_Format(PyExc_ValueError,
+                   "editops second and third argument must specify sizes");
+      return NULL;
+    }
+
+    /* try to interpret the list as opcodes first, then as editops */
+    if ((bops = extract_opcodes(arg1)) != NULL) {
+      if (lev_opcodes_check_errors(len1, len2, n, bops)) {
+        PyErr_Format(PyExc_ValueError,
+                     "editops edit operation list is invalid");
+        free(bops);
+        return NULL;
+      }
+      ops = lev_opcodes_to_editops(n, bops, &n, 0); /* XXX: different n's! */
+      if (!ops && n) {
+        free(bops);
+        return PyErr_NoMemory();
+      }
+      oplist = editops_to_tuple_list(n, ops);
+      free(ops);
+      free(bops);
+      return oplist;
+    }
+    if ((ops = extract_editops(arg1)) != NULL) {
+      if (lev_editops_check_errors(len1, len2, n, ops)) {
+        PyErr_Format(PyExc_ValueError,
+                     "editops edit operation list is invalid");
+        free(ops);
+        return NULL;
+      }
+      free(ops);
+      Py_INCREF(arg1); /* editops -> editops is identity */
+      return arg1;
+    }
+    /* extract_* may have set its own exception already */
+    if (!PyErr_Occurred())
+      PyErr_Format(PyExc_TypeError,
+                   "editops first argument must be a List of edit operations");
+    return NULL;
+  }
+
+  /* find editops: we were called (s1, s2) */
+  if (PyObject_TypeCheck(arg1, &PyString_Type)
+      && PyObject_TypeCheck(arg2, &PyString_Type)) {
+    lev_byte *string1, *string2;
+
+    len1 = PyString_GET_SIZE(arg1);
+    len2 = PyString_GET_SIZE(arg2);
+    string1 = PyString_AS_STRING(arg1);
+    string2 = PyString_AS_STRING(arg2);
+    ops = lev_editops_find(len1, string1, len2, string2, &n);
+  }
+  else if (PyObject_TypeCheck(arg1, &PyUnicode_Type)
+           && PyObject_TypeCheck(arg2, &PyUnicode_Type)) {
+    Py_UNICODE *string1, *string2;
+
+    len1 = PyUnicode_GET_SIZE(arg1);
+    len2 = PyUnicode_GET_SIZE(arg2);
+    string1 = PyUnicode_AS_UNICODE(arg1);
+    string2 = PyUnicode_AS_UNICODE(arg2);
+    ops = lev_u_editops_find(len1, string1, len2, string2, &n);
+  }
+  else {
+    PyErr_Format(PyExc_TypeError,
+                 "editops expected two Strings or two Unicodes");
+    return NULL;
+  }
+  /* NULL with nonzero n means allocation failure inside lev_*_find */
+  if (!ops && n)
+    return PyErr_NoMemory();
+  oplist = editops_to_tuple_list(n, ops);
+  free(ops);
+  return oplist;
+}
+
+/* Build a Python list of 5-tuples (tag, sbeg, send, dbeg, dend) from an
+ * array of @nb opcodes.  The tag is the interned opcode name string. */
+static PyObject*
+opcodes_to_tuple_list(size_t nb, LevOpCode *bops)
+{
+  PyObject *result = PyList_New(nb);
+  size_t k;
+
+  for (k = 0; k < nb; k++) {
+    const LevOpCode *op = bops + k;
+    PyObject *name = opcode_names[op->type].pystring;
+    PyObject *item = PyTuple_New(5);
+
+    Py_INCREF(name);  /* SET_ITEM steals a reference to the shared name */
+    PyTuple_SET_ITEM(item, 0, name);
+    PyTuple_SET_ITEM(item, 1, PyInt_FromLong((long)op->sbeg));
+    PyTuple_SET_ITEM(item, 2, PyInt_FromLong((long)op->send));
+    PyTuple_SET_ITEM(item, 3, PyInt_FromLong((long)op->dbeg));
+    PyTuple_SET_ITEM(item, 4, PyInt_FromLong((long)op->dend));
+    PyList_SET_ITEM(result, k, item);
+  }
+
+  return result;
+}
+
+/*
+ * opcodes(source_string, destination_string)
+ * opcodes(edit_operations, source_length, destination_length)
+ *
+ * Two-argument form: find the blocks of edit operations (opcodes)
+ * transforming the first string into the second.
+ * Three-argument form: convert a list of editops to opcodes; a list that is
+ * already opcodes is returned unchanged (identity).
+ * Returns a new list of 5-tuples built by opcodes_to_tuple_list().
+ */
+static PyObject*
+opcodes_py(PyObject *self, PyObject *args)
+{
+  PyObject *arg1, *arg2, *arg3 = NULL;
+  PyObject *oplist;
+  size_t len1, len2, n, nb;
+  LevEditOp *ops;
+  LevOpCode *bops;
+
+  if (!PyArg_UnpackTuple(args, PYARGCFIX("opcodes"), 2, 3,
+                         &arg1, &arg2, &arg3))
+    return NULL;
+
+  /* convert: we were called (ops, s1, s2) */
+  if (arg3) {
+    if (!PyList_Check(arg1)) {
+      PyErr_Format(PyExc_TypeError,
+                   "opcodes first argument must be a List of edit operations");
+      return NULL;
+    }
+    n = PyList_GET_SIZE(arg1);
+    /* the 2nd/3rd arguments may be either the strings themselves or
+     * plain integer lengths -- see get_length_of_anything() */
+    len1 = get_length_of_anything(arg2);
+    len2 = get_length_of_anything(arg3);
+    if (len1 == (size_t)-1 || len2 == (size_t)-1) {
+      PyErr_Format(PyExc_ValueError,
+                   "opcodes second and third argument must specify sizes");
+      return NULL;
+    }
+
+    /* try to interpret the list as editops first, then as opcodes */
+    if ((ops = extract_editops(arg1)) != NULL) {
+      if (lev_editops_check_errors(len1, len2, n, ops)) {
+        PyErr_Format(PyExc_ValueError,
+                     "opcodes edit operation list is invalid");
+        free(ops);
+        return NULL;
+      }
+      bops = lev_editops_to_opcodes(n, ops, &n, len1, len2); /* XXX: n != n */
+      if (!bops && n) {
+        free(ops);
+        return PyErr_NoMemory();
+      }
+      oplist = opcodes_to_tuple_list(n, bops);
+      free(bops);
+      free(ops);
+      return oplist;
+    }
+    if ((bops = extract_opcodes(arg1)) != NULL) {
+      if (lev_opcodes_check_errors(len1, len2, n, bops)) {
+        PyErr_Format(PyExc_ValueError,
+                     "opcodes edit operation list is invalid");
+        free(bops);
+        return NULL;
+      }
+      free(bops);
+      Py_INCREF(arg1); /* opcodes -> opcodes is identity */
+      return arg1;
+    }
+    /* extract_* may have set its own exception already */
+    if (!PyErr_Occurred())
+      PyErr_Format(PyExc_TypeError,
+                   "opcodes first argument must be a List of edit operations");
+    return NULL;
+  }
+
+  /* find opcodes: we were called (s1, s2) */
+  if (PyObject_TypeCheck(arg1, &PyString_Type)
+      && PyObject_TypeCheck(arg2, &PyString_Type)) {
+    lev_byte *string1, *string2;
+
+    len1 = PyString_GET_SIZE(arg1);
+    len2 = PyString_GET_SIZE(arg2);
+    string1 = PyString_AS_STRING(arg1);
+    string2 = PyString_AS_STRING(arg2);
+    ops = lev_editops_find(len1, string1, len2, string2, &n);
+  }
+  else if (PyObject_TypeCheck(arg1, &PyUnicode_Type)
+           && PyObject_TypeCheck(arg2, &PyUnicode_Type)) {
+    Py_UNICODE *string1, *string2;
+
+    len1 = PyUnicode_GET_SIZE(arg1);
+    len2 = PyUnicode_GET_SIZE(arg2);
+    string1 = PyUnicode_AS_UNICODE(arg1);
+    string2 = PyUnicode_AS_UNICODE(arg2);
+    ops = lev_u_editops_find(len1, string1, len2, string2, &n);
+  }
+  else {
+    PyErr_Format(PyExc_TypeError,
+                 "opcodes expected two Strings or two Unicodes");
+    return NULL;
+  }
+  /* NULL with nonzero n means allocation failure inside lev_*_find */
+  if (!ops && n)
+    return PyErr_NoMemory();
+  /* editops found first, then condensed into opcode blocks */
+  bops = lev_editops_to_opcodes(n, ops, &nb, len1, len2);
+  free(ops);
+  if (!bops && nb)
+    return PyErr_NoMemory();
+  oplist = opcodes_to_tuple_list(nb, bops);
+  free(bops);
+  return oplist;
+}
+
+/*
+ * inverse(edit_operations)
+ *
+ * Invert the sense of a list of edit operations (editops or opcodes), so
+ * that it describes the reverse string transformation.  An empty list is
+ * returned unchanged.
+ */
+static PyObject*
+inverse_py(PyObject *self, PyObject *args)
+{
+  PyObject *list, *result;
+  size_t n;
+  LevEditOp *ops;
+  LevOpCode *bops;
+
+  if (!PyArg_UnpackTuple(args, PYARGCFIX("inverse"), 1, 1, &list))
+    return NULL;
+
+  /* The list check used to be folded into the condition above, returning
+   * NULL without setting an exception (which surfaces as SystemError);
+   * report a proper TypeError instead. */
+  if (!PyList_Check(list)) {
+    PyErr_Format(PyExc_TypeError,
+                 "inverse expected a list of edit operations");
+    return NULL;
+  }
+
+  n = PyList_GET_SIZE(list);
+  if (!n) {
+    Py_INCREF(list);
+    return list;
+  }
+  /* try editops first, then opcodes; inversion happens in place on the
+   * extracted C array */
+  if ((ops = extract_editops(list)) != NULL) {
+    lev_editops_invert(n, ops);
+    result = editops_to_tuple_list(n, ops);
+    free(ops);
+    return result;
+  }
+  if ((bops = extract_opcodes(list)) != NULL) {
+    lev_opcodes_invert(n, bops);
+    result = opcodes_to_tuple_list(n, bops);
+    free(bops);
+    return result;
+  }
+
+  if (!PyErr_Occurred())
+    PyErr_Format(PyExc_TypeError,
+                 "inverse expected a list of edit operations");
+  return NULL;
+}
+
+/*
+ * apply_edit(edit_operations, source_string, destination_string)
+ *
+ * Apply a list of edit operations (editops or opcodes) to the source
+ * string, taking insertion/replacement material from the destination
+ * string, and return the resulting string.  Both strings must be of the
+ * same kind (both Strings or both Unicodes).
+ */
+static PyObject*
+apply_edit_py(PyObject *self, PyObject *args)
+{
+  PyObject *list, *result, *arg1, *arg2;
+  size_t n, len, len1, len2;
+  LevEditOp *ops;
+  LevOpCode *bops;
+
+  if (!PyArg_UnpackTuple(args, PYARGCFIX("apply_edit"), 3, 3,
+                         &list, &arg1, &arg2))
+    return NULL;
+
+  if (!PyList_Check(list)) {
+    PyErr_Format(PyExc_TypeError,
+                 "apply_edit first argument must be a List of edit operations");
+    return NULL;
+  }
+  n = PyList_GET_SIZE(list);
+
+  if (PyObject_TypeCheck(arg1, &PyString_Type)
+      && PyObject_TypeCheck(arg2, &PyString_Type)) {
+    lev_byte *string1, *string2, *s;
+
+    /* an empty operation list is the identity transformation */
+    if (!n) {
+      Py_INCREF(arg1);
+      return arg1;
+    }
+    len1 = PyString_GET_SIZE(arg1);
+    len2 = PyString_GET_SIZE(arg2);
+    string1 = PyString_AS_STRING(arg1);
+    string2 = PyString_AS_STRING(arg2);
+
+    /* try editops first, then opcodes */
+    if ((ops = extract_editops(list)) != NULL) {
+      if (lev_editops_check_errors(len1, len2, n, ops)) {
+        PyErr_Format(PyExc_ValueError,
+                     "apply_edit edit operations are invalid or inapplicable");
+        free(ops);
+        return NULL;
+      }
+      s = lev_editops_apply(len1, string1, len2, string2,
+                            n, ops, &len);
+      free(ops);
+      /* NULL with nonzero len means allocation failure in the C core */
+      if (!s && len)
+        return PyErr_NoMemory();
+      result = PyString_FromStringAndSize(s, len);
+      free(s);
+      return result;
+    }
+    if ((bops = extract_opcodes(list)) != NULL) {
+      if (lev_opcodes_check_errors(len1, len2, n, bops)) {
+        PyErr_Format(PyExc_ValueError,
+                     "apply_edit edit operations are invalid or inapplicable");
+        free(bops);
+        return NULL;
+      }
+      s = lev_opcodes_apply(len1, string1, len2, string2,
+                            n, bops, &len);
+      free(bops);
+      if (!s && len)
+        return PyErr_NoMemory();
+      result = PyString_FromStringAndSize(s, len);
+      free(s);
+      return result;
+    }
+
+    /* extract_* may have set its own exception already */
+    if (!PyErr_Occurred())
+      PyErr_Format(PyExc_TypeError,
+                   "apply_edit first argument must be "
+                   "a list of edit operations");
+    return NULL;
+  }
+  if (PyObject_TypeCheck(arg1, &PyUnicode_Type)
+      && PyObject_TypeCheck(arg2, &PyUnicode_Type)) {
+    Py_UNICODE *string1, *string2, *s;
+
+    /* an empty operation list is the identity transformation */
+    if (!n) {
+      Py_INCREF(arg1);
+      return arg1;
+    }
+    len1 = PyUnicode_GET_SIZE(arg1);
+    len2 = PyUnicode_GET_SIZE(arg2);
+    string1 = PyUnicode_AS_UNICODE(arg1);
+    string2 = PyUnicode_AS_UNICODE(arg2);
+
+    /* try editops first, then opcodes */
+    if ((ops = extract_editops(list)) != NULL) {
+      if (lev_editops_check_errors(len1, len2, n, ops)) {
+        PyErr_Format(PyExc_ValueError,
+                     "apply_edit edit operations are invalid or inapplicable");
+        free(ops);
+        return NULL;
+      }
+      s = lev_u_editops_apply(len1, string1, len2, string2,
+                              n, ops, &len);
+      free(ops);
+      if (!s && len)
+        return PyErr_NoMemory();
+      result = PyUnicode_FromUnicode(s, len);
+      free(s);
+      return result;
+    }
+    if ((bops = extract_opcodes(list)) != NULL) {
+      if (lev_opcodes_check_errors(len1, len2, n, bops)) {
+        PyErr_Format(PyExc_ValueError,
+                     "apply_edit edit operations are invalid or inapplicable");
+        free(bops);
+        return NULL;
+      }
+      s = lev_u_opcodes_apply(len1, string1, len2, string2,
+                              n, bops, &len);
+      free(bops);
+      if (!s && len)
+        return PyErr_NoMemory();
+      result = PyUnicode_FromUnicode(s, len);
+      free(s);
+      return result;
+    }
+
+    if (!PyErr_Occurred())
+      PyErr_Format(PyExc_TypeError,
+                   "apply_edit first argument must be "
+                   "a list of edit operations");
+    return NULL;
+  }
+
+  PyErr_Format(PyExc_TypeError,
+               "apply_edit expected two Strings or two Unicodes");
+  return NULL;
+}
+
+/*
+ * matching_blocks(edit_operations, source_length, destination_length)
+ *
+ * Translate a list of edit operations (editops or opcodes) into the
+ * matching blocks of two strings whose lengths (or the strings themselves)
+ * are given as the second and third arguments.
+ */
+static PyObject*
+matching_blocks_py(PyObject *self, PyObject *args)
+{
+  PyObject *list, *arg1, *arg2, *result;
+  size_t n, nmb, len1, len2;
+  LevEditOp *ops;
+  LevOpCode *bops;
+  LevMatchingBlock *mblocks;
+
+  if (!PyArg_UnpackTuple(args, PYARGCFIX("matching_blocks"), 3, 3,
+                         &list, &arg1, &arg2))
+    return NULL;
+
+  /* The list check used to be folded into the condition above, which
+   * returned NULL without setting an exception (surfacing as SystemError)
+   * and made the explicit check below unreachable; do it properly here. */
+  if (!PyList_Check(list)) {
+    PyErr_Format(PyExc_TypeError,
+                 "matching_blocks first argument must be "
+                 "a List of edit operations");
+    return NULL;
+  }
+  n = PyList_GET_SIZE(list);
+  len1 = get_length_of_anything(arg1);
+  len2 = get_length_of_anything(arg2);
+  if (len1 == (size_t)-1 || len2 == (size_t)-1) {
+    PyErr_Format(PyExc_ValueError,
+                 "matching_blocks second and third argument "
+                 "must specify sizes");
+    return NULL;
+  }
+
+  /* try editops first, then opcodes */
+  if ((ops = extract_editops(list)) != NULL) {
+    if (lev_editops_check_errors(len1, len2, n, ops)) {
+      /* message used to say "apply_edit" -- copy-paste slip */
+      PyErr_Format(PyExc_ValueError,
+                   "matching_blocks edit operations are invalid or inapplicable");
+      free(ops);
+      return NULL;
+    }
+    mblocks = lev_editops_matching_blocks(len1, len2, n, ops, &nmb);
+    free(ops);
+    if (!mblocks && nmb)
+      return PyErr_NoMemory();
+    result = matching_blocks_to_tuple_list(len1, len2, nmb, mblocks);
+    free(mblocks);
+    return result;
+  }
+  if ((bops = extract_opcodes(list)) != NULL) {
+    if (lev_opcodes_check_errors(len1, len2, n, bops)) {
+      PyErr_Format(PyExc_ValueError,
+                   "matching_blocks edit operations are invalid or inapplicable");
+      free(bops);
+      return NULL;
+    }
+    mblocks = lev_opcodes_matching_blocks(len1, len2, n, bops, &nmb);
+    free(bops);
+    if (!mblocks && nmb)
+      return PyErr_NoMemory();
+    result = matching_blocks_to_tuple_list(len1, len2, nmb, mblocks);
+    free(mblocks);
+    return result;
+  }
+
+  if (!PyErr_Occurred())
+    /* message used to say "inverse" -- copy-paste slip */
+    PyErr_Format(PyExc_TypeError,
+                 "matching_blocks expected a list of edit operations");
+  return NULL;
+}
+
+/*
+ * subtract_edit(edit_operations, subsequence)
+ *
+ * Subtract a subsequence of editops from an editops list and return the
+ * remainder.  Subtracting an empty subsequence returns the original list;
+ * an invalid or non-subsequence second argument raises ValueError.
+ */
+static PyObject*
+subtract_edit_py(PyObject *self, PyObject *args)
+{
+  PyObject *list, *sub, *result;
+  size_t n, ns, nr;
+  LevEditOp *ops, *osub, *orem;
+
+  if (!PyArg_UnpackTuple(args, PYARGCFIX("subtract_edit"), 2, 2, &list, &sub))
+    return NULL;
+
+  /* Validate both arguments before touching them: PyList_GET_SIZE() on a
+   * non-list is undefined behaviour (@sub was never checked), and the old
+   * combined check returned NULL without setting an exception. */
+  if (!PyList_Check(list) || !PyList_Check(sub)) {
+    PyErr_Format(PyExc_TypeError,
+                 "subtract_edit expected two lists of edit operations");
+    return NULL;
+  }
+
+  ns = PyList_GET_SIZE(sub);
+  if (!ns) {
+    Py_INCREF(list);
+    return list;
+  }
+  n = PyList_GET_SIZE(list);
+  /* a nonempty subsequence cannot be subtracted from an empty list */
+  if (!n) {
+    PyErr_Format(PyExc_ValueError,
+                 "subtract_edit subsequence is not a subsequence "
+                 "or is invalid");
+    return NULL;
+  }
+
+  if ((ops = extract_editops(list)) != NULL) {
+    if ((osub = extract_editops(sub)) != NULL) {
+      orem = lev_editops_subtract(n, ops, ns, osub, &nr);
+      free(ops);
+      free(osub);
+
+      /* NULL result with nr == (size_t)-1 signals "not a subsequence" */
+      if (!orem && nr == (size_t)-1) {
+        PyErr_Format(PyExc_ValueError,
+                     "subtract_edit subsequence is not a subsequence "
+                     "or is invalid");
+        return NULL;
+      }
+      result = editops_to_tuple_list(nr, orem);
+      free(orem);
+
+      return result;
+    }
+    free(ops);
+  }
+
+  if (!PyErr_Occurred())
+    PyErr_Format(PyExc_TypeError,
+                 "subtract_edit expected two lists of edit operations");
+  return NULL;
+}
+
+
+/* Module initialization: create the _levenshtein module, intern the edit
+ * operation name strings used when building editops/opcodes tuples, and
+ * seed the random generator with the default seed. */
+PY_MOD_INIT_FUNC_DEF(_levenshtein)
+{
+#ifdef LEV_PYTHON3
+  PyObject *module;
+#endif
+  size_t i;
+
+  PY_INIT_MOD(module, "_levenshtein", Levenshtein_DESC, methods)
+  /* create intern strings for edit operation names */
+  if (opcode_names[0].pystring)
+    abort();  /* a second initialization would clobber the interned names */
+  for (i = 0; i < N_OPCODE_NAMES; i++) {
+#ifdef LEV_PYTHON3
+    opcode_names[i].pystring
+      = PyUnicode_InternFromString(opcode_names[i].cstring);
+#else
+    opcode_names[i].pystring
+      = PyString_InternFromString(opcode_names[i].cstring);
+#endif
+    /* cache the C string length alongside the interned object */
+    opcode_names[i].len = strlen(opcode_names[i].cstring);
+  }
+  lev_init_rng(0);
+#ifdef LEV_PYTHON3
+  return module;
+#endif
+}
+/* }}} */
+#endif /* not NO_PYTHON */
+
+/****************************************************************************
+ *
+ * C (i.e. executive) part
+ *
+ ****************************************************************************/
+
+/****************************************************************************
+ *
+ * Taus113
+ *
+ ****************************************************************************/
+/* {{{ */
+
+/*
+ * Based on Tausworthe random generator implementation rng/taus113.c
+ * from the GNU Scientific Library (http://sources.redhat.com/gsl/)
+ * which has the notice
+ * Copyright (C) 2002 Atakan Gurkan
+ * Based on the file taus.c which has the notice
+ * Copyright (C) 1996, 1997, 1998, 1999, 2000 James Theiler, Brian Gough
+ */
+
+/* Advance the Taus113 generator @state by one step and return the next
+ * pseudo-random value.  Each of the four Tausworthe components z1..z4 is
+ * updated with its own shift/mask recurrence (masked by TAUS113_MASK) and
+ * the output is their XOR combination. */
+static inline unsigned long
+taus113_get(taus113_state_t *state)
+{
+  unsigned long b1, b2, b3, b4;
+
+  b1 = ((((state->z1 << 6UL) & TAUS113_MASK) ^ state->z1) >> 13UL);
+  state->z1 = ((((state->z1 & 4294967294UL) << 18UL) & TAUS113_MASK) ^ b1);
+
+  b2 = ((((state->z2 << 2UL) & TAUS113_MASK) ^ state->z2) >> 27UL);
+  state->z2 = ((((state->z2 & 4294967288UL) << 2UL) & TAUS113_MASK) ^ b2);
+
+  b3 = ((((state->z3 << 13UL) & TAUS113_MASK) ^ state->z3) >> 21UL);
+  state->z3 = ((((state->z3 & 4294967280UL) << 7UL) & TAUS113_MASK) ^ b3);
+
+  b4 = ((((state->z4 << 3UL) & TAUS113_MASK) ^ state->z4) >> 12UL);
+  state->z4 = ((((state->z4 & 4294967168UL) << 13UL) & TAUS113_MASK) ^ b4);
+
+  return (state->z1 ^ state->z2 ^ state->z3 ^ state->z4);
+
+}
+
+/* Seed the Taus113 generator @state from @s (zero selects the default
+ * seed 1), then warm it up so the recurrence condition holds. */
+static void
+taus113_set(taus113_state_t *state,
+            unsigned long int s)
+{
+  int warmup;
+
+  if (!s)
+    s = 1UL; /* default seed is 1 */
+
+  /* Chain an LCG through the four components; each z_k must be at least
+   * the minimum its recurrence requires (2, 8, 16, 128 respectively). */
+  state->z1 = TAUS113_LCG(s);
+  if (state->z1 < 2UL)
+    state->z1 += 2UL;
+
+  state->z2 = TAUS113_LCG(state->z1);
+  if (state->z2 < 8UL)
+    state->z2 += 8UL;
+
+  state->z3 = TAUS113_LCG(state->z2);
+  if (state->z3 < 16UL)
+    state->z3 += 16UL;
+
+  state->z4 = TAUS113_LCG(state->z3);
+  if (state->z4 < 128UL)
+    state->z4 += 128UL;
+
+  /* Calling RNG ten times to satisfy recurrence condition */
+  for (warmup = 0; warmup < 10; warmup++)
+    taus113_get(state);
+}
+
+
+/**
+ * lev_init_rng:
+ * @seed: A seed. If zero, a default value (currently 1) is used instead.
+ *
+ * Initializes the random generator used by some Levenshtein functions.
+ *
+ * This does NOT happen automatically when these functions are used.
+ **/
+_LEV_STATIC_PY void
+lev_init_rng(unsigned long int seed)
+{
+  /* Single shared generator state for the whole process.  NOTE(review):
+   * no locking anywhere around it, so concurrent use is presumably not
+   * thread-safe -- confirm before using from multiple threads. */
+  static taus113_state_t state;
+
+  taus113_set(&state, seed);
+}
+/* }}} */
+
+/****************************************************************************
+ *
+ * Basic stuff, Levenshtein distance
+ *
+ ****************************************************************************/
+/* {{{ */
+
+/**
+ * lev_edit_distance:
+ * @len1: The length of @string1.
+ * @string1: A sequence of bytes of length @len1, may contain NUL characters.
+ * @len2: The length of @string2.
+ * @string2: A sequence of bytes of length @len2, may contain NUL characters.
+ * @xcost: If nonzero, the replace operation has weight 2, otherwise all
+ * edit operations have equal weights of 1.
+ *
+ * Computes Levenshtein edit distance of two strings.
+ *
+ * Returns: The edit distance.
+ **/
+_LEV_STATIC_PY size_t
+lev_edit_distance(size_t len1, const lev_byte *string1,
+                  size_t len2, const lev_byte *string2,
+                  int xcost)
+{
+  size_t i;
+  size_t *row;  /* we only need to keep one row of costs */
+  size_t *end;  /* points at the last element of row */
+  size_t half;  /* len1/2, size of the skipped corner triangles */
+
+  /* strip common prefix */
+  while (len1 > 0 && len2 > 0 && *string1 == *string2) {
+    len1--;
+    len2--;
+    string1++;
+    string2++;
+  }
+
+  /* strip common suffix */
+  while (len1 > 0 && len2 > 0 && string1[len1-1] == string2[len2-1]) {
+    len1--;
+    len2--;
+  }
+
+  /* catch trivial cases */
+  if (len1 == 0)
+    return len2;
+  if (len2 == 0)
+    return len1;
+
+  /* make the inner cycle (i.e. string2) the longer one */
+  if (len1 > len2) {
+    size_t nx = len1;
+    const lev_byte *sx = string1;
+    len1 = len2;
+    len2 = nx;
+    string1 = string2;
+    string2 = sx;
+  }
+  /* check len1 == 1 separately */
+  if (len1 == 1) {
+    /* distance is len2-1 if the remaining character occurs in string2;
+     * with xcost a miss costs one replace of weight 2 instead of 1 */
+    if (xcost)
+      return len2 + 1 - 2*(memchr(string2, *string1, len2) != NULL);
+    else
+      return len2 - (memchr(string2, *string1, len2) != NULL);
+  }
+  /* from now on lengths include the initial matrix row/column */
+  len1++;
+  len2++;
+  half = len1 >> 1;
+
+  /* initialize first row */
+  row = (size_t*)malloc(len2*sizeof(size_t));
+  if (!row)
+    return (size_t)(-1);  /* allocation failure is reported as (size_t)-1 */
+  end = row + len2 - 1;
+  for (i = 0; i < len2 - (xcost ? 0 : half); i++)
+    row[i] = i;
+
+  /* go through the matrix and compute the costs. yes, this is an extremely
+   * obfuscated version, but also extremely memory-conservative and relatively
+   * fast. */
+  if (xcost) {
+    for (i = 1; i < len1; i++) {
+      size_t *p = row + 1;
+      const lev_byte char1 = string1[i - 1];
+      const lev_byte *char2p = string2;
+      size_t D = i;  /* upper-left cell of the current position */
+      size_t x = i;  /* cost of the current cell */
+      while (p <= end) {
+        if (char1 == *(char2p++))
+          x = --D;
+        else
+          x++;
+        D = *p;
+        D++;
+        if (x > D)
+          x = D;
+        *(p++) = x;
+      }
+    }
+  }
+  else {
+    /* in this case we don't have to scan two corner triangles (of size len1/2)
+     * in the matrix because no best path can go through them. note this
+     * breaks when len1 == len2 == 2 so the memchr() special case above is
+     * necessary */
+    row[0] = len1 - half - 1;
+    for (i = 1; i < len1; i++) {
+      size_t *p;
+      const lev_byte char1 = string1[i - 1];
+      const lev_byte *char2p;
+      size_t D, x;
+      /* skip the upper triangle */
+      if (i >= len1 - half) {
+        size_t offset = i - (len1 - half);
+        size_t c3;
+
+        char2p = string2 + offset;
+        p = row + offset;
+        c3 = *(p++) + (char1 != *(char2p++));
+        x = *p;
+        x++;
+        D = x;
+        if (x > c3)
+          x = c3;
+        *(p++) = x;
+      }
+      else {
+        p = row + 1;
+        char2p = string2;
+        D = x = i;
+      }
+      /* skip the lower triangle */
+      if (i <= half + 1)
+        end = row + len2 + i - half - 2;
+      /* main */
+      while (p <= end) {
+        size_t c3 = --D + (char1 != *(char2p++));
+        x++;
+        if (x > c3)
+          x = c3;
+        D = *p;
+        D++;
+        if (x > D)
+          x = D;
+        *(p++) = x;
+      }
+      /* lower triangle sentinel */
+      if (i <= half) {
+        size_t c3 = --D + (char1 != *char2p);
+        x++;
+        if (x > c3)
+          x = c3;
+        *p = x;
+      }
+    }
+  }
+
+  /* the distance ends up in the last row element */
+  i = *end;
+  free(row);
+  return i;
+}
+
+/* Sum of weighted edit distances from @string to each of the @n @strings.
+ * Returns -1.0 if any individual distance computation fails, which
+ * lev_edit_distance() signals with (size_t)-1 (allocation failure). */
+_LEV_STATIC_PY double
+lev_edit_distance_sod(size_t len, const lev_byte *string,
+                      size_t n, const size_t *lengths,
+                      const lev_byte *strings[],
+                      const double *weights,
+                      int xcost)
+{
+  double total = 0.0;
+  size_t k;
+
+  for (k = 0; k < n; k++) {
+    size_t dist = lev_edit_distance(len, string, lengths[k], strings[k],
+                                    xcost);
+    if (dist == (size_t)-1)
+      return -1.0;
+    total += weights[k]*dist;
+  }
+  return total;
+}
+
+/**
+ * lev_u_edit_distance:
+ * @len1: The length of @string1.
+ * @string1: A sequence of Unicode characters of length @len1, may contain NUL
+ * characters.
+ * @len2: The length of @string2.
+ * @string2: A sequence of Unicode characters of length @len2, may contain NUL
+ * characters.
+ * @xcost: If nonzero, the replace operation has weight 2, otherwise all
+ * edit operations have equal weights of 1.
+ *
+ * Computes Levenshtein edit distance of two Unicode strings.
+ *
+ * Returns: The edit distance.
+ **/
+_LEV_STATIC_PY size_t
+lev_u_edit_distance(size_t len1, const lev_wchar *string1,
+                    size_t len2, const lev_wchar *string2,
+                    int xcost)
+{
+  size_t i;
+  size_t *row;  /* we only need to keep one row of costs */
+  size_t *end;  /* points at the last element of row */
+  size_t half;  /* len1/2, size of the skipped corner triangles */
+
+  /* strip common prefix */
+  while (len1 > 0 && len2 > 0 && *string1 == *string2) {
+    len1--;
+    len2--;
+    string1++;
+    string2++;
+  }
+
+  /* strip common suffix */
+  while (len1 > 0 && len2 > 0 && string1[len1-1] == string2[len2-1]) {
+    len1--;
+    len2--;
+  }
+
+  /* catch trivial cases */
+  if (len1 == 0)
+    return len2;
+  if (len2 == 0)
+    return len1;
+
+  /* make the inner cycle (i.e. string2) the longer one */
+  if (len1 > len2) {
+    size_t nx = len1;
+    const lev_wchar *sx = string1;
+    len1 = len2;
+    len2 = nx;
+    string1 = string2;
+    string2 = sx;
+  }
+  /* check len1 == 1 separately */
+  if (len1 == 1) {
+    /* manual scan instead of memchr() since lev_wchar is wider than a
+     * byte; found -> len2-1, not found -> len2 (+1 for weight-2 replace
+     * when xcost) */
+    lev_wchar z = *string1;
+    const lev_wchar *p = string2;
+    for (i = len2; i; i--) {
+      if (*(p++) == z)
+        return len2 - 1;
+    }
+    return len2 + (xcost != 0);
+  }
+  /* from now on lengths include the initial matrix row/column */
+  len1++;
+  len2++;
+  half = len1 >> 1;
+
+  /* initialize first row */
+  row = (size_t*)malloc(len2*sizeof(size_t));
+  if (!row)
+    return (size_t)(-1);  /* allocation failure is reported as (size_t)-1 */
+  end = row + len2 - 1;
+  for (i = 0; i < len2 - (xcost ? 0 : half); i++)
+    row[i] = i;
+
+  /* go through the matrix and compute the costs. yes, this is an extremely
+   * obfuscated version, but also extremely memory-conservative and relatively
+   * fast. */
+  if (xcost) {
+    for (i = 1; i < len1; i++) {
+      size_t *p = row + 1;
+      const lev_wchar char1 = string1[i - 1];
+      const lev_wchar *char2p = string2;
+      size_t D = i - 1;  /* upper-left cell of the current position */
+      size_t x = i;      /* cost of the current cell */
+      while (p <= end) {
+        if (char1 == *(char2p++))
+          x = D;
+        else
+          x++;
+        D = *p;
+        if (x > D + 1)
+          x = D + 1;
+        *(p++) = x;
+      }
+    }
+  }
+  else {
+    /* in this case we don't have to scan two corner triangles (of size len1/2)
+     * in the matrix because no best path can go through them. note this
+     * breaks when len1 == len2 == 2 so the len1 == 1 special case above is
+     * necessary */
+    row[0] = len1 - half - 1;
+    for (i = 1; i < len1; i++) {
+      size_t *p;
+      const lev_wchar char1 = string1[i - 1];
+      const lev_wchar *char2p;
+      size_t D, x;
+      /* skip the upper triangle */
+      if (i >= len1 - half) {
+        size_t offset = i - (len1 - half);
+        size_t c3;
+
+        char2p = string2 + offset;
+        p = row + offset;
+        c3 = *(p++) + (char1 != *(char2p++));
+        x = *p;
+        x++;
+        D = x;
+        if (x > c3)
+          x = c3;
+        *(p++) = x;
+      }
+      else {
+        p = row + 1;
+        char2p = string2;
+        D = x = i;
+      }
+      /* skip the lower triangle */
+      if (i <= half + 1)
+        end = row + len2 + i - half - 2;
+      /* main */
+      while (p <= end) {
+        size_t c3 = --D + (char1 != *(char2p++));
+        x++;
+        if (x > c3)
+          x = c3;
+        D = *p;
+        D++;
+        if (x > D)
+          x = D;
+        *(p++) = x;
+      }
+      /* lower triangle sentinel */
+      if (i <= half) {
+        size_t c3 = --D + (char1 != *char2p);
+        x++;
+        if (x > c3)
+          x = c3;
+        *p = x;
+      }
+    }
+  }
+
+  /* the distance ends up in the last row element */
+  i = *end;
+  free(row);
+  return i;
+}
+
+/* Sum of weighted edit distances from Unicode @string to each of the @n
+ * @strings.  Returns -1.0 if any individual distance computation fails,
+ * which lev_u_edit_distance() signals with (size_t)-1 (allocation
+ * failure). */
+_LEV_STATIC_PY double
+lev_u_edit_distance_sod(size_t len, const lev_wchar *string,
+                        size_t n, const size_t *lengths,
+                        const lev_wchar *strings[],
+                        const double *weights,
+                        int xcost)
+{
+  double total = 0.0;
+  size_t k;
+
+  for (k = 0; k < n; k++) {
+    size_t dist = lev_u_edit_distance(len, string, lengths[k], strings[k],
+                                      xcost);
+    if (dist == (size_t)-1)
+      return -1.0;
+    total += weights[k]*dist;
+  }
+  return total;
+}
+/* }}} */
+
+
+/****************************************************************************
+ *
+ * Other simple distances: Hamming, Jaro, Jaro-Winkler
+ *
+ ****************************************************************************/
+/* {{{ */
+/**
+ * lev_hamming_distance:
+ * @len: The length of @string1 and @string2.
+ * @string1: A sequence of bytes of length @len, may contain NUL characters.
+ * @string2: A sequence of bytes of length @len, may contain NUL characters.
+ *
+ * Computes Hamming distance of two strings, i.e. the number of positions
+ * at which the corresponding bytes differ.
+ *
+ * The strings must have the same length.
+ *
+ * Returns: The Hamming distance.
+ **/
+_LEV_STATIC_PY size_t
+lev_hamming_distance(size_t len,
+                     const lev_byte *string1,
+                     const lev_byte *string2)
+{
+  size_t mismatches = 0;
+  size_t pos;
+
+  for (pos = 0; pos < len; pos++) {
+    if (string1[pos] != string2[pos])
+      mismatches++;
+  }
+
+  return mismatches;
+}
+
+/**
+ * lev_u_hamming_distance:
+ * @len: The length of @string1 and @string2.
+ * @string1: A sequence of Unicode characters of length @len, may contain
+ *           NUL characters.
+ * @string2: A sequence of Unicode characters of length @len, may contain
+ *           NUL characters.
+ *
+ * Computes Hamming distance of two Unicode strings, i.e. the number of
+ * positions at which the corresponding characters differ.
+ *
+ * The strings must have the same length.
+ *
+ * Returns: The Hamming distance.
+ **/
+_LEV_STATIC_PY size_t
+lev_u_hamming_distance(size_t len,
+                       const lev_wchar *string1,
+                       const lev_wchar *string2)
+{
+  size_t mismatches = 0;
+  size_t pos;
+
+  for (pos = 0; pos < len; pos++) {
+    if (string1[pos] != string2[pos])
+      mismatches++;
+  }
+
+  return mismatches;
+}
+
+/**
+ * lev_jaro_ratio:
+ * @len1: The length of @string1.
+ * @string1: A sequence of bytes of length @len1, may contain NUL characters.
+ * @len2: The length of @string2.
+ * @string2: A sequence of bytes of length @len2, may contain NUL characters.
+ *
+ * Computes Jaro string similarity metric of two strings.
+ *
+ * Returns: The Jaro metric of @string1 and @string2; -1.0 on memory
+ *          allocation failure.
+ **/
+_LEV_STATIC_PY double
+lev_jaro_ratio(size_t len1, const lev_byte *string1,
+               size_t len2, const lev_byte *string2)
+{
+  size_t i, j, halflen, trans, match, to;
+  size_t *idx;  /* 1-based match order per string1 position, 0 = no match */
+  double md;
+
+  if (len1 == 0 || len2 == 0) {
+    if (len1 == 0 && len2 == 0)
+      return 1.0;
+    return 0.0;
+  }
+  /* make len1 always shorter (or equally long) */
+  if (len1 > len2) {
+    const lev_byte *b;
+
+    b = string1;
+    string1 = string2;
+    string2 = b;
+
+    i = len1;
+    len1 = len2;
+    len2 = i;
+  }
+
+  halflen = (len1 + 1)/2;
+  idx = (size_t*)calloc(len1, sizeof(size_t));
+  if (!idx)
+    return -1.0;
+
+  /* The literature about Jaro metric is confusing as the method of assignment
+   * of common characters is nowhere specified.  There are several possible
+   * deterministic mutual assignments of common characters of two strings.
+   * We use earliest-position method, which is however suboptimal (e.g., it
+   * gives two transpositions in jaro("Jaro", "Joaro") because of assignment
+   * of the first `o'). No reasonable algorithm for the optimal one is
+   * currently known to me. */
+  match = 0;
+  /* the part with allowed range overlapping left */
+  for (i = 0; i < halflen; i++) {
+    /* bound j by len1: for odd len1 the upper limit i+halflen can reach
+     * len1, which used to read past the end of string1 and idx */
+    for (j = 0; j <= i+halflen && j < len1; j++) {
+      if (string1[j] == string2[i] && !idx[j]) {
+        match++;
+        idx[j] = match;
+        break;
+      }
+    }
+  }
+  /* the part with allowed range overlapping right */
+  to = len1+halflen < len2 ? len1+halflen : len2;
+  for (i = halflen; i < to; i++) {
+    for (j = i - halflen; j < len1; j++) {
+      if (string1[j] == string2[i] && !idx[j]) {
+        match++;
+        idx[j] = match;
+        break;
+      }
+    }
+  }
+  if (!match) {
+    free(idx);
+    return 0.0;
+  }
+  /* count transpositions: matches assigned out of order */
+  i = 0;
+  trans = 0;
+  for (j = 0; j < len1; j++) {
+    if (idx[j]) {
+      i++;
+      if (idx[j] != i)
+        trans++;
+    }
+  }
+  free(idx);
+
+  md = (double)match;
+  return (md/len1 + md/len2 + 1.0 - trans/md/2.0)/3.0;
+}
+
+/**
+ * lev_u_jaro_ratio:
+ * @len1: The length of @string1.
+ * @string1: A sequence of Unicode characters of length @len1, may contain NUL
+ *           characters.
+ * @len2: The length of @string2.
+ * @string2: A sequence of Unicode characters of length @len2, may contain NUL
+ *           characters.
+ *
+ * Computes Jaro string similarity metric of two Unicode strings.
+ *
+ * Returns: The Jaro metric of @string1 and @string2; -1.0 on memory
+ *          allocation failure.
+ **/
+_LEV_STATIC_PY double
+lev_u_jaro_ratio(size_t len1, const lev_wchar *string1,
+                 size_t len2, const lev_wchar *string2)
+{
+  size_t i, j, halflen, trans, match, to;
+  size_t *idx;  /* 1-based match order per string1 position, 0 = no match */
+  double md;
+
+  if (len1 == 0 || len2 == 0) {
+    if (len1 == 0 && len2 == 0)
+      return 1.0;
+    return 0.0;
+  }
+  /* make len1 always shorter (or equally long) */
+  if (len1 > len2) {
+    const lev_wchar *b;
+
+    b = string1;
+    string1 = string2;
+    string2 = b;
+
+    i = len1;
+    len1 = len2;
+    len2 = i;
+  }
+
+  halflen = (len1 + 1)/2;
+  idx = (size_t*)calloc(len1, sizeof(size_t));
+  if (!idx)
+    return -1.0;
+
+  match = 0;
+  /* the part with allowed range overlapping left */
+  for (i = 0; i < halflen; i++) {
+    /* bound j by len1: for odd len1 the upper limit i+halflen can reach
+     * len1, which used to read past the end of string1 and idx */
+    for (j = 0; j <= i+halflen && j < len1; j++) {
+      if (string1[j] == string2[i] && !idx[j]) {
+        match++;
+        idx[j] = match;
+        break;
+      }
+    }
+  }
+  /* the part with allowed range overlapping right */
+  to = len1+halflen < len2 ? len1+halflen : len2;
+  for (i = halflen; i < to; i++) {
+    for (j = i - halflen; j < len1; j++) {
+      if (string1[j] == string2[i] && !idx[j]) {
+        match++;
+        idx[j] = match;
+        break;
+      }
+    }
+  }
+  if (!match) {
+    free(idx);
+    return 0.0;
+  }
+  /* count transpositions: matches assigned out of order */
+  i = 0;
+  trans = 0;
+  for (j = 0; j < len1; j++) {
+    if (idx[j]) {
+      i++;
+      if (idx[j] != i)
+        trans++;
+    }
+  }
+  free(idx);
+
+  md = (double)match;
+  return (md/len1 + md/len2 + 1.0 - trans/md/2.0)/3.0;
+}
+
+/**
+ * lev_jaro_winkler_ratio:
+ * @len1: The length of @string1.
+ * @string1: A sequence of bytes of length @len1, may contain NUL characters.
+ * @len2: The length of @string2.
+ * @string2: A sequence of bytes of length @len2, may contain NUL characters.
+ * @pfweight: Prefix weight, i.e., how much weight should be given to a
+ *            common prefix.
+ *
+ * Computes Jaro-Winkler string similarity metric of two strings.
+ *
+ * The formula is J+@pfweight*P*(1-J), where J is Jaro metric and P is the
+ * length of common prefix.
+ *
+ * Returns: The Jaro-Winkler metric of @string1 and @string2, clamped to at
+ *          most 1.0; -1.0 on memory allocation failure.
+ **/
+_LEV_STATIC_PY double
+lev_jaro_winkler_ratio(size_t len1, const lev_byte *string1,
+                       size_t len2, const lev_byte *string2,
+                       double pfweight)
+{
+  double j;
+  size_t p, m;
+
+  j = lev_jaro_ratio(len1, string1, len2, string2);
+  /* lev_jaro_ratio() returns -1.0 on allocation failure; propagate the
+   * error instead of scaling it into a bogus "valid" ratio. */
+  if (j < 0.0)
+    return j;
+  m = len1 < len2 ? len1 : len2;
+  /* length of the common prefix */
+  for (p = 0; p < m; p++) {
+    if (string1[p] != string2[p])
+      break;
+  }
+  j += (1.0 - j)*p*pfweight;
+  return j > 1.0 ? 1.0 : j;
+}
+
+/**
+ * lev_u_jaro_winkler_ratio:
+ * @len1: The length of @string1.
+ * @string1: A sequence of Unicode characters of length @len1, may contain NUL
+ *           characters.
+ * @len2: The length of @string2.
+ * @string2: A sequence of Unicode characters of length @len2, may contain NUL
+ *           characters.
+ * @pfweight: Prefix weight, i.e., how much weight should be given to a
+ *            common prefix.
+ *
+ * Computes Jaro-Winkler string similarity metric of two Unicode strings.
+ *
+ * The formula is J+@pfweight*P*(1-J), where J is Jaro metric and P is the
+ * length of common prefix.
+ *
+ * Returns: The Jaro-Winkler metric of @string1 and @string2, clamped to at
+ *          most 1.0; -1.0 on memory allocation failure.
+ **/
+_LEV_STATIC_PY double
+lev_u_jaro_winkler_ratio(size_t len1, const lev_wchar *string1,
+                         size_t len2, const lev_wchar *string2,
+                         double pfweight)
+{
+  double j;
+  size_t p, m;
+
+  j = lev_u_jaro_ratio(len1, string1, len2, string2);
+  /* lev_u_jaro_ratio() returns -1.0 on allocation failure; propagate the
+   * error instead of scaling it into a bogus "valid" ratio. */
+  if (j < 0.0)
+    return j;
+  m = len1 < len2 ? len1 : len2;
+  /* length of the common prefix */
+  for (p = 0; p < m; p++) {
+    if (string1[p] != string2[p])
+      break;
+  }
+  j += (1.0 - j)*p*pfweight;
+  return j > 1.0 ? 1.0 : j;
+}
+/* }}} */
+
+/****************************************************************************
+ *
+ * Generalized medians, the greedy algorithm, and greedy improvements
+ *
+ ****************************************************************************/
+/* {{{ */
+
+/* compute the sets of symbols each string contains, and the set of symbols
+ * in any of them (symset). meanwhile, count how many different symbols
+ * there are (used below for symlist).
+ *
+ * Returns a newly allocated dense array of the distinct byte values, with
+ * its length stored in *symlistlen.  On allocation failure returns NULL
+ * with *symlistlen == (size_t)-1; if all strings are empty returns NULL
+ * with *symlistlen == 0 (callers distinguish the two cases). */
+static lev_byte*
+make_symlist(size_t n, const size_t *lengths,
+             const lev_byte *strings[], size_t *symlistlen)
+{
+  short int *symset;  /* indexed by ALL symbols, contains 1 for symbols
+                         present in the strings, zero for others */
+  size_t i, j;
+  lev_byte *symlist;
+
+  /* 0x100 entries: one flag per possible byte value */
+  symset = calloc(0x100, sizeof(short int));
+  if (!symset) {
+    *symlistlen = (size_t)(-1);
+    return NULL;
+  }
+  *symlistlen = 0;
+  for (i = 0; i < n; i++) {
+    const lev_byte *stri = strings[i];
+    for (j = 0; j < lengths[i]; j++) {
+      int c = stri[j];
+      if (!symset[c]) {
+        (*symlistlen)++;
+        symset[c] = 1;
+      }
+    }
+  }
+  /* no symbols at all: every input string is empty */
+  if (!*symlistlen) {
+    free(symset);
+    return NULL;
+  }
+
+  /* create dense symbol table, so we can easily iterate over only characters
+   * present in the strings */
+  {
+    size_t pos = 0;
+    symlist = (lev_byte*)malloc((*symlistlen)*sizeof(lev_byte));
+    if (!symlist) {
+      *symlistlen = (size_t)(-1);
+      free(symset);
+      return NULL;
+    }
+    for (j = 0; j < 0x100; j++) {
+      if (symset[j])
+        symlist[pos++] = (lev_byte)j;
+    }
+  }
+  free(symset);
+
+  return symlist;
+}
+
+/**
+ * lev_greedy_median:
+ * @n: The size of @lengths, @strings, and @weights.
+ * @lengths: The lengths of @strings.
+ * @strings: An array of strings, that may contain NUL characters.
+ * @weights: The string weights (they behave exactly as multiplicities, though
+ *           any positive value is allowed, not just integers).
+ * @medlength: Where the length of the median should be stored.
+ *
+ * Finds a generalized median string of @strings using the greedy algorithm.
+ *
+ * Note it's considerably more efficient to give a string with weight 2
+ * than to store two identical strings in @strings (with weights 1).
+ *
+ * Returns: The generalized median, as a newly allocated string; its length
+ *          is stored in @medlength.  Returns NULL on allocation failure.
+ **/
+_LEV_STATIC_PY lev_byte*
+lev_greedy_median(size_t n, const size_t *lengths,
+                  const lev_byte *strings[],
+                  const double *weights,
+                  size_t *medlength)
+{
+  size_t i;  /* usually iterates over strings (n) */
+  size_t j;  /* usually iterates over characters */
+  size_t len;  /* usually iterates over the approximate median string */
+  lev_byte *symlist;  /* list of symbols present in the strings,
+                         we iterate over it instead of set of all
+                         existing symbols */
+  size_t symlistlen;  /* length of symlist */
+  size_t maxlen;  /* maximum input string length */
+  size_t stoplen;  /* maximum tried median string length -- this is slightly
+                      higher than maxlen, because the median string may be
+                      longer than any of the input strings */
+  size_t **rows;  /* Levenshtein matrix rows for each string, we need to keep
+                     only one previous row to construct the current one */
+  size_t *row;  /* a scratch buffer for new Levenshtein matrix row computation,
+                   shared among all strings */
+  lev_byte *median;  /* the resulting approximate median string */
+  double *mediandist;  /* the total distance of the best median string of
+                          given length.  warning!  mediandist[0] is total
+                          distance for empty string, while median[] itself
+                          is normally zero-based */
+  size_t bestlen;  /* the best approximate median string length */
+
+  /* find all symbols */
+  symlist = make_symlist(n, lengths, strings, &symlistlen);
+  if (!symlist) {
+    *medlength = 0;
+    /* symlistlen == (size_t)-1 signals allocation failure; otherwise all
+     * strings were empty and the median is the empty string */
+    if (symlistlen != 0)
+      return NULL;
+    else
+      return calloc(1, sizeof(lev_byte));
+  }
+
+  /* allocate and initialize per-string matrix rows and a common work buffer */
+  rows = (size_t**)malloc(n*sizeof(size_t*));
+  if (!rows) {
+    free(symlist);
+    return NULL;
+  }
+  maxlen = 0;
+  for (i = 0; i < n; i++) {
+    size_t *ri;
+    size_t leni = lengths[i];
+    if (leni > maxlen)
+      maxlen = leni;
+    ri = rows[i] = (size_t*)malloc((leni + 1)*sizeof(size_t));
+    if (!ri) {
+      for (j = 0; j < i; j++)
+        free(rows[j]);
+      free(rows);
+      free(symlist);
+      return NULL;
+    }
+    /* row i starts as the distance row for the empty median: 0, 1, ..., leni */
+    for (j = 0; j <= leni; j++)
+      ri[j] = j;
+  }
+  stoplen = 2*maxlen + 1;
+  row = (size_t*)malloc((stoplen + 1)*sizeof(size_t));
+  if (!row) {
+    for (j = 0; j < n; j++)
+      free(rows[j]);
+    free(rows);
+    free(symlist);
+    return NULL;
+  }
+
+  /* compute final cost of string of length 0 (empty string may be also
+   * a valid answer) */
+  median = (lev_byte*)malloc(stoplen*sizeof(lev_byte));
+  if (!median) {
+    for (j = 0; j < n; j++)
+      free(rows[j]);
+    free(rows);
+    free(row);
+    free(symlist);
+    return NULL;
+  }
+  mediandist = (double*)malloc((stoplen + 1)*sizeof(double));
+  if (!mediandist) {
+    for (j = 0; j < n; j++)
+      free(rows[j]);
+    free(rows);
+    free(row);
+    free(symlist);
+    free(median);
+    return NULL;
+  }
+  /* distance of the empty median: delete every character of every string */
+  mediandist[0] = 0.0;
+  for (i = 0; i < n; i++)
+    mediandist[0] += lengths[i]*weights[i];
+
+  /* build up the approximate median string symbol by symbol
+   * XXX: we actually exit on break below, but on the same condition */
+  for (len = 1; len <= stoplen; len++) {
+    lev_byte symbol;
+    double minminsum = LEV_INFINITY;
+    row[0] = len;
+    /* iterate over all symbols we may want to add */
+    for (j = 0; j < symlistlen; j++) {
+      double totaldist = 0.0;
+      double minsum = 0.0;
+      symbol = symlist[j];
+      /* sum Levenshtein distances from all the strings, with given weights */
+      for (i = 0; i < n; i++) {
+        const lev_byte *stri = strings[i];
+        size_t *p = rows[i];
+        size_t leni = lengths[i];
+        size_t *end = rows[i] + leni;
+        size_t min = len;
+        size_t x = len; /* == row[0] */
+        /* compute how another row of Levenshtein matrix would look for median
+         * string with this symbol added */
+        while (p < end) {
+          size_t D = *(p++) + (symbol != *(stri++));
+          x++;
+          if (x > D)
+            x = D;
+          /* after p++ above, *p is the previous row's cell for the current
+           * column -- in bounds, since each row holds leni + 1 entries */
+          if (x > *p + 1)
+            x = *p + 1;
+          if (x < min)
+            min = x;
+        }
+        minsum += min*weights[i];    /* best distance to any prefix of stri */
+        totaldist += x*weights[i];   /* distance to the whole stri */
+      }
+      /* is this symbol better than all the others? */
+      if (minsum < minminsum) {
+        minminsum = minsum;
+        mediandist[len] = totaldist;
+        median[len - 1] = symbol;
+      }
+    }
+    /* stop the iteration if we no longer need to recompute the matrix rows
+     * or when we are over maxlen and adding more characters doesn't seem
+     * useful */
+    if (len == stoplen
+        || (len > maxlen && mediandist[len] > mediandist[len - 1])) {
+      stoplen = len;
+      break;
+    }
+    /* now the best symbol is known, so recompute all matrix rows for this
+     * one */
+    symbol = median[len - 1];
+    for (i = 0; i < n; i++) {
+      const lev_byte *stri = strings[i];
+      size_t *oldrow = rows[i];
+      size_t leni = lengths[i];
+      size_t k;
+      /* compute a row of Levenshtein matrix */
+      for (k = 1; k <= leni; k++) {
+        size_t c1 = oldrow[k] + 1;
+        size_t c2 = row[k - 1] + 1;
+        size_t c3 = oldrow[k - 1] + (symbol != stri[k - 1]);
+        row[k] = c2 > c3 ? c3 : c2;
+        if (row[k] > c1)
+          row[k] = c1;
+      }
+      memcpy(oldrow, row, (leni + 1)*sizeof(size_t));
+    }
+  }
+
+  /* find the string with minimum total distance */
+  bestlen = 0;
+  for (len = 1; len <= stoplen; len++) {
+    if (mediandist[len] < mediandist[bestlen])
+      bestlen = len;
+  }
+
+  /* clean up */
+  for (i = 0; i < n; i++)
+    free(rows[i]);
+  free(rows);
+  free(row);
+  free(symlist);
+  free(mediandist);
+
+  /* return result */
+  {
+    lev_byte *result = (lev_byte*)malloc(bestlen*sizeof(lev_byte));
+    if (!result) {
+      free(median);
+      return NULL;
+    }
+    memcpy(result, median, bestlen*sizeof(lev_byte));
+    free(median);
+    *medlength = bestlen;
+    return result;
+  }
+}
+
+/*
+ * Knowing the distance matrices up to some row, finish the distance
+ * computations.
+ *
+ * string1, len1 are already shortened.
+ *
+ * Returns the weighted sum of edit distances from string1 to all @n
+ * strings.  @rows holds the per-string matrix rows computed so far (they
+ * are not modified); @row is a scratch buffer of at least max length + 1
+ * entries, shared among all strings.
+ */
+static double
+finish_distance_computations(size_t len1, lev_byte *string1,
+                             size_t n, const size_t *lengths,
+                             const lev_byte **strings,
+                             const double *weights, size_t **rows,
+                             size_t *row)
+{
+  size_t *end;
+  size_t i, j;
+  size_t offset;  /* row[0]; offset + len1 give together real len of string1 */
+  double distsum = 0.0;  /* sum of distances */
+
+  /* catch trivia case */
+  if (len1 == 0) {
+    for (j = 0; j < n; j++)
+      distsum += rows[j][lengths[j]]*weights[j];
+    return distsum;
+  }
+
+  /* iterate through the strings and sum the distances */
+  for (j = 0; j < n; j++) {
+    size_t *rowi = rows[j];  /* current row */
+    size_t leni = lengths[j];  /* current length */
+    size_t len = len1;  /* temporary len1 for suffix stripping */
+    const lev_byte *stringi = strings[j];  /* current string */
+
+    /* strip common suffix (prefix CAN'T be stripped) */
+    while (len && leni && stringi[leni-1] == string1[len-1]) {
+      len--;
+      leni--;
+    }
+
+    /* catch trivial cases */
+    if (len == 0) {
+      distsum += rowi[leni]*weights[j];
+      continue;
+    }
+    offset = rowi[0];
+    if (leni == 0) {
+      /* nothing of stringi left: the rest of string1 must be inserted */
+      distsum += (offset + len)*weights[j];
+      continue;
+    }
+
+    /* complete the matrix */
+    memcpy(row, rowi, (leni + 1)*sizeof(size_t));
+    end = row + leni;
+
+    for (i = 1; i <= len; i++) {
+      size_t *p = row + 1;
+      const lev_byte char1 = string1[i - 1];
+      const lev_byte *char2p = stringi;
+      size_t D, x;
+
+      /* x tracks the cell being computed, D the cell above it;
+       * --D + subst-cost below is the diagonal (replace) candidate */
+      D = x = i + offset;
+      while (p <= end) {
+        size_t c3 = --D + (char1 != *(char2p++));
+        x++;
+        if (x > c3)
+          x = c3;
+        D = *p;
+        D++;
+        if (x > D)
+          x = D;
+        *(p++) = x;
+      }
+    }
+    distsum += weights[j]*(*end);
+  }
+
+  return distsum;
+}
+
+/**
+ * lev_median_improve:
+ * @len: The length of @s.
+ * @s: The approximate generalized median string to be improved.
+ * @n: The size of @lengths, @strings, and @weights.
+ * @lengths: The lengths of @strings.
+ * @strings: An array of strings, that may contain NUL characters.
+ * @weights: The string weights (they behave exactly as multiplicities, though
+ *           any positive value is allowed, not just integers).
+ * @medlength: Where the new length of the median should be stored.
+ *
+ * Tries to make @s a better generalized median string of @strings with
+ * small perturbations (single-symbol replacements, insertions and
+ * deletions, tried at every position).
+ *
+ * It never returns a string with larger SOD than @s; in the worst case, a
+ * string identical to @s is returned.
+ *
+ * Returns: The improved generalized median, as a newly allocated string; its
+ *          length is stored in @medlength.  Returns NULL on allocation
+ *          failure.
+ **/
+_LEV_STATIC_PY lev_byte*
+lev_median_improve(size_t len, const lev_byte *s,
+                   size_t n, const size_t *lengths,
+                   const lev_byte *strings[],
+                   const double *weights,
+                   size_t *medlength)
+{
+  size_t i;  /* usually iterates over strings (n) */
+  size_t j;  /* usually iterates over characters */
+  size_t pos;  /* the position in the approximate median string we are
+                  trying to change */
+  lev_byte *symlist;  /* list of symbols present in the strings,
+                         we iterate over it instead of set of all
+                         existing symbols */
+  size_t symlistlen;  /* length of symlist */
+  size_t maxlen;  /* maximum input string length */
+  size_t stoplen;  /* maximum tried median string length -- this is slightly
+                      higher than maxlen, because the median string may be
+                      longer than any of the input strings */
+  size_t **rows;  /* Levenshtein matrix rows for each string, we need to keep
+                     only one previous row to construct the current one */
+  size_t *row;  /* a scratch buffer for new Levenshtein matrix row computation,
+                   shared among all strings */
+  lev_byte *median;  /* the resulting approximate median string */
+  size_t medlen;  /* the current approximate median string length */
+  double minminsum;  /* the current total distance sum */
+
+  /* find all symbols */
+  symlist = make_symlist(n, lengths, strings, &symlistlen);
+  if (!symlist) {
+    *medlength = 0;
+    /* symlistlen == (size_t)-1 signals allocation failure; otherwise all
+     * strings were empty and the median is the empty string */
+    if (symlistlen != 0)
+      return NULL;
+    else
+      return calloc(1, sizeof(lev_byte));
+  }
+
+  /* allocate and initialize per-string matrix rows and a common work buffer */
+  rows = (size_t**)malloc(n*sizeof(size_t*));
+  if (!rows) {
+    free(symlist);
+    return NULL;
+  }
+  maxlen = 0;
+  for (i = 0; i < n; i++) {
+    size_t *ri;
+    size_t leni = lengths[i];
+    if (leni > maxlen)
+      maxlen = leni;
+    ri = rows[i] = (size_t*)malloc((leni + 1)*sizeof(size_t));
+    if (!ri) {
+      for (j = 0; j < i; j++)
+        free(rows[j]);
+      free(rows);
+      free(symlist);
+      return NULL;
+    }
+    /* row i starts as the distance row for the empty median: 0, 1, ..., leni */
+    for (j = 0; j <= leni; j++)
+      ri[j] = j;
+  }
+  stoplen = 2*maxlen + 1;
+  row = (size_t*)malloc((stoplen + 2)*sizeof(size_t));
+  if (!row) {
+    for (j = 0; j < n; j++)
+      free(rows[j]);
+    free(rows);
+    free(symlist);
+    return NULL;
+  }
+
+  /* initialize median to given string */
+  median = (lev_byte*)malloc((stoplen+1)*sizeof(lev_byte));
+  if (!median) {
+    for (j = 0; j < n; j++)
+      free(rows[j]);
+    free(rows);
+    free(row);
+    free(symlist);
+    return NULL;
+  }
+  median++;  /* we need -1st element for insertions a pos 0 */
+  medlen = len;
+  memcpy(median, s, (medlen)*sizeof(lev_byte));
+  minminsum = finish_distance_computations(medlen, median,
+                                           n, lengths, strings,
+                                           weights, rows, row);
+
+  /* sequentially try perturbations on all positions;
+   * pos only advances when the position is kept (i.e. not on deletion) */
+  for (pos = 0; pos <= medlen; ) {
+    lev_byte orig_symbol, symbol;
+    LevEditType operation;
+    double sum;
+
+    symbol = median[pos];
+    operation = LEV_EDIT_KEEP;
+    /* IF pos < medlength: FOREACH symbol: try to replace the symbol
+     * at pos, if some lower the total distance, choose the best */
+    if (pos < medlen) {
+      orig_symbol = median[pos];
+      for (j = 0; j < symlistlen; j++) {
+        if (symlist[j] == orig_symbol)
+          continue;
+        median[pos] = symlist[j];
+        sum = finish_distance_computations(medlen - pos, median + pos,
+                                           n, lengths, strings,
+                                           weights, rows, row);
+        if (sum < minminsum) {
+          minminsum = sum;
+          symbol = symlist[j];
+          operation = LEV_EDIT_REPLACE;
+        }
+      }
+      median[pos] = orig_symbol;
+    }
+    /* FOREACH symbol: try to add it at pos, if some lower the total
+     * distance, choose the best (increase medlength)
+     * We simulate insertion by replacing the character at pos-1;
+     * at pos == 0 this uses the extra -1st element reserved above */
+    orig_symbol = *(median + pos - 1);
+    for (j = 0; j < symlistlen; j++) {
+      *(median + pos - 1) = symlist[j];
+      sum = finish_distance_computations(medlen - pos + 1, median + pos - 1,
+                                         n, lengths, strings,
+                                         weights, rows, row);
+      if (sum < minminsum) {
+        minminsum = sum;
+        symbol = symlist[j];
+        operation = LEV_EDIT_INSERT;
+      }
+    }
+    *(median + pos - 1) = orig_symbol;
+    /* IF pos < medlength: try to delete the symbol at pos, if it lowers
+     * the total distance remember it (decrease medlength) */
+    if (pos < medlen) {
+      sum = finish_distance_computations(medlen - pos - 1, median + pos + 1,
+                                         n, lengths, strings,
+                                         weights, rows, row);
+      if (sum < minminsum) {
+        minminsum = sum;
+        operation = LEV_EDIT_DELETE;
+      }
+    }
+    /* actually perform the operation */
+    switch (operation) {
+      case LEV_EDIT_REPLACE:
+      median[pos] = symbol;
+      break;
+
+      case LEV_EDIT_INSERT:
+      memmove(median+pos+1, median+pos,
+              (medlen - pos)*sizeof(lev_byte));
+      median[pos] = symbol;
+      medlen++;
+      break;
+
+      case LEV_EDIT_DELETE:
+      memmove(median+pos, median + pos+1,
+              (medlen - pos-1)*sizeof(lev_byte));
+      medlen--;
+      break;
+
+      default:
+      break;
+    }
+    /* the perturbations must never push the median past the scratch buffers */
+    assert(medlen <= stoplen);
+    /* now the result is known, so recompute all matrix rows and move on */
+    if (operation != LEV_EDIT_DELETE) {
+      symbol = median[pos];
+      row[0] = pos + 1;
+      for (i = 0; i < n; i++) {
+        const lev_byte *stri = strings[i];
+        size_t *oldrow = rows[i];
+        size_t leni = lengths[i];
+        size_t k;
+        /* compute a row of Levenshtein matrix */
+        for (k = 1; k <= leni; k++) {
+          size_t c1 = oldrow[k] + 1;
+          size_t c2 = row[k - 1] + 1;
+          size_t c3 = oldrow[k - 1] + (symbol != stri[k - 1]);
+          row[k] = c2 > c3 ? c3 : c2;
+          if (row[k] > c1)
+            row[k] = c1;
+        }
+        memcpy(oldrow, row, (leni + 1)*sizeof(size_t));
+      }
+      pos++;
+    }
+  }
+
+  /* clean up */
+  for (i = 0; i < n; i++)
+    free(rows[i]);
+  free(rows);
+  free(row);
+  free(symlist);
+
+  /* return result */
+  {
+    lev_byte *result = (lev_byte*)malloc(medlen*sizeof(lev_byte));
+    if (!result) {
+      free(median);
+      return NULL;
+    }
+    *medlength = medlen;
+    memcpy(result, median, medlen*sizeof(lev_byte));
+    median--;  /* undo the +1 shift so we free the original pointer */
+    free(median);
+    return result;
+  }
+}
+
+/* used internally in make_usymlist: a node of the symbol hash */
+typedef struct _HItem HItem;
+struct _HItem {
+  lev_wchar c;  /* the symbol stored in this node */
+  HItem *n;  /* next node in the bucket chain; a pointer back to the table
+              * itself serves as the "empty bucket" sentinel (see
+              * make_usymlist), NULL terminates a chain */
+};
+
+/* Free the symbol hash built by make_usymlist.
+ * Walks all 0x100 buckets and releases any chained overflow nodes, then
+ * the table itself.  A bucket whose ->n points back at the table is the
+ * "empty" sentinel and a NULL ->n marks a single embedded entry; neither
+ * owns extra memory.  This is a separate function because we need it in
+ * more than one place. */
+static void
+free_usymlist_hash(HItem *symmap)
+{
+  size_t bucket;
+
+  for (bucket = 0; bucket < 0x100; bucket++) {
+    HItem *node = symmap[bucket].n;
+    /* empty bucket (sentinel) or single embedded entry: nothing chained */
+    if (node == symmap)
+      continue;
+    while (node != NULL) {
+      HItem *next = node->n;
+      free(node);
+      node = next;
+    }
+  }
+  free(symmap);
+}
+
+/* compute the sets of symbols each string contains, and the set of symbols
+ * in any of them (symset).  meanwhile, count how many different symbols
+ * there are (used below for symlist).
+ *
+ * Wide-character variant of make_symlist(): since lev_wchar is too wide for
+ * a flat flag array, a small chained hash with 0x100 buckets is used.
+ * Returns NULL with *symlistlen == 0 when all strings are empty, or NULL
+ * with *symlistlen == (size_t)(-1) on allocation failure. */
+static lev_wchar*
+make_usymlist(size_t n, const size_t *lengths,
+              const lev_wchar *strings[], size_t *symlistlen)
+{
+  lev_wchar *symlist;
+  size_t i, j;
+  HItem *symmap;
+
+  /* total length of all strings; zero means there are no symbols at all */
+  j = 0;
+  for (i = 0; i < n; i++)
+    j += lengths[i];
+
+  *symlistlen = 0;
+  if (j == 0)
+    return NULL;
+
+  /* find all symbols, use a kind of hash for storage */
+  symmap = (HItem*)malloc(0x100*sizeof(HItem));
+  if (!symmap) {
+    *symlistlen = (size_t)(-1);
+    return NULL;
+  }
+  /* this is an ugly memory allocation avoiding hack: most hash elements
+   * will probably contain none or one symbols only so, when p->n is equal
+   * to symmap, it means there're no symbols yet, after inserting the
+   * first one, p->n becomes normally NULL and then it behaves like an
+   * usual singly linked list */
+  for (i = 0; i < 0x100; i++)
+    symmap[i].n = symmap;
+  for (i = 0; i < n; i++) {
+    const lev_wchar *stri = strings[i];
+    for (j = 0; j < lengths[i]; j++) {
+      int c = stri[j];
+      /* fold the wide character into an 8-bit bucket index */
+      int key = (c + (c >> 7)) & 0xff;
+      HItem *p = symmap + key;
+      if (p->n == symmap) {
+        /* empty bucket: store the symbol in the embedded node */
+        p->c = c;
+        p->n = NULL;
+        (*symlistlen)++;
+        continue;
+      }
+      /* walk the chain; append a new node if the symbol is not yet there */
+      while (p->c != c && p->n != NULL)
+        p = p->n;
+      if (p->c != c) {
+        p->n = (HItem*)malloc(sizeof(HItem));
+        if (!p->n) {
+          free_usymlist_hash(symmap);
+          *symlistlen = (size_t)(-1);
+          return NULL;
+        }
+        p = p->n;
+        p->n = NULL;
+        p->c = c;
+        (*symlistlen)++;
+      }
+    }
+  }
+  /* create dense symbol table, so we can easily iterate over only characters
+   * present in the strings */
+  {
+    size_t pos = 0;
+    symlist = (lev_wchar*)malloc((*symlistlen)*sizeof(lev_wchar));
+    if (!symlist) {
+      free_usymlist_hash(symmap);
+      *symlistlen = (size_t)(-1);
+      return NULL;
+    }
+    for (j = 0; j < 0x100; j++) {
+      HItem *p = symmap + j;
+      /* p->n == symmap marks an empty (sentinel) bucket */
+      while (p != NULL && p->n != symmap) {
+        symlist[pos++] = p->c;
+        p = p->n;
+      }
+    }
+  }
+
+  /* free memory */
+  free_usymlist_hash(symmap);
+
+  return symlist;
+}
+
+/**
+ * lev_u_greedy_median:
+ * @n: The size of @lengths, @strings, and @weights.
+ * @lengths: The lengths of @strings.
+ * @strings: An array of strings, that may contain NUL characters.
+ * @weights: The string weights (they behave exactly as multiplicities, though
+ *           any positive value is allowed, not just integers).
+ * @medlength: Where the length of the median should be stored.
+ *
+ * Finds a generalized median string of @strings using the greedy algorithm.
+ * Wide-character variant of lev_greedy_median().
+ *
+ * Note it's considerably more efficient to give a string with weight 2
+ * than to store two identical strings in @strings (with weights 1).
+ *
+ * Returns: The generalized median, as a newly allocated string; its length
+ *          is stored in @medlength.  Returns NULL on allocation failure.
+ **/
+_LEV_STATIC_PY lev_wchar*
+lev_u_greedy_median(size_t n, const size_t *lengths,
+                    const lev_wchar *strings[],
+                    const double *weights,
+                    size_t *medlength)
+{
+  size_t i;  /* usually iterates over strings (n) */
+  size_t j;  /* usually iterates over characters */
+  size_t len;  /* usually iterates over the approximate median string */
+  lev_wchar *symlist;  /* list of symbols present in the strings,
+                          we iterate over it instead of set of all
+                          existing symbols */
+  size_t symlistlen;  /* length of symlist */
+  size_t maxlen;  /* maximum input string length */
+  size_t stoplen;  /* maximum tried median string length -- this is slightly
+                      higher than maxlen, because the median string may be
+                      longer than any of the input strings */
+  size_t **rows;  /* Levenshtein matrix rows for each string, we need to keep
+                     only one previous row to construct the current one */
+  size_t *row;  /* a scratch buffer for new Levenshtein matrix row computation,
+                   shared among all strings */
+  lev_wchar *median;  /* the resulting approximate median string */
+  double *mediandist;  /* the total distance of the best median string of
+                          given length.  warning!  mediandist[0] is total
+                          distance for empty string, while median[] itself
+                          is normally zero-based */
+  size_t bestlen;  /* the best approximate median string length */
+
+  /* find all symbols */
+  symlist = make_usymlist(n, lengths, strings, &symlistlen);
+  if (!symlist) {
+    *medlength = 0;
+    /* symlistlen == (size_t)-1 signals allocation failure; otherwise all
+     * strings were empty and the median is the empty string */
+    if (symlistlen != 0)
+      return NULL;
+    else
+      return calloc(1, sizeof(lev_wchar));
+  }
+
+  /* allocate and initialize per-string matrix rows and a common work buffer */
+  rows = (size_t**)malloc(n*sizeof(size_t*));
+  if (!rows) {
+    free(symlist);
+    return NULL;
+  }
+  maxlen = 0;
+  for (i = 0; i < n; i++) {
+    size_t *ri;
+    size_t leni = lengths[i];
+    if (leni > maxlen)
+      maxlen = leni;
+    ri = rows[i] = (size_t*)malloc((leni + 1)*sizeof(size_t));
+    if (!ri) {
+      for (j = 0; j < i; j++)
+        free(rows[j]);
+      free(rows);
+      free(symlist);
+      return NULL;
+    }
+    /* row i starts as the distance row for the empty median: 0, 1, ..., leni */
+    for (j = 0; j <= leni; j++)
+      ri[j] = j;
+  }
+  stoplen = 2*maxlen + 1;
+  row = (size_t*)malloc((stoplen + 1)*sizeof(size_t));
+  if (!row) {
+    for (j = 0; j < n; j++)
+      free(rows[j]);
+    free(rows);
+    free(symlist);
+    return NULL;
+  }
+
+  /* compute final cost of string of length 0 (empty string may be also
+   * a valid answer) */
+  median = (lev_wchar*)malloc(stoplen*sizeof(lev_wchar));
+  if (!median) {
+    for (j = 0; j < n; j++)
+      free(rows[j]);
+    free(rows);
+    free(row);
+    free(symlist);
+    return NULL;
+  }
+  mediandist = (double*)malloc((stoplen + 1)*sizeof(double));
+  if (!mediandist) {
+    for (j = 0; j < n; j++)
+      free(rows[j]);
+    free(rows);
+    free(row);
+    free(symlist);
+    free(median);
+    return NULL;
+  }
+  /* distance of the empty median: delete every character of every string */
+  mediandist[0] = 0.0;
+  for (i = 0; i < n; i++)
+    mediandist[0] += lengths[i]*weights[i];
+
+  /* build up the approximate median string symbol by symbol
+   * XXX: we actually exit on break below, but on the same condition */
+  for (len = 1; len <= stoplen; len++) {
+    lev_wchar symbol;
+    double minminsum = LEV_INFINITY;
+    row[0] = len;
+    /* iterate over all symbols we may want to add */
+    for (j = 0; j < symlistlen; j++) {
+      double totaldist = 0.0;
+      double minsum = 0.0;
+      symbol = symlist[j];
+      /* sum Levenshtein distances from all the strings, with given weights */
+      for (i = 0; i < n; i++) {
+        const lev_wchar *stri = strings[i];
+        size_t *p = rows[i];
+        size_t leni = lengths[i];
+        size_t *end = rows[i] + leni;
+        size_t min = len;
+        size_t x = len; /* == row[0] */
+        /* compute how another row of Levenshtein matrix would look for median
+         * string with this symbol added */
+        while (p < end) {
+          size_t D = *(p++) + (symbol != *(stri++));
+          x++;
+          if (x > D)
+            x = D;
+          /* after p++ above, *p is the previous row's cell for the current
+           * column -- in bounds, since each row holds leni + 1 entries */
+          if (x > *p + 1)
+            x = *p + 1;
+          if (x < min)
+            min = x;
+        }
+        minsum += min*weights[i];    /* best distance to any prefix of stri */
+        totaldist += x*weights[i];   /* distance to the whole stri */
+      }
+      /* is this symbol better than all the others? */
+      if (minsum < minminsum) {
+        minminsum = minsum;
+        mediandist[len] = totaldist;
+        median[len - 1] = symbol;
+      }
+    }
+    /* stop the iteration if we no longer need to recompute the matrix rows
+     * or when we are over maxlen and adding more characters doesn't seem
+     * useful */
+    if (len == stoplen
+        || (len > maxlen && mediandist[len] > mediandist[len - 1])) {
+      stoplen = len;
+      break;
+    }
+    /* now the best symbol is known, so recompute all matrix rows for this
+     * one */
+    symbol = median[len - 1];
+    for (i = 0; i < n; i++) {
+      const lev_wchar *stri = strings[i];
+      size_t *oldrow = rows[i];
+      size_t leni = lengths[i];
+      size_t k;
+      /* compute a row of Levenshtein matrix */
+      for (k = 1; k <= leni; k++) {
+        size_t c1 = oldrow[k] + 1;
+        size_t c2 = row[k - 1] + 1;
+        size_t c3 = oldrow[k - 1] + (symbol != stri[k - 1]);
+        row[k] = c2 > c3 ? c3 : c2;
+        if (row[k] > c1)
+          row[k] = c1;
+      }
+      memcpy(oldrow, row, (leni + 1)*sizeof(size_t));
+    }
+  }
+
+  /* find the string with minimum total distance */
+  bestlen = 0;
+  for (len = 1; len <= stoplen; len++) {
+    if (mediandist[len] < mediandist[bestlen])
+      bestlen = len;
+  }
+
+  /* clean up */
+  for (i = 0; i < n; i++)
+    free(rows[i]);
+  free(rows);
+  free(row);
+  free(symlist);
+  free(mediandist);
+
+  /* return result */
+  {
+    lev_wchar *result = (lev_wchar*)malloc(bestlen*sizeof(lev_wchar));
+    if (!result) {
+      free(median);
+      return NULL;
+    }
+    memcpy(result, median, bestlen*sizeof(lev_wchar));
+    free(median);
+    *medlength = bestlen;
+    return result;
+  }
+}
+
+/*
+ * Knowing the distance matrices up to some row, finish the distance
+ * computations.  Wide-character variant of finish_distance_computations().
+ *
+ * string1, len1 are already shortened.
+ *
+ * Returns the weighted sum of edit distances from string1 to all @n
+ * strings.  @rows holds the per-string matrix rows computed so far (they
+ * are not modified); @row is a shared scratch buffer.
+ */
+static double
+finish_udistance_computations(size_t len1, lev_wchar *string1,
+                              size_t n, const size_t *lengths,
+                              const lev_wchar **strings,
+                              const double *weights, size_t **rows,
+                              size_t *row)
+{
+  size_t *end;
+  size_t i, j;
+  size_t offset;  /* row[0]; offset + len1 give together real len of string1 */
+  double distsum = 0.0;  /* sum of distances */
+
+  /* catch trivia case */
+  if (len1 == 0) {
+    for (j = 0; j < n; j++)
+      distsum += rows[j][lengths[j]]*weights[j];
+    return distsum;
+  }
+
+  /* iterate through the strings and sum the distances */
+  for (j = 0; j < n; j++) {
+    size_t *rowi = rows[j];  /* current row */
+    size_t leni = lengths[j];  /* current length */
+    size_t len = len1;  /* temporary len1 for suffix stripping */
+    const lev_wchar *stringi = strings[j];  /* current string */
+
+    /* strip common suffix (prefix CAN'T be stripped) */
+    while (len && leni && stringi[leni-1] == string1[len-1]) {
+      len--;
+      leni--;
+    }
+
+    /* catch trivial cases */
+    if (len == 0) {
+      distsum += rowi[leni]*weights[j];
+      continue;
+    }
+    offset = rowi[0];
+    if (leni == 0) {
+      /* nothing of stringi left: the rest of string1 must be inserted */
+      distsum += (offset + len)*weights[j];
+      continue;
+    }
+
+    /* complete the matrix */
+    memcpy(row, rowi, (leni + 1)*sizeof(size_t));
+    end = row + leni;
+
+    for (i = 1; i <= len; i++) {
+      size_t *p = row + 1;
+      const lev_wchar char1 = string1[i - 1];
+      const lev_wchar *char2p = stringi;
+      size_t D, x;
+
+      /* x tracks the cell being computed, D the cell above it;
+       * --D + subst-cost below is the diagonal (replace) candidate */
+      D = x = i + offset;
+      while (p <= end) {
+        size_t c3 = --D + (char1 != *(char2p++));
+        x++;
+        if (x > c3)
+          x = c3;
+        D = *p;
+        D++;
+        if (x > D)
+          x = D;
+        *(p++) = x;
+      }
+    }
+    distsum += weights[j]*(*end);
+  }
+
+  return distsum;
+}
+
+/**
+ * lev_u_median_improve:
+ * @len: The length of @s.
+ * @s: The approximate generalized median string to be improved.
+ * @n: The size of @lengths, @strings, and @weights.
+ * @lengths: The lengths of @strings.
+ * @strings: An array of strings, that may contain NUL characters.
+ * @weights: The string weights (they behave exactly as multiplicities, though
+ *           any positive value is allowed, not just integers).
+ * @medlength: Where the new length of the median should be stored.
+ *
+ * Tries to make @s a better generalized median string of @strings with
+ * small perturbations.  Wide-character variant of lev_median_improve().
+ *
+ * It never returns a string with larger SOD than @s; in the worst case, a
+ * string identical to @s is returned.
+ *
+ * Returns: The improved generalized median, as a newly allocated string; its
+ *          length is stored in @medlength.  Returns NULL on allocation
+ *          failure.
+ **/
+_LEV_STATIC_PY lev_wchar*
+lev_u_median_improve(size_t len, const lev_wchar *s,
+                     size_t n, const size_t *lengths,
+                     const lev_wchar *strings[],
+                     const double *weights,
+                     size_t *medlength)
+{
+  size_t i;  /* usually iterates over strings (n) */
+  size_t j;  /* usually iterates over characters */
+  size_t pos;  /* the position in the approximate median string we are
+                  trying to change */
+  lev_wchar *symlist;  /* list of symbols present in the strings,
+                          we iterate over it instead of set of all
+                          existing symbols */
+  size_t symlistlen;  /* length of symlist */
+  size_t maxlen;  /* maximum input string length */
+  size_t stoplen;  /* maximum tried median string length -- this is slightly
+                      higher than maxlen, because the median string may be
+                      longer than any of the input strings */
+  size_t **rows;  /* Levenshtein matrix rows for each string, we need to keep
+                     only one previous row to construct the current one */
+  size_t *row;  /* a scratch buffer for new Levenshtein matrix row computation,
+                   shared among all strings */
+  lev_wchar *median;  /* the resulting approximate median string */
+  size_t medlen;  /* the current approximate median string length */
+  double minminsum;  /* the current total distance sum */
+
+  /* find all symbols */
+  symlist = make_usymlist(n, lengths, strings, &symlistlen);
+  if (!symlist) {
+    *medlength = 0;
+    /* symlistlen == (size_t)-1 signals allocation failure; otherwise all
+     * strings were empty and the median is the empty string */
+    if (symlistlen != 0)
+      return NULL;
+    else
+      return calloc(1, sizeof(lev_wchar));
+  }
+
+  /* allocate and initialize per-string matrix rows and a common work buffer */
+  rows = (size_t**)malloc(n*sizeof(size_t*));
+  if (!rows) {
+    free(symlist);
+    return NULL;
+  }
+  maxlen = 0;
+  for (i = 0; i < n; i++) {
+    size_t *ri;
+    size_t leni = lengths[i];
+    if (leni > maxlen)
+      maxlen = leni;
+    ri = rows[i] = (size_t*)malloc((leni + 1)*sizeof(size_t));
+    if (!ri) {
+      for (j = 0; j < i; j++)
+        free(rows[j]);
+      free(rows);
+      free(symlist);
+      return NULL;
+    }
+    /* row i starts as the distance row for the empty median: 0, 1, ..., leni */
+    for (j = 0; j <= leni; j++)
+      ri[j] = j;
+  }
+  stoplen = 2*maxlen + 1;
+  row = (size_t*)malloc((stoplen + 2)*sizeof(size_t));
+  if (!row) {
+    for (j = 0; j < n; j++)
+      free(rows[j]);
+    free(rows);
+    free(symlist);
+    return NULL;
+  }
+
+  /* initialize median to given string */
+  median = (lev_wchar*)malloc((stoplen+1)*sizeof(lev_wchar));
+  if (!median) {
+    for (j = 0; j < n; j++)
+      free(rows[j]);
+    free(rows);
+    free(row);
+    free(symlist);
+    return NULL;
+  }
+  median++;  /* we need -1st element for insertions a pos 0 */
+  medlen = len;
+  memcpy(median, s, (medlen)*sizeof(lev_wchar));
+  minminsum = finish_udistance_computations(medlen, median,
+                                            n, lengths, strings,
+                                            weights, rows, row);
+
+  /* sequentially try perturbations on all positions;
+   * pos only advances when the position is kept (i.e. not on deletion) */
+  for (pos = 0; pos <= medlen; ) {
+    lev_wchar orig_symbol, symbol;
+    LevEditType operation;
+    double sum;
+
+    symbol = median[pos];
+    operation = LEV_EDIT_KEEP;
+    /* IF pos < medlength: FOREACH symbol: try to replace the symbol
+     * at pos, if some lower the total distance, choose the best */
+    if (pos < medlen) {
+      orig_symbol = median[pos];
+      for (j = 0; j < symlistlen; j++) {
+        if (symlist[j] == orig_symbol)
+          continue;
+        median[pos] = symlist[j];
+        sum = finish_udistance_computations(medlen - pos, median + pos,
+                                            n, lengths, strings,
+                                            weights, rows, row);
+        if (sum < minminsum) {
+          minminsum = sum;
+          symbol = symlist[j];
+          operation = LEV_EDIT_REPLACE;
+        }
+      }
+      median[pos] = orig_symbol;
+    }
+    /* FOREACH symbol: try to add it at pos, if some lower the total
+     * distance, choose the best (increase medlength)
+     * We simulate insertion by replacing the character at pos-1;
+     * at pos == 0 this uses the extra -1st element reserved above */
+    orig_symbol = *(median + pos - 1);
+    for (j = 0; j < symlistlen; j++) {
+      *(median + pos - 1) = symlist[j];
+      sum = finish_udistance_computations(medlen - pos + 1, median + pos - 1,
+                                          n, lengths, strings,
+                                          weights, rows, row);
+      if (sum < minminsum) {
+        minminsum = sum;
+        symbol = symlist[j];
+        operation = LEV_EDIT_INSERT;
+      }
+    }
+    *(median + pos - 1) = orig_symbol;
+    /* IF pos < medlength: try to delete the symbol at pos, if it lowers
+     * the total distance remember it (decrease medlength) */
+    if (pos < medlen) {
+      sum = finish_udistance_computations(medlen - pos - 1, median + pos + 1,
+                                          n, lengths, strings,
+                                          weights, rows, row);
+      if (sum < minminsum) {
+        minminsum = sum;
+        operation = LEV_EDIT_DELETE;
+      }
+    }
+    /* actually perform the operation */
+    switch (operation) {
+      case LEV_EDIT_REPLACE:
+      median[pos] = symbol;
+      break;
+
+      case LEV_EDIT_INSERT:
+      memmove(median+pos+1, median+pos,
+              (medlen - pos)*sizeof(lev_wchar));
+      median[pos] = symbol;
+      medlen++;
+      break;
+
+      case LEV_EDIT_DELETE:
+      memmove(median+pos, median + pos+1,
+              (medlen - pos-1)*sizeof(lev_wchar));
+      medlen--;
+      break;
+
+      default:
+      break;
+    }
+    /* the perturbations must never push the median past the scratch buffers */
+    assert(medlen <= stoplen);
+    /* now the result is known, so recompute all matrix rows and move on */
+    if (operation != LEV_EDIT_DELETE) {
+      symbol = median[pos];
+      row[0] = pos + 1;
+      for (i = 0; i < n; i++) {
+        const lev_wchar *stri = strings[i];
+        size_t *oldrow = rows[i];
+        size_t leni = lengths[i];
+        size_t k;
+        /* compute a row of Levenshtein matrix */
+        for (k = 1; k <= leni; k++) {
+          size_t c1 = oldrow[k] + 1;
+          size_t c2 = row[k - 1] + 1;
+          size_t c3 = oldrow[k - 1] + (symbol != stri[k - 1]);
+          row[k] = c2 > c3 ? c3 : c2;
+          if (row[k] > c1)
+            row[k] = c1;
+        }
+        memcpy(oldrow, row, (leni + 1)*sizeof(size_t));
+      }
+      pos++;
+    }
+  }
+
+  /* clean up */
+  for (i = 0; i < n; i++)
+    free(rows[i]);
+  free(rows);
+  free(row);
+  free(symlist);
+
+  /* return result */
+  {
+    lev_wchar *result = (lev_wchar*)malloc(medlen*sizeof(lev_wchar));
+    if (!result) {
+      free(median);
+      return NULL;
+    }
+    *medlength = medlen;
+    memcpy(result, median, medlen*sizeof(lev_wchar));
+    median--;  /* undo the +1 shift so we free the original pointer */
+    free(median);
+    return result;
+  }
+}
+/* }}} */
+
+/****************************************************************************
+ *
+ * Quick (voting) medians
+ *
+ ****************************************************************************/
+/* {{{ */
+
+/* Like make_symlist(), but records symbol presence in the caller-supplied
+ * @symset array of 0x100 doubles instead of a private flag array -- the
+ * caller reuses the same array afterwards (for voting), which avoids an
+ * extra allocation/deallocation.  On success returns a newly allocated
+ * dense list of the *symlistlen distinct symbols; returns NULL with
+ * *symlistlen == 0 when no symbols occur, or NULL with
+ * *symlistlen == (size_t)(-1) on error. */
+static lev_byte*
+make_symlistset(size_t n, const size_t *lengths,
+                const lev_byte *strings[], size_t *symlistlen,
+                double *symset)
+{
+  lev_byte *symlist;
+  size_t i, j, pos;
+
+  if (!symset) {
+    *symlistlen = (size_t)(-1);
+    return NULL;
+  }
+  memset(symset, 0, 0x100*sizeof(double));  /* XXX: needs IEEE doubles?! */
+
+  /* flag every occurring symbol, counting distinct ones */
+  *symlistlen = 0;
+  for (i = 0; i < n; i++) {
+    const lev_byte *stri = strings[i];
+    for (j = 0; j < lengths[i]; j++) {
+      int c = stri[j];
+      if (!symset[c]) {
+        (*symlistlen)++;
+        symset[c] = 1.0;
+      }
+    }
+  }
+  if (*symlistlen == 0)
+    return NULL;
+
+  /* compact the flags into a dense symbol table, so callers can iterate
+   * over only the characters actually present in the strings */
+  symlist = (lev_byte*)malloc((*symlistlen)*sizeof(lev_byte));
+  if (!symlist) {
+    *symlistlen = (size_t)(-1);
+    return NULL;
+  }
+  pos = 0;
+  for (j = 0; j < 0x100; j++) {
+    if (symset[j])
+      symlist[pos++] = (lev_byte)j;
+  }
+
+  return symlist;
+}
+
+/**
+ * lev_quick_median:
+ * @n: The size of @lengths, @strings, and @weights.
+ * @lengths: The lengths of @strings.
+ * @strings: An array of strings, that may contain NUL characters.
+ * @weights: The string weights (they behave exactly as multiplicities,
+ *           though any positive value is allowed, not just integers).
+ * @medlength: Where the length of the median should be stored.
+ *
+ * Computes a quick (voting) median of @strings: the result length is the
+ * weighted average of the input lengths, and each position is filled with
+ * the symbol gathering the largest weighted, length-proportional vote.
+ *
+ * Returns: The quick median as a newly allocated string, its length stored
+ *          in @medlength; %NULL in the case of failure.
+ **/
+_LEV_STATIC_PY lev_byte*
+lev_quick_median(size_t n,
+                 const size_t *lengths,
+                 const lev_byte *strings[],
+                 const double *weights,
+                 size_t *medlength)
+{
+  size_t symlistlen, len, i, j, k;
+  lev_byte *symlist;
+  lev_byte *median;  /* the resulting string */
+  double *symset;
+  double ml, wl;
+
+  /* first check whether the result would be an empty string
+   * and compute resulting string length */
+  ml = wl = 0.0;
+  for (i = 0; i < n; i++) {
+    ml += lengths[i]*weights[i];
+    wl += weights[i];
+  }
+  if (wl == 0.0)
+    return (lev_byte*)calloc(1, sizeof(lev_byte));
+  ml = floor(ml/wl + 0.499999);
+  *medlength = len = ml;
+  if (!len)
+    return (lev_byte*)calloc(1, sizeof(lev_byte));
+  median = (lev_byte*)malloc(len*sizeof(lev_byte));
+  if (!median)
+    return NULL;
+
+  /* find the symbol set;
+   * now an empty symbol set is really a failure */
+  symset = (double*)calloc(0x100, sizeof(double));
+  if (!symset) {
+    free(median);
+    return NULL;
+  }
+  symlist = make_symlistset(n, lengths, strings, &symlistlen, symset);
+  if (!symlist) {
+    free(median);
+    free(symset);
+    return NULL;
+  }
+
+  for (j = 0; j < len; j++) {
+    /* clear the symbol probabilities; for small alphabets clearing only
+     * the used entries is cheaper than wiping the whole table */
+    if (symlistlen < 32) {
+      for (i = 0; i < symlistlen; i++)
+        symset[symlist[i]] = 0.0;
+    }
+    else
+      memset(symset, 0, 0x100*sizeof(double));
+
+    /* let all strings vote */
+    for (i = 0; i < n; i++) {
+      const lev_byte *stri = strings[i];
+      double weighti = weights[i];
+      size_t lengthi = lengths[i];
+      double start = lengthi/ml*j;
+      double end = start + lengthi/ml;
+      size_t istart = floor(start);
+      size_t iend = ceil(end);
+
+      /* an empty string has no symbols to vote with; without this guard
+       * istart == iend == 0 and the stri[istart] and stri[iend-1]
+       * accesses below would read out of bounds */
+      if (lengthi == 0)
+        continue;
+
+      /* rounding errors can overflow the buffer */
+      if (iend > lengthi)
+        iend = lengthi;
+
+      /* the inner part, including the complete last character */
+      for (k = istart+1; k < iend; k++)
+        symset[stri[k]] += weighti;
+      /* the initial fraction */
+      symset[stri[istart]] += weighti*(1+istart - start);
+      /* subtract the overcounted tail of the last character */
+      symset[stri[iend-1]] -= weighti*(iend - end);
+    }
+
+    /* find the elected symbol */
+    k = symlist[0];
+    for (i = 1; i < symlistlen; i++) {
+      if (symset[symlist[i]] > symset[k])
+        k = symlist[i];
+    }
+    median[j] = k;
+  }
+
+  free(symset);
+  free(symlist);
+
+  return median;
+}
+
+/* used internally in make_usymlistset: one node of a small chained hash of
+ * symbols; the table itself is an array of 0x100 HQItems (the symmap) */
+typedef struct _HQItem HQItem;
+struct _HQItem {
+ lev_wchar c; /* the symbol stored in this node */
+ double s; /* accumulated vote weight for the symbol */
+ HQItem *n; /* next node in the bucket chain; == symmap marks an empty bucket, NULL the end of a chain */
+};
+
+/* free the symbol hash built by make_usymlistset
+ * (a separate function because both success and error paths need it) */
+static void
+free_usymlistset_hash(HQItem *symmap)
+{
+  size_t bucket;
+
+  for (bucket = 0; bucket < 0x100; bucket++) {
+    /* the head node lives inside the array itself, only chained nodes
+     * were malloc()ed; n == symmap marks a never-used bucket */
+    HQItem *node = symmap[bucket].n;
+    if (node == symmap || node == NULL)
+      continue;
+    while (node != NULL) {
+      HQItem *next = node->n;
+      free(node);
+      node = next;
+    }
+  }
+  free(symmap);
+}
+
+/* compute the set of symbols the strings contain, stored as a small chained
+ * hash in @symmap (an array of 0x100 HQItems provided by the caller so it
+ * can be reused and freed there too). meanwhile, count how many different
+ * symbols there are (*symlistlen). returns them as a newly allocated dense
+ * array, or NULL when the strings are all empty (*symlistlen == 0) or on
+ * allocation failure (*symlistlen == (size_t)-1); the caller then cleans
+ * the hash up with free_usymlistset_hash() */
+static lev_wchar*
+make_usymlistset(size_t n, const size_t *lengths,
+ const lev_wchar *strings[], size_t *symlistlen,
+ HQItem *symmap)
+{
+ lev_wchar *symlist;
+ size_t i, j;
+
+ j = 0;
+ for (i = 0; i < n; i++)
+ j += lengths[i];
+
+ *symlistlen = 0;
+ if (j == 0)
+ return NULL;
+
+ /* this is an ugly memory allocation avoiding hack: most hash elements
+ * will probably contain none or one symbols only so, when p->n is equal
+ * to symmap, it means there are no symbols yet; after inserting the
+ * first one, p->n becomes normally NULL and then it behaves like an
+ * usual singly linked list */
+ for (i = 0; i < 0x100; i++)
+ symmap[i].n = symmap;
+ for (i = 0; i < n; i++) {
+ const lev_wchar *stri = strings[i];
+ for (j = 0; j < lengths[i]; j++) {
+ int c = stri[j];
+ int key = (c + (c >> 7)) & 0xff;
+ HQItem *p = symmap + key;
+ /* empty bucket: store the first symbol inline in the array */
+ if (p->n == symmap) {
+ p->c = c;
+ p->n = NULL;
+ (*symlistlen)++;
+ continue;
+ }
+ /* walk the chain; stop at a matching symbol or at the tail */
+ while (p->c != c && p->n != NULL)
+ p = p->n;
+ /* a new symbol: append a freshly allocated node */
+ if (p->c != c) {
+ p->n = (HQItem*)malloc(sizeof(HQItem));
+ if (!p->n) {
+ /* nodes allocated so far are released by the caller's
+ * free_usymlistset_hash() call */
+ *symlistlen = (size_t)(-1);
+ return NULL;
+ }
+ p = p->n;
+ p->n = NULL;
+ p->c = c;
+ (*symlistlen)++;
+ }
+ }
+ }
+ /* create dense symbol table, so we can easily iterate over only characters
+ * present in the strings */
+ {
+ size_t pos = 0;
+ symlist = (lev_wchar*)malloc((*symlistlen)*sizeof(lev_wchar));
+ if (!symlist) {
+ *symlistlen = (size_t)(-1);
+ return NULL;
+ }
+ for (j = 0; j < 0x100; j++) {
+ HQItem *p = symmap + j;
+ /* n == symmap still marks a never-used bucket */
+ while (p != NULL && p->n != symmap) {
+ symlist[pos++] = p->c;
+ p = p->n;
+ }
+ }
+ }
+
+ return symlist;
+}
+
+/**
+ * lev_u_quick_median:
+ * @n: The size of @lengths, @strings, and @weights.
+ * @lengths: The lengths of @strings.
+ * @strings: An array of strings, that may contain NUL characters.
+ * @weights: The string weights (they behave exactly as multiplicities,
+ *           though any positive value is allowed, not just integers).
+ * @medlength: Where the length of the median should be stored.
+ *
+ * Wide-character counterpart of lev_quick_median(); votes are accumulated
+ * in a chained hash (HQItem) instead of a flat 256-entry table.
+ *
+ * Returns: The quick median as a newly allocated string, its length stored
+ *          in @medlength; %NULL in the case of failure.
+ **/
+_LEV_STATIC_PY lev_wchar*
+lev_u_quick_median(size_t n,
+                   const size_t *lengths,
+                   const lev_wchar *strings[],
+                   const double *weights,
+                   size_t *medlength)
+{
+  size_t symlistlen, len, i, j, k;
+  lev_wchar *symlist;
+  lev_wchar *median;  /* the resulting string */
+  HQItem *symmap;
+  double ml, wl;
+
+  /* first check whether the result would be an empty string
+   * and compute resulting string length */
+  ml = wl = 0.0;
+  for (i = 0; i < n; i++) {
+    ml += lengths[i]*weights[i];
+    wl += weights[i];
+  }
+  if (wl == 0.0)
+    return (lev_wchar*)calloc(1, sizeof(lev_wchar));
+  ml = floor(ml/wl + 0.499999);
+  *medlength = len = ml;
+  if (!len)
+    return (lev_wchar*)calloc(1, sizeof(lev_wchar));
+  median = (lev_wchar*)malloc(len*sizeof(lev_wchar));
+  if (!median)
+    return NULL;
+
+  /* find the symbol set;
+   * now an empty symbol set is really a failure */
+  symmap = (HQItem*)malloc(0x100*sizeof(HQItem));
+  if (!symmap) {
+    free(median);
+    return NULL;
+  }
+  symlist = make_usymlistset(n, lengths, strings, &symlistlen, symmap);
+  if (!symlist) {
+    free(median);
+    free_usymlistset_hash(symmap);
+    return NULL;
+  }
+
+  for (j = 0; j < len; j++) {
+    /* clear the symbol probabilities */
+    for (i = 0; i < 0x100; i++) {
+      HQItem *p = symmap + i;
+      if (p->n == symmap)
+        continue;
+      while (p) {
+        p->s = 0.0;
+        p = p->n;
+      }
+    }
+
+    /* let all strings vote */
+    for (i = 0; i < n; i++) {
+      const lev_wchar *stri = strings[i];
+      double weighti = weights[i];
+      size_t lengthi = lengths[i];
+      double start = lengthi/ml*j;
+      double end = start + lengthi/ml;
+      size_t istart = floor(start);
+      size_t iend = ceil(end);
+
+      /* an empty string has no symbols to vote with; without this guard
+       * stri[istart]/stri[iend-1] below read out of bounds and the hash
+       * walk would chase NULL for the absent symbol */
+      if (lengthi == 0)
+        continue;
+
+      /* rounding errors can overflow the buffer */
+      if (iend > lengthi)
+        iend = lengthi;
+
+      /* the inner part, including the complete last character */
+      for (k = istart+1; k < iend; k++) {
+        int c = stri[k];
+        int key = (c + (c >> 7)) & 0xff;
+        HQItem *p = symmap + key;
+        while (p->c != c)
+          p = p->n;
+        p->s += weighti;
+      }
+      /* the initial fraction */
+      {
+        int c = stri[istart];
+        int key = (c + (c >> 7)) & 0xff;
+        HQItem *p = symmap + key;
+        while (p->c != c)
+          p = p->n;
+        p->s += weighti*(1+istart - start);
+      }
+      /* subtract what we counted from the last character but doesn't
+       * actually belong here.
+       * this strategy works also when istart+1 == iend (i.e., everything
+       * happens inside a one character) */
+      {
+        int c = stri[iend-1];
+        int key = (c + (c >> 7)) & 0xff;
+        HQItem *p = symmap + key;
+        while (p->c != c)
+          p = p->n;
+        p->s -= weighti*(iend - end);
+      }
+    }
+
+    /* find the elected symbol; make_usymlistset() succeeded, so at least
+     * one bucket is nonempty and max cannot stay NULL */
+    {
+      HQItem *max = NULL;
+
+      for (i = 0; i < 0x100; i++) {
+        HQItem *p = symmap + i;
+        if (p->n == symmap)
+          continue;
+        while (p) {
+          if (!max || p->s > max->s)
+            max = p;
+          p = p->n;
+        }
+      }
+      median[j] = max->c;
+    }
+  }
+
+  free_usymlistset_hash(symmap);
+  free(symlist);
+
+  return median;
+}
+/* }}} */
+
+/****************************************************************************
+ *
+ * Set medians
+ *
+ ****************************************************************************/
+/* {{{ */
+
+/**
+ * lev_set_median_index:
+ * @n: The size of @lengths, @strings, and @weights.
+ * @lengths: The lengths of @strings.
+ * @strings: An array of strings, that may contain NUL characters.
+ * @weights: The string weights (they behave exactly as multiplicities, though
+ * any positive value is allowed, not just integers).
+ *
+ * Finds the median string of a string set @strings, i.e. the member
+ * minimizing the sum of weighted edit distances to all other members.
+ *
+ * Returns: An index in @strings pointing to the set median, (size_t)-1 in
+ * case of failure.
+ **/
+_LEV_STATIC_PY size_t
+lev_set_median_index(size_t n, const size_t *lengths,
+ const lev_byte *strings[],
+ const double *weights)
+{
+ size_t minidx = 0;
+ double mindist = LEV_INFINITY;
+ size_t i;
+ long int *distances; /* lower-triangle cache of pairwise distances; -1 = not computed yet */
+
+ distances = (long int*)malloc((n*(n - 1)/2)*sizeof(long int));
+ if (!distances)
+ return (size_t)-1;
+
+ /* fill the cache with -1, i.e. all bits set (assumes two's complement) */
+ memset(distances, 0xff, (n*(n - 1)/2)*sizeof(long int)); /* XXX */
+ for (i = 0; i < n; i++) {
+ size_t j = 0;
+ double dist = 0.0;
+ const lev_byte *stri = strings[i];
+ size_t leni = lengths[i];
+ /* below diagonal: distances to already-processed strings may be cached.
+ * both loops bail out as soon as dist can no longer beat mindist, which
+ * is why some cache entries can legitimately stay unset (-1) */
+ while (j < i && dist < mindist) {
+ size_t dindex = (i - 1)*(i - 2)/2 + j;
+ long int d;
+ if (distances[dindex] >= 0)
+ d = distances[dindex];
+ else {
+ /* lev_edit_distance() signals failure as (size_t)-1, which turns
+ * negative when converted to long int */
+ d = lev_edit_distance(lengths[j], strings[j], leni, stri, 0);
+ if (d < 0) {
+ free(distances);
+ return (size_t)-1;
+ }
+ }
+ dist += weights[j]*d;
+ j++;
+ }
+ j++; /* no need to compare item with itself */
+ /* above diagonal: compute fresh distances and cache them for later rows */
+ while (j < n && dist < mindist) {
+ size_t dindex = (j - 1)*(j - 2)/2 + i;
+ distances[dindex] = lev_edit_distance(lengths[j], strings[j],
+ leni, stri, 0);
+ if (distances[dindex] < 0) {
+ free(distances);
+ return (size_t)-1;
+ }
+ dist += weights[j]*distances[dindex];
+ j++;
+ }
+
+ if (dist < mindist) {
+ mindist = dist;
+ minidx = i;
+ }
+ }
+
+ free(distances);
+ return minidx;
+}
+
+/**
+ * lev_u_set_median_index:
+ * @n: The size of @lengths, @strings, and @weights.
+ * @lengths: The lengths of @strings.
+ * @strings: An array of strings, that may contain NUL characters.
+ * @weights: The string weights (they behave exactly as multiplicities, though
+ * any positive value is allowed, not just integers).
+ *
+ * Finds the median string of a string set @strings, i.e. the member
+ * minimizing the sum of weighted edit distances to all other members.
+ * Wide-character counterpart of lev_set_median_index().
+ *
+ * Returns: An index in @strings pointing to the set median, (size_t)-1 in
+ * case of failure.
+ **/
+_LEV_STATIC_PY size_t
+lev_u_set_median_index(size_t n, const size_t *lengths,
+ const lev_wchar *strings[],
+ const double *weights)
+{
+ size_t minidx = 0;
+ double mindist = LEV_INFINITY;
+ size_t i;
+ long int *distances; /* lower-triangle cache of pairwise distances; -1 = not computed yet */
+
+ distances = (long int*)malloc((n*(n - 1)/2)*sizeof(long int));
+ if (!distances)
+ return (size_t)-1;
+
+ /* fill the cache with -1, i.e. all bits set (assumes two's complement) */
+ memset(distances, 0xff, (n*(n - 1)/2)*sizeof(long int)); /* XXX */
+ for (i = 0; i < n; i++) {
+ size_t j = 0;
+ double dist = 0.0;
+ const lev_wchar *stri = strings[i];
+ size_t leni = lengths[i];
+ /* below diagonal: distances to already-processed strings may be cached.
+ * both loops bail out as soon as dist can no longer beat mindist, which
+ * is why some cache entries can legitimately stay unset (-1) */
+ while (j < i && dist < mindist) {
+ size_t dindex = (i - 1)*(i - 2)/2 + j;
+ long int d;
+ if (distances[dindex] >= 0)
+ d = distances[dindex];
+ else {
+ /* lev_u_edit_distance() signals failure as (size_t)-1, which turns
+ * negative when converted to long int */
+ d = lev_u_edit_distance(lengths[j], strings[j], leni, stri, 0);
+ if (d < 0) {
+ free(distances);
+ return (size_t)-1;
+ }
+ }
+ dist += weights[j]*d;
+ j++;
+ }
+ j++; /* no need to compare item with itself */
+ /* above diagonal: compute fresh distances and cache them for later rows */
+ while (j < n && dist < mindist) {
+ size_t dindex = (j - 1)*(j - 2)/2 + i;
+ distances[dindex] = lev_u_edit_distance(lengths[j], strings[j],
+ leni, stri, 0);
+ if (distances[dindex] < 0) {
+ free(distances);
+ return (size_t)-1;
+ }
+ dist += weights[j]*distances[dindex];
+ j++;
+ }
+
+ if (dist < mindist) {
+ mindist = dist;
+ minidx = i;
+ }
+ }
+
+ free(distances);
+ return minidx;
+}
+
+/**
+ * lev_set_median:
+ * @n: The size of @lengths, @strings, and @weights.
+ * @lengths: The lengths of @strings.
+ * @strings: An array of strings, that may contain NUL characters.
+ * @weights: The string weights (they behave exactly as multiplicities, though
+ *           any positive value is allowed, not just integers).
+ * @medlength: Where the length of the median string should be stored.
+ *
+ * Finds the median string of a string set @strings and returns a copy of it.
+ *
+ * Returns: The set median as a newly allocated string, its length is stored
+ *          in @medlength. %NULL in the case of failure.
+ **/
+_LEV_STATIC_PY lev_byte*
+lev_set_median(size_t n, const size_t *lengths,
+               const lev_byte *strings[],
+               const double *weights,
+               size_t *medlength)
+{
+  size_t idx = lev_set_median_index(n, lengths, strings, weights);
+  size_t medlen;
+  lev_byte *med;
+
+  if (idx == (size_t)-1)
+    return NULL;
+
+  medlen = lengths[idx];
+  *medlength = medlen;
+  /* an empty median is returned as a 1-byte zeroed buffer */
+  if (medlen == 0)
+    return (lev_byte*)calloc(1, sizeof(lev_byte));
+
+  med = (lev_byte*)malloc(medlen*sizeof(lev_byte));
+  if (!med)
+    return NULL;
+  memcpy(med, strings[idx], medlen*sizeof(lev_byte));
+  return med;
+}
+
+/**
+ * lev_u_set_median:
+ * @n: The size of @lengths, @strings, and @weights.
+ * @lengths: The lengths of @strings.
+ * @strings: An array of strings, that may contain NUL characters.
+ * @weights: The string weights (they behave exactly as multiplicities, though
+ *           any positive value is allowed, not just integers).
+ * @medlength: Where the length of the median string should be stored.
+ *
+ * Finds the median string of a string set @strings and returns a copy of it.
+ * Wide-character counterpart of lev_set_median().
+ *
+ * Returns: The set median as a newly allocated string, its length is stored
+ *          in @medlength. %NULL in the case of failure.
+ **/
+_LEV_STATIC_PY lev_wchar*
+lev_u_set_median(size_t n, const size_t *lengths,
+                 const lev_wchar *strings[],
+                 const double *weights,
+                 size_t *medlength)
+{
+  size_t idx = lev_u_set_median_index(n, lengths, strings, weights);
+  size_t medlen;
+  lev_wchar *med;
+
+  if (idx == (size_t)-1)
+    return NULL;
+
+  medlen = lengths[idx];
+  *medlength = medlen;
+  /* an empty median is returned as a 1-character zeroed buffer */
+  if (medlen == 0)
+    return (lev_wchar*)calloc(1, sizeof(lev_wchar));
+
+  med = (lev_wchar*)malloc(medlen*sizeof(lev_wchar));
+  if (!med)
+    return NULL;
+  memcpy(med, strings[idx], medlen*sizeof(lev_wchar));
+  return med;
+}
+/* }}} */
+
+/****************************************************************************
+ *
+ * Set, sequence distances
+ *
+ ****************************************************************************/
+/* {{{ */
+
+/**
+ * lev_edit_seq_distance:
+ * @n1: The length of @lengths1 and @strings1.
+ * @lengths1: The lengths of strings in @strings1.
+ * @strings1: An array of strings that may contain NUL characters.
+ * @n2: The length of @lengths2 and @strings2.
+ * @lengths2: The lengths of strings in @strings2.
+ * @strings2: An array of strings that may contain NUL characters.
+ *
+ * Finds the distance between string sequences @strings1 and @strings2.
+ *
+ * In other words, this is a double-Levenshtein algorithm.
+ *
+ * The cost of string replace operation is based on string similarity: it's
+ * zero for identical strings and 2 for completely unsimilar strings.
+ *
+ * Returns: The distance of the two sequences, or -1.0 in the case of
+ *          failure (memory allocation).
+ **/
+_LEV_STATIC_PY double
+lev_edit_seq_distance(size_t n1, const size_t *lengths1,
+                      const lev_byte *strings1[],
+                      size_t n2, const size_t *lengths2,
+                      const lev_byte *strings2[])
+{
+  size_t i;
+  double *row;  /* we only need to keep one row of costs */
+  double *end;
+
+  /* strip common prefix */
+  while (n1 > 0 && n2 > 0
+         && *lengths1 == *lengths2
+         && memcmp(*strings1, *strings2,
+                   *lengths1*sizeof(lev_byte)) == 0) {
+    n1--;
+    n2--;
+    strings1++;
+    strings2++;
+    lengths1++;
+    lengths2++;
+  }
+
+  /* strip common suffix */
+  while (n1 > 0 && n2 > 0
+         && lengths1[n1-1] == lengths2[n2-1]
+         && memcmp(strings1[n1-1], strings2[n2-1],
+                   lengths1[n1-1]*sizeof(lev_byte)) == 0) {
+    n1--;
+    n2--;
+  }
+
+  /* catch trivial cases */
+  if (n1 == 0)
+    return n2;
+  if (n2 == 0)
+    return n1;
+
+  /* make the inner cycle (i.e. strings2) the longer one */
+  if (n1 > n2) {
+    size_t nx = n1;
+    const size_t *lx = lengths1;
+    const lev_byte **sx = strings1;
+    n1 = n2;
+    n2 = nx;
+    lengths1 = lengths2;
+    lengths2 = lx;
+    strings1 = strings2;
+    strings2 = sx;
+  }
+  n1++;
+  n2++;
+
+  /* initialize first row */
+  row = (double*)malloc(n2*sizeof(double));
+  if (!row)
+    return -1.0;
+  end = row + n2 - 1;
+  for (i = 0; i < n2; i++)
+    row[i] = (double)i;
+
+  /* go through the matrix and compute the costs. yes, this is an extremely
+   * obfuscated version, but also extremely memory-conservative and relatively
+   * fast. */
+  for (i = 1; i < n1; i++) {
+    double *p = row + 1;
+    const lev_byte *str1 = strings1[i - 1];
+    const size_t len1 = lengths1[i - 1];
+    const lev_byte **str2p = strings2;
+    const size_t *len2p = lengths2;
+    double D = i - 1.0;
+    double x = i;
+    while (p <= end) {
+      size_t l = len1 + *len2p;
+      double q;
+      if (l == 0) {
+        /* two empty strings: replace cost is 0 (q = D), but the column
+         * cursors must still advance, otherwise every later column of
+         * this row reads a stale strings2 entry */
+        q = D;
+        len2p++;
+        str2p++;
+      }
+      else {
+        size_t d = lev_edit_distance(len1, str1, *(len2p++), *(str2p++), 1);
+        if (d == (size_t)(-1)) {
+          free(row);
+          return -1.0;
+        }
+        /* normalized similarity-based replace cost, in [0, 2] */
+        q = D + 2.0/l*d;
+      }
+      x += 1.0;
+      if (x > q)
+        x = q;
+      D = *p;
+      if (x > D + 1.0)
+        x = D + 1.0;
+      *(p++) = x;
+    }
+  }
+
+  {
+    double q = *end;
+    free(row);
+    return q;
+  }
+}
+
+/**
+ * lev_u_edit_seq_distance:
+ * @n1: The length of @lengths1 and @strings1.
+ * @lengths1: The lengths of strings in @strings1.
+ * @strings1: An array of strings that may contain NUL characters.
+ * @n2: The length of @lengths2 and @strings2.
+ * @lengths2: The lengths of strings in @strings2.
+ * @strings2: An array of strings that may contain NUL characters.
+ *
+ * Finds the distance between string sequences @strings1 and @strings2.
+ *
+ * In other words, this is a double-Levenshtein algorithm.
+ *
+ * The cost of string replace operation is based on string similarity: it's
+ * zero for identical strings and 2 for completely unsimilar strings.
+ *
+ * Returns: The distance of the two sequences, or -1.0 in the case of
+ *          failure (memory allocation).
+ **/
+_LEV_STATIC_PY double
+lev_u_edit_seq_distance(size_t n1, const size_t *lengths1,
+                        const lev_wchar *strings1[],
+                        size_t n2, const size_t *lengths2,
+                        const lev_wchar *strings2[])
+{
+  size_t i;
+  double *row;  /* we only need to keep one row of costs */
+  double *end;
+
+  /* strip common prefix */
+  while (n1 > 0 && n2 > 0
+         && *lengths1 == *lengths2
+         && memcmp(*strings1, *strings2,
+                   *lengths1*sizeof(lev_wchar)) == 0) {
+    n1--;
+    n2--;
+    strings1++;
+    strings2++;
+    lengths1++;
+    lengths2++;
+  }
+
+  /* strip common suffix */
+  while (n1 > 0 && n2 > 0
+         && lengths1[n1-1] == lengths2[n2-1]
+         && memcmp(strings1[n1-1], strings2[n2-1],
+                   lengths1[n1-1]*sizeof(lev_wchar)) == 0) {
+    n1--;
+    n2--;
+  }
+
+  /* catch trivial cases */
+  if (n1 == 0)
+    return (double)n2;
+  if (n2 == 0)
+    return (double)n1;
+
+  /* make the inner cycle (i.e. strings2) the longer one */
+  if (n1 > n2) {
+    size_t nx = n1;
+    const size_t *lx = lengths1;
+    const lev_wchar **sx = strings1;
+    n1 = n2;
+    n2 = nx;
+    lengths1 = lengths2;
+    lengths2 = lx;
+    strings1 = strings2;
+    strings2 = sx;
+  }
+  n1++;
+  n2++;
+
+  /* initialize first row */
+  row = (double*)malloc(n2*sizeof(double));
+  if (!row)
+    return -1.0;
+  end = row + n2 - 1;
+  for (i = 0; i < n2; i++)
+    row[i] = (double)i;
+
+  /* go through the matrix and compute the costs. yes, this is an extremely
+   * obfuscated version, but also extremely memory-conservative and relatively
+   * fast. */
+  for (i = 1; i < n1; i++) {
+    double *p = row + 1;
+    const lev_wchar *str1 = strings1[i - 1];
+    const size_t len1 = lengths1[i - 1];
+    const lev_wchar **str2p = strings2;
+    const size_t *len2p = lengths2;
+    double D = i - 1.0;
+    double x = i;
+    while (p <= end) {
+      size_t l = len1 + *len2p;
+      double q;
+      if (l == 0) {
+        /* two empty strings: replace cost is 0 (q = D), but the column
+         * cursors must still advance, otherwise every later column of
+         * this row reads a stale strings2 entry */
+        q = D;
+        len2p++;
+        str2p++;
+      }
+      else {
+        size_t d = lev_u_edit_distance(len1, str1, *(len2p++), *(str2p++), 1);
+        if (d == (size_t)(-1)) {
+          free(row);
+          return -1.0;
+        }
+        /* normalized similarity-based replace cost, in [0, 2] */
+        q = D + 2.0/l*d;
+      }
+      x += 1.0;
+      if (x > q)
+        x = q;
+      D = *p;
+      if (x > D + 1.0)
+        x = D + 1.0;
+      *(p++) = x;
+    }
+  }
+
+  {
+    double q = *end;
+    free(row);
+    return q;
+  }
+}
+
+/**
+ * lev_set_distance:
+ * @n1: The length of @lengths1 and @strings1.
+ * @lengths1: The lengths of strings in @strings1.
+ * @strings1: An array of strings that may contain NUL characters.
+ * @n2: The length of @lengths2 and @strings2.
+ * @lengths2: The lengths of strings in @strings2.
+ * @strings2: An array of strings that may contain NUL characters.
+ *
+ * Finds the distance between string sets @strings1 and @strings2.
+ *
+ * The difference from lev_edit_seq_distance() is that order doesn't matter.
+ * The optimal association of @strings1 and @strings2 is found first and
+ * the similarity is computed for that.
+ *
+ * Uses sequential Munkers-Blackman algorithm.
+ *
+ * Returns: The distance of the two sets, or -1.0 in the case of failure.
+ **/
+_LEV_STATIC_PY double
+lev_set_distance(size_t n1, const size_t *lengths1,
+                 const lev_byte *strings1[],
+                 size_t n2, const size_t *lengths2,
+                 const lev_byte *strings2[])
+{
+  double *dists;  /* the (modified) distance matrix, indexed [row*n1 + col] */
+  double *r;
+  size_t i, j;
+  size_t *map;
+  double sum;
+
+  /* catch trivial cases */
+  if (n1 == 0)
+    return (double)n2;
+  if (n2 == 0)
+    return (double)n1;
+
+  /* make the number of columns (n1) smaller than the number of rows */
+  if (n1 > n2) {
+    size_t nx = n1;
+    const size_t *lx = lengths1;
+    const lev_byte **sx = strings1;
+    n1 = n2;
+    n2 = nx;
+    lengths1 = lengths2;
+    lengths2 = lx;
+    strings1 = strings2;
+    strings2 = sx;
+  }
+
+  /* compute normalized distances from each to each */
+  r = dists = (double*)malloc(n1*n2*sizeof(double));
+  if (!r)
+    return -1.0;
+  for (i = 0; i < n2; i++) {
+    size_t len2 = lengths2[i];
+    const lev_byte *str2 = strings2[i];
+    const size_t *len1p = lengths1;
+    const lev_byte **str1p = strings1;
+    for (j = 0; j < n1; j++) {
+      size_t l = len2 + *len1p;
+      if (l == 0) {
+        /* two empty strings: distance 0.  the column cursors must still
+         * advance, otherwise the rest of this row reads stale entries */
+        *(r++) = 0.0;
+        len1p++;
+        str1p++;
+      }
+      else {
+        size_t d = lev_edit_distance(len2, str2, *(len1p++), *(str1p)++, 1);
+        if (d == (size_t)(-1)) {
+          /* free the start of the matrix, not the advanced cursor r */
+          free(dists);
+          return -1.0;
+        }
+        *(r++) = (double)d/l;
+      }
+    }
+  }
+
+  /* find the optimal mapping between the two sets */
+  map = munkers_blackman(n1, n2, dists);
+  if (!map)
+    return -1.0;
+
+  /* sum the set distance: unmatched strings cost 1 each, matched pairs
+   * cost their normalized edit distance (in [0, 2]) */
+  sum = n2 - n1;
+  for (j = 0; j < n1; j++) {
+    size_t l;
+    i = map[j];
+    l = lengths1[j] + lengths2[i];
+    if (l > 0) {
+      size_t d = lev_edit_distance(lengths1[j], strings1[j],
+                                   lengths2[i], strings2[i], 1);
+      if (d == (size_t)(-1)) {
+        free(map);
+        return -1.0;
+      }
+      sum += 2.0*d/l;
+    }
+  }
+  free(map);
+
+  return sum;
+}
+
+/**
+ * lev_u_set_distance:
+ * @n1: The length of @lengths1 and @strings1.
+ * @lengths1: The lengths of strings in @strings1.
+ * @strings1: An array of strings that may contain NUL characters.
+ * @n2: The length of @lengths2 and @strings2.
+ * @lengths2: The lengths of strings in @strings2.
+ * @strings2: An array of strings that may contain NUL characters.
+ *
+ * Finds the distance between string sets @strings1 and @strings2.
+ *
+ * The difference from lev_u_edit_seq_distance() is that order doesn't matter.
+ * The optimal association of @strings1 and @strings2 is found first and
+ * the similarity is computed for that.
+ *
+ * Uses sequential Munkers-Blackman algorithm.
+ *
+ * Returns: The distance of the two sets, or -1.0 in the case of failure.
+ **/
+_LEV_STATIC_PY double
+lev_u_set_distance(size_t n1, const size_t *lengths1,
+                   const lev_wchar *strings1[],
+                   size_t n2, const size_t *lengths2,
+                   const lev_wchar *strings2[])
+{
+  double *dists;  /* the (modified) distance matrix, indexed [row*n1 + col] */
+  double *r;
+  size_t i, j;
+  size_t *map;
+  double sum;
+
+  /* catch trivial cases */
+  if (n1 == 0)
+    return (double)n2;
+  if (n2 == 0)
+    return (double)n1;
+
+  /* make the number of columns (n1) smaller than the number of rows */
+  if (n1 > n2) {
+    size_t nx = n1;
+    const size_t *lx = lengths1;
+    const lev_wchar **sx = strings1;
+    n1 = n2;
+    n2 = nx;
+    lengths1 = lengths2;
+    lengths2 = lx;
+    strings1 = strings2;
+    strings2 = sx;
+  }
+
+  /* compute normalized distances from each to each */
+  r = dists = (double*)malloc(n1*n2*sizeof(double));
+  if (!r)
+    return -1.0;
+  for (i = 0; i < n2; i++) {
+    size_t len2 = lengths2[i];
+    const lev_wchar *str2 = strings2[i];
+    const size_t *len1p = lengths1;
+    const lev_wchar **str1p = strings1;
+    for (j = 0; j < n1; j++) {
+      size_t l = len2 + *len1p;
+      if (l == 0) {
+        /* two empty strings: distance 0.  the column cursors must still
+         * advance, otherwise the rest of this row reads stale entries */
+        *(r++) = 0.0;
+        len1p++;
+        str1p++;
+      }
+      else {
+        size_t d = lev_u_edit_distance(len2, str2, *(len1p++), *(str1p)++, 1);
+        if (d == (size_t)(-1)) {
+          /* free the start of the matrix, not the advanced cursor r */
+          free(dists);
+          return -1.0;
+        }
+        *(r++) = (double)d/l;
+      }
+    }
+  }
+
+  /* find the optimal mapping between the two sets */
+  map = munkers_blackman(n1, n2, dists);
+  if (!map)
+    return -1.0;
+
+  /* sum the set distance: unmatched strings cost 1 each, matched pairs
+   * cost their normalized edit distance (in [0, 2]) */
+  sum = n2 - n1;
+  for (j = 0; j < n1; j++) {
+    size_t l;
+    i = map[j];
+    l = lengths1[j] + lengths2[i];
+    if (l > 0) {
+      size_t d = lev_u_edit_distance(lengths1[j], strings1[j],
+                                     lengths2[i], strings2[i], 1);
+      if (d == (size_t)(-1)) {
+        free(map);
+        return -1.0;
+      }
+      sum += 2.0*d/l;
+    }
+  }
+  free(map);
+
+  return sum;
+}
+
+/*
+ * Munkers-Blackman (assignment) algorithm, sequential version.
+ *
+ * @n1: number of columns (must not exceed @n2).
+ * @n2: number of rows.
+ * @dists: the n2 x n1 cost matrix, indexed [row*n1 + col].  It is modified
+ *         and ALWAYS freed by this function -- including on failure -- so
+ *         callers must not touch it afterwards.
+ *
+ * Returns a newly allocated array of n1 row indices (the optimal
+ * column-to-row assignment), or NULL on allocation failure.
+ */
+static size_t*
+munkers_blackman(size_t n1, size_t n2, double *dists)
+{
+  size_t i, j;
+  size_t *covc, *covr;  /* 1 if column/row is covered */
+  /* these contain 1-based indices, so we can use zero as `none'
+   * zstarr: column of a z* in given row
+   * zstarc: row of a z* in given column
+   * zprimer: column of a z' in given row */
+  size_t *zstarr, *zstarc, *zprimer;
+
+  /* allocate memory; on failure also release the caller's cost matrix,
+   * since we own it on every path */
+  covc = (size_t*)calloc(n1, sizeof(size_t));
+  if (!covc) {
+    free(dists);
+    return NULL;
+  }
+  zstarc = (size_t*)calloc(n1, sizeof(size_t));
+  if (!zstarc) {
+    free(covc);
+    free(dists);
+    return NULL;
+  }
+  covr = (size_t*)calloc(n2, sizeof(size_t));
+  if (!covr) {
+    free(zstarc);
+    free(covc);
+    free(dists);
+    return NULL;
+  }
+  zstarr = (size_t*)calloc(n2, sizeof(size_t));
+  if (!zstarr) {
+    free(covr);
+    free(zstarc);
+    free(covc);
+    free(dists);
+    return NULL;
+  }
+  zprimer = (size_t*)calloc(n2, sizeof(size_t));
+  if (!zprimer) {
+    free(zstarr);
+    free(covr);
+    free(zstarc);
+    free(covc);
+    free(dists);
+    return NULL;
+  }
+
+  /* step 0 (subtract minimal distance) and step 1 (find zeroes) */
+  for (j = 0; j < n1; j++) {
+    size_t minidx = 0;
+    double *col = dists + j;
+    double min = *col;
+    double *p = col + n1;
+    for (i = 1; i < n2; i++) {
+      if (min > *p) {
+        minidx = i;
+        min = *p;
+      }
+      p += n1;
+    }
+    /* subtract */
+    p = col;
+    for (i = 0; i < n2; i++) {
+      *p -= min;
+      if (*p < LEV_EPSILON)
+        *p = 0.0;
+      p += n1;
+    }
+    /* star the zero, if possible */
+    if (!zstarc[j] && !zstarr[minidx]) {
+      zstarc[j] = minidx + 1;
+      zstarr[minidx] = j + 1;
+    }
+    else {
+      /* otherwise try to find some other */
+      p = col;
+      for (i = 0; i < n2; i++) {
+        if (i != minidx && *p == 0.0
+            && !zstarc[j] && !zstarr[i]) {
+          zstarc[j] = i + 1;
+          zstarr[i] = j + 1;
+          break;
+        }
+        p += n1;
+      }
+    }
+  }
+
+  /* main */
+  while (1) {
+    /* step 2 (cover columns containing z*) */
+    {
+      size_t nc = 0;
+      for (j = 0; j < n1; j++) {
+        if (zstarc[j]) {
+          covc[j] = 1;
+          nc++;
+        }
+      }
+      if (nc == n1)
+        break;
+    }
+
+    /* step 3 (find uncovered zeroes) */
+    while (1) {
+      step_3:
+      /* search uncovered matrix entries */
+      for (j = 0; j < n1; j++) {
+        double *p = dists + j;
+        if (covc[j])
+          continue;
+        for (i = 0; i < n2; i++) {
+          if (!covr[i] && *p == 0.0) {
+            /* when a zero is found, prime it */
+            zprimer[i] = j + 1;
+            if (zstarr[i]) {
+              /* if there's a z* in the same row,
+               * uncover the column, cover the row and redo */
+              covr[i] = 1;
+              covc[zstarr[i] - 1] = 0;
+              goto step_3;
+            }
+            /* if there's no z*,
+             * we are at the end of our path and can convert z'
+             * to z* */
+            goto step_4;
+          }
+          p += n1;
+        }
+      }
+
+      /* step 5 (new zero manufacturer)
+       * we can't get here, unless no zero is found at all */
+      {
+        /* find the smallest uncovered entry */
+        double min = LEV_INFINITY;
+        for (j = 0; j < n1; j++) {
+          double *p = dists + j;
+          if (covc[j])
+            continue;
+          for (i = 0; i < n2; i++) {
+            if (!covr[i] && min > *p) {
+              min = *p;
+            }
+            p += n1;
+          }
+        }
+        /* add it to all covered rows */
+        for (i = 0; i < n2; i++) {
+          double *p = dists + i*n1;
+          if (!covr[i])
+            continue;
+          for (j = 0; j < n1; j++)
+            *(p++) += min;
+        }
+        /* subtract it from all uncovered columns */
+        for (j = 0; j < n1; j++) {
+          double *p = dists + j;
+          if (covc[j])
+            continue;
+          for (i = 0; i < n2; i++) {
+            *p -= min;
+            if (*p < LEV_EPSILON)
+              *p = 0.0;
+            p += n1;
+          }
+        }
+      }
+    }
+
+    /* step 4 (increment the number of z*)
+     * i is the row number (we get it from step 3) */
+    step_4:
+    i++;
+    do {
+      size_t x = i;
+
+      i--;
+      j = zprimer[i] - 1;  /* move to z' in the same row */
+      zstarr[i] = j + 1;   /* mark it as z* in row buffer */
+      i = zstarc[j];       /* move to z* in the same column */
+      zstarc[j] = x;       /* mark the z' as being new z* */
+    } while (i);
+    memset(zprimer, 0, n2*sizeof(size_t));
+    memset(covr, 0, n2*sizeof(size_t));
+    memset(covc, 0, n1*sizeof(size_t));
+  }
+
+  free(dists);
+  free(covc);
+  free(covr);
+  /*free(zstarc); this is the result */
+  free(zstarr);
+  free(zprimer);
+
+  /* convert the stored 1-based row indices to 0-based */
+  for (j = 0; j < n1; j++)
+    zstarc[j]--;
+  return zstarc;
+}
+/* }}} */
+
+/****************************************************************************
+ *
+ * Editops and other difflib-like stuff.
+ *
+ ****************************************************************************/
+/* {{{ */
+
+/**
+ * lev_editops_check_errors:
+ * @len1: The length of an eventual @ops source string.
+ * @len2: The length of an eventual @ops destination string.
+ * @n: The length of @ops.
+ * @ops: An array of elementary edit operations.
+ *
+ * Checks whether @ops is consistent and applicable as a partial edit from a
+ * string of length @len1 to a string of length @len2.
+ *
+ * Returns: Zero if @ops seems OK, a nonzero error code otherwise.
+ **/
+_LEV_STATIC_PY int
+lev_editops_check_errors(size_t len1, size_t len2,
+                         size_t n, const LevEditOp *ops)
+{
+  size_t i;
+
+  /* an empty operation list is trivially applicable */
+  if (n == 0)
+    return LEV_EDIT_ERR_OK;
+
+  /* check bounds of every operation */
+  for (i = 0; i < n; i++) {
+    const LevEditOp *op = ops + i;
+    if (op->type >= LEV_EDIT_LAST)
+      return LEV_EDIT_ERR_TYPE;
+    if (op->spos > len1 || op->dpos > len2)
+      return LEV_EDIT_ERR_OUT;
+    /* only an insert may sit at the very end of the source, and only a
+     * delete at the very end of the destination */
+    if (op->spos == len1 && op->type != LEV_EDIT_INSERT)
+      return LEV_EDIT_ERR_OUT;
+    if (op->dpos == len2 && op->type != LEV_EDIT_DELETE)
+      return LEV_EDIT_ERR_OUT;
+  }
+
+  /* positions must be nondecreasing in both strings */
+  for (i = 1; i < n; i++) {
+    if (ops[i].spos < ops[i - 1].spos || ops[i].dpos < ops[i - 1].dpos)
+      return LEV_EDIT_ERR_ORDER;
+  }
+
+  return LEV_EDIT_ERR_OK;
+}
+
+/**
+ * lev_opcodes_check_errors:
+ * @len1: The length of an eventual @ops source string.
+ * @len2: The length of an eventual @ops destination string.
+ * @nb: The length of @bops.
+ * @bops: An array of difflib block edit operation codes.
+ *
+ * Checks whether @bops is consistent and applicable as an edit from a
+ * string of length @len1 to a string of length @len2.
+ *
+ * Returns: Zero if @bops seems OK, a nonzero error code otherwise.
+ * Note an empty @bops yields the bare code 1, not one of the named
+ * LEV_EDIT_ERR_* constants (opcodes must span the whole strings).
+ **/
+_LEV_STATIC_PY int
+lev_opcodes_check_errors(size_t len1, size_t len2,
+ size_t nb, const LevOpCode *bops)
+{
+ const LevOpCode *b;
+ size_t i;
+
+ if (!nb)
+ return 1;
+
+ /* check completeness: the blocks must start at 0 and end exactly at the
+ * string lengths */
+ if (bops->sbeg || bops->dbeg
+ || bops[nb - 1].send != len1 || bops[nb - 1].dend != len2)
+ return LEV_EDIT_ERR_SPAN;
+
+ /* check bounds and block consistency */
+ b = bops;
+ for (i = nb; i; i--, b++) {
+ if (b->send > len1 || b->dend > len2)
+ return LEV_EDIT_ERR_OUT;
+ switch (b->type) {
+ case LEV_EDIT_KEEP:
+ case LEV_EDIT_REPLACE:
+ /* keep/replace blocks map equal-length nonempty spans */
+ if (b->dend - b->dbeg != b->send - b->sbeg || b->dend == b->dbeg)
+ return LEV_EDIT_ERR_BLOCK;
+ break;
+
+ case LEV_EDIT_INSERT:
+ /* insert blocks consume nothing and produce something */
+ if (b->dend - b->dbeg == 0 || b->send - b->sbeg != 0)
+ return LEV_EDIT_ERR_BLOCK;
+ break;
+
+ case LEV_EDIT_DELETE:
+ /* delete blocks consume something and produce nothing */
+ if (b->send - b->sbeg == 0 || b->dend - b->dbeg != 0)
+ return LEV_EDIT_ERR_BLOCK;
+ break;
+
+ default:
+ return LEV_EDIT_ERR_TYPE;
+ break;
+ }
+ }
+
+ /* check ordering: consecutive blocks must be contiguous in both strings */
+ b = bops + 1;
+ for (i = nb - 1; i; i--, b++, bops++) {
+ if (b->sbeg != bops->send || b->dbeg != bops->dend)
+ return LEV_EDIT_ERR_ORDER;
+ }
+
+ return LEV_EDIT_ERR_OK;
+}
+
+/**
+ * lev_editops_invert:
+ * @n: The length of @ops.
+ * @ops: An array of elementary edit operations.
+ *
+ * Inverts the sense of @ops. It is modified in place.
+ *
+ * In other words, @ops becomes a valid partial edit for the original source
+ * and destination strings with their roles exchanged.
+ **/
+_LEV_STATIC_PY void
+lev_editops_invert(size_t n, LevEditOp *ops)
+{
+  size_t idx;
+
+  for (idx = 0; idx < n; idx++) {
+    size_t tmp;
+
+    /* exchange source and destination positions */
+    tmp = ops[idx].dpos;
+    ops[idx].dpos = ops[idx].spos;
+    ops[idx].spos = tmp;
+    /* for the two asymmetric operation types (the ones with bit 1 set,
+     * presumably insert/delete) flip the low bit to swap them */
+    if (ops[idx].type & 2)
+      ops[idx].type ^= 1;
+  }
+}
+
+/**
+ * lev_editops_apply:
+ * @len1: The length of @string1.
+ * @string1: A string of length @len1, may contain NUL characters.
+ * @len2: The length of @string2.
+ * @string2: A string of length @len2, may contain NUL characters.
+ * @n: The size of @ops.
+ * @ops: An array of elementary edit operations.
+ * @len: Where the size of the resulting string should be stored.
+ *
+ * Applies a partial edit @ops from @string1 to @string2.
+ *
+ * NB: @ops is not checked for applicability.
+ *
+ * Returns: The result of the partial edit as a newly allocated string, its
+ * length is stored in @len.
+ **/
+_LEV_STATIC_PY lev_byte*
+lev_editops_apply(size_t len1, const lev_byte *string1,
+ __attribute__((unused)) size_t len2, const lev_byte *string2,
+ size_t n, const LevEditOp *ops,
+ size_t *len)
+{
+ lev_byte *dst, *dpos; /* destination string */
+ const lev_byte *spos; /* source string position */
+ size_t i, j;
+
+ /* this looks too complex for such a simple task, but note ops is not
+ * a complete edit sequence, we have to be able to apply anything anywhere */
+ dpos = dst = (lev_byte*)malloc((n + len1)*sizeof(lev_byte));
+ if (!dst) {
+ *len = (size_t)(-1);
+ return NULL;
+ }
+ spos = string1;
+ for (i = n; i; i--, ops++) {
+ /* XXX: this fine with gcc internal memcpy, but when memcpy is
+ * actually a function, it may be pretty slow */
+ j = ops->spos - (spos - string1) + (ops->type == LEV_EDIT_KEEP);
+ if (j) {
+ memcpy(dpos, spos, j*sizeof(lev_byte));
+ spos += j;
+ dpos += j;
+ }
+ switch (ops->type) {
+ case LEV_EDIT_DELETE:
+ spos++;
+ break;
+
+ case LEV_EDIT_REPLACE:
+ spos++;
+ case LEV_EDIT_INSERT:
+ *(dpos++) = string2[ops->dpos];
+ break;
+
+ default:
+ break;
+ }
+ }
+ j = len1 - (spos - string1);
+ if (j) {
+ memcpy(dpos, spos, j*sizeof(lev_byte));
+ spos += j;
+ dpos += j;
+ }
+
+ *len = dpos - dst;
+ /* possible realloc failure is detected with *len != 0 */
+ return realloc(dst, *len*sizeof(lev_byte));
+}
+
+/**
+ * lev_u_editops_apply:
+ * @len1: The length of @string1.
+ * @string1: A string of length @len1, may contain NUL characters.
+ * @len2: The length of @string2.
+ * @string2: A string of length @len2, may contain NUL characters.
+ * @n: The size of @ops.
+ * @ops: An array of elementary edit operations.
+ * @len: Where the size of the resulting string should be stored.
+ *
+ * Applies a partial edit @ops from @string1 to @string2.
+ *
+ * NB: @ops is not checked for applicability.
+ *
+ * Returns: The result of the partial edit as a newly allocated string, its
+ * length is stored in @len.
+ **/
+_LEV_STATIC_PY lev_wchar*
+lev_u_editops_apply(size_t len1, const lev_wchar *string1,
+ __attribute__((unused)) size_t len2,
+ const lev_wchar *string2,
+ size_t n, const LevEditOp *ops,
+ size_t *len)
+{
+ lev_wchar *dst, *dpos; /* destination string */
+ const lev_wchar *spos; /* source string position */
+ size_t i, j;
+
+ /* this looks too complex for such a simple task, but note ops is not
+ * a complete edit sequence, we have to be able to apply anything anywhere */
+ dpos = dst = (lev_wchar*)malloc((n + len1)*sizeof(lev_wchar));
+ if (!dst) {
+ *len = (size_t)(-1);
+ return NULL;
+ }
+ spos = string1;
+ for (i = n; i; i--, ops++) {
+ /* XXX: this is fine with gcc internal memcpy, but when memcpy is
+ * actually a function, it may be pretty slow */
+ j = ops->spos - (spos - string1) + (ops->type == LEV_EDIT_KEEP);
+ if (j) {
+ memcpy(dpos, spos, j*sizeof(lev_wchar));
+ spos += j;
+ dpos += j;
+ }
+ switch (ops->type) {
+ case LEV_EDIT_DELETE:
+ spos++;
+ break;
+
+ case LEV_EDIT_REPLACE:
+ spos++;
+ case LEV_EDIT_INSERT:
+ *(dpos++) = string2[ops->dpos];
+ break;
+
+ default:
+ break;
+ }
+ }
+ j = len1 - (spos - string1);
+ if (j) {
+ memcpy(dpos, spos, j*sizeof(lev_wchar));
+ spos += j;
+ dpos += j;
+ }
+
+ *len = dpos - dst;
+ /* possible realloc failure is detected with *len != 0 */
+ return realloc(dst, *len*sizeof(lev_wchar));
+}
+
/**
 * editops_from_cost_matrix:
 * @len1: The length of @string1.
 * @string1: A string of length @len1, may contain NUL characters.
 * @off1: The offset where the matrix starts from the start of @string1.
 * @len2: The length of @string2.
 * @string2: A string of length @len2, may contain NUL characters.
 * @off2: The offset where the matrix starts from the start of @string2.
 * @matrix: The cost matrix, a row-major len1 x len2 array of costs.
 * @n: Where the number of edit operations should be stored.
 *
 * Reconstructs the optimal edit sequence from the cost matrix @matrix by
 * walking it backwards from the bottom-right corner.
 *
 * The matrix is freed.
 *
 * Returns: The optimal edit sequence, as a newly allocated array of
 *          elementary edit operations, its length is stored in @n.
 *          NULL is returned for an empty sequence (@n == 0) and on
 *          allocation failure (@n == (size_t)-1).
 **/
static LevEditOp*
editops_from_cost_matrix(size_t len1, const lev_byte *string1, size_t off1,
                         size_t len2, const lev_byte *string2, size_t off2,
                         size_t *matrix, size_t *n)
{
  size_t *p;  /* current matrix cell in the backward walk */
  size_t i, j, pos;
  LevEditOp *ops;
  /* last movement: -1 = insert (left), +1 = delete (up), 0 = diagonal;
   * used to prefer continuing runs of the same operation type */
  int dir = 0;

  /* the bottom-right cell holds the total edit distance == number of ops */
  pos = *n = matrix[len1*len2 - 1];
  if (!*n) {
    free(matrix);
    return NULL;
  }
  ops = (LevEditOp*)malloc((*n)*sizeof(LevEditOp));
  if (!ops) {
    free(matrix);
    *n = (size_t)(-1);
    return NULL;
  }
  i = len1 - 1;
  j = len2 - 1;
  p = matrix + len1*len2 - 1;
  /* ops is filled backwards, from ops[*n - 1] down to ops[0] */
  while (i || j) {
    /* prefer continuing in the same direction */
    if (dir < 0 && j && *p == *(p - 1) + 1) {
      pos--;
      ops[pos].type = LEV_EDIT_INSERT;
      ops[pos].spos = i + off1;
      ops[pos].dpos = --j + off2;
      p--;
      continue;
    }
    if (dir > 0 && i && *p == *(p - len2) + 1) {
      pos--;
      ops[pos].type = LEV_EDIT_DELETE;
      ops[pos].spos = --i + off1;
      ops[pos].dpos = j + off2;
      p -= len2;
      continue;
    }
    /* diagonal move with equal characters: a no-cost match */
    if (i && j && *p == *(p - len2 - 1)
        && string1[i - 1] == string2[j - 1]) {
      /* don't be stupid like difflib, don't store LEV_EDIT_KEEP */
      i--;
      j--;
      p -= len2 + 1;
      dir = 0;
      continue;
    }
    /* diagonal move with cost 1: a replacement */
    if (i && j && *p == *(p - len2 - 1) + 1) {
      pos--;
      ops[pos].type = LEV_EDIT_REPLACE;
      ops[pos].spos = --i + off1;
      ops[pos].dpos = --j + off2;
      p -= len2 + 1;
      dir = 0;
      continue;
    }
    /* we can't turn directly from -1 to 1, in this case it would be better
     * to go diagonally, but check it (dir == 0) */
    if (dir == 0 && j && *p == *(p - 1) + 1) {
      pos--;
      ops[pos].type = LEV_EDIT_INSERT;
      ops[pos].spos = i + off1;
      ops[pos].dpos = --j + off2;
      p--;
      dir = -1;
      continue;
    }
    if (dir == 0 && i && *p == *(p - len2) + 1) {
      pos--;
      ops[pos].type = LEV_EDIT_DELETE;
      ops[pos].spos = --i + off1;
      ops[pos].dpos = j + off2;
      p -= len2;
      dir = 1;
      continue;
    }
    /* an inconsistent matrix would loop forever here; coredump right now,
     * later might be too late ;-) */
    assert("lost in the cost matrix" == NULL);
  }
  free(matrix);

  return ops;
}
+
/**
 * lev_editops_find:
 * @len1: The length of @string1.
 * @string1: A string of length @len1, may contain NUL characters.
 * @len2: The length of @string2.
 * @string2: A string of length @len2, may contain NUL characters.
 * @n: Where the number of edit operations should be stored.
 *
 * Find an optimal edit sequence from @string1 to @string2.
 *
 * When there's more than one optimal sequence, a one is arbitrarily (though
 * deterministically) chosen.
 *
 * Returns: The optimal edit sequence, as a newly allocated array of
 *          elementary edit operations, its length is stored in @n.
 *          It is normalized, i.e., keep operations are not included.
 **/
_LEV_STATIC_PY LevEditOp*
lev_editops_find(size_t len1, const lev_byte *string1,
                 size_t len2, const lev_byte *string2,
                 size_t *n)
{
  size_t len1o, len2o;  /* lengths of the stripped common prefix */
  size_t i;
  size_t *matrix; /* cost matrix */

  /* strip common prefix; it needs no edit operations, only an offset
   * (len1o == len2o since the common prefix has equal length in both) */
  len1o = 0;
  while (len1 > 0 && len2 > 0 && *string1 == *string2) {
    len1--;
    len2--;
    string1++;
    string2++;
    len1o++;
  }
  len2o = len1o;

  /* strip common suffix */
  while (len1 > 0 && len2 > 0 && string1[len1-1] == string2[len2-1]) {
    len1--;
    len2--;
  }
  /* account for the border row and column of the cost matrix */
  len1++;
  len2++;

  /* initialize first row and column */
  matrix = (size_t*)malloc(len1*len2*sizeof(size_t));
  if (!matrix) {
    *n = (size_t)(-1);
    return NULL;
  }
  for (i = 0; i < len2; i++)
    matrix[i] = i;
  for (i = 1; i < len1; i++)
    matrix[len2*i] = i;

  /* find the costs and fill the matrix, keeping every cell (unlike plain
   * distance computation) so the path can be reconstructed afterwards */
  for (i = 1; i < len1; i++) {
    size_t *prev = matrix + (i - 1)*len2;
    size_t *p = matrix + i*len2;
    size_t *end = p + len2 - 1;
    const lev_byte char1 = string1[i - 1];
    const lev_byte *char2p = string2;
    size_t x = i;
    p++;
    while (p <= end) {
      /* x = min(deletion, substitution, insertion) cost for this cell */
      size_t c3 = *(prev++) + (char1 != *(char2p++));
      x++;
      if (x > c3)
        x = c3;
      c3 = *prev + 1;
      if (x > c3)
        x = c3;
      *(p++) = x;
    }
  }

  /* find the way back */
  return editops_from_cost_matrix(len1, string1, len1o,
                                  len2, string2, len2o,
                                  matrix, n);
}
+
/**
 * ueditops_from_cost_matrix:
 * @len1: The length of @string1.
 * @string1: A string of length @len1, may contain NUL characters.
 * @o1: The offset where the matrix starts from the start of @string1.
 * @len2: The length of @string2.
 * @string2: A string of length @len2, may contain NUL characters.
 * @o2: The offset where the matrix starts from the start of @string2.
 * @matrix: The cost matrix, a row-major len1 x len2 array of costs.
 * @n: Where the number of edit operations should be stored.
 *
 * Reconstructs the optimal edit sequence from the cost matrix @matrix by
 * walking it backwards from the bottom-right corner (wide-char variant of
 * editops_from_cost_matrix()).
 *
 * The matrix is freed.
 *
 * Returns: The optimal edit sequence, as a newly allocated array of
 *          elementary edit operations, its length is stored in @n.
 *          NULL is returned for an empty sequence (@n == 0) and on
 *          allocation failure (@n == (size_t)-1).
 **/
static LevEditOp*
ueditops_from_cost_matrix(size_t len1, const lev_wchar *string1, size_t o1,
                          size_t len2, const lev_wchar *string2, size_t o2,
                          size_t *matrix, size_t *n)
{
  size_t *p;  /* current matrix cell in the backward walk */
  size_t i, j, pos;
  LevEditOp *ops;
  /* last movement: -1 = insert (left), +1 = delete (up), 0 = diagonal;
   * used to prefer continuing runs of the same operation type */
  int dir = 0;

  /* the bottom-right cell holds the total edit distance == number of ops */
  pos = *n = matrix[len1*len2 - 1];
  if (!*n) {
    free(matrix);
    return NULL;
  }
  ops = (LevEditOp*)malloc((*n)*sizeof(LevEditOp));
  if (!ops) {
    free(matrix);
    *n = (size_t)(-1);
    return NULL;
  }
  i = len1 - 1;
  j = len2 - 1;
  p = matrix + len1*len2 - 1;
  /* ops is filled backwards, from ops[*n - 1] down to ops[0] */
  while (i || j) {
    /* prefer continuing in the same direction */
    if (dir < 0 && j && *p == *(p - 1) + 1) {
      pos--;
      ops[pos].type = LEV_EDIT_INSERT;
      ops[pos].spos = i + o1;
      ops[pos].dpos = --j + o2;
      p--;
      continue;
    }
    if (dir > 0 && i && *p == *(p - len2) + 1) {
      pos--;
      ops[pos].type = LEV_EDIT_DELETE;
      ops[pos].spos = --i + o1;
      ops[pos].dpos = j + o2;
      p -= len2;
      continue;
    }
    /* diagonal move with equal characters: a no-cost match */
    if (i && j && *p == *(p - len2 - 1)
        && string1[i - 1] == string2[j - 1]) {
      /* don't be stupid like difflib, don't store LEV_EDIT_KEEP */
      i--;
      j--;
      p -= len2 + 1;
      dir = 0;
      continue;
    }
    /* diagonal move with cost 1: a replacement */
    if (i && j && *p == *(p - len2 - 1) + 1) {
      pos--;
      ops[pos].type = LEV_EDIT_REPLACE;
      ops[pos].spos = --i + o1;
      ops[pos].dpos = --j + o2;
      p -= len2 + 1;
      dir = 0;
      continue;
    }
    /* we can't turn directly from -1 to 1, in this case it would be better
     * to go diagonally, but check it (dir == 0) */
    if (dir == 0 && j && *p == *(p - 1) + 1) {
      pos--;
      ops[pos].type = LEV_EDIT_INSERT;
      ops[pos].spos = i + o1;
      ops[pos].dpos = --j + o2;
      p--;
      dir = -1;
      continue;
    }
    if (dir == 0 && i && *p == *(p - len2) + 1) {
      pos--;
      ops[pos].type = LEV_EDIT_DELETE;
      ops[pos].spos = --i + o1;
      ops[pos].dpos = j + o2;
      p -= len2;
      dir = 1;
      continue;
    }
    /* an inconsistent matrix would loop forever here; coredump right now,
     * later might be too late ;-) */
    assert("lost in the cost matrix" == NULL);
  }
  free(matrix);

  return ops;
}
+
/**
 * lev_u_editops_find:
 * @len1: The length of @string1.
 * @string1: A string of length @len1, may contain NUL characters.
 * @len2: The length of @string2.
 * @string2: A string of length @len2, may contain NUL characters.
 * @n: Where the number of edit operations should be stored.
 *
 * Find an optimal edit sequence from @string1 to @string2 (wide-char
 * variant of lev_editops_find()).
 *
 * When there's more than one optimal sequence, a one is arbitrarily (though
 * deterministically) chosen.
 *
 * Returns: The optimal edit sequence, as a newly allocated array of
 *          elementary edit operations, its length is stored in @n.
 *          It is normalized, i.e., keep operations are not included.
 **/
_LEV_STATIC_PY LevEditOp*
lev_u_editops_find(size_t len1, const lev_wchar *string1,
                   size_t len2, const lev_wchar *string2,
                   size_t *n)
{
  size_t len1o, len2o;  /* lengths of the stripped common prefix */
  size_t i;
  size_t *matrix; /* cost matrix */

  /* strip common prefix; it needs no edit operations, only an offset
   * (len1o == len2o since the common prefix has equal length in both) */
  len1o = 0;
  while (len1 > 0 && len2 > 0 && *string1 == *string2) {
    len1--;
    len2--;
    string1++;
    string2++;
    len1o++;
  }
  len2o = len1o;

  /* strip common suffix */
  while (len1 > 0 && len2 > 0 && string1[len1-1] == string2[len2-1]) {
    len1--;
    len2--;
  }
  /* account for the border row and column of the cost matrix */
  len1++;
  len2++;

  /* initialize first row and column */
  matrix = (size_t*)malloc(len1*len2*sizeof(size_t));
  if (!matrix) {
    *n = (size_t)(-1);
    return NULL;
  }
  for (i = 0; i < len2; i++)
    matrix[i] = i;
  for (i = 1; i < len1; i++)
    matrix[len2*i] = i;

  /* find the costs and fill the matrix, keeping every cell (unlike plain
   * distance computation) so the path can be reconstructed afterwards */
  for (i = 1; i < len1; i++) {
    size_t *prev = matrix + (i - 1)*len2;
    size_t *p = matrix + i*len2;
    size_t *end = p + len2 - 1;
    const lev_wchar char1 = string1[i - 1];
    const lev_wchar *char2p = string2;
    size_t x = i;
    p++;
    while (p <= end) {
      /* x = min(deletion, substitution, insertion) cost for this cell */
      size_t c3 = *(prev++) + (char1 != *(char2p++));
      x++;
      if (x > c3)
        x = c3;
      c3 = *prev + 1;
      if (x > c3)
        x = c3;
      *(p++) = x;
    }
  }

  /* find the way back */
  return ueditops_from_cost_matrix(len1, string1, len1o,
                                   len2, string2, len2o,
                                   matrix, n);
}
+
+/**
+ * lev_opcodes_to_editops:
+ * @nb: The length of @bops.
+ * @bops: An array of difflib block edit operation codes.
+ * @n: Where the number of edit operations should be stored.
+ * @keepkeep: If nonzero, keep operations will be included. Otherwise the
+ * result will be normalized, i.e. without any keep operations.
+ *
+ * Converts difflib block operation codes to elementary edit operations.
+ *
+ * Returns: The converted edit operatiosn, as a newly allocated array; its
+ * size is stored in @n.
+ **/
+_LEV_STATIC_PY LevEditOp*
+lev_opcodes_to_editops(size_t nb, const LevOpCode *bops,
+ size_t *n, int keepkeep)
+{
+ size_t i;
+ const LevOpCode *b;
+ LevEditOp *ops, *o;
+
+ /* compute the number of atomic operations */
+ *n = 0;
+ if (!nb)
+ return NULL;
+
+ b = bops;
+ if (keepkeep) {
+ for (i = nb; i; i--, b++) {
+ size_t sd = b->send - b->sbeg;
+ size_t dd = b->dend - b->dbeg;
+ *n += (sd > dd ? sd : dd);
+ }
+ }
+ else {
+ for (i = nb; i; i--, b++) {
+ size_t sd = b->send - b->sbeg;
+ size_t dd = b->dend - b->dbeg;
+ *n += (b->type != LEV_EDIT_KEEP ? (sd > dd ? sd : dd) : 0);
+ }
+ }
+
+ /* convert */
+ o = ops = (LevEditOp*)malloc((*n)*sizeof(LevEditOp));
+ if (!ops) {
+ *n = (size_t)(-1);
+ return NULL;
+ }
+ b = bops;
+ for (i = nb; i; i--, b++) {
+ size_t j;
+
+ switch (b->type) {
+ case LEV_EDIT_KEEP:
+ if (keepkeep) {
+ for (j = 0; j < b->send - b->sbeg; j++, o++) {
+ o->type = LEV_EDIT_KEEP;
+ o->spos = b->sbeg + j;
+ o->dpos = b->dbeg + j;
+ }
+ }
+ break;
+
+ case LEV_EDIT_REPLACE:
+ for (j = 0; j < b->send - b->sbeg; j++, o++) {
+ o->type = LEV_EDIT_REPLACE;
+ o->spos = b->sbeg + j;
+ o->dpos = b->dbeg + j;
+ }
+ break;
+
+ case LEV_EDIT_DELETE:
+ for (j = 0; j < b->send - b->sbeg; j++, o++) {
+ o->type = LEV_EDIT_DELETE;
+ o->spos = b->sbeg + j;
+ o->dpos = b->dbeg;
+ }
+ break;
+
+ case LEV_EDIT_INSERT:
+ for (j = 0; j < b->dend - b->dbeg; j++, o++) {
+ o->type = LEV_EDIT_INSERT;
+ o->spos = b->sbeg;
+ o->dpos = b->dbeg + j;
+ }
+ break;
+
+ default:
+ break;
+ }
+ }
+ assert((size_t)(o - ops) == *n);
+
+ return ops;
+}
+
/**
 * lev_editops_to_opcodes:
 * @n: The size of @ops.
 * @ops: An array of elementary edit operations.
 * @nb: Where the number of difflib block operation codes should be stored.
 * @len1: The length of the source string.
 * @len2: The length of the destination string.
 *
 * Converts elementary edit operations to difflib block operation codes.
 *
 * Note the string lengths are necessary since difflib doesn't allow omitting
 * keep operations.
 *
 * Returns: The converted block operation codes, as a newly allocated array;
 *          its length is stored in @nb.  On allocation failure NULL is
 *          returned and @nb is set to (size_t)-1.
 **/
_LEV_STATIC_PY LevOpCode*
lev_editops_to_opcodes(size_t n, const LevEditOp *ops, size_t *nb,
                       size_t len1, size_t len2)
{
  size_t nbl, i, spos, dpos;
  const LevEditOp *o;
  LevOpCode *bops, *b;
  LevEditType type;

  /* first pass: only count the number of blocks the conversion produces */
  nbl = 0;
  o = ops;
  spos = dpos = 0;
  type = LEV_EDIT_KEEP;
  for (i = n; i; ) {
    /* simply pretend there are no keep blocks */
    while (o->type == LEV_EDIT_KEEP && --i)
      o++;
    if (!i)
      break;
    /* a gap in positions implies a keep block before this operation */
    if (spos < o->spos || dpos < o->dpos) {
      nbl++;
      spos = o->spos;
      dpos = o->dpos;
    }
    nbl++;
    type = o->type;
    /* consume the whole run of identical consecutive operations as one
     * block */
    switch (type) {
    case LEV_EDIT_REPLACE:
      do {
        spos++;
        dpos++;
        i--;
        o++;
      } while (i && o->type == type && spos == o->spos && dpos == o->dpos);
      break;

    case LEV_EDIT_DELETE:
      do {
        spos++;
        i--;
        o++;
      } while (i && o->type == type && spos == o->spos && dpos == o->dpos);
      break;

    case LEV_EDIT_INSERT:
      do {
        dpos++;
        i--;
        o++;
      } while (i && o->type == type && spos == o->spos && dpos == o->dpos);
      break;

    default:
      break;
    }
  }
  /* a trailing keep block may be needed to reach the string ends */
  if (spos < len1 || dpos < len2)
    nbl++;

  /* second pass: the same walk again, now filling the blocks in */
  b = bops = (LevOpCode*)malloc(nbl*sizeof(LevOpCode));
  if (!bops) {
    *nb = (size_t)(-1);
    return NULL;
  }
  o = ops;
  spos = dpos = 0;
  type = LEV_EDIT_KEEP;
  for (i = n; i; ) {
    /* simply pretend there are no keep blocks */
    while (o->type == LEV_EDIT_KEEP && --i)
      o++;
    if (!i)
      break;
    b->sbeg = spos;
    b->dbeg = dpos;
    /* emit the implicit keep block covering the position gap */
    if (spos < o->spos || dpos < o->dpos) {
      b->type = LEV_EDIT_KEEP;
      spos = b->send = o->spos;
      dpos = b->dend = o->dpos;
      b++;
      b->sbeg = spos;
      b->dbeg = dpos;
    }
    type = o->type;
    switch (type) {
    case LEV_EDIT_REPLACE:
      do {
        spos++;
        dpos++;
        i--;
        o++;
      } while (i && o->type == type && spos == o->spos && dpos == o->dpos);
      break;

    case LEV_EDIT_DELETE:
      do {
        spos++;
        i--;
        o++;
      } while (i && o->type == type && spos == o->spos && dpos == o->dpos);
      break;

    case LEV_EDIT_INSERT:
      do {
        dpos++;
        i--;
        o++;
      } while (i && o->type == type && spos == o->spos && dpos == o->dpos);
      break;

    default:
      break;
    }
    b->type = type;
    b->send = spos;
    b->dend = dpos;
    b++;
  }
  /* emit the trailing keep block, if any */
  if (spos < len1 || dpos < len2) {
    assert(len1 - spos == len2 - dpos);
    b->type = LEV_EDIT_KEEP;
    b->sbeg = spos;
    b->dbeg = dpos;
    b->send = len1;
    b->dend = len2;
    b++;
  }
  assert((size_t)(b - bops) == nbl);

  *nb = nbl;
  return bops;
}
+
+/**
+ * lev_opcodes_apply:
+ * @len1: The length of the source string.
+ * @string1: A string of length @len1, may contain NUL characters.
+ * @len2: The length of the destination string.
+ * @string2: A string of length @len2, may contain NUL characters.
+ * @nb: The length of @bops.
+ * @bops: An array of difflib block edit operation codes.
+ * @len: Where the size of the resulting string should be stored.
+ *
+ * Applies a sequence of difflib block operations to a string.
+ *
+ * NB: @bops is not checked for applicability.
+ *
+ * Returns: The result of the edit as a newly allocated string, its length
+ * is stored in @len.
+ **/
+_LEV_STATIC_PY lev_byte*
+lev_opcodes_apply(size_t len1, const lev_byte *string1,
+ size_t len2, const lev_byte *string2,
+ size_t nb, const LevOpCode *bops,
+ size_t *len)
+{
+ lev_byte *dst, *dpos; /* destination string */
+ const lev_byte *spos; /* source string position */
+ size_t i;
+
+ /* this looks too complex for such a simple task, but note ops is not
+ * a complete edit sequence, we have to be able to apply anything anywhere */
+ dpos = dst = (lev_byte*)malloc((len1 + len2)*sizeof(lev_byte));
+ if (!dst) {
+ *len = (size_t)(-1);
+ return NULL;
+ }
+ spos = string1;
+ for (i = nb; i; i--, bops++) {
+ switch (bops->type) {
+ case LEV_EDIT_INSERT:
+ case LEV_EDIT_REPLACE:
+ memcpy(dpos, string2 + bops->dbeg,
+ (bops->dend - bops->dbeg)*sizeof(lev_byte));
+ break;
+
+ case LEV_EDIT_KEEP:
+ memcpy(dpos, string1 + bops->sbeg,
+ (bops->send - bops->sbeg)*sizeof(lev_byte));
+ break;
+
+ default:
+ break;
+ }
+ spos += bops->send - bops->sbeg;
+ dpos += bops->dend - bops->dbeg;
+ }
+
+ *len = dpos - dst;
+ /* possible realloc failure is detected with *len != 0 */
+ return realloc(dst, *len*sizeof(lev_byte));
+}
+
+/**
+ * lev_u_opcodes_apply:
+ * @len1: The length of the source string.
+ * @string1: A string of length @len1, may contain NUL characters.
+ * @len2: The length of the destination string.
+ * @string2: A string of length @len2, may contain NUL characters.
+ * @nb: The length of @bops.
+ * @bops: An array of difflib block edit operation codes.
+ * @len: Where the size of the resulting string should be stored.
+ *
+ * Applies a sequence of difflib block operations to a string.
+ *
+ * NB: @bops is not checked for applicability.
+ *
+ * Returns: The result of the edit as a newly allocated string, its length
+ * is stored in @len.
+ **/
+_LEV_STATIC_PY lev_wchar*
+lev_u_opcodes_apply(size_t len1, const lev_wchar *string1,
+ size_t len2, const lev_wchar *string2,
+ size_t nb, const LevOpCode *bops,
+ size_t *len)
+{
+ lev_wchar *dst, *dpos; /* destination string */
+ const lev_wchar *spos; /* source string position */
+ size_t i;
+
+ /* this looks too complex for such a simple task, but note ops is not
+ * a complete edit sequence, we have to be able to apply anything anywhere */
+ dpos = dst = (lev_wchar*)malloc((len1 + len2)*sizeof(lev_wchar));
+ if (!dst) {
+ *len = (size_t)(-1);
+ return NULL;
+ }
+ spos = string1;
+ for (i = nb; i; i--, bops++) {
+ switch (bops->type) {
+ case LEV_EDIT_INSERT:
+ case LEV_EDIT_REPLACE:
+ memcpy(dpos, string2 + bops->dbeg,
+ (bops->dend - bops->dbeg)*sizeof(lev_wchar));
+ break;
+
+ case LEV_EDIT_KEEP:
+ memcpy(dpos, string1 + bops->sbeg,
+ (bops->send - bops->sbeg)*sizeof(lev_wchar));
+ break;
+
+ default:
+ break;
+ }
+ spos += bops->send - bops->sbeg;
+ dpos += bops->dend - bops->dbeg;
+ }
+
+ *len = dpos - dst;
+ /* possible realloc failure is detected with *len != 0 */
+ return realloc(dst, *len*sizeof(lev_wchar));
+}
+
+/**
+ * lev_opcodes_invert:
+ * @nb: The length of @ops.
+ * @bops: An array of difflib block edit operation codes.
+ *
+ * Inverts the sense of @ops. It is modified in place.
+ *
+ * In other words, @ops becomes a partial edit for the original source
+ * and destination strings with their roles exchanged.
+ **/
+_LEV_STATIC_PY void
+lev_opcodes_invert(size_t nb, LevOpCode *bops)
+{
+ size_t i;
+
+ for (i = nb; i; i--, bops++) {
+ size_t z;
+
+ z = bops->dbeg;
+ bops->dbeg = bops->sbeg;
+ bops->sbeg = z;
+ z = bops->dend;
+ bops->dend = bops->send;
+ bops->send = z;
+ if (bops->type & 2)
+ bops->type ^= 1;
+ }
+}
+
+/**
+ * lev_editops_matching_blocks:
+ * @len1: The length of the source string.
+ * @len2: The length of the destination string.
+ * @n: The size of @ops.
+ * @ops: An array of elementary edit operations.
+ * @nmblocks: Where the number of matching block should be stored.
+ *
+ * Computes the matching block corresponding to an optimal edit @ops.
+ *
+ * Returns: The matching blocks as a newly allocated array, it length is
+ * stored in @nmblocks.
+ **/
+_LEV_STATIC_PY LevMatchingBlock*
+lev_editops_matching_blocks(size_t len1,
+ size_t len2,
+ size_t n,
+ const LevEditOp *ops,
+ size_t *nmblocks)
+{
+ size_t nmb, i, spos, dpos;
+ LevEditType type;
+ const LevEditOp *o;
+ LevMatchingBlock *mblocks, *mb;
+
+ /* compute the number of matching blocks */
+ nmb = 0;
+ o = ops;
+ spos = dpos = 0;
+ type = LEV_EDIT_KEEP;
+ for (i = n; i; ) {
+ /* simply pretend there are no keep blocks */
+ while (o->type == LEV_EDIT_KEEP && --i)
+ o++;
+ if (!i)
+ break;
+ if (spos < o->spos || dpos < o->dpos) {
+ nmb++;
+ spos = o->spos;
+ dpos = o->dpos;
+ }
+ type = o->type;
+ switch (type) {
+ case LEV_EDIT_REPLACE:
+ do {
+ spos++;
+ dpos++;
+ i--;
+ o++;
+ } while (i && o->type == type && spos == o->spos && dpos == o->dpos);
+ break;
+
+ case LEV_EDIT_DELETE:
+ do {
+ spos++;
+ i--;
+ o++;
+ } while (i && o->type == type && spos == o->spos && dpos == o->dpos);
+ break;
+
+ case LEV_EDIT_INSERT:
+ do {
+ dpos++;
+ i--;
+ o++;
+ } while (i && o->type == type && spos == o->spos && dpos == o->dpos);
+ break;
+
+ default:
+ break;
+ }
+ }
+ if (spos < len1 || dpos < len2)
+ nmb++;
+
+ /* fill the info */
+ mb = mblocks = (LevMatchingBlock*)malloc(nmb*sizeof(LevOpCode));
+ if (!mblocks) {
+ *nmblocks = (size_t)(-1);
+ return NULL;
+ }
+ o = ops;
+ spos = dpos = 0;
+ type = LEV_EDIT_KEEP;
+ for (i = n; i; ) {
+ /* simply pretend there are no keep blocks */
+ while (o->type == LEV_EDIT_KEEP && --i)
+ o++;
+ if (!i)
+ break;
+ if (spos < o->spos || dpos < o->dpos) {
+ mb->spos = spos;
+ mb->dpos = dpos;
+ mb->len = o->spos - spos;
+ spos = o->spos;
+ dpos = o->dpos;
+ mb++;
+ }
+ type = o->type;
+ switch (type) {
+ case LEV_EDIT_REPLACE:
+ do {
+ spos++;
+ dpos++;
+ i--;
+ o++;
+ } while (i && o->type == type && spos == o->spos && dpos == o->dpos);
+ break;
+
+ case LEV_EDIT_DELETE:
+ do {
+ spos++;
+ i--;
+ o++;
+ } while (i && o->type == type && spos == o->spos && dpos == o->dpos);
+ break;
+
+ case LEV_EDIT_INSERT:
+ do {
+ dpos++;
+ i--;
+ o++;
+ } while (i && o->type == type && spos == o->spos && dpos == o->dpos);
+ break;
+
+ default:
+ break;
+ }
+ }
+ if (spos < len1 || dpos < len2) {
+ assert(len1 - spos == len2 - dpos);
+ mb->spos = spos;
+ mb->dpos = dpos;
+ mb->len = len1 - spos;
+ mb++;
+ }
+ assert((size_t)(mb - mblocks) == nmb);
+
+ *nmblocks = nmb;
+ return mblocks;
+}
+
+/**
+ * lev_opcodes_matching_blocks:
+ * @len1: The length of the source string.
+ * @len2: The length of the destination string.
+ * @nb: The size of @bops.
+ * @bops: An array of difflib block edit operation codes.
+ * @nmblocks: Where the number of matching block should be stored.
+ *
+ * Computes the matching block corresponding to an optimal edit @bops.
+ *
+ * Returns: The matching blocks as a newly allocated array, it length is
+ * stored in @nmblocks.
+ **/
+_LEV_STATIC_PY LevMatchingBlock*
+lev_opcodes_matching_blocks(size_t len1,
+ __attribute__((unused)) size_t len2,
+ size_t nb,
+ const LevOpCode *bops,
+ size_t *nmblocks)
+{
+ size_t nmb, i;
+ const LevOpCode *b;
+ LevMatchingBlock *mblocks, *mb;
+
+ /* compute the number of matching blocks */
+ nmb = 0;
+ b = bops;
+ for (i = nb; i; i--, b++) {
+ if (b->type == LEV_EDIT_KEEP) {
+ nmb++;
+ /* adjacent KEEP blocks -- we never produce it, but... */
+ while (i && b->type == LEV_EDIT_KEEP) {
+ i--;
+ b++;
+ }
+ if (!i)
+ break;
+ }
+ }
+
+ /* convert */
+ mb = mblocks = (LevMatchingBlock*)malloc(nmb*sizeof(LevOpCode));
+ if (!mblocks) {
+ *nmblocks = (size_t)(-1);
+ return NULL;
+ }
+ b = bops;
+ for (i = nb; i; i--, b++) {
+ if (b->type == LEV_EDIT_KEEP) {
+ mb->spos = b->sbeg;
+ mb->dpos = b->dbeg;
+ /* adjacent KEEP blocks -- we never produce it, but... */
+ while (i && b->type == LEV_EDIT_KEEP) {
+ i--;
+ b++;
+ }
+ if (!i) {
+ mb->len = len1 - mb->spos;
+ mb++;
+ break;
+ }
+ mb->len = b->sbeg - mb->spos;
+ mb++;
+ }
+ }
+ assert((size_t)(mb - mblocks) == nmb);
+
+ *nmblocks = nmb;
+ return mblocks;
+}
+
+/**
+ * lev_editops_total_cost:
+ * @n: Tne size of @ops.
+ * @ops: An array of elementary edit operations.
+ *
+ * Computes the total cost of operations in @ops.
+ *
+ * The costs of elementary operations are all 1.
+ *
+ * Returns: The total cost.
+ **/
+_LEV_STATIC_PY size_t
+lev_editops_total_cost(size_t n,
+ const LevEditOp *ops)
+{
+ size_t i, sum = 0;
+
+ for (i = n; i; i--, ops++)
+ sum += !!ops->type;
+
+ return sum;
+}
+
+/**
+ * lev_editops_normalize:
+ * @n: The size of @ops.
+ * @ops: An array of elementary edit operations.
+ * @nnorm: Where to store then length of the normalized array.
+ *
+ * Normalizes a list of edit operations to contain no keep operations.
+ *
+ * Note it returns %NULL for an empty array.
+ *
+ * Returns: A newly allocated array of normalized edit operations, its length
+ * is stored to @nnorm.
+ **/
+_LEV_STATIC_PY LevEditOp*
+lev_editops_normalize(size_t n,
+ const LevEditOp *ops,
+ size_t *nnorm)
+{
+ size_t nx, i;
+ const LevEditOp *o;
+ LevEditOp *opsnorm, *on;
+
+ if (!n || !ops) {
+ *nnorm = 0;
+ return NULL;
+ }
+
+ nx = 0;
+ o = ops;
+ for (i = n; i; i--, o++)
+ nx += (o->type == LEV_EDIT_KEEP);
+
+ *nnorm = n - nx;
+ if (!*nnorm)
+ return NULL;
+
+ opsnorm = on = (LevEditOp*)malloc((n - nx)*sizeof(LevEditOp));
+ o = ops;
+ for (i = n; i; i--, o++) {
+ if (o->type == LEV_EDIT_KEEP)
+ continue;
+ memcpy(on++, o, sizeof(LevEditOp));
+ }
+
+ return opsnorm;
+}
+
+/**
+ * lev_opcodes_total_cost:
+ * @nb: Tne size of @bops.
+ * @bops: An array of difflib block operation codes.
+ *
+ * Computes the total cost of operations in @bops.
+ *
+ * The costs of elementary operations are all 1.
+ *
+ * Returns: The total cost.
+ **/
+_LEV_STATIC_PY size_t
+lev_opcodes_total_cost(size_t nb,
+ const LevOpCode *bops)
+{
+ size_t i, sum = 0;
+
+ for (i = nb; i; i--, bops++) {
+ switch (bops->type) {
+ case LEV_EDIT_REPLACE:
+ case LEV_EDIT_DELETE:
+ sum += (bops->send - bops->sbeg);
+ break;
+
+ case LEV_EDIT_INSERT:
+ sum += (bops->dend - bops->dbeg);
+ break;
+
+ default:
+ break;
+ }
+ }
+
+ return sum;
+}
+
+/**
+ * lev_editops_subtract:
+ * @n: The size of @ops.
+ * @ops: An array of elementary edit operations.
+ * @ns: The size of @sub.
+ * @sub: A subsequence (ordered subset) of @ops
+ * @nrem: Where to store then length of the remainder array.
+ *
+ * Subtracts a subsequence of elementary edit operations from a sequence.
+ *
+ * The remainder is a sequence that, applied to result of application of @sub,
+ * gives the same final result as application of @ops to original string.
+ *
+ * Returns: A newly allocated array of normalized edit operations, its length
+ * is stored to @nrem. It is always normalized, i.e, without any
+ * keep operations. On failure, %NULL is returned.
+ **/
+_LEV_STATIC_PY LevEditOp*
+lev_editops_subtract(size_t n,
+ const LevEditOp *ops,
+ size_t ns,
+ const LevEditOp *sub,
+ size_t *nrem)
+{
+ static const int shifts[] = { 0, 0, 1, -1 };
+ LevEditOp *rem;
+ size_t i, j, nr, nn;
+ int shift;
+
+ /* compute remainder size */
+ *nrem = -1;
+ nr = nn = 0;
+ for (i = 0; i < n; i++) {
+ if (ops[i].type != LEV_EDIT_KEEP)
+ nr++;
+ }
+ for (i = 0; i < ns; i++) {
+ if (sub[i].type != LEV_EDIT_KEEP)
+ nn++;
+ }
+ if (nn > nr)
+ return NULL;
+ nr -= nn;
+
+
+ /* subtract */
+ /* we could simply return NULL when nr == 0, but then it would be possible
+ * to subtract *any* sequence of the right length to get an empty sequence
+ * -- clrealy incorrectly; so we have to scan the list to check */
+ rem = nr ? (LevEditOp*)malloc(nr*sizeof(LevEditOp)) : NULL;
+ j = nn = shift = 0;
+ for (i = 0; i < ns; i++) {
+ while ((ops[j].spos != sub[i].spos
+ || ops[j].dpos != sub[i].dpos
+ || ops[j].type != sub[i].type)
+ && j < n) {
+ if (ops[j].type != LEV_EDIT_KEEP) {
+ rem[nn] = ops[j];
+ rem[nn].spos += shift;
+ nn++;
+ }
+ j++;
+ }
+ if (j == n) {
+ free(rem);
+ return NULL;
+ }
+
+ shift += shifts[sub[i].type];
+ j++;
+ }
+
+ while (j < n) {
+ if (ops[j].type != LEV_EDIT_KEEP) {
+ rem[nn] = ops[j];
+ rem[nn].spos += shift;
+ nn++;
+ }
+ j++;
+ }
+ assert(nn == nr);
+
+ *nrem = nr;
+ return rem;
+}
+/* }}} */
+
+/****************************************************************************
+ *
+ * Weighted mean strings
+ *
+ ****************************************************************************/
+
+/*
+_LEV_STATIC_PY lev_byte*
+lev_weighted_average(size_t len1,
+ const lev_byte* string1,
+ size_t len2,
+ const lev_byte* string2,
+ size_t n,
+ const LevEditOp *ops,
+ LevAveragingType avtype,
+ size_t total_cost,
+ double alpha,
+ size_t *len)
+{
+ return NULL;
+}
+*/
+
diff --git a/src/nimlevenshtein/_levenshtein.h b/src/nimlevenshtein/_levenshtein.h
new file mode 100644
index 0000000..0f9e161
--- /dev/null
+++ b/src/nimlevenshtein/_levenshtein.h
@@ -0,0 +1,394 @@
+/* @(#) $Id: Levenshtein.h,v 1.22 2005/01/13 20:02:56 yeti Exp $ */
+#ifndef LEVENSHTEIN_H
+#define LEVENSHTEIN_H
+
+#ifndef size_t
+# include <stdlib.h>
+#endif
+
+/* A bit dirty. */
+#ifndef _LEV_STATIC_PY
+# define _LEV_STATIC_PY /* */
+#endif
+
+/* In C, this is just wchar_t and unsigned char, in Python, lev_wchar can
+ * be anything. If you really want to cheat, define wchar_t to any integer
+ * type you like before including Levenshtein.h and recompile it. */
+#ifndef lev_wchar
+# ifndef wchar_t
+# include <wchar.h>
+# endif
+# define lev_wchar wchar_t
+#endif
+typedef unsigned char lev_byte;
+
+/* Edit operation type
+ * DON'T CHANGE! used as array indices and the bits are occasionally used
+ * as flags */
+typedef enum {
+ LEV_EDIT_KEEP = 0,
+ LEV_EDIT_REPLACE = 1,
+ LEV_EDIT_INSERT = 2,
+ LEV_EDIT_DELETE = 3,
+ LEV_EDIT_LAST /* sometimes returned when an error occurs */
+} LevEditType;
+
+/* Error codes returned by editop check functions */
+typedef enum {
+ LEV_EDIT_ERR_OK = 0,
+ LEV_EDIT_ERR_TYPE, /* nonexistent edit type */
+ LEV_EDIT_ERR_OUT, /* edit out of string bounds */
+ LEV_EDIT_ERR_ORDER, /* ops are not ordered */
+ LEV_EDIT_ERR_BLOCK, /* inconsistent block boundaries (block ops) */
+ LEV_EDIT_ERR_SPAN, /* sequence is not a full transformation (block ops) */
+ LEV_EDIT_ERR_LAST
+} LevEditOpError;
+
+/* string averaging method (UNUSED yet) */
+typedef enum {
+ LEV_AVG_HEAD = 0, /* take operations from the head */
+ LEV_AVG_TAIL, /* take operations from the tail */
+ LEV_AVG_SPREAD, /* take an equidistantly distributed subset */
+ LEV_AVG_BLOCK, /* take a random continuous block */
+ LEV_AVG_RANDOM, /* take a random subset */
+ LEV_AVG_LAST
+} LevAveragingType;
+
+/* Edit operation (atomic).
+ * This is the `native' atomic edit operation. It differs from the difflib
+ * ones because it represents a change of one character, not a block. And
+ * we usually don't care about LEV_EDIT_KEEP, though the functions can handle
+ * them. The positions are interpreted as at the left edge of a character.
+ */
+typedef struct {
+ LevEditType type; /* editing operation type */
+ size_t spos; /* source block position */
+ size_t dpos; /* destination position */
+} LevEditOp;
+
+/* Edit operation (difflib-compatible).
+ * This is not `native', but conversion functions exist. These fields exactly
+ * correspond to the get_opcodes() tuples fields (and this method is also the
+ * source of the silly OpCode name). Sequences must span over complete
+ * strings, subsequences are simply edit sequences with more (or larger)
+ * LEV_EDIT_KEEP blocks.
+ */
+typedef struct {
+ LevEditType type; /* editing operation type */
+ size_t sbeg, send; /* source block begin, end */
+ size_t dbeg, dend; /* destination block begin, end */
+} LevOpCode;
+
+/* Matching block (difflib-compatible). */
+typedef struct {
+ size_t spos;
+ size_t dpos;
+ size_t len;
+} LevMatchingBlock;
+
+_LEV_STATIC_PY
+size_t
+lev_edit_distance(size_t len1,
+ const lev_byte *string1,
+ size_t len2,
+ const lev_byte *string2,
+ int xcost);
+
+_LEV_STATIC_PY
+size_t
+lev_u_edit_distance(size_t len1,
+ const lev_wchar *string1,
+ size_t len2,
+ const lev_wchar *string2,
+ int xcost);
+
+_LEV_STATIC_PY
+size_t
+lev_hamming_distance(size_t len,
+ const lev_byte *string1,
+ const lev_byte *string2);
+
+_LEV_STATIC_PY
+size_t
+lev_u_hamming_distance(size_t len,
+ const lev_wchar *string1,
+ const lev_wchar *string2);
+
+_LEV_STATIC_PY
+double
+lev_jaro_ratio(size_t len1,
+ const lev_byte *string1,
+ size_t len2,
+ const lev_byte *string2);
+
+_LEV_STATIC_PY
+double
+lev_u_jaro_ratio(size_t len1,
+ const lev_wchar *string1,
+ size_t len2,
+ const lev_wchar *string2);
+
+_LEV_STATIC_PY
+double
+lev_jaro_winkler_ratio(size_t len1, const lev_byte *string1,
+ size_t len2, const lev_byte *string2,
+ double pfweight);
+
+_LEV_STATIC_PY
+double
+lev_u_jaro_winkler_ratio(size_t len1, const lev_wchar *string1,
+ size_t len2, const lev_wchar *string2,
+ double pfweight);
+
+_LEV_STATIC_PY
+lev_byte*
+lev_greedy_median(size_t n,
+ const size_t *lengths,
+ const lev_byte *strings[],
+ const double *weights,
+ size_t *medlength);
+
+_LEV_STATIC_PY
+lev_wchar*
+lev_u_greedy_median(size_t n,
+ const size_t *lengths,
+ const lev_wchar *strings[],
+ const double *weights,
+ size_t *medlength);
+
+_LEV_STATIC_PY
+lev_byte*
+lev_median_improve(size_t len, const lev_byte *s,
+ size_t n, const size_t *lengths,
+ const lev_byte *strings[],
+ const double *weights,
+ size_t *medlength);
+
+_LEV_STATIC_PY
+lev_wchar*
+lev_u_median_improve(size_t len, const lev_wchar *s,
+ size_t n, const size_t *lengths,
+ const lev_wchar *strings[],
+ const double *weights,
+ size_t *medlength);
+
+_LEV_STATIC_PY
+lev_byte*
+lev_quick_median(size_t n,
+ const size_t *lengths,
+ const lev_byte *strings[],
+ const double *weights,
+ size_t *medlength);
+
+_LEV_STATIC_PY
+lev_wchar*
+lev_u_quick_median(size_t n,
+ const size_t *lengths,
+ const lev_wchar *strings[],
+ const double *weights,
+ size_t *medlength);
+
+_LEV_STATIC_PY
+lev_byte*
+lev_set_median(size_t n,
+ const size_t *lengths,
+ const lev_byte *strings[],
+ const double *weights,
+ size_t *medlength);
+
+_LEV_STATIC_PY
+size_t
+lev_set_median_index(size_t n, const size_t *lengths,
+ const lev_byte *strings[],
+ const double *weights);
+
+_LEV_STATIC_PY
+lev_wchar*
+lev_u_set_median(size_t n,
+ const size_t *lengths,
+ const lev_wchar *strings[],
+ const double *weights,
+ size_t *medlength);
+
+_LEV_STATIC_PY
+size_t
+lev_u_set_median_index(size_t n, const size_t *lengths,
+ const lev_wchar *strings[],
+ const double *weights);
+
+_LEV_STATIC_PY
+double
+lev_edit_seq_distance(size_t n1,
+ const size_t *lengths1,
+ const lev_byte *strings1[],
+ size_t n2,
+ const size_t *lengths2,
+ const lev_byte *strings2[]);
+
+_LEV_STATIC_PY
+double
+lev_u_edit_seq_distance(size_t n1,
+ const size_t *lengths1,
+ const lev_wchar *strings1[],
+ size_t n2,
+ const size_t *lengths2,
+ const lev_wchar *strings2[]);
+
+_LEV_STATIC_PY
+double
+lev_set_distance(size_t n1,
+ const size_t *lengths1,
+ const lev_byte *strings1[],
+ size_t n2,
+ const size_t *lengths2,
+ const lev_byte *strings2[]);
+
+_LEV_STATIC_PY
+double
+lev_u_set_distance(size_t n1,
+ const size_t *lengths1,
+ const lev_wchar *strings1[],
+ size_t n2,
+ const size_t *lengths2,
+ const lev_wchar *strings2[]);
+
+_LEV_STATIC_PY
+int
+lev_editops_check_errors(size_t len1,
+ size_t len2,
+ size_t n,
+ const LevEditOp *ops);
+
+_LEV_STATIC_PY
+int
+lev_opcodes_check_errors(size_t len1,
+ size_t len2,
+ size_t nb,
+ const LevOpCode *bops);
+
+_LEV_STATIC_PY
+void
+lev_editops_invert(size_t n,
+ LevEditOp *ops);
+
+_LEV_STATIC_PY
+void
+lev_opcodes_invert(size_t nb,
+ LevOpCode *bops);
+
+_LEV_STATIC_PY
+LevMatchingBlock*
+lev_editops_matching_blocks(size_t len1,
+ size_t len2,
+ size_t n,
+ const LevEditOp *ops,
+ size_t *nmblocks);
+
+_LEV_STATIC_PY
+LevMatchingBlock*
+lev_opcodes_matching_blocks(size_t len1,
+ size_t len2,
+ size_t nb,
+ const LevOpCode *bops,
+ size_t *nmblocks);
+
+_LEV_STATIC_PY
+lev_byte*
+lev_editops_apply(size_t len1,
+ const lev_byte* string1,
+ size_t len2,
+ const lev_byte* string2,
+ size_t n,
+ const LevEditOp *ops,
+ size_t *len);
+
+_LEV_STATIC_PY
+lev_wchar*
+lev_u_editops_apply(size_t len1,
+ const lev_wchar* string1,
+ size_t len2,
+ const lev_wchar* string2,
+ size_t n,
+ const LevEditOp *ops,
+ size_t *len);
+
+_LEV_STATIC_PY
+lev_byte*
+lev_opcodes_apply(size_t len1,
+ const lev_byte* string1,
+ size_t len2,
+ const lev_byte* string2,
+ size_t nb,
+ const LevOpCode *bops,
+ size_t *len);
+
+_LEV_STATIC_PY
+lev_wchar*
+lev_u_opcodes_apply(size_t len1,
+ const lev_wchar* string1,
+ size_t len2,
+ const lev_wchar* string2,
+ size_t nb,
+ const LevOpCode *bops,
+ size_t *len);
+
+_LEV_STATIC_PY
+LevEditOp*
+lev_editops_find(size_t len1,
+ const lev_byte *string1,
+ size_t len2,
+ const lev_byte *string2,
+ size_t *n);
+
+_LEV_STATIC_PY
+LevEditOp*
+lev_u_editops_find(size_t len1,
+ const lev_wchar *string1,
+ size_t len2,
+ const lev_wchar *string2,
+ size_t *n);
+
+_LEV_STATIC_PY
+LevEditOp*
+lev_opcodes_to_editops(size_t nb,
+ const LevOpCode *bops,
+ size_t *n,
+ int keepkeep);
+
+_LEV_STATIC_PY
+LevOpCode*
+lev_editops_to_opcodes(size_t n,
+ const LevEditOp *ops,
+ size_t *nb,
+ size_t len1,
+ size_t len2);
+
+_LEV_STATIC_PY
+size_t
+lev_editops_total_cost(size_t n,
+ const LevEditOp *ops);
+
+_LEV_STATIC_PY
+size_t
+lev_opcodes_total_cost(size_t nb,
+ const LevOpCode *bops);
+
+_LEV_STATIC_PY
+LevEditOp*
+lev_editops_normalize(size_t n,
+ const LevEditOp *ops,
+ size_t *nnorm);
+
+_LEV_STATIC_PY LevEditOp*
+lev_editops_subtract(size_t n,
+ const LevEditOp *ops,
+ size_t ns,
+ const LevEditOp *sub,
+ size_t *nrem);
+
+/* UNUSED yet */
+_LEV_STATIC_PY
+void
+lev_init_rng(unsigned long int seed);
+
+#endif /* not LEVENSHTEIN_H */
diff --git a/src/nimlevenshtein/binding.nim b/src/nimlevenshtein/binding.nim
new file mode 100644
index 0000000..b9706d6
--- /dev/null
+++ b/src/nimlevenshtein/binding.nim
@@ -0,0 +1,150 @@
+## Low-level bindings to python-Levenshtein C API.
+
+{.passC:"-DNO_PYTHON -Dlev_wchar=int".}
+{.compile:"_levenshtein.c".}
+
+when defined(linux):
+ {.passL:"-lm".}
+
+type
+ lev_byte* = cuchar
+
+ EditType* {.pure.} = enum
+ ## Edit operation type
+ Keep = 0 ## Keep, sometimes called *equal*
+ Replace = 1 ## Substitution
+ Insert = 2 ## Insertion
+ Delete = 3 ## Deletion
+ Last ## sometimes returned when an error occurs
+
+ EditOpError* {.pure.} = enum
+ ## Error codes returned by editop check functions
+ Ok = 0, ## no error
+ Type, ## nonexistent edit type
+ Out, ## edit out of string bounds
+ Order, ## ops are not ordered
+ Block, ## inconsistent block boundaries (block ops)
+ Span, ## sequence is not a full transformation (block ops)
+ Last
+
+ EditOp* {.bycopy.} = object
+ ## Edit operation (atomic).
+ ##
+ ## This is the `native` atomic edit operation. It differs from the difflib ones because it
+ ## represents a change of one character, not a block. And we usually don't care about
+ ## `EditType.Keep`, though the functions can handle them. The positions are interpreted as at
+ ## the left edge of a character.
+ `type`*: EditType ## editing operation type
+ spos*: csize ## source block position
+ dpos*: csize ## destination position
+
+ OpCode* {.bycopy.} = object
+ ## Edit operation (difflib-compatible).
+ ##
+ ## This is not `native`, but conversion functions exist. These fields exactly correspond to
+ ## the get_opcodes() tuples fields (and this method is also the source of the silly OpCode name).
+ ## Sequences must span over complete strings, subsequences are simply edit sequences with more
+ ## (or larger) `EditType.Keep` blocks.
+ `type`*: EditType ## editing operation type
+ sbeg*: csize ## source block begin
+ send*: csize ## source block end
+ dbeg*: csize ## destination block begin
+ dend*: csize ## destination block end
+
+ MatchingBlock* {.bycopy.} = object
+ ## Matching block (difflib-compatible).
+ spos*: csize ## source block position
+ dpos*: csize ## destination block position
+ len*: csize ## block length
+
+proc lev_edit_distance*(len1: csize; string1: ptr lev_byte; len2: csize;
+ string2: ptr lev_byte; xcost: cint): csize {.importc.}
+proc lev_u_edit_distance*(len1: csize; string1: ptr int32; len2: csize;
+ string2: ptr int32; xcost: cint): csize {.importc.}
+proc lev_hamming_distance*(len: csize; string1: ptr lev_byte; string2: ptr lev_byte): csize {.
+ importc.}
+proc lev_u_hamming_distance*(len: csize; string1: ptr int32; string2: ptr int32): csize {.
+ importc.}
+proc lev_jaro_ratio*(len1: csize; string1: ptr lev_byte; len2: csize;
+ string2: ptr lev_byte): cdouble {.importc.}
+proc lev_u_jaro_ratio*(len1: csize; string1: ptr int32; len2: csize; string2: ptr int32): cdouble {.
+ importc.}
+proc lev_jaro_winkler_ratio*(len1: csize; string1: ptr lev_byte; len2: csize;
+ string2: ptr lev_byte; pfweight: cdouble): cdouble {.importc.}
+proc lev_u_jaro_winkler_ratio*(len1: csize; string1: ptr int32; len2: csize;
+ string2: ptr int32; pfweight: cdouble): cdouble {.importc.}
+proc lev_greedy_median*(n: csize; lengths: ptr csize; strings: ptr ptr lev_byte;
+ weights: ptr cdouble; medlength: ptr csize): ptr lev_byte {.importc.}
+proc lev_u_greedy_median*(n: csize; lengths: ptr csize; strings: ptr ptr int32;
+ weights: ptr cdouble; medlength: ptr csize): ptr int32 {.importc.}
+proc lev_median_improve*(len: csize; s: ptr lev_byte; n: csize; lengths: ptr csize;
+ strings: ptr ptr lev_byte; weights: ptr cdouble;
+ medlength: ptr csize): ptr lev_byte {.importc.}
+proc lev_u_median_improve*(len: csize; s: ptr int32; n: csize; lengths: ptr csize;
+ strings: ptr ptr int32; weights: ptr cdouble;
+ medlength: ptr csize): ptr int32 {.importc.}
+proc lev_quick_median*(n: csize; lengths: ptr csize; strings: ptr ptr lev_byte;
+ weights: ptr cdouble; medlength: ptr csize): ptr lev_byte {.importc.}
+proc lev_u_quick_median*(n: csize; lengths: ptr csize; strings: ptr ptr int32;
+ weights: ptr cdouble; medlength: ptr csize): ptr int32 {.importc.}
+proc lev_set_median*(n: csize; lengths: ptr csize; strings: ptr ptr lev_byte;
+ weights: ptr cdouble; medlength: ptr csize): ptr lev_byte {.importc.}
+proc lev_set_median_index*(n: csize; lengths: ptr csize; strings: ptr ptr lev_byte;
+ weights: ptr cdouble): csize {.importc.}
+proc lev_u_set_median*(n: csize; lengths: ptr csize; strings: ptr ptr int32;
+ weights: ptr cdouble; medlength: ptr csize): ptr int32 {.importc.}
+proc lev_u_set_median_index*(n: csize; lengths: ptr csize; strings: ptr ptr int32;
+ weights: ptr cdouble): csize {.importc.}
+proc lev_edit_seq_distance*(n1: csize; lengths1: ptr csize;
+ strings1: ptr ptr lev_byte; n2: csize; lengths2: ptr csize;
+ strings2: ptr ptr lev_byte): cdouble {.importc.}
+proc lev_u_edit_seq_distance*(n1: csize; lengths1: ptr csize; strings1: ptr ptr int32;
+ n2: csize; lengths2: ptr csize; strings2: ptr ptr int32): cdouble {.
+ importc.}
+proc lev_set_distance*(n1: csize; lengths1: ptr csize; strings1: ptr ptr lev_byte;
+ n2: csize; lengths2: ptr csize; strings2: ptr ptr lev_byte): cdouble {.
+ importc.}
+proc lev_u_set_distance*(n1: csize; lengths1: ptr csize; strings1: ptr ptr int32;
+ n2: csize; lengths2: ptr csize; strings2: ptr ptr int32): cdouble {.
+ importc.}
+proc lev_editops_check_errors*(len1: csize; len2: csize; n: csize; ops: ptr EditOp): cint {.
+ importc.}
+proc lev_opcodes_check_errors*(len1: csize; len2: csize; nb: csize; bops: ptr OpCode): cint {.
+ importc.}
+proc lev_editops_invert*(n: csize; ops: ptr EditOp) {.importc.}
+proc lev_opcodes_invert*(nb: csize; bops: ptr OpCode) {.importc.}
+proc lev_editops_matching_blocks*(len1: csize; len2: csize; n: csize;
+ ops: ptr EditOp; nmblocks: ptr csize): ptr MatchingBlock {.
+ importc.}
+proc lev_opcodes_matching_blocks*(len1: csize; len2: csize; nb: csize;
+ bops: ptr OpCode; nmblocks: ptr csize): ptr MatchingBlock {.
+ importc.}
+proc lev_editops_apply*(len1: csize; string1: ptr lev_byte; len2: csize;
+ string2: ptr lev_byte; n: csize; ops: ptr EditOp;
+ len: ptr csize): ptr lev_byte {.importc.}
+proc lev_u_editops_apply*(len1: csize; string1: ptr int32; len2: csize;
+ string2: ptr int32; n: csize; ops: ptr EditOp;
+ len: ptr csize): ptr int32 {.importc.}
+proc lev_opcodes_apply*(len1: csize; string1: ptr lev_byte; len2: csize;
+ string2: ptr lev_byte; nb: csize; bops: ptr OpCode;
+ len: ptr csize): ptr lev_byte {.importc.}
+proc lev_u_opcodes_apply*(len1: csize; string1: ptr int32; len2: csize;
+ string2: ptr int32; nb: csize; bops: ptr OpCode;
+ len: ptr csize): ptr int32 {.importc.}
+proc lev_editops_find*(len1: csize; string1: ptr lev_byte; len2: csize;
+ string2: ptr lev_byte; n: ptr csize): ptr EditOp {.importc.}
+proc lev_u_editops_find*(len1: csize; string1: ptr int32; len2: csize;
+ string2: ptr int32; n: ptr csize): ptr EditOp {.importc.}
+proc lev_opcodes_to_editops*(nb: csize; bops: ptr OpCode; n: ptr csize; keepkeep: cint): ptr EditOp {.
+ importc.}
+proc lev_editops_to_opcodes*(n: csize; ops: ptr EditOp; nb: ptr csize; len1: csize;
+ len2: csize): ptr OpCode {.importc.}
+proc lev_editops_total_cost*(n: csize; ops: ptr EditOp): csize {.importc.}
+proc lev_opcodes_total_cost*(nb: csize; bops: ptr OpCode): csize {.importc.}
+proc lev_editops_normalize*(n: csize; ops: ptr EditOp; nnorm: ptr csize): ptr EditOp {.
+ importc.}
+proc lev_editops_subtract*(n: csize; ops: ptr EditOp; ns: csize; sub: ptr EditOp;
+ nrem: ptr csize): ptr EditOp {.importc.}
+
+
+proc lev_init_rng*(seed: culong) {.importc.}
diff --git a/src/nimlevenshtein/wrapper.nim b/src/nimlevenshtein/wrapper.nim
new file mode 100644
index 0000000..5067cf1
--- /dev/null
+++ b/src/nimlevenshtein/wrapper.nim
@@ -0,0 +1,286 @@
+## A thin wrapper around the python-Levenshtein C API.
+##
+## Using this wrapper you can use Nim strings and sequences instead of passing cstrings and pointers
+## to arrays.
+
+import binding
+
+export EditOp
+export OpCode
+export EditType
+export MatchingBlock
+
+converter toLevByte(s: string): ptr lev_byte = cast[ptr lev_byte](s.cstring)
+
+converter toLevByteArray(a: cstringArray): ptr ptr lev_byte = cast[ptr ptr lev_byte](a)
+
+proc cFree(p: pointer) {.importc:"free".}
+
+proc distance*(a, b: string, xcost: int): int =
+ ## Computes Levenshtein edit distance of two strings.
+ ##
+ ## If nonzero, the replace operation has weight 2, otherwise all edit operations have equal
+ ## weights of 1.
+ lev_edit_distance(len(a), a, len(b), b, xcost.cint)
+
+proc hammingDistance*(a, b: string): int =
+ ## Computes Hamming distance of two strings.
+ ##
+ ## The strings must have the same length.
+ assert len(a) == len(b)
+ lev_hamming_distance(len(a), a, b)
+
+proc jaroRatio*(a, b: string): float =
+ ## Computes Jaro string similarity metric of two strings.
+ lev_jaro_ratio(len(a), a, len(b), b)
+
+proc jaroWinklerRatio*(a, b: string, pfweigth: float): float =
+ ## Computes Jaro-Winkler string similarity metric of two strings.
+ ##
+ ## The formula is J+@pfweight*P*(1-J), where J is Jaro metric and P is the length of common
+ ## prefix.
+ lev_jaro_winkler_ratio(len(a), a, len(b), b, pfweigth)
+
+template medianCommon(f: typed, strings: seq[string], weights: seq[float]) =
+ assert len(strings) == len(weights)
+
+ var lengths = newSeqUninitialized[csize](strings.len)
+ for i, s in strings:
+ lengths[i] = len(s)
+
+ var cstrings = allocCStringArray(strings)
+ defer: deallocCStringArray(cstrings)
+
+ var nm: csize
+
+ let r = f(strings.len, lengths[0].addr, cstrings, weights[0].unsafeAddr, nm.addr)
+ defer: cFree(r)
+
+ if r == nil and nm > 0:
+ raise newException(OutOfMemError, "out of memory")
+
+ result = newString(nm)
+ copyMem(result[0].addr, r, nm)
+
+proc greedyMedian*(strings: seq[string], weights: seq[float]): string =
+ ## Finds a generalized median string of @strings using the greedy algorithm.
+ ##
+ ## Note it's considerably more efficient to give a string with weight 2 than to store two
+ ## identical strings in `strings` (with weights 1).
+ medianCommon(lev_greedy_median, strings, weights)
+
+proc medianImprove*(s: string, strings: seq[string], weights: seq[float]): string =
+ ## Tries to make `s` a better generalized median string of `strings` with small perturbations.
+ ##
+ ## It never returns a string with larger SOD than `s`; in the worst case, a string identical to
+ ## `s` is returned.
+ assert len(strings) == len(weights)
+
+ var lengths = newSeqUninitialized[csize](strings.len)
+ for i, s in strings:
+ lengths[i] = len(s)
+
+ var cstrings = allocCStringArray(strings)
+ defer: deallocCStringArray(cstrings)
+
+ var nm: csize
+
+ let r = lev_median_improve(len(s), s, strings.len, lengths[0].addr, cstrings, weights[0].unsafeAddr, nm.addr)
+ defer: cFree(r)
+
+ if r == nil and nm > 0:
+ raise newException(OutOfMemError, "lev_median_improve")
+
+ result = newString(nm)
+ copyMem(result[0].addr, r, nm)
+
+proc quickMedian*(strings: seq[string], weights: seq[float]): string =
+ medianCommon(lev_quick_median, strings, weights)
+
+proc setMedian*(strings: seq[string], weights: seq[float]): string =
+ ## Finds the median string of a string set `strings`.
+ medianCommon(lev_set_median, strings, weights)
+
+template setSeqCommon(f: typed, a: seq[string], b: seq[string]): float =
+ var ca = allocCStringArray(a)
+ defer: deallocCStringArray(ca)
+
+ var la = newSeqUninitialized[csize](len(a))
+ for i, s in a:
+ la[i] = len(s)
+
+ var cb = allocCStringArray(b)
+ defer: deallocCStringArray(cb)
+
+ var lb = newSeqUninitialized[csize](len(b))
+ for i, s in b:
+ lb[i] = len(s)
+
+ f(len(a), la[0].addr, ca, len(b), lb[0].addr, cb)
+
+proc editSeqDistance*(a: seq[string], b: seq[string]): float =
+ ## Finds the distance between string sequences `a` and `b`.
+ ##
+ ## In other words, this is a double-Levenshtein algorithm.
+ ##
+ ## The cost of string replace operation is based on string similarity: it's zero for identical
+ ## strings and 2 for completely unsimilar strings.
+ setSeqCommon(lev_edit_seq_distance, a, b)
+
+proc setDistance*(a: seq[string], b: seq[string]): float =
+ ## Finds the distance between string sets `a` and `b`.
+ ##
+ ## The difference from `editSeqDistance()` is that order doesn't matter. The optimal association
+ ## of `a` and `b` is found first and the similarity is computed for that.
+ ##
+ ## Uses the sequential Munkres-Blackman algorithm.
+ setSeqCommon(lev_set_distance, a, b)
+
+proc checkErrors*(len1: int, len2: int, ops: seq[EditOp]) =
+ ## Checks whether `ops` is consistent and applicable as a partial edit from a string of length
+ ## `len1` to a string of length `len2`.
+ ##
+ ## Raises an exception if there are errors.
+ if lev_editops_check_errors(len1, len2, len(ops), ops[0].unsafeAddr) != 0:
+ raise newException(Exception, "edit operations are invalid or inapplicable")
+
+proc checkErrors*(len1: int, len2: int, bops: seq[OpCode]) =
+ ## Checks whether `bops` is consistent and applicable as an edit from a string of length `len1`
+ ## to a string of length `len2`.
+ ##
+ ## Raises an exception if there are errors.
+ if lev_opcodes_check_errors(len1, len2, len(bops), bops[0].unsafeAddr) != 0:
+ raise newException(Exception, "edit operations are invalid or inapplicable")
+
+proc invert*(ops: var seq[EditOp]) =
+ ## Inverts the sense of `ops`. It is modified in place.
+ ##
+ ## In other words, `ops` becomes a valid partial edit for the original source and destination
+ ## strings with their roles exchanged.
+ lev_editops_invert(len(ops), ops[0].addr)
+
+proc invert*(ops: var seq[OpCode]) =
+ ## Inverts the sense of `ops`. It is modified in place.
+ ##
+ ## In other words, `ops` becomes a partial edit for the original source and destination strings
+ ## with their roles exchanged.
+ lev_opcodes_invert(len(ops), ops[0].addr)
+
+proc matchingBlocks*(len1: int, len2: int, ops: seq[EditOp]): seq[MatchingBlock] =
+ ## Computes the matching block corresponding to an optimal edit `ops`.
+ var nmblocks: csize
+ let mblocks = lev_editops_matching_blocks(len1, len2, len(ops), ops[0].unsafeAddr, nmblocks.addr)
+ defer: cFree(mblocks)
+ if mblocks == nil and nmblocks > 0:
+ raise newException(OutOfMemError, "lev_editops_matching_blocks")
+ result = newSeq[MatchingBlock](nmblocks)
+ copyMem(result[0].addr, mblocks, sizeof(MatchingBlock)*nmblocks)
+
+proc matchingBlocks*(len1: int, len2: int, ops: seq[OpCode]): seq[MatchingBlock] =
+ ## Computes the matching block corresponding to an optimal edit `ops`.
+ var nmblocks: csize
+ let mblocks = lev_opcodes_matching_blocks(len1, len2, len(ops), ops[0].unsafeAddr, nmblocks.addr)
+ defer: cFree(mblocks)
+ if mblocks == nil and nmblocks > 0:
+ raise newException(OutOfMemError, "lev_opcodes_matching_blocks")
+ result = newSeq[MatchingBlock](nmblocks)
+ copyMem(result[0].addr, mblocks, sizeof(MatchingBlock)*nmblocks)
+
+proc apply*(string1: string, string2: string, ops: seq[EditOp]): string =
+ ## Applies a partial edit `ops` from `string1` to `string2`.
+ ##
+ ## NB: `ops` is not checked for applicability.
+ var nr: csize
+ let r = lev_editops_apply(len(string1), string1, len(string2), string2, len(ops), ops[0].unsafeAddr, nr.addr)
+ defer: cFree(r)
+ if r == nil and nr > 0:
+ raise newException(OutOfMemError, "lev_editops_apply")
+ result = newString(nr)
+ copyMem(result[0].addr, r, nr)
+
+proc apply*(string1: string, string2: string, ops: seq[OpCode]): string =
+ ## Applies a sequence of difflib block operations to a string.
+ ##
+ ## NB: `ops` is not checked for applicability.
+ var nr: csize
+ let r = lev_opcodes_apply(len(string1), string1, len(string2), string2, len(ops), ops[0].unsafeAddr, nr.addr)
+ defer: cFree(r)
+ if r == nil and nr > 0:
+ raise newException(OutOfMemError, "lev_opcodes_apply")
+ result = newString(nr)
+ copyMem(result[0].addr, r, nr)
+
+proc editops*(string1: string, string2: string): seq[EditOp] =
+ ## Find an optimal edit sequence from `string1` to `string2`.
+ ##
+ ## When there's more than one optimal sequence, one is arbitrarily (though deterministically)
+ ## chosen.
+ ##
+ ## The return value is normalized, i.e., keep operations are not included.
+ var n: csize
+ let ops = lev_editops_find(len(string1), string1, len(string2), string2, n.addr)
+ defer: cFree(ops)
+ if ops == nil and n > 0:
+ raise newException(OutOfMemError, "lev_editops_find")
+ result = newSeq[EditOp](n)
+ copyMem(result[0].addr, ops, sizeof(EditOp)*n)
+
+proc toEditOps*(ops: seq[OpCode], keepkeep: bool): seq[EditOp] =
+ ## Converts difflib block operation codes to elementary edit operations.
+ ##
+ ## If `keepkeep` is true, keep operations will be included. Otherwise the result will be
+ ## normalized, i.e. without any keep operations.
+ var n: csize
+ let r = lev_opcodes_to_editops(len(ops), ops[0].unsafeAddr, n.addr, keepkeep.cint)
+ defer: cFree(r)
+ if r == nil and n > 0:
+ raise newException(OutOfMemError, "lev_opcodes_to_editops")
+ result = newSeq[EditOp](n)
+ copyMem(result[0].addr, r, sizeof(EditOp)*n)
+
+proc toOpCodes*(ops: seq[EditOp], len1: int, len2: int): seq[OpCode] =
+ ## Converts elementary edit operations to difflib block operation codes.
+ ##
+ ## Note the string lengths are necessary since difflib doesn't allow omitting keep operations.
+ var n: csize
+ let r = lev_editops_to_opcodes(len(ops), ops[0].unsafeAddr, n.addr, len1, len2)
+ defer: cFree(r)
+ if r == nil and n > 0:
+ raise newException(OutOfMemError, "lev_editops_to_opcodes")
+ result = newSeq[OpCode](n)
+ copyMem(result[0].addr, r, sizeof(OpCode)*n)
+
+proc totalCost*(ops: seq[EditOp]): int =
+ ## Computes the total cost of operations in `ops`.
+ ##
+ ## The costs of elementary operations are all 1.
+ lev_editops_total_cost(len(ops), ops[0].unsafeAddr)
+
+proc totalCost*(ops: seq[OpCode]): int =
+ ## Computes the total cost of operations in `ops`.
+ ##
+ ## The costs of elementary operations are all 1.
+ lev_opcodes_total_cost(len(ops), ops[0].unsafeAddr)
+
+proc normalize*(ops: seq[EditOp]): seq[EditOp] =
+ ## Normalizes a list of edit operations to contain no keep operations.
+ var n: csize
+ let r = lev_editops_normalize(len(ops), ops[0].unsafeAddr, n.addr)
+ defer: cFree(r)
+ if r == nil and n > 0:
+ raise newException(OutOfMemError, "lev_editops_normalize")
+ result = newSeq[EditOp](n)
+ copyMem(result[0].addr, r, sizeof(EditOp)*n)
+
+proc subtract*(ops: seq[EditOp], sub: seq[EditOp]): seq[EditOp] =
+ ## Subtracts a subsequence of elementary edit operations from a sequence.
+ ##
+ ## The remainder is a sequence that, applied to result of application of `sub`, gives the same
+ ## final result as application of `ops` to original string.
+ var n: csize
+ let r = lev_editops_subtract(len(ops), ops[0].unsafeAddr, len(sub), sub[0].unsafeAddr, n.addr)
+ defer: cFree(r)
+ if r == nil and n == cast[csize](-1):
+ raise newException(Exception, "lev_editops_subtract failed")
+ result = newSeq[EditOp](n)
+ copyMem(result[0].addr, r, sizeof(EditOp)*n)