aboutsummaryrefslogtreecommitdiff
path: root/src/nimlevenshtein/_levenshtein.h
diff options
context:
space:
mode:
Diffstat (limited to 'src/nimlevenshtein/_levenshtein.h')
-rw-r--r--src/nimlevenshtein/_levenshtein.h394
1 files changed, 394 insertions, 0 deletions
diff --git a/src/nimlevenshtein/_levenshtein.h b/src/nimlevenshtein/_levenshtein.h
new file mode 100644
index 0000000..0f9e161
--- /dev/null
+++ b/src/nimlevenshtein/_levenshtein.h
@@ -0,0 +1,394 @@
+/* @(#) $Id: Levenshtein.h,v 1.22 2005/01/13 20:02:56 yeti Exp $ */
+#ifndef LEVENSHTEIN_H
+#define LEVENSHTEIN_H
+
+#ifndef size_t
+# include <stdlib.h>
+#endif
+
+/* A bit dirty. */
+#ifndef _LEV_STATIC_PY
+# define _LEV_STATIC_PY /* */
+#endif
+
+/* In C, this is just wchar_t and unsigned char, in Python, lev_wchar can
+ * be anything. If you really want to cheat, define wchar_t to any integer
+ * type you like before including Levenshtein.h and recompile it. */
+#ifndef lev_wchar
+# ifndef wchar_t
+# include <wchar.h>
+# endif
+# define lev_wchar wchar_t
+#endif
+typedef unsigned char lev_byte;
+
+/* Edit opration type
+ * DON'T CHANGE! used ad arrays indices and the bits are occasionally used
+ * as flags */
+typedef enum {
+ LEV_EDIT_KEEP = 0,
+ LEV_EDIT_REPLACE = 1,
+ LEV_EDIT_INSERT = 2,
+ LEV_EDIT_DELETE = 3,
+ LEV_EDIT_LAST /* sometimes returned when an error occurs */
+} LevEditType;
+
+/* Error codes returned by editop check functions */
+typedef enum {
+ LEV_EDIT_ERR_OK = 0,
+ LEV_EDIT_ERR_TYPE, /* nonexistent edit type */
+ LEV_EDIT_ERR_OUT, /* edit out of string bounds */
+ LEV_EDIT_ERR_ORDER, /* ops are not ordered */
+ LEV_EDIT_ERR_BLOCK, /* incosistent block boundaries (block ops) */
+ LEV_EDIT_ERR_SPAN, /* sequence is not a full transformation (block ops) */
+ LEV_EDIT_ERR_LAST
+} LevEditOpError;
+
+/* string averaging method (UNUSED yet) */
+typedef enum {
+ LEV_AVG_HEAD = 0, /* take operations from the head */
+ LEV_AVG_TAIL, /* take operations from the tail */
+ LEV_AVG_SPREAD, /* take a equidistantly distributed subset */
+ LEV_AVG_BLOCK, /* take a random continuous block */
+ LEV_AVG_RANDOM, /* take a random subset */
+ LEV_AVG_LAST
+} LevAveragingType;
+
+/* Edit operation (atomic).
+ * This is the `native' atomic edit operation. It differs from the difflib
+ * one's because it represents a change of one character, not a block. And
+ * we usually don't care about LEV_EDIT_KEEP, though the functions can handle
+ * them. The positions are interpreted as at the left edge of a character.
+ */
+typedef struct {
+ LevEditType type; /* editing operation type */
+ size_t spos; /* source block position */
+ size_t dpos; /* destination position */
+} LevEditOp;
+
+/* Edit operation (difflib-compatible).
+ * This is not `native', but conversion functions exist. These fields exactly
+ * correspond to the codeops() tuples fields (and this method is also the
+ * source of the silly OpCode name). Sequences must span over complete
+ * strings, subsequences are simply edit sequences with more (or larger)
+ * LEV_EDIT_KEEP blocks.
+ */
+typedef struct {
+ LevEditType type; /* editing operation type */
+ size_t sbeg, send; /* source block begin, end */
+ size_t dbeg, dend; /* destination block begin, end */
+} LevOpCode;
+
+/* Matching block (difflib-compatible). */
+typedef struct {
+ size_t spos;
+ size_t dpos;
+ size_t len;
+} LevMatchingBlock;
+
+_LEV_STATIC_PY
+size_t
+lev_edit_distance(size_t len1,
+ const lev_byte *string1,
+ size_t len2,
+ const lev_byte *string2,
+ int xcost);
+
+_LEV_STATIC_PY
+size_t
+lev_u_edit_distance(size_t len1,
+ const lev_wchar *string1,
+ size_t len2,
+ const lev_wchar *string2,
+ int xcost);
+
+_LEV_STATIC_PY
+size_t
+lev_hamming_distance(size_t len,
+ const lev_byte *string1,
+ const lev_byte *string2);
+
+_LEV_STATIC_PY
+size_t
+lev_u_hamming_distance(size_t len,
+ const lev_wchar *string1,
+ const lev_wchar *string2);
+
+_LEV_STATIC_PY
+double
+lev_jaro_ratio(size_t len1,
+ const lev_byte *string1,
+ size_t len2,
+ const lev_byte *string2);
+
+_LEV_STATIC_PY
+double
+lev_u_jaro_ratio(size_t len1,
+ const lev_wchar *string1,
+ size_t len2,
+ const lev_wchar *string2);
+
+_LEV_STATIC_PY
+double
+lev_jaro_winkler_ratio(size_t len1, const lev_byte *string1,
+ size_t len2, const lev_byte *string2,
+ double pfweight);
+
+_LEV_STATIC_PY
+double
+lev_u_jaro_winkler_ratio(size_t len1, const lev_wchar *string1,
+ size_t len2, const lev_wchar *string2,
+ double pfweight);
+
+_LEV_STATIC_PY
+lev_byte*
+lev_greedy_median(size_t n,
+ const size_t *lengths,
+ const lev_byte *strings[],
+ const double *weights,
+ size_t *medlength);
+
+_LEV_STATIC_PY
+lev_wchar*
+lev_u_greedy_median(size_t n,
+ const size_t *lengths,
+ const lev_wchar *strings[],
+ const double *weights,
+ size_t *medlength);
+
+_LEV_STATIC_PY
+lev_byte*
+lev_median_improve(size_t len, const lev_byte *s,
+ size_t n, const size_t *lengths,
+ const lev_byte *strings[],
+ const double *weights,
+ size_t *medlength);
+
+_LEV_STATIC_PY
+lev_wchar*
+lev_u_median_improve(size_t len, const lev_wchar *s,
+ size_t n, const size_t *lengths,
+ const lev_wchar *strings[],
+ const double *weights,
+ size_t *medlength);
+
+_LEV_STATIC_PY
+lev_byte*
+lev_quick_median(size_t n,
+ const size_t *lengths,
+ const lev_byte *strings[],
+ const double *weights,
+ size_t *medlength);
+
+_LEV_STATIC_PY
+lev_wchar*
+lev_u_quick_median(size_t n,
+ const size_t *lengths,
+ const lev_wchar *strings[],
+ const double *weights,
+ size_t *medlength);
+
+_LEV_STATIC_PY
+lev_byte*
+lev_set_median(size_t n,
+ const size_t *lengths,
+ const lev_byte *strings[],
+ const double *weights,
+ size_t *medlength);
+
+_LEV_STATIC_PY
+size_t
+lev_set_median_index(size_t n, const size_t *lengths,
+ const lev_byte *strings[],
+ const double *weights);
+
+_LEV_STATIC_PY
+lev_wchar*
+lev_u_set_median(size_t n,
+ const size_t *lengths,
+ const lev_wchar *strings[],
+ const double *weights,
+ size_t *medlength);
+
+_LEV_STATIC_PY
+size_t
+lev_u_set_median_index(size_t n, const size_t *lengths,
+ const lev_wchar *strings[],
+ const double *weights);
+
+_LEV_STATIC_PY
+double
+lev_edit_seq_distance(size_t n1,
+ const size_t *lengths1,
+ const lev_byte *strings1[],
+ size_t n2,
+ const size_t *lengths2,
+ const lev_byte *strings2[]);
+
+_LEV_STATIC_PY
+double
+lev_u_edit_seq_distance(size_t n1,
+ const size_t *lengths1,
+ const lev_wchar *strings1[],
+ size_t n2,
+ const size_t *lengths2,
+ const lev_wchar *strings2[]);
+
+_LEV_STATIC_PY
+double
+lev_set_distance(size_t n1,
+ const size_t *lengths1,
+ const lev_byte *strings1[],
+ size_t n2,
+ const size_t *lengths2,
+ const lev_byte *strings2[]);
+
+_LEV_STATIC_PY
+double
+lev_u_set_distance(size_t n1,
+ const size_t *lengths1,
+ const lev_wchar *strings1[],
+ size_t n2,
+ const size_t *lengths2,
+ const lev_wchar *strings2[]);
+
+_LEV_STATIC_PY
+int
+lev_editops_check_errors(size_t len1,
+ size_t len2,
+ size_t n,
+ const LevEditOp *ops);
+
+_LEV_STATIC_PY
+int
+lev_opcodes_check_errors(size_t len1,
+ size_t len2,
+ size_t nb,
+ const LevOpCode *bops);
+
+_LEV_STATIC_PY
+void
+lev_editops_invert(size_t n,
+ LevEditOp *ops);
+
+_LEV_STATIC_PY
+void
+lev_opcodes_invert(size_t nb,
+ LevOpCode *bops);
+
+_LEV_STATIC_PY
+LevMatchingBlock*
+lev_editops_matching_blocks(size_t len1,
+ size_t len2,
+ size_t n,
+ const LevEditOp *ops,
+ size_t *nmblocks);
+
+_LEV_STATIC_PY
+LevMatchingBlock*
+lev_opcodes_matching_blocks(size_t len1,
+ size_t len2,
+ size_t nb,
+ const LevOpCode *bops,
+ size_t *nmblocks);
+
+_LEV_STATIC_PY
+lev_byte*
+lev_editops_apply(size_t len1,
+ const lev_byte* string1,
+ size_t len2,
+ const lev_byte* string2,
+ size_t n,
+ const LevEditOp *ops,
+ size_t *len);
+
+_LEV_STATIC_PY
+lev_wchar*
+lev_u_editops_apply(size_t len1,
+ const lev_wchar* string1,
+ size_t len2,
+ const lev_wchar* string2,
+ size_t n,
+ const LevEditOp *ops,
+ size_t *len);
+
+_LEV_STATIC_PY
+lev_byte*
+lev_opcodes_apply(size_t len1,
+ const lev_byte* string1,
+ size_t len2,
+ const lev_byte* string2,
+ size_t nb,
+ const LevOpCode *bops,
+ size_t *len);
+
+_LEV_STATIC_PY
+lev_wchar*
+lev_u_opcodes_apply(size_t len1,
+ const lev_wchar* string1,
+ size_t len2,
+ const lev_wchar* string2,
+ size_t nb,
+ const LevOpCode *bops,
+ size_t *len);
+
+_LEV_STATIC_PY
+LevEditOp*
+lev_editops_find(size_t len1,
+ const lev_byte *string1,
+ size_t len2,
+ const lev_byte *string2,
+ size_t *n);
+
+_LEV_STATIC_PY
+LevEditOp*
+lev_u_editops_find(size_t len1,
+ const lev_wchar *string1,
+ size_t len2,
+ const lev_wchar *string2,
+ size_t *n);
+
+_LEV_STATIC_PY
+LevEditOp*
+lev_opcodes_to_editops(size_t nb,
+ const LevOpCode *bops,
+ size_t *n,
+ int keepkeep);
+
+_LEV_STATIC_PY
+LevOpCode*
+lev_editops_to_opcodes(size_t n,
+ const LevEditOp *ops,
+ size_t *nb,
+ size_t len1,
+ size_t len2);
+
+_LEV_STATIC_PY
+size_t
+lev_editops_total_cost(size_t n,
+ const LevEditOp *ops);
+
+_LEV_STATIC_PY
+size_t
+lev_opcodes_total_cost(size_t nb,
+ const LevOpCode *bops);
+
+_LEV_STATIC_PY
+LevEditOp*
+lev_editops_normalize(size_t n,
+ const LevEditOp *ops,
+ size_t *nnorm);
+
+_LEV_STATIC_PY LevEditOp*
+lev_editops_subtract(size_t n,
+ const LevEditOp *ops,
+ size_t ns,
+ const LevEditOp *sub,
+ size_t *nrem);
+
+/* UNUSED yet */
+_LEV_STATIC_PY
+void
+lev_init_rng(unsigned long int seed);
+
+#endif /* not LEVENSHTEIN_H */