diff options
Diffstat (limited to 'src/nimlevenshtein/_levenshtein.h')
| -rw-r--r-- | src/nimlevenshtein/_levenshtein.h | 394 |
1 files changed, 394 insertions, 0 deletions
diff --git a/src/nimlevenshtein/_levenshtein.h b/src/nimlevenshtein/_levenshtein.h new file mode 100644 index 0000000..0f9e161 --- /dev/null +++ b/src/nimlevenshtein/_levenshtein.h @@ -0,0 +1,394 @@ +/* @(#) $Id: Levenshtein.h,v 1.22 2005/01/13 20:02:56 yeti Exp $ */ +#ifndef LEVENSHTEIN_H +#define LEVENSHTEIN_H + +#ifndef size_t +# include <stdlib.h> +#endif + +/* A bit dirty. */ +#ifndef _LEV_STATIC_PY +# define _LEV_STATIC_PY /* */ +#endif + +/* In C, this is just wchar_t and unsigned char, in Python, lev_wchar can + * be anything. If you really want to cheat, define wchar_t to any integer + * type you like before including Levenshtein.h and recompile it. */ +#ifndef lev_wchar +# ifndef wchar_t +# include <wchar.h> +# endif +# define lev_wchar wchar_t +#endif +typedef unsigned char lev_byte; + +/* Edit opration type + * DON'T CHANGE! used ad arrays indices and the bits are occasionally used + * as flags */ +typedef enum { + LEV_EDIT_KEEP = 0, + LEV_EDIT_REPLACE = 1, + LEV_EDIT_INSERT = 2, + LEV_EDIT_DELETE = 3, + LEV_EDIT_LAST /* sometimes returned when an error occurs */ +} LevEditType; + +/* Error codes returned by editop check functions */ +typedef enum { + LEV_EDIT_ERR_OK = 0, + LEV_EDIT_ERR_TYPE, /* nonexistent edit type */ + LEV_EDIT_ERR_OUT, /* edit out of string bounds */ + LEV_EDIT_ERR_ORDER, /* ops are not ordered */ + LEV_EDIT_ERR_BLOCK, /* incosistent block boundaries (block ops) */ + LEV_EDIT_ERR_SPAN, /* sequence is not a full transformation (block ops) */ + LEV_EDIT_ERR_LAST +} LevEditOpError; + +/* string averaging method (UNUSED yet) */ +typedef enum { + LEV_AVG_HEAD = 0, /* take operations from the head */ + LEV_AVG_TAIL, /* take operations from the tail */ + LEV_AVG_SPREAD, /* take a equidistantly distributed subset */ + LEV_AVG_BLOCK, /* take a random continuous block */ + LEV_AVG_RANDOM, /* take a random subset */ + LEV_AVG_LAST +} LevAveragingType; + +/* Edit operation (atomic). + * This is the `native' atomic edit operation. It differs from the difflib + * one's because it represents a change of one character, not a block. And + * we usually don't care about LEV_EDIT_KEEP, though the functions can handle + * them. The positions are interpreted as at the left edge of a character. + */ +typedef struct { + LevEditType type; /* editing operation type */ + size_t spos; /* source block position */ + size_t dpos; /* destination position */ +} LevEditOp; + +/* Edit operation (difflib-compatible). + * This is not `native', but conversion functions exist. These fields exactly + * correspond to the codeops() tuples fields (and this method is also the + * source of the silly OpCode name). Sequences must span over complete + * strings, subsequences are simply edit sequences with more (or larger) + * LEV_EDIT_KEEP blocks. + */ +typedef struct { + LevEditType type; /* editing operation type */ + size_t sbeg, send; /* source block begin, end */ + size_t dbeg, dend; /* destination block begin, end */ +} LevOpCode; + +/* Matching block (difflib-compatible). */ +typedef struct { + size_t spos; + size_t dpos; + size_t len; +} LevMatchingBlock; + +_LEV_STATIC_PY +size_t +lev_edit_distance(size_t len1, + const lev_byte *string1, + size_t len2, + const lev_byte *string2, + int xcost); + +_LEV_STATIC_PY +size_t +lev_u_edit_distance(size_t len1, + const lev_wchar *string1, + size_t len2, + const lev_wchar *string2, + int xcost); + +_LEV_STATIC_PY +size_t +lev_hamming_distance(size_t len, + const lev_byte *string1, + const lev_byte *string2); + +_LEV_STATIC_PY +size_t +lev_u_hamming_distance(size_t len, + const lev_wchar *string1, + const lev_wchar *string2); + +_LEV_STATIC_PY +double +lev_jaro_ratio(size_t len1, + const lev_byte *string1, + size_t len2, + const lev_byte *string2); + +_LEV_STATIC_PY +double +lev_u_jaro_ratio(size_t len1, + const lev_wchar *string1, + size_t len2, + const lev_wchar *string2); + +_LEV_STATIC_PY +double +lev_jaro_winkler_ratio(size_t len1, const lev_byte *string1, + size_t len2, const lev_byte *string2, + double pfweight); + +_LEV_STATIC_PY +double +lev_u_jaro_winkler_ratio(size_t len1, const lev_wchar *string1, + size_t len2, const lev_wchar *string2, + double pfweight); + +_LEV_STATIC_PY +lev_byte* +lev_greedy_median(size_t n, + const size_t *lengths, + const lev_byte *strings[], + const double *weights, + size_t *medlength); + +_LEV_STATIC_PY +lev_wchar* +lev_u_greedy_median(size_t n, + const size_t *lengths, + const lev_wchar *strings[], + const double *weights, + size_t *medlength); + +_LEV_STATIC_PY +lev_byte* +lev_median_improve(size_t len, const lev_byte *s, + size_t n, const size_t *lengths, + const lev_byte *strings[], + const double *weights, + size_t *medlength); + +_LEV_STATIC_PY +lev_wchar* +lev_u_median_improve(size_t len, const lev_wchar *s, + size_t n, const size_t *lengths, + const lev_wchar *strings[], + const double *weights, + size_t *medlength); + +_LEV_STATIC_PY +lev_byte* +lev_quick_median(size_t n, + const size_t *lengths, + const lev_byte *strings[], + const double *weights, + size_t *medlength); + +_LEV_STATIC_PY +lev_wchar* +lev_u_quick_median(size_t n, + const size_t *lengths, + const lev_wchar *strings[], + const double *weights, + size_t *medlength); + +_LEV_STATIC_PY +lev_byte* +lev_set_median(size_t n, + const size_t *lengths, + const lev_byte *strings[], + const double *weights, + size_t *medlength); + +_LEV_STATIC_PY +size_t +lev_set_median_index(size_t n, const size_t *lengths, + const lev_byte *strings[], + const double *weights); + +_LEV_STATIC_PY +lev_wchar* +lev_u_set_median(size_t n, + const size_t *lengths, + const lev_wchar *strings[], + const double *weights, + size_t *medlength); + +_LEV_STATIC_PY +size_t +lev_u_set_median_index(size_t n, const size_t *lengths, + const lev_wchar *strings[], + const double *weights); + +_LEV_STATIC_PY +double +lev_edit_seq_distance(size_t n1, + const size_t *lengths1, + const lev_byte *strings1[], + size_t n2, + const size_t *lengths2, + const lev_byte *strings2[]); + +_LEV_STATIC_PY +double +lev_u_edit_seq_distance(size_t n1, + const size_t *lengths1, + const lev_wchar *strings1[], + size_t n2, + const size_t *lengths2, + const lev_wchar *strings2[]); + +_LEV_STATIC_PY +double +lev_set_distance(size_t n1, + const size_t *lengths1, + const lev_byte *strings1[], + size_t n2, + const size_t *lengths2, + const lev_byte *strings2[]); + +_LEV_STATIC_PY +double +lev_u_set_distance(size_t n1, + const size_t *lengths1, + const lev_wchar *strings1[], + size_t n2, + const size_t *lengths2, + const lev_wchar *strings2[]); + +_LEV_STATIC_PY +int +lev_editops_check_errors(size_t len1, + size_t len2, + size_t n, + const LevEditOp *ops); + +_LEV_STATIC_PY +int +lev_opcodes_check_errors(size_t len1, + size_t len2, + size_t nb, + const LevOpCode *bops); + +_LEV_STATIC_PY +void +lev_editops_invert(size_t n, + LevEditOp *ops); + +_LEV_STATIC_PY +void +lev_opcodes_invert(size_t nb, + LevOpCode *bops); + +_LEV_STATIC_PY +LevMatchingBlock* +lev_editops_matching_blocks(size_t len1, + size_t len2, + size_t n, + const LevEditOp *ops, + size_t *nmblocks); + +_LEV_STATIC_PY +LevMatchingBlock* +lev_opcodes_matching_blocks(size_t len1, + size_t len2, + size_t nb, + const LevOpCode *bops, + size_t *nmblocks); + +_LEV_STATIC_PY +lev_byte* +lev_editops_apply(size_t len1, + const lev_byte* string1, + size_t len2, + const lev_byte* string2, + size_t n, + const LevEditOp *ops, + size_t *len); + +_LEV_STATIC_PY +lev_wchar* +lev_u_editops_apply(size_t len1, + const lev_wchar* string1, + size_t len2, + const lev_wchar* string2, + size_t n, + const LevEditOp *ops, + size_t *len); + +_LEV_STATIC_PY +lev_byte* +lev_opcodes_apply(size_t len1, + const lev_byte* string1, + size_t len2, + const lev_byte* string2, + size_t nb, + const LevOpCode *bops, + size_t *len); + +_LEV_STATIC_PY +lev_wchar* +lev_u_opcodes_apply(size_t len1, + const lev_wchar* string1, + size_t len2, + const lev_wchar* string2, + size_t nb, + const LevOpCode *bops, + size_t *len); + +_LEV_STATIC_PY +LevEditOp* +lev_editops_find(size_t len1, + const lev_byte *string1, + size_t len2, + const lev_byte *string2, + size_t *n); + +_LEV_STATIC_PY +LevEditOp* +lev_u_editops_find(size_t len1, + const lev_wchar *string1, + size_t len2, + const lev_wchar *string2, + size_t *n); + +_LEV_STATIC_PY +LevEditOp* +lev_opcodes_to_editops(size_t nb, + const LevOpCode *bops, + size_t *n, + int keepkeep); + +_LEV_STATIC_PY +LevOpCode* +lev_editops_to_opcodes(size_t n, + const LevEditOp *ops, + size_t *nb, + size_t len1, + size_t len2); + +_LEV_STATIC_PY +size_t +lev_editops_total_cost(size_t n, + const LevEditOp *ops); + +_LEV_STATIC_PY +size_t +lev_opcodes_total_cost(size_t nb, + const LevOpCode *bops); + +_LEV_STATIC_PY +LevEditOp* +lev_editops_normalize(size_t n, + const LevEditOp *ops, + size_t *nnorm); + +_LEV_STATIC_PY LevEditOp* +lev_editops_subtract(size_t n, + const LevEditOp *ops, + size_t ns, + const LevEditOp *sub, + size_t *nrem); + +/* UNUSED yet */ +_LEV_STATIC_PY +void +lev_init_rng(unsigned long int seed); + +#endif /* not LEVENSHTEIN_H */ |
