### Levenshtein distance metric

While reading about BK-trees this morning (a topic I may write about at some point in the future), articles kept mentioning a distance metric on strings called the Levenshtein (Левенште́йн) metric. Because I've never written an implementation of this algorithm before (and some of the links to Python source seem to be broken), I've written my own.
#! /usr/bin/env python3.6
# -*- coding: utf-8 -*-

""" An implementation of the Levenshtein distance """

from typing import List

class Levenshtein:
""" Compute the Levenshtein distance between two strings """
def __init__(self, s1: str, s2: str) -> None:
self.s1 = s1
self.s2 = s2
self._l1 = len(s1)
self._l2 = len(s2)
self._mat = self._constructMat(self._l1, self._l2)

@staticmethod
def _constructMat(dim1: int, dim2: int) -> List[List[int]]:
""" Get a list of lists of zeros for updating; requires O((n + 1) * (m + 1)) space """
mat = [ * dim2] * dim1

for i in range(dim1 + 1):
mat[i] = i

for j in range(dim2 + 1):
mat[j] = j

return mat

def dist(self) -> int:
""" Get the distance between the two strings """

if min(self._l1, self._l2) == 0:
return max(self._l1, self._l2)

# Examine each character, comparing to the other string on self._mat
for i, ch1 in enumerate(self.s1, start=1):
for j, ch2 in enumerate(self.s2, start=1):
cost = 1 if ch1 != ch2 else 0

self._mat[i][j] = min(
self._mat[i - 1][j] + 1,
self._mat[i][j - 1] + 1,
self._mat[i - 1][j - 1] + cost,
)

distance = self._mat[self._l1][self._l2]

return distance

def __enter__(self) -> int:
return self.dist()

def __exit__(self, *args) -> None:
pass

The Levenshtein metric yields the minimum number of substitutions or changes we would have to make to one string to turn it into the other. So, for example, if we have the two strings 'Hello' and 'Hello, world!', the result will be 8, because we have to append ', world!', which is 8 characters long.

Using a repo of English words, I also wrote the following unit tests.
from typing import Optional
from unittest import TestCase, main
from functools import wraps
from types import FunctionType as Function
from tqdm import tqdm

def expectTrue(inst: TestCase) -> Function:
def _test(f: Function) -> Function:
""" Decorator that cleans up tests """
@wraps(f)
def test_decorated(*args, **kwargs) -> None:
res = f(*args, **kwargs)
inst.assertTrue(res if res is not None else True)
return _test
return expectTrue

class TestLevenshtein(TestCase):

def setUp(self) -> None:
# Two simple examples
self._word1 = "Brandon"
self._word2 = "Brendan"

# We'll consider this our space of words on which to determine whether we
# have a metric space

def test_distances(self) -> None:
""" Test the basics """
with Levenshtein(self._word1, self._word2) as dist:
# One can easily tell in this case that the minimum number of insertions
# or substitutions to transform "Brandon" into "Brendan" is 2
self.assertEqual(dist, 2)

def test_metric_space_axioms(self) -> None:
""" Test metric space axioms against the Levenshtein metric """

@expectTrue(self)
def positivity(w1: str, w2: str) -> bool:
with Levenshtein(w1, w2) as dist:
return dist >= 0

@expectTrue(self)
def nondegeneracy(w1: str, w2: str) -> Optional[bool]:
with Levenshtein(w1, w2) as dist:
if dist == 0:
# d(w1, w2) ==> w1 == w2
return w1 == w2
else:
return

@expectTrue(self)
def symmetry(w1: str, w2: str) -> bool:
with Levenshtein(w1, w2) as dist1, Levenshtein(w2, w1) as dist2:
return dist1 == dist2

print(f'** Size of space is {len(self.wordlist)}, and number of iterations is {len(self.wordlist) ** 2}')

for w1 in tqdm(self.wordlist):
for w2 in self.wordlist:
positivity(w1, w2)
nondegeneracy(w1, w2)
symmetry(w1, w2)

if __name__ == '__main__':
main()

I kept getting IndexErrors at first, but then I realized that I had mixed up dim1 and dim2 (so the array had the opposite dimensions), and hence wouldn't work when self._l1 != self._l2). Also, I wrote the decorator expectTrue after readinga blog post about decorator enhancements on melp.nl. The only trouble I'm having is not having to pass a reference to the instance to the decorator, but I'm working on that.

If the space of words is really large (e.g. the whole file), it will unfortunately be impossible to check all the combinations in a realistic amount of time.