Sunday, September 2, 2007

Quick Levenshtein Edit Distance in TSQL

 

Quick Levenshtein Edit Distance

 

Levenshtein edit distance is a measure of the similarity between two strings. Edit distance is the the minimum number of character deletions, insertions, substitutions, and transpositions required to transform string1 into string2. In essence, the function is used to perform a fuzzy or approximate string search. This is very handy for trying to find the "correct" string for one that has been entered incorrectly, mistyped, etc. The code has been optimized to find strings that are very similar. A "limit" parameter is provided so the function will quickly reject strings that contain more than k mismatches.
--**************************************
-- Name: Quick Levenshtein Edit Distance
-- Description:Levenshtein edit distance is a measure of the similarity between two strings. Edit distance is the the minimum number of character deletions, insertions, substitutions, and transpositions required to transform string1 into string2. In essence, the function is used to perform a fuzzy or approximate string search. This is very handy for trying to find the "correct" string for one that has been entered incorrectly, mistyped, etc. The code has been optimized to find strings that are very similar. A "limit" parameter is provided so the function will quickly reject strings that contain more than k mismatches.
-- By: Larry Lewis
--
--
-- Inputs:@s varchar(50), @t varchar(50), @limit integer
--
-- Returns:Returns the integer edit distance (minimum number of character deletions, insertions, substitutions, and transpositions) required to transform string1 into string2 where edit distance is <= limit. Otherwise returns (len(s) + len(t)).
--
--Assumes:This code takes character transpositions into account when calculating edit distance (If desired, the transposition code can be commented out). The limit parameter provides a significant performance gain (over 10x faster) over standard implementations when searching for highly similar strings.
--
--Side Effects:None
--This code is copyrighted and has limited warranties.
--Please see http://www.Planet-Source-Code.com/xq/ASP/txtCodeId.584/lngWId.5/qx/vb/scripts/ShowCode.htm
--for details.
--**************************************

CREATE function LEV2( @s varchar(50), @t varchar(50) , @limit integer)
--Returns the Levenshtein Distance with transpositions between
-- strings s1 and s2 using k-limit pruning
--Includes transposed characters and many times faster than
-- the original LEVENSHTEIN function (especially when limit is low)
--Returns EditDistance where EditDistance is <= limit otherwise returns len(@s) + len(@t)
--Based on code by: Michael Gilleland http://www.merriampark.com/ld.htm
--Originally translated to TQL by Joseph Gama
--Author: Larry Lewis
returns integer
as


BEGIN
DECLARE @d varchar(2500), @LD int, @m int, @n int, @i int, @j int, @k int,
@s_i char(1), @t_j char(1), @lendif int, @mindist int, @y int, @z int, @smallLen int, @cost int
SET @LD = LEN(@s) + LEN(@t)
--Remove leftmost matching portion of strings
While Left(@s, 1) = Left(@t, 1) And Len(@s) > 0 And Len(@t) > 0
BEGIN
set @s = Right(@s, Len(@s) - 1)
set @t = Right(@t, Len(@t) - 1)
END
-- Find shorter string
set @n = Len(@s)
set @m = Len(@t)
set @smallLen = @n
If @m < @n
SET @smallLen = @m
--Stop if string lengths differ by more than LIMIT
set @lendif = @n - @m
If Abs(@lendif) > @limit
goto done

If @n = 0
BEGIN
IF @m <= @limit
set @LD = @m
goto done
END
If @m = 0
BEGIN
IF @n <= @limit
SET @LD = @n
GOTO done
END
SET @d=replicate(CHAR(0),2500)
-- Init Matrix d
SET @i=0
WHILE @i<=@n
BEGIN
SET @d=STUFF(@d,@i+1,1,CHAR(@i))--d(i, 0) = i
SET @i=@i+1
END
SET @i=1
WHILE @i<=@m
BEGIN
SET @d=STUFF(@d,@i*(@n+1)+1,1,CHAR(@i))--d(0, j) = j
SET @i=@i+1
END
-- ' Main Loop - Levenshtein --Try to traverse the matrix so that pruning cells are calculated ASAP
SET @k=1
WHILE @k<=@smallLen
BEGIN
SET @s_i=(substring(@s,@k,1))
SET @j=1
WHILE @j<=@k
BEGIN
SET @t_j=(substring(@t,@j,1))
-- Evaluate d(k,j)
set @MinDist = ASCII(substring(@d,(@j-1)*(@n+1)+@k,1)) --d(k - 1, j - 1)
If @s_i <> @t_j set @MinDist = @MinDist + 1

---y = d(k - 1, j) + 1
set @y = ASCII(substring(@d,@j*(@n+1)+@k,1))+1
---z = d(k, j - 1) + 1
set @z = ASCII(substring(@d,(@j-1)*(@n+1)+@k+1,1))+1
If @y < @MinDist set @MinDist = @y
If @z < @MinDist set @MinDist = @z
-- d(k, j) = MinDist
SET @d=STUFF(@d,@j*(@n+1)+@k+1,1,CHAR(@MinDist))

-- Check Transposition
-- Transposition sections can be commented out if transposition is not desired
If @j > 1 AND @k > 1
BEGIN
-- If @MinDist - d(k - 2, j - 2) = 2
If @MinDist - ASCII(substring(@d,(@j-2)*(@n+1)+@k-1,1)) = 2
BEGIN
If substring(@s,@k-1,1) = @t_j
BEGIN
If @s_i = substring(@t,@j-1,1) -- Mid$(t, j - 1, 1)
--d(k, j) = d(k - 2, j - 2) + 1
set @d = STUFF(@d,@j*(@n+1)+@k+1,1,Char( ASCII(substring(@d,(@j-2)*(@n+1)+@k-1,1))+1))
END
END
END

-- Limit Pruning - Stop processing if we are already over the limit
If @k = @j + @lendif
BEGIN
If @k < @smallLen OR @j < @smallLen
BEGIN
--If d(k, j) > limit Then Exit Function
IF ASCII(substring(@d,@j*(@n+1)+@k+1,1)) > @limit goto done
END
END
SET @j=@j+1
END

IF @j <= @m
BEGIN
SET @t_j=substring(@t,@j,1)
SET @i = 1
WHILE @i <= @k
BEGIN
SET @s_i = substring(@s,@i,1)
-- Evaluate d(i,j)----------------
set @MinDist = ASCII(substring(@d,(@j-1)*(@n+1)+@i,1)) --d(i - 1, j - 1)
If @s_i <> @t_j set @MinDist = @MinDist + 1

---y = d(i - 1, j) + 1
set @y = ASCII(substring(@d,@j*(@n+1)+@i,1))+1
---z = d(i, j - 1) + 1
set @z = ASCII(substring(@d,(@j-1)*(@n+1)+@i+1,1))+1
If @y < @MinDist set @MinDist = @y
If @z < @MinDist set @MinDist = @z
-- d(k, j) = MinDist
SET @d=STUFF(@d,@j*(@n+1)+@i+1,1,CHAR(@MinDist))

-- Check Transposition
-- Transposition sections can be commented out if transposition is not desired
If @i> 1 -- j is always greater than 1
BEGIN
-- If @MinDist - d(i - 2, j - 2) = 2
If @MinDist - ASCII(substring(@d,(@j-2)*(@n+1)+@i-1,1)) = 2
BEGIN
If substring(@s,@i-1,1) = @t_j
BEGIN
If @s_i = substring(@t,@j-1,1) -- Mid$(t, j - 1, 1)
--d(k, j) = d(i - 2, j - 2) + 1
set @d = STUFF(@d,@j*(@n+1)+@i+1,1,CHAR(ASCII(substring(@d,(@j-2)*(@n+1)+@i-1,1))+1))
END
END
END

-- Limit Pruning - Stop processing if we are already over the limit
If @i = @j + @lendif
BEGIN
If @i < @smallLen OR @j < @smallLen
BEGIN
--If d(i, j) > limit Then Exit Function
IF ASCII(substring(@d,@j*(@n+1)+@i+1,1)) > @limit goto done
END
END
SET @i = @i + 1
END
END

SET @k=@k+1
END

-- process remaining rightmost portion of matrix if any (if m > n)
SET @i = @n + 1
WHILE @i <= @m
BEGIN
SET @t_j=(substring(@t,@i,1))
SET @j=1
WHILE @j<=@n
BEGIN
SET @s_i=(substring(@s,@j,1))

-- Evaluate d(j, i)
set @MinDist = ASCII(substring(@d,(@i-1)*(@n+1)+@j,1)) --d(j - 1, i - 1)
If @s_i <> @t_j set @MinDist = @MinDist + 1

---y = d(j - 1, i) + 1
set @y = ASCII(substring(@d,@i*(@n+1)+@j,1))+1
---z = d(j, i - 1) + 1
set @z = ASCII(substring(@d,(@i-1)*(@n+1)+@j+1,1))+1
If @y < @MinDist set @MinDist = @y
If @z < @MinDist set @MinDist = @z
-- d(j, i) = MinDist
SET @d=STUFF(@d,@i*(@n+1)+@j+1,1,CHAR(@MinDist))

-- Check Transposition
-- Transposition sections can be commented out if transposition is not desired
If @j> 1 -- i is always greater than 1
BEGIN
-- If @MinDist - d(j - 2, i - 2) = 2
If @MinDist - ASCII(substring(@d,(@i-2)*(@n+1)+@j-1,1)) = 2
BEGIN
If substring(@s,@j-1,1) = @t_j
BEGIN
If @s_i = substring(@t,@i-1,1) -- Mid$(t, i - 1, 1)
--d(j, i) = d(j - 2, i - 2) + 1
set @d = STUFF(@d,@i*(@n+1)+@j+1,1,CHAR(ASCII(substring(@d,(@i-2)*(@n+1)+@j-1,1))+1))
END
END
END


-- Limit Pruning - Stop processing if we are already over the limit
If @j = @i + @lendif
BEGIN
If @i < @m OR @j < @n
BEGIN
--If d(j, i) > limit Then Exit Function
IF ASCII(substring(@d,@i*(@n+1)+@j+1,1)) > @limit goto done
END
END


SET @j = @j + 1
END

Set @i = @i + 1
END

-- process remaining lower portion of matrix if any (n > m)
SET @i = @m + 1
WHILE @i <= @n
BEGIN
SET @s_i= substring(@s,@i,1)
SET @j=1
WHILE @j<=@n
BEGIN
SET @t_j=(substring(@t,@j,1))

-- Evaluate d(i,j)
set @MinDist = ASCII(substring(@d,(@j-1)*(@n+1)+@i,1)) --d(i - 1, j - 1)
If @s_i <> @t_j set @MinDist = @MinDist + 1

---y = d(i - 1, j) + 1
set @y = ASCII(substring(@d,@j*(@n+1)+@i,1))+1
---z = d(i, j - 1) + 1
set @z = ASCII(substring(@d,(@j-1)*(@n+1)+@i+1,1))+1
If @y < @MinDist set @MinDist = @y
If @z < @MinDist set @MinDist = @z
-- d(i, j) = MinDist
SET @d=STUFF(@d,@j*(@n+1)+@i+1,1,CHAR(@MinDist))

-- Check Transposition
-- This section can be commented out if transposition is not desired
If @j> 1 -- i is always greater than 1
BEGIN
-- If @MinDist - d(i - 2, j - 2) = 2
If @MinDist - ASCII(substring(@d,(@j-2)*(@n+1)+@i-1,1)) = 2
BEGIN
If substring(@s,@i-1,1) = @t_j
BEGIN
If @s_i = substring(@t,@j-1,1) -- Mid$(t, j - 1, 1)
--d(k, j) = d(i - 2, j - 2) + 1
set @d = STUFF(@d,@j*(@n+1)+@i+1,1,CHAR(ASCII(substring(@d,(@j-2)*(@n+1)+@i-1,1))+1))
END
END
END


-- Limit Pruning - Stop processing if we are already over the limit
If @i = @j + @lendif
BEGIN
If @i < @smallLen OR @j < @smallLen
BEGIN
--If d(i, j) > limit Then Exit Function
IF ASCII(substring(@d,@j*(@n+1)+@i+1,1)) > @limit goto done
END
END

SET @j = @j + 1
END

SET @i= @i + 1
END

-- Return EditDistance
SET @LD = ASCII(substring(@d,@n*(@m+1)+@m+1,1))
done:
--RETURN @LD
--I kept this code that can be used to display the matrix with all calculated values
--From Query Analyser it provides a nice way to check the algorithm in action
--
RETURN @LD
--declare @z varchar(8000)
--set @z=''
--SET @i=0
--WHILE @i<=@n
-- BEGIN
-- SET @j=0
-- WHILE @j<=@m
-- BEGIN
-- set @z=@z+CONVERT(char(3),ASCII(substring(@d,@i*(@m+1 )+@j+1 ,1)))
-- SET @j=@j+1
-- END
-- SET @i=@i+1
-- END
--print dbo.wrap(@z,3*(@n+1))
END