001package squidpony; 002 003import squidpony.annotation.Beta; 004 005import java.util.HashMap; 006import java.util.Map; 007 008/** 009 * The Damerau-Levenshtein Algorithm is an extension to the Levenshtein 010 * Algorithm which solves the edit distance problem between a source string and 011 * a target string with the following operations: 012 * 013 * <ul> 014 * <li>Character Insertion</li> 015 * <li>Character Deletion</li> 016 * <li>Character Replacement</li> 017 * <li>Adjacent Character Swap</li> 018 * </ul> 019 * 020 * Note that the adjacent character swap operation is an edit that may be 021 * applied when two adjacent characters in the source string match two adjacent 022 * characters in the target string, but in reverse order, rather than a general 023 * allowance for adjacent character swaps. 024 * 025 * This implementation allows the client to specify the costs of the various 026 * edit operations with the restriction that the cost of two swap operations 027 * must not be less than the cost of a delete operation followed by an insert 028 * operation. This restriction is required to preclude two swaps involving the 029 * same character being required for optimality which, in turn, enables a fast 030 * dynamic programming solution. 031 * 032 * The running time of the Damerau-Levenshtein algorithm is O(n*m) where n is 033 * the length of the source string and m is the length of the target string. 034 * This implementation consumes O(n*m) space. 035 * 036* @author Kevin L. Stern 037 */ 038@Beta 039public class DamerauLevenshteinAlgorithm { 040 041 private final int deleteCost, insertCost, replaceCost, swapCost; 042 043 /** 044 * Constructor. 045 * 046* @param deleteCost the cost of deleting a character. 047 * @param insertCost the cost of inserting a character. 048 * @param replaceCost the cost of replacing a character. 049 * @param swapCost the cost of swapping two adjacent characters. 050 */ 051 public DamerauLevenshteinAlgorithm(int deleteCost, int insertCost, int replaceCost, int swapCost) { 052 /* 053 * Required to facilitate the premise to the algorithm that two swaps of 054 * the same character are never required for optimality. 055 */ 056 if (2 * swapCost < insertCost + deleteCost) { 057 throw new IllegalArgumentException("Unsupported cost assignment"); 058 } 059 this.deleteCost = deleteCost; 060 this.insertCost = insertCost; 061 this.replaceCost = replaceCost; 062 this.swapCost = swapCost; 063 } 064 065 /** 066 * Compute the Damerau-Levenshtein distance between the specified source 067 * string and the specified target string. 068 */ 069 public int execute(String source, String target) { 070 if (source.length() == 0) { 071 return target.length() * insertCost; 072 } 073 074 if (target.length() == 0) { 075 return source.length() * deleteCost; 076 } 077 078 int[][] table = new int[source.length()][target.length()]; 079 Map<Character, Integer> sourceIndexByCharacter = new HashMap<>(); 080 081 if (source.charAt(0) != target.charAt(0)) { 082 table[0][0] = Math.min(replaceCost, deleteCost + insertCost); 083 } 084 085 sourceIndexByCharacter.put(source.charAt(0), 0); 086 087 for (int i = 1; i < source.length(); i++) { 088 int deleteDistance = table[i - 1][0] + deleteCost; 089 int insertDistance = (i + 1) * deleteCost + insertCost; 090 int matchDistance = i * deleteCost 091 + (source.charAt(i) == target.charAt(0) ? 0 : replaceCost); 092 table[i][0] = Math.min(Math.min(deleteDistance, insertDistance), 093 matchDistance); 094 } 095 096 for (int j = 1; j < target.length(); j++) { 097 int deleteDistance = table[0][j - 1] + insertCost; 098 int insertDistance = (j + 1) * insertCost + deleteCost; 099 int matchDistance = j * insertCost 100 + (source.charAt(0) == target.charAt(j) ? 0 : replaceCost); 101 table[0][j] = Math.min(Math.min(deleteDistance, insertDistance), 102 matchDistance); 103 } 104 105 for (int i = 1; i < source.length(); i++) { 106 int maxSourceLetterMatchIndex = source.charAt(i) == target 107 .charAt(0) ? 0 : -1; 108 for (int j = 1; j < target.length(); j++) { 109 Integer candidateSwapIndex = sourceIndexByCharacter.get(target 110 .charAt(j)); 111 int jSwap = maxSourceLetterMatchIndex; 112 int deleteDistance = table[i - 1][j] + deleteCost; 113 int insertDistance = table[i][j - 1] + insertCost; 114 int matchDistance = table[i - 1][j - 1]; 115 if (source.charAt(i) != target.charAt(j)) { 116 matchDistance += replaceCost; 117 } else { 118 maxSourceLetterMatchIndex = j; 119 } 120 int swapDistance; 121 if (candidateSwapIndex != null && jSwap != -1) { 122 int iSwap = candidateSwapIndex; 123 int preSwapCost; 124 if (iSwap == 0 && jSwap == 0) { 125 preSwapCost = 0; 126 } else { 127 preSwapCost = table[Math.max(0, iSwap - 1)][Math.max(0, 128 jSwap - 1)]; 129 } 130 swapDistance = preSwapCost + (i - iSwap - 1) * deleteCost 131 + (j - jSwap - 1) * insertCost + swapCost; 132 } else { 133 swapDistance = Integer.MAX_VALUE; 134 } 135 table[i][j] = Math.min( 136 Math.min(Math.min(deleteDistance, insertDistance), 137 matchDistance), swapDistance); 138 } 139 sourceIndexByCharacter.put(source.charAt(i), i); 140 } 141 return table[source.length() - 1][target.length() - 1]; 142 } 143}