1 | //===-- llvm/ADT/edit_distance.h - Array edit distance function --- C++ -*-===// |
---|---|

2 | // |

3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |

4 | // See https://llvm.org/LICENSE.txt for license information. |

5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |

6 | // |

7 | //===----------------------------------------------------------------------===// |

8 | // |

9 | // This file defines a Levenshtein distance function that works for any two |

10 | // sequences, with each element of each sequence being analogous to a character |

11 | // in a string. |

12 | // |

13 | //===----------------------------------------------------------------------===// |

14 | |

15 | #ifndef LLVM_ADT_EDIT_DISTANCE_H |

16 | #define LLVM_ADT_EDIT_DISTANCE_H |

17 | |

18 | #include "llvm/ADT/ArrayRef.h" |

19 | #include <algorithm> |

20 | #include <memory> |

21 | |

22 | namespace llvm { |

23 | |

24 | /// Determine the edit distance between two sequences. |

25 | /// |

26 | /// \param FromArray the first sequence to compare. |

27 | /// |

28 | /// \param ToArray the second sequence to compare. |

29 | /// |

30 | /// \param AllowReplacements whether to allow element replacements (change one |

31 | /// element into another) as a single operation, rather than as two operations |

32 | /// (an insertion and a removal). |

33 | /// |

34 | /// \param MaxEditDistance If non-zero, the maximum edit distance that this |

35 | /// routine is allowed to compute. If the edit distance will exceed that |

36 | /// maximum, returns \c MaxEditDistance+1. |

37 | /// |

38 | /// \returns the minimum number of element insertions, removals, or (if |

39 | /// \p AllowReplacements is \c true) replacements needed to transform one of |

40 | /// the given sequences into the other. If zero, the sequences are identical. |

41 | template<typename T> |

42 | unsigned ComputeEditDistance(ArrayRef<T> FromArray, ArrayRef<T> ToArray, |

43 | bool AllowReplacements = true, |

44 | unsigned MaxEditDistance = 0) { |

45 | // The algorithm implemented below is the "classic" |

46 | // dynamic-programming algorithm for computing the Levenshtein |

47 | // distance, which is described here: |

48 | // |

49 | // http://en.wikipedia.org/wiki/Levenshtein_distance |

50 | // |

51 | // Although the algorithm is typically described using an m x n |

52 | // array, only one row plus one element are used at a time, so this |

53 | // implementation just keeps one vector for the row. To update one entry, |

54 | // only the entries to the left, top, and top-left are needed. The left |

55 | // entry is in Row[x-1], the top entry is what's in Row[x] from the last |

56 | // iteration, and the top-left entry is stored in Previous. |

57 | typename ArrayRef<T>::size_type m = FromArray.size(); |

58 | typename ArrayRef<T>::size_type n = ToArray.size(); |

59 | |

60 | const unsigned SmallBufferSize = 64; |

61 | unsigned SmallBuffer[SmallBufferSize]; |

62 | std::unique_ptr<unsigned[]> Allocated; |

63 | unsigned *Row = SmallBuffer; |

64 | if (n + 1 > SmallBufferSize) { |

65 | Row = new unsigned[n + 1]; |

66 | Allocated.reset(Row); |

67 | } |

68 | |

69 | for (unsigned i = 1; i <= n; ++i) |

70 | Row[i] = i; |

71 | |

72 | for (typename ArrayRef<T>::size_type y = 1; y <= m; ++y) { |

73 | Row[0] = y; |

74 | unsigned BestThisRow = Row[0]; |

75 | |

76 | unsigned Previous = y - 1; |

77 | for (typename ArrayRef<T>::size_type x = 1; x <= n; ++x) { |

78 | int OldRow = Row[x]; |

79 | if (AllowReplacements) { |

80 | Row[x] = std::min( |

81 | Previous + (FromArray[y-1] == ToArray[x-1] ? 0u : 1u), |

82 | std::min(Row[x-1], Row[x])+1); |

83 | } |

84 | else { |

85 | if (FromArray[y-1] == ToArray[x-1]) Row[x] = Previous; |

86 | else Row[x] = std::min(Row[x-1], Row[x]) + 1; |

87 | } |

88 | Previous = OldRow; |

89 | BestThisRow = std::min(BestThisRow, Row[x]); |

90 | } |

91 | |

92 | if (MaxEditDistance && BestThisRow > MaxEditDistance) |

93 | return MaxEditDistance + 1; |

94 | } |

95 | |

96 | unsigned Result = Row[n]; |

97 | return Result; |

98 | } |

99 | |

100 | } // End llvm namespace |

101 | |

102 | #endif |

103 |