字符串匹配算法

``s = 'I Love Python'   print(s.find('Py'))  # 7   print(s.find('Pyc')) # -1   ``

`index` 方法与 `find` 的唯一区别在于当主串中不存在模式串时会抛出 `ValueError`

``Traceback (most recent call last):     File "<stdin>", line 1, in <module> ValueError: substring not found   ``

1. 顺序匹配算法

``class String:     def __init__(self, ss="", length=0):         self.string = ss         self.length = length or len(ss)     def __getitem__(self, i):         return self.string[i] ``

``def find(S, T, pos = 0):     i = pos   j = 0   while i < S.length and j < T.length:     if S[i] == T[j]:       i += 1       j += 1     else:       i = i - j + 1       j = 0   if j >= T.length:     return i - T.length   return -1 ``

2. KMP 算法

``def nxt(T, j):     if j == 0:     return -1   if T[0:k] == T[j-k:j]:     return max(k)   else:     return 0 ``

``def kmp_next(T):     nxt = [-1] * T.length   i   = 0   j   = -1   while i < T.length:     if j == -1 or T[j] == T[i]:       i += 1       j += 1       if i < T.length:         nxt[i] = j     else:       j = nxt[j]   return nxt ``

``def KMP(S, T, post = 0):     nxt = kmp_next(T)   i   = pos   j   = 0   while i < S.length and j < T.length:     if j == -1 or S[i] == T[i]:       i += 1       j += 1     else:       j = nxt[j]   if j >= T.length:     return i - T.length   return -1 ``

3. Python 源码中的实现方式

based on a mix between boyer-moore and horspool,

4. One More Think

1. 如果 `xm == yn` ，则 `zk == xm == yn``Zk-1``Xm-1``Yn-1` 的一个 LCS；
2. 如果 `xm != yn` ，且 `zk != xm` ，则 `Z``Xm-1``Yn` 的一个 LCS；
3. 如果 `xm != yn` ，且 `zk != yn` ，则 `Z``Xm``Yn-1` 的一个 LCS。

1. `i == 0 or j == 0` 时， `subs[i][j] = 0`
2. `i > 0 and j > 0 and xi == yj` 时， `subs[i][j] = subs[i-1][j-1] + 1`
3. `i > 0 and j > 0 and xi != yj` 时， `subs[i][j] = max(subs[i][j-1], subs[i-1][j])`

``def LCS_lengths(X, Y):     subs = []    # 初始化二维表   for _ in range(X.length + 1): # 长度为 X.length + 1 是为了保存 X0     subs.append([0] * (Y.length + 1))   # 这里有一个坑，考虑一下为什么不可以用下面的方式进行初始化？   # subs = [[0] * (Y.length + 1)] * (X.length + 1)   for i in range(1, X.length + 1):     for j in range(1, X.length + 1):       if X[i-1] == Y[j-1]: # 字符串中下标是从 0 开始的，但这里的 i, j 是从 X1, Y1 开始的         subs[i][j] = subs[i-1][j-1] + 1       elif subs[i][j-1] >= subs[i-1][j]:         subs[i][j] = subs[i][j-1]       else:         subs[i][j] = subs[i-1][j]   return subs ``

``X = String('ABCBDAB')   Y = String('BDCABA')   subs = LCS_lengths(X, Y)   print(subs)   """ [[0, 0, 0, 0, 0, 0, 0],  [0, 0, 0, 0, 1, 1, 1],  [0, 1, 1, 1, 1, 2, 2],  [0, 1, 1, 2, 2, 2, 2],  [0, 1, 1, 2, 2, 3, 3],  [0, 1, 2, 2, 2, 3, 3],  [0, 1, 2, 2, 3, 3, 4],  [0, 1, 2, 2, 3, 4, 4]] """  # 最后一个元素就是 Xm 与 Yn 的 LCS 长度： print(subs[-1][-1])   # 4 ``

``def LCS_lengths(X, Y):     subs = []   road_map = []   for _ in range(X.length + 1):     subs.append([0] * (Y.length + 1))     road_map.append([0] * (Y.length + 1))   for i in range(1, X.length + 1):     for j in range(1, Y.length + 1):       if X[i-1] == Y[j-1]:         subs[i][j] = subs[i-1][j-1] + 1         road_map[i][j] = 'M' # Match       elif subs[i][j-1] >= subs[i-1][j]:         subs[i][j] = subs[i][j-1]         road_map[i][j] = 'Y' # find from Yj-1       else:         subs[i][j] = subs[i-1][j]         road_map[i][j] = 'X' # find from Xi-1   return subs, road_map def LCS_find(X, Y):     _, road_map = LCS_lengths(X, Y)   def _find(road, X, i, j, lcs):     if i == 0 or j == 0:       return     if road[i][j] == 'M':       _find(road, X, i-1, j-1, lcs)       lcs.append(X[i])     elif road[i][j] == 'Y':       _find(road, X, i, j-1, lcs)     else:       _find(road, X, i-1, j, lcs)      lcs = []     _find(road_map, X, X.length, Y.length, lcs)     return lcs print(LCS_find(X, Y))   # ['B', 'C', 'B', 'A'] ``

% ➜ 算法与数据结构 pbpaste | pbcopy ➜ 算法与数据结构 pbpaste

字符串匹配算法

``s = 'I Love Python'   print(s.find('Py'))  # 7   print(s.find('Pyc')) # -1   ``

`index` 方法与 `find` 的唯一区别在于当主串中不存在模式串时会抛出 `ValueError`

``Traceback (most recent call last):     File "<stdin>", line 1, in <module> ValueError: substring not found   ``

1. 顺序匹配算法

``class String:     def __init__(self, ss="", length=0):         self.string = ss         self.length = length or len(ss)     def __getitem__(self, i):         return self.string[i] ``

``def find(S, T, pos = 0):     i = pos   j = 0   while i < S.length and j < T.length:     if S[i] == T[j]:       i += 1       j += 1     else:       i = i - j + 1       j = 0   if j >= T.length:     return i - T.length   return -1 ``

2. KMP 算法

``def nxt(T, j):     if j == 0:     return -1   if T[0:k] == T[j-k:j]:     return max(k)   else:     return 0 ``

``def kmp_next(T):     nxt = [-1] * T.length   i   = 0   j   = -1   while i < T.length:     if j == -1 or T[j] == T[i]:       i += 1       j += 1       if i < T.length:         nxt[i] = j     else:       j = nxt[j]   return nxt ``

``def KMP(S, T, post = 0):     nxt = kmp_next(T)   i   = pos   j   = 0   while i < S.length and j < T.length:     if j == -1 or S[i] == T[i]:       i += 1       j += 1     else:       j = nxt[j]   if j >= T.length:     return i - T.length   return -1 ``

3. Python 源码中的实现方式

based on a mix between boyer-moore and horspool,

4. One More Think

1. 如果 `xm == yn` ，则 `zk == xm == yn``Zk-1``Xm-1``Yn-1` 的一个 LCS；
2. 如果 `xm != yn` ，且 `zk != xm` ，则 `Z``Xm-1``Yn` 的一个 LCS；
3. 如果 `xm != yn` ，且 `zk != yn` ，则 `Z``Xm``Yn-1` 的一个 LCS。

1. `i == 0 or j == 0` 时， `subs[i][j] = 0`
2. `i > 0 and j > 0 and xi == yj` 时， `subs[i][j] = subs[i-1][j-1] + 1`
3. `i > 0 and j > 0 and xi != yj` 时， `subs[i][j] = max(subs[i][j-1], subs[i-1][j])`

``def LCS_lengths(X, Y):     subs = []    # 初始化二维表   for _ in range(X.length + 1): # 长度为 X.length + 1 是为了保存 X0     subs.append([0] * (Y.length + 1))   # 这里有一个坑，考虑一下为什么不可以用下面的方式进行初始化？   # subs = [[0] * (Y.length + 1)] * (X.length + 1)   for i in range(1, X.length + 1):     for j in range(1, X.length + 1):       if X[i-1] == Y[j-1]: # 字符串中下标是从 0 开始的，但这里的 i, j 是从 X1, Y1 开始的         subs[i][j] = subs[i-1][j-1] + 1       elif subs[i][j-1] >= subs[i-1][j]:         subs[i][j] = subs[i][j-1]       else:         subs[i][j] = subs[i-1][j]   return subs ``

``X = String('ABCBDAB')   Y = String('BDCABA')   subs = LCS_lengths(X, Y)   print(subs)   """ [[0, 0, 0, 0, 0, 0, 0],  [0, 0, 0, 0, 1, 1, 1],  [0, 1, 1, 1, 1, 2, 2],  [0, 1, 1, 2, 2, 2, 2],  [0, 1, 1, 2, 2, 3, 3],  [0, 1, 2, 2, 2, 3, 3],  [0, 1, 2, 2, 3, 3, 4],  [0, 1, 2, 2, 3, 4, 4]] """  # 最后一个元素就是 Xm 与 Yn 的 LCS 长度： print(subs[-1][-1])   # 4 ``

``def LCS_lengths(X, Y):     subs = []   road_map = []   for _ in range(X.length + 1):     subs.append([0] * (Y.length + 1))     road_map.append([0] * (Y.length + 1))   for i in range(1, X.length + 1):     for j in range(1, Y.length + 1):       if X[i-1] == Y[j-1]:         subs[i][j] = subs[i-1][j-1] + 1         road_map[i][j] = 'M' # Match       elif subs[i][j-1] >= subs[i-1][j]:         subs[i][j] = subs[i][j-1]         road_map[i][j] = 'Y' # find from Yj-1       else:         subs[i][j] = subs[i-1][j]         road_map[i][j] = 'X' # find from Xi-1   return subs, road_map def LCS_find(X, Y):     _, road_map = LCS_lengths(X, Y)   def _find(road, X, i, j, lcs):     if i == 0 or j == 0:       return     if road[i][j] == 'M':       _find(road, X, i-1, j-1, lcs)       lcs.append(X[i])     elif road[i][j] == 'Y':       _find(road, X, i, j-1, lcs)     else:       _find(road, X, i-1, j, lcs)      lcs = []     _find(road_map, X, X.length, Y.length, lcs)     return lcs print(LCS_find(X, Y))   # ['B', 'C', 'B', 'A'] ``