algorithm-base/animation-simulation/数据结构和算法/字符串匹配算法.md

692 lines
30 KiB
Java
Raw Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

> **[tan45du_one](https://raw.githubusercontent.com/tan45du/tan45du.github.io/master/个人微信.15egrcgqd94w.jpg)** ,备注 github + 题目 + 问题 向我反馈
>
>
>
> <u>[****](https://raw.githubusercontent.com/tan45du/test/master/微信图片_20210320152235.2pthdebvh1c0.png)</u> 两个平台同步,想要和题友一起刷题,互相监督的同学,可以在我的小屋点击<u>[**刷题小队**](https://raw.githubusercontent.com/tan45du/test/master/微信图片_20210320152235.2pthdebvh1c0.png)</u>进入。
> leetcode AC
666 西
S T S T S T T S -1
![](https://cdn.jsdelivr.net/gh/tan45du/photobed@master/photo/字符串匹配.3q9wqbh8ws40.png)
T = baab, S = abcabaabcabac T S 4 0 4 T S -1
## BFBrute Force
****
### leetcdoe 28. strStr()
####
haystack needle haystack needle (0) -1
1:
> : haystack = "hello", needle = "ll"
> : 2
2:
> : haystack = "aaaaa", needle = "bba"
> : -1
####
0
####
Java Code:
```java
class Solution {
public int strStr(String haystack, String needle) {
int haylen = haystack.length();
int needlen = needle.length();
//特殊情况
if (haylen < needlen) {
return -1;
}
if (needlen == 0) {
return 0;
}
//主串
for (int i = 0; i < haylen - needlen + 1; ++i) {
int j;
//模式串
for (j = 0; j < needlen; j++) {
//不符合的情况,直接跳出,主串指针后移一位
if (haystack.charAt(i+j) != needle.charAt(j)) {
break;
}
}
//匹配成功
if (j == needlen) {
return i;
}
}
return -1;
}
}
```
Python Code:
```python
from typing import List
class Solution:
def strStr(self, haystack: str, needle: str)->int:
haylen = len(haystack)
needlen = len(needle)
#
if haylen < needlen:
return -1
if needlen == 0:
return 0
#
for i in range(0, haylen - needlen + 1):
#
j = 0
while j < needlen:
if haystack[i + j] != needle[j]:
break
j += 1
#
if j == needlen:
return i
return -1
```
BF退
Java Code:
```java
class Solution {
public int strStr(String haystack, String needle) {
//i代表主串指针j模式串
int i,j;
//主串长度和模式串长度
int halen = haystack.length();
int nelen = needle.length();
//循环条件,这里只有 i 增长
for (i = 0 , j = 0; i < halen && j < nelen; ++i) {
//相同时,则移动 j 指针
if (haystack.charAt(i) == needle.charAt(j)) {
++j;
} else {
//不匹配时,将 j 重新指向模式串的头部,将 i 本次匹配的开始位置的下一字符
i -= j;
j = 0;
}
}
//查询成功时返回索引,查询失败时返回 -1
int renum = j == nelen ? i - nelen : -1;
return renum;
}
}
```
Python Code:
```python
from typing import List
class Solution:
def strStr(self, haystack: str, needle: str)->int:
# ij
i = 0
j = 0
#
halen = len(haystack)
nelen = len(needle)
# i
while i < halen and j < nelen:
# j
if haystack[i] == needle[j]:
j += 1
else:
# j i
i -= j
j = 0
i += 1
# -1
renum = i - nelen if j == nelen else -1
return renum
```
## BM(Boyer-Moore)
BF BF
![BF](https://cdn.jsdelivr.net/gh/tan45du/photobed@master/photo/BF第一次.2qo0876qvs80.png)
BF abcdex abcde x
![BM2](https://cdn.jsdelivr.net/gh/tan45du/photobed@master/photo/BM2.141fhslg6vek.png)
![BM3](https://cdn.jsdelivr.net/gh/tan45du/photobed@master/photo/BM3.7iamevkxf0g0.png)
###
BF BM
![BM4](https://cdn.jsdelivr.net/gh/tan45du/photobed@master/photo/BM4.2mayfaccj3i0.png)
BM ******** f , T f f
![BM5](https://cdn.jsdelivr.net/gh/tan45du/photobed@master/photo/BM5.31j3sja7vsq0.png)
![](https://cdn.jsdelivr.net/gh/tan45du/photobed@master/photo/含有坏字符.10z8yxka8z8g.png)
f , f, T , f
![](https://cdn.jsdelivr.net/gh/tan45du/photobed@master/photo/坏字符移动.kl5k3nnzkcg.png)
d d
![2](https://cdn.jsdelivr.net/gh/tan45du/photobed@master/photo/换字符对其2.4xdb38am9e60.png)
![](https://cdn.jsdelivr.net/gh/tan45du/photobed@master/photo/坏字符原则.781vhv3vm280.png)
![](https://cdn.jsdelivr.net/gh/tan45du/photobed@master/photo/两个坏字符.1a6hcs8ildkw.png)
****
![](https://cdn.jsdelivr.net/gh/tan45du/photobed@master/photo/坏字符匹配不按规则.1y45278xg1vk.png)
****** babac** ********
![](https://cdn.jsdelivr.net/gh/tan45du/photobed@master/photo/坏字符移动规则.48oh1msdypy0.png)
![bug](https://cdn.jsdelivr.net/gh/tan45du/photobed@master/photo/换字符bug.24av6jslzh40.png)
###
BM
![1](https://cdn.jsdelivr.net/gh/tan45du/photobed@master/photo/好后缀1.4j88yw6hecu0.png)
使
BM cac cac ****
绿
![](https://cdn.jsdelivr.net/gh/tan45du/photobed@master/photo/好后缀对其.5wf80nidao80.png)
![](https://cdn.jsdelivr.net/gh/tan45du/photobed@master/photo/好后缀中间.7b6m6ki25l00.png)
![](https://cdn.jsdelivr.net/gh/tan45du/photobed@master/photo/比较.4m9ci1x1c1e0.png)
****
![](https://cdn.jsdelivr.net/gh/tan45du/photobed@master/photo/不完全重合.6oayqd0dre00.png)
![ok](https://cdn.jsdelivr.net/gh/tan45du/photobed@master/photo/好后缀ok.131zefgf1d1s.png)
1.********
2.****
3.
> Boyer R SMoore J S. A fast string searching algorithmJ. Communications of the ACM197710 762-772.
** 0 **
![](https://cdn.jsdelivr.net/gh/tan45du/photobed@master/photo/五好后缀.6wvqxa4um040.png)
AC KMP
Java Code:
```java
class Solution {
public int strStr(String haystack, String needle) {
char[] hay = haystack.toCharArray();
char[] need = needle.toCharArray();
int haylen = haystack.length();
int needlen = need.length;
return bm(hay,haylen,need,needlen);
}
//用来求坏字符情况下移动位数
private static void badChar(char[] b, int m, int[] bc) {
//初始化
for (int i = 0; i < 256; ++i) {
bc[i] = -1;
}
//m 代表模式串的长度,如果有两个 a,则后面那个会覆盖前面那个
for (int i = 0; i < m; ++i) {
int ascii = (int)b[i];
bc[ascii] = i;//下标
}
}
//用来求好后缀条件下的移动位数
private static void goodSuffix (char[] b, int m, int[] suffix,boolean[] prefix) {
//初始化
for (int i = 0; i < m; ++i) {
suffix[i] = -1;
prefix[i] = false;
}
for (int i = 0; i < m - 1; ++i) {
int j = i;
int k = 0;
while (j >= 0 && b[j] == b[m-1-k]) {
--j;
++k;
suffix[k] = j + 1;
}
if (j == -1) prefix[k] = true;
}
}
public static int bm (char[] a, int n, char[] b, int m) {
int[] bc = new int[256];//创建一个数组用来保存最右边字符的下标
badChar(b,m,bc);
//用来保存各种长度好后缀的最右位置的数组
int[] suffix_index = new int[m];
//判断是否是头部如果是头部则true
boolean[] ispre = new boolean[m];
goodSuffix(b,m,suffix_index,ispre);
int i = 0;//第一个匹配字符
//注意结束条件
while (i <= n-m) {
int j;
//从后往前匹配,匹配失败,找到坏字符
for (j = m - 1; j >= 0; --j) {
if (a[i+j] != b[j]) break;
}
//模式串遍历完毕,匹配成功
if (j < 0) {
return i;
}
//下面为匹配失败时,如何处理
//求出坏字符规则下移动的位数,就是我们坏字符下标减最右边的下标
int x = j - bc[(int)a[i+j]];
int y = 0;
//好后缀情况,求出好后缀情况下的移动位数,如果不含有好后缀的话,则按照坏字符来
if (y < m-1 && m - 1 - j > 0) {
y = move(j, m, suffix_index,ispre);
}
//移动
i = i + Math.max(x,y);
}
return -1;
}
// j代表坏字符的下标
private static int move (int j, int m, int[] suffix_index, boolean[] ispre) {
//好后缀长度
int k = m - 1 - j;
//如果含有长度为 k 的好后缀,返回移动位数,
if (suffix_index[k] != -1) return j - suffix_index[k] + 1;
//找头部为好后缀子串的最大长度,从长度最大的子串开始
for (int r = j + 2; r <= m-1; ++r) {
//如果是头部
if (ispre[m-r] == true) {
return r;
}
}
//如果没有发现好后缀匹配的串,或者头部为好后缀子串,则移动到 m 位,也就是匹配串的长度
return m;
}
}
```
Python Code:
```python
from typing import List
class Solution:
def strStr(self, haystack: str, needle: str)->int:
haylen = len(haystack)
needlen = len(needle)
return self.bm(haystack, haylen, needle, needlen)
#
def badChar(self, b: str, m: int, bc: List[int]):
#
for i in range(0, 256):
bc[i] = -1
# m a
for i in range(0, m,):
ascii = ord(b[i])
bc[ascii] = i#
#
def goodSuffix(self, b: str, m: int, suffix: List[int], prefix: List[bool]):
#
for i in range(0, m):
suffix[i] = -1
prefix[i] = False
for i in range(0, m - 1):
j = i
k = 0
while j >= 0 and b[j] == b[m - 1 - k]:
j -= 1
k += 1
suffix[k] = j + 1
if j == -1:
prefix[k] = True
def bm(self, a: str, n: int, b: str, m: int)->int:
bc = [0] * 256#
self.badChar(b, m, bc)
#
suffix_index = [0] * m
# True
ispre = [False] * m
self.goodSuffix(b, m, suffix_index, ispre)
i = 0#
#
while i <= n - m:
#
j = m - 1
while j >= 0:
if a[i + j] != b[j]:
break
j -= 1
#
if j < 0:
return i
#
#
x = j - bc[ord(a[i + j])]
y = 0
# ,
if y < m - 1 and m - 1 - j > 0:
y = self.move(j, m, suffix_index, ispre)
#
i += max(x, y)
return -1
# j
def move(j: int, m: int, suffix_index: List[int], ispre: List[bool])->int:
#
k = m - 1 - j
# k
if suffix_index[k] != -1:
return j - suffix_index[k] + 1
#
for r in range(j + 2, m):
# //如果是头部
if ispre[m - r] == True:
return r
# m
return m
```
![](https://cdn.jsdelivr.net/gh/tan45du/photobed@master/photo/头缀函数.145da63ig3s0.png)
## KMPKnuth-Morris-Pratt
BM BM KMP BM KMP
KMP KMP
****
![KMP](https://cdn.jsdelivr.net/gh/tan45du/photobed@master/photo/KMP例子.1uirbimk5fcw.png)
绿
![](https://cdn.jsdelivr.net/gh/tan45du/photobed@master/photo/公共前后缀.3wib411usww0.png)
![](https://cdn.jsdelivr.net/gh/tan45du/photobed@master/photo/原理.bghc3ecm4z4.png)
KMP
BM bc,suffix,prefix KMP next next
next
![next](https://cdn.jsdelivr.net/gh/tan45du/photobed@master/photo/next数组.3nir7pgcs9c0.png)
next KMP next
![KMP1](https://cdn.jsdelivr.net/gh/tan45du/photobed@master/photo/KMP1.j74ujxjuq1c.png)
![kmp2](https://cdn.jsdelivr.net/gh/tan45du/photobed@master/photo/kmp2.6jx846nmyd00.png)
****
** next **
Java Code:
```java
class Solution {
public int strStr(String haystack, String needle) {
//两种特殊情况
if (needle.length() == 0) {
return 0;
}
if (haystack.length() == 0) {
return -1;
}
// char 数组
char[] hasyarr = haystack.toCharArray();
char[] nearr = needle.toCharArray();
//长度
int halen = hasyarr.length;
int nelen = nearr.length;
//返回下标
return kmp(hasyarr,halen,nearr,nelen);
}
public int kmp (char[] hasyarr, int halen, char[] nearr, int nelen) {
//获取next 数组
int[] next = next(nearr,nelen);
int j = 0;
for (int i = 0; i < halen; ++i) {
//发现不匹配的字符,然后根据 next 数组移动指针,移动到最大公共前后缀的,
//前缀的后一位,和咱们移动模式串的含义相同
while (j > 0 && hasyarr[i] != nearr[j]) {
j = next[j - 1] + 1;
//超出长度时,可以直接返回不存在
if (nelen - j + i > halen) {
return -1;
}
}
//如果相同就将指针同时后移一下,比较下个字符
if (hasyarr[i] == nearr[j]) {
++j;
}
//遍历完整个模式串,返回模式串的起点下标
if (j == nelen) {
return i - nelen + 1;
}
}
return -1;
}
//这一块比较难懂,不想看的同学可以忽略,了解大致含义即可,或者自己调试一下,看看运行情况
//我会每一步都写上注释
public int[] next (char[] needle,int len) {
//定义 next 数组
int[] next = new int[len];
// 初始化
next[0] = -1;
int k = -1;
for (int i = 1; i < len; ++i) {
//我们此时知道了 [0,i-1]的最长前后缀但是k+1的指向的值和i不相同时我们则需要回溯
//因为 next[k]就时用来记录子串的最长公共前后缀的尾坐标(即长度)
//就要找 k+1前一个元素在next数组里的值,即next[k+1]
while (k != -1 && needle[k + 1] != needle[i]) {
k = next[k];
}
// 相同情况,就是 k的下一位和 i 相同时,此时我们已经知道 [0,i-1]的最长前后缀
//然后 k - 1 又和 i 相同最长前后缀加1即可
if (needle[k+1] == needle[i]) {
++k;
}
next[i] = k;
}
return next;
}
}
```
Python Code:
```python
from typing import List
class Solution:
def strStr(self, haystack: str, needle: str)->int:
#
if len(needle) == 0:
return 0
if len(haystack) == 0:
return -1
#
halen = len(haystack)
nelen = len(needle)
#
return self.kmp(haystack, halen, needle, nelen)
def kmp(self, hasyarr: str, halen: int, nearr: str, nelen: int)->int:
# next
next = self.next(nearr, nelen)
j = 0
for i in range(0, halen):
# next
# ,
while j > 0 and hasyarr[i] != nearr[j]:
j = next[j - 1] + 1
#
if nelen - j + i > halen:
return -1
#
if hasyarr[i] == nearr[j]:
j += 1
#
if j == nelen:
return i - nelen + 1
return -1
#
#
def next(self, needle: str, len:int)->List[int]:
# next
next = [0] * len
#
next[0] = -1
k = -1
for i in range(1, len):
# [0,i-1]k+1i
# next[k]
# k+1next,next[k+1]
while k != -1 and needle[k + 1] != needle[i]:
k = next[k]
# k i [0,i-1]
# k - 1 i 1
if needle[k + 1] == needle[i]:
k += 1
next[i] = k
return next
```
[](https://cdn.jsdelivr.net/gh/tan45du/tan45du.github.io.photo@master/photo/qrcode_for_gh_1f36d2ef6df9_258.5lojyphpkso0.jpg)