algorithm-base/animation-simulation/数据结构和算法/字符串匹配算法.md

692 lines
30 KiB
Java
Raw Normal View History

2021-03-20 08:30:29 +00:00
> **[tan45du_one](https://raw.githubusercontent.com/tan45du/tan45du.github.io/master/个人微信.15egrcgqd94w.jpg)** ,备注 github + 题目 + 问题 向我反馈
>
>
>
> <u>[****](https://raw.githubusercontent.com/tan45du/test/master/微信图片_20210320152235.2pthdebvh1c0.png)</u> 两个平台同步,想要和题友一起刷题,互相监督的同学,可以在我的小屋点击<u>[**刷题小队**](https://raw.githubusercontent.com/tan45du/test/master/微信图片_20210320152235.2pthdebvh1c0.png)</u>进入。
2021-03-20 07:58:25 +00:00
> leetcode AC
666 西
S T S T S T T S -1
![](https://cdn.jsdelivr.net/gh/tan45du/photobed@master/photo/字符串匹配.3q9wqbh8ws40.png)
T = baab, S = abcabaabcabac T S 4 0 4 T S -1
## BFBrute Force
****
### leetcdoe 28. strStr()
####
haystack needle haystack needle (0) -1
1:
> : haystack = "hello", needle = "ll"
> : 2
2:
> : haystack = "aaaaa", needle = "bba"
> : -1
####
0
####
Java Code:
2021-03-20 07:58:25 +00:00
```java
class Solution {
public int strStr(String haystack, String needle) {
int haylen = haystack.length();
int needlen = needle.length();
//特殊情况
if (haylen < needlen) {
return -1;
}
if (needlen == 0) {
return 0;
}
//主串
for (int i = 0; i < haylen - needlen + 1; ++i) {
int j;
//模式串
for (j = 0; j < needlen; j++) {
//不符合的情况,直接跳出,主串指针后移一位
if (haystack.charAt(i+j) != needle.charAt(j)) {
break;
}
}
//匹配成功
if (j == needlen) {
return i;
}
}
return -1;
}
}
```
Python Code:
```python
from typing import List
class Solution:
def strStr(self, haystack: str, needle: str)->int:
haylen = len(haystack)
needlen = len(needle)
#
if haylen < needlen:
return -1
if needlen == 0:
return 0
#
for i in range(0, haylen - needlen + 1):
#
j = 0
while j < needlen:
if haystack[i + j] != needle[j]:
break
j += 1
#
if j == needlen:
return i
return -1
```
2021-03-20 07:58:25 +00:00
BF退
Java Code:
2021-03-20 07:58:25 +00:00
```java
class Solution {
public int strStr(String haystack, String needle) {
//i代表主串指针j模式串
int i,j;
//主串长度和模式串长度
int halen = haystack.length();
int nelen = needle.length();
//循环条件,这里只有 i 增长
for (i = 0 , j = 0; i < halen && j < nelen; ++i) {
//相同时,则移动 j 指针
if (haystack.charAt(i) == needle.charAt(j)) {
++j;
} else {
//不匹配时,将 j 重新指向模式串的头部,将 i 本次匹配的开始位置的下一字符
i -= j;
j = 0;
}
}
//查询成功时返回索引,查询失败时返回 -1
int renum = j == nelen ? i - nelen : -1;
return renum;
}
}
```
Python Code:
```python
from typing import List
class Solution:
def strStr(self, haystack: str, needle: str)->int:
# ij
i = 0
j = 0
#
halen = len(haystack)
nelen = len(needle)
# i
while i < halen and j < nelen:
# j
if haystack[i] == needle[j]:
j += 1
else:
# j i
i -= j
j = 0
i += 1
# -1
renum = i - nelen if j == nelen else -1
return renum
```
2021-03-20 07:58:25 +00:00
## BM(Boyer-Moore)
BF BF
![BF](https://cdn.jsdelivr.net/gh/tan45du/photobed@master/photo/BF第一次.2qo0876qvs80.png)
BF abcdex abcde x
![BM2](https://cdn.jsdelivr.net/gh/tan45du/photobed@master/photo/BM2.141fhslg6vek.png)
![BM3](https://cdn.jsdelivr.net/gh/tan45du/photobed@master/photo/BM3.7iamevkxf0g0.png)
###
BF BM
![BM4](https://cdn.jsdelivr.net/gh/tan45du/photobed@master/photo/BM4.2mayfaccj3i0.png)
BM ******** f , T f f
![BM5](https://cdn.jsdelivr.net/gh/tan45du/photobed@master/photo/BM5.31j3sja7vsq0.png)
![](https://cdn.jsdelivr.net/gh/tan45du/photobed@master/photo/含有坏字符.10z8yxka8z8g.png)
f , f, T , f
![](https://cdn.jsdelivr.net/gh/tan45du/photobed@master/photo/坏字符移动.kl5k3nnzkcg.png)
d d
![2](https://cdn.jsdelivr.net/gh/tan45du/photobed@master/photo/换字符对其2.4xdb38am9e60.png)
![](https://cdn.jsdelivr.net/gh/tan45du/photobed@master/photo/坏字符原则.781vhv3vm280.png)
![](https://cdn.jsdelivr.net/gh/tan45du/photobed@master/photo/两个坏字符.1a6hcs8ildkw.png)
****
![](https://cdn.jsdelivr.net/gh/tan45du/photobed@master/photo/坏字符匹配不按规则.1y45278xg1vk.png)
****** babac** ********
![](https://cdn.jsdelivr.net/gh/tan45du/photobed@master/photo/坏字符移动规则.48oh1msdypy0.png)
![bug](https://cdn.jsdelivr.net/gh/tan45du/photobed@master/photo/换字符bug.24av6jslzh40.png)
###
BM
![1](https://cdn.jsdelivr.net/gh/tan45du/photobed@master/photo/好后缀1.4j88yw6hecu0.png)
使
BM cac cac ****
绿
![](https://cdn.jsdelivr.net/gh/tan45du/photobed@master/photo/好后缀对其.5wf80nidao80.png)
![](https://cdn.jsdelivr.net/gh/tan45du/photobed@master/photo/好后缀中间.7b6m6ki25l00.png)
![](https://cdn.jsdelivr.net/gh/tan45du/photobed@master/photo/比较.4m9ci1x1c1e0.png)
****
![](https://cdn.jsdelivr.net/gh/tan45du/photobed@master/photo/不完全重合.6oayqd0dre00.png)
![ok](https://cdn.jsdelivr.net/gh/tan45du/photobed@master/photo/好后缀ok.131zefgf1d1s.png)
1.********
2.****
3.
> Boyer R SMoore J S. A fast string searching algorithmJ. Communications of the ACM197710 762-772.
** 0 **
![](https://cdn.jsdelivr.net/gh/tan45du/photobed@master/photo/五好后缀.6wvqxa4um040.png)
AC KMP
Java Code:
2021-03-20 07:58:25 +00:00
```java
class Solution {
public int strStr(String haystack, String needle) {
char[] hay = haystack.toCharArray();
char[] need = needle.toCharArray();
int haylen = haystack.length();
int needlen = need.length;
return bm(hay,haylen,need,needlen);
}
//用来求坏字符情况下移动位数
private static void badChar(char[] b, int m, int[] bc) {
//初始化
for (int i = 0; i < 256; ++i) {
bc[i] = -1;
}
//m 代表模式串的长度,如果有两个 a,则后面那个会覆盖前面那个
for (int i = 0; i < m; ++i) {
int ascii = (int)b[i];
bc[ascii] = i;//下标
}
}
//用来求好后缀条件下的移动位数
private static void goodSuffix (char[] b, int m, int[] suffix,boolean[] prefix) {
//初始化
for (int i = 0; i < m; ++i) {
suffix[i] = -1;
prefix[i] = false;
}
for (int i = 0; i < m - 1; ++i) {
int j = i;
int k = 0;
while (j >= 0 && b[j] == b[m-1-k]) {
--j;
++k;
suffix[k] = j + 1;
}
if (j == -1) prefix[k] = true;
}
}
public static int bm (char[] a, int n, char[] b, int m) {
int[] bc = new int[256];//创建一个数组用来保存最右边字符的下标
badChar(b,m,bc);
//用来保存各种长度好后缀的最右位置的数组
int[] suffix_index = new int[m];
//判断是否是头部如果是头部则true
boolean[] ispre = new boolean[m];
goodSuffix(b,m,suffix_index,ispre);
int i = 0;//第一个匹配字符
//注意结束条件
while (i <= n-m) {
int j;
//从后往前匹配,匹配失败,找到坏字符
for (j = m - 1; j >= 0; --j) {
if (a[i+j] != b[j]) break;
}
//模式串遍历完毕,匹配成功
if (j < 0) {
return i;
}
//下面为匹配失败时,如何处理
//求出坏字符规则下移动的位数,就是我们坏字符下标减最右边的下标
int x = j - bc[(int)a[i+j]];
int y = 0;
//好后缀情况,求出好后缀情况下的移动位数,如果不含有好后缀的话,则按照坏字符来
if (y < m-1 && m - 1 - j > 0) {
y = move(j, m, suffix_index,ispre);
}
//移动
i = i + Math.max(x,y);
}
return -1;
}
// j代表坏字符的下标
private static int move (int j, int m, int[] suffix_index, boolean[] ispre) {
//好后缀长度
int k = m - 1 - j;
//如果含有长度为 k 的好后缀,返回移动位数,
if (suffix_index[k] != -1) return j - suffix_index[k] + 1;
//找头部为好后缀子串的最大长度,从长度最大的子串开始
for (int r = j + 2; r <= m-1; ++r) {
//如果是头部
if (ispre[m-r] == true) {
return r;
}
}
//如果没有发现好后缀匹配的串,或者头部为好后缀子串,则移动到 m 位,也就是匹配串的长度
return m;
}
}
```
Python Code:
```python
from typing import List
class Solution:
def strStr(self, haystack: str, needle: str)->int:
haylen = len(haystack)
needlen = len(needle)
return self.bm(haystack, haylen, needle, needlen)
#
def badChar(self, b: str, m: int, bc: List[int]):
#
for i in range(0, 256):
bc[i] = -1
# m a
for i in range(0, m,):
ascii = ord(b[i])
bc[ascii] = i#
#
def goodSuffix(self, b: str, m: int, suffix: List[int], prefix: List[bool]):
#
for i in range(0, m):
suffix[i] = -1
prefix[i] = False
for i in range(0, m - 1):
j = i
k = 0
while j >= 0 and b[j] == b[m - 1 - k]:
j -= 1
k += 1
suffix[k] = j + 1
if j == -1:
prefix[k] = True
def bm(self, a: str, n: int, b: str, m: int)->int:
bc = [0] * 256#
self.badChar(b, m, bc)
#
suffix_index = [0] * m
# True
ispre = [False] * m
self.goodSuffix(b, m, suffix_index, ispre)
i = 0#
#
while i <= n - m:
#
j = m - 1
while j >= 0:
if a[i + j] != b[j]:
break
j -= 1
#
if j < 0:
return i
#
#
x = j - bc[ord(a[i + j])]
y = 0
# ,
if y < m - 1 and m - 1 - j > 0:
y = self.move(j, m, suffix_index, ispre)
#
i += max(x, y)
return -1
# j
def move(j: int, m: int, suffix_index: List[int], ispre: List[bool])->int:
#
k = m - 1 - j
# k
if suffix_index[k] != -1:
return j - suffix_index[k] + 1
#
for r in range(j + 2, m):
# //如果是头部
if ispre[m - r] == True:
return r
# m
return m
```
2021-03-20 07:58:25 +00:00
![](https://cdn.jsdelivr.net/gh/tan45du/photobed@master/photo/头缀函数.145da63ig3s0.png)
## KMPKnuth-Morris-Pratt
BM BM KMP BM KMP
KMP KMP
****
![KMP](https://cdn.jsdelivr.net/gh/tan45du/photobed@master/photo/KMP例子.1uirbimk5fcw.png)
绿
![](https://cdn.jsdelivr.net/gh/tan45du/photobed@master/photo/公共前后缀.3wib411usww0.png)
![](https://cdn.jsdelivr.net/gh/tan45du/photobed@master/photo/原理.bghc3ecm4z4.png)
KMP
BM bc,suffix,prefix KMP next next
next
![next](https://cdn.jsdelivr.net/gh/tan45du/photobed@master/photo/next数组.3nir7pgcs9c0.png)
next KMP next
![KMP1](https://cdn.jsdelivr.net/gh/tan45du/photobed@master/photo/KMP1.j74ujxjuq1c.png)
![kmp2](https://cdn.jsdelivr.net/gh/tan45du/photobed@master/photo/kmp2.6jx846nmyd00.png)
****
** next **
Java Code:
2021-03-20 07:58:25 +00:00
```java
class Solution {
public int strStr(String haystack, String needle) {
//两种特殊情况
if (needle.length() == 0) {
return 0;
}
if (haystack.length() == 0) {
return -1;
}
// char 数组
char[] hasyarr = haystack.toCharArray();
char[] nearr = needle.toCharArray();
//长度
int halen = hasyarr.length;
int nelen = nearr.length;
//返回下标
return kmp(hasyarr,halen,nearr,nelen);
}
public int kmp (char[] hasyarr, int halen, char[] nearr, int nelen) {
//获取next 数组
int[] next = next(nearr,nelen);
int j = 0;
for (int i = 0; i < halen; ++i) {
//发现不匹配的字符,然后根据 next 数组移动指针,移动到最大公共前后缀的,
//前缀的后一位,和咱们移动模式串的含义相同
while (j > 0 && hasyarr[i] != nearr[j]) {
j = next[j - 1] + 1;
//超出长度时,可以直接返回不存在
if (nelen - j + i > halen) {
return -1;
}
}
//如果相同就将指针同时后移一下,比较下个字符
if (hasyarr[i] == nearr[j]) {
++j;
}
//遍历完整个模式串,返回模式串的起点下标
if (j == nelen) {
return i - nelen + 1;
}
}
return -1;
}
//这一块比较难懂,不想看的同学可以忽略,了解大致含义即可,或者自己调试一下,看看运行情况
//我会每一步都写上注释
public int[] next (char[] needle,int len) {
//定义 next 数组
int[] next = new int[len];
// 初始化
next[0] = -1;
int k = -1;
for (int i = 1; i < len; ++i) {
//我们此时知道了 [0,i-1]的最长前后缀但是k+1的指向的值和i不相同时我们则需要回溯
//因为 next[k]就时用来记录子串的最长公共前后缀的尾坐标(即长度)
//就要找 k+1前一个元素在next数组里的值,即next[k+1]
while (k != -1 && needle[k + 1] != needle[i]) {
k = next[k];
}
// 相同情况,就是 k的下一位和 i 相同时,此时我们已经知道 [0,i-1]的最长前后缀
//然后 k - 1 又和 i 相同最长前后缀加1即可
if (needle[k+1] == needle[i]) {
++k;
}
next[i] = k;
}
return next;
}
}
```
Python Code:
```python
from typing import List
class Solution:
def strStr(self, haystack: str, needle: str)->int:
#
if len(needle) == 0:
return 0
if len(haystack) == 0:
return -1
#
halen = len(haystack)
nelen = len(needle)
#
return self.kmp(haystack, halen, needle, nelen)
def kmp(self, hasyarr: str, halen: int, nearr: str, nelen: int)->int:
# next
next = self.next(nearr, nelen)
j = 0
for i in range(0, halen):
# next
# ,
while j > 0 and hasyarr[i] != nearr[j]:
j = next[j - 1] + 1
#
if nelen - j + i > halen:
return -1
#
if hasyarr[i] == nearr[j]:
j += 1
#
if j == nelen:
return i - nelen + 1
return -1
#
#
def next(self, needle: str, len:int)->List[int]:
# next
next = [0] * len
#
next[0] = -1
k = -1
for i in range(1, len):
# [0,i-1]k+1i
# next[k]
# k+1next,next[k+1]
while k != -1 and needle[k + 1] != needle[i]:
k = next[k]
# k i [0,i-1]
# k - 1 i 1
if needle[k + 1] == needle[i]:
k += 1
next[i] = k
return next
```
2021-03-20 07:58:25 +00:00
[](https://cdn.jsdelivr.net/gh/tan45du/tan45du.github.io.photo@master/photo/qrcode_for_gh_1f36d2ef6df9_258.5lojyphpkso0.jpg)