算法——字符串

引言

本文整理了常见的字符串相关算法,方便以后查阅。算法相关的文章均收录于<算法系列文章>

字符串匹配

接口:

1
2
3
4
5
6
7
8
9
10
package bbm.match;

import java.util.List;

/**
* @author bbm
*/
public interface StringMatcher {
List<Integer> match(String string, String pattern);
}

测试:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
package bbm.match;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Random;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;

/**
* @author bbm
*/
class StringMatcherTest {

@Test
public void testNative() {
StringMatcher stringMatcher = new NativeStringMatcher();
List<Character> chars = Arrays.asList('1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'x');
for (int loop = 0; loop < 10000; loop++) {
StringBuilder stringBuilder = new StringBuilder();
Random random = new Random(System.currentTimeMillis());
for (int i = 0; i < 100; i++) {
int index = random.nextInt(chars.size());
stringBuilder.append(chars.get(index));
}
String string = stringBuilder.toString();
int start = random.nextInt(100);
int end = start + random.nextInt(100 - start);
String patternString = string.substring(start, end);
Pattern pattern = Pattern.compile(patternString);
Matcher matcher = pattern.matcher(string);
List<Integer> result = new ArrayList<>();
while (matcher.find()) {
result.add(matcher.start());
}
List<Integer> myResult = stringMatcher.match(string, patternString);
try {
Assertions.assertEquals(result.size(), myResult.size());
for (int i = 0; i < result.size(); i++) {
Assertions.assertEquals(result.get(i), myResult.get(i));
}
} catch (Throwable e) {
e.printStackTrace();
}
}
}
}

朴素算法

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
package bbm.match;

import java.util.LinkedList;
import java.util.List;

/**
* @author bbm
*/
public class NativeStringMatcher implements StringMatcher {
@Override
public List<Integer> match(String string, String pattern) {
List<Integer> result = new LinkedList<>();
for (int i = 0; i < string.length() - pattern.length() + 1; i++) {
boolean match = true;
for (int j = 0; j < pattern.length(); j++) {
if (pattern.charAt(j) != string.charAt(i + j)) {
match = false;
break;
}
}
if (match) {
result.add(i);
}
}
return result;
}
}

状态机

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
package bbm.match;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
* 需要根据目标字符串构建状态机,从目标字符串中找到所有出现的字符,为每个字符进行编号,然后构建 (patternLength + 1) * 字符集数量 的状态
* 转移表,构建状态转移表的一般思路是,假设 x 为 pattern 中的任意下标,stateMap[x][x + 1] = x+1 ,此时,我们要
* 计算 "pattern[0, x] + y" 的后缀和 pattern[0, x+1] 的最长前缀,其中 y 恒不等于 pattern[x+1],得到这个最大前缀 maxPrefix 之后,
* stateMap[x][y] = maxPrefix.length
*
* 初始化时间复杂度:O(m*字符集数量)
* 匹配时间复杂度:O(n)
*
* @author bbm
*/
public class StateMachineStringMatcher implements StringMatcher {

private int[][] stateMap;
private Map<Character, Integer> characters = new HashMap<>();

public StateMachineStringMatcher(String pattern) {
int index = 0;
if (!pattern.isEmpty()) {
for (int i = 0; i < pattern.length(); i++) {
if (!characters.containsKey(pattern.charAt(i))) {
characters.put(pattern.charAt(i), index++);
}
}
stateMap = new int[pattern.length() + 1][characters.size()];
for (int i = 0; i < pattern.length() + 1; i++) {
String sub = i < pattern.length() ? pattern.substring(0, i) : pattern;
int rightIndex = -1;
if (i < pattern.length()) {
rightIndex = characters.get(pattern.charAt(i));
stateMap[i][rightIndex] = i + 1;
}
for (Map.Entry<Character, Integer> entry: characters.entrySet()) {
if (!entry.getValue().equals(rightIndex)) {
for (int j = i + 1; j > 0; j--) {
String temp = sub + entry.getKey();
String prefix = j < pattern.length() ? pattern.substring(0, j) : pattern;
if (temp.endsWith(prefix)) {
stateMap[i][entry.getValue()] = Math.min(j, pattern.length());
break;
}
}
}
}
}
} else {
stateMap = null;
}
}

@Override
public List<Integer> match(String string, String pattern) {
List<Integer> result = new ArrayList<>();
if (stateMap == null) {
for (int i = 0; i < string.length() + 1; i++) {
result.add(i);
}
} else {
int state = 0;
for (int i = 0; i < string.length(); i++) {
if (characters.containsKey(string.charAt(i))) {
state = stateMap[state][characters.get(string.charAt(i))];
} else {
state = 0;
}
if (state == pattern.length()) {
result.add(i - pattern.length() + 1);
}
}
}
return result;
}
}

参考内容

[1] 《算法导论》

贝克街的流浪猫 wechat
您的打赏将鼓励我继续分享!
  • 本文作者: 贝克街的流浪猫
  • 本文链接: https://www.beikejiedeliulangmao.top/algorithm/string/
  • 版权声明: 本博客所有文章除特别声明外,均采用 BY-NC-SA 许可协议,转载请注明出处!
  • 创作声明: 本文基于上述所有参考内容进行创作,其中可能涉及复制、修改或者转换,图片均来自网络,如有侵权请联系我,我会第一时间进行删除。