数据结构详解：KMP 算法、Trie 树与并查集

KMP 算法

KMP 算法主要用于解决字符串匹配问题。给定主串 S[]（较长）和模式串 P[]，KMP 算法通常使用下标从 1 开始遍历。

暴力匹配思路

暴力做法是每当匹配到不一致的字符时，将 P 串从头开始，与 S 串的起点后移一位的位置重新匹配。当串长度很长时，这种方法会导致超时。

for(int i = 1; i <= m; i ++){
    bool flag = true;
    for(int j = 1; j <= n; j++){
        if(S[i+j-1] != P[j]){
            flag = false;
            break;
        }
    }
}

原理

暴力匹配每次遇到不同字符都让 P 从头开始，但 P 串本身存在相同的前缀和后缀。当匹配失败时，无需逐个回退，可直接跳到特定位置继续匹配。

文章配图

Next 数组

Next 数组的含义是以 i 为终点的后缀和从 1 开始的前缀相等的最长长度。即 next[i] = j 表示 P[1...j] == P[i-j+1...i]。

计算方法：假设 i-1 位置之前的最长前缀后缀长度为 j，若 P[i] == P[j+1]，则 j++，令 next[i] = j；否则利用 next[j] 继续回溯匹配。

// 求 next 数组
for(int i = 2, j = 0; i <= n; i ++){
    while(j && P[i] != P[j + 1]) j = ne[j];
    if(P[i] == P[j + 1]) j ++;
    ne[i] = j;
}

匹配过程

明确指针后，每次比较 j+1 和 i 位置是否相同。不同则 j 回退到 ne[j]，相同则 j++。

// KMP 匹配模板
#include<iostream>
using namespace std;
const int N = 1e4 + 10;
const int M = 1e5 + 10;
char S[M], P[N], ne[N];
int n, m;

int main(){
    cin >> n >> (P + 1) >> m >> (S + 1);
    // 求 next 过程
    for(int i = 2, j = 0; i <= n; i ++){
        while(j && P[i] != P[j + 1]) j = ne[j];
        if(P[i] == P[j + 1]) j ++;
        ne[i] = j;
    }
    // 匹配过程
    for(int i = 1, j = 0; i <= m; i ++){
        while(j && S[i] != P[j + 1]) j = ne[j];
        if(S[i] == P[j + 1]) j ++;
        if(j == n){
            printf("%d ", i - n);
            j = ne[j];
        }
    }
    return 0;
}

Trie 树

Trie 树（字典树）用来快速存储和查找字符串集合。

文章配图

插入时将单词字母一一存入，根节点为 0。最后一个字母做标记以区分完整单词。

#include<iostream>
using namespace std;
const int N = 1e5 + 10;
char str[N];
int son[N][26], cnt[N], idx;

// 插入操作
void insert(char str[]){
    int p = 0;
    for(int i = 0; str[i]; i ++){
        int u = str[i] - 'a';
        if(!son[p][u]) son[p][u] = ++idx;
        p = son[p][u];
    }
    cnt[p]++;
}

// 查询操作
int query(char str[]){
    int p = 0;
    for(int i = 0; str[i]; i ++){
        int u = str[i] - 'a';
        if(!son[p][u]) return 0;
        p = son[p][u];
    }
    return cnt[p];
}

int main(){
    int n;
    scanf("%d", &n);
    while(n --){
        char op[2];
        scanf("%s%s", op, str);
        if(op[0]=='I') insert(str);
        else printf("%d\n", query(str));
    }
    return 0;
}

并查集

并查集用于处理集合合并及元素所属集合判断的问题。

基本原理： 每个集合用一棵树表示，树根编号即为集合编号，p[x] 存储 x 的父节点。

判断树根： if(p[x] == x)
求集合编号： while(p[x] != x) x = p[x];
合并集合： p[find(x)] = find(y);

优化： 路径压缩。

#include<iostream>
using namespace std;
const int N = 1e5 + 10;
int p[N], n, m;

int find(int x){
    if(p[x] != x) p[x] = find(p[x]);
    return p[x];
}

int main(){
    cin >> n >> m;
    for(int i = 1; i <= n; i++) p[i] = i;
    while(m --){
        char op[2];
        int a, b;
        scanf("%s%d%d", op, &a, &b);
        if(op[0] == 'M') p[find(a)] = find(b);
        else{
            if(find(a) == find(b)) puts("Yes");
            else puts("No");
        }
    }
    return 0;
}

若需统计每个集合的元素数量，可添加 size 数组。合并时累加 size，同集合则跳过。

#include<iostream>
using namespace std;
const int N = 1e5 + 10;
int p[N], n, m, sizes[N];

int find(int x){
    if(p[x] != x) p[x] = find(p[x]);
    return p[x];
}

int main(){
    cin >> n >> m;
    for(int i = 1; i <= n; i++) p[i] = i, sizes[i] = 1;
    while(m --){
        char op[5];
        int a, b;
        scanf("%s", op);
        if(op[0] == 'C') {
            scanf("%d%d", &a, &b);
            if(find(a) == find(b)) continue;
            sizes[find(b)] += sizes[find(a)];
            p[find(a)] = find(b);
        } else if(op[1] == '1'){
            scanf("%d%d", &a, &b);
            if(find(a) == find(b)) puts("Yes");
            else puts("No");
        } else {
            scanf("%d", &a);
            printf("%d\n", sizes[find(a)]);
        }
    }
    return 0;
}

以上介绍了三种常见数据结构的核心实现与应用场景。