Python 处理 Excel 身份证号匹配的五种算法方案 | 极客日志

Python算法

Python 处理 Excel 身份证号匹配的五种算法方案

综述（由 AI 生成）：针对小表与大表身份证号匹配场景，本文对比了暴力循环、Pandas isin、Merge 合并、SQLite 查询及分块处理五种方案。测试表明，Pandas 内置的 isin 和 merge 方法在速度与代码简洁度上表现最优，适合常规数据处理；SQLite 适用于复杂逻辑或海量数据；分块处理则用于解决内存溢出问题。暴力法因效率过低不推荐实际使用。建议优先选择向量化操作以提升性能。

未来可期发布于 2026/3/16更新于 2026/6/1324 浏览

问题定义与数据准备

我们需要处理两个 Excel 文件：

small.xlsx：约 5,000 条记录。
large.xlsx：约 140,000 条记录。

目标是从大表中找出所有'身份证号'存在于小表中的记录，并保存为 result.xlsx。假设两表的字段名均为 id_card。

首先安装必要的库：

pip install pandas openpyxl

为了演示和测试性能，我们可以先生成模拟数据（实际使用时请替换为 pd.read_excel）：

import pandas as pd
import time
import random

def generate_id_card():
    """Return a mock 18-character resident ID number.

    Layout: 6-character region code + 8-digit birth date (19YYMMDD) +
    3-digit sequence code + 1 check character. The check character is
    picked at random rather than computed, so the value only looks like
    an ID number.
    """
    regions = ('110101', '310104', '440301')
    check_chars = 'X0123456789'

    region = random.choice(regions)
    # Year suffix 50-99 gives birth years 1950-1999; the day is capped at
    # 28 so the date is valid in every month.
    year_suffix = random.randint(50, 99)
    month = random.randint(1, 12)
    day = random.randint(1, 28)
    sequence = random.randint(0, 999)
    check = random.choice(check_chars)

    return '{}19{:02d}{:02d}{:02d}{:03d}{}'.format(
        region, year_suffix, month, day, sequence, check
    )

# Build the mock workbooks used by the benchmarks below.
#
# NOTE(review): the published snippet lost its literals and keywords during
# extraction. Row counts, the 'id_card' column name and the output file names
# come from the article text; the overlap pool size, the overlap probability,
# the filler column and the final messages are assumptions - confirm against
# the original article before relying on them.
SMALL_ROWS = 5000
LARGE_ROWS = 140000
OVERLAP_POOL_SIZE = 2000   # assumed
OVERLAP_PROBABILITY = 0.1  # assumed

# Small table: SMALL_ROWS random IDs.
small_data = {'id_card': [generate_id_card() for _ in range(SMALL_ROWS)]}
small_df = pd.DataFrame(small_data)
small_df.to_excel('small.xlsx', index=False)

# Large table: mostly fresh IDs, with a fraction drawn from the small table
# so that the matching algorithms have something to find.
ids_from_small = small_df['id_card'].tolist()
overlap_ids = random.sample(
    ids_from_small, min(OVERLAP_POOL_SIZE, len(ids_from_small))
)
large_list = []
for _ in range(LARGE_ROWS):
    if random.random() < OVERLAP_PROBABILITY and overlap_ids:
        id_to_use = random.choice(overlap_ids)
    else:
        id_to_use = generate_id_card()
    large_list.append(id_to_use)

# 'other_data' is a filler column so the large table has more than the key.
large_data = {'id_card': large_list, 'other_data': ['sample'] * LARGE_ROWS}
large_df = pd.DataFrame(large_data)
large_df.to_excel('large.xlsx', index=False)
print("模拟数据生成完成。")
print(f"小表：{len(small_df)} 条记录 -> small.xlsx")
print(f"大表：{len(large_df)} 条记录 -> large.xlsx")

相关免费在线工具

加密/解密文本
使用加密算法（如AES、TripleDES、Rabbit或RC4）加密和解密文本明文。在线工具，加密/解密文本在线工具，online
Gemini 图片去水印
基于开源反向 Alpha 混合算法去除 Gemini/Nano Banana 图片水印，支持批量处理与下载。在线工具，Gemini 图片去水印在线工具，online
curl 转代码
解析常见 curl 参数并生成 fetch、axios、PHP curl 或 Python requests 示例代码。在线工具，curl 转代码在线工具，online
Base64 字符串编码/解码
将字符串编码和解码为其 Base64 格式表示形式即可。在线工具，Base64 字符串编码/解码在线工具，online
Base64 文件转换器
将字符串、文件或图像转换为其 Base64 表示形式。在线工具，Base64 文件转换器在线工具，online
Markdown转HTML
将 Markdown（GFM）转为 HTML 片段，浏览器内 marked 解析；与 HTML转Markdown 互为补充。在线工具，Markdown转HTML在线工具，online

def algorithm_1_brute_force(small_file, large_file, output_file):
    """Match IDs with a naive nested scan and write the hits to Excel.

    Every ID of the large table is compared against the small table's IDs
    one by one until the first equal value is found. This is the baseline
    the other algorithms are measured against.

    Args:
        small_file: Path of the Excel file holding the IDs to look for.
        large_file: Path of the Excel file to filter.
        output_file: Path of the Excel file the matching rows are written to.

    Returns:
        Elapsed wall-clock time in seconds, including file I/O.
    """
    print("算法 1：暴力双重循环 - 开始执行")
    started = time.time()

    small_df = pd.read_excel(small_file)
    large_df = pd.read_excel(large_file)

    # Compare as text on both sides so the key types always agree.
    small_df['id_card'] = small_df['id_card'].astype(str)
    large_df['id_card'] = large_df['id_card'].astype(str)

    wanted_ids = small_df['id_card'].tolist()
    candidate_ids = large_df['id_card'].tolist()

    # any() stops at the first equal ID, like an inner loop with break.
    hit_positions = [
        position
        for position, candidate in enumerate(candidate_ids)
        if any(candidate == wanted for wanted in wanted_ids)
    ]

    result_df = large_df.iloc[hit_positions]
    result_df.to_excel(output_file, index=False)

    elapsed = time.time() - started
    print(f"算法 1 完成。找到 {len(result_df)} 条匹配记录。耗时：{elapsed:.4f} 秒")
    return elapsed

def algorithm_2_pandas_isin(small_file, large_file, output_file):
    """Filter the large table with a vectorised ``Series.isin`` lookup.

    Args:
        small_file: Path of the Excel file holding the IDs to look for.
        large_file: Path of the Excel file to filter.
        output_file: Path of the Excel file the matching rows are written to.

    Returns:
        Elapsed wall-clock time in seconds, including file I/O.
    """
    print("算法 2：Pandas isin() - 开始执行")
    started = time.time()

    small_df = pd.read_excel(small_file)
    large_df = pd.read_excel(large_file)

    # Compare as text on both sides so the key types always agree.
    small_df['id_card'] = small_df['id_card'].astype(str)
    large_df['id_card'] = large_df['id_card'].astype(str)

    wanted_ids = set(small_df['id_card'])
    is_match = large_df['id_card'].isin(wanted_ids)
    result_df = large_df[is_match]

    result_df.to_excel(output_file, index=False)

    elapsed = time.time() - started
    print(f"算法 2 完成。找到 {len(result_df)} 条匹配记录。耗时：{elapsed:.4f} 秒")
    return elapsed

def algorithm_3_pandas_merge(small_file, large_file, output_file):
    """Filter the large table with an inner merge on ``id_card``.

    The small table's key column is de-duplicated before merging. Without
    that step an ID appearing k times in the small table would emit every
    matching large-table row k times, so the result would disagree with
    the ``isin``-based algorithms.

    Args:
        small_file: Path of the Excel file holding the IDs to look for.
        large_file: Path of the Excel file to filter.
        output_file: Path of the Excel file the matching rows are written to.

    Returns:
        Elapsed wall-clock time in seconds, including file I/O.
    """
    print("算法 3：Pandas Merge - 开始执行")
    start_time = time.time()

    small_df = pd.read_excel(small_file)
    large_df = pd.read_excel(large_file)

    # Compare as text on both sides so the key types always agree.
    small_df['id_card'] = small_df['id_card'].astype(str)
    large_df['id_card'] = large_df['id_card'].astype(str)

    # Use only the key column (no extra columns in the output) and keep each
    # key once (no row multiplication on duplicate IDs).
    match_keys = small_df[['id_card']].drop_duplicates()
    result_df = pd.merge(large_df, match_keys, on='id_card', how='inner')

    result_df.to_excel(output_file, index=False)

    end_time = time.time()
    print(f"算法 3 完成。找到 {len(result_df)} 条匹配记录。耗时：{end_time - start_time:.4f} 秒")
    return end_time - start_time

import sqlite3

def algorithm_4_sqlite(small_file, large_file, output_file):
    """Filter the large table with an ``IN (subquery)`` on in-memory SQLite.

    Both tables are loaded into a ``:memory:`` database and the matching
    rows are selected with SQL. The connection is closed even when loading
    or querying fails.

    Args:
        small_file: Path of the Excel file holding the IDs to look for.
        large_file: Path of the Excel file to filter.
        output_file: Path of the Excel file the matching rows are written to.

    Returns:
        Elapsed wall-clock time in seconds, including file I/O.
    """
    print("算法 4：SQLite 内存数据库 - 开始执行")
    start_time = time.time()

    small_df = pd.read_excel(small_file)
    large_df = pd.read_excel(large_file)

    # Compare as text on both sides so the key types always agree.
    small_df['id_card'] = small_df['id_card'].astype(str)
    large_df['id_card'] = large_df['id_card'].astype(str)

    query = "SELECT large_table.* FROM large_table WHERE large_table.id_card IN (SELECT id_card FROM small_table)"

    conn = sqlite3.connect(':memory:')
    try:
        small_df.to_sql('small_table', conn, index=False)
        large_df.to_sql('large_table', conn, index=False)
        result_df = pd.read_sql_query(query, conn)
    finally:
        # Release the connection even if loading or querying raised.
        conn.close()

    result_df.to_excel(output_file, index=False)

    end_time = time.time()
    print(f"算法 4 完成。找到 {len(result_df)} 条匹配记录。耗时：{end_time - start_time:.4f} 秒")
    return end_time - start_time

def algorithm_5_chunking(small_file, large_file, output_file, chunksize=10000):
    """Filter the large table in row windows of at most ``chunksize`` rows.

    ``pd.read_excel`` has no ``chunksize`` option (unlike ``pd.read_csv``),
    so passing one raises. Chunking is emulated here by reading successive
    row windows with ``skiprows``/``nrows`` from a single ``pd.ExcelFile``.
    Only one window is held as a DataFrame at a time; each read still has
    to skip past the earlier rows, so this trades speed for a bounded
    DataFrame size.

    Args:
        small_file: Path of the Excel file holding the IDs to look for.
        large_file: Path of the Excel file to filter.
        output_file: Path of the Excel file the matching rows are written to.
        chunksize: Maximum number of data rows read per window.

    Returns:
        Elapsed wall-clock time in seconds, including file I/O.

    Raises:
        ValueError: If ``chunksize`` is not positive.
    """
    if chunksize <= 0:
        raise ValueError("chunksize must be a positive integer")

    print("算法 5：分块处理 - 开始执行")
    start_time = time.time()

    small_df = pd.read_excel(small_file)
    small_df['id_card'] = small_df['id_card'].astype(str)
    target_set = set(small_df['id_card'])

    chunks_result_list = []
    with pd.ExcelFile(large_file) as workbook:
        offset = 0  # number of data rows already processed
        while True:
            # Row 0 is the header and is always kept; rows 1..offset are the
            # data rows handled by earlier windows.
            chunk = workbook.parse(
                skiprows=range(1, offset + 1),
                nrows=chunksize,
            )
            if chunk.empty:
                break

            chunk['id_card'] = chunk['id_card'].astype(str)
            chunks_result_list.append(chunk[chunk['id_card'].isin(target_set)])

            # A short window means the end of the sheet was reached.
            if len(chunk) < chunksize:
                break
            offset += chunksize

    if chunks_result_list:
        final_result_df = pd.concat(chunks_result_list, ignore_index=True)
    else:
        final_result_df = pd.DataFrame()

    final_result_df.to_excel(output_file, index=False)

    end_time = time.time()
    print(f"算法 5 完成。找到 {len(final_result_df)} 条匹配记录。耗时：{end_time - start_time:.4f} 秒")
    return end_time - start_time

算法	优点	缺点	预计时间	推荐度
暴力循环	逻辑简单	速度极慢	~30 分钟以上	⭐（绝不推荐）
Pandas isin	实现简单，速度最快	需内存容纳集合	<1 秒	⭐⭐⭐⭐⭐（首选）
Pandas Merge	实现简单，语义清晰	略有额外开销	~1 秒	⭐⭐⭐⭐⭐（首选）
SQLite	支持复杂查询	步骤较多	1-3 秒	⭐⭐⭐⭐（备用）
分块处理	内存友好	速度较慢	2-5 秒	⭐⭐⭐（特殊场景）

if __name__ == '__main__':
    # Benchmark driver: run algorithms 2-5 on the same pair of workbooks and
    # report each one's elapsed time. Algorithm 1 is not part of this run.
    files = ('small.xlsx', 'large.xlsx')
    benchmark_plan = (
        ('alg_2', algorithm_2_pandas_isin, 'result_2.xlsx', {}),
        ('alg_3', algorithm_3_pandas_merge, 'result_3.xlsx', {}),
        ('alg_4', algorithm_4_sqlite, 'result_4.xlsx', {}),
        ('alg_5', algorithm_5_chunking, 'result_5.xlsx', {'chunksize': 50000}),
    )

    times = {}
    for label, runner, result_path, options in benchmark_plan:
        times[label] = runner(*files, result_path, **options)

    print("\n=== 所有算法耗时对比 ===")
    for alg, t in times.items():
        print(f"{alg}: {t:.4f} 秒")

Python 处理 Excel 身份证号匹配的五种算法方案

问题定义与数据准备

更多推荐文章

相关免费在线工具

五种匹配方案对比

1. 暴力双重循环

2. Pandas isin() 方法

3. Pandas Merge（合并）

4. SQLite 内存数据库

5. 分块处理 (Chunking)

总结与建议

更多推荐文章

相关免费在线工具

Python 处理 Excel 身份证号匹配的五种算法方案

问题定义与数据准备

微信扫一扫，关注极客日志

更多推荐文章

相关免费在线工具

五种匹配方案对比

1. 暴力双重循环

2. Pandas isin() 方法

3. Pandas Merge（合并）

4. SQLite 内存数据库

5. 分块处理 (Chunking)

总结与建议

微信扫一扫，关注极客日志

更多推荐文章

相关免费在线工具