Python 基于 Streamlit 实现 PDF 文档文字提取
本文介绍如何使用 Python 和 Streamlit 构建一个简单的工具,用于从 PDF 文档中提取文本内容,并支持加密 PDF 的密码输入处理。
环境依赖
安装所需的 Python 库:
pip install streamlit pdfplumber
核心代码
以下是完整的 Main.py 脚本示例。该程序实现了文件上传、文本提取(含加密处理)及简单的问答展示功能。
import os
import tempfile
import traceback
import streamlit as st
from typing import Optional, List
import pdfplumber
class DocumentQASystem:
    """Streamlit front end for a self-hosted PDF question-answering demo.

    The page lets the user upload a PDF, extracts its text with pdfplumber
    (prompting for a password when the file is encrypted), shows a preview
    of the extracted text and answers questions with a mock "LLM".
    """

    # Exception class names that signal "this PDF needs a (different)
    # password". They are matched by name so that no extra import is needed.
    # NOTE(review): the names come from pdfminer.six, which pdfplumber builds
    # on; some pdfplumber releases appear to wrap them in their own exception
    # type, which is why the whole exception chain is inspected below.
    _PASSWORD_ERROR_NAMES = frozenset({"PDFPasswordIncorrect", "PDFEncryptionError"})

    def __init__(self):
        """Configure the page and initialise the session-state slots used by the UI."""
        st.set_page_config(
            page_title="PDF 文档问答系统",
            layout="wide",
            initial_sidebar_state="collapsed"
        )
        if "doc_text" not in st.session_state:
            st.session_state.doc_text = ""
        # Counter baked into widget keys; bumping it gives the uploader (and
        # the password box) a fresh identity, which is how "clear" resets them.
        if "uploader_nonce" not in st.session_state:
            st.session_state.uploader_nonce = 0
        self._set_custom_style()

    def _set_custom_style(self):
        """Inject the small CSS tweaks used by buttons, inputs and the preview box."""
        st.markdown("""
<style>
.stButton>button {width: 100%; margin-top: 10px;}
.stTextInput>div>div>input {padding: 8px;}
.doc-preview {max-height: 400px; overflow-y: auto; border: 1px solid #eee; padding: 10px; border-radius: 5px;}
</style>
""", unsafe_allow_html=True)

    @classmethod
    def _is_password_error(cls, exc: BaseException) -> bool:
        """Return True if *exc*, or any exception it wraps, is a PDF password error.

        The search follows ``__cause__``, ``__context__`` and exception
        instances stored in ``args`` so wrapped errors are recognised too.
        """
        pending = [exc]
        visited = set()
        while pending:
            current = pending.pop()
            if current is None or id(current) in visited:
                continue
            visited.add(id(current))
            if type(current).__name__ in cls._PASSWORD_ERROR_NAMES:
                return True
            pending.extend(a for a in current.args if isinstance(a, BaseException))
            pending.append(current.__cause__)
            pending.append(current.__context__)
        return False

    @staticmethod
    def _read_pages(pdf_path: str, password: Optional[str] = None) -> str:
        """Open *pdf_path* (optionally with *password*) and join the text of all pages.

        Pages without extractable text are skipped. Exceptions raised while
        opening or parsing the file propagate to the caller.
        """
        parts = []
        with pdfplumber.open(pdf_path, password=password) as pdf:
            for page_num, page in enumerate(pdf.pages, 1):
                text = page.extract_text()
                if text:
                    parts.append(f"【第 {page_num} 页】\n{text}")
        return "\n\n".join(parts)

    def _extract_pdf_text(self, pdf_path: str) -> str:
        """Extract the text of the PDF at *pdf_path*, asking for a password if needed.

        Returns the extracted text with surrounding whitespace stripped, or an
        empty string when nothing could be extracted (failure, missing or
        wrong password). Problems are reported through Streamlit messages
        rather than raised.
        """
        try:
            return self._read_pages(pdf_path).strip()
        except ImportError:
            st.error("缺少依赖库,请先执行:pip install pdfplumber")
            return ""
        except Exception as e:
            if not self._is_password_error(e):
                st.error(f"PDF 文本提取失败:{e}")
                return ""

        # Only reached for encrypted documents: ask for the password. Streamlit
        # reruns the script when the user submits it, so this method is
        # entered again with the value available.
        st.warning("该 PDF 文档已加密,需要输入密码才能读取内容")
        nonce = st.session_state.get("uploader_nonce", 0)
        pdf_password = st.text_input(
            "PDF 密码", type="password", key=f"pdf_password_{nonce}",
            help="输入密码后按回车确认"
        )
        if not pdf_password:
            st.info("请输入 PDF 密码以继续提取文本")
            return ""
        try:
            return self._read_pages(pdf_path, password=pdf_password).strip()
        except Exception:
            st.error("密码错误或文档无法解密,请检查后重试")
            return ""

    def _mock_llm_answer(self, question: str, doc_text: str) -> str:
        """Return a placeholder answer standing in for a real LLM call.

        The reply echoes the question, reports the document length and quotes
        the beginning of the document.
        """
        question = (question or "").strip()
        doc_text = doc_text or ""
        if not question:
            return "请输入问题。"
        if not doc_text:
            return "请先上传 PDF 文档。"
        snippet = doc_text[:200]
        suffix = "……" if len(doc_text) > len(snippet) else ""
        return (
            f"【模拟回答】针对问题“{question}”:文档共 {len(doc_text)} 个字符,"
            f"以下为文档开头内容供参考:\n\n{snippet}{suffix}"
        )

    def render_ui(self):
        """Draw the whole page: upload/question column and answer/preview column."""
        # Imported locally so this method does not depend on the module's import block.
        import html

        st.title("📄 PDF 文档问答系统")
        st.divider()
        col1, col2 = st.columns([1, 1])

        with col1:
            st.subheader("文档上传与提问")
            uploaded_file = st.file_uploader(
                "上传 PDF 文档",
                type=["pdf"],
                key=f"pdf_uploader_{st.session_state.uploader_nonce}",
            )
            if uploaded_file is not None:
                with st.spinner("正在提取文档内容..."):
                    # pdfplumber is given a path, so spill the upload to disk.
                    # getvalue() does not depend on the stream position, which
                    # matters because this code runs again on every rerun.
                    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
                        tmp_file.write(uploaded_file.getvalue())
                        tmp_path = tmp_file.name
                    try:
                        st.session_state.doc_text = self._extract_pdf_text(tmp_path)
                    finally:
                        # Best-effort cleanup; a leftover temp file is not fatal.
                        try:
                            os.unlink(tmp_path)
                        except OSError as e:
                            st.warning(f"临时文件清理失败:{e}")

            if st.session_state.doc_text:
                st.success(f"文档加载成功,共 {len(st.session_state.doc_text)} 个字符")
                if st.button("清除文档", type="secondary"):
                    st.session_state.doc_text = ""
                    # Give the uploader a new key so the file it still holds
                    # is dropped; otherwise the rerun would re-extract it.
                    st.session_state.uploader_nonce += 1
                    st.rerun()

            st.divider()
            question = st.text_input(
                "请输入您的问题",
                placeholder="例如:这份文档的主要内容是什么?",
                disabled=not st.session_state.doc_text
            )
            submit_btn = st.button(
                "提交问题",
                type="primary",
                disabled=not (question and st.session_state.doc_text)
            )

        with col2:
            st.subheader("回答与文档预览")
            if submit_btn:
                with st.spinner("正在生成回答..."):
                    answer = self._mock_llm_answer(question, st.session_state.doc_text)
                st.markdown("### 回答")
                st.write(answer)
                st.divider()

            if st.session_state.doc_text:
                st.markdown("### 文档内容预览")
                # The text comes from an untrusted PDF and is rendered as raw
                # HTML, so escape it. Newlines become <br> because a blank
                # line would end the HTML block in Markdown.
                preview = html.escape(st.session_state.doc_text).replace("\n", "<br>")
                st.markdown(
                    f'<div class="doc-preview">{preview}</div>',
                    unsafe_allow_html=True
                )
            else:
                st.info("请先在左侧上传 PDF 文档")

        st.divider()
        st.caption("PDF 文档问答系统 · 基于 Streamlit 与 pdfplumber")
def main():
    """Create the application object and render its Streamlit page."""
    qa_system = DocumentQASystem()
    qa_system.render_ui()
# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    main()


