def fetch_details(pmids, batch_size=200):
    """Fetch MEDLINE records for a list of PMIDs in batches.

    The PMIDs are requested from the ``pubmed`` database through
    ``Entrez.efetch`` in slices of ``batch_size`` so that a single request
    URL does not grow too long. A batch that fails is reported on stdout and
    skipped; the remaining batches are still attempted.

    Args:
        pmids: Sequence of PMIDs. Every item is converted with ``str()``
            before being joined, so integer PMIDs are accepted as well.
        batch_size: Number of PMIDs per request (the original author
            recommends 200-500). Must be at least 1.

    Returns:
        list: The records produced by ``Medline.parse`` for every batch that
        succeeded, in request order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    all_records = []
    # Work through the PMIDs in slices to keep each request URL short.
    for start in range(0, len(pmids), batch_size):
        batch_pmids = [str(pmid) for pmid in pmids[start:start + batch_size]]
        print(f"📥 正在获取第 {start+1}-{start+len(batch_pmids)} 篇文献...")
        try:
            handle = Entrez.efetch(
                db="pubmed",
                id=",".join(batch_pmids),  # PMIDs are comma-separated
                rettype="medline",
                retmode="text",
            )
            try:
                # Materialise the whole batch before extending so that a
                # batch failing halfway through contributes nothing.
                batch_records = list(Medline.parse(handle))
            finally:
                # Close the handle even when parsing raises.
                handle.close()
            all_records.extend(batch_records)
        except Exception as e:
            # Best effort: report the failed batch and move on to the next.
            print(f"❌ 批次失败:{e}")

    print(f"✅ 共获取 {len(all_records)} 篇文献详情")
    return all_records
# Preferred alternative: parse the XML output, which carries more fields.
def fetch_details_xml(pmids):
    """Fetch PubMed records as XML and flatten them into plain dicts.

    All PMIDs are sent in a single ``Entrez.efetch`` request and the response
    is parsed with ``Entrez.read``. Any error raised while requesting or
    parsing is printed and swallowed.

    Args:
        pmids: Collection of PMIDs. Every item is converted with ``str()``
            before being joined with commas.

    Returns:
        list: One dict per entry of ``records['PubmedArticle']`` with the
        keys ``pmid`` (str), ``title``, ``abstract`` (all abstract sections
        joined with a space, ``""`` when absent), ``authors`` (list of
        "LastName ForeName" strings), ``journal``, ``pub_date`` (the raw
        ``PubDate`` element) and ``doi`` (str or ``None``). An empty list is
        returned when ``pmids`` is empty or when the request/parsing fails.
    """
    if not pmids:
        # Nothing to look up; avoid sending a request with an empty id list.
        return []

    try:
        handle = Entrez.efetch(
            db="pubmed",
            id=",".join(str(pmid) for pmid in pmids),
            rettype="xml"
        )
        try:
            records = Entrez.read(handle)
        finally:
            # Close the handle even when parsing raises.
            handle.close()

        articles = []
        for article in records['PubmedArticle']:
            medline = article['MedlineCitation']
            citation = medline['Article']

            # An abstract may consist of several sections; keep all of them.
            abstract_parts = citation.get('Abstract', {}).get('AbstractText', [])
            if isinstance(abstract_parts, str):
                abstract_parts = [abstract_parts]

            authors = []
            for author in citation.get('AuthorList', []):
                name_parts = (author.get('LastName', ''), author.get('ForeName', ''))
                full_name = " ".join(part for part in name_parts if part)
                # Presumably group authors carry only 'CollectiveName'
                # (per the PubMed XML format) — verify against real records.
                authors.append(full_name or author.get('CollectiveName', ''))

            paper = {
                "pmid": str(medline['PMID']),  # always hand back a plain string
                "title": citation['ArticleTitle'],
                "abstract": " ".join(str(part) for part in abstract_parts),
                "authors": authors,
                "journal": citation['Journal']['Title'],
                "pub_date": citation['Journal']['JournalIssue']['PubDate'],
                "doi": None,  # filled in from ArticleIdList below
            }

            # The DOI is the ArticleIdList entry whose IdType attribute is 'doi'.
            id_list = article.get('PubmedData', {}).get('ArticleIdList', [])
            for id_item in id_list:
                if id_item.attributes.get('IdType') == 'doi':
                    paper['doi'] = str(id_item)
                    break
            articles.append(paper)
        return articles
    except Exception as e:
        # Best effort: report the failure and return no results.
        print(f"❌ XML 解析失败:{e}")
        return []
# Test code
if __name__ == "__main__":
    # Search first, then fetch details only when the search found something.
    result = search_pubmed("machine learning healthcare", max_results=5)
    found_pmids = result['pmids']
    details = fetch_details_xml(found_pmids) if found_pmids else []

    # Print the first paper only (the loop body runs at most once).
    for paper in details[:1]:
        print("\n"+"="*50)
        print(f"标题:{paper['title']}")
        print(f"作者:{', '.join(paper['authors'][:3])}...")
        print(f"期刊:{paper['journal']}")
        print(f"摘要:{paper['abstract'][:200]}...")
        print(f"DOI: {paper['doi']}")
运行结果示例:
📥 正在获取第 1-5 篇文献...
✅ 共获取 5 篇文献详情
==================================================
标题:Machine Learning in Healthcare: A Review
作者:Smith J, Wang L, Johnson M...
期刊:Journal of Medical Systems
摘要:Machine learning has revolutionized healthcare by enabling predictive analytics...
DOI: 10.1007/s10916-024-12345-6
📊 性能优化与限流处理
限流策略详解
根据 NCBI 官方政策:
| 配置 | 限流速率 | 适用场景 |
| --- | --- | --- |
| 无 API Key | 3 请求/秒 | 小规模测试 |
| 有 API Key | 10 请求/秒 | 生产环境 |
Biopython 自动限流机制:
# Biopython works out the delay between requests internally,
# so there is no need to add time.sleep() by hand.
from Bio import Entrez

# With an API key: requests are automatically spaced 0.1 s apart (10 req/s).
Entrez.api_key = "your_key"
# Without an API key: requests are automatically spaced 0.34 s apart (3 req/s).
# 某些文献标题包含 HTML 实体或标签
# 例如:"COVID&#8209;19" 或 "<i>in vivo</i>"
解决方案:
import html
def clean_text(text):
    """Decode HTML entities and normalise thin spaces in ``text``.

    Relies on the module-level ``import html``.

    Args:
        text: Value to clean. Anything that is not a ``str`` is passed
            through untouched.

    Returns:
        For a ``str``: the text with HTML entities decoded by
        ``html.unescape`` and every U+2009 THIN SPACE replaced by a regular
        space. Any other value is returned unchanged.
    """
    if not isinstance(text, str):
        return text
    # Decode first so that an entity such as "&thinsp;" is normalised too.
    return html.unescape(text).replace("\u2009", " ")
# Usage example (assumes `article` is a mapping with a 'title' entry — TODO confirm)
title = clean_text(article['title'])
坑 2:PMID 格式不一致
问题: Entrez 返回的 PMID 有时是字符串,有时是整数
解决方案:
pmid = str(medline['PMID']) # always convert the PMID to a string
坑 3:超过 10000 条结果的分页获取
问题: 对 PubMed 数据库,ESearch 只能取回匹配结果中的前 10000 条,无法通过增大 retstart 翻页越过这一上限
解决方案:
def search_large_dataset(query, total_needed=50000, years=None):
    """Collect more PMIDs than one search returns by querying year by year.

    The query is restricted to one publication year at a time
    (``<query> AND <year>[PDAT]``) and each restricted query is sent to
    ``search_pubmed`` with ``max_results=10000``. Collection stops as soon as
    ``total_needed`` PMIDs have been gathered. A year with more than 10000
    matches is still cut off at 10000 by the per-query limit.

    Args:
        query: Search expression passed on to ``search_pubmed``.
        total_needed: Maximum number of PMIDs to return.
        years: Iterable of publication years to query, in order. Defaults to
            2020 through 2024.

    Returns:
        list: At most ``total_needed`` PMIDs in the order they were found,
        with repeated PMIDs dropped. Empty when ``total_needed`` is not
        positive or ``years`` is empty.
    """
    if total_needed <= 0:
        return []
    if years is None:
        years = range(2020, 2025)

    all_pmids = []
    seen = set()
    # Split the search by publication year to stay under the per-query cap.
    for year in years:
        yearly_query = f"{query} AND {year}[PDAT]"
        result = search_pubmed(yearly_query, max_results=10000)
        for pmid in result['pmids']:
            # The same PMID may come back for more than one year; keep the
            # first occurrence only so the total is not inflated.
            key = str(pmid)
            if key not in seen:
                seen.add(key)
                all_pmids.append(pmid)
        if len(all_pmids) >= total_needed:
            break
    return all_pmids[:total_needed]