import os
import re
import time
import warnings
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import urljoin

import pandas as pd
import requests
import rispy
from bs4 import BeautifulSoup
# Silence urllib3 InsecureRequestWarning noise caused by the verify=False
# requests made throughout this script.
warnings.filterwarnings('ignore')
# NOTE(review): clearing CURL_CA_BUNDLE disables TLS certificate verification
# for the whole process — acceptable for these mirrors, but worth confirming.
os.environ['CURL_CA_BUNDLE'] = ''
# Path of the RIS bibliography export to process (placeholder path).
RIS_PATH = r"xxx.ris"
# Directory where downloaded PDFs and the progress log are written (placeholder path).
DOWNLOAD_DIR = r"xxx\未下载"
# CSV that tracks per-DOI download status so interrupted runs can resume.
LOG_FILE = os.path.join(DOWNLOAD_DIR, "download_status_log.csv")
# Mirror URL templates; "{doi}" is substituted with the paper's DOI.
SCIHUB_MIRRORS_TEMPLATE = [
"https://www.pismin.com/{doi}",
"https://sci-hub.st/{doi}",
"https://sci-hub.ru/{doi}",
"https://sci-hub.box/{doi}",
"https://sci-hub.red/{doi}",
"https://sci-hub.ren/{doi}",
"https://sci-hub.ee/{doi}"
]
# Browser-like headers so the mirrors serve the normal HTML landing page.
HEADERS = {
'User-Agent': 'Mozilla/5.0(Windows NT 10.0; Win64; x64) AppleWebKit/537.36(KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1'
}
def sanitize_filename(filename):
    """Make *filename* safe for use on Windows file systems.

    Replaces each illegal character with ``_``, collapses runs of
    whitespace to single spaces, trims the ends, and truncates the
    result to 200 characters. Non-string input yields a placeholder.
    """
    if not isinstance(filename, str):
        return "Unknown_Filename"
    # Map every character Windows forbids in filenames to an underscore.
    illegal = str.maketrans({ch: '_' for ch in '\\/*?:"<>|'})
    safe = filename.translate(illegal)
    # split() drops leading/trailing whitespace and squeezes interior runs.
    safe = ' '.join(safe.split())
    return safe[:200]
def parse_ris_robust(file_path):
    """Parse a RIS bibliography into the task DataFrame.

    Tries several RIS tags for DOI / title / year because exports from
    different databases are inconsistent. Returns a DataFrame with the
    columns DOI, Title, Filename, Status and Message; entries without a
    DOI are flagged 'No_DOI', the rest start as 'Pending'.
    """
    print(f"[解析] 正在读取 RIS 文件:{file_path}")
    # utf-8-sig transparently strips a BOM; errors='ignore' tolerates the
    # occasional mis-encoded byte in hand-edited exports.
    with open(file_path, 'r', encoding='utf-8-sig', errors='ignore') as f:
        entries = rispy.load(f)
    # Extracts a bare DOI out of free-text note lines ("DOI: 10.1234/abc ...").
    doi_pattern = re.compile(r'10\.\d{4,9}/\S+')
    parsed_data = []
    for entry in entries:
        doi = entry.get('doi') or entry.get('DO') or entry.get('number')
        if not doi:
            # Some exports bury the DOI inside a notes field. Extract only
            # the DOI itself rather than keeping the whole note text
            # (the old behavior produced unusable "DOIs" with extra words).
            notes = entry.get('notes', [])
            if isinstance(notes, list):
                for n in notes:
                    match = doi_pattern.search(str(n))
                    if match:
                        doi = match.group(0)
                        break
        title = (entry.get('title') or entry.get('primary_title')
                 or entry.get('TI') or entry.get('T1') or entry.get('T2'))
        if not title:
            title = "Unknown_Title"
        year = (entry.get('year') or entry.get('publication_year')
                or entry.get('PY') or entry.get('Y1'))
        if not year:
            # Fall back to the first four characters of a date tag (the year).
            date = entry.get('date') or entry.get('DA')
            year = date[:4] if date else "NoYear"
        doi = str(doi).strip() if doi else ""
        title = str(title).strip()
        year = str(year).strip()
        clean_title = sanitize_filename(title)
        filename = f"[{year}] {clean_title}.pdf"
        status = 'Pending' if doi else 'No_DOI'
        if not doi:
            print(f" [警告] 发现无 DOI 文献:{title[:30]}...")
        parsed_data.append({
            'DOI': doi, 'Title': title, 'Filename': filename,
            'Status': status, 'Message': ''
        })
    return pd.DataFrame(parsed_data)
def init_task_manager():
    """Load the progress CSV, or (re)build it from the RIS file.

    An existing log whose titles are mostly unknown is treated as the
    residue of a failed earlier parse and rebuilt from scratch. Rows left
    in 'Downloading' by an interrupted run are reset to 'Pending' so they
    get retried. Returns the task DataFrame.
    """
    os.makedirs(DOWNLOAD_DIR, exist_ok=True)
    rebuild = False
    if os.path.exists(LOG_FILE):
        df = pd.read_csv(LOG_FILE)
        # parse_ris_robust writes "Unknown_Title" (underscore); older logs
        # may contain "Unknown Title" — match both spellings. The previous
        # space-only check never matched the underscore form.
        unknown_count = df['Title'].str.contains(
            r'Unknown[_ ]Title', case=False, na=False).sum()
        if unknown_count > 5:
            print(f"[检测] 旧记录文件包含 {unknown_count} 个未知标题,判定为解析失败。正在强制重新解析 RIS...")
            rebuild = True
        else:
            print(f"[读取] 加载现有进度,共 {len(df)} 条记录。")
            df.loc[df['Status'] == 'Downloading', 'Status'] = 'Pending'
    else:
        rebuild = True
    if rebuild:
        df = parse_ris_robust(RIS_PATH)
        df.to_csv(LOG_FILE, index=False)
        print(f"[构建] 新的统计文件已创建,共 {len(df)} 条。")
    return df
def get_pdf_direct_link(session, url):
    """Fetch a Sci-Hub mirror page and return the direct PDF URL, or None.

    Looks for the PDF in an <embed type="application/pdf"> or an <iframe>
    whose src ends in .pdf, falling back to a button whose onclick sets
    location.href. Any network or parsing failure yields None.
    """
    try:
        resp = session.get(url, headers=HEADERS, timeout=10, verify=False,
                           allow_redirects=True)
        if resp.status_code != 200:
            return None
        soup = BeautifulSoup(resp.content, 'html.parser')
        target = (soup.find('embed', attrs={'type': 'application/pdf'})
                  or soup.find('iframe', attrs={'src': re.compile(r'\.pdf')}))
        # Resolve against the final (post-redirect) URL; urljoin uniformly
        # handles absolute, protocol-relative (//host/x), root-relative (/x)
        # and page-relative links — the old hand-rolled logic dropped the
        # page-relative case.
        base = resp.url or url
        if target and target.get('src'):
            return urljoin(base, target.get('src'))
        btn = soup.find('button', onclick=True)
        if btn and 'location.href' in btn['onclick']:
            match = re.search(r"href='(.*?)'", btn['onclick'])
            if match:
                return urljoin(base, match.group(1))
    except Exception:
        # Mirror unreachable or page unparseable — treat as "no PDF here".
        pass
    return None
def attempt_download_single_mirror(url_template, doi, save_path):
    """Try to download *doi* from one mirror template into *save_path*.

    Returns (True, mirror_url) on success, otherwise (False, reason).
    The session is closed on every path (the old code leaked it).
    """
    mirror_url = url_template.replace("{doi}", doi)
    with requests.Session() as session:
        # Adapter mount prefixes are URL prefixes — 'https://' is the
        # conventional form (bare 'https' happened to work by accident).
        session.mount('https://', requests.adapters.HTTPAdapter(max_retries=1))
        try:
            pdf_url = get_pdf_direct_link(session, mirror_url)
            if not pdf_url:
                return False, "Page parsed but no PDF found"
            r = session.get(pdf_url, headers=HEADERS, stream=True,
                            timeout=20, verify=False)
            # Check the status before touching the body; the old code fell
            # through to a misleading "Unknown Error" on non-200 responses.
            if r.status_code != 200:
                return False, f"HTTP {r.status_code}"
            content = r.content
            ct = r.headers.get('Content-Type', '').lower()
            # HTML or a tiny body means an error/captcha page, not a PDF.
            if 'html' in ct or len(content) < 1000:
                return False, "Not a PDF file"
            with open(save_path, 'wb') as f:
                f.write(content)
            return True, mirror_url
        except Exception as e:
            return False, str(e)
def parallel_download_handler(index, row, df):
    """Race every mirror in parallel for one DOI; first finished PDF wins.

    Each mirror downloads into its own ``.partN`` temporary file so that
    concurrent workers never write to the same path (previously all eight
    workers wrote to the same file, so two succeeding mirrors could
    interleave writes and corrupt the PDF). The winner is moved onto the
    final filename and leftover temp files are removed. Updates *df* in
    place and persists it to LOG_FILE.
    """
    doi = row['DOI']
    filename = row['Filename']
    save_path = os.path.join(DOWNLOAD_DIR, filename)
    print(f"\n--> [{index + 1}/{len(df)}] 开始并行下载:{doi}")
    print(f" 目标文件:{filename}")
    df.at[index, 'Status'] = 'Downloading'
    success = False
    winning_mirror = ""
    winning_part = None
    part_paths = []
    with ThreadPoolExecutor(max_workers=8) as executor:
        future_to_info = {}
        for i, url in enumerate(SCIHUB_MIRRORS_TEMPLATE):
            part_path = f"{save_path}.part{i}"
            part_paths.append(part_path)
            fut = executor.submit(attempt_download_single_mirror, url, doi, part_path)
            future_to_info[fut] = (url, part_path)
        for future in as_completed(future_to_info):
            url, part_path = future_to_info[future]
            try:
                is_success, msg = future.result()
            except Exception:
                continue
            if is_success:
                success = True
                winning_mirror = msg
                winning_part = part_path
                print(f" [√] 成功!来源镜像:{winning_mirror.split('/')[2]}")
                # Cancel queued mirrors; already-running ones are joined
                # when the `with` block exits below.
                executor.shutdown(wait=False, cancel_futures=True)
                break
    # All workers have been joined here, so no temp file is still open.
    if success:
        os.replace(winning_part, save_path)
    for p in part_paths:
        if p != winning_part:
            try:
                os.remove(p)
            except OSError:
                pass  # never created, or best-effort cleanup failed
    if success:
        df.at[index, 'Status'] = 'Downloaded'
        df.at[index, 'Message'] = f"From {winning_mirror}"
    else:
        df.at[index, 'Status'] = 'Failed'
        df.at[index, 'Message'] = "All mirrors failed or timed out"
        print(f" [X] 所有镜像均失败:{doi}")
    df.to_csv(LOG_FILE, index=False)
def main():
    """Entry point: load/resume the task list and process every pending DOI."""
    print("=== WoS 极速下载器 (并行版) ===")
    df = init_task_manager()
    tasks = df[df['Status'] == 'Pending']
    total = len(tasks)
    print(f"\n=== 待处理队列:{total} 个文献 ===")
    if total == 0:
        # Fixed garbled wording ("没有由于下载" made no sense).
        print("没有待下载的任务。检查是否需要重置 'Failed' 状态。")
        return
    for index, row in tasks.iterrows():
        file_path = os.path.join(DOWNLOAD_DIR, row['Filename'])
        # Skip files that already exist and are plausibly a real PDF (>2 KB).
        if os.path.exists(file_path) and os.path.getsize(file_path) > 2000:
            print(f"[{index + 1}] 文件已存在,跳过:{row['Filename']}")
            df.at[index, 'Status'] = 'Downloaded'
            df.at[index, 'Message'] = 'File exists'
            df.to_csv(LOG_FILE, index=False)
            continue
        parallel_download_handler(index, row, df)
        # Brief pause between papers to avoid hammering the mirrors.
        time.sleep(1)
# Run only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()