import os
import re
import time
import warnings
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import urljoin

import pandas as pd
import requests
import rispy
from bs4 import BeautifulSoup
# Silence urllib3 InsecureRequestWarning noise caused by the verify=False
# requests made throughout this script.
warnings.filterwarnings('ignore')
# NOTE(review): clearing CURL_CA_BUNDLE disables TLS certificate verification
# for the whole process — acceptable for these mirrors, but worth confirming.
os.environ['CURL_CA_BUNDLE'] = ''
# Path of the RIS bibliography export to process (placeholder path).
RIS_PATH = r"xxx.ris"
# Directory where downloaded PDFs and the progress log are written (placeholder path).
DOWNLOAD_DIR = r"xxx\未下载"
# CSV that tracks per-DOI download status so interrupted runs can resume.
LOG_FILE = os.path.join(DOWNLOAD_DIR, "download_status_log.csv")
# Mirror URL templates; "{doi}" is substituted with the paper's DOI.
SCIHUB_MIRRORS_TEMPLATE = [
"https://www.pismin.com/{doi}",
"https://sci-hub.st/{doi}",
"https://sci-hub.ru/{doi}",
"https://sci-hub.box/{doi}",
"https://sci-hub.red/{doi}",
"https://sci-hub.ren/{doi}",
"https://sci-hub.ee/{doi}"
]
# Browser-like headers so the mirrors serve the normal HTML landing page.
HEADERS = {
'User-Agent': 'Mozilla/5.0(Windows NT 10.0; Win64; x64) AppleWebKit/537.36(KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1'
}
def sanitize_filename(filename):
    """Make *filename* safe for use on Windows file systems.

    Replaces each illegal character with ``_``, collapses runs of
    whitespace to single spaces, trims the ends, and truncates the
    result to 200 characters. Non-string input yields a placeholder.
    """
    if not isinstance(filename, str):
        return "Unknown_Filename"
    # Map every character Windows forbids in filenames to an underscore.
    illegal = str.maketrans({ch: '_' for ch in '\\/*?:"<>|'})
    safe = filename.translate(illegal)
    # split() drops leading/trailing whitespace and squeezes interior runs.
    safe = ' '.join(safe.split())
    return safe[:200]
def parse_ris_robust(file_path):
    """Parse a RIS bibliography into the task DataFrame.

    Tries several RIS tags for DOI / title / year because exports from
    different databases are inconsistent. Returns a DataFrame with the
    columns DOI, Title, Filename, Status and Message; entries without a
    DOI are flagged 'No_DOI', the rest start as 'Pending'.
    """
    print(f"[解析] 正在读取 RIS 文件:{file_path}")
    # utf-8-sig transparently strips a BOM; errors='ignore' tolerates the
    # occasional mis-encoded byte in hand-edited exports.
    with open(file_path, 'r', encoding='utf-8-sig', errors='ignore') as f:
        entries = rispy.load(f)
    # Extracts a bare DOI out of free-text note lines ("DOI: 10.1234/abc ...").
    doi_pattern = re.compile(r'10\.\d{4,9}/\S+')
    parsed_data = []
    for entry in entries:
        doi = entry.get('doi') or entry.get('DO') or entry.get('number')
        if not doi:
            # Some exports bury the DOI inside a notes field. Extract only
            # the DOI itself rather than keeping the whole note text
            # (the old behavior produced unusable "DOIs" with extra words).
            notes = entry.get('notes', [])
            if isinstance(notes, list):
                for n in notes:
                    match = doi_pattern.search(str(n))
                    if match:
                        doi = match.group(0)
                        break
        title = (entry.get('title') or entry.get('primary_title')
                 or entry.get('TI') or entry.get('T1') or entry.get('T2'))
        if not title:
            title = "Unknown_Title"
        year = (entry.get('year') or entry.get('publication_year')
                or entry.get('PY') or entry.get('Y1'))
        if not year:
            # Fall back to the first four characters of a date tag (the year).
            date = entry.get('date') or entry.get('DA')
            year = date[:4] if date else "NoYear"
        doi = str(doi).strip() if doi else ""
        title = str(title).strip()
        year = str(year).strip()
        clean_title = sanitize_filename(title)
        filename = f"[{year}] {clean_title}.pdf"
        status = 'Pending' if doi else 'No_DOI'
        if not doi:
            print(f" [警告] 发现无 DOI 文献:{title[:30]}...")
        parsed_data.append({
            'DOI': doi, 'Title': title, 'Filename': filename,
            'Status': status, 'Message': ''
        })
    return pd.DataFrame(parsed_data)
def init_task_manager():
    """Load the progress CSV, or (re)build it from the RIS file.

    An existing log whose titles are mostly unknown is treated as the
    residue of a failed earlier parse and rebuilt from scratch. Rows left
    in 'Downloading' by an interrupted run are reset to 'Pending' so they
    get retried. Returns the task DataFrame.
    """
    os.makedirs(DOWNLOAD_DIR, exist_ok=True)
    rebuild = False
    if os.path.exists(LOG_FILE):
        df = pd.read_csv(LOG_FILE)
        # parse_ris_robust writes "Unknown_Title" (underscore); older logs
        # may contain "Unknown Title" — match both spellings. The previous
        # space-only check never matched the underscore form.
        unknown_count = df['Title'].str.contains(
            r'Unknown[_ ]Title', case=False, na=False).sum()
        if unknown_count > 5:
            print(f"[检测] 旧记录文件包含 {unknown_count} 个未知标题,判定为解析失败。正在强制重新解析 RIS...")
            rebuild = True
        else:
            print(f"[读取] 加载现有进度,共 {len(df)} 条记录。")
            df.loc[df['Status'] == 'Downloading', 'Status'] = 'Pending'
    else:
        rebuild = True
    if rebuild:
        df = parse_ris_robust(RIS_PATH)
        df.to_csv(LOG_FILE, index=False)
        print(f"[构建] 新的统计文件已创建,共 {len(df)} 条。")
    return df
def get_pdf_direct_link(session, url):
    """Fetch a Sci-Hub mirror page and return the direct PDF URL, or None.

    Looks for the PDF in an <embed type="application/pdf"> or an <iframe>
    whose src ends in .pdf, falling back to a button whose onclick sets
    location.href. Any network or parsing failure yields None.
    """
    try:
        resp = session.get(url, headers=HEADERS, timeout=10, verify=False,
                           allow_redirects=True)
        if resp.status_code != 200:
            return None
        soup = BeautifulSoup(resp.content, 'html.parser')
        target = (soup.find('embed', attrs={'type': 'application/pdf'})
                  or soup.find('iframe', attrs={'src': re.compile(r'\.pdf')}))
        # Resolve against the final (post-redirect) URL; urljoin uniformly
        # handles absolute, protocol-relative (//host/x), root-relative (/x)
        # and page-relative links — the old hand-rolled logic dropped the
        # page-relative case.
        base = resp.url or url
        if target and target.get('src'):
            return urljoin(base, target.get('src'))
        btn = soup.find('button', onclick=True)
        if btn and 'location.href' in btn['onclick']:
            match = re.search(r"href='(.*?)'", btn['onclick'])
            if match:
                return urljoin(base, match.group(1))
    except Exception:
        # Mirror unreachable or page unparseable — treat as "no PDF here".
        pass
    return None
def attempt_download_single_mirror(url_template, doi, save_path):
    """Try to download *doi* from one mirror template into *save_path*.

    Returns (True, mirror_url) on success, otherwise (False, reason).
    The session is closed on every path (the old code leaked it).
    """
    mirror_url = url_template.replace("{doi}", doi)
    with requests.Session() as session:
        # Adapter mount prefixes are URL prefixes — 'https://' is the
        # conventional form (bare 'https' happened to work by accident).
        session.mount('https://', requests.adapters.HTTPAdapter(max_retries=1))
        try:
            pdf_url = get_pdf_direct_link(session, mirror_url)
            if not pdf_url:
                return False, "Page parsed but no PDF found"
            r = session.get(pdf_url, headers=HEADERS, stream=True,
                            timeout=20, verify=False)
            # Check the status before touching the body; the old code fell
            # through to a misleading "Unknown Error" on non-200 responses.
            if r.status_code != 200:
                return False, f"HTTP {r.status_code}"
            content = r.content
            ct = r.headers.get('Content-Type', '').lower()
            # HTML or a tiny body means an error/captcha page, not a PDF.
            if 'html' in ct or len(content) < 1000:
                return False, "Not a PDF file"
            with open(save_path, 'wb') as f:
                f.write(content)
            return True, mirror_url
        except Exception as e:
            return False, str(e)
def parallel_download_handler(index, row, df):
    """Race every mirror in parallel for one DOI; first finished PDF wins.

    Each mirror downloads into its own ``.partN`` temporary file so that
    concurrent workers never write to the same path (previously all eight
    workers wrote to the same file, so two succeeding mirrors could
    interleave writes and corrupt the PDF). The winner is moved onto the
    final filename and leftover temp files are removed. Updates *df* in
    place and persists it to LOG_FILE.
    """
    doi = row['DOI']
    filename = row['Filename']
    save_path = os.path.join(DOWNLOAD_DIR, filename)
    print(f"\n--> [{index + 1}/{len(df)}] 开始并行下载:{doi}")
    print(f" 目标文件:{filename}")
    df.at[index, 'Status'] = 'Downloading'
    success = False
    winning_mirror = ""
    winning_part = None
    part_paths = []
    with ThreadPoolExecutor(max_workers=8) as executor:
        future_to_info = {}
        for i, url in enumerate(SCIHUB_MIRRORS_TEMPLATE):
            part_path = f"{save_path}.part{i}"
            part_paths.append(part_path)
            fut = executor.submit(attempt_download_single_mirror, url, doi, part_path)
            future_to_info[fut] = (url, part_path)
        for future in as_completed(future_to_info):
            url, part_path = future_to_info[future]
            try:
                is_success, msg = future.result()
            except Exception:
                continue
            if is_success:
                success = True
                winning_mirror = msg
                winning_part = part_path
                print(f" [√] 成功!来源镜像:{winning_mirror.split('/')[2]}")
                # Cancel queued mirrors; already-running ones are joined
                # when the `with` block exits below.
                executor.shutdown(wait=False, cancel_futures=True)
                break
    # All workers have been joined here, so no temp file is still open.
    if success:
        os.replace(winning_part, save_path)
    for p in part_paths:
        if p != winning_part:
            try:
                os.remove(p)
            except OSError:
                pass  # never created, or best-effort cleanup failed
    if success:
        df.at[index, 'Status'] = 'Downloaded'
        df.at[index, 'Message'] = f"From {winning_mirror}"
    else:
        df.at[index, 'Status'] = 'Failed'
        df.at[index, 'Message'] = "All mirrors failed or timed out"
        print(f" [X] 所有镜像均失败:{doi}")
    df.to_csv(LOG_FILE, index=False)
def main():
    """Entry point: load/resume the task list and process every pending DOI."""
    print("=== WoS 极速下载器 (并行版) ===")
    df = init_task_manager()
    tasks = df[df['Status'] == 'Pending']
    total = len(tasks)
    print(f"\n=== 待处理队列:{total} 个文献 ===")
    if total == 0:
        # Fixed garbled wording ("没有由于下载" made no sense).
        print("没有待下载的任务。检查是否需要重置 'Failed' 状态。")
        return
    for index, row in tasks.iterrows():
        file_path = os.path.join(DOWNLOAD_DIR, row['Filename'])
        # Skip files that already exist and are plausibly a real PDF (>2 KB).
        if os.path.exists(file_path) and os.path.getsize(file_path) > 2000:
            print(f"[{index + 1}] 文件已存在,跳过:{row['Filename']}")
            df.at[index, 'Status'] = 'Downloaded'
            df.at[index, 'Message'] = 'File exists'
            df.to_csv(LOG_FILE, index=False)
            continue
        parallel_download_handler(index, row, df)
        # Brief pause between papers to avoid hammering the mirrors.
        time.sleep(1)
# Run only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()