import requests
import csv
import time
import re
import random
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
class DoubanTop250Crawler:
    """Crawler for the Douban Movie Top 250 chart.

    Fetches all 10 chart pages (25 movies each), parses per-movie fields
    with BeautifulSoup + regexes, and saves the results to both a CSV file
    and a Markdown watch-list.
    """

    def __init__(self):
        """Initialize browser-like request headers and the result buffer."""
        self.ua = UserAgent()
        self.headers = {
            'User-Agent': self.ua.random,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'Referer': 'https://movie.douban.com/',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1'
        }
        self.base_url = 'https://movie.douban.com/top250'
        # Accumulates one dict per successfully parsed movie.
        self.movie_data = []

    def clean_text(self, text):
        """Collapse whitespace runs to single spaces and strip the ends.

        Returns '' for falsy input (None or empty string).
        """
        if not text:
            return ''
        return re.sub(r'\s+', ' ', text).strip()

    def _search(self, pattern, text, default):
        """Return group(1) of the first match of *pattern* in *text*, or *default*.

        Replaces the original double-findall idiom so each regex runs once.
        """
        m = re.search(pattern, text)
        return m.group(1) if m else default

    def parse_movie_info(self, movie_tag):
        """Extract one movie's fields from its <div class="item"> tag.

        Returns a dict of parsed fields, or None when parsing fails
        (a single malformed item must not abort the whole page).
        """
        try:
            rank_tag = movie_tag.find('em')
            rank = rank_tag.text if rank_tag else '0'
            title_tags = movie_tag.find_all('span', class_='title')
            chinese_title = title_tags[0].text if title_tags else '未知名称'
            # A second title span, when present, holds the foreign title
            # prefixed with a '/' separator that we drop.
            if len(title_tags) > 1:
                foreign_title = self.clean_text(title_tags[1].text.replace('/', ''))
            else:
                foreign_title = ''
            rating_tag = movie_tag.find('span', class_='rating_num')
            rating = rating_tag.text if rating_tag else '0'
            # The last span under the star div is the "N人评价" counter.
            rating_people_tag = movie_tag.find('div', class_='star').find_all('span')[-1]
            rating_people = self.clean_text(rating_people_tag.text.replace('人评价', '')) if rating_people_tag else '0'
            # info_text looks like: "导演: X 主演: Y 1994 / 国家 / 类型 ..."
            info_text = self.clean_text(movie_tag.find('div', class_='bd').find('p').text)
            director = self._search(r'导演: (.*?) ', info_text, '未知导演')
            actors = self._search(r'主演: (.*?) \d', info_text, '未知演员')
            year = self._search(r'(\d{4})', info_text, '未知年份')
            country = self._search(r'/\s*(.*?)\s*/', info_text, '未知国家')
            # Genre is everything after the last '/' (pattern has no group,
            # so take the whole match and strip the separator).
            genre_match = re.search(r'/[^/]*$', info_text)
            genre = genre_match.group(0).replace('/', '').strip() if genre_match else '未知类型'
            quote_tag = movie_tag.find('span', class_='inq')
            quote = quote_tag.text if quote_tag else '无简介'
            link_tag = movie_tag.find('a')
            detail_link = link_tag['href'] if link_tag else ''
            return {
                '排名': int(rank),
                '中文名称': chinese_title,
                '外文名称': foreign_title,
                '豆瓣评分': float(rating),
                '评分人数': rating_people,
                '导演': director,
                '主演': actors,
                '上映年份': year,
                '制片国家/地区': country,
                '类型': genre,
                '简介': quote,
                '详情链接': detail_link
            }
        except Exception as e:
            print(f"解析单部电影失败:{e}")
            return None

    def get_page(self, offset):
        """Fetch one chart page; *offset* is 0, 25, ..., 225.

        Returns the decoded HTML text, or None on any request error.
        """
        url = f'{self.base_url}?start={offset}&filter='
        try:
            # Randomized delay + a freshly rotated User-Agent per request
            # to reduce the chance of being rate-limited or blocked.
            time.sleep(random.uniform(2, 5))
            self.headers['User-Agent'] = self.ua.random
            response = requests.get(url=url, headers=self.headers, timeout=15)
            response.raise_for_status()
            # Force UTF-8 before .text so Chinese content decodes correctly.
            response.encoding = 'utf-8'
            return response.text
        except RequestException as e:
            print(f"偏移量{offset}页面请求失败:{e}")
            return None

    def parse_page(self, html):
        """Parse one page's HTML and append every parsed movie to the buffer."""
        if not html:
            return
        soup = BeautifulSoup(html, 'html.parser')
        movie_items = soup.find_all('div', class_='item')
        for item in movie_items:
            movie_info = self.parse_movie_info(item)
            if movie_info:
                self.movie_data.append(movie_info)
                print(f"已爬取:第{movie_info['排名']}名 - {movie_info['中文名称']}({movie_info['豆瓣评分']}分)")

    def save_data(self):
        """Write the collected movies (sorted by rank) to CSV and Markdown."""
        if not self.movie_data:
            print("无数据可保存")
            return
        sorted_data = sorted(self.movie_data, key=lambda x: x['排名'])
        csv_headers = [
            '排名', '中文名称', '外文名称', '豆瓣评分', '评分人数', '导演', '主演', '上映年份', '制片国家/地区', '类型', '简介', '详情链接'
        ]
        # utf-8-sig adds a BOM so Excel opens the CSV with correct encoding.
        with open('douban_top250.csv', 'w', encoding='utf-8-sig', newline='') as f:
            writer = csv.DictWriter(f, fieldnames=csv_headers)
            writer.writeheader()
            writer.writerows(sorted_data)
        with open('douban_top250.md', 'w', encoding='utf-8') as f:
            f.write('# 豆瓣电影Top250 完整观影清单\n\n')
            f.write(f'爬取时间:{time.strftime("%Y-%m-%d %H:%M:%S")}\n\n')
            for movie in sorted_data:
                f.write(f"## 第{movie['排名']}名:{movie['中文名称']}\n")
                if movie['外文名称']:
                    f.write(f"**外文名称**:{movie['外文名称']}\n\n")
                f.write(f"**豆瓣评分**:{movie['豆瓣评分']}({movie['评分人数']}人评价)\n")
                f.write(f"**导演**:{movie['导演']} | **主演**:{movie['主演']}\n")
                f.write(f"**上映年份**:{movie['上映年份']} | **制片国家/地区**:{movie['制片国家/地区']} | **类型**:{movie['类型']}\n\n")
                f.write(f"> {movie['简介']}\n\n")
                f.write(f"[查看详情]({movie['详情链接']})\n")
                f.write('---\n\n')
        print(f"数据保存完成!共爬取{len(sorted_data)}部电影数据")
        print(f"CSV文件:douban_top250.csv")
        print(f"Markdown文件:douban_top250.md")

    def run(self):
        """Main pipeline: fetch and parse all 10 pages, then save."""
        print("开始爬取豆瓣电影Top250数据...")
        for offset in range(0, 250, 25):
            print(f"\n正在爬取第{offset//25 + 1}页(偏移量:{offset})...")
            html = self.get_page(offset)
            self.parse_page(html)
        self.save_data()
        print("\n爬虫执行完毕!")
if __name__ == '__main__':
    # Script entry point: build the crawler and run the full pipeline.
    DoubanTop250Crawler().run()