import requests
from bs4 import BeautifulSoup

response = requests.get('https://example.com/blog')  # placeholder list-page URL
html = response.text
soup = BeautifulSoup(html, 'lxml')

# Each post title is assumed to sit in <h2 class="post-title"><a href="...">
for h2 in soup.find_all('h2', class_='post-title'):
    a_tag = h2.find('a')
    title = a_tag.get_text(strip=True)
    link = a_tag['href']
    print(title, link)
3.5 Data Storage
SQLite (suited to small-scale projects):
import sqlite3

conn = sqlite3.connect('spider.db')
cursor = conn.cursor()
cursor.execute('''CREATE TABLE IF NOT EXISTS articles (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    title TEXT,
    url TEXT UNIQUE
);''')

items = [('第一篇', 'https://...'), ('第二篇', 'https://...')]
for title, url in items:
    try:
        cursor.execute('INSERT INTO articles (title, url) VALUES (?, ?)', (title, url))
    except sqlite3.IntegrityError:
        pass  # duplicate url (UNIQUE constraint), skip it
conn.commit()
conn.close()
JSON format:
import json

data = [{'title': '第一篇', 'url': 'https://...'}]
with open('result.json', 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=4)
3.6 Common Anti-Scraping Measures
User-Agent checks: pick a browser UA at random for each request.
IP rate limits: use a proxy pool and rotate IPs periodically.
Cookie validation: manage the session with requests.Session().
AJAX/dynamic rendering: find the underlying API and request its JSON directly, or fall back to Selenium/Playwright.
The sketch below combines the first three of these techniques.
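A minimal sketch, assuming the requests library; the UA strings and the proxy address are placeholders, not working values:

import random
import requests

USER_AGENTS = [  # placeholder pool; keep a larger, up-to-date list in practice
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
]
proxies = {'http': 'http://127.0.0.1:7890', 'https': 'http://127.0.0.1:7890'}  # hypothetical proxy

session = requests.Session()  # carries cookies across requests automatically
session.headers['User-Agent'] = random.choice(USER_AGENTS)
resp = session.get('https://example.com', proxies=proxies, timeout=10)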
4. Advanced: Parsing Tools
4.1 lxml (XPath)
Implemented in C, lxml is fast and supports XPath.
from lxml import etree

html = '<html><body><div><a href="/p1">文章 A</a></div></body></html>'
tree = etree.HTML(html)
titles = tree.xpath('//div/a/text()')
links = tree.xpath('//div/a/@href')
for t, l in zip(titles, links):
    print(t, l)
4.2 parsel
Scrapy's built-in parser; supports both CSS and XPath selectors.
from parsel import Selector

html = '<ul><li><a href="/a1">Item1</a></li></ul>'
sel = Selector(text=html)
for item in sel.css('li'):
    title = item.css('a::text').get()
    print(title)
4.3 PyQuery
Offers a jQuery-style parsing API.
from pyquery import PyQuery as pq

doc = pq('<div><a href="/x1">新闻 X1</a></div>')
a = doc('div a')  # CSS selector matching the document above
print(a.text(), a.attr('href'))
4.4 Regular Expressions
Suited to simple pattern matching such as email addresses and phone numbers.
import re

info = '联系邮箱:webmaster@example.com'  # sample text; the address is a stand-in
emails = re.findall(r'[\w\.-]+@[\w\.-]+', info)
print(emails)  # ['webmaster@example.com']
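The same approach covers the phone-number case mentioned above; the pattern below assumes mainland-China mobile numbers, purely as an illustration:

phones = re.findall(r'1[3-9]\d{9}', '客服电话:13812345678')
print(phones)  # ['13812345678']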
Defining the Item:

import scrapy

class MyprojectItem(scrapy.Item):
    text = scrapy.Field()
    author = scrapy.Field()

Writing the Spider:
import scrapy

class QuotesSpider(scrapy.Spider):
    name = 'quotes'
    allowed_domains = ['quotes.toscrape.com']
    start_urls = ['https://quotes.toscrape.com/']

    def parse(self, response):
        for quote in response.css('div.quote'):
            item = MyprojectItem()
            item['text'] = quote.css('span.text::text').get()
            yield item
        # Follow pagination until there is no "Next" link left
        next_page = response.css('li.next a::attr(href)').get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)
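Assuming the standard Scrapy project layout, the spider is then run from the project root; -o writes the collected items out as a feed:

scrapy crawl quotes -o quotes.json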
5.4 Pipelines and Settings
Pipelines handle item processing and storage; Settings hold global configuration such as concurrency limits and download delays.
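A minimal sketch of both pieces; the class name, cleanup logic, and the myproject module path are illustrative:

# pipelines.py
class CleanTextPipeline:
    def process_item(self, item, spider):
        item['text'] = item['text'].strip()  # trivial cleanup step
        return item

# settings.py
ITEM_PIPELINES = {'myproject.pipelines.CleanTextPipeline': 300}
DOWNLOAD_DELAY = 1        # seconds between requests
CONCURRENT_REQUESTS = 16  # global concurrency cap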
5.5 Scrapy Shell
For quickly testing selectors:
scrapy shell 'https://quotes.toscrape.com/'
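Once the shell opens, selectors can be tried interactively against the fetched response; both examples below match elements that exist on quotes.toscrape.com:

>>> response.css('span.text::text').get()
>>> response.xpath('//small[@class="author"]/text()').getall()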
5.6 Middleware
Custom middleware can randomize the User-Agent or route requests through proxies.
import random

# USER_AGENTS is assumed to be a list of browser UA strings defined elsewhere
class RandomUserAgentMiddleware:
    def process_request(self, request, spider):
        ua = random.choice(USER_AGENTS)
        request.headers.setdefault('User-Agent', ua)
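For Scrapy to pick the middleware up, it must be registered in settings.py; the module path below assumes a project package named myproject:

DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.RandomUserAgentMiddleware': 400,
}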
6. Scraping Dynamic Content: Selenium and Playwright
When a page is rendered by JavaScript, a browser automation tool is required.
6.1 Selenium Basics
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
chrome_options = Options()
chrome_options.add_argument('--headless')
driver = webdriver.Chrome(options=chrome_options)
driver.get('https://quotes.toscrape.com/js/')
html = driver.page_source
driver.quit()
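page_source is only trustworthy once the JavaScript has finished rendering. A common safeguard is an explicit wait for a known element, placed between driver.get(...) and reading page_source; here div.quote is assumed to be what the page renders:

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

# Block for up to 10 seconds until at least one quote appears
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, 'div.quote'))
)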
6.2 Playwright
Backed by Microsoft; faster and lighter-weight.
from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.chromium.launch(headless=True)
    page = browser.new_page()
    page.goto('https://quotes.toscrape.com/js/')
    html = page.content()
    browser.close()
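Playwright auto-waits on most actions, but before reading page.content() it can still be worth waiting explicitly for the rendered element (again assuming div.quote is what the page produces), placed just before the page.content() call:

page.wait_for_selector('div.quote')  # blocks until the quotes have rendered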
7. Asynchronous Crawling: aiohttp + asyncio
With a large number of URLs, synchronous requests are slow; going asynchronous raises throughput substantially.
7.1 aiohttp Example
import asyncio
import aiohttp
from bs4 import BeautifulSoup

async def fetch(session, url):
    async with session.get(url, timeout=aiohttp.ClientTimeout(total=10)) as response:
        return await response.text()

async def main(urls):
    conn = aiohttp.TCPConnector(limit=50)  # cap concurrent connections at 50
    async with aiohttp.ClientSession(connector=conn) as session:
        tasks = [fetch(session, url) for url in urls]
        htmls = await asyncio.gather(*tasks)
        for html, url in zip(htmls, urls):
            soup = BeautifulSoup(html, 'lxml')
            print(soup.title.string)

urls = [f'https://example.com/page/{i}' for i in range(1, 10)]
asyncio.run(main(urls))
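On top of the connector's connection cap, an asyncio.Semaphore is a common way to limit how many fetches are in flight at once and keep the crawler polite; a sketch reusing fetch() from above:

sem = asyncio.Semaphore(10)  # at most 10 requests running concurrently

async def fetch_limited(session, url):
    async with sem:  # waits here once 10 fetches are already active
        return await fetch(session, url)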
7.2 HTTPX
An asynchronous client whose API mirrors Requests.
import asyncio
import httpx

async def main(url):
    async with httpx.AsyncClient() as client:
        resp = await client.get(url)
        print(resp.status_code)

asyncio.run(main('https://example.com'))  # placeholder URL