Python 爬虫实战：反爬绕过与数据解析常见问题总结 | 极客日志

Python

Python 爬虫实战：反爬绕过与数据解析常见问题总结

综述（由 AI 生成）：本文总结了 Python 爬虫开发中常见的五个技术坑点，包括 User-Agent 伪装、IP 封禁处理、BeautifulSoup 解析语法（class_）、JSON 编码乱码修复以及动态页面渲染（Selenium/Playwright）。针对每个问题提供了具体的代码示例和解决方案，帮助开发者避免常见错误。

利刃发布于 2026/3/28更新于 2026/5/2328 浏览

Python 爬虫实战：反爬绕过与数据解析常见问题总结

第一个坑：User-Agent 被识别为爬虫

刚开始写爬虫时，直接用 requests.get(url) 发送请求，结果返回 403 Forbidden。

问题原因：requests 默认的 User-Agent 是 python-requests/版本号，网站一眼就能识别这是爬虫。

解决方案：自定义请求头，伪装成浏览器。

import requests

# Browser-like request headers. With the default headers requests announces
# itself as "python-requests/<version>", which the article reports is answered
# with 403 Forbidden.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
    "Accept-Encoding": "gzip, deflate",
    "Connection": "keep-alive",
}

# `url` is not defined in this snippet; the reader supplies the target address.
response = requests.get(
    url,
    headers=headers,
    timeout=10,
)
print(response.status_code)

经验：Headers 越完整越像真人在访问，最少也要加上 User-Agent。

第二个坑：IP 被封禁

搞定了 User-Agent 开始爬数据，结果爬了 100 多条后，又返回 403 了。这次更惨，换浏览器也不行——IP 被封了！

问题原因：同一个 IP 短时间内请求太频繁，触发了网站的反爬机制。

解决方案：使用代理 IP 池 + 请求间隔。

import requests
import time
import random

# Proxy pool (sample addresses only; real proxies must be bought or collected).
# Each dict maps the scheme of the TARGET url to the proxy used for it.  The
# proxy itself is addressed with "http://" under both keys: an "https://"
# proxy URL asks requests to open a TLS connection to the proxy, which a
# plain HTTP proxy does not speak.
proxies = [
    {'http': 'http://123.45.67.89:8080', 'https': 'http://123.45.67.89:8080'},
    {'http': 'http://98.76.54.32:8080', 'https': 'http://98.76.54.32:8080'},
]

def fetch_with_proxy(url):
    """Fetch ``url`` through a proxy picked at random from ``proxies``.

    NOTE(review): the published listing lost its keywords and literal values
    (header dict, timeout, sleep bounds, error message) during extraction.
    The values used here are reconstructed placeholders -- confirm them
    against the original article.

    Args:
        url: Address to request.

    Returns:
        The ``requests.Response`` when the request completed, or ``None``
        when requests raised an error (connection failure, timeout, ...).
    """
    proxy = random.choice(proxies)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/120.0.0.0 Safari/537.36',
    }
    try:
        response = requests.get(url, headers=headers, proxies=proxy, timeout=10)
    except requests.RequestException as e:
        # Only network-level failures are treated as "no result"; programming
        # errors are left to propagate.
        print(f"请求失败：{e}")
        return None

    # Random pause between requests so the traffic is less regular.
    time.sleep(random.uniform(1, 3))
    return response


 page  (, ):
    url = 
    response = fetch_with_proxy(url)
     response:
        ()

更多推荐文章

查看全部

from bs4 import BeautifulSoup

# Pitfall 3: filtering by CSS class with BeautifulSoup.
# The sample markup must carry the class being searched for; without it
# find() returns None and the ``.text`` access raises AttributeError.
html = '<div><span class="name">商品名称</span></div>'
soup = BeautifulSoup(html, 'html.parser')

# Wrong: ``class`` is a reserved word in Python, so it cannot be used as a
# keyword argument name.
# name = soup.find(class='name').text  # SyntaxError!

# Right: pass the class filter as ``class_``.
name = soup.find(class_='name').text
print(name)
# Output: 商品名称

from bs4 import BeautifulSoup

# Complete example: parse a product list.
# The markup carries the ``product-item`` / ``name`` / ``price`` classes that
# the lookups below search for; without them both methods find nothing.
html = '''
<div>
    <div class="product-item">
        <span class="name">iPhone 15</span>
        <span class="price">5999</span>
    </div>
    <div class="product-item">
        <span class="name">华为 Mate 60</span>
        <span class="price">6999</span>
    </div>
</div>
'''
soup = BeautifulSoup(html, 'html.parser')

# Method 1: the ``class_`` keyword argument.
items = soup.find_all(class_='product-item')
for item in items:
    name = item.find(class_='name').text
    price = item.find(class_='price').text
    print(f"商品：{name}, 价格：{price}")

# Method 2: CSS selectors (more compact).
names = soup.select('.product-item .name')
prices = soup.select('.product-item .price')
for name, price in zip(names, prices):
    print(f"商品：{name.text}, 价格：{price.text}")

import requests

# Pitfall 4: Chinese text in a JSON response is displayed incorrectly.
# A timeout keeps the call from hanging forever on an unresponsive server.
response = requests.get('https://api.example.com/data', timeout=10)
data = response.json()

# Parsed straight away, before any encoding has been chosen explicitly.
# Symptom reported by the article: Chinese shows up as Unicode escapes.
print(data)
# {'name': '\u4e2d\u6587', 'city': '\u5317\u4eac'}

# Fix: tell requests the real encoding, then decode the body again.
response.encoding = 'utf-8'  # the key step
data = response.json()
print(data)
# {'name': '中文', 'city': '北京'}

import requests
import json

def fetch_json(url, timeout=10):
    """Download ``url`` and return its decoded JSON payload.

    Before decoding, the response encoding is replaced by
    ``response.apparent_encoding`` (the encoding requests detects from the
    body) so that a wrong or missing charset header does not garble the text.

    Args:
        url: Address of the JSON endpoint.
        timeout: Seconds to wait for the server; without a timeout the call
            can block indefinitely.

    Returns:
        The parsed JSON value, or ``None`` when the body is not valid JSON
        (the first 200 characters of the body are printed in that case).
    """
    response = requests.get(url, timeout=timeout)
    # Option 1: let requests detect the encoding from the body.
    response.encoding = response.apparent_encoding
    # Option 2: force UTF-8 instead.
    # response.encoding = 'utf-8'
    # Option 3: for GBK-encoded sites.
    # response.encoding = 'gbk'

    try:
        return response.json()
    except ValueError:
        # ValueError is the shared base of json.JSONDecodeError and the
        # simplejson decode error that requests raises when simplejson is
        # installed, so both variants are caught here.
        print("JSON 解析失败，原始文本：", response.text[:200])
        return None

# Usage example: print the name of every user returned by the endpoint.
url = 'https://api.example.com/users'
data = fetch_json(url)
# A falsy result (None after a failed parse, or an empty payload) prints nothing.
for user in data or []:
    print(f"用户名：{user.get('name','N/A')}")

import requests
from bs4 import BeautifulSoup

# Pitfall 5: data that is loaded asynchronously cannot be scraped this way.
url = 'https://example.com/dynamic-page'
# A timeout keeps the call from hanging forever on an unresponsive server.
response = requests.get(url, timeout=10)
soup = BeautifulSoup(response.text, 'html.parser')

# Problem: the parsed page is empty!
items = soup.find_all(class_='product-item')
print(f"找到 {len(items)} 个商品")
# Output: 找到 0 个商品
# The data is loaded dynamically by JavaScript, so the raw HTML returned by
# the server does not contain these elements at all.

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

# Configure Chrome to run headless.
options = Options()
for flag in ('--headless', '--disable-gpu', '--no-sandbox'):
    options.add_argument(flag)
driver = webdriver.Chrome(options=options)

try:
    # Open the page.
    driver.get('https://example.com/dynamic-page')

    # Wait (at most 10 seconds) until the first product element is present.
    item_locator = (By.CLASS_NAME, 'product-item')
    wait = WebDriverWait(driver, 10)
    wait.until(EC.presence_of_element_located(item_locator))

    # Scroll to the bottom three times to trigger lazy loading, pausing one
    # second after each scroll.
    scroll_to_bottom = 'window.scrollTo(0, document.body.scrollHeight);'
    for _ in range(3):
        driver.execute_script(scroll_to_bottom)
        time.sleep(1)

    # Extract the data from the rendered page.
    items = driver.find_elements(By.CLASS_NAME, 'product-item')
    print(f"找到 {len(items)} 个商品")
    for item in items:
        name = item.find_element(By.CLASS_NAME, 'name').text
        price = item.find_element(By.CLASS_NAME, 'price').text
        print(f"商品：{name}, 价格：{price}")
finally:
    # Always shut the browser down, even when a step above raised.
    driver.quit()

import asyncio
from playwright.async_api import async_playwright

async def main():
    """Render the dynamic page in headless Chromium and print each product.

    Opens the page, waits until an element matching ``.product-item`` exists,
    then prints the text of the ``.name`` and ``.price`` child of every item.
    The browser is closed even when a step fails.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page()
            await page.goto('https://example.com/dynamic-page')
            # Wait for the JavaScript-rendered data to appear.
            await page.wait_for_selector('.product-item')
            # Extract the data.
            items = await page.query_selector_all('.product-item')
            for item in items:
                # The Python ElementHandle API names this eval_on_selector;
                # it has no query_selector_eval method.
                name = await item.eval_on_selector('.name', 'el => el.textContent')
                price = await item.eval_on_selector('.price', 'el => el.textContent')
                print(f"商品：{name}, 价格：{price}")
        finally:
            await browser.close()


asyncio.run(main())

Python 爬虫实战：反爬绕过与数据解析常见问题总结

Python 爬虫实战：反爬绕过与数据解析常见问题总结

第一个坑：User-Agent 被识别为爬虫

第二个坑：IP 被封禁

更多推荐文章

第三个坑：CSS 选择器解析不到数据

第四个坑：JSON 数据中文乱码

第五个坑：异步加载的数据抓不到

更多推荐文章

相关免费在线工具

Python 爬虫实战：反爬绕过与数据解析常见问题总结

Python 爬虫实战：反爬绕过与数据解析常见问题总结

第一个坑：User-Agent 被识别为爬虫

第二个坑：IP 被封禁

微信扫一扫，关注极客日志

更多推荐文章

第三个坑：CSS 选择器解析不到数据

第四个坑：JSON 数据中文乱码

第五个坑：异步加载的数据抓不到

微信扫一扫，关注极客日志

更多推荐文章

相关免费在线工具