Python 网页解析库 BeautifulSoup4 使用指南 | 极客日志

Python

Python 网页解析库 BeautifulSoup4 使用指南

BeautifulSoup4 是 Python 生态中流行的 HTML/XML 解析库，提供直观 API 和强大解析能力。介绍安装方法，包括 lxml 和 html5lib 解析器配置。涵盖基础用法如标签查找 find/find_all、CSS 选择器 select 及数据提取处理。高级用法涉及动态内容处理（结合 Selenium）、通用爬虫框架构建及智能内容提取策略。实际应用场景展示电商价格监控系统实现，包含多平台商品抓取、价格比对及邮件通知功能。掌握 BS4 有助于理解 HTML 结构与数据提取原理，是网络爬虫生态核心组件。

灭霸 发布于 2026/2/8 · 更新于 2026/7/20 · 8.2K 浏览

一、库的简介：实际生活中的作用

BeautifulSoup4（简称 BS4）是 Python 生态中最受欢迎的 HTML/XML 解析库之一，它以其直观的 API 和强大的解析能力闻名。在实际生活中，BeautifulSoup4 的应用无处不在——无论是数据分析师从网站上采集数据、研究人员收集学术信息、电商从业者监控竞争对手价格，还是普通用户批量下载网络资源，都能看到它的身影。

BS4 的"有趣"之处在于它能够将复杂的 HTML/XML 文档转换成树状结构，让开发者可以用近乎自然语言的方式来定位和提取数据。比如，你可以通过 find_all('div', class_='product') 轻松找到所有商品信息，或者用 select('table tr:nth-child(odd)') 提取表格的奇数行数据。这种直观的操作方式大大降低了网页数据提取的门槛，使得非专业开发者也能轻松处理网页数据。

二、安装 BeautifulSoup4

BeautifulSoup4 需要配合解析器使用，推荐安装 lxml 解析器以获得更好的性能：

# Install BeautifulSoup4 together with the lxml parser
pip install beautifulsoup4 lxml

# Optionally install the html5lib parser (more lenient parsing)
pip install html5lib

# Verify the installation. The version string lives on the bs4 package;
# the BeautifulSoup class itself has no __version__ attribute.
python -c "import bs4; print(bs4.__version__)"

三、基本用法

1. 基础解析与导航

from bs4 import BeautifulSoup
import requests

# Fetch the page. The explicit timeout keeps the script from hanging
# indefinitely on an unresponsive server.
url = 'https://example.com'
response = requests.get(url, timeout=10)
html_content = response.content

# Build the BeautifulSoup tree (using the lxml parser)
soup = BeautifulSoup(html_content, 'lxml')

# Show the pretty-printed HTML
print(soup.prettify()[:500])  # first 500 characters only

# Basic navigation. soup.title / soup.p are None when the page lacks the
# tag, so guard before dereferencing them.
title = soup.title.string if soup.title else None
first_paragraph = soup.p.text if soup.p else ''
print(f"文档标题：{title}")
print(f"第一个段落：{first_paragraph}")
print(f"所有链接数量：{len(soup.find_all('a'))}")

# Read a tag attribute: the declared encoding sits on <meta charset="...">
if soup.head:
    meta_charset = soup.head.find('meta', charset=True)
    charset = meta_charset['charset'] if meta_charset else '未声明'
    print(f"字符编码：{charset}")

相关免费在线工具

curl 转代码
解析常见 curl 参数并生成 fetch、axios、PHP curl 或 Python requests 示例代码。在线工具，curl 转代码在线工具，online
Base64 字符串编码/解码
将字符串编码和解码为其 Base64 格式表示形式即可。在线工具，Base64 字符串编码/解码在线工具，online
Base64 文件转换器
将字符串、文件或图像转换为其 Base64 表示形式。在线工具，Base64 文件转换器在线工具，online
Markdown转HTML
将 Markdown（GFM）转为 HTML 片段，浏览器内 marked 解析；与 HTML转Markdown 互为补充。在线工具，Markdown转HTML在线工具，online
HTML转Markdown
将 HTML 片段转为 GitHub Flavored Markdown，支持标题、列表、链接、代码块与表格等；浏览器内处理，可链接预填。在线工具，HTML转Markdown在线工具，online
JSON 压缩
通过删除不必要的空白来缩小和压缩JSON。在线工具，JSON 压缩在线工具，online

# find() - look up a single element
first_div = soup.find('div')
first_div_with_class = soup.find('div', class_='content')
first_div_with_id = soup.find('div', id='main')

# find_all() - look up every matching element
all_links = soup.find_all('a')
all_paragraphs = soup.find_all('p')
all_divs_with_class = soup.find_all('div', class_='item')

# Pass a dict to match on several attributes at once
specific_element = soup.find('input', {'type': 'text', 'name': 'username'})

# Cap the number of results
first_three_links = soup.find_all('a', limit=3)

# Walk the results, printing only links that have both a target and visible text
for link in soup.find_all('a'):
    href = link.get('href')
    text = link.get_text(strip=True)
    if href and text:
        print(f"链接文本：{text}, 地址：{href}")

# select() takes a CSS selector
# All <div> elements with class 'product'
products = soup.select('div.product')

# The element whose id is 'main-content'
main_content = soup.select_one('#main-content')

# Compound selectors
# Every <h2> nested anywhere inside an <article>
article_titles = soup.select('article h2')

# Every <li> inside a <ul> that is the first <ul> among its siblings.
# NOTE: ':first-of-type' is evaluated per parent, so this can match more
# than one <ul> in the document, not only the very first one.
list_items = soup.select('ul:first-of-type li')

# Combining pseudo-classes
# Elements with class 'price' that are not the first child of their parent
prices = soup.select('.price:not(:first-child)')

# Attribute selectors
images_with_alt = soup.select('img[alt]')
external_links = soup.select('a[href^="http"]')  # href starts with "http"
email_links = soup.select('a[href*="mailto:"]')  # href contains "mailto:"

import re

# Text of the whole document
full_text = soup.get_text()
# Same text with whitespace stripped from each string
clean_text = soup.get_text(strip=True)

# Attribute values. Guard against both a missing tag and a missing
# attribute so that pages without images or links do not raise.
img_src = soup.img.get('src') if soup.img else None
link_href = soup.a.get('href', '') if soup.a else ''

# Table data: one list of cell texts per <tr>, header and data cells alike
table_data = []
if soup.table:
    table_data = [
        [cell.get_text(strip=True) for cell in row.find_all(['td', 'th'])]
        for row in soup.table.find_all('tr')
    ]

# Pattern-based extraction.
# NOTE: find_all(string=<regex>) returns the text nodes that CONTAIN a match,
# not the matched substrings. `string=` is the current name of the argument
# formerly spelled `text=`.

# Text nodes containing an e-mail address
emails = soup.find_all(string=re.compile(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'))

# Text nodes containing a phone number
phones = soup.find_all(string=re.compile(r'(\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4})'))

# Text nodes containing a dollar price
prices = soup.find_all(string=re.compile(r'\$\d+(?:\.\d{2})?'))

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

def scrape_dynamic_content(url, wait_for_element=None, timeout=10, settle_delay=2):
    """Load a JavaScript-driven page in headless Chrome and parse the result.

    Args:
        url: Address of the page to load.
        wait_for_element: Optional CSS selector. When given, the function
            blocks until an element matching it is present in the DOM.
        timeout: Maximum number of seconds to wait for ``wait_for_element``.
        settle_delay: Extra seconds to sleep after loading so that remaining
            scripts can finish; pass 0 to skip the pause.

    Returns:
        A BeautifulSoup object built from the rendered page source.

    Raises:
        Whatever Selenium raises when the page cannot be loaded or the wait
        for ``wait_for_element`` expires. The browser is closed in every case.
    """
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')  # no visible browser window
    options.add_argument('--disable-gpu')
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)
        if wait_for_element:
            locator = (By.CSS_SELECTOR, wait_for_element)
            WebDriverWait(driver, timeout).until(
                EC.presence_of_element_located(locator)
            )
        if settle_delay > 0:
            time.sleep(settle_delay)
        # Parse the DOM as rendered by the browser, not the raw response.
        return BeautifulSoup(driver.page_source, 'lxml')
    finally:
        # Always release the browser process, even when loading failed.
        driver.quit()

# 使用示例
# soup = scrape_dynamic_content(
#     'https://example.com/dynamic-page',
#     wait_for_element='.product-list'
# )

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import time
import random
from concurrent.futures import ThreadPoolExecutor
import json

class WebScraper:
    """Level-by-level crawler restricted to the domain of ``base_url``.

    Each depth level is fetched concurrently with a thread pool. Every
    fetched page is passed to :meth:`parse_page`, whose return value is
    appended to ``results``. Subclasses customise extraction by overriding
    :meth:`parse_page`.
    """

    def __init__(self, base_url, max_depth=2, max_workers=5):
        """Store crawl settings and initialise empty crawl state.

        Args:
            base_url: Default start URL; its domain bounds the crawl.
            max_depth: Deepest link level to fetch (the start page is 0).
            max_workers: Size of the thread pool used per depth level.
        """
        self.base_url = base_url
        self.max_depth = max_depth
        self.visited_urls = set()
        self.results = []
        self.max_workers = max_workers

    def is_valid_url(self, url):
        """Return True when ``url`` has both a scheme and a network location."""
        parsed = urlparse(url)
        return bool(parsed.netloc) and bool(parsed.scheme)

    def should_crawl(self, url, depth):
        """Return True when ``url`` is unvisited, within depth, and same-domain."""
        if depth > self.max_depth:
            return False
        if url in self.visited_urls:
            return False
        # Only follow links that stay on the base URL's domain.
        base_domain = urlparse(self.base_url).netloc
        url_domain = urlparse(url).netloc
        return url_domain == base_domain

    def extract_links(self, soup, current_url):
        """Collect the absolute, fragment-free URLs of all links on a page.

        Args:
            soup: Parsed page exposing ``find_all('a', href=True)``.
            current_url: URL of the page, used to resolve relative hrefs.

        Returns:
            A set of absolute URLs that pass :meth:`is_valid_url`.
        """
        links = set()
        for link in soup.find_all('a', href=True):
            absolute_url = urljoin(current_url, link['href'])
            # "#fragment" only addresses a position inside the same document;
            # strip it so one page is not queued once per anchor.
            absolute_url = absolute_url.partition('#')[0]
            if self.is_valid_url(absolute_url):
                links.add(absolute_url)
        return links

    def parse_page(self, soup, url):
        """Build the result record for one page (override in subclasses).

        The default record holds the URL, the title text (empty string when
        the page has no title or the title has no single string), the first
        500 characters of the page text and the number of ``<a>`` tags.
        """
        title = (soup.title.string or '') if soup.title else ''
        return {
            'url': url,
            'title': title,
            'text': soup.get_text(strip=True)[:500],  # first 500 characters only
            'links_count': len(soup.find_all('a'))
        }

    def crawl_page(self, url, depth=0):
        """Fetch and parse one page.

        Args:
            url: Page to fetch.
            depth: Link level of ``url`` relative to the start page.

        Returns:
            The set of links found on the page when ``depth`` is below
            ``max_depth``; otherwise, or when the page is skipped or fails,
            an empty set. Failures are printed, not raised.
        """
        if not self.should_crawl(url, depth):
            return set()
        self.visited_urls.add(url)
        try:
            # Random pause between requests to reduce the chance of being blocked.
            time.sleep(random.uniform(1, 3))
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
            }
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'lxml')
            self.results.append(self.parse_page(soup, url))
            # Links are only needed when a deeper level will be crawled.
            if depth < self.max_depth:
                return self.extract_links(soup, url)
            return set()
        except Exception as e:
            # Best-effort crawl: one bad page must not abort the whole run.
            print(f"爬取 {url} 失败：{str(e)}")
            return set()

    def crawl(self, start_url=None):
        """Crawl level by level, starting at ``start_url`` (default ``base_url``)."""
        start_url = start_url or self.base_url
        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            urls_to_crawl = {start_url}
            depth = 0
            while urls_to_crawl and depth <= self.max_depth:
                print(f"深度 {depth}: 爬取 {len(urls_to_crawl)} 个 URL")
                # Fetch every URL of the current level concurrently.
                future_to_url = {
                    executor.submit(self.crawl_page, url, depth): url
                    for url in urls_to_crawl
                }
                # Gather the links discovered on this level.
                new_urls = set()
                for future in future_to_url:
                    try:
                        new_urls.update(future.result())
                    except Exception as e:
                        print(f"爬取失败：{str(e)}")
                # The next level is whatever has not been visited yet.
                urls_to_crawl = new_urls - self.visited_urls
                depth += 1

    def save_results(self, filename):
        """Write ``results`` to ``filename`` as indented UTF-8 JSON."""
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(self.results, f, ensure_ascii=False, indent=2)
        print(f"结果已保存到 {filename}，共 {len(self.results)} 条记录")

from bs4 import BeautifulSoup
import re
from datetime import datetime

class SmartContentExtractor:
    """Heuristic helpers for pulling content, metadata and comments from a page.

    All methods are static and take an already-parsed BeautifulSoup tree.
    """

    @staticmethod
    def extract_main_content(soup):
        """Return the element most likely to hold the page's main content.

        Strategies are tried in order and the first one that yields an
        element with more than 100 characters of text wins:

        1. the first ``<article>`` tag;
        2. the first ``<div>`` whose class matches ``content|main|post``;
        3. the ``<div>``/``<section>`` with the longest text;
        4. the first element with ``itemprop="articleBody"``.

        Falls back to ``soup.body``, or ``soup`` itself when there is no body.
        """
        def largest_text_block():
            # Query once and reuse the result for both the emptiness check
            # and the max() scan.
            candidates = soup.find_all(['div', 'section'])
            if not candidates:
                return None
            return max(candidates, key=lambda node: len(node.get_text(strip=True)))

        strategies = [
            lambda: soup.find('article'),
            lambda: soup.find('div', class_=re.compile(r'content|main|post')),
            largest_text_block,
            lambda: soup.find(attrs={'itemprop': 'articleBody'}),
        ]
        for strategy in strategies:
            try:
                element = strategy()
                if element and len(element.get_text(strip=True)) > 100:
                    return element
            except Exception:
                # A failing heuristic just means "try the next one"; unlike a
                # bare except this still lets KeyboardInterrupt/SystemExit through.
                continue
        return soup.body if soup.body else soup

    @staticmethod
    def extract_metadata(soup):
        """Collect Open Graph, Twitter Card, JSON-LD and date metadata.

        Returns:
            A dict that always contains ``'publication_date'`` (the first
            date-looking substring found in the page text, or None) and,
            when present on the page, ``'open_graph'``, ``'twitter_card'``
            and ``'json_ld'``.
        """
        # Imported here, once per call, rather than inside the loop below.
        import json

        metadata = {}

        # Open Graph: <meta property="og:xxx" content="...">, keyed by "xxx"
        og_data = {}
        for tag in soup.find_all('meta', property=re.compile(r'^og:')):
            og_data[tag['property'][3:]] = tag.get('content', '')
        if og_data:
            metadata['open_graph'] = og_data

        # Twitter Card: <meta name="twitter:xxx" content="...">, keyed by "xxx"
        twitter_data = {}
        for tag in soup.find_all('meta', attrs={'name': re.compile(r'^twitter:')}):
            twitter_data[tag['name'][8:]] = tag.get('content', '')
        if twitter_data:
            metadata['twitter_card'] = twitter_data

        # JSON-LD: skip scripts that are empty (string is None -> TypeError)
        # or malformed (JSONDecodeError is a ValueError).
        json_ld_data = []
        for script in soup.find_all('script', type='application/ld+json'):
            try:
                json_ld_data.append(json.loads(script.string))
            except (TypeError, ValueError):
                continue
        if json_ld_data:
            metadata['json_ld'] = json_ld_data

        # Publication date: first text node containing a date-like pattern.
        # Only the matched date is kept, not the whole surrounding text.
        pub_date = None
        date_patterns = [
            r'\d{4}-\d{2}-\d{2}',
            r'\d{2}/\d{2}/\d{4}',
            r'\d{4}年\d{2}月\d{2}日'
        ]
        for pattern in date_patterns:
            compiled = re.compile(pattern)
            node = soup.find(string=compiled)
            if node:
                found = compiled.search(str(node))
                pub_date = found.group() if found else str(node)
                break
        metadata['publication_date'] = pub_date
        return metadata

    @staticmethod
    def extract_comments(soup):
        """Collect comment-like elements using a list of common selectors.

        Returns:
            A list of dicts with keys ``'text'``, ``'html'`` and
            ``'selector'`` (the first selector that matched the element).
            Elements whose text is 20 characters or shorter are skipped, and
            an element matched by several selectors is reported only once.
        """
        comments = []
        comment_selectors = [
            '.comment', '.comments', '#comments',
            '[class*="comment"]', '[id*="comment"]'
        ]
        # The selectors overlap (e.g. '.comment' vs '[class*="comment"]'),
        # so track elements by identity to avoid duplicate entries.
        seen_ids = set()
        for selector in comment_selectors:
            for element in soup.select(selector):
                if id(element) in seen_ids:
                    continue
                seen_ids.add(id(element))
                comment_text = element.get_text(strip=True)
                if len(comment_text) > 20:  # drop fragments too short to be a comment
                    comments.append({
                        'text': comment_text,
                        'html': str(element),
                        'selector': selector
                    })
        return comments

import requests
from bs4 import BeautifulSoup
import json
import time
from datetime import datetime
from dataclasses import dataclass
from typing import List, Dict, Optional
import smtplib
from email.mime.text import MIMEText
import schedule

@dataclass
class Product:
    """One scraped observation of a product listing."""
    name: str                        # product title
    price: float                     # current price
    original_price: Optional[float]  # list price before discount, if any
    url: str                         # page the data came from
    currency: str                    # currency code, e.g. 'USD' or 'CNY'
    timestamp: datetime              # when the observation was taken
    website: str                     # source site label

    @property
    def discount(self) -> Optional[float]:
        """Percentage saved relative to ``original_price``, to one decimal.

        Returns None when there is no positive original price to compare to.
        """
        base = self.original_price
        if not (base and base > 0):
            return None
        saved_fraction = 1 - self.price / base
        return round(saved_fraction * 100, 1)

class PriceMonitor:
    """Scrape product prices from Amazon, Taobao/Tmall and JD and alert on drops.

    Configuration is read from a JSON file with an ``email`` section, an
    optional ``taobao_cookie`` and a ``products`` list. The last seen price
    of every product is kept in memory in ``price_history``; when a newly
    scraped price is lower by at least the product's threshold percentage,
    an e-mail alert is sent (if e-mail is enabled in the configuration).
    """

    def __init__(self, config_file='config.json', request_delay=2):
        """Load the configuration and initialise empty monitoring state.

        Args:
            config_file: Path of the JSON configuration file.
            request_delay: Seconds to pause after each product so that the
                target sites are not hit too frequently.
        """
        self.products = []
        self.price_history = {}
        self.request_delay = request_delay
        self.load_config(config_file)

    def load_config(self, config_file):
        """Read ``config_file`` (UTF-8 JSON) into ``self.config``."""
        with open(config_file, 'r', encoding='utf-8') as f:
            self.config = json.load(f)

    @staticmethod
    def _history_key(product) -> str:
        """Return the ``price_history`` key for ``product``.

        Every reader and writer of the history must use this single helper:
        keys built differently in different places never match, and then no
        price drop is ever detected.
        """
        return f"{product.website}_{product.url}"

    def _fetch_soup(self, url, headers):
        """Download ``url`` and return it parsed; raises on HTTP error status."""
        response = requests.get(url, headers=headers, timeout=10)
        # An error page must not be scraped as if it were a product page.
        response.raise_for_status()
        return BeautifulSoup(response.content, 'lxml')

    def scrape_amazon(self, url: str) -> "Optional[Product]":
        """Scrape an Amazon product page.

        Returns:
            A Product, or None when the request fails or no usable price is
            found (the reason is printed).
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Accept-Language': 'en-US,en;q=0.9'
        }
        try:
            soup = self._fetch_soup(url, headers)
            # Product name
            name_elem = soup.select_one('#productTitle')
            name = name_elem.get_text(strip=True) if name_elem else 'Unknown'
            # Current price, with a fallback location
            price_elem = soup.select_one('.a-price .a-offscreen')
            if not price_elem:
                price_elem = soup.select_one('#price_inside_buybox')
            price_text = price_elem.get_text(strip=True) if price_elem else ''
            price = self._parse_price(price_text)
            if not price:
                # A missing price would crash later formatting and poison
                # the price history, so treat it as a failed scrape.
                raise ValueError('price not found')
            # List price (present only when the item is discounted)
            original_price_elem = soup.select_one('.a-text-price .a-offscreen')
            original_price_text = original_price_elem.get_text(strip=True) if original_price_elem else ''
            original_price = self._parse_price(original_price_text)
            currency = self._detect_currency(price_text)
            return Product(
                name=name,
                price=price,
                original_price=original_price,
                url=url,
                currency=currency,
                timestamp=datetime.now(),
                website='Amazon'
            )
        except Exception as e:
            print(f"抓取亚马逊商品失败：{str(e)}")
            return None

    def scrape_taobao(self, url: str) -> "Optional[Product]":
        """Scrape a Taobao/Tmall product page, sending the configured cookie.

        Returns:
            A Product priced in CNY, or None when the request fails or no
            usable price is found (the reason is printed).
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Cookie': self.config.get('taobao_cookie', '')
        }
        try:
            soup = self._fetch_soup(url, headers)
            name_elem = soup.select_one('.tb-detail-hd h1')
            name = name_elem.get_text(strip=True) if name_elem else 'Unknown'
            # The price can appear in several places; take the first that parses.
            price_selectors = ['.tm-price', '.tb-rmb-num', '[property="og:product:price"]']
            price = None
            for selector in price_selectors:
                price_elem = soup.select_one(selector)
                if price_elem:
                    # A <meta> tag carries its value in the content attribute
                    # and has no text of its own.
                    price_text = price_elem.get('content') or price_elem.get_text(strip=True)
                    price = self._parse_price(price_text)
                    if price:
                        break
            if not price:
                # Recording a fake price of 0 would later look like a 100% drop.
                raise ValueError('price not found')
            return Product(
                name=name,
                price=price,
                original_price=None,
                url=url,
                currency='CNY',
                timestamp=datetime.now(),
                website='Taobao'
            )
        except Exception as e:
            print(f"抓取淘宝商品失败：{str(e)}")
            return None

    def scrape_jd(self, url: str) -> "Optional[Product]":
        """Scrape a JD product page.

        Returns:
            A Product priced in CNY, or None when the request fails or no
            usable price is found (the reason is printed).
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        try:
            soup = self._fetch_soup(url, headers)
            # Product name
            name_elem = soup.select_one('.sku-name')
            name = name_elem.get_text(strip=True) if name_elem else 'Unknown'
            # Current price
            price_elem = soup.select_one('.price .num')
            price_text = price_elem.get_text(strip=True) if price_elem else ''
            price = self._parse_price(price_text)
            if not price:
                # Recording a fake price of 0 would later look like a 100% drop.
                raise ValueError('price not found')
            # List price
            original_price_elem = soup.select_one('.price .del')
            original_price_text = original_price_elem.get_text(strip=True) if original_price_elem else ''
            original_price = self._parse_price(original_price_text)
            return Product(
                name=name,
                price=price,
                original_price=original_price,
                url=url,
                currency='CNY',
                timestamp=datetime.now(),
                website='JD'
            )
        except Exception as e:
            print(f"抓取京东商品失败：{str(e)}")
            return None

    def _parse_price(self, price_text: str) -> Optional[float]:
        """Return the first number in ``price_text`` as a float, or None.

        Commas are treated as thousands separators and removed first, so
        ``'$1,299.99'`` parses to ``1299.99``.
        """
        # Local import: the import block accompanying this class does not
        # bring in ``re``.
        import re
        if not price_text:
            return None
        match = re.search(r'\d+(?:\.\d+)?', price_text.replace(',', ''))
        return float(match.group()) if match else None

    def _detect_currency(self, price_text: str) -> str:
        """Map a currency symbol found in ``price_text`` to a code (default 'USD')."""
        currency_symbols = {
            '$': 'USD', '€': 'EUR', '£': 'GBP', '¥': 'CNY', '₹': 'INR'
        }
        for symbol, currency in currency_symbols.items():
            if symbol in price_text:
                return currency
        # No known symbol: fall back to the default currency.
        return 'USD'

    def check_price_drop(self, product: "Product", threshold: float = 10.0) -> bool:
        """Report whether ``product`` is cheaper than its recorded price.

        Args:
            product: Freshly scraped product.
            threshold: Minimum drop, in percent of the recorded price.

        Returns:
            True when a price is on record and the drop is at least
            ``threshold`` percent. Otherwise False; in that case an existing
            record is updated to the new price. A product with no record
            yields False and is left unrecorded.
        """
        product_key = self._history_key(product)
        old_price = self.price_history.get(product_key)
        if old_price is None:
            return False
        if old_price <= 0:
            # No meaningful percentage can be computed against a zero price.
            self.price_history[product_key] = product.price
            return False
        price_drop = ((old_price - product.price) / old_price) * 100
        if price_drop >= threshold:
            return True
        self.price_history[product_key] = product.price
        return False

    def send_price_alert(self, product: "Product", old_price: float):
        """E-mail a price-drop notice for ``product``.

        Does nothing unless ``email.enabled`` is true in the configuration.
        Any failure while building or sending the message is printed, not
        raised.
        """
        email_config = self.config.get('email', {})
        if not email_config.get('enabled', False):
            return
        try:
            price_drop = ((old_price - product.price) / old_price) * 100
            subject = f"💰 价格提醒：{product.name} 降价 {price_drop:.1f}%"
            message = f"""
商品名称：{product.name}
当前价格：{product.currency} {product.price:.2f}
之前价格：{product.currency} {old_price:.2f}
降价幅度：{price_drop:.1f}%
商品链接：{product.url}
监控时间：{product.timestamp.strftime('%Y-%m-%d %H:%M:%S')}
"""
            msg = MIMEText(message)
            msg['Subject'] = subject
            msg['From'] = email_config['from']
            msg['To'] = ', '.join(email_config['to'])
            with smtplib.SMTP(email_config['smtp_server'], email_config['smtp_port']) as server:
                server.starttls()
                server.login(email_config['username'], email_config['password'])
                server.send_message(msg)
            print(f"价格提醒已发送：{product.name}")
        except Exception as e:
            print(f"发送提醒失败：{str(e)}")

    def monitor_products(self):
        """Scrape every configured product once, alerting on price drops.

        Each successfully scraped product is appended to ``self.products``
        and its price recorded in ``price_history``. Products whose site
        cannot be determined, or whose scrape fails, are skipped.
        """
        print(f"开始监控 {len(self.config['products'])} 个商品...")
        for product_config in self.config['products']:
            url = product_config['url']
            website = product_config.get('website', 'auto')
            threshold = product_config.get('threshold', 10.0)
            # Infer the site from the URL when not configured explicitly.
            if website == 'auto':
                if 'amazon.' in url:
                    website = 'amazon'
                elif 'taobao.' in url or 'tmall.' in url:
                    website = 'taobao'
                elif 'jd.' in url:
                    website = 'jd'
            product = None
            if website == 'amazon':
                product = self.scrape_amazon(url)
            elif website == 'taobao':
                product = self.scrape_taobao(url)
            elif website == 'jd':
                product = self.scrape_jd(url)
            if product:
                self.products.append(product)
                product_key = self._history_key(product)
                # Capture the previous price first: the alert needs it, and
                # the record is overwritten below.
                old_price = self.price_history.get(product_key)
                if old_price is not None and self.check_price_drop(product, threshold):
                    self.send_price_alert(product, old_price)
                self.price_history[product_key] = product.price
                discount_info = f" (折扣：{product.discount}%)" if product.discount else ""
                print(f"{product.website}: {product.name} - {product.currency} {product.price:.2f}{discount_info}")
            # Pause between products to avoid hammering the sites.
            time.sleep(self.request_delay)

    def run_scheduled_monitoring(self):
        """Run one pass immediately, then one every hour; never returns."""
        print("价格监控系统启动...")
        schedule.every().hour.do(self.monitor_products)
        self.monitor_products()
        # Poll the scheduler once a minute.
        while True:
            schedule.run_pending()
            time.sleep(60)

# 配置文件示例 (config.json)
"""
{
  "email": {
    "enabled": false,
    "smtp_server": "smtp.gmail.com",
    "smtp_port": 587,
    "username": "your_email@gmail.com",
    "password": "your_password",
    "from": "your_email@gmail.com",
    "to": ["recipient@example.com"]
  },
  "taobao_cookie": "your_taobao_cookie_here",
  "products": [
    {
      "name": "示例商品 1",
      "url": "https://www.amazon.com/dp/B08N5WRWNW",
      "website": "amazon",
      "threshold": 10.0
    },
    {
      "name": "示例商品 2",
      "url": "https://item.taobao.com/item.htm?id=123456789",
      "website": "taobao",
      "threshold": 15.0
    }
  ]
}
"""

# Usage example
if __name__ == "__main__":
    # Build the monitor from the JSON configuration and run a single pass.
    monitor = PriceMonitor(config_file='config.json')
    monitor.monitor_products()
    # To keep monitoring on the hourly schedule instead, call:
    # monitor.run_scheduled_monitoring()

Python 网页解析库 BeautifulSoup4 使用指南

一、库的简介：实际生活中的作用

二、安装 BeautifulSoup4

三、基本用法

1. 基础解析与导航

更多推荐文章

相关免费在线工具

2. 标签查找方法

3. CSS 选择器

4. 数据提取与处理

四、高级用法

1. 处理动态加载内容

2. 构建网页爬虫框架

3. 智能内容提取

五、实际应用场景

1. 电商价格监控系统

更多推荐文章

相关免费在线工具

Python 网页解析库 BeautifulSoup4 使用指南

一、库的简介：实际生活中的作用

二、安装 BeautifulSoup4

三、基本用法

1. 基础解析与导航

微信扫一扫，关注极客日志

更多推荐文章

相关免费在线工具

2. 标签查找方法

3. CSS 选择器

4. 数据提取与处理

四、高级用法

1. 处理动态加载内容

2. 构建网页爬虫框架

3. 智能内容提取

五、实际应用场景

1. 电商价格监控系统

微信扫一扫，关注极客日志

更多推荐文章

相关免费在线工具