Python 爬虫副业可行性分析与技术入门指南 | 极客日志

Python 爬虫副业可行性分析与技术入门指南 | 极客日志

import requests
from bs4 import BeautifulSoup

# Minimal requests + BeautifulSoup example: download one page and print the
# text of every <h1> element found in it.
url = 'https://example.com'
headers = {
    # Send a browser-style User-Agent header with the request.
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
}

try:
    # The timeout keeps the script from hanging on an unresponsive server.
    response = requests.get(url, headers=headers, timeout=10)
    # Turn 4xx/5xx status codes into requests.HTTPError.
    response.raise_for_status()
except requests.RequestException as e:
    # Only network/HTTP failures are reported here. Errors raised by the
    # parsing code below are no longer swallowed, so real bugs surface with
    # a traceback instead of a generic "Error: ..." line.
    print(f"Error: {e}")
else:
    # Decode the body with the encoding guessed from its content rather than
    # the one derived from the HTTP headers.
    response.encoding = response.apparent_encoding

    soup = BeautifulSoup(response.text, 'html.parser')
    # Example: extract every <h1> heading.
    titles = soup.find_all('h1')
    for title in titles:
        print(title.get_text())

# spiders/example_spider.py
import scrapy

class ExampleSpider(scrapy.Spider):
    """Spider that emits one item per page and follows "next page" links."""

    name = 'example'
    allowed_domains = ['example.com']
    start_urls = ['http://example.com/']

    def parse(self, response):
        """Yield the page's item, then a request for the next page if any.

        The item holds the text of the first ``h1`` matched by the CSS
        selector (``None`` when nothing matches) and the URL of the
        response. When an ``a.next-page`` link with a non-empty ``href`` is
        present, a follow-up request handled by this same method is yielded.
        """
        # Extract the data for the current page.
        heading = response.css('h1::text').get()
        yield {'title': heading, 'url': response.url}

        # Recurse into the next page; stop when no link is available.
        next_href = response.css('a.next-page::attr(href)').get()
        if not next_href:
            return
        yield response.follow(next_href, callback=self.parse)