Python 实现携程景区评论数据爬取与分析

介绍如何使用 Python 爬取携程景区的用户评论数据。通过浏览器开发者工具定位评论接口，使用 requests 库模拟请求获取 JSON 数据。随后解析返回结果，提取评论内容、评分、用户信息等字段，并进行统计分析（如评分分布、游客类型等）。最终将数据保存为 CSV 和 TXT 格式文件，便于后续处理。

剑仙发布于 2026/3/28更新于 2026/6/125 浏览

1. 前言

本文记录如何使用 Python 爬取携程景区的评论数据，并进行简单的分析与存储。

2. 爬虫实现

2.1 定位接口

打开携程网，找到某个景区点击跳转到详情页面。

文章配图

按 F12 打开开发者工具，切换到 Network（网络）面板，然后点击评论区的“下一页”，观察是否有新的网络请求产生。如果没有，再看页面地址栏是否发生变化。前者是动态更新（页面通过 XHR/fetch 调用后端接口获取数据，本例中是 POST 请求），后者是静态更新（浏览器通过 GET 请求重新加载 HTML 页面来更新数据）。

通过搜索评论内容定位到评论数据是通过调 getCommentCollapseList 接口返回的。

文章配图

现在我们已经知道数据来自哪个接口，接下来只需在本地模拟调用这个请求即可，这里我用 Python 的 requests 库实现。你可能会问：怎么知道调用这个请求需要携带哪些参数？

这里介绍一个快速又便捷的方法：在开发者工具中右键该请求，复制它的 cURL (bash) 命令，然后粘贴到 https://curlconverter.com/ 网站，它会自动生成完整的请求代码。

import requests
cookies = {
    'GUID': '09031069217559688465',
    'MKT_CKID': '1751274744072.9fx30.ncpi',
    '_RSG': 'Ce4EW5dni37P3spnPcTGtA',
    # ... (其他 Cookie 字段)
}
headers = {
    'accept': '*/*',
    'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'cache-control': 'no-cache',
    'content-type': 'application/json',
    'cookieorigin': 'https://you.ctrip.com',
    'origin': 'https://you.ctrip.com',
    'pragma': 'no-cache',
    : ,
    : ,
    : ,
    : ,
    : ,
    : ,
    : ,
    : ,
    : ,
    : ,
    : ,
    : ,
    : ,
    : ,
    : ,
}
params = {
    : ,
    : ,
}
json_data = {
    : {
        : ,
        : ,
        : ,
        : ,
        : ,
        : ,
        : ,
        : ,
        : ,
    },
    : {
        : ,
        : ,
        : ,
        : ,
        : ,
        : ,
        : ,
        : ,
        : [],
    },
}
response = requests.post(
    'https://m.ctrip.com/restapi/soa2/13444/json/getCommentCollapseList',
    params=params,
    cookies=cookies,
    headers=headers,
    json=json_data,
)
print(response.json())

相关免费在线工具

加密/解密文本
使用加密算法（如AES、TripleDES、Rabbit或RC4）加密和解密文本明文。在线工具，加密/解密文本在线工具，online
Gemini 图片去水印
基于开源反向 Alpha 混合算法去除 Gemini/Nano Banana 图片水印，支持批量处理与下载。在线工具，Gemini 图片去水印在线工具，online
curl 转代码
解析常见 curl 参数并生成 fetch、axios、PHP curl 或 Python requests 示例代码。在线工具，curl 转代码在线工具，online
Base64 字符串编码/解码
将字符串编码和解码为其 Base64 格式表示形式即可。在线工具，Base64 字符串编码/解码在线工具，online
Base64 文件转换器
将字符串、文件或图像转换为其 Base64 表示形式。在线工具，Base64 文件转换器在线工具，online
Markdown转HTML
将 Markdown（GFM）转为 HTML 片段，浏览器内 marked 解析；与 HTML转Markdown 互为补充。在线工具，Markdown转HTML在线工具，online

import requests
import json
import pandas as pd
from datetime import datetime

def crawlComment(timeout=10):
    """Request comment data from Ctrip's getCommentCollapseList endpoint.

    The cookies, headers, query parameters and JSON body are placeholders in
    this listing and must be filled in with the values copied from a real
    browser request before the function can run.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests waits indefinitely on an unresponsive server.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    cookies = { ... }  # cookie values omitted here; replace with a valid session before running
    headers = { ... }
    params = { ... }
    json_data = { ... }
    response = requests.post(
        'https://m.ctrip.com/restapi/soa2/13444/json/getCommentCollapseList',
        params=params,
        cookies=cookies,
        headers=headers,
        json=json_data,
        timeout=timeout,
    )
    # Fail loudly on an error status instead of trying to parse an error page.
    response.raise_for_status()
    return response.json()

def extract_comments_from_json(json_data):
    """Flatten the comment items of an API response into a list of dicts.

    Reads ``json_data['result']['items']`` and builds one record per item.
    Fields that are missing *or* explicitly ``null`` in the JSON fall back to
    a neutral value, because ``dict.get(key, default)`` only applies the
    default when the key is absent, not when its value is ``None``.

    Args:
        json_data: Decoded JSON response (expected to be a dict).

    Returns:
        A list of comment dicts; empty when the response holds no items.
    """
    result = json_data.get('result') if isinstance(json_data, dict) else None
    items = result.get('items') if isinstance(result, dict) else None
    if not items:
        return []

    comments = []
    for item in items:
        # ``or`` also covers a JSON null, which .get(key, default) would pass through.
        user_info = item.get('userInfo') or {}
        comment_info = {
            'comment_id': item.get('commentId'),
            'user_nick': user_info.get('userNick', ''),
            'user_member': user_info.get('userMember', ''),
            # Numeric fields are summed by analyze_comments_data; keep them non-null.
            'score': item.get('score') or 0,
            'content': item.get('content') or '',
            'publish_time': format_timestamp(item.get('publishTime', '')),
            'tourist_type': get_tourist_type_display(item.get('touristType', 0)),
            'ip_location': item.get('ipLocatedName', ''),
            'time_duration': item.get('timeDuration', ''),
            'useful_count': item.get('usefulCount') or 0,
            'reply_count': item.get('replyCount') or 0,
            'image_count': len(item.get('images') or []),
            'recommend_items': item.get('recommendItems') or [],
            'scores': get_detailed_scores(item.get('scores') or []),
        }
        comments.append(comment_info)
    return comments

def format_timestamp(timestamp_str):
    """Convert a ``/Date(<millis>[+-hhmm])/`` value to ``YYYY-MM-DD HH:MM:SS``.

    The millisecond epoch value is rendered in the machine's local time zone;
    the optional UTC-offset suffix is discarded.

    Args:
        timestamp_str: The raw timestamp value from the API.

    Returns:
        ``''`` for a falsy input, the formatted local time when the value is a
        well-formed ``/Date(...)/`` string, and the input unchanged otherwise
        (non-string values, other formats, or unparsable payloads).
    """
    if not timestamp_str:
        return ''
    if not isinstance(timestamp_str, str):
        return timestamp_str

    prefix, suffix = '/Date(', ')/'
    if not (timestamp_str.startswith(prefix) and timestamp_str.endswith(suffix)):
        return timestamp_str

    payload = timestamp_str[len(prefix):-len(suffix)]
    # Drop a trailing "+hhmm" / "-hhmm" offset. A sign at index 0 belongs to
    # the millisecond value itself (dates before 1970), so it is kept.
    sign_pos = max(payload.rfind('+'), payload.rfind('-'))
    if sign_pos > 0:
        payload = payload[:sign_pos]

    try:
        millis = int(payload)
        return datetime.fromtimestamp(millis / 1000).strftime('%Y-%m-%d %H:%M:%S')
    except (ValueError, OverflowError, OSError):
        # Unparsable or out-of-range value: hand back the original text.
        return timestamp_str

def get_tourist_type_display(tourist_type):
    """Translate a tourist-type code into its display label.

    Codes 0 through 5 map to fixed labels; any other value yields "其他".
    """
    labels = dict(enumerate((
        "个人游",
        "情侣夫妻",
        "家庭亲子",
        "朋友出游",
        "商务出差",
        "独自旅行",
    )))
    if tourist_type in labels:
        return labels[tourist_type]
    return "其他"

def get_detailed_scores(scores_list):
    """Collapse a list of score entries into a ``{name: score}`` dict.

    Entries without a non-empty ``name`` are skipped; a missing ``score``
    counts as 0. When a name repeats, the last entry's score is kept.
    """
    return {
        entry.get('name', ''): entry.get('score', 0)
        for entry in scores_list
        if entry.get('name', '')
    }

def analyze_comments_data(comments):
    """Compute summary statistics over a list of extracted comment dicts.

    Returns an empty dict for an empty input. Otherwise returns the comment
    count, the mean score and mean content length (both rounded to two
    decimals), per-label counts for integer score, tourist type and IP
    location (empty locations are counted as '未知'), and the totals of
    images and useful votes.
    """
    if not comments:
        return {}

    def tally(labels):
        # Count occurrences of each label, keeping first-seen order.
        counts = {}
        for label in labels:
            counts[label] = counts.get(label, 0) + 1
        return counts

    count = len(comments)
    mean_score = sum(entry['score'] for entry in comments) / count
    by_score = tally(f"{int(entry['score'])}分" for entry in comments)
    by_tourist_type = tally(entry['tourist_type'] for entry in comments)
    by_location = tally(entry['ip_location'] or '未知' for entry in comments)
    mean_length = sum(len(entry['content']) for entry in comments) / count

    return {
        'total_comments': count,
        'average_score': round(mean_score, 2),
        'score_distribution': by_score,
        'tourist_type_distribution': by_tourist_type,
        'location_distribution': by_location,
        'average_content_length': round(mean_length, 2),
        'total_images': sum(entry['image_count'] for entry in comments),
        'total_useful_votes': sum(entry['useful_count'] for entry in comments),
    }

def save_comments_to_files(comments, analysis, base_filename='harbin_ice_world_comments'):
    """Write the comments to ``<base>.csv`` and a readable report to ``<base>.txt``.

    The CSV holds one row per comment (UTF-8 with BOM). The text report starts
    with the summary figures and distributions from ``analysis`` and then
    lists every comment in full.

    Args:
        comments: List of comment dicts as produced by the extraction step.
        analysis: Statistics dict with the keys read below.
        base_filename: Path prefix shared by both output files.

    Returns:
        A ``(csv_filename, txt_filename)`` tuple.
    """
    csv_filename = f"{base_filename}.csv"
    txt_filename = f"{base_filename}.txt"

    pd.DataFrame(comments).to_csv(csv_filename, index=False, encoding='utf-8-sig')

    with open(txt_filename, 'w', encoding='utf-8') as report:
        write = report.write

        def write_distribution(heading, key):
            # One heading line followed by one " label: count" line per entry.
            write(heading)
            for label, count in analysis[key].items():
                write(f" {label}: {count} 条\n")

        # Header and overall figures.
        write("哈尔滨冰雪大世界用户评论分析\n")
        write("=" * 60 + "\n\n")
        write("数据统计:\n")
        write(f"总评论数：{analysis['total_comments']}\n")
        write(f"平均评分：{analysis['average_score']}\n")
        write(f"平均评论长度：{analysis['average_content_length']} 字符\n")
        write(f"总图片数：{analysis['total_images']}\n")
        write(f"总有用投票数：{analysis['total_useful_votes']}\n\n")

        # Distributions.
        write_distribution("评分分布:\n", 'score_distribution')
        write_distribution("\n游客类型分布:\n", 'tourist_type_distribution')
        write_distribution("\n地区分布:\n", 'location_distribution')

        # Per-comment details.
        write("\n" + "=" * 60 + "\n")
        write("详细评论内容:\n\n")
        for index, entry in enumerate(comments, 1):
            write(f"评论 #{index}\n")
            write(f"用户：{entry['user_nick']} ({entry['user_member']})\n")
            write(f"评分：{entry['score']}分\n")
            write(f"时间：{entry['publish_time']}\n")
            write(f"游客类型：{entry['tourist_type']}\n")
            write(f"地点：{entry['ip_location']}\n")
            write(f"游玩时长：{entry['time_duration']}\n")
            write(f"有用投票：{entry['useful_count']} | 回复：{entry['reply_count']} | 图片：{entry['image_count']}\n")
            sub_scores = entry['scores']
            if sub_scores:
                write("详细评分：")
                write(", ".join(f"{name}:{value}分" for name, value in sub_scores.items()) + "\n")
            recommended = entry['recommend_items']
            if recommended:
                write(f"推荐项目：{', '.join(recommended)}\n")
            write(f"评论内容:\n{entry['content']}\n")
            write("-" * 80 + "\n\n")

    return csv_filename, txt_filename

def process_ctrip_comments(json_data):
    """Extract, analyse, print and save the comments of one API response.

    Args:
        json_data: Decoded JSON response from the comment endpoint.

    Returns:
        A ``(comments, analysis)`` tuple. When no comments are found this is
        ``([], {})`` so that callers can always unpack the result; nothing is
        written to disk in that case.
    """
    print("开始处理携程评论数据...")
    comments = extract_comments_from_json(json_data)
    print(f"成功提取 {len(comments)} 条评论")
    if not comments:
        print("未找到评论数据")
        # Same shape as the success path; a bare ``return`` would make
        # ``comments, analysis = process_ctrip_comments(...)`` raise TypeError.
        return [], {}

    analysis = analyze_comments_data(comments)
    print("\n数据统计:")
    print(f"总评论数：{analysis['total_comments']}")
    print(f"平均评分：{analysis['average_score']}")
    print(f"平均评论长度：{analysis['average_content_length']} 字符")
    print("\n评分分布:")
    for score_range, count in analysis['score_distribution'].items():
        print(f" {score_range}: {count} 条")

    csv_file, txt_file = save_comments_to_files(comments, analysis)
    print("\n文件保存完成:")
    print(f"- 完整数据 CSV: {csv_file}")
    print(f"- 详细分析报告：{txt_file}")
    return comments, analysis

if __name__ == "__main__":
    # Fetch the comments, then extract, analyse and save them.
    json_data = crawlComment()
    result = process_ctrip_comments(json_data)
    # process_ctrip_comments may return None when no comments were found;
    # fall back to empty values instead of failing on the tuple unpacking.
    comments, analysis = result or ([], {})

Python 实现携程景区评论数据爬取与分析

1. 前言

2. 爬虫实现

2.1 定位接口

更多推荐文章

相关免费在线工具

3. 总结

更多推荐文章

相关免费在线工具

Python 实现携程景区评论数据爬取与分析

1. 前言

2. 爬虫实现

2.1 定位接口

微信扫一扫，关注极客日志

更多推荐文章

相关免费在线工具

3. 总结

微信扫一扫，关注极客日志

更多推荐文章

相关免费在线工具