import requests
import json
import time
import os
from fake_useragent import UserAgent  # random User-Agent strings per session
from jsonpath import jsonpath  # JSONPath queries over the API's JSON payloads
import pandas as pd
from dotenv import load_dotenv

# Load a local .env file into the process environment before the spider
# class reads CTRIP_COOKIE from it (keeps credentials out of source control).
load_dotenv()
class CtripHotelSpider:
    """Scrape hotel listings from Ctrip's POI search API.

    Typical usage:
        spider = CtripHotelSpider()
        spider.crawl_hotels(city="北京", max_page=3)
        spider.save_to_excel()
    """

    # Minimal name -> Ctrip city-code table (extend as needed).
    # Hoisted to a class constant so it is built once, not per lookup call.
    CITY_CODE_MAP = {
        "北京": 1,
        "上海": 2,
        "广州": 3,
        "深圳": 4,
        "杭州": 5,
        "成都": 6,
    }

    def __init__(self):
        """Initialize spider configuration: headers, timing, and result state."""
        self.ua = UserAgent()
        # One random UA per spider instance; the cookie comes from the
        # environment (.env) so credentials never live in the source file.
        self.headers = {
            "User-Agent": self.ua.random,
            "Referer": "https://hotels.ctrip.com/",
            "Origin": "https://hotels.ctrip.com",
            "Accept": "application/json, text/plain, */*",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Connection": "keep-alive",
            "Cookie": os.getenv("CTRIP_COOKIE", ""),
            "Content-Type": "application/json;charset=UTF-8",
        }
        self.timeout = 15          # seconds per HTTP request
        self.delay = 3             # polite pause before each request, seconds
        self.all_hotel_data = []   # parsed rows accumulated across pages/runs
        self.api_url = "https://hotels.ctrip.com/hotel/api/search/poiList"

    def build_request_params(self, city_code="北京", check_in="2026-01-20", check_out="2026-01-22", page=1):
        """
        Build the query parameters for one search-API request.

        :param city_code: city name or code (e.g. "北京"; unknown names map to 1)
        :param check_in: check-in date, YYYY-MM-DD
        :param check_out: check-out date, YYYY-MM-DD
        :param page: 1-based page number
        :return: dict of request parameters
        """
        # Single timestamp so "callback" and "_" always agree (the original
        # sampled time.time() twice and could differ by a millisecond).
        now_ms = int(time.time() * 1000)
        return {
            "cityId": self.get_city_code(city_code),
            "checkIn": check_in,
            "checkOut": check_out,
            "pageNum": page,
            "pageSize": 20,
            "sortType": "9",        # sort mode expected by the endpoint
            "allianceid": "4902",
            "sid": "29155245",
            # JSONP callback name + cache-buster, mimicking the site's own XHRs.
            "callback": "jQuery1830" + str(now_ms),
            "_": now_ms,
        }

    def get_city_code(self, city_name):
        """
        Map a city name to its Ctrip city code.

        :param city_name: city name (Chinese)
        :return: city code; unknown cities fall back to 1 (北京)
        """
        return self.CITY_CODE_MAP.get(city_name, 1)

    @staticmethod
    def _first(obj, path, default=None):
        """Return the first jsonpath match in obj, or default when absent.

        jsonpath() returns False on no match, so a single call here replaces
        the original pattern of evaluating every path twice.
        """
        matches = jsonpath(obj, path)
        return matches[0] if matches else default

    def get_hotel_data(self, params):
        """
        Fetch one page of hotel data from the search API.

        Sleeps self.delay seconds first (throttling), GETs the endpoint, and
        unwraps either a plain JSON body or a JSONP "callback(...)" envelope.

        :param params: query parameters from build_request_params
        :return: parsed JSON dict, or None on any request/parse failure
        """
        try:
            time.sleep(self.delay)  # throttle to reduce anti-bot blocking
            response = requests.get(
                url=self.api_url,
                headers=self.headers,
                params=params,
                timeout=self.timeout,
            )
            response.raise_for_status()
            text = response.text
            start = text.find("(")
            end = text.rfind(")")
            # Only strip the JSONP wrapper when a well-formed "name( ... )"
            # pair actually exists; the original sliced blindly and produced
            # garbage when the parentheses were missing or mismatched.
            if "jQuery" in text and -1 < start < end:
                return json.loads(text[start + 1:end])
            return response.json()
        except requests.exceptions.RequestException as e:
            print(f"请求接口失败:{str(e)}")
            return None
        except json.JSONDecodeError as e:
            print(f"JSON 解析失败:{str(e)}")
            return None

    def parse_hotel_data(self, json_data):
        """
        Parse one page of API output into structured hotel records.

        :param json_data: JSON dict returned by get_hotel_data (may be None)
        :return: list of per-hotel dicts; [] when the payload is absent/invalid
        """
        if not json_data or json_data.get("code") != 200:
            print("接口返回异常或无数据")
            return []
        hotels = self._first(json_data, "$..hotelList", default=[])
        hotel_list = []
        for hotel in hotels:
            try:
                hotel_list.append({
                    "酒店名称": self._first(hotel, "$.hotelName", "未知酒店"),
                    "酒店 ID": self._first(hotel, "$.hotelId", ""),
                    "最低价格 (元)": self._first(hotel, "$.lowPrice", 0),
                    "酒店评分": self._first(hotel, "$.hotelScore", 0.0),
                    "酒店星级": self._first(hotel, "$.starRating", "无星级"),
                    "房型": self._first(hotel, "$.roomTypeName", "未知房型"),
                    "地址": self._first(hotel, "$.address", "未知地址"),
                    "距离商圈 (km)": self._first(hotel, "$.distance", "未知距离"),
                })
            except Exception as e:
                # One malformed record must not abort the whole page.
                print(f"解析单条酒店数据失败:{str(e)}")
                continue
        return hotel_list

    def crawl_hotels(self, city="北京", check_in="2026-01-20", check_out="2026-01-22", max_page=3):
        """
        Crawl up to max_page pages of hotels for one city and date range.

        Results are appended to self.all_hotel_data (NOTE: repeated calls on
        the same instance accumulate; they do not reset previous results).

        :param city: city name
        :param check_in: check-in date, YYYY-MM-DD
        :param check_out: check-out date, YYYY-MM-DD
        :param max_page: maximum number of pages to fetch
        :return: self.all_hotel_data (all rows gathered so far)
        """
        print(f"开始抓取{city}市{check_in}至{check_out}的酒店数据,共抓取{max_page}页...")
        for page in range(1, max_page + 1):
            print(f"正在抓取第{page}页...")
            params = self.build_request_params(city, check_in, check_out, page)
            json_data = self.get_hotel_data(params)
            page_hotels = self.parse_hotel_data(json_data)
            if not page_hotels:
                # An empty page means we've run past the last page (or the
                # request failed) — no point fetching further pages.
                print(f"第{page}页无数据,停止抓取")
                break
            self.all_hotel_data.extend(page_hotels)
            print(f"第{page}页抓取完成,共{len(page_hotels)}家酒店")
        print(f"抓取完成!总计获取{len(self.all_hotel_data)}家酒店数据")
        return self.all_hotel_data

    def save_to_excel(self, file_path="ctrip_hotel_prices.xlsx"):
        """
        Save collected hotel data to an Excel file, sorted by lowest price.

        :param file_path: output .xlsx path
        """
        if not self.all_hotel_data:
            print("无数据可保存")
            return
        try:
            df = pd.DataFrame(self.all_hotel_data)
            df = df.sort_values(by="最低价格 (元)", ascending=True)
            df.to_excel(file_path, index=False, engine="openpyxl")
            print(f"数据已成功保存到:{file_path}")
        except Exception as e:
            print(f"保存 Excel 失败:{str(e)}")

    def print_sample_results(self, sample_num=5):
        """
        Print the first sample_num collected records as a table.

        :param sample_num: number of rows to show
        """
        if not self.all_hotel_data:
            print("无抓取结果")
            return
        print(f"\n===== 携程酒店价格抓取结果(前{sample_num}条) =====")
        df = pd.DataFrame(self.all_hotel_data[:sample_num])
        print(df.to_string(index=False))
if __name__ == "__main__":
    # Crawl three pages of Beijing hotels for a fixed two-night stay,
    # preview the first few rows, then persist everything to Excel.
    crawler = CtripHotelSpider()
    crawler.crawl_hotels(city="北京",
                         check_in="2026-01-20",
                         check_out="2026-01-22",
                         max_page=3)
    crawler.print_sample_results(sample_num=5)
    crawler.save_to_excel()