import requests
import re
import json
import time
import random
import pandas as pd
from fake_useragent import UserAgent
from jsonpath import jsonpath
from urllib.parse import urlparse, parse_qs
class FliggyHotelPackageCrawler:
    def __init__(self):
        self.ua = UserAgent()
        self.base_headers = {
            'User-Agent': self.ua.random,
            'Accept': 'application/json, text/plain, */*',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'Referer': 'https://www.fliggy.com/',
            'Origin': 'https://www.fliggy.com',
            'Cookie': '',  # fill in a logged-in cookie here if the API requires one
            'X-Requested-With': 'XMLHttpRequest'
        }
        self.all_package_data = []
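
    # Note: self.ua.random is sampled once in __init__, so every request reuses
    # the same User-Agent string. To rotate it per request, refresh the header
    # before each call, e.g.:
    #   self.base_headers['User-Agent'] = self.ua.random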
    def extract_item_id(self, package_url):
        """
        Extract the item ID (itemId) from a package URL.
        :param package_url: Fliggy hotel package URL
        :return: itemId string, or None
        """
        try:
            parsed_url = urlparse(package_url)
            query_params = parse_qs(parsed_url.query)
            # Form 1: itemId carried in the "t" query parameter
            if 't' in query_params:
                return query_params['t'][0].replace('t_', '')
            # Form 2: itemId embedded as a "t_<id>" path segment
            path_parts = parsed_url.path.split('/')
            for part in path_parts:
                if part.startswith('t_'):
                    # strip the "t_" prefix and any ".htm" suffix
                    return part.replace('t_', '').split('.')[0]
            # Form 3: fall back to a regex over the whole URL
            id_match = re.search(r't_(\d+)', package_url)
            return id_match.group(1) if id_match else None
        except Exception as e:
            print(f"Failed to extract itemId: {e}")
            return None
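
    # A quick sketch of the URL shapes extract_item_id handles. The path form
    # matches the sample URLs in __main__; the query-string form is a
    # hypothetical illustration of the same itemId:
    #   extract_item_id("https://www.fliggy.com/hotel/t_10020028.htm")      # -> "10020028"
    #   extract_item_id("https://www.fliggy.com/hotel/detail?t=t_10020028") # -> "10020028"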
    def get_package_detail(self, item_id):
        """
        Fetch package detail data from the Fliggy AJAX endpoint.
        :param item_id: item ID
        :return: package data dict, or None
        """
        try:
            api_url = f"https://www.fliggy.com/hotel/api/item/detail?itemId={item_id}&timestamp={int(time.time() * 1000)}"
            # randomized delay to avoid hammering the endpoint
            time.sleep(random.uniform(3, 6))
            response = requests.get(
                url=api_url,
                headers=self.base_headers,
                timeout=20
            )
            response.raise_for_status()
            response.encoding = 'utf-8'
            json_data = response.json()
            if json_data.get('code') != 0:
                print(f"API returned an error: {json_data.get('msg', 'unknown error')}")
                return None
            package_data = {}
            hotel_name_matches = jsonpath(json_data, '$..hotelName')
            hotel_name = hotel_name_matches[0] if hotel_name_matches else 'Unknown hotel'
            package_data['hotel_name'] = hotel_name
            package_info_matches = jsonpath(json_data, '$..packageInfo')
            package_info = package_info_matches[0] if package_info_matches else {}
            package_data['package_name'] = package_info.get('name', 'Unknown package')
            package_data['package_price'] = float(package_info.get('price', 0)) if package_info.get('price') else 0.0
            package_data['original_price'] = float(package_info.get('originalPrice', 0)) if package_info.get('originalPrice') else package_data['package_price']
            package_data['room_type'] = package_info.get('roomType', 'Unknown room type')
            rights_matches = jsonpath(json_data, '$..rights')
            rights_list = rights_matches[0] if rights_matches else []
            package_data['rights'] = ' | '.join(rights_list) if rights_list else 'No benefits'
            package_data['valid_start'] = package_info.get('validStart', 'Unknown start time')
            package_data['valid_end'] = package_info.get('validEnd', 'Unknown end time')
            package_data['package_url'] = f"https://www.fliggy.com/hotel/t_{item_id}.htm"
            hotel_url_matches = jsonpath(json_data, '$..hotelUrl')
            package_data['hotel_url'] = hotel_url_matches[0] if hotel_url_matches else ''
            print(f"Fetched package data for [{hotel_name} - {package_data['package_name']}]")
            return package_data
        except requests.exceptions.RequestException as e:
            print(f"API request failed: {e}")
            return None
        except Exception as e:
            print(f"Failed to parse data: {e}")
            return None
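
    # Note: jsonpath() from the `jsonpath` package returns False rather than an
    # empty list when an expression matches nothing, which is why each [0]
    # access above is guarded by a truthiness check first. Illustration:
    #   jsonpath({'a': 1}, '$..a')        # -> [1]
    #   jsonpath({'a': 1}, '$..missing')  # -> False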
    def batch_crawl(self, package_url_list):
        """
        Crawl multiple hotel packages in sequence.
        :param package_url_list: list of package URLs
        """
        for url in package_url_list:
            item_id = self.extract_item_id(url)
            if not item_id:
                print(f"Could not extract itemId from {url}, skipping")
                continue
            package_data = self.get_package_detail(item_id)
            if package_data:
                self.all_package_data.append(package_data)
    def save_data(self, save_path='fliggy_hotel_package.csv'):
        """
        Save package data to a CSV file.
        :param save_path: output path
        :return: the deduplicated DataFrame, or None when there is no data
        """
        if not self.all_package_data:
            print("No valid package data to save")
            return None
        df = pd.DataFrame(self.all_package_data)
        df = df.drop_duplicates(subset=['hotel_name', 'package_name'], keep='last')
        df['package_price'] = df['package_price'].round(2)
        df['original_price'] = df['original_price'].round(2)
        df.to_csv(save_path, index=False, encoding='utf-8-sig')
        print(f"Hotel package data saved to: {save_path}")
        return df
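
    # Design note: utf-8-sig prepends a BOM so that Excel auto-detects the
    # encoding and renders Chinese hotel and package names correctly when the
    # CSV is opened directly.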
if __name__ == '__main__':
    crawler = FliggyHotelPackageCrawler()
    target_packages = [
        "https://www.fliggy.com/hotel/t_10020028.htm",
        "https://www.fliggy.com/hotel/t_10030045.htm",
        "https://www.fliggy.com/hotel/t_10040067.htm"
    ]
    crawler.batch_crawl(target_packages)
    result_df = crawler.save_data()
    if result_df is not None:
        print("\n=== Fliggy hotel package crawl results ===")
        print(result_df.to_string(index=False))
# Optional hardening: retry transient failures with tenacity and route the
# request through a proxy. Replace IP:PORT with a real proxy before use; the
# endpoint and headers mirror get_package_detail above.
from tenacity import retry, stop_after_attempt, wait_random_exponential

PROXIES = {
    'http': 'http://IP:PORT',
    'https': 'https://IP:PORT'
}

@retry(stop=stop_after_attempt(3), wait=wait_random_exponential(multiplier=1, max=10))
def fetch_detail_with_retry(crawler, item_id):
    api_url = f"https://www.fliggy.com/hotel/api/item/detail?itemId={item_id}&timestamp={int(time.time() * 1000)}"
    response = requests.get(api_url, headers=crawler.base_headers, proxies=PROXIES, timeout=20)
    response.raise_for_status()
    return response.json()
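
# Usage sketch (hypothetical itemId):
#   crawler = FliggyHotelPackageCrawler()
#   detail_json = fetch_detail_with_retry(crawler, "10020028")
# tenacity re-invokes the function up to 3 times, backing off randomly between
# attempts, whenever an exception is raised inside it.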