from DrissionPage import ChromiumPage
from pprint import pprint
import csv


def crawl_boss_zhipin():
    with open('boss.csv', mode='w', encoding='utf-8', newline='') as f:
        csv_fieldnames = [
            'job_title', 'company', 'company_size', 'industry', 'education',
            'experience', 'skills', 'benefits', 'salary', 'city', 'district',
            'business_area', 'longitude', 'latitude'
        ]
        csv_writer = csv.DictWriter(f, fieldnames=csv_fieldnames)
        csv_writer.writeheader()

        # Start listening for the 'joblist' API packets *before* navigating,
        # so the first page's response is not missed.
        dp = ChromiumPage()
        dp.listen.start('joblist')
        target_url = 'https://www.zhipin.com/web/geek/jobs?query=%E5%A4%A7%E6%95%B0%E6%8D%AE%E5%BC%80%E5%8F%91&city=101280600'
        dp.get(target_url)

        total_pages = 20
        for page in range(1, total_pages + 1):
            print(f'========== Collecting page {page} ==========')
            try:
                # Block until the next joblist packet arrives, then read its JSON body.
                resp = dp.listen.wait()
                json_data = resp.response.body
                job_list = json_data['zpData']['jobList']
                for job in job_list:
                    job_info = {
                        'job_title': job.get('jobName', ''),
                        'company': job.get('brandName', ''),
                        'company_size': job.get('brandScaleName', ''),
                        'industry': job.get('brandIndustry', ''),
                        'education': job.get('jobDegree', ''),
                        'experience': job.get('jobExperience', ''),
                        # Join list fields so each CSV cell holds readable text
                        # rather than a Python list literal.
                        'skills': ', '.join(job.get('skills') or []),
                        'benefits': ', '.join(job.get('welfareList') or []),
                        'salary': job.get('salaryDesc', ''),
                        'city': job.get('cityName', ''),
                        'district': job.get('areaDistrict', ''),
                        'business_area': job.get('businessDistrict', ''),
                        # 'gps' may be missing or null, so fall back to an empty dict.
                        'longitude': (job.get('gps') or {}).get('longitude', ''),
                        'latitude': (job.get('gps') or {}).get('latitude', ''),
                    }
                    csv_writer.writerow(job_info)
                    pprint(job_info)
                # Scroll to the bottom to trigger the site's infinite scroll,
                # which fires the next joblist request.
                dp.scroll.to_bottom()
            except Exception as e:
                print(f'Failed to collect page {page}: {e}')
                continue

    dp.quit()
    print(f'========== All {total_pages} pages collected; results saved to boss.csv ==========')


if __name__ == '__main__':
    crawl_boss_zhipin()
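In target_url, the query parameter is simply the URL-encoded search keyword (here 大数据开发, "big data development"), and city=101280600 is Boss Zhipin's city code for Shenzhen, which the sample output below confirms. A minimal sketch of building the URL for other searches (the build_search_url helper is hypothetical; the keyword and city code shown are illustrative):

from urllib.parse import quote

def build_search_url(keyword: str, city_code: str) -> str:
    # Percent-encode the keyword the same way target_url above encodes it.
    return f'https://www.zhipin.com/web/geek/jobs?query={quote(keyword)}&city={city_code}'

# build_search_url('大数据开发', '101280600') reproduces target_url
print(build_search_url('大数据开发', '101280600'))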
Sample output:

========== Collecting page 1 ==========
{'benefits': 'social insurance & housing fund, year-end bonus, regular health checkups',
 'business_area': 'Science Park',
 'city': 'Shenzhen',
 'company': 'A certain technology Co., Ltd.',
 'company_size': '500-999 employees',
 'district': 'Nanshan District',
 'education': "Bachelor's degree",
 'experience': '3-5 years',
 'industry': 'Big Data / AI',
 'job_title': 'Big Data Development Engineer',
 'latitude': '22.54xxx',
 'longitude': '113.94xxx',
 'salary': '25-35K · 13-month pay',
 'skills': 'Hadoop, Spark, Hive'}
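Because the list-valued fields (skills, benefits) are written as comma-joined strings, they can be split back into lists when the CSV is reloaded. A minimal stdlib-only sketch of reading the results back for further processing:

import csv

# Read boss.csv back and restore the joined list fields.
with open('boss.csv', encoding='utf-8', newline='') as f:
    for row in csv.DictReader(f):
        skills = [s for s in row['skills'].split(', ') if s]
        benefits = [b for b in row['benefits'].split(', ') if b]
        print(row['job_title'], row['salary'], skills, benefits)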