class JobScraper:
    """Scrape job postings from 51job search-result pages into a CSV file.

    The parser expects each result page to assign a JSON object to
    ``window.__SEARCH_RESULT__`` inside a ``<script>`` tag and reads the
    postings from that object's ``engine_jds`` entry. Rows are appended to
    ``CSV_PATH`` using the column order in ``FIELDNAMES``.
    """

    # Output file, shared by header creation and row appends.
    CSV_PATH = 'python_招聘数据.csv'

    # CSV column order; the keys produced by ``_build_row`` must match.
    FIELDNAMES = (
        '标题', '公司名字', '城市', '薪资', '招聘信息',
        '公司属性', '公司规模', '企业性质', '发布日期',
        '公司详情页', '招聘详情页',
    )

    # The dot is escaped so that only a literal ``window.`` prefix matches.
    _RESULT_PATTERN = r'window\.__SEARCH_RESULT__\s*=\s*(.*?)</script>'

    def __init__(self):
        """Set the search base URL and the HTTP request headers."""
        self.base_url = 'https://search.51job.com/list/'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36'
        }
        # Not read by any method of this class; kept so that code accessing
        # these attributes keeps working.
        self.csv_file = None
        self.writer = None

    def start(self, first_page=1, last_page=10):
        """Crawl result pages ``first_page`` through ``last_page`` inclusive.

        With the defaults, pages 1-10 are fetched. A page that fails is
        reported on stdout and skipped; the crawl then continues with the
        next page.

        Args:
            first_page: Number of the first result page to fetch.
            last_page: Number of the last result page to fetch (inclusive).
        """
        self._ensure_csv_header()
        for page in range(first_page, last_page + 1):
            print(f'正在爬取第 {page} 页...')
            try:
                self.fetch_page(page)
            except Exception as e:
                # Boundary handler: one bad page must not abort the crawl.
                print(f'第 {page} 页爬取失败:{e}')
            else:
                # Pause between successful requests.
                time.sleep(1)

    def fetch_page(self, page):
        """Download one result page, parse it and append its jobs to the CSV.

        Args:
            page: Result page number, substituted into the search URL.

        Raises:
            RuntimeError: If the HTTP status code is not 200.
        """
        url = f'{self.base_url}010000%252C020000%252C030200%252C040000%252C090200,000000,0000,00,9,99,python,2,{page}.html'
        response = requests.get(url=url, headers=self.headers, timeout=10)
        if response.status_code != 200:
            raise RuntimeError(f'请求失败,状态码:{response.status_code}')
        self.save_to_csv(self.parse_data(response.text))

    def parse_data(self, html_content):
        """Extract the list of job records embedded in a result page.

        Args:
            html_content: HTML text of a search-result page.

        Returns:
            The ``engine_jds`` list from the embedded JSON object, or an
            empty list when the marker is absent, the JSON is invalid, the
            payload is not an object, or ``engine_jds`` is not a list.
        """
        match = re.search(self._RESULT_PATTERN, html_content, re.S)
        if not match:
            return []
        # Tolerate a statement-terminating semicolon after the JSON literal.
        json_str = match.group(1).strip().rstrip(';').rstrip()
        try:
            json_data = json.loads(json_str)
        except json.JSONDecodeError:
            print('JSON 解析失败')
            return []
        if not isinstance(json_data, dict):
            return []
        jobs = json_data.get('engine_jds')
        # A null or non-list value would break the row loop downstream.
        return jobs if isinstance(jobs, list) else []

    def save_to_csv(self, jobs):
        """Append one CSV row per job record and print each saved title.

        Args:
            jobs: Iterable of job dicts as returned by ``parse_data``.
                Entries that are not dicts are skipped.
        """
        if not jobs:
            return
        # Guarantees a header even when this method is called on its own.
        self._ensure_csv_header()
        with open(self.CSV_PATH, mode='a', encoding='utf-8-sig', newline='') as f:
            writer = csv.DictWriter(f, fieldnames=self.FIELDNAMES)
            for job in jobs:
                if not isinstance(job, dict):
                    continue
                item = self._build_row(job)
                writer.writerow(item)
                print(f'已保存:{item["标题"]}')

    def _ensure_csv_header(self):
        """Create the CSV with its header row if it is missing or empty."""
        path = self.CSV_PATH
        if os.path.exists(path) and os.path.getsize(path) > 0:
            return
        with open(path, mode='w', encoding='utf-8-sig', newline='') as f:
            csv.DictWriter(f, fieldnames=self.FIELDNAMES).writeheader()

    @staticmethod
    def _build_row(job):
        """Map one raw job dict onto the CSV columns.

        Missing keys become empty strings. ``attribute_text`` may be a
        list, a single string or missing/``None``; its entries are
        stringified and joined with ``|``.
        """
        attributes = job.get('attribute_text') or []
        if isinstance(attributes, str):
            # Joining a bare string would interleave '|' between characters.
            attributes = [attributes]
        return {
            '标题': job.get('job_name', ''),
            '公司名字': job.get('company_name', ''),
            '城市': job.get('workarea_text', ''),
            '薪资': job.get('providesalary_text', ''),
            '招聘信息': '|'.join(str(attr) for attr in attributes),
            '公司属性': job.get('companyind_text', ''),
            '公司规模': job.get('companysize_text', ''),
            '企业性质': job.get('companytype_text', ''),
            '发布日期': job.get('issuedate', ''),
            '公司详情页': job.get('company_href', ''),
            '招聘详情页': job.get('job_href', ''),
        }
if __name__ == '__main__':
    # Script entry point: build a scraper and run its crawl.
    JobScraper().start()