class JingdianmenpiaoSpider(scrapy.Spider):
    """Spider for Sanya sight-ticket listings on piao.qunar.com.

    Crawls paginated search-result pages, follows each sight's detail
    link, and yields JingdianmenpiaoItem objects.
    """

    name = 'jingdianmenpiaoSpider'
    # Fix: the original URL contained '®ion=' — the HTML entity '&reg'
    # had been rendered as the '®' character, breaking the 'region'
    # query parameter. Restored to the intended '&region='.
    spiderUrl = 'https://piao.qunar.com/ticket/list.htm?keyword=%E4%B8%89%E4%BA%9A&region=%E4%B8%89%E4%BA%9A&from=mpshouye_hotcity&page={}'
    # Multiple seed URLs may be supplied ';'-separated; here there is one.
    start_urls = spiderUrl.split(";")
    headers = {
        "cookie": "[REDACTED]"
    }

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
def start_requests(self):
plat = platform.system().lower()
if plat in ['linux', 'windows']:
connect = self.db_connect()
cursor = connect.cursor()
if self.table_exists(cursor, '2nn2j_jingdianmenpiao') == 1:
cursor.close()
connect.close()
self.temp_data()
return
pageNum = 2
for url in self.start_urls:
if '{}' in url:
for page in range(1, pageNum):
next_link = url.format(page)
yield scrapy.Request(url=next_link, headers=self.headers, callback=self.parse)
else:
yield scrapy.Request(url=url, callback=self.parse)
def parse(self, response):
_url = urlparse(self.spiderUrl)
self.protocol = _url.scheme
self.hostname = _url.netloc
list_items = response.css('div#search-list div[class~="sight_item"]')
for item in list_items:
fields = JingdianmenpiaoItem()
fields["laiyuan"] = self.remove_html(item.css('h3.sight_item_caption a.name::attr(href)').extract_first())
fields["jiage"] = self.remove_html(item.css('span.sight_item_price em::text').extract_first())
detailUrlRule = item.css('h3.sight_item_caption a.name::attr(href)').extract_first()
if self.protocol not in detailUrlRule and not detailUrlRule.startswith('//'):
detailUrlRule = self.protocol + '://' + self.hostname + detailUrlRule
elif detailUrlRule.startswith('//'):
detailUrlRule = self.protocol + ':' + detailUrlRule
fields["laiyuan"] = detailUrlRule
yield scrapy.Request(url=detailUrlRule, meta={'fields': fields}, headers=self.headers, callback=self.detail_parse)
def detail_parse(self, response):
fields = response.meta['fields']
try:
fields["biaoti"] = self.remove_html(response.css('''div.mp-description-detail div.mp-description-view span.mp-description-name::text''').extract_first())
fields["fengmian"] = self.remove_html(response.css('''div#mp-slider-content div.mp-description-image img::attr(src)''').extract_first())
fields["miaoshu"] = self.remove_html(response.css('''div.mp-description-onesentence::text''').extract_first())
fields["weizhi"] = self.remove_html(response.css('''span.mp-description-address::text''').extract_first())
fields["dianping"] = self.remove_html(response.css('''span#mp-description-commentscore''').extract_first())
fields["pinglun"] = self.remove_html(response.css('''span.mp-description-commentCount a::text''').extract_first())
fields["tese"] = self.remove_html(response.css('''div.mp-charact-intro div.mp-charact-desc''').extract_first())
fields["kaifangshijian"] = self.remove_html(response.css('''div.mp-charact-content div.mp-charact-desc''').extract_first())
except Exception as e:
pass
return fields
def remove_html(self, html):
if html is None:
return ''
pattern = re.compile(r'<[^>]+>', re.S)
return pattern.sub('', html).strip()
def db_connect(self):
type = self.settings.get('TYPE', 'mysql')
host = self.settings.get('HOST', 'localhost')
port = int(self.settings.get('PORT', 3306))
user = self.settings.get('USER', 'root')
password = self.settings.get('PASSWORD', '123456')
try:
database = self.databaseName
except:
database = self.settings.get('DATABASE', '')
if type == 'mysql':
connect = pymysql.connect(host=host, port=port, db=database, user=user, passwd=password, charset='utf8')
else:
connect = pymssql.connect(host=host, user=user, password=password, database=database)
return connect
def table_exists(self, cursor, table_name):
cursor.execute("show tables;")
tables = [cursor.fetchall()]
table_list = re.findall('(\'.*?\')', str(tables))
table_list = [re.sub("'", '', each) for each in table_list]
return 1 if table_name in table_list else 0