基于python实现垂直爬虫系统的方法详解

基于python实现垂直爬虫系统的方法详解

这篇文章主要为大家详细介绍了python实现垂直爬虫系统的方法,文中示例代码介绍的非常详细,具有一定的参考价值,感兴趣的小伙伴们可以参考一下,希望能够给你带来帮助。

html_downloader

1

2

3

4

5

6

7

8

from urllib import request

def download(url):
    """Fetch *url* over HTTP and return the response body as bytes.

    Returns None when *url* is None or the server does not answer 200.
    Network errors (URLError etc.) propagate to the caller.
    """
    if url is None:
        return None
    # Context manager closes the connection even if read() raises;
    # the original leaked the response object.
    with request.urlopen(url) as response:
        if response.getcode() != 200:
            return None
        return response.read()

html_outputer

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

# Records collected during the crawl, rendered to disk by output_html().
data_list = []

def collect_data(data):
    """Store one parsed record (dict with url/title/datetime/visitcount keys).

    None records are ignored so a failed parse cannot break output_html().
    """
    if data is None:
        return
    data_list.append(data)

def output_html():
    """Render every collected record into output.html as an HTML table."""
    from html import escape  # local import: escape untrusted scraped text

    # 'with' + explicit encoding: the file is closed even if a write fails,
    # and non-ASCII titles no longer depend on the platform locale encoding.
    with open('output.html', 'w', encoding='utf-8') as fout:
        fout.write('<html>')
        fout.write('<body>')
        fout.write('<table>')
        for dataitem in data_list:
            fout.write('<tr>')
            # escape() prevents scraped values from injecting markup.
            fout.write('<td>%s</td>' % escape(str(dataitem['url'])))
            fout.write('<td>%s</td>' % escape(str(dataitem['title'])))
            fout.write('<td>%s</td>' % escape(str(dataitem['datetime'])))
            fout.write('<td>%s</td>' % escape(str(dataitem['visitcount'])))
            fout.write('</tr>')
        fout.write('</table>')
        fout.write('</body>')
        fout.write('</html>')

html_parser

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

import re

from bs4 import BeautifulSoup

from urllib.parse import urljoin

def get_new_urls(page_url, soup):
    """Collect absolute article URLs linked from the current page.

    Keeps only anchors whose href matches the site's article pattern
    (/<digits>/<digits>/<word>/page.htm) and resolves each relative
    href against *page_url*.
    """
    article_href = re.compile(r"/\d+/\d+/\w+/page\.htm")
    return {
        urljoin(page_url, anchor['href'])
        for anchor in soup.find_all('a', href=article_href)
    }

def get_new_data(page_url, soup):
    """Extract the article fields (title, datetime, visitcount, url) from a page.

    Returns an empty dict when the page carries no article title.  Optional
    fields that are missing become empty strings; the original raised
    AttributeError on pages lacking the update-time or visit-count span,
    even though the title node was already guarded.
    """
    res_data = {}
    title_node = soup.find('h1', class_='arti-title')
    if title_node is None:
        return res_data
    res_data['title'] = title_node.get_text()
    datetime_node = soup.find('span', class_='arti-update')
    res_data['datetime'] = datetime_node.get_text() if datetime_node else ''
    visitcount_node = soup.find('span', class_='WP_VisitCount')
    res_data['visitcount'] = visitcount_node.get_text() if visitcount_node else ''
    res_data['url'] = page_url
    return res_data

def parse(page_url, html_cont):
    """Parse downloaded HTML into (new_urls, article_data).

    Returns None when either argument is missing.
    """
    if page_url is None or html_cont is None:
        return
    # from_encoding applies when html_cont is bytes (as the downloader
    # returns); BeautifulSoup ignores it for str input.
    document = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
    return get_new_urls(page_url, document), get_new_data(page_url, document)

spider_main

test_64

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

from bs4 import BeautifulSoup

import re

# Fixture document for the BeautifulSoup API demo below
# (classic "Dormouse's story" sample from the bs4 docs).
html_doc = """

<html><head><title>The Dormouse's story</title></head>

<body>

<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were

<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,

<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and

<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;

and they lived at the bottom of a well.</p>

<p class="story">...</p>

"""

soup = BeautifulSoup(html_doc, 'html.parser')

# Demo 1: find_all — every <a> tag.  (Printed label is Chinese for
# "get all links".)
print('获取所有链接')

links = soup.find_all('a')

for link in links:

    print(link.name, link['href'], link.get_text())

# Demo 2: find by exact href value.  (Label: "get the lacie link".)
print('获取lacie链接')

link_node = soup.find('a', href='http://example.com/lacie')

print(link_node.name, link_node['href'], link_node.get_text())

# Demo 3: find by regex on href — matches the 'tillie' link via "ill".
# (Label: "regex match".)
print('正则匹配')

link_node = soup.find('a', href=re.compile(r'ill'))

print(link_node.name, link_node['href'], link_node.get_text())

# Demo 4: find by CSS class (class_ avoids the 'class' keyword clash).
# (Label: "get the P paragraph text".)
print('获取P段落文字')

p_node = soup.find('p', class_='title')

print(p_node.name, p_node.get_text())

urls_manager

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

# Crawl-frontier bookkeeping: URLs still waiting vs. URLs already handed out.
new_urls = set()
old_urls = set()

def add_new_url(url):
    """Queue *url* for crawling unless it is None or already known."""
    if url is None:
        return
    if url not in new_urls and url not in old_urls:
        new_urls.add(url)

def add_new_urls(urls):
    """Queue every URL in *urls*.

    Accepts None or any iterable; the original's ``len(urls) == 0`` check
    rejected len-less iterables (e.g. generators) with a TypeError.
    """
    if not urls:
        return
    for url in urls:
        add_new_url(url)

def get_new_url():
    """Pop an arbitrary pending URL, mark it visited, and return it.

    Raises KeyError when nothing is pending; call has_new_url() first.
    """
    new_url = new_urls.pop()
    old_urls.add(new_url)
    return new_url

def has_new_url():
    """Return True while at least one URL is pending."""
    return len(new_urls) != 0

总结

本篇文章就到这里了,希望能够给你带来帮助。

①3000多本Python电子书有
②Python开发环境安装教程有
③Python400集自学视频有
④软件开发常用词汇有
⑤Python学习路线图有
⑥项目源码案例分享有
如果你用得到的话可以直接拿走,在我的QQ技术交流群里(技术交流和资源共享,广告勿入)可以自助拿走,群号是895937462。

Read more

印度统治阶级锁死底层人的5大阳谋

印度统治阶级锁死底层人的5大阳谋

基于社会学和心理学视角: 1. 情感道德: 统治阶级通过塑造道德规范和情感价值观,引导底层人群的行为。例如,宣扬“勤劳致富”“忍耐美德”等观念,让底层人接受现状并自我约束。这种道德框架往往掩盖结构性不平等,使人们将个人困境归咎于自身而非系统。 2. 欲望控制: 通过消费主义和媒体宣传,统治阶级刺激底层人的物质与社会欲望(如名牌、地位),但同时设置经济壁垒,使这些欲望难以实现。底层人被困在追求“更好生活”的循环中,精力被分散,无法聚焦于挑战权力结构。 3. 情绪煽动: 利用恐惧、愤怒或民族主义等情绪,统治阶级可以通过媒体或公共事件转移底层人对社会问题的注意力。例如,制造外部敌人或内部对立(如阶层、种族矛盾),让底层人内耗而非联合反抗。 4. 暴利诱惑: 通过展示少数“成功案例”或快速致富的机会(如赌博、投机),诱导底层人追逐短期暴利。这种机制不仅让底层人陷入经济风险,还强化了对现有经济体系的依赖,削弱长期变革的可能性。 5. 权力震撼: 通过展示统治阶级的权力(

By Ne0inhk