基于python实现垂直爬虫系统的方法详解-CFANZ编程社区

这篇文章主要为大家详细介绍了用 Python 实现垂直爬虫系统的方法，文中的示例代码介绍得非常详细，具有一定的参考价值，感兴趣的小伙伴们可以参考一下，希望能够给你带来帮助。Python 编程学习资料可点击免费领取。

html_downloader

from urllib import request

def download(url):
    """Fetch *url* and return the raw response body as bytes.

    Args:
        url: Address to request. ``None`` is tolerated and yields ``None``.

    Returns:
        The response body, or ``None`` when *url* is ``None`` or the
        response status code is not 200.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` (network
            failures and HTTP error statuses); not handled here.
    """
    if url is None:
        return None

    # The context manager closes the connection even when read() raises.
    with request.urlopen(url) as response:
        if response.getcode() != 200:
            return None
        return response.read()

html_outputer

from html import escape

# Records queued by collect_data(); output_html() reads them as dicts.
data_list = []

# Keys rendered for each record, in table-column order.
_COLUMNS = ('url', 'title', 'datetime', 'visitcount')


def collect_data(data):
    """Queue one scraped record for the report.

    ``None`` and empty records are ignored so that pages from which
    nothing was extracted do not produce table rows.

    Args:
        data: Mapping holding the values for the report columns.
    """
    if not data:
        return
    data_list.append(data)


def output_html(output_path='output.html'):
    """Write every collected record to an HTML table.

    Each record becomes one ``<tr>`` with one ``<td>`` per column
    (url, title, datetime, visitcount). Missing keys render as empty
    cells and all values are HTML-escaped.

    Args:
        output_path: Destination file; defaults to ``output.html`` in
            the current working directory.
    """
    parts = [
        '<html>',
        # Declare the charset so the UTF-8 file renders correctly.
        '<head><meta charset="utf-8"></head>',
        '<body>',
        '<table>',
    ]
    for dataitem in data_list:
        cells = ''.join(
            '<td>%s</td>' % escape(str(dataitem.get(key, '')))
            for key in _COLUMNS
        )
        parts.append('<tr>%s</tr>' % cells)
    parts.extend(['</table>', '</body>', '</html>'])

    # Explicit encoding: the platform default may not be able to encode
    # non-ASCII titles. "with" closes the file even if the write fails.
    with open(output_path, 'w', encoding='utf-8') as fout:
        fout.write(''.join(parts))

html_parser

import re

from bs4 import BeautifulSoup

from urllib.parse import urljoin

# href pattern for links worth following: two numeric path segments,
# one word segment, then "page.htm".
_ARTICLE_LINK_RE = re.compile(r"/\d+/\d+/\w+/page\.htm")


def get_new_urls(page_url, soup):
    """Collect the absolute URLs of matching links found in *soup*.

    Args:
        page_url: URL of the page being parsed; relative hrefs are
            resolved against it.
        soup: Parsed document exposing ``find_all``.

    Returns:
        A set of absolute URLs (duplicates collapse naturally).
    """
    links = soup.find_all('a', href=_ARTICLE_LINK_RE)
    return {urljoin(page_url, link['href']) for link in links}

def get_new_data(page_url, soup):
    """Extract the title, date, and visit count from a parsed page.

    Args:
        page_url: URL of the page; stored under ``'url'`` in the result.
        soup: Parsed document exposing ``find``.

    Returns:
        An empty dict when the page has no ``h1.arti-title`` node.
        Otherwise a dict with keys ``title``, ``datetime``,
        ``visitcount`` and ``url``. A missing date or visit-count node
        yields an empty string instead of raising.
    """
    title_node = soup.find('h1', class_='arti-title')
    if title_node is None:
        return {}

    def text_of(tag, css_class):
        # find() returns None for absent nodes; guard before get_text().
        node = soup.find(tag, class_=css_class)
        return node.get_text() if node is not None else ''

    return {
        'title': title_node.get_text(),
        'datetime': text_of('span', 'arti-update'),
        'visitcount': text_of('span', 'WP_VisitCount'),
        'url': page_url,
    }

def parse(page_url, html_cont):
    """Parse one downloaded page into follow-up URLs and page data.

    Args:
        page_url: URL the content was fetched from.
        html_cont: Page markup, as bytes or str.

    Returns:
        ``None`` when either argument is ``None``; otherwise a
        ``(new_urls, new_data)`` tuple built by ``get_new_urls`` and
        ``get_new_data``.
    """
    if page_url is None or html_cont is None:
        return None

    # from_encoding only applies to bytes; Beautiful Soup warns and
    # ignores it when handed already-decoded text.
    options = {'from_encoding': 'utf-8'} if isinstance(html_cont, bytes) else {}
    soup = BeautifulSoup(html_cont, 'html.parser', **options)

    return get_new_urls(page_url, soup), get_new_data(page_url, soup)

spider_main

test_64

from bs4 import BeautifulSoup

import re

# Sample document used to demonstrate basic Beautiful Soup lookups.
html_doc = """

<html><head><title>The Dormouse's story</title></head>

<body>

<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were

<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,

<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and

<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;

and they lived at the bottom of a well.</p>

<p class="story">...</p>

"""

soup = BeautifulSoup(html_doc, 'html.parser')

# Every <a> element in the document.
print('获取所有链接')
links = soup.find_all('a')
for link in links:
    print(link.name, link['href'], link.get_text())

# A single <a> selected by its exact href.
print('获取lacie链接')
link_node = soup.find('a', href='http://example.com/lacie')
print(link_node.name, link_node['href'], link_node.get_text())

# First <a> whose href matches a regular expression.
print('正则匹配')
link_node = soup.find('a', href=re.compile(r'ill'))
print(link_node.name, link_node['href'], link_node.get_text())

# A <p> selected by CSS class.
print('获取P段落文字')
p_node = soup.find('p', class_='title')
print(p_node.name, p_node.get_text())

urls_manager

# URLs queued for crawling but not yet handed out.
new_urls = set()

# URLs already handed out by get_new_url().
old_urls = set()


def add_new_url(url):
    """Queue *url* unless it is None, already queued, or already handed out."""
    if url is None:
        return
    if url not in new_urls and url not in old_urls:
        new_urls.add(url)


def add_new_urls(urls):
    """Queue every URL in *urls*; ``None`` or an empty collection is a no-op.

    Any iterable is accepted, including ones that do not support ``len()``.
    """
    if not urls:
        return
    for url in urls:
        add_new_url(url)


def get_new_url():
    """Remove an arbitrary queued URL, record it as handed out, and return it.

    Raises:
        KeyError: If no URL is queued; check has_new_url() first.
    """
    new_url = new_urls.pop()
    old_urls.add(new_url)
    return new_url


def has_new_url():
    """Return True while at least one URL is still queued."""
    return bool(new_urls)

总结

本篇文章就到这里了，希望能够给你带来帮助。