"""Extract follow-up links and article metadata from a fetched HTML page."""

import re
from urllib.parse import urljoin

from bs4 import BeautifulSoup

# Links worth following have an href shaped like
# ".../<digits>/<digits>/<word chars>/page.htm".
_ARTICLE_HREF_RE = re.compile(r"/\d+/\d+/\w+/page\.htm")


def _text_or_none(node):
    """Return ``node.get_text()``, or ``None`` when *node* is ``None``."""
    return None if node is None else node.get_text()


def get_new_urls(page_url, soup):
    """Collect the absolute URLs of article links found in *soup*.

    Args:
        page_url: URL of the page *soup* was parsed from; used as the base
            when resolving relative hrefs.
        soup: Parsed document (a ``BeautifulSoup`` object).

    Returns:
        A set of absolute URLs, one per distinct ``<a>`` href that matches
        the article pattern. Empty when nothing matches.
    """
    links = soup.find_all('a', href=_ARTICLE_HREF_RE)
    return {urljoin(page_url, link['href']) for link in links}


def get_new_data(page_url, soup):
    """Extract the article fields from *soup*.

    Args:
        page_url: URL of the page; stored verbatim under ``'url'``.
        soup: Parsed document (a ``BeautifulSoup`` object).

    Returns:
        An empty dict when the page has no ``<h1 class="arti-title">``.
        Otherwise a dict with the keys ``'title'``, ``'datetime'``,
        ``'visitcount'`` and ``'url'``. ``'datetime'`` and ``'visitcount'``
        are ``None`` when the corresponding ``<span>`` is absent, rather
        than raising ``AttributeError`` on the missing node.
    """
    title_node = soup.find('h1', class_='arti-title')
    if title_node is None:
        # No title node: treat the page as a non-article and report nothing.
        return {}

    return {
        'title': title_node.get_text(),
        'datetime': _text_or_none(soup.find('span', class_='arti-update')),
        'visitcount': _text_or_none(
            soup.find('span', class_='WP_VisitCount')
        ),
        'url': page_url,
    }


def parse(page_url, html_cont):
    """Parse one downloaded page into follow-up URLs and article data.

    Args:
        page_url: URL the content was fetched from.
        html_cont: Page markup, as ``bytes`` or ``str``.

    Returns:
        ``None`` when either argument is ``None``; otherwise a
        ``(new_urls, new_data)`` tuple as produced by ``get_new_urls`` and
        ``get_new_data``. Callers that unpack the result must handle the
        ``None`` case.
    """
    if page_url is None or html_cont is None:
        return None

    if isinstance(html_cont, (bytes, bytearray)):
        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
    else:
        # Already-decoded text: from_encoding would be ignored and only
        # trigger a Beautiful Soup warning, so leave it out.
        soup = BeautifulSoup(html_cont, 'html.parser')

    new_urls = get_new_urls(page_url, soup)
    new_data = get_new_data(page_url, soup)
    return new_urls, new_data