此自动化脚本可以从网页 URL 获取 HTML，并提供解析 HTML、从中提取数据的功能。
# Parse and Extract HTML
# pip install gazpacho
import gazpacho
# Walkthrough of the gazpacho API: download a page, parse it, then look up
# tags by name, by class and by attribute, and read their text.

# Extract HTML from URL
url = 'xxx'  # placeholder -- replace with the address of the page to fetch
html = gazpacho.get(url)
# Extract HTML with Headers
# NOTE: this issues a second request and overwrites the html fetched above;
# keep whichever of the two calls you actually need.
headers = {'User-Agent': 'Mozilla/5.0'}
html = gazpacho.get(url, headers=headers)
# Parse HTML
parse = gazpacho.Soup(html)
# Find single tags
# mode='first' always yields one Soup object (or None when nothing matches);
# the default mode may return a list when several tags match.
tag1 = parse.find('h1', mode='first')
tag2 = parse.find('span', mode='first')
# Find multiple tags
# gazpacho has no find_all(); mode='all' always returns a list (possibly empty).
tags1 = parse.find('p', mode='all')
tags2 = parse.find('a', mode='all')
# Find tags by class
# gazpacho does not understand CSS selectors such as '.class'; a tag name is
# required and the class is matched through attrs. 'div' is a placeholder tag.
tag = parse.find('div', attrs={'class': 'class'})
# Find tags by Attribute
# Tag names, attribute names and values must all be string literals.
tag = parse.find('div', attrs={'class': 'test'})
# Extract text from tags
# Guard against a missing match so the lookup yields None instead of raising.
heading = parse.find('h1', mode='first')
text = heading.text if heading is not None else None
paragraphs = parse.find('p', mode='all')
text = paragraphs[0].text if paragraphs else None