程序演示如下:
 import requests
 import re
def parse_page(url):
    """Download a poem listing page, print every poem on it and return them.

    Args:
        url: Address of the listing page to fetch.

    Returns:
        A list of dicts, one per poem in page order, each with the keys
        'title', 'author', 'dynastie' and 'content'.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'
    }
    # A timeout keeps the crawler from hanging forever on a stalled connection.
    response = requests.get(url, headers=headers, timeout=10)
    poems = _extract_poems(response.text)
    for poem in poems:
        print(poem)
        print('='*40)
    return poems


def _extract_poems(text):
    """Extract the title, author, dynasty and body of each poem from page HTML."""
    # re.DOTALL lets '.' match newlines, so a pattern can span several lines.
    titles = re.findall(r'<div\sclass="cont">.*?<b>(.*?)</b>', text, re.DOTALL)
    authors = re.findall(r'<p class="source">.*?<a.*?>(.*?)</a>', text, re.DOTALL)
    # The dynasty sits in the second <a> under the source <p>, so the pattern
    # walks past one <a> before capturing.
    dynasties = re.findall(r'<p class="source".*?<a.*?>.*?<a.*?>(.*?)</a>', text, re.DOTALL)
    contents_tags = re.findall(r'<div class="contson".*?>(.*?)</div>', text, re.DOTALL)
    # Remove the tags left inside each body and trim surrounding whitespace.
    contents = [re.sub(r'<.*?>', '', tag).strip() for tag in contents_tags]
    # zip() stops at the shortest list, so a poem missing any field ends the pairing.
    return [
        {
            'title': title,
            'author': author,
            'dynastie': dynasty,
            'content': content,
        }
        for title, dynasty, author, content in zip(titles, dynasties, authors, contents)
    ]
 def main():
     """Crawl listing pages 1 through 4, handing each page URL to parse_page."""
     for page in range(1, 5):
         # The page number is the only part of the address that varies.
         url = 'https://www.gushiwen.cn/default_%s.aspx' % page
         parse_page(url)
# Start the crawl only when this file is run as a script, not when it is imported.
if __name__ == '__main__':
     main()
总结:用 re 正则表达式爬取网页,本质上是把 HTML 的所有标签和网页内容都当作纯文本,再通过模式匹配定位并提取所需的内容。
项目中用到的 zip 函数和元组解包,用法演示如下:
# The zip function: pairs up the items of several iterables position by position.
a = [1, 2]
b = [3, 4]
# zip() returns a lazy iterator in Python 3; list() materialises the pairs.
c = list(zip(a, b))
# c is now:
# [
#     (1, 3),
#     (2, 4),
# ]

# Tuple unpacking: each name receives the element at the same position.
value = (1, 2, 3)
a, b, c = value
# a == 1
# b == 2
# c == 3










