import parsel
import requests
# Shared request headers — built once instead of being re-created inside
# every loop iteration as the original did.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36"
}


def fetch_html(session, url):
    """Download *url* with the shared headers and return the HTML text.

    Raises requests.HTTPError on a non-2xx response so failures are not
    silently parsed as empty pages.
    """
    response = session.get(url, headers=HEADERS)
    response.raise_for_status()
    return response.text


def parse_detail(html):
    """Parse one movie detail page and return a dict of its fields.

    Returned keys: title, director, genres (list of all genres, replacing
    the original's hard-coded first/second/third lookups), running_time,
    and synopsis.
    """
    selector = parsel.Selector(html)
    title = selector.xpath('//div[@id = "content"]/h1/span[1]/text()').get()
    director = selector.xpath('//span[@class = "attrs"]/a/text()').get()
    # getall() collects every genre in one pass; the original fetched only
    # the first three with separate positional XPath queries.
    genres = selector.xpath('//span[@property = "v:genre"]/text()').getall()
    running_time = selector.xpath('//span[@property = "v:runtime"]/text()').get()
    # Bug fix: the original called .get().strip() BEFORE the `or` fallback,
    # so a page without span.short crashed with AttributeError on None.
    # Apply the fallback first, then strip, with "" as a final default.
    synopsis = (
        selector.xpath('//span[@class = "short"]/span/text()').get()
        or selector.xpath('//div[@class = "indent"]/span/text()').get()
        or ""
    ).strip()
    return {
        "title": title,
        "director": director,
        "genres": genres,
        "running_time": running_time,
        "synopsis": synopsis,
    }


def main():
    """Walk the 10 Top250 list pages, follow each movie link, print its synopsis."""
    # A Session reuses the TCP connection across the ~260 requests.
    session = requests.Session()
    for page in range(0, 226, 25):  # start=0,25,...,225 -> 10 list pages
        list_url = f"https://movie.douban.com/top250?start={page}&filter="  # pagination
        list_selector = parsel.Selector(fetch_html(session, list_url))
        # Collect every detail-page URL on this list page.
        detail_links = list_selector.xpath(
            '//ol[@class = "grid_view"]/li//div[@class = "pic"]/a/@href'
        ).getall()
        for link in detail_links:
            movie = parse_detail(fetch_html(session, link))
            print(movie["synopsis"])


if __name__ == "__main__":
    main()
# Approach: first collect the detail-page URLs of the 250 movies from the
# list pages, then request each URL to extract the detailed information.
# The extraction here is deliberately incomplete — it only demonstrates the
# method; write additional XPath expressions for any other fields you need.
# For persisting the data, CSV or MySQL are the most convenient options.










