前言

嗨喽！大家好呀，这里是魔王~

环境使用:

Python 3.8
Pycharm

模块使用:

requests >>> pip install requests
parsel >>> pip install parsel

解析方式 css xpath re
requests selenium

如果安装python第三方模块:

win + R 输入 cmd 点击确定, 输入安装命令 pip install 模块名 (pip install requests) 回车
在pycharm中点击Terminal(终端) 输入安装命令

如何配置pycharm里面的python解释器?

选择file(文件) >>> setting(设置) >>> Project(项目) >>> python interpreter(python解释器)
点击齿轮, 选择add
添加python安装路径

Python相关学习方向, 应用方向:

网站开发
开发网站, 豆瓣美团 youtube
论坛官网后台信息管理系统…
爬虫程序: 可见即可爬 (百度文库内容格式是比较乱); 采集网页上面数据的程序; 公开的数据都没有问题 (个人信息、涉及版权[不能盈利]、涩情内容除外)
批量采集数据内容, 抢购商品脚本, 自动发送弹幕, 自动评论点赞脚本, 自动发邮件
数据分析
把获取下来的数据, 做成一些可视化图表, 可以清晰地看到数据走势、价格区间、客户人群分析…
人工智能 (可以找工作但是学历要求很高硕士相关专业)
人脸识别, 视频换脸, 小机器人, 语音识别, 语音助手…
自动化办公
适合会计财务文职一类工作平时处理数据比较多的
游戏开发
小游戏

做一个简单爬虫案例: 一点点前端知识

基本数据类型字符串使用定义
数据容器 list(列表) dict(字典)
for循环使用
requests的简单使用
解析方法的使用

import re

"""
# 什么样的数据才是字符串数据 字符串是什么样子的?
a = 'python'  # 单引号 双引号 三引号(也可以作为多行注释)
b = "hello"
print(c)
"""

"""
列表 [] 数据容器 存储数据内容
    列表取值 根据他索引位置提取内容
    列表切片
lis = [1, 2, 3, 4, 5, 6, 7, 8]
# 提取lis里面 元素 4 怎么取
print(lis[3])
print(lis[-5])
# 提取列表里面 1 2 3 4   顾头不顾尾
print(lis[0:4:1])  # 步长默认是 1
# 提取列表 1 3 5 7   1 2 3 4 5 7
print(lis[0:7:2])
# 提取列表 2 4 6
print(lis[1:6:2])
lis = ['1', '2', '3', '4', '5', '6', '7', '8']
# 如果说想要提取  1 2 3 4 5 6 7 8 都提取出来 一个一个提取
# 如果想要获取数据 1,2,3,4,5,6,7,8  列表转字符串
# for i in lis:
#     print(i)
print(str(lis))
string = ','.join(lis)  # 把列表 转成字符串
print(string)
print('1,2,3,4,5,6,7,8')
"""

# # 导入一个数据请求模块
# import requests   # requests 别人写好的代码 程序 可以直接拿过来使用
# # 导入解析模块
# import parsel
# # 导入文件操作模块
# import os
# import re
#
# # 代理ip结构
# # proxies_dict = {
# #     "http": "http://" + ip:端口,
# #     "https": "http://" + ip:端口,
# # }
#
#
# def get_proxies():
#     proxies_url = 'http://tiqu.pyhttp.taolop.com/getip?count=1&neek=15790&type=2&yys=0&port=1&sb=&mr=1&sep=0&ts=1&time=4'
#     json_data = requests.get(url=proxies_url).json()
#     # print(json_data)
#     proxies_dict = {
#         "http": "http://" + json_data['data'][0]['ip'] + ':' + str(json_data['data'][0]['port']),
#         "https": "http://" + json_data['data'][0]['ip'] + ':' + str(json_data['data'][0]['port']),
#     }
#     return proxies_dict
#
#
# proxies_dict = get_proxies()
# list_url = 'https://www.qbiqu.com/0_1/'  # 小说目录页面
# headers = {
#     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36'
# }
# response = requests.get(list_url, headers=headers, proxies=proxies_dict)  # headers 必须用关键字参数传入, 否则会被当作 params
# response.encoding = 'gbk'  # 编码  gbk编码格式
# # print(response.text)
# href = re.findall('<dd><a href="(.*?)">.*?</a></dd>', response.text)
# name = re.findall('<div id="fmimg"><img alt="(.*?)"', response.text)[0]  # 小说名字
# # print(href)
# for index in href:
#     # https://www.qbiqu.com/0_1/2.html
#     index_url = 'https://www.qbiqu.com' + index
#     # print(index_url)
#     # 爬虫就是模拟浏览器对于服务器发送请求
#     # url = 'https://www.qbiqu.com/0_1/1.html'  # 用自定义的变量接收字符串数据内容 url
#     # requests 发送请求模块  get 请求方法 url 要请求网址
#     response = requests.get(index_url, headers=headers, proxies=proxies_dict)  # headers 必须用关键字参数传入
#     response.encoding = 'gbk'  # 编码  gbk编码格式
#     # print(response.text)
#     # re.sub(r'[/\*?":<>|]', '', title)
#     """
#     解析数据:
#         css xpath re
#
#     什么时候使用css和xpath:  没有办法直接对于字符串数据进行提取
#     css选择器 就根据标签属性提取数据
#     xpath 根据标签节点提取数据
#         当得到数据, 有标签的时候
#
#     无论是css还是xpath 都可以跨标签提取
#
#     re 当你没有办法使用标签提取数据的时候用正则 可以直接对于字符串数据进行提取
#
#     css和xpath 相当于面条 (不能生吃)
#     re 相当于 方便面  (可以直接生吃)
#     """
#     selector = parsel.Selector(response.text)  # response.text 字符串数据 转成可解析的对象
#     # h1::text 提取h1标签里面文本内容 get() 获得获取一个 python优点 简洁优雅
#     title = selector.css('.bookname h1::text').get()  # ctrl + C  ctrl + v
#     # title_1 = selector.xpath('//*[@class="bookname"]/h1/text()').get()  # ctrl + C  ctrl + v
#
#     # getall() 获取所有 全都要
#     content_list = selector.css('#content::text').getall()
#     content = ''.join(content_list)
#     # filename = 'data\\'
#     # if not os.path.exists(filename):
#     #     os.mkdir(filename)
#     # 关于文件操作 保存 相对路径(代码在哪里你就保存到哪里) 和 绝对路径(指定那个盘里面哪一个文件夹)
#     # mode 保存方式 w 写入数据(会覆盖)  a追加保存(不会覆盖)
#     with open(name + '.txt', mode='a', encoding='utf-8') as f:  # 配置文件 文件路径 名字 保存方式 编码格式
#         f.write(title)  # 写入内容
#         f.write('\n')
#         f.write(content)
#         f.write('\n')
#
#     print('正在保存: ', title)
#

import requests
import parsel
import concurrent.futures


def get_response(html_url, timeout=10):
    """Fetch ``html_url`` with a browser-like User-Agent and return the response.

    Args:
        html_url: Address of the page to request.
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error. ``requests`` applies no timeout by default, so
            without this a stalled connection can hang the whole crawl.

    Returns:
        The ``requests.Response`` object. Its ``encoding`` is set to the
        encoding detected from the body (``apparent_encoding``), so reading
        ``.text`` decodes the page with that encoding.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36'
    }
    response = requests.get(url=html_url, headers=headers, timeout=timeout)
    # Use the encoding guessed from the content rather than the one taken
    # from the HTTP headers.
    response.encoding = response.apparent_encoding
    return response


def get_list_url(html_url):
    """Return the chapter ids linked from the catalogue page at ``html_url``.

    Args:
        html_url: Address of the novel's catalogue (chapter list) page.

    Returns:
        A list of strings, each the numeric part of a link of the form
        ``/biquge_<digits>/<id>.html`` found in the page source, with the
        first 11 matches dropped.
    """
    html_data = get_response(html_url).text
    # Raw string so ``\d`` reaches the regex engine untouched (a plain string
    # triggers an invalid-escape warning); the dot before ``html`` is escaped
    # so it only matches a literal ".".
    chapter_ids = re.findall(r'<a href="/biquge_\d+/(\d+)\.html"', html_data)
    # NOTE(review): the first 11 matches are skipped - presumably links that
    # precede the real chapter list on the page; confirm against the site.
    return chapter_ids[11:]


def get_name(html_url):
    """Return the text of the first ``#info h1`` element on the page.

    Args:
        html_url: Address of the page to download and parse.

    Returns:
        The matched text, or ``None`` when the selector finds nothing.
    """
    page = parsel.Selector(get_response(html_url).text)
    return page.css('#info h1::text').get()


def get_content(list_url):
    """Download one page and extract its heading and body text.

    Args:
        list_url: Address of the page to download.

    Returns:
        A two-item list ``[title, content]``: ``title`` is the first text
        node matched by ``.bookname`` (``None`` if there is no match) and
        ``content`` is every ``#booktxt p`` text node joined with newlines.
    """
    page_source = get_response(list_url).text
    document = parsel.Selector(page_source)
    # Collect all paragraph text nodes, then the single heading text node.
    paragraphs = document.css('#booktxt p::text').getall()
    heading = document.css('.bookname::text').get()
    return [heading, '\n'.join(paragraphs)]


def save(name, title, content):
    """Append a title and its content to ``<name>.txt`` and print the title.

    Args:
        name: File path without the ``.txt`` suffix.
        title: Text written first, followed by a newline.
        content: Text written after the title, followed by a newline.
    """
    target = name + '.txt'
    # Append mode keeps what earlier calls wrote to the same file.
    with open(target, mode='a', encoding='utf-8') as out:
        for piece in (title, '\n', content, '\n'):
            out.write(piece)
    print(title)


def main(html_url):
    """Download the chapters listed at ``html_url`` and append them to one file.

    For every chapter id returned by ``get_list_url`` it requests pages
    ``<id>_1.html`` and ``<id>_2.html`` under ``html_url`` and appends each
    page's title and text to ``<name>.txt``, where ``name`` comes from
    ``get_name``.

    Args:
        html_url: Address of the novel's catalogue page. Chapter page URLs
            are built relative to it.
    """
    # Build chapter URLs from the catalogue URL that was passed in, instead
    # of a hard-coded book address, so the argument decides which book is
    # fetched. The trailing slash is stripped to avoid a doubled "/".
    base_url = html_url.rstrip('/')
    href = get_list_url(html_url)
    name = get_name(html_url)
    for index in href:
        # NOTE(review): exactly two pages per chapter are requested -
        # presumably each chapter is split across two pages; confirm.
        for page in range(1, 3):
            index_url = f'{base_url}/{index}_{page}.html'
            print(index_url)
            title, content = get_content(index_url)
            save(name, title, content)



if __name__ == '__main__':
    # Script entry point: crawl the catalogue page of the target novel.
    url = 'https://www.biqugeso.org/biquge_132699/'
    main(html_url=url)