0
点赞
收藏
分享

微信扫一扫

【爬虫】利用Python爬虫爬取小麦苗itpub博客的所有文章的连接地址并写入Excel中(2)...



【爬虫】利用Python爬虫爬取小麦苗itpub博客的所有文章的链接地址并写入Excel中(2)


第一篇(http://blog.itpub.net/26736162/viewspace-2286553/)将爬取到的链接地址写入了txt文本文件,本篇则将爬取到的结果写入Excel表格中。


爬取到的结果:

【爬虫】利用Python爬虫爬取小麦苗itpub博客的所有文章的连接地址并写入Excel中(2)..._.net​​​ 小麦苗itpub博客链接地址.xlsx ​​


Python爬取的源代码:

import requests
import re
import xlwt
# Paginated article-list URL of the blog; %d is filled with the 1-based page number.
url = 'http://blog.itpub.net/26736162/list/%d/'
# Matches one article anchor on a list page and captures (href, title) in that order.
pattern = re.compile(r'<a target=_blank href="(.*?)" class="w750"><p class="title">(.*?)</p></a>')
# pattern=re.compile(r'<a target=_blank href="(.*?)" class="w750"><p class="title">')
# ret=pattern.findall(data)
# print(''.join(ret))
# def write2file(items):
# with open('./download/lhrbest_itpub_link_title.txt','a',encoding='utf-8') as fp:
# for item in items:
# item=item[::-1]
# s=':'.join(item)
# # print('----',len(items))
# fp.write(s+'\n')
# # fp.write('---------------------------------------------------------------\n')
# pass
def set_style(name, height, colour_index, horz=xlwt.Alignment.HORZ_LEFT, bold=False):
    """Build an ``xlwt.XFStyle`` with a font, an alignment and cell borders.

    Args:
        name: Font face name.
        height: Font height in xlwt units, i.e. twentieths of a point
            (400 corresponds to a 20pt font).
        colour_index: Palette index of the font colour (0 black, 1 white,
            2 red, 3 green, 4 blue, 5 yellow, 6 magenta, 7 cyan).
        horz: Horizontal alignment constant taken from ``xlwt.Alignment``.
        bold: Whether the font is rendered bold.

    Returns:
        The fully configured ``xlwt.XFStyle`` instance.
    """
    # Borders: dashed on the left, thin on the remaining three sides, each
    # side with its own colour index.
    cell_borders = xlwt.Borders()
    edge_specs = (
        ('left', xlwt.Borders.DASHED, 0x10),
        ('right', xlwt.Borders.THIN, 0x20),
        ('top', xlwt.Borders.THIN, 0x30),
        ('bottom', xlwt.Borders.THIN, 0x40),
    )
    for edge, line_type, colour in edge_specs:
        setattr(cell_borders, edge, line_type)
        setattr(cell_borders, edge + '_colour', colour)

    # Alignment: caller-selected horizontally, always centred vertically.
    cell_alignment = xlwt.Alignment()
    cell_alignment.vert = xlwt.Alignment.VERT_CENTER
    cell_alignment.horz = horz

    # Font: face, weight, colour and size all come from the arguments.
    cell_font = xlwt.Font()
    cell_font.height = height
    cell_font.colour_index = colour_index
    cell_font.bold = bold
    cell_font.name = name

    style = xlwt.XFStyle()
    style.font = cell_font
    style.alignment = cell_alignment
    style.borders = cell_borders
    return style
def init_excel():
    """Create the workbook, add the sheet, write the header row and save it.

    The output directory ``./download`` is created when it is missing, so the
    first save does not fail on a fresh checkout.

    Returns:
        tuple: ``(workbook, sheet)`` - the ``xlwt.Workbook`` and the sheet
        that subsequent rows are written to.
    """
    import os

    f = xlwt.Workbook(encoding='gbk')  # create the workbook
    sheet1 = f.add_sheet(u'小麦苗itpub博客链接地址', cell_overwrite_ok=True)
    # Both columns (title, link) get the same width.
    sheet1.col(0).width = 256 * 50
    sheet1.col(1).width = 256 * 50

    rowTitle = [u'博客文章标题', u'链接地址']
    # Build the header style once and share it between the header cells
    # instead of constructing an identical style object per cell.
    header_style = set_style('Courier New', 220, 2, xlwt.Alignment.HORZ_CENTER, True)
    for i, title in enumerate(rowTitle):
        sheet1.write(0, i, title, header_style)

    # save() does not create missing parent directories.
    os.makedirs('./download', exist_ok=True)
    # NOTE(review): xlwt produces the legacy .xls format; confirm that the
    # .xlsx extension used here opens correctly in the target Excel version.
    f.save('./download/excel_write_base.xlsx')
    return f, sheet1
# Write one page of scraped rows into the sheet and save the workbook.
def write_excel(rowDatas, f, rowIndex):
    """Write one page of scraped results below the header row and save.

    Args:
        rowDatas: Sequence of ``(link, title)`` pairs, in the order produced
            by the page regex. Each pair is reversed before writing, so
            column 0 holds the title and column 1 the link.
        f: ``(workbook, sheet)`` pair as returned by ``init_excel``.
        rowIndex: Zero-based page index. Every page is assigned a fixed block
            of 20 rows, so page ``rowIndex`` starts at sheet row
            ``rowIndex * 20 + 1`` (row 0 is the header).
    """
    f_excel = f[0]
    f_sheet = f[1]
    first_row = rowIndex * 20 + 1

    # Build the two cell styles once per call. Creating a fresh XFStyle for
    # every cell registers a new style record in the workbook each time and
    # eventually exceeds xlwt's limit on the number of styles.
    title_style = set_style('Courier New', 180, 0)
    link_style = set_style('Courier New', 180, 4)

    for k, row in enumerate(rowDatas):
        cells = row[::-1]  # (link, title) -> (title, link)
        for j, value in enumerate(cells):
            if j == 1:
                # Column 1: clickable link whose display text is the URL itself.
                f_sheet.write(
                    first_row + k, j,
                    xlwt.Formula('HYPERLINK("%s","%s")' % (value, value)),
                    link_style)
            else:
                f_sheet.write(first_row + k, j, value, title_style)

    f_excel.save('./download/excel_write_base.xlsx')
# HTTP request headers passed to requests.get for every page fetch; the
# User-Agent is a Chrome 63 on Windows 10 browser string.
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36'}
def loadHtml(page):
    """Crawl list pages 1..page and write every article link to the workbook.

    Args:
        page: Number of list pages to crawl. Values below 1 only print a
            message; nothing is fetched or written.

    Raises:
        requests.exceptions.RequestException: If a page cannot be fetched,
            including when the server does not answer within the timeout.
    """
    if page < 1:
        print('请输入数字!!!')
        return

    f = init_excel()  # (workbook, sheet) with the header row already written
    for p in range(1, page + 1):
        url_itpub = url % (p)
        print(url_itpub)
        # A timeout keeps a stalled connection from blocking the crawl forever.
        response = requests.get(url=url_itpub, headers=headers, timeout=30)
        response.encoding = 'utf-8'
        content = response.text
        items = pattern.findall(content)  # list of (href, title) tuples
        write_excel(items, f, p - 1)
if __name__ == '__main__':
    # int() raises ValueError on non-numeric input; exit with a readable
    # message instead of an unhandled traceback.
    try:
        page = int(input('请输入需要爬取多少页:'))
    except ValueError:
        raise SystemExit('请输入数字!!!')
    loadHtml(page)


举报

相关推荐

0 条评论