#一、通过requests模拟知乎登录
# -*- coding: utf-8 -*-
# author : seven
# time : 2017/7/21
import requests
import re
from bs4 import BeautifulSoup
# cookielib was renamed to http.cookiejar in Python 3; fall back for Python 2.
try:
    import http.cookiejar as cookielib  # Python 3
except ImportError:
    import cookielib  # Python 2

# One shared session so every request carries the same cookies.
session = requests.session()
# Persist cookies to disk in LWP format so a later run can reuse the login.
session.cookies = cookielib.LWPCookieJar(filename='cookies.txt')
try:
    # ignore_discard=True also restores session cookies that would normally
    # be dropped when the "browser" closes.
    session.cookies.load(ignore_discard=True)
except (OSError, cookielib.LoadError):
    # Missing or corrupt cookie file (e.g. first run) -- not fatal.
    print('cookie 未能加载')

userAgent = 'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1'
# Browser-like headers for every request; some sites reject clients that
# send no User-Agent to block crawlers.
header = {
    'HOST': 'www.zhihu.com',
    'Referer': 'https://www.zhihu.com',
    'User-Agent': userAgent
}
def is_login():
    """Report whether the saved session is still logged in.

    Probes a page that requires authentication: the server answers 200 when
    logged in and 302 (redirect to the sign-in page) when not. Redirects are
    disabled on purpose -- if requests followed the 302, the final page would
    come back 200 and we would always look logged in.
    """
    probe_url = 'https://www.zhihu.com/settings/profile'
    resp = session.get(probe_url, headers=header, allow_redirects=False)
    return resp.status_code == 200
def get_xsrf():
    """Fetch the sign-in page and extract the hidden ``_xsrf`` CSRF token.

    Returns:
        The token string, or '' when the input node cannot be found (page
        layout changed, or the request was blocked).

    The request must carry a browser-like User-Agent; some sites validate it
    to keep crawlers out.
    """
    response = session.get('https://www.zhihu.com/#signin', headers=header)
    soup = BeautifulSoup(response.content, 'html.parser', from_encoding='utf-8')
    nodes = soup.select('input[name="_xsrf"]')
    if not nodes:
        # Guard against IndexError when the token input is missing.
        return ''
    return nodes[0].get('value', '')
    # NOTE: a regex like re.match('.*name="_xsrf" value="(.*?)"', response.text)
    # fails here because '.' does not match newlines by default -- it needs
    # the re.DOTALL flag. BeautifulSoup sidesteps that pitfall.
def get_index():
    """Download the Zhihu front page and save it to index_page.html."""
    resp = session.get('https://www.zhihu.com', headers=header)
    payload = resp.text.encode('utf-8')  # write the page explicitly as UTF-8 bytes
    with open('index_page.html', 'wb') as out_file:
        out_file.write(payload)
    print('ok')
def zhihu_login(account, password):
    """Log in to Zhihu with either a phone number or an email address.

    Args:
        account: an 11-digit string starting with '1' is treated as a
            Chinese phone number; anything else goes to the email endpoint.
        password: the account password, sent as-is.

    Side effects: posts the login form through the shared session and saves
    the resulting cookies to disk so the next run can skip logging in.
    """
    xsrf = get_xsrf()  # fetch the CSRF token once instead of per-branch
    if re.match(r'1\d{10}', account):
        # Phone-number login.
        post_url = 'http://www.zhihu.com/login/phone_num'
        post_data = {
            '_xsrf': xsrf,
            'phone_num': account,
            'password': password
        }
    else:
        # Email login. BUGFIX: this branch previously sent the account under
        # the 'phone_num' key, which the email endpoint does not accept.
        post_url = 'http://www.zhihu.com/login/email'
        post_data = {
            '_xsrf': xsrf,
            'email': account,
            'password': password
        }
    session.post(post_url, data=post_data, headers=header)
    session.cookies.save()  # persist cookies for later runs
if __name__ == '__main__':
    if is_login():
        # 2. Saved cookies are still valid -- reuse them to fetch a page.
        get_index()
    else:
        zhihu_login('xxxxx', 'xxxxx.')
#二、scrapy模拟登录
#1、创建工程:scrapy genspider zhihu www.zhihu.com
#2、spider文件夹下代码:
# -*- coding: utf-8 -*-
import scrapy
import re
import json
from PIL import Image
class ZhihuSpider(scrapy.Spider):
    """Spider that logs in to Zhihu (phone-number flow) before crawling.

    Flow: start_requests -> login (grab _xsrf, fetch captcha) ->
    login_after_captcha (user types captcha, post the form) ->
    check_login (on success, crawl start_urls; scrapy re-sends the session
    cookies automatically).
    """
    name = 'zhihu'
    allowed_domains = ['www.zhihu.com']
    start_urls = ['http://www.zhihu.com/']
    userAgent = 'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1'
    # Browser-like headers; some sites reject clients with no User-Agent.
    headers = {
        'HOST': 'www.zhihu.com',
        'Referer': 'https://www.zhihu.com',
        'User-Agent': userAgent
    }

    def parse(self, response):
        """Handle pages fetched after a successful login."""
        print(response.body)

    def parse_detail(self, response):
        """Parse the detail data of a single page (not implemented yet)."""
        pass

    def start_requests(self):
        """Entry point (overrides Spider.start_requests) -- go log in first.

        Every scrapy Request needs an explicit callback because all requests
        are scheduled asynchronously.
        """
        return [scrapy.Request('https://www.zhihu.com/#signin',
                               headers=self.headers,
                               callback=self.login)]

    def login(self, response):
        """Extract the _xsrf token, then request a captcha image."""
        # re.DOTALL makes '.' match newlines too; without it the pattern only
        # sees the first line of the response body.
        match_obj = re.match('.*name="_xsrf" value="(.*?)"', response.text, re.DOTALL)
        if match_obj:
            xsrf = match_obj.group(1)
            import time
            t = str(int(time.time() * 1000))  # cache-busting timestamp
            captcha_url = "http://www.zhihu.com/captcha.gif?r={0}&type=login".format(t)
            yield scrapy.Request(captcha_url,
                                 headers=self.headers,
                                 meta={'xsrf': xsrf},  # hand the token to the next callback
                                 callback=self.login_after_captcha,
                                 dont_filter=True)

    def login_after_captcha(self, response):
        """Save the captcha image, ask the user to read it, post the login form."""
        captcha_name = 'captcha.jpg'
        with open(captcha_name, 'wb') as f:
            f.write(response.body)
        try:
            # Best effort: show the captcha; the user can also open the file.
            im = Image.open(captcha_name)
            im.show()
            im.close()
        except Exception:
            print('打开图片出错')
        captcha = input("请输入验证码:")
        xsrf = response.meta.get('xsrf', '')
        return [scrapy.FormRequest(  # submit the login form
            # Only the phone-number flow is implemented here for simplicity.
            url='http://www.zhihu.com/login/phone_num',
            formdata={
                '_xsrf': xsrf,
                'phone_num': 'sdsd',
                'password': 'asdasads.',
                'captcha': captcha
            },
            headers=self.headers,
            callback=self.check_login,
            # scrapy filters URLs outside allowed_domains by default;
            # dont_filter=True bypasses that filter.
            dont_filter=True
        )]

    def check_login(self, response):
        """Login callback: on success crawl start_urls.

        scrapy stores and re-sends cookies automatically, so no manual cookie
        handling is needed for the follow-up requests.
        """
        response_text = response.text
        text_json = json.loads(response_text)  # server answers with a JSON string
        if text_json.get('msg') == '登录成功':
            for url in self.start_urls:
                # No callback given -> scrapy dispatches to parse() by default.
                yield scrapy.Request(url, dont_filter=True, headers=self.headers)
        else:
            print('error %s' % response_text)