Scrapy
基本命令
安装
pip install scrapy
开始
scrapy startproject test
执行爬虫
scrapy crawl yaoblog # 爬虫名字
目录结构
…
examples
一、Scrapy版的知乎爬虫
import json
import time

import scrapy
from bs4 import BeautifulSoup
from scrapy.spiders import CrawlSpider
# Shared configuration used by the spider below.
# NOTE: `mail` and `passowrd` (sic) keep their original names because the
# spider refers to them; replace the placeholder values with real credentials.
mail = 'youremail@163.com'
passowrd = 'yourpassword'

# Home page: fetched first to obtain the hidden `_xsrf` form token.
get_url = "https://www.zhihu.com/"
# Endpoint that receives the e-mail login form.
login_url = "https://www.zhihu.com/login/email"
# Captcha image URL; `r` is the current time in milliseconds, evaluated once
# when this module is imported.
captcha_url = "https://www.zhihu.com/captcha.gif?r=%d&type=login" % (time.time() * 1000)
# Page requested after a successful login.
firstpage = 'https://www.zhihu.com/people/gchen20/activities'

# Browser-like headers sent with every request.
headers = {
    "Accept": "text/html, application/xhtml+xml, */*",
    "Accept-Language": "zh-CN",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0",
    "Accept-Encoding": "gzip, deflate",
    # The Host header carries a bare hostname, never a URL scheme.
    "Host": "www.zhihu.com",
    "DNT": "1",
    "Connection": "Keep-Alive",
}
# Scrapy spider
class MySpider(CrawlSpider):
    """Log in to Zhihu with e-mail, password and captcha, then fetch one page.

    Request chain: home page (to read the `_xsrf` token) -> captcha image
    (saved to disk and typed in by the user) -> login form POST -> `firstpage`.
    The steps run one after another so that the login form contains both the
    `_xsrf` token and the captcha text.
    """

    name = 'zhihu'
    allowed_domains = ['zhihu.com']

    def start_requests(self):
        """Start the chain by requesting the home page that holds `_xsrf`."""
        yield scrapy.Request(url=get_url, headers=headers, callback=self.parse_item)

    def parse_item(self, response):
        """Handle the home-page and captcha responses, then submit the login form.

        The first call (no `_xsrf` in `response.meta`) parses the home page,
        extracts the token and requests the captcha image, passing the token
        along in the request meta. The second call (token present in meta)
        stores the image as `capt.gif`, asks the user for its text and posts
        the login form.
        """
        if '_xsrf' not in response.meta:
            soup = BeautifulSoup(response.body, 'lxml')
            token_input = soup.find("input", {'type': 'hidden', 'name': '_xsrf'})
            if token_input is None or not token_input.get('value'):
                # Without the token the login POST cannot succeed; stop here.
                self.logger.error('no _xsrf input found on %s', response.url)
                return
            yield scrapy.Request(
                url=str(captcha_url),
                headers=headers,
                meta={'_xsrf': token_input.get('value')},
                callback=self.parse_item,
            )
            return

        with open('capt.gif', 'wb') as f:
            f.write(response.body)
        # NOTE: input() blocks the crawler until the user answers.
        captChar = input('please read the image capt.gif and input the captchar:')
        yield scrapy.FormRequest(
            login_url,
            headers=headers,
            formdata={
                '_xsrf': response.meta['_xsrf'],
                'captcha': captChar,
                "password": passowrd,
                "email": mail,
            },
            callback=self.logged_in,
        )

    def logged_in(self, response):
        """Report the login result and, on success, request `firstpage`.

        The response body is parsed as JSON; its "msg" value is emitted as an
        item, and an "r" value of 0 is treated as a successful login.
        """
        try:
            # json.loads instead of eval: never execute text sent by a server.
            login_result = json.loads(response.body)
        except ValueError:
            self.logger.error('login response from %s is not valid JSON', response.url)
            return
        yield {'login': login_result.get("msg")}
        if login_result.get("r") == 0:
            yield scrapy.Request(url=firstpage, headers=headers, callback=self.parse_data)

    def parse_data(self, response):
        """Emit the pretty-printed HTML of the page fetched after login."""
        soup = BeautifulSoup(response.body, 'lxml')
        yield {'data': soup.prettify()}