ML:NeuralNetwork2

Scrapy

基本命令

安装
pip install scrapy
开始
scrapy startproject test
执行爬虫
scrapy crawl yaoblog # 爬虫名字

目录结构

examples

一、Scrapy版的知乎爬虫
import json
import time

import scrapy
from bs4 import BeautifulSoup
from scrapy.spiders import CrawlSpider

# --- Shared configuration ---
# NOTE(review): credentials are placeholders; fill in real values before running.
mail = 'youremail@163.com'
# NOTE: the misspelled name `passowrd` is kept because the spider below references it.
passowrd = 'yourpassword'
get_url = "https://www.zhihu.com/"
login_url = "https://www.zhihu.com/login/email"
# Cache-busting timestamp in milliseconds (the original was missing the `*`).
captcha_url = "https://www.zhihu.com/captcha.gif?r=%d&type=login" % (time.time() * 1000)
firstpage = 'https://www.zhihu.com/people/gchen20/activities'
headers = {
    "Accept": "text/html, application/xhtml+xml, */*",
    "Accept-Language": "zh-CN",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0",
    "Accept-Encoding": "gzip, deflate",
    # The Host header must be a bare hostname, not a URL.
    "Host": "www.zhihu.com",
    "DNT": "1",
    "Connection": "Keep-Alive",
}

###scrapy spider
class MySpider(CrawlSpider):
    """Log in to Zhihu (email + manual captcha) and scrape the first activities page.

    Flow: two bootstrap requests (homepage for the `_xsrf` token, captcha
    image) -> `parse_item` posts the login form -> `logged_in` checks the
    result and requests the activities page -> `parse_data` emits its HTML.
    """

    name = 'zhihu'
    allowed_domains = ['zhihu.com']

    def start_requests(self):
        # Two bootstrap requests: the homepage (to scrape the hidden `_xsrf`
        # input) and the captcha image. Both are handled by `parse_item`.
        urls = [get_url, captcha_url]
        for url in urls:
            yield scrapy.Request(url=url, headers=headers, callback=self.parse_item)

    def parse_item(self, response):
        """Handle the two bootstrap responses and fire the login POST.

        NOTE(review): each of the two bootstrap responses triggers its own
        FormRequest, so one login attempt carries only `_xsrf` and the other
        only the captcha — same behavior as the original; consider merging
        the two pieces of state before posting.
        """
        _xsrf = ""
        captChar = ""
        if response.url == get_url:
            soup = BeautifulSoup(response.body, 'lxml')
            _xsrf = soup.find("input", {'type': 'hidden', 'name': '_xsrf'}).attrs['value']
        else:
            # Save the captcha image and ask the operator to transcribe it.
            with open('capt.gif', 'wb') as f:
                f.write(response.body)
            captChar = input('please read the image capt.gif and input the captchar:')
        # BUG FIX: the original posted the undefined name `email`; the
        # configured variable is `mail`.
        yield scrapy.FormRequest(
            login_url,
            headers=headers,
            formdata={'_xsrf': _xsrf, 'captcha': captChar,
                      "password": passowrd, "email": mail},
            callback=self.logged_in,
        )

    def logged_in(self, response):
        """Report the login result; on success, request the first activities page."""
        soup = BeautifulSoup(response.body, 'lxml')
        # SECURITY FIX: the original `eval()`-ed server-controlled text;
        # the login endpoint returns JSON, so parse it safely instead.
        login_result = json.loads(soup.p.string)
        yield {'login': login_result["msg"]}
        if login_result["r"] == 0:  # r == 0 means the login succeeded
            yield scrapy.Request(url=firstpage, headers=headers, callback=self.parse_data)

    def parse_data(self, response):
        """Emit the prettified HTML of the activities page."""
        soup = BeautifulSoup(response.body, 'lxml')
        yield {'data': soup.prettify()}