一、zhihuSpider.py 爬虫代码:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.http import Request, FormRequest

from zhihu.items import ZhihuItem
class ZhihuSipder(CrawlSpider):
    """Crawl spider that logs in to zhihu.com and scrapes question pages.

    NOTE(review): the class name keeps the original 'Sipder' typo on purpose;
    ``scrapy crawl`` resolves the spider by ``name``, and renaming the class
    could break external references.
    """
    name = "zhihu"
    allowed_domains = ["www.zhihu.com"]
    start_urls = ["http://www.zhihu.com"]
    # Follow question links (with and without a URL fragment) and hand each
    # matched page to parse_page().
    rules = (
        Rule(SgmlLinkExtractor(allow=(r'/question/\d+#.*?',)),
             callback='parse_page', follow=True),
        Rule(SgmlLinkExtractor(allow=(r'/question/\d+',)),
             callback='parse_page', follow=True),
    )
    # Browser-like headers replayed on the login POST.
    headers = {
        "Accept": "*/*",
        "Accept-Encoding": "gzip,deflate",
        "Accept-Language": "en-US,en;q=0.8,zh-TW;q=0.6,zh;q=0.4",
        "Connection": "keep-alive",
        # Fix: the original value carried a stray leading space before
        # "application", which strict servers may reject.
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/38.0.2125.111 Safari/537.36",
        "Referer": "http://www.zhihu.com/",
    }

    def start_requests(self):
        # Fetch the login page first so a cookie jar (id 1) exists before
        # anything else runs; the real crawl is kicked off in after_login().
        return [Request("https://www.zhihu.com/login",
                        meta={'cookiejar': 1},
                        callback=self.post_login)]

    def post_login(self, response):
        # Submit the login form once the login page has been downloaded.
        print('Preparing login')
        # The CSRF token is hidden in the login form and must be echoed back.
        xsrf = Selector(response).xpath(
            '//input[@name="_xsrf"]/@value').extract()[0]
        print(xsrf)
        # NOTE(review): the garbled source hints at an explicit
        # "http://www.zhihu.com/login" URL argument; from_response() already
        # derives the URL from the form action, so none is passed here.
        return [FormRequest.from_response(
            response,
            meta={'cookiejar': response.meta['cookiejar']},
            headers=self.headers,
            formdata={
                '_xsrf': xsrf,
                'email': '1095511864@qq.com',
                'password': '123456',
            },
            callback=self.after_login,
            dont_filter=True)]

    def after_login(self, response):
        # Login done; start the normal CrawlSpider flow over start_urls.
        for url in self.start_urls:
            yield self.make_requests_from_url(url)

    def parse_page(self, response):
        """Extract one ZhihuItem from a question page."""
        problem = Selector(response)
        item = ZhihuItem()
        item['url'] = response.url
        item['name'] = problem.xpath(
            '//span[@class="name"]/text()').extract()
        print(item['name'])
        item['title'] = problem.xpath(
            '//h2[@class="zm-item-title zm-editable-content"]/text()').extract()
        item['description'] = problem.xpath(
            '//div[@class="zm-editable-content"]/text()').extract()
        # The leading space inside the class attribute below is present in
        # the page markup itself -- keep it byte-for-byte.
        item['answer'] = problem.xpath(
            '//div[@class=" zm-editable-content clearfix"]/text()').extract()
        return item
二、Item 类设置
from scrapy.item import Item, Field
class ZhihuItem(Item):
    """Container for one scraped zhihu question."""
    url = Field()          # question page URL
    title = Field()        # question title
    description = Field()  # question description text
    answer = Field()       # answer text nodes
    name = Field()         # author / respondent names
三、setting.py 设置抓取间隔
# Scrapy project settings (settings.py).
BOT_NAME = 'zhihu'
SPIDER_MODULES = ['zhihu.spiders']
NEWSPIDER_MODULE = 'zhihu.spiders'
# Throttle: pause between consecutive requests, in seconds.
DOWNLOAD_DELAY = 0.25
四、Cookie 原理
HTTP 是无状态的面向连接的协议, 为了保持连接状态, 引入了 Cookie 机制。
Cookie 是 HTTP 消息头中的一种属性,包括: Cookie 名字(Name)、Cookie 的值(Value)、Cookie 的过期时间(Expires/Max-Age)、Cookie 作用路径(Path)、Cookie 所在域名(Domain),以及使用 Cookie 进行安全连接(Secure)。前两个参数是 Cookie 应用的必要条件;另外,还包括 Cookie 大小(Size,不同浏览器对 Cookie 个数及大小的限制是有差异的)。