请求传参
使用场景:如果爬取解析的数据不在同一张页面中(需要进行深度爬取)。需求:爬取海外网中的新闻标题和新闻详细内容(http://hk.haiwainet.cn/news/)。构建scrapy框架工程文件,设置好settings.py配置信息。
打开网页,获取新闻标题和新闻详细内容 首先,获取各标签下的新闻标题,xpath: /html/body/div[2]/div[3]/div[1]/ul/a/text()
然后,获取详情页的链接地址,xpath: /html/body/div[2]/div[3]/div[1]/ul/a/@href
之后进入详情页,获取详情内容 获取页面内容: //*[@id="cen"]//text()
# Write the code — boss.py (first draft): scrape titles from the list page,
# then follow each detail link and print the article body.
import scrapy


class BossSpider(scrapy.Spider):
    """Crawl news titles from hk.haiwainet.cn and print each article's text."""

    name = 'boss'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://hk.haiwainet.cn/news/']

    def parse_detail(self, response):
        """Callback for a detail page: join all text nodes under #cen and print."""
        fragments = response.xpath('//*[@id="cen"]//text()').extract()
        print(''.join(fragments))

    def parse(self, response):
        # Each <ul> groups one news category; its <li><a> pairs carry the
        # headline text and the detail-page link.
        for entry in response.xpath('/html/body/div[2]/div[3]/div[1]/ul/li'):
            print(entry.xpath('./a/text()').extract_first())
            link = entry.xpath('./a/@href').extract_first()
            # Manually issue the second-level (depth) request for the body.
            yield scrapy.Request(link, callback=self.parse_detail)
# settings.py
# settings.py — register the pipeline so yielded items reach BossproPipeline;
# 300 is the pipeline's priority (lower runs earlier).
ITEM_PIPELINES = {
    'bossPro.pipelines.BossproPipeline': 300,
}
# items.py
# items.py — declares the fields one scraped item carries between the spider
# callbacks and the pipeline.
import scrapy


class BossproItem(scrapy.Item):
    """Item holding one news entry scraped from hk.haiwainet.cn."""

    # define the fields for your item here like:
    # name = scrapy.Field()
    news_name = scrapy.Field()    # headline text taken from the list page
    detail_page = scrapy.Field()  # full article text taken from the detail page
# boss.py
# boss.py — final version: the partially-filled item is handed to the detail
# callback through Request.meta, so title and body end up in one item.
import scrapy
from bossPro.items import BossproItem


class BossSpider(scrapy.Spider):
    """Crawl news titles and article bodies from hk.haiwainet.cn.

    parse() scrapes each headline + detail link from the list page and issues
    a manual request per article; parse_detail() receives the item via
    response.meta, fills in the article text, and yields the completed item.
    """

    name = 'boss'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://hk.haiwainet.cn/news/']

    # Callback receives the item passed through meta.
    def parse_detail(self, response):
        """Fill the meta-carried item with the article text and yield it."""
        item = response.meta['item']
        detail_page = ''.join(response.xpath('//*[@id="cen"]//text()').extract())
        item['detail_page'] = detail_page
        print(detail_page)
        yield item

    def parse(self, response):
        ul_list = response.xpath('/html/body/div[2]/div[3]/div[1]/ul')
        for ul in ul_list:
            for li in ul.xpath('./li'):
                news_name = li.xpath('./a/text()').extract_first()
                detail_url = li.xpath('./a/@href').extract_first()
                # FIX: extract_first() returns None for <li> without an <a>;
                # scrapy.Request(None) would raise, so skip such entries.
                if not news_name or not detail_url:
                    continue
                item = BossproItem()
                item['news_name'] = news_name
                print(news_name)
                # FIX: resolve a possibly-relative href against the page URL.
                detail_url = response.urljoin(detail_url)
                # Send the manual depth request; meta={} forwards the dict to
                # the callback as response.meta.
                yield scrapy.Request(detail_url, callback=self.parse_detail,
                                     meta={'item': item})
# pipelines.py
class BossproPipeline:
    """Terminal pipeline stage: dump each received item to stdout."""

    def process_item(self, item, spider):
        # Print for demonstration, then hand the item on unchanged so any
        # later pipeline stage still receives it.
        print(item)
        return item