请求传参
使用场景:如果爬取解析的数据不在同一张页面中(需要进行深度爬取)。需求:爬取海外网中的新闻标题和新闻详细内容(http://hk.haiwainet.cn/news/)。构建scrapy框架工程文件,设置好settings.py配置信息。
打开网页,获取新闻标题和新闻详细内容 首先,获取各标签下的新闻标题,xpath: /html/body/div[2]/div[3]/div[1]/ul/a/text()
然后,获取详情页的链接地址,xpath: /html/body/div[2]/div[3]/div[1]/ul/a/@href
之后进入详情页,获取详情内容 获取页面内容: //*[@id="cen"]//text()
# Write the code — boss.py (first draft): scrape titles from the list page,
# then follow each detail link and print the article body.
import scrapy


class BossSpider(scrapy.Spider):
    """Crawl news titles from hk.haiwainet.cn and print each article's text."""

    name = 'boss'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://hk.haiwainet.cn/news/']

    def parse_detail(self, response):
        """Callback for a detail page: join all text nodes under #cen and print."""
        fragments = response.xpath('//*[@id="cen"]//text()').extract()
        print(''.join(fragments))

    def parse(self, response):
        # Each <ul> groups one news category; its <li><a> pairs carry the
        # headline text and the detail-page link.
        for entry in response.xpath('/html/body/div[2]/div[3]/div[1]/ul/li'):
            print(entry.xpath('./a/text()').extract_first())
            link = entry.xpath('./a/@href').extract_first()
            # Manually issue the second-level (depth) request for the body.
            yield scrapy.Request(link, callback=self.parse_detail)
# settings.py
# settings.py — register the pipeline so yielded items reach BossproPipeline;
# 300 is the pipeline's priority (lower runs earlier).
ITEM_PIPELINES = {
    'bossPro.pipelines.BossproPipeline': 300,
}
# items.py
# items.py — declares the fields one scraped item carries between the spider
# callbacks and the pipeline.
import scrapy


class BossproItem(scrapy.Item):
    """Item holding one news entry scraped from hk.haiwainet.cn."""

    # define the fields for your item here like:
    # name = scrapy.Field()
    news_name = scrapy.Field()    # headline text taken from the list page
    detail_page = scrapy.Field()  # full article text taken from the detail page
# boss.py
# boss.py — final version: the partially-filled item is handed to the detail
# callback through Request.meta, so title and body end up in one item.
import scrapy
from bossPro.items import BossproItem


class BossSpider(scrapy.Spider):
    """Crawl news titles and article bodies from hk.haiwainet.cn.

    parse() scrapes each headline + detail link from the list page and issues
    a manual request per article; parse_detail() receives the item via
    response.meta, fills in the article text, and yields the completed item.
    """

    name = 'boss'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://hk.haiwainet.cn/news/']

    # Callback receives the item passed through meta.
    def parse_detail(self, response):
        """Fill the meta-carried item with the article text and yield it."""
        item = response.meta['item']
        detail_page = ''.join(response.xpath('//*[@id="cen"]//text()').extract())
        item['detail_page'] = detail_page
        print(detail_page)
        yield item

    def parse(self, response):
        ul_list = response.xpath('/html/body/div[2]/div[3]/div[1]/ul')
        for ul in ul_list:
            for li in ul.xpath('./li'):
                news_name = li.xpath('./a/text()').extract_first()
                detail_url = li.xpath('./a/@href').extract_first()
                # FIX: extract_first() returns None for <li> without an <a>;
                # scrapy.Request(None) would raise, so skip such entries.
                if not news_name or not detail_url:
                    continue
                item = BossproItem()
                item['news_name'] = news_name
                print(news_name)
                # FIX: resolve a possibly-relative href against the page URL.
                detail_url = response.urljoin(detail_url)
                # Send the manual depth request; meta={} forwards the dict to
                # the callback as response.meta.
                yield scrapy.Request(detail_url, callback=self.parse_detail,
                                     meta={'item': item})
# pipelines.py
class BossproPipeline:
    """Terminal pipeline stage: dump each received item to stdout."""

    def process_item(self, item, spider):
        # Print for demonstration, then hand the item on unchanged so any
        # later pipeline stage still receives it.
        print(item)
        return item