爬取网址:http://sd.weather.com.cn/(爬虫实际请求的起始页面为 http://www.weather.com.cn/weather/101120402.shtml) 爬取目标:获取一个地区七天之内的天气状况,并存入 CSV 文件(天气.csv,可用 Excel 打开)
爬虫文件部分
import scrapy

from ..items import TianqiyubaoItem


class TianqiSpider(scrapy.Spider):
    """Spider that reads the per-day forecast entries from one weather page.

    It requests the single URL in ``start_urls`` and yields one
    ``TianqiyubaoItem`` for every ``<li>`` found under the ``7d`` forecast
    list on that page.
    """

    name = 'tianqi'
    allowed_domains = ['weather.com.cn']
    start_urls = ['http://www.weather.com.cn/weather/101120402.shtml']

    # Absolute XPath of the per-day <li> nodes inside the "7d" forecast list.
    # Kept as the same path the field queries previously shared as a prefix.
    day_xpath = (
        "/html/body/div[@class='con today clearfix']"
        "/div[@class='left fl']/div[@class='left-div'][1]"
        "/div[@id='7d']/ul[@class='t clearfix']/li"
    )

    def parse(self, response):
        """Yield one forecast item per day listed on the page.

        Each field is queried relative to its own ``<li>`` rather than as a
        page-wide list joined by index. With page-wide lists, a single day
        missing one node (for example no max-temperature ``<span>``) makes
        the lists differ in length, which shifts values onto the wrong day
        or raises ``IndexError``. A missing node now yields an empty string
        for that field only.

        Args:
            response: The Scrapy response for the forecast page.

        Yields:
            TianqiyubaoItem: A new item per day. A fresh instance is created
            for every day so later iterations cannot overwrite an item that
            has already been handed to the pipelines.
        """
        for day in response.xpath(self.day_xpath):
            item = TianqiyubaoItem()
            item['日期'] = day.xpath("h1/text()").extract_first(default='')
            item['天气'] = day.xpath(
                "p[@class='wea']/text()").extract_first(default='')
            item['最高温'] = day.xpath(
                "p[@class='tem']/span/text()").extract_first(default='')
            item['最低温'] = day.xpath(
                "p[@class='tem']/i/text()").extract_first(default='')
            item['风级'] = day.xpath(
                "p[@class='win']/i/text()").extract_first(default='')
            yield item


# --- items section (items.py) ---
import scrapy


class TianqiyubaoItem(scrapy.Item):
    """Container for one day's forecast; declares the five fields it carries."""

    日期 = scrapy.Field()    # date
    天气 = scrapy.Field()    # weather
    最高温 = scrapy.Field()  # highest temperature
    最低温 = scrapy.Field()  # lowest temperature
    风级 = scrapy.Field()    # wind level


# --- settings section (settings.py) ---
主要是关闭 robots 协议(ROBOTSTXT_OBEY = False),并取消注释以启用默认请求头(DEFAULT_REQUEST_HEADERS)和 ITEM_PIPELINES 设置(在其中注册 TianqiyubaoPipeline)
pipelines部分
class TianqiyubaoPipeline:
    """Item pipeline that appends every item as one row of ``天气.csv``.

    Rows are written with the ``csv`` module so values containing commas,
    quotes or newlines are quoted instead of corrupting the column layout.
    The file is opened as ``utf-8-sig`` with ``newline=''`` so the Chinese
    text is encoded the same way on every platform, carries a BOM that
    spreadsheet tools can use to detect UTF-8, and gets no doubled line
    endings on Windows.
    """

    # Output file, relative to the process working directory.
    csv_path = '天气.csv'
    # Item keys written per row, in column order.
    columns = ('日期', '天气', '最高温', '最低温', '风级')

    def __init__(self):
        self._file = None
        self._writer = None

    def _open(self):
        """Open the CSV file in append mode and create the row writer."""
        import csv  # local import: keeps this block self-contained

        self._file = open(
            self.csv_path, 'a', encoding='utf-8-sig', newline='')
        self._writer = csv.writer(self._file)

    def open_spider(self, spider):
        """Open the output file once when the spider starts."""
        if self._file is None:
            self._open()

    def close_spider(self, spider):
        """Close the output file when the spider finishes."""
        if self._file is not None:
            self._file.close()
            self._file = None
            self._writer = None

    def process_item(self, item, spider):
        """Append one item as a CSV row and pass the item on unchanged.

        Args:
            item: Mapping-like item; the keys in ``columns`` are read with
                ``item.get`` so a missing key produces an empty cell.
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item``, so later pipelines still receive it.
        """
        # Still works when called directly, without open_spider() first.
        if self._writer is None:
            self._open()
        self._writer.writerow([item.get(key) for key in self.columns])
        # Flush per row so written rows reach the file even if
        # close_spider() is never invoked.
        self._file.flush()
        return item


# --- crawl results ---
