2020.09.04
爬虫技术路线:requests 库 + bs4 + xlwt。程序分为三个函数:spider(url) 函数提取网页,返回整个网页源码;jiexi(html, info) 函数解析网页并提取信息,参数 html 是网页源码,info 是存放信息的列表;save(data) 函数负责保存数据,将提取出来的数据逐条保存到 Excel 文件中。
# -*- coding: utf-8 -*-
# Author : YRH
# Data :
# Project :
# Tool : PyCharm
import json
from urllib.parse import quote

import requests
from bs4 import BeautifulSoup
import xlwt

# Seconds to wait for the server before giving up on a request; without a
# timeout requests.get() can block forever on a stalled connection.
REQUEST_TIMEOUT = 10


def spider(url):
    """Download *url* and return the page source as text.

    Returns None (after printing a failure message) when the request cannot
    be completed or the server answers with an HTTP error status.
    """
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36"}
    try:
        rep = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
        rep.raise_for_status()
    except requests.RequestException:
        print("解析失败")
        return None
    # Let requests guess the charset from the body instead of trusting the
    # HTTP header.
    rep.encoding = rep.apparent_encoding
    return rep.text


def _extract_results(html):
    """Return the "engine_search_result" list embedded in *html*, or None.

    The third <script type="text/javascript"> element is expected to hold an
    assignment of the form ``name = {...}``; everything after the first "="
    is decoded as JSON.
    """
    soup = BeautifulSoup(html, "lxml")
    try:
        script = soup.find_all("script", type="text/javascript")[2].string
        _, sep, payload = str(script).partition("=")
        if not sep:
            return None
        # SECURITY: the original code ran eval() on this downloaded text,
        # which executes arbitrary code supplied by the remote server.
        # The payload is decoded with json.loads instead.
        results = json.loads(payload.strip().rstrip(";"))["engine_search_result"]
    except (IndexError, KeyError, TypeError, ValueError):
        return None
    return results if isinstance(results, list) else None


def _field(record, key):
    """Return record[key] with backslashes removed, or " " if unavailable."""
    try:
        return record[key].replace("\\", "")
    except (KeyError, TypeError, AttributeError):
        return " "


def jiexi(html, info):
    """Parse one result page and append one row per job posting to *info*.

    html: page source as returned by spider(); a falsy value (failed
          download) is skipped.
    info: list that receives rows of
          [job name, company name, work area, salary, company link].

    A page whose embedded data cannot be located or decoded is reported and
    skipped so that rows gathered from other pages are still kept.
    """
    if not html:
        return
    results = _extract_results(html)
    if results is None:
        print("解析失败")
        return
    for d in results:
        job_name = _field(d, "job_name")  # job title
        company_href = _field(d, "company_href")  # company page link
        company_name = _field(d, "company_name")  # company name
        providesalary_text = _field(d, "providesalary_text")  # salary
        workarea_text = _field(d, "workarea_text")  # work location
        info.append([job_name, company_name, workarea_text, providesalary_text, company_href])


def save(data):
    """Write the rows in *data* to the Excel file "招聘信息.xls".

    Row 0 holds the column titles; each entry of *data* becomes one row
    below it, in order.
    """
    print("save.....")
    workbook = xlwt.Workbook(encoding="utf-8")  # create the workbook object
    sheet = workbook.add_sheet("sheet1")  # create the worksheet
    # Header row.
    head = ["岗位", "公司名称", "工作地点", "薪资", "招聘网站"]
    for col, title in enumerate(head):
        sheet.write(0, col, title)  # arguments: row, column, value
    # Data rows, one per record, starting right below the header.
    for row, record in enumerate(data, start=1):
        print("成功保存:" + str(row))
        for col, value in enumerate(record):
            sheet.write(row, col, value)
    workbook.save("招聘信息.xls")  # write the spreadsheet to disk


if __name__ == '__main__':
    name = input("请输入岗位名称")
    # int() instead of eval(): the page count must be a plain integer, and
    # eval() would execute whatever the user typed.
    page = int(input("请输入爬取页数"))
    info = []
    # Percent-encode the keyword so characters such as "#", "?" or "/" are
    # sent as part of the search term instead of altering the URL structure.
    keyword = quote(name, safe="")
    for i in range(1, page + 1):
        url = "https://search.51job.com/list/000000,000000,0000,00,9,99," + keyword + ",2," + str(i) + ".html?"
        html = spider(url)
        jiexi(html, info)
    save(info)

# Note: commercial use is prohibited.
