爬虫之网页数据提取

tech2024-11-19 66

爬虫流程：指定URL → 发请求 → 收响应 → 解析数据 → 存数据。

数据解析方法分类：
- 正则（各编程语言都可以用）
- bs4（Python独有）
- xpath（重点，各种编程语言都可用）

bs4.BeautifulSoup 提供的方法和属性：

实例化BeautifulSoup的方法：
- 本地html文件，例：BeautifulSoup(file)
- 通过url获取到的html文本，例：BeautifulSoup(response.text)

定位区域：
1. 按标签+属性查找
   - soup.tag_name：返回第一次出现的tag_name对应的标签
   - soup.find(tag_name, {attribute: value})：返回第一个匹配的tag_name标签
   - soup.find_all(tag_name, {attribute: value})：返回所有匹配元素组成的列表
   - 例：soup.find('div', {'class': 'chapter_content'})，查找class='chapter_content'的div标签，然后返回
2. CSS选择器 soup.select
   - 例：soup.select('.chapter_content > p')，查找class='chapter_content'的元素下的所有直接子级p标签，组成列表后返回

获取标签之间的文本数据：
1. 只获取标签的直系文本内容：tag.string（注意：标签内含有多个子节点时返回None）
2. 递归获取标签内的所有文本内容：tag.text 或 tag.get_text()

获取标签中的属性值：soup.tag_name['attribute']

使用xpath解析数据的步骤：
1. 实例化lxml.etree对象
2. 使用etree对象的xpath方法结合xpath表达式解析数据

实例化etree对象的方法：
- 用本地HTML文件实例化：etree.parse(some_file_or_file_like_object)
- 用requests得到的html数据实例化：etree.HTML(response.text)

xpath表达式的用法：
- 指定层级：. 表示当前节点；/ 在表达式开头表示从根节点开始，在表达式中间表示单个层级；// 表示跨越多个层级
- 指定属性：tag[@attr_name="attr_value"]，属性值必须用引号包围，单引号或双引号均可，只要与外层Python字符串所用的引号不同即可
- 索引定位：tag[index]，索引从1开始
- 取文本：/text() 获取直系文本，//text() 递归获取所有文本
- 取属性：/@attr_name
- 或运算符 |（取并集），例如 expression1 | expression2
- 例：tree.xpath('.//div[@class="bottom"]/ul//li/a/text()')

爬取网页时遇见中文乱码问题的解决办法：
1. 如果是requests.get()或post()等方法得到的html乱码，则可在读取response.text之前修改response.encoding，例如 response.encoding = 'gbk' 或 response.encoding = 'utf-8'
2. 如果想修正已经被错误解码的Python字符串(str)，可先按错误的编码重新编码，再按正确的编码解码，例如 string.encode('ISO-8859-1').decode('gbk')

例1：使用正则表达式提取图片信息

# Task: use requests + a regular expression to download the pictures listed on
# https://www.qiushibaike.com/imgrank/
import os
import re
from urllib.parse import urljoin

import requests

if __name__ == '__main__':
    # Create the folder the pictures are saved into. exist_ok=True avoids the
    # check-then-create race of os.path.exists() followed by os.mkdir().
    save_dir = './qiutu_libs'
    os.makedirs(save_dir, exist_ok=True)

    # 1. Request the listing page.
    url = 'https://www.qiushibaike.com/imgrank/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36'
    }
    # requests has no default timeout; without one a stalled connection
    # blocks the script forever.
    response = requests.get(url=url, headers=headers, timeout=10)
    # Fail loudly instead of running the regex over an HTTP error page.
    response.raise_for_status()
    page_text = response.text

    # 2. Extract the picture links with a regular expression.
    # Sample of the HTML fragment being matched:
    # <div class="thumb">
    #
    # <a href="/article/123534249" target="_blank">
    # <img src="//pic.qiushibaike.com/system/pictures/12353/123534249/medium/4M9SDXIP5DDILHAH.jpg" alt="糗事#123534249" class="illustration" width="100%" height="auto">
    # </a>
    # </div>
    ex = r'<div class="thumb">.*?<img src="(.*?)" alt.*?</div>'
    # re.S lets "." match newlines, since each thumb block spans several lines.
    img_src_list = re.findall(ex, page_text, re.S)

    for src in img_src_list:
        # 3. Request a single picture.
        # The sample above uses a protocol-relative link ("//host/path");
        # urljoin resolves it against the page URL and leaves links that are
        # already absolute untouched.
        src = urljoin(url, src)
        try:
            img_response = requests.get(url=src, headers=headers, timeout=10)
            img_response.raise_for_status()
        except requests.RequestException as err:
            # One broken picture should not abort the remaining downloads.
            print(src, 'failed:', err)
            continue

        # 4. Save the picture; the file name is the last URL path segment.
        img_name = src.split('/')[-1]
        img_path = os.path.join(save_dir, img_name)
        with open(img_path, 'wb') as fp:
            fp.write(img_response.content)
        print(img_name, src, 'done.')

例2：使用bs4.BeautifulSoup提取小说正文

# Task: use requests + bs4.BeautifulSoup to extract the full text of
# "Romance of the Three Kingdoms" from shicimingju.com and save it to
# "./sanguo.txt"
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

if __name__ == '__main__':
    # 1. URL of the table-of-contents page.
    url = 'https://www.shicimingju.com/book/sanguoyanyi.html'
    # Pretend to be a regular browser (User-Agent spoofing).
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36'
    }

    # 2. Request the table-of-contents page.
    # requests has no default timeout; without one a stalled connection
    # blocks the script forever.
    response = requests.get(url=url, headers=headers, timeout=10)
    response.raise_for_status()
    # NOTE(review): if the saved text comes out garbled, set
    # response.encoding (e.g. 'utf-8') before reading .text - confirm against
    # the live site.
    page_text = response.text

    # 3. Use a CSS selector to collect the <li> tag of every chapter.
    soup = BeautifulSoup(page_text, 'lxml')
    li_list = soup.select('.book-mulu > ul > li')

    with open('./sanguo.txt', 'w', encoding='utf-8') as fp:
        for li in li_list:
            link = li.a
            # Skip list items that do not carry a chapter link.
            if link is None or not link.get('href'):
                continue
            # get_text() always returns a str, whereas .string is None when
            # the tag has more than one child node.
            title = link.get_text()

            # 4. Fetch the chapter detail page. urljoin resolves the
            # site-relative href against the table-of-contents URL.
            detail_url = urljoin(url, link['href'])
            try:
                detail_response = requests.get(url=detail_url, headers=headers, timeout=10)
                detail_response.raise_for_status()
            except requests.RequestException as err:
                # One unreachable chapter should not abort the whole book.
                print('{} failed: {}'.format(title, err))
                continue
            detail_soup = BeautifulSoup(detail_response.text, 'lxml')

            # 5. Parse the chapter body.
            # Approach 1: CSS selector.
            p_list = detail_soup.select('.chapter_content > p')
            print(title, ':', file=fp)
            for p in p_list:
                # p.string would be None (and .strip() would raise
                # AttributeError) for a paragraph containing nested tags such
                # as <br> or <span>; get_text() handles both cases.
                print(p.get_text().strip(), file=fp)

            # Approach 2: soup.find
            # div_tag = detail_soup.find('div', {'class': 'chapter_content'})
            # print(div_tag.text.strip(), file=fp)

            print('{} done.'.format(title))

例3：xpath提取图片

# Task: use requests + xpath to download the pictures listed on
# http://pic.netbian.com/4kmeinv/
import os
import re
from urllib.parse import urljoin

import requests
from lxml import etree

if __name__ == '__main__':
    # Create the output folder. exist_ok=True avoids the check-then-create
    # race of os.path.exists() followed by os.mkdir().
    dir_path = './beautiful_girl'
    os.makedirs(dir_path, exist_ok=True)

    # 1. Request the listing page.
    url = 'http://pic.netbian.com/4kmeinv/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36'
    }
    # requests has no default timeout; without one a stalled connection
    # blocks the script forever.
    response = requests.get(url=url, headers=headers, timeout=10)
    response.raise_for_status()
    # Override the encoding so the Chinese text is decoded correctly.
    response.encoding = 'gbk'
    page_text = response.text

    # 2. Collect the <li> tags that wrap each picture.
    tree = etree.HTML(page_text)
    li_list = tree.xpath('//ul[@class="clearfix"]/li')

    for li in li_list:
        # 3. Parse the link and the name of the picture.
        # Read both attributes from the same <img> element, and skip list
        # items that contain no image instead of raising IndexError.
        img_nodes = li.xpath('.//img')
        if not img_nodes:
            continue
        img = img_nodes[0]
        src = img.get('src')
        if not src:
            continue
        detail_url = urljoin(url, src)

        # Fall back to the file name in the URL when the alt text is missing.
        title = img.get('alt') or os.path.splitext(src.split('/')[-1])[0]
        # The title comes from the page and is used as a file name, so
        # replace characters that are illegal in file names.
        title = re.sub(r'[\\/:*?"<>|]', '_', title).strip()

        # 4. Request the picture itself.
        try:
            picture_response = requests.get(url=detail_url, headers=headers, timeout=10)
            picture_response.raise_for_status()
        except requests.RequestException as err:
            # One broken picture should not abort the remaining downloads.
            print(detail_url, 'failed:', err)
            continue
        picture = picture_response.content  # pictures are binary data

        # 5. Save the picture.
        file_path = '{}/{}.jpg'.format(dir_path, title)
        with open(file_path, 'wb') as fp:
            fp.write(picture)
        print(file_path, 'done.')

    print('all done.')

最新回复(0)