Example 1: Extracting image info with regular expressions
# Task: use requests + regular expressions to crawl the images on
# https://www.qiushibaike.com/imgrank/
import requests
import re
import os

if __name__ == '__main__':
    # Create a folder to hold the downloaded images
    if not os.path.exists('./qiutu_libs'):
        os.mkdir('./qiutu_libs')

    # 1. Request the main page
    url = 'https://www.qiushibaike.com/imgrank/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36'
    }
    page_text = requests.get(url=url, headers=headers).text

    # 2. Extract the image URLs with a regular expression
    # Sample HTML fragment:
    # <div class="thumb">
    #     <a href="/article/123534249" target="_blank">
    #         <img src="//pic.qiushibaike.com/system/pictures/12353/123534249/medium/4M9SDXIP5DDILHAH.jpg" alt="糗事#123534249" class="illustration" width="100%" height="auto">
    #     </a>
    # </div>
    ex = '<div class="thumb">.*?<img src="(.*?)" alt.*?</div>'
    img_src_list = re.findall(ex, page_text, re.S)
    for src in img_src_list:
        # 3. Request each individual image
        src = 'https:' + src
        img_data = requests.get(url=src, headers=headers).content
        # 4. Save the image
        img_name = src.split('/')[-1]  # image file name
        img_path = './qiutu_libs/' + img_name
        with open(img_path, 'wb') as fp:
            fp.write(img_data)
        print(img_name, src, 'done.')
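Why step 2 passes `re.S`: by default `.` does not match newlines, so the lazy `.*?` could never span the line breaks inside each `<div class="thumb">` block. A minimal sketch of the difference, using a made-up two-line fragment shaped like the real markup:

import re

# Made-up fragment in the same shape as the page's thumb blocks
html = '<div class="thumb">\n<img src="//example.com/a.jpg" alt="x">\n</div>'
ex = '<div class="thumb">.*?<img src="(.*?)" alt.*?</div>'

print(re.findall(ex, html))        # [] -- '.' stops at '\n' by default
print(re.findall(ex, html, re.S))  # ['//example.com/a.jpg'] -- re.S lets '.' cross newlines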
Example 2: Extracting a novel's body text with bs4.BeautifulSoup

# Task: use requests + bs4.BeautifulSoup to extract the full text of
# "Romance of the Three Kingdoms" from shicimingju.com and save it to "./sanguo.txt"
import requests
from bs4 import BeautifulSoup

if __name__ == '__main__':
    # 1. URL and UA spoofing
    url = 'https://www.shicimingju.com/book/sanguoyanyi.html'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36'
    }
    # 2. Request the main page
    page_text = requests.get(url=url, headers=headers).text
    # 3. Use a CSS selector to collect every chapter's li tag from the main page
    soup = BeautifulSoup(page_text, 'lxml')
    li_list = soup.select('.book-mulu > ul > li')
    with open('./sanguo.txt', 'w', encoding='utf-8') as fp:
        for li in li_list:
            title = li.a.string
            # 4. Request the chapter detail page
            detail_url = 'https://www.shicimingju.com' + li.a['href']
            detail_page_text = requests.get(url=detail_url, headers=headers).text
            detail_soup = BeautifulSoup(detail_page_text, 'lxml')
            # 5. Parse the chapter body
            # Option 1: CSS selector
            p_list = detail_soup.select('.chapter_content > p')
            print(title, ':', file=fp)
            for p in p_list:
                # get_text() is safer than .string, which returns None
                # when a tag has nested children
                print(p.get_text().strip(), file=fp)
            # Option 2: soup.find
            # div_tag = detail_soup.find('div', {'class': 'chapter_content'})
            # print(div_tag.text.strip(), file=fp)
            print('{} done.'.format(title))
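This example mixes three bs4 access styles: `select` (CSS selectors), tag navigation (`li.a.string`, `li.a['href']`), and `find`. A small self-contained sketch of how they relate, run against an invented fragment shaped like this site's markup:

from bs4 import BeautifulSoup

# Invented fragment shaped like the chapter list and chapter body pages
html = '''
<div class="book-mulu"><ul>
  <li><a href="/book/sanguoyanyi/1.html">Chapter 1</a></li>
</ul></div>
<div class="chapter_content"><p> First paragraph. </p></div>
'''
soup = BeautifulSoup(html, 'lxml')

li = soup.select('.book-mulu > ul > li')[0]           # CSS selector: list of matches
print(li.a.string, li.a['href'])                      # Chapter 1 /book/sanguoyanyi/1.html

div = soup.find('div', {'class': 'chapter_content'})  # find: first matching tag
print(div.text.strip())                               # First paragraph.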
Example 3: Extracting images with XPath

# Task: use requests + XPath to crawl the images on http://pic.netbian.com/4kmeinv/
import requests
from lxml import etree
import os

if __name__ == '__main__':
    # Create the output folder
    dir_path = './beautiful_girl'
    if not os.path.exists(dir_path):
        os.mkdir(dir_path)
    # 1. Request the main page
    url = 'http://pic.netbian.com/4kmeinv/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36'
    }
    response = requests.get(url=url, headers=headers)
    # Set the encoding explicitly to fix garbled Chinese characters
    response.encoding = 'gbk'
    page_text = response.text
    # 2. Collect every image's li tag into a list
    tree = etree.HTML(page_text)
    li_list = tree.xpath('//ul[@class="clearfix"]/li')
    for li in li_list:
        # 3. Parse each image's URL and title
        detail_url = 'http://pic.netbian.com' + li.xpath('.//img/@src')[0]
        title = li.xpath('./a/img/@alt')[0]
        # 4. Request the image itself (images are binary data)
        picture = requests.get(url=detail_url, headers=headers).content
        # 5. Save the image
        file_path = '{}/{}.jpg'.format(dir_path, title)
        with open(file_path, 'wb') as fp:
            fp.write(picture)
        print(file_path, 'done.')
    print('all done.')
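Hard-coding `response.encoding = 'gbk'` works because this site declares a GB-family charset, but if you don't know a page's encoding in advance, a more general (slower) option is to let requests sniff it from the raw bytes. A sketch of that variant; everything else in example 3 stays the same:

import requests

response = requests.get('http://pic.netbian.com/4kmeinv/',
                        headers={'User-Agent': 'Mozilla/5.0'})
# apparent_encoding runs charset detection on the response body;
# for this site it should come back as a GB-family codec
response.encoding = response.apparent_encoding
page_text = response.text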