python3-张子枫图片爬取

tech2024-07-20  56

Life is short, You need Python

想搞点妹妹图片,一张一张的下载太麻烦…于是决定爬上一爬 目标网站: 分析目标网站,调节到移动模式,进行抓包分析,不难发现所有图片均以json字符串从服务端返回,这里怎么分析就不多介绍

分析单个request url: 反正就是很长的一串 一大堆参数 反正不知道是干嘛的 鼠标一直下滑 发现出现多个类似的请求url 比较多个 request url: 这里复制出来一部分:

Request URL: https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&queryWord=%E5%BC%A0%E5%AD%90%E6%9E%AB%E5%9B%BE%E7%89%87&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=&z=&ic=&hd=&latest=&copyright=&word=%E5%BC%A0%E5%AD%90%E6%9E%AB%E5%9B%BE%E7%89%87&s=&se=&tab=&width=&height=&face=&istype=&qc=&nc=&fr=&expermode=&force=&cg=star&pn=210&rn=30&gsm=d2&1599125756815= Request URL: https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&queryWord=%E5%BC%A0%E5%AD%90%E6%9E%AB%E5%9B%BE%E7%89%87&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=&z=&ic=&hd=&latest=&copyright=&word=%E5%BC%A0%E5%AD%90%E6%9E%AB%E5%9B%BE%E7%89%87&s=&se=&tab=&width=&height=&face=&istype=&qc=&nc=&fr=&expermode=&force=&cg=star&pn=240&rn=30&gsm=f0&1599125756944= Request URL: https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&queryWord=%E5%BC%A0%E5%AD%90%E6%9E%AB%E5%9B%BE%E7%89%87&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=&z=&ic=&hd=&latest=&copyright=&word=%E5%BC%A0%E5%AD%90%E6%9E%AB%E5%9B%BE%E7%89%87&s=&se=&tab=&width=&height=&face=&istype=&qc=&nc=&fr=&expermode=&force=&cg=star&pn=270&rn=30&gsm=10e&1599126065176=

不难发现: pn 参数呈现规律性的递增,有可能是每页显示的条数,最后面的参数像加密字符串暂且不管 咱们先抓取其中一个URL看看是什么结果:

import json

import requests

# Browser-like request headers: Baidu's image API rejects bare requests
# without a desktop user-agent and a referer.
header = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36',
    'referer': 'https://i.csdn.net/',
}


def parse_url(url):
    """Fetch *url* and return its body parsed as a JSON dict.

    Raises requests.exceptions.RequestException on network failure and
    json.JSONDecodeError if the body is not valid JSON.
    """
    # timeout added: without one, requests.get can block forever on a
    # stalled connection.
    response = requests.get(url, headers=header, timeout=10)
    return json.loads(response.content.decode())


# Probe a single page of the paged JSON API (pn=30 is the result offset).
print(parse_url("https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&queryWord=%E5%BC%A0%E5%AD%90%E6%9E%AB%E5%9B%BE%E7%89%87&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=&z=&ic=&hd=&latest=&copyright=&word=%E5%BC%A0%E5%AD%90%E6%9E%AB%E5%9B%BE%E7%89%87&s=&se=&tab=&width=&height=&face=&istype=&qc=&nc=&fr=&expermode=&force=&cg=star&pn=30&rn=30"))

得到结果:

{ "queryEnc": "%D5%C5%D7%D3%B7%E3%CD%BC%C6%AC", "queryExt": "张子枫图片", "listNum": 30, "displayNum": 21792, "gsm": "3c", "bdFmtDispNum": "约21,700", "bdSearchTime": "", "isNeedAsyncRequest": 0, "bdIsClustered": "1", "data": [ { "adType": "0", "hasAspData": "0", "thumbURL": "https://ss0.bdstatic.com/70cFuHSh_Q1YnxGkpoWK1HF6hhy/it/u=3618941107,2176593943&fm=26&gp=0.jpg", "middleURL": "https://ss0.bdstatic.com/70cFuHSh_Q1YnxGkpoWK1HF6hhy/it/u=3618941107,2176593943&fm=26&gp=0.jpg", "largeTnImageUrl": "", "hasLarge": 0, "hoverURL": "https://ss0.bdstatic.com/70cFuHSh_Q1YnxGkpoWK1HF6hhy/it/u=3618941107,2176593943&fm=26&gp=0.jpg", "pageNum": 30, "objURL": "ippr_z2C$qAzdH3FAzdH3Fvn_z&e3Biwtkw5_z&e3BvgAzdH3Ft42AzdH3Fa_a_8aa_aAzdH3F8c9ndbld9d_z&e3B9anAzdH3Fba0nl8lj0jv8m8kk99abjc10cbuvv0jj_z&e3B3r2", "fromURL": "ippr_z2C$qAzdH3FAzdH3Frtv_z&e3Biwtkw5_z&e3Bv54AzdH3Ft4w2jAzdH3F8cdcblcm_z&e3Bip4s?ho=%Ec%BC%Aa%Ec%AD%la%Em%lE%AB&fhtr=nd", "fromURLHost": "pic.haibao.com", "currentIndex": "", "width": 1024, "height": 961, "type": "jpg", "is_gif": 0, "isCopyright": 0, "strategyAssessment": "1249909234_30_0_0", "filesize": "", "bdSrcType": "0", "di": "27610", "pi": "0", "is": "0,0", "imgCollectionWord": "", "replaceUrl": [ { "ObjURL": "http://img2.imgtn.bdimg.com/it/u=3618941107,2176593943&fm=214&gp=0.jpg", "ObjUrl": "http://img2.imgtn.bdimg.com/it/u=3618941107,2176593943&fm=214&gp=0.jpg", "FromURL": "http://fashion.ifeng.com/c/7ushnwc2kg2", "FromUrl": "http://fashion.ifeng.com/c/7ushnwc2kg2" }, { "ObjURL": "http://imgboys1.yohobuy.com/cmsimg01/2019/08/21/03/03/0188975a38b1fdbba46f4884c50910a6a6.jpeg", "ObjUrl": "http://imgboys1.yohobuy.com/cmsimg01/2019/08/21/03/03/0188975a38b1fdbba46f4884c50910a6a6.jpeg", "FromURL": "http://www.yohoboys.com/channel/detail/release/id/84361/app/", "FromUrl": "http://www.yohoboys.com/channel/detail/release/id/84361/app/" } ], "hasThumbData": "0", "bdSetImgNum": 0, "partnerId": 0, "spn": 0, "bdImgnewsDate": "2020-01-19 01:43", "fromPageTitle": 
"<strong>张子枫</strong>", "fromPageTitleEnc": "张子枫", "bdSourceName": "", "bdFromPageTitlePrefix": "", "isAspDianjing": 0, "token": "", "imgType": "", "cs": "3618941107,2176593943", "os": "840935132,1601297368", "simid": "2994841692,3622017507", "personalized": "0", "simid_info": null, "face_info": null, "xiangshi_info": null, "adPicId": "0", "source_type": "" }, ...... }

只显示部分 到了这里从json数组中不难看出我们想要的东西 注意返回结果里的"listNum": 30和url中的rn=30是对应的,可见rn才是每页显示的条数,而pn是结果的起始偏移量,每翻一页递增30 这里发现原来百度图库的图片基本也都是爬别人的,就没有一张高清图片 咱们去掉url最后面的参数 同时改变pn参数的值 注意每次加30: 经过再次抓取发现也能返回成功,到这一步,就可以开始整个图片的抓取了

直接上代码了:

import json
import os
from urllib import request  # urlretrieve is used to download each image


class ZzfSpider:
    """Crawl Baidu image-search JSON pages and save 张子枫 pictures.

    Each API page holds 30 results; the ``pn`` query parameter is the
    result offset and is advanced by 30 per request.  Crawling stops when
    the API returns an empty ``data`` list (observed around pn=1350).
    """

    def __init__(self):
        # Paged search URL; {} is filled with the pn offset.
        self.templete_url = "https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&queryWord=%E5%BC%A0%E5%AD%90%E6%9E%AB%E5%9B%BE%E7%89%87&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=&z=&ic=&hd=&latest=&copyright=&word=%E5%BC%A0%E5%AD%90%E6%9E%AB%E5%9B%BE%E7%89%87&s=&se=&tab=&width=&height=&face=&istype=&qc=&nc=&fr=&expermode=&force=&cg=star&pn={}&rn=30"
        self.total = 30  # results per page (matches rn=30 in the URL)
        self.i = 0       # pages saved so far

    def get_content_list(self, dict_str):
        """Return ``(image records, reported total hit count)``.

        Uses ``.get`` with defaults: past the last page the API omits the
        "data" key entirely, which previously raised ``KeyError``.
        """
        return dict_str.get('data', []), dict_str.get('displayNum', 0)

    def save_content_list(self, content_list, i):
        """Download each image and append its JSON record to a log file."""
        # Fix: urlretrieve fails with FileNotFoundError unless the target
        # directory already exists.
        os.makedirs("zzf", exist_ok=True)
        j = 0
        with open("zzf_image_spider.text", "a") as f:
            for content in content_list:
                if content:  # skip empty placeholder entries
                    # Download the medium-size image; strategyAssessment is
                    # unique enough to serve as a filename.
                    request.urlretrieve(content['middleURL'], "zzf/%s.jpg" % content['strategyAssessment'])
                    # Store the record as a JSON string for later inspection.
                    f.write(json.dumps(content, ensure_ascii=False, indent=2))
                    f.write("\n")
                    j += 1
            page = i * 30 + j
            print('保存成功,合计%s张图片' % page)

    def parse_url(self, url):
        """GET *url* with browser-like headers; return the parsed JSON dict."""
        # Local import keeps the module importable where requests is absent.
        import requests
        headers = {
            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36",
            "referer": "https://image.baidu.com/search/index?tn=baiduimage&ct=201326592&lm=-1&cl=2&ie=gb18030&word=%D5%C5%D7%D3%B7%E3%CD%BC%C6%AC&fr=ala&ala=1&alatpl=star&pos=0&hs=2&xthttps=111111",
        }
        # timeout added so a stalled connection cannot hang the crawl.
        response = requests.get(url, headers=headers, timeout=10)
        json_str = response.content.decode()
        return json.loads(json_str)

    def run(self):
        """Main loop: fetch page after page, saving until a page is empty."""
        num = 30  # pn offset of the first page fetched
        i = self.i
        while True:
            # 1. build this page's URL
            url = self.templete_url.format(num)
            # 2. fetch and parse it
            dict_str = self.parse_url(url)
            # 3. extract the records
            content_list, total = self.get_content_list(dict_str)
            # Fix: the old check `total == 0` never fired because displayNum
            # stays ~constant on every page; around pn=1350 the API instead
            # returns an empty data list, so stop on an empty page.
            if not content_list:
                break
            # 4. persist images + metadata
            self.save_content_list(content_list, i)
            i += 1
            # 5. advance to the next page
            num += 30


if __name__ == '__main__':
    ZzfSpider().run()

这里有一点 由于url中参数的值是累加的,抓取过程中发现到了pn=1350的时候,抓取数据返回空值,所以得在程序中做一个判断 最后程序效果: 视频展示:

python爬虫张子枫图片

由于图片太多 本次只抓取了1000多张图片便终止了程序 最后附上张子枫图片合集百度网盘链接: https://pan.baidu.com/s/1Jmd2BHCM3oY1DM2AqBZjuw 密码:hk0h

最新回复(0)