python3-张子枫图片爬取

tech2024-07-20  56

Life is short, You need Python

想搞点妹妹图片,一张一张的下载太麻烦…于是决定爬上一爬 目标网站: 分析目标网站,调节到移动模式,进行抓包分析,不难发现所有图片均以json字符串从服务端返回,这里怎么分析就不多介绍

分析单个request url: 反正就是很长的一串 一大堆参数 反正不知道是干嘛的 鼠标一直下滑 发现出现多个类似的请求url 比较多个 request url: 这里复制出来一部分:

Request URL: https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&queryWord=%E5%BC%A0%E5%AD%90%E6%9E%AB%E5%9B%BE%E7%89%87&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=&z=&ic=&hd=&latest=&copyright=&word=%E5%BC%A0%E5%AD%90%E6%9E%AB%E5%9B%BE%E7%89%87&s=&se=&tab=&width=&height=&face=&istype=&qc=&nc=&fr=&expermode=&force=&cg=star&pn=210&rn=30&gsm=d2&1599125756815= Request URL: https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&queryWord=%E5%BC%A0%E5%AD%90%E6%9E%AB%E5%9B%BE%E7%89%87&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=&z=&ic=&hd=&latest=&copyright=&word=%E5%BC%A0%E5%AD%90%E6%9E%AB%E5%9B%BE%E7%89%87&s=&se=&tab=&width=&height=&face=&istype=&qc=&nc=&fr=&expermode=&force=&cg=star&pn=240&rn=30&gsm=f0&1599125756944= Request URL: https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&queryWord=%E5%BC%A0%E5%AD%90%E6%9E%AB%E5%9B%BE%E7%89%87&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=&z=&ic=&hd=&latest=&copyright=&word=%E5%BC%A0%E5%AD%90%E6%9E%AB%E5%9B%BE%E7%89%87&s=&se=&tab=&width=&height=&face=&istype=&qc=&nc=&fr=&expermode=&force=&cg=star&pn=270&rn=30&gsm=10e&1599126065176=

不难发现: pn 参数呈现规律性的递增,有可能是每页显示的条数,最后面的参数像加密字符串暂且不管 咱们先抓取其中一个URL看看是什么结果:

import json

import requests

# Browser-like request headers: Baidu's image API rejects bare requests
# without a desktop user-agent and a referer.
header = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36',
    'referer': 'https://i.csdn.net/',
}


def parse_url(url):
    """Fetch *url* and return its body parsed as a JSON dict.

    Raises requests.exceptions.RequestException on network failure and
    json.JSONDecodeError if the body is not valid JSON.
    """
    # timeout added: without one, requests.get can block forever on a
    # stalled connection.
    response = requests.get(url, headers=header, timeout=10)
    return json.loads(response.content.decode())


# Probe a single page of the paged JSON API (pn=30 is the result offset).
print(parse_url("https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&queryWord=%E5%BC%A0%E5%AD%90%E6%9E%AB%E5%9B%BE%E7%89%87&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=&z=&ic=&hd=&latest=&copyright=&word=%E5%BC%A0%E5%AD%90%E6%9E%AB%E5%9B%BE%E7%89%87&s=&se=&tab=&width=&height=&face=&istype=&qc=&nc=&fr=&expermode=&force=&cg=star&pn=30&rn=30"))

得到结果:

{ "queryEnc": "%D5%C5%D7%D3%B7%E3%CD%BC%C6%AC", "queryExt": "张子枫图片", "listNum": 30, "displayNum": 21792, "gsm": "3c", "bdFmtDispNum": "约21,700", "bdSearchTime": "", "isNeedAsyncRequest": 0, "bdIsClustered": "1", "data": [ { "adType": "0", "hasAspData": "0", "thumbURL": "https://ss0.bdstatic.com/70cFuHSh_Q1YnxGkpoWK1HF6hhy/it/u=3618941107,2176593943&fm=26&gp=0.jpg", "middleURL": "https://ss0.bdstatic.com/70cFuHSh_Q1YnxGkpoWK1HF6hhy/it/u=3618941107,2176593943&fm=26&gp=0.jpg", "largeTnImageUrl": "", "hasLarge": 0, "hoverURL": "https://ss0.bdstatic.com/70cFuHSh_Q1YnxGkpoWK1HF6hhy/it/u=3618941107,2176593943&fm=26&gp=0.jpg", "pageNum": 30, "objURL": "ippr_z2C$qAzdH3FAzdH3Fvn_z&e3Biwtkw5_z&e3BvgAzdH3Ft42AzdH3Fa_a_8aa_aAzdH3F8c9ndbld9d_z&e3B9anAzdH3Fba0nl8lj0jv8m8kk99abjc10cbuvv0jj_z&e3B3r2", "fromURL": "ippr_z2C$qAzdH3FAzdH3Frtv_z&e3Biwtkw5_z&e3Bv54AzdH3Ft4w2jAzdH3F8cdcblcm_z&e3Bip4s?ho=%Ec%BC%Aa%Ec%AD%la%Em%lE%AB&fhtr=nd", "fromURLHost": "pic.haibao.com", "currentIndex": "", "width": 1024, "height": 961, "type": "jpg", "is_gif": 0, "isCopyright": 0, "strategyAssessment": "1249909234_30_0_0", "filesize": "", "bdSrcType": "0", "di": "27610", "pi": "0", "is": "0,0", "imgCollectionWord": "", "replaceUrl": [ { "ObjURL": "http://img2.imgtn.bdimg.com/it/u=3618941107,2176593943&fm=214&gp=0.jpg", "ObjUrl": "http://img2.imgtn.bdimg.com/it/u=3618941107,2176593943&fm=214&gp=0.jpg", "FromURL": "http://fashion.ifeng.com/c/7ushnwc2kg2", "FromUrl": "http://fashion.ifeng.com/c/7ushnwc2kg2" }, { "ObjURL": "http://imgboys1.yohobuy.com/cmsimg01/2019/08/21/03/03/0188975a38b1fdbba46f4884c50910a6a6.jpeg", "ObjUrl": "http://imgboys1.yohobuy.com/cmsimg01/2019/08/21/03/03/0188975a38b1fdbba46f4884c50910a6a6.jpeg", "FromURL": "http://www.yohoboys.com/channel/detail/release/id/84361/app/", "FromUrl": "http://www.yohoboys.com/channel/detail/release/id/84361/app/" } ], "hasThumbData": "0", "bdSetImgNum": 0, "partnerId": 0, "spn": 0, "bdImgnewsDate": "2020-01-19 01:43", "fromPageTitle": 
"<strong>张子枫</strong>", "fromPageTitleEnc": "张子枫", "bdSourceName": "", "bdFromPageTitlePrefix": "", "isAspDianjing": 0, "token": "", "imgType": "", "cs": "3618941107,2176593943", "os": "840935132,1601297368", "simid": "2994841692,3622017507", "personalized": "0", "simid_info": null, "face_info": null, "xiangshi_info": null, "adPicId": "0", "source_type": "" }, ...... }

只显示部分 到了这里从json数组中不难看出我们想要的东西 注意返回结果里的"listNum": 30和url中的rn=30是对应的,可见rn才是每页显示的条数,而pn是结果的起始偏移量,每翻一页递增30 这里发现原来百度图库的图片基本也都是爬别人的,就没有一张高清图片 咱们去掉url最后面的参数 同时改变pn参数的值 注意每次加30: 经过再次抓取发现也能返回成功,到这一步,就可以开始整个图片的抓取了

直接上代码了:

import json
import os
from urllib import request  # urlretrieve is used to download each image


class ZzfSpider:
    """Crawl Baidu image-search JSON pages and save 张子枫 pictures.

    Each API page holds 30 results; the ``pn`` query parameter is the
    result offset and is advanced by 30 per request.  Crawling stops when
    the API returns an empty ``data`` list (observed around pn=1350).
    """

    def __init__(self):
        # Paged search URL; {} is filled with the pn offset.
        self.templete_url = "https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&queryWord=%E5%BC%A0%E5%AD%90%E6%9E%AB%E5%9B%BE%E7%89%87&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=&z=&ic=&hd=&latest=&copyright=&word=%E5%BC%A0%E5%AD%90%E6%9E%AB%E5%9B%BE%E7%89%87&s=&se=&tab=&width=&height=&face=&istype=&qc=&nc=&fr=&expermode=&force=&cg=star&pn={}&rn=30"
        self.total = 30  # results per page (matches rn=30 in the URL)
        self.i = 0       # pages saved so far

    def get_content_list(self, dict_str):
        """Return ``(image records, reported total hit count)``.

        Uses ``.get`` with defaults: past the last page the API omits the
        "data" key entirely, which previously raised ``KeyError``.
        """
        return dict_str.get('data', []), dict_str.get('displayNum', 0)

    def save_content_list(self, content_list, i):
        """Download each image and append its JSON record to a log file."""
        # Fix: urlretrieve fails with FileNotFoundError unless the target
        # directory already exists.
        os.makedirs("zzf", exist_ok=True)
        j = 0
        with open("zzf_image_spider.text", "a") as f:
            for content in content_list:
                if content:  # skip empty placeholder entries
                    # Download the medium-size image; strategyAssessment is
                    # unique enough to serve as a filename.
                    request.urlretrieve(content['middleURL'], "zzf/%s.jpg" % content['strategyAssessment'])
                    # Store the record as a JSON string for later inspection.
                    f.write(json.dumps(content, ensure_ascii=False, indent=2))
                    f.write("\n")
                    j += 1
            page = i * 30 + j
            print('保存成功,合计%s张图片' % page)

    def parse_url(self, url):
        """GET *url* with browser-like headers; return the parsed JSON dict."""
        # Local import keeps the module importable where requests is absent.
        import requests
        headers = {
            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36",
            "referer": "https://image.baidu.com/search/index?tn=baiduimage&ct=201326592&lm=-1&cl=2&ie=gb18030&word=%D5%C5%D7%D3%B7%E3%CD%BC%C6%AC&fr=ala&ala=1&alatpl=star&pos=0&hs=2&xthttps=111111",
        }
        # timeout added so a stalled connection cannot hang the crawl.
        response = requests.get(url, headers=headers, timeout=10)
        json_str = response.content.decode()
        return json.loads(json_str)

    def run(self):
        """Main loop: fetch page after page, saving until a page is empty."""
        num = 30  # pn offset of the first page fetched
        i = self.i
        while True:
            # 1. build this page's URL
            url = self.templete_url.format(num)
            # 2. fetch and parse it
            dict_str = self.parse_url(url)
            # 3. extract the records
            content_list, total = self.get_content_list(dict_str)
            # Fix: the old check `total == 0` never fired because displayNum
            # stays ~constant on every page; around pn=1350 the API instead
            # returns an empty data list, so stop on an empty page.
            if not content_list:
                break
            # 4. persist images + metadata
            self.save_content_list(content_list, i)
            i += 1
            # 5. advance to the next page
            num += 30


if __name__ == '__main__':
    ZzfSpider().run()

这里有一点 由于url中参数的值是累加的,抓取过程中发现到了pn=1350的时候,抓取数据返回空值,所以得在程序中做一个判断 最后程序效果: 视频展示:

python爬虫张子枫图片

由于图片太多 本次只抓取了1000多张图片便终止了程序 最后附上张子枫图片合集百度网盘链接: https://pan.baidu.com/s/1Jmd2BHCM3oY1DM2AqBZjuw 密码:hk0h

最新回复(0)