写了一个爬取图片的代码,但是一直是爬取失败,请求各位解答。(以国家地理网图片为例)`import requests import os import re
def getHTMLText(url): try: headers = { “User-Agent”: “Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36”, } cookie = “省略” COOKIE = {‘cookie’:cookie} r = requests.get(url,timeout=30,headers = headers,cookies=COOKIE) r.raise_for_status() r.encoding = r.apparent_encoding return r.text except: return “”
def parseJPG(pglist,html): jpgs = re.findall(r’“imgUrl”:“https://i02piccdn.*?”’,html) for jpg in jpgs: jpg = eval(jpg.split(’:’)[1]+":"+jpg.split(’:’)[2]) pglist.append(jpg)
def saveJPG(pglist): root = “E://研究生//爬虫//图片//” #保存位置 for jpg in pglist: path = root + jpg.split(’/’)[-1] try: if not os.path.exists(root): #判断根目录是否存在 os.mkdir(root) if not os.path.exists(path): #判断文件是否存在 r = requests.get(url,headers ={“User-Agent”: “Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36”}) with open(path,‘wb’) as f: f.write(r.content) f.close() print(“文件保存成功”) else: print(“文件已存在”) except: print(“爬取失败”)
def main(): url = ‘https://pic.sogou.com/pics?query=%B9%FA%BC%D2%B5%D8%C0%ED%CD%F8%CD%BC%C6%AC&ie=gbk&p=40230500&st=255&mode=255&policyType=0’ html = getHTMLText(url) pagelist = [] parseJPG(pagelist,html) saveJPG(pagelist)
main()
`