Crawler Basics: Depth-First and Breadth-First Strategies


1. Depth-first strategy, recursive approach:

import re
import requests

headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36 Edg/85.0.564.41"
}

# Fetch the page source; return an empty string on any request failure.
def get_html(url):
    try:
        res = requests.get(url, headers=headers, timeout=10)
        return res.text
    except requests.RequestException:
        return ""

# Extract the child URLs (href attributes of <a> tags) from a page.
def get_son_url(url):
    html = get_html(url)
    html_re = '<a.*?href="(.*?)".*?>'
    href_list = re.findall(html_re, html, re.S)
    return href_list

# Depth-first crawl by recursion.
def deep_path(url):
    # Stop once we are deeper than level 3.
    if deepdict[url] > 3:
        return
    print("\t" * deepdict[url], "current depth: %d" % deepdict[url])
    # Get the list of child URLs.
    sonurl_list = get_son_url(url)
    # Recurse into each child immediately (depth first).
    for sonurl in sonurl_list:
        # Keep only absolute links ('https://...' also starts with 'http').
        if sonurl.startswith('http'):
            # Skip URLs we have already seen.
            if sonurl not in deepdict:
                deepdict[sonurl] = deepdict[url] + 1  # child is one level deeper
                deep_path(sonurl)

if __name__ == '__main__':
    url = "https://www.baidu.com/s?wd=%E6%AD%A6%E6%B1%89%E5%85%89%E8%B0%B7"
    # deepdict tracks each URL's depth and doubles as the visited set.
    deepdict = {}
    deepdict[url] = 1  # the start URL is level 1
    deep_path(url)
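One limitation of the regex extraction above is that it keeps only absolute http/https links; relative hrefs such as /path are silently dropped. A minimal sketch of how get_son_url could resolve them with urllib.parse.urljoin (this is an extension of my own, not part of the original code):

from urllib.parse import urljoin

def get_son_url(url):
    # Resolve relative hrefs against the page URL before filtering.
    html = get_html(url)
    href_list = re.findall('<a.*?href="(.*?)".*?>', html, re.S)
    # urljoin leaves absolute URLs untouched and expands relative ones.
    return [urljoin(url, href) for href in href_list]

Because urljoin leaves absolute URLs untouched, the startswith('http') filter in deep_path keeps working unchanged.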

2. Breadth-first strategy using a queue:

import re
import requests

headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36 Edg/85.0.564.41"
}

# Fetch the page source; return an empty string on any request failure.
def get_html(url):
    try:
        res = requests.get(url, headers=headers, timeout=10)
        return res.text
    except requests.RequestException:
        return ""

# Extract the child URLs from a page.
def get_son_url(url):
    html = get_html(url)
    html_re = '<a.*?href="(.*?)".*?>'
    href_list = re.findall(html_re, html, re.S)
    return href_list

# Breadth-first crawl.
def vast_path(url):
    # A list simulates a FIFO queue: append() enqueues, pop(0) dequeues.
    url_queue = []
    url_queue.append(url)  # seed the queue with the start URL
    while len(url_queue) > 0:
        # Dequeue one URL per iteration (first in, first out).
        url = url_queue.pop(0)
        print("\t" * deepdict[url], "current depth: %d" % deepdict[url])
        if deepdict[url] < 3:
            # Get the list of child URLs.
            sonurl_list = get_son_url(url)
            for sonurl in sonurl_list:
                # Keep only absolute links.
                if sonurl.startswith('http'):
                    # Skip URLs we have already seen.
                    if sonurl not in deepdict:
                        deepdict[sonurl] = deepdict[url] + 1  # one level deeper
                        # Enqueue the child; it is visited only after everything
                        # already in the queue (breadth first).
                        url_queue.append(sonurl)

if __name__ == '__main__':
    url = "https://www.baidu.com/s?wd=%E6%AD%A6%E6%B1%89%E5%85%89%E8%B0%B7"
    # deepdict tracks each URL's depth and doubles as the visited set.
    deepdict = {}
    deepdict[url] = 1  # the start URL is level 1
    vast_path(url)
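A side note on the queue: list.pop(0) shifts every remaining element, so each dequeue costs O(n). A drop-in variant of vast_path using collections.deque, whose popleft() is O(1), assuming the same get_son_url and deepdict as above:

from collections import deque

def vast_path(url):
    # deque gives O(1) popleft(), unlike list.pop(0) which is O(n).
    url_queue = deque([url])
    while url_queue:
        url = url_queue.popleft()  # dequeue: first in, first out
        print("\t" * deepdict[url], "current depth: %d" % deepdict[url])
        if deepdict[url] < 3:
            for sonurl in get_son_url(url):
                if sonurl.startswith('http') and sonurl not in deepdict:
                    deepdict[sonurl] = deepdict[url] + 1
                    url_queue.append(sonurl)  # enqueue at the right end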

3. Depth-first strategy using a stack:

import re
import requests

headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36 Edg/85.0.564.41"
}

# Fetch the page source; return an empty string on any request failure.
def get_html(url):
    try:
        res = requests.get(url, headers=headers, timeout=10)
        return res.text
    except requests.RequestException:
        return ""

# Extract the child URLs from a page.
def get_son_url(url):
    html = get_html(url)
    html_re = '<a.*?href="(.*?)".*?>'
    href_list = re.findall(html_re, html, re.S)
    return href_list

# Depth-first crawl using an explicit stack.
def deep_path(url):
    # A list simulates a LIFO stack: append() pushes, pop() pops the last item.
    url_stack = []
    url_stack.append(url)  # seed the stack with the start URL
    while len(url_stack) > 0:
        # Pop the most recently pushed URL (last in, first out).
        url = url_stack.pop()
        print("\t" * deepdict[url], "current depth: %d" % deepdict[url])
        if deepdict[url] < 3:
            # Get the list of child URLs.
            sonurl_list = get_son_url(url)
            for sonurl in sonurl_list:
                # Keep only absolute links.
                if sonurl.startswith('http'):
                    # Skip URLs we have already seen.
                    if sonurl not in deepdict:
                        deepdict[sonurl] = deepdict[url] + 1  # one level deeper
                        # Push the child; it is visited before anything pushed
                        # earlier (depth first).
                        url_stack.append(sonurl)

if __name__ == '__main__':
    url = "https://www.baidu.com/s?wd=%E6%AD%A6%E6%B1%89%E5%85%89%E8%B0%B7"
    # deepdict tracks each URL's depth and doubles as the visited set.
    deepdict = {}
    deepdict[url] = 1  # the start URL is level 1
    deep_path(url)
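The only functional change from the queue version in section 2 is pop() instead of pop(0). A small self-contained demo on a made-up link graph (hypothetical data, not real pages) makes the two visit orders visible:

# Toy link graph: each key links to the pages in its list.
graph = {
    "A": ["B", "C"],
    "B": ["D"],
    "C": ["E"],
    "D": [],
    "E": [],
}

def traverse(start, as_stack):
    # One frontier list, used as a stack (pop last) or a queue (pop first).
    frontier, seen, order = [start], {start}, []
    while frontier:
        node = frontier.pop() if as_stack else frontier.pop(0)
        order.append(node)
        for child in graph[node]:
            if child not in seen:
                seen.add(child)
                frontier.append(child)
    return order

print(traverse("A", as_stack=False))  # queue / BFS: ['A', 'B', 'C', 'D', 'E']
print(traverse("A", as_stack=True))   # stack / DFS: ['A', 'C', 'E', 'B', 'D']

The queue visits the graph level by level, while the stack dives down one branch before backing up.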

Across a thousand mountains and ten thousand rivers there is always affection; how about giving me a follow?
