某查查企业爬虫(模拟人工验证码)

tech2023-01-04  116

1、.NET Core 3.1、C#、Selenium

/// <summary>
/// Crawls every province, then every city of each province, then every county of each city,
/// builds one search URL per county and writes all of them, one per line, to "企查查.txt".
/// </summary>
/// <param name="configPath">Not read by this implementation.</param>
/// <param name="directory">
/// Not read by this implementation; the output file name is a fixed relative path.
/// NOTE(review): presumably the file was meant to be written under this directory — confirm with callers.
/// </param>
public override async Task WriteAreaToFileAsync(string configPath, string directory)
{
    const string BaseProvinceLink = "https://www.qcc.com/search?key={keyword}#industrycode:K&";
    const string BaseCityLink = "https://www.qcc.com/search_getCityListHtml?province={0}";
    const string BaseCountyLink = "https://www.qcc.com/search_getCountyListHtml?city={0}";
    const string baseText = "province:{0}&city:{1}&county:{2}&";

    List<string> provinces = await GetCodeAsync(new Uri(BaseProvinceLink), ".sfilter-tag.clearfix.provinceChoose dd a");
    List<string> list = new List<string>();

    foreach (var province in provinces)
    {
        StringBuilder.Clear();
        Uri provinceUri = new Uri(StringBuilder.AppendFormat(BaseCityLink, province).ToString());
        var cities = await GetCodeAsync(provinceUri, "dd a");

        foreach (var city in cities)
        {
            StringBuilder.Clear();
            Uri cityUri = new Uri(StringBuilder.AppendFormat(BaseCountyLink, city).ToString());
            var counties = await GetCodeAsync(cityUri, "dd a");

            foreach (var county in counties)
            {
                StringBuilder.Clear();
                StringBuilder.Append(BaseProvinceLink);

                // The replacements run over the whole buffer (base link + area filter),
                // turning the page URL into the "search_index" form of the same query.
                string area = StringBuilder.AppendFormat(baseText, province, city, county)
                    .Replace("search", "search_index")
                    .Replace("中介#", "中介&ajaxflag=1&")
                    .Replace(":industrycode", "=industrycode")
                    .ToString();

                list.Add(area);
            }
        }
    }

    await File.WriteAllLinesAsync("企查查.txt", list);
}

/// <summary>
/// Downloads one page of search results and passes the inner HTML of the third cell
/// of every result row to <c>VertifyAsync</c>.
/// </summary>
/// <param name="cityUri">URL of the result page to download.</param>
/// <returns>
/// <c>true</c> when the page contained at least <c>PageSize</c> rows; <c>false</c> when it contained
/// fewer, when the site reported no data, or when the response stayed unrecognisable after several re-fetches.
/// </returns>
private async Task<bool> GetAgentsAsync(Uri cityUri)
{
    // Cap on responses that are neither a result page, a captcha redirect nor the
    // "no data" page. Without it the loop below would spin forever on the same string.
    const int MaxUnrecognisedResponses = 5;

    LogHelper.Info(cityUri.ToString());
    var pageSource = await HttpClient.GetStringAsync(cityUri);
    int unrecognisedResponses = 0;

    while (!pageSource.Contains("查企业"))
    {
        if (pageSource.StartsWith("<script>window.location"))
        {
            // The redirect target is the first single-quoted string in the script.
            VertifyCode(new Uri(pageSource.Split("'")[1]));
            pageSource = await HttpClient.GetStringAsync(cityUri);
            continue;
        }

        if (pageSource.Contains("小查还没找到数据"))
        {
            return false;
        }

        unrecognisedResponses++;
        if (unrecognisedResponses >= MaxUnrecognisedResponses)
        {
            LogHelper.Info("Giving up after repeated unrecognised responses: " + cityUri);
            return false;
        }

        // Brief pause before asking again so the retry is not a tight request loop.
        await Task.Delay(TimeSpan.FromSeconds(1));
        pageSource = await HttpClient.GetStringAsync(cityUri);
    }

    // Materialise once: the rows are both iterated and counted.
    var block = JumonyParser.Parse(pageSource).Find(".m_srchList tbody tr td:nth-child(3)").ToList();

    foreach (var item in block)
    {
        await VertifyAsync(item.InnerHtml());
    }

    return block.Count >= PageSize;
}

2、结果截图

3、需要开通vip账号

4、过滑动验证码

最新回复(0)