分享:PHP + Swoole 多进程、消息队列、协程爬取数据的案例 demo

tech2022-08-22  129

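执行环境:需要在命令行(CLI)下运行 PHP,并安装 Swoole 扩展(需支持 Co\run 协程容器,且在 php.ini 中设置 swoole.use_shortname='on');本机 MySQL(127.0.0.1:3306)中需要有 lianjia 数据库和 house 表(字段:title、address、area、aspect、house_type、price、add_time)。有问题请留言。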

<?php
/**
 * Created by PhpStorm.
 * User: Administrator
 * Date: 2020/9/1 0001
 * Time: 22:16
 *
 * Demo: crawl rental listings with one Swoole child process per district.
 * Each child scrapes $page listing pages and pushes every page's rows (JSON)
 * into a System V message queue; the parent pops the messages and inserts the
 * rows into MySQL from inside a coroutine.
 */

$citys = [
    'luohuqu', 'futianqu', 'nanshanqu', 'yantianqu', 'baoanqu',
    'longgangqu', 'longhuaqu', 'guangmingqu', 'pingshanqu', 'dapengxinqu',
];
$start_time = time();
//$citys = ['luohuqu','futianqu'];
$page = 10;

// Started child processes, keyed by pid. Initialised up front so the loops
// below are well-defined even when no process could be started.
$workers = [];

// Spawn one child process per district; children talk to the parent through
// the message queue enabled by useQueue().
foreach ($citys as $city) {
    $process = new Swoole\Process(function ($worker) use ($city, $page) {
        // Walk the paginated listing for this district.
        for ($i = 1; $i <= $page; $i++) {
            // Build the URL of the current listing page.
            $url = 'https://sz.lianjia.com/zufang/' . $city . '/pg' . $i;
            // Scrape the page into an array of rows.
            $data = getUrlData($url);
            // Queue the rows for the parent.
            $payload = json_encode($data, JSON_UNESCAPED_UNICODE);
            if ($payload === false || !$worker->push($payload)) {
                // The parent pops exactly $page messages per worker, so a lost
                // message would leave it blocked in pop(). Push an empty page
                // instead to keep the counts in sync.
                // NOTE(review): push() may fail when the payload exceeds the
                // system message size limit - confirm against kernel msgmax.
                error_log('queue push failed for ' . $url . ', sending empty page');
                $worker->push('[]');
            }
        }
    });

    // Enable the message queue for this process.
    $process->useQueue();

    // Start the child; start() yields the child's pid, or false on failure.
    $pid = $process->start();
    if ($pid === false) {
        error_log('failed to start worker process for ' . $city);
        continue;
    }

    // Remember the process so its messages can be consumed below.
    $workers[$pid] = $process;
}

// Drain the queue ($page messages per started worker) and insert every page of
// rows into the database from inside a coroutine.
foreach ($workers as $worker) {
    for ($i = 1; $i <= $page; $i++) {
        $message = $worker->pop();
        $data = is_string($message) ? json_decode($message, true) : null;
        if (!is_array($data) || $data === []) {
            // Failed pop, undecodable payload, or an empty page: nothing to insert.
            continue;
        }

        // Three equivalent ways to run the insert in a coroutine; any one works.
        // 1) Coroutine container with a nested coroutine. Uses the short-name
        //    API, which needs swoole.use_shortname='on' in php.ini.
        Co\run(function () use ($data) {
            go(function () use ($data) {
                mysql_query($data);
            });
        });

        // 2) Coroutine container only (a wrapper around Scheduler); also needs
        //    swoole.use_shortname='on' in php.ini.
        // Co\run(function() use($data){
        //     mysql_query($data);
        // });

        // 3) Coroutine scheduler class.
        // $scheduler = new Swoole\Coroutine\Scheduler();
        // $scheduler->add(function() use($data){
        //     mysql_query($data);
        // });
        // $scheduler->start();
    }
}

// Reap the finished children so they do not linger as zombie processes.
foreach ($workers as $unused) {
    Swoole\Process::wait();
}

/**
 * Insert one page of scraped rows into the `house` table using the Swoole
 * coroutine MySQL client. Must be called from inside a coroutine.
 *
 * Rows missing any of the expected fields are skipped. Connection, prepare and
 * execute errors are dumped and do not throw.
 *
 * NOTE(review): credentials are hard-coded; move them to configuration before
 * using this outside a local demo.
 *
 * @param mixed $data list of rows, each with the keys title, address, area,
 *                    aspect, house_type and price; anything that is not a
 *                    non-empty array is ignored
 * @return void
 */
function mysql_query($data)
{
    if (!is_array($data) || $data === []) {
        return;
    }

    // Open the MySQL connection.
    $mysql = new Swoole\Coroutine\MySQL();
    $connected = $mysql->connect([
        'host' => '127.0.0.1',
        'port' => 3306,
        'user' => 'root',
        'password' => 'cxh1002.',
        'database' => 'lianjia',
    ]);
    if (!$connected) {
        var_dump($mysql->connect_error);
        return;
    }

    // Prepare once and reuse the statement for every row.
    $stmt = $mysql->prepare('INSERT INTO house (title,address,area,aspect,house_type,price,add_time) VALUES (?,?,?,?,?,?,?)');
    if (!$stmt) {
        var_dump($mysql->error);
        $mysql->close();
        return;
    }

    $time = time();
    $fields = ['title', 'address', 'area', 'aspect', 'house_type', 'price'];

    foreach ($data as $val) {
        // Collect the bound parameters in column order; skip incomplete rows.
        $params = [];
        foreach ($fields as $field) {
            if (!is_array($val) || !isset($val[$field])) {
                continue 2;
            }
            $params[] = $val[$field];
        }
        $params[] = $time;

        // Send the bound parameters for this row.
        $res = $stmt->execute($params);
        if ($res === false) {
            var_dump($stmt->error);
        }
        // var_dump($res);
    }

    $mysql->close();
}

/**
 * Download a listing page and extract its rental entries with regular
 * expressions.
 *
 * An entry is returned only when all of its fields were found, so every row in
 * the result is complete.
 *
 * @param string $url listing page URL
 * @return array rows with the keys title, address, area, aspect, house_type
 *               and price, keyed by the position of the matched block on the
 *               page; empty when the download fails or nothing matches
 */
function getUrlData($url)
{
    $data = [];

    // Fetch the whole page HTML.
    $html = file_get_contents($url);
    if ($html === false) {
        return $data;
    }

    // Match every listing block.
    $preg_div = '/<div class=\"content__list--item--main\">.*?<\/div>/ism';
    preg_match_all($preg_div, $html, $match_div);

    foreach ($match_div[0] as $key => $val) {
        // Title and address come from the first four anchors of the block.
        $preg_a = '/<a .*?>.*?<\/a>/ism';
        preg_match_all($preg_a, $val, $match_a);
        if (count($match_a[0]) < 4) {
            continue;
        }
        list($a, $b, $c, $d) = $match_a[0];

        // Area, aspect and house type sit between the <i> separators.
        $preg_i = '/<\/i>.*?<i>/ism';
        preg_match_all($preg_i, $val, $match_i);
        if (count($match_i[0]) < 3) {
            continue;
        }
        list($e, $f, $g) = $match_i[0];

        // Monthly rent.
        $preg_em = '/<em>.*?<\/em>/ism';
        preg_match_all($preg_em, $val, $match_em);
        if (!isset($match_em[0][0])) {
            continue;
        }

        // Store the row only once every field is known, so no partial rows
        // reach the database insert.
        $data[$key] = [
            'title' => trim(strip_tags($a)),
            'address' => trim(strip_tags($b)) . '/' . trim(strip_tags($c)) . '/' . trim(strip_tags($d)),
            'area' => trim(strip_tags($e)),
            'aspect' => trim(strip_tags($f)),
            'house_type' => trim(strip_tags($g)),
            'price' => trim(strip_tags($match_em[0][0])),
        ];
    }

    return $data;
}

echo 'time:' . (time() - $start_time) . PHP_EOL;
最新回复(0)