执行环境想必不用我多说了,有问题留言
<?php
/**
* Created by PhpStorm.
* User: Administrator
* Date: 2020/9/1 0001
* Time: 22:16
*/
$citys = ['luohuqu',
'futianqu',
'nanshanqu',
'yantianqu',
'baoanqu',
'longgangqu',
'longhuaqu',
'guangmingqu',
'pingshanqu',
'dapengxinqu'];
$start_time = time
();
//
$citys = ['luohuqu',
'futianqu'];
$page = 10
;
//循环城市创建多进程,使用消息队列
foreach
($citys as
$key=>$city){
$process = new Swoole\Process
(function
($worker) use
($city,
$page){
//循环分页数
for ($i = 1
; $i <= $page; $i++
){
//创建分页地址
$url = 'https://sz.lianjia.com/zufang/' . $city . '/pg' . $i;
//爬取网页html数据
$data = getUrlData
($url);
//往队列放入数据
$worker-
>push
(json_encode
($data,JSON_UNESCAPED_UNICODE
));
}
});
//使用队列
$process-
>useQueue
();
//开启进程获取进程id
$pid = $process-
>start
();
$pid = $process-
>pid
;
//赋值进程数组
$workers[$pid] = $process;
}
//循环进程数组取出队列,使用协程将数据插入表
foreach
($workers as
$worker){
for ($i = 1
; $i <= $page; $i++
){
$data = json_decode
($worker-
>pop
(),true
);
//三种方式,任意一种即可
//协程容器里面开启协程,短名称特性,需要在php.ini设置swoole.use_shortname
='on'
Co\run
(function
() use
($data){
go
(function
() use
($data){
mysql_query
($data);
});
});
// //协程容器(对Scheduler的封装),短名称特性,需要在php.ini设置swoole.use_shortname
='on'
// Co\run
(function
() use
($data){
// mysql_query
($data);
//
});
// //协程调度器类
//
$scheduler = new Swoole\Coroutine\Scheduler
();
//
$scheduler-
>add
(function
() use
($data){
// mysql_query
($data);
//
});
//
$scheduler-
>start
();
}
}
//执行协程mysql客户端
function mysql_query
($data){
//创建mysql连接
$mysql = new Swoole\Coroutine\MySQL
();
$mysql-
>connect
([
'host'=>'127.0.0.1',
'port'=>3306,
'user'=>'root',
'password'=>'cxh1002.',
'database'=>'lianjia',
]);
$time = time
();
foreach
($data as
$val){
//预处理语句
$stmt = $mysql-
>prepare
('INSERT INTO house (title,address,area,aspect,house_type,price,add_time) VALUES (?,?,?,?,?,?,?)');
if
(!$stmt || $stmt-
>error
){
var_dump
($mysql-
>error
);
return;
}
//发送预处理数据参数
$res = $stmt-
>execute
([
$val['title'],
$val['address'],
$val['area'],
$val['aspect'],
$val['house_type'],
$val['price'],
$time,
]);
// var_dump
($res);
}
}
//爬取网页数据
function getUrlData
($url){
$data = [];
//获取整个网页html
$html = file_get_contents
($url);
//匹配某个div数据块
$preg_div = '/<div class=\"content__list--item--main\">.*?<\/div>/ism';
preg_match_all
($preg_div,
$html,
$match_div);
//循环匹配数据存入数据库
foreach
($match_div[0
] as
$key=>$val){
//匹配标题,地址
$preg_a = '/<a .*?>.*?<\/a>/ism';
preg_match_all
($preg_a,
$val,
$match_a);
if
(count
($match_a[0
]) < 4
) continue;
list
($a,
$b,
$c,
$d) = $match_a[0
];
$data[$key]['title'] = trim
(strip_tags
($a));
$data[$key]['address'] = trim
(strip_tags
($b)) . '/' . trim
(strip_tags
($c)) . '/' . trim
(strip_tags
($d));
//匹配面积,朝向,户型
$preg_i = '/<\/i>.*?<i>/ism';
preg_match_all
($preg_i,
$val,
$match_i);
if
(count
($match_i[0
]) < 3
) continue;
list
($e,
$f,
$g) = $match_i[0
];
$data[$key]['area'] = trim
(strip_tags
($e));
$data[$key]['aspect'] = trim
(strip_tags
($f));
$data[$key]['house_type'] = trim
(strip_tags
($g));
//匹配月租
$preg_em = '/<em>.*?<\/em>/ism';
preg_match_all
($preg_em,
$val,
$match_em);
$data[$key]['price'] = trim
(strip_tags
($match_em[0
][0
]));
}
return $data;
}
echo 'time:' . (time
() -
$start_time) . PHP_EOL
;