Scraping novels with Node.js
Scraping the novel
This post shares a demo that crawls a novel site; if you are just starting out with crawlers, you can borrow the approach for your own program. The idea, in five steps (sketched in code right after this list):
1. Find the link of the site you want to crawl and fetch the book links from it.
2. Every book has a link of its own.
3. The page behind each book's link contains its chapter-list links.
4. So we can use the book link to obtain the chapter links.
5. Follow each chapter link, fetch the content, and write it to a file, and the crawl is done.
Those five steps are basically the whole task.
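In code, the whole flow boils down to three awaited stages inside one async function. A minimal sketch of that shape (the helper names fetchBookList, fetchChapterList, and saveChapter are placeholders of my own; the real implementations follow in the sections below):

// Sketch of the three-stage flow; the real functions are shown below
async function crawl() {
    var books = await fetchBookList();                // steps 1-2: book links
    for (var book of books) {
        book.chapters = await fetchChapterList(book); // steps 3-4: chapter links
        for (var chapter of book.chapters) {
            await saveChapter(book, chapter);         // step 5: content to files
        }
    }
}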
Code to fetch the book links
var data = [];
// Page through the category listing (only page 1 here; raise the bound for more pages)
for (var i = 1; i < 2; i++) {
    var d = await getHttpData('http://book.zongheng.com/store/c1/c1003/b0/u0/p' + i + '/v0/s1/t0/u0/i1/ALL.html', getBookName);
    data = [...data, ...d];
}

// Parse one listing page and return an array of book records
function getBookName($) {
    var reg = /http.+.html/g;
    var url = [];
    $('.store_collist .bookbox').each(function(index, ele) {
        var _url = $(ele).find('.bookname').html().match(reg);
        var bookName = $(ele).find('.bookname a').text();
        var author = $(ele).find('.bookilnk a:first-child').text();
        // Rewrite the book detail URL into its chapter-list (showchapter) URL
        _url[0] = _url[0].replace('com/book', 'com/showchapter');
        var imgUrl = $(ele).find('.bookimg img').prop('src');
        var describe = $(ele).find('.bookintro').text();
        var obj = {
            bookUrl: _url[0],
            bookName: bookName,
            author: author,
            imgSrc: imgUrl,
            describe: describe,
            childrenUrl: []
        };
        url.push(obj);
    });
    return url;
}
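For orientation, each record pushed into url has this shape (the values here are placeholders I made up, not real scraped data):

// Example shape of one book record (placeholder values):
var exampleBook = {
    bookUrl: 'http://book.zongheng.com/showchapter/....html',
    bookName: '...',
    author: '...',
    imgSrc: '...',
    describe: '...',
    childrenUrl: []  // filled in by the chapter-list step
};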
Fetching the chapter links
// For every book, fetch its chapter-list page and store the chapter URLs
for (var i = 0; i < data.length; i++) {
    var d1 = await getHttpData(data[i].bookUrl, getChapterList);
    data[i].childrenUrl = d1;
}

// Parse a chapter-list page and return an array of chapter URLs
function getChapterList($) {
    var url = [];
    var reg = /http.+.html/g;
    $('.chapter-list .col-4').each(function(index, ele) {
        var str_url = $(this).html().match(reg);
        url.push(str_url[0]);
    });
    return url;
}
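As an aside, matching the element's raw HTML with a regex works on this page, but reading the anchor's href attribute directly is a bit more robust against markup changes. A drop-in alternative for the body of the each callback, using the same cheerio API the script already relies on:

// Read the chapter link straight from the anchor instead of regex-matching HTML
var str_url = $(this).find('a').attr('href');
url.push(str_url);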
Fetching the content
// For every chapter of every book, fetch the content and write it to disk
for (var i = 0; i < data.length; i++) {
    for (var j = 0; j < data[i].childrenUrl.length; j++) {
        var obj = await getHttpData(data[i].childrenUrl[j], getChapterContent);
        var p_txt = `aa/${data[i].bookName}-${data[i].author}/${obj.txtName}`;
        // Create the per-book directory if needed (the top-level aa/ directory must already exist)
        if (!fs.existsSync(`aa/${data[i].bookName}-${data[i].author}`)) {
            try {
                fs.mkdirSync(`aa/${data[i].bookName}-${data[i].author}`);
            } catch (e) {
                console.log(e);
            }
        }
        // Write the chapter file only if it has not been saved before
        if (!fs.existsSync(p_txt)) {
            try {
                fs.writeFileSync(p_txt, obj.content);
            } catch (e) {
                console.log(e);
            }
        }
        // Busy-wait roughly one second between requests (see the note below)
        for (var k = 0; k < 1000000000; k++) {}
    }
}

// Parse a chapter page: extract the title and metadata, prepend an info
// header to the chapter HTML, and return { txtName, content }
function getChapterContent($) {
    var title = $('.reader_box .title_txtbox').text();
    var author = $('.bookinfo a:first-child').text();
    var number = $('.bookinfo span:nth-child(2)').text();
    var time = $('.bookinfo span:nth-child(3)').text();
    var content = $('.reader_box .content').html();
    var info = `${title}-${number}字-${time}`;
    content = `<h5>${info}</h5>${content}`;
    // Strip characters that are illegal in file names
    title = title.replace(/\?|\!|\"|\:|\<|\>|\||\\|\//g, " ");
    var obj = {
        txtName: title + ".txt",
        content: content
    };
    console.log("Fetched " + info);
    return obj;
}
The whole crawl splits into three parts: first fetch the books, then use each book to fetch its chapter list, then use the chapter list to fetch the content. Because these three parts must happen in order, I use Promise objects to turn the asynchronous operations into sequential ones. If you do not yet understand how that conversion works, you can ignore it for now and just copy the pattern. A few places do not strictly need to be sequential; making the whole crawl a step-by-step process simply makes it easier to follow.
Note: the empty for loop in the code exists to keep roughly one second between requests; it deliberately blocks execution. A bare setTimeout cannot be used for this because it is asynchronous and does not block, and without throttling the crawl the server will ban you.
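Since the loops already run inside an async function, an alternative that keeps the one-second spacing without burning CPU is a promise-wrapped setTimeout that you await between requests. A minimal sketch (the sleep helper is my own addition, not part of the original script):

// Resolves after ms milliseconds; awaiting it pauses the async function
// without blocking the event loop or spinning the CPU
function sleep(ms) {
    return new Promise(function(resolve) {
        setTimeout(resolve, ms);
    });
}

// In the crawl loop, in place of the empty for loop:
// await sleep(1000);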
The complete code
const http = require('http');
const fs = require('fs');
const cheerio = require('cheerio');

// Namespace object that carries the entry point
function Pac() {}

// Fetch a URL, load the HTML into cheerio, run the parser callback,
// and resolve with whatever the callback returns
function getHttpData(http1, callback) {
    var p = new Promise(function(resolve, reject) {
        http.get(http1, function(res) {
            var html = '';
            res.on('data', function(data) {
                html += data;
            });
            res.on('end', function() {
                const $ = cheerio.load(html, {
                    decodeEntities: false
                });
                var d = callback($);
                resolve(d);
            });
        }).on('error', reject); // surface request errors to the catch below
    }).catch((err) => {
        console.log(err);
    });
    return p;
}

// Parse a chapter page: extract the title and metadata, prepend an info
// header to the chapter HTML, and return { txtName, content }
function getChapterContent($) {
    var title = $('.reader_box .title_txtbox').text();
    var author = $('.bookinfo a:first-child').text();
    var number = $('.bookinfo span:nth-child(2)').text();
    var time = $('.bookinfo span:nth-child(3)').text();
    var content = $('.reader_box .content').html();
    var info = `${title}-${number}字-${time}`;
    content = `<h5>${info}</h5>${content}`;
    // Strip characters that are illegal in file names
    title = title.replace(/\?|\!|\"|\:|\<|\>|\||\\|\//g, " ");
    var obj = {
        txtName: title + ".txt",
        content: content
    };
    console.log("Fetched " + info);
    return obj;
}

// Parse a chapter-list page and return an array of chapter URLs
function getChapterList($) {
    var url = [];
    var reg = /http.+.html/g;
    $('.chapter-list .col-4').each(function(index, ele) {
        var str_url = $(this).html().match(reg);
        url.push(str_url[0]);
    });
    return url;
}

// Parse one listing page and return an array of book records
function getBookName($) {
    var reg = /http.+.html/g;
    var url = [];
    $('.store_collist .bookbox').each(function(index, ele) {
        var _url = $(ele).find('.bookname').html().match(reg);
        var bookName = $(ele).find('.bookname a').text();
        var author = $(ele).find('.bookilnk a:first-child').text();
        // Rewrite the book detail URL into its chapter-list (showchapter) URL
        _url[0] = _url[0].replace('com/book', 'com/showchapter');
        var imgUrl = $(ele).find('.bookimg img').prop('src');
        var describe = $(ele).find('.bookintro').text();
        var obj = {
            bookUrl: _url[0],
            bookName: bookName,
            author: author,
            imgSrc: imgUrl,
            describe: describe,
            childrenUrl: []
        };
        url.push(obj);
    });
    return url;
}

Pac.start = async function() {
    // Stage 1: collect the book records from the listing pages
    var data = [];
    for (var i = 1; i < 2; i++) {
        var d = await getHttpData('http://book.zongheng.com/store/c1/c1003/b0/u0/p' + i + '/v0/s1/t0/u0/i1/ALL.html', getBookName);
        data = [...data, ...d];
    }
    // Stage 2: collect every book's chapter URLs
    for (var i = 0; i < data.length; i++) {
        var d1 = await getHttpData(data[i].bookUrl, getChapterList);
        data[i].childrenUrl = d1;
    }
    // Stage 3: fetch each chapter and write it to disk
    for (var i = 0; i < data.length; i++) {
        for (var j = 0; j < data[i].childrenUrl.length; j++) {
            var obj = await getHttpData(data[i].childrenUrl[j], getChapterContent);
            var p_txt = `aa/${data[i].bookName}-${data[i].author}/${obj.txtName}`;
            if (!fs.existsSync(`aa/${data[i].bookName}-${data[i].author}`)) {
                try {
                    fs.mkdirSync(`aa/${data[i].bookName}-${data[i].author}`);
                } catch (e) {
                    console.log(e);
                }
            }
            if (!fs.existsSync(p_txt)) {
                try {
                    fs.writeFileSync(p_txt, obj.content);
                } catch (e) {
                    console.log(e);
                }
            }
            // Throttle: busy-wait roughly one second between requests
            for (var k = 0; k < 1000000000; k++) {}
        }
    }
    console.log("Crawl finished");
}
Pac.start();
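A quick usage note: cheerio is the only third-party dependency (npm install cheerio; http and fs ship with Node), and the script expects an aa/ directory to already exist in the working directory, because mkdirSync is called without the recursive option and only creates the per-book subdirectory.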