nodejs 简单并发爬虫

作者: Shaun_lan | 来源:发表于2017-12-29 16:52 被阅读178次

nodejs 简单并发爬虫
Golang实现简单爬虫框架（4）——队列实现并发任务调度
nodeJS爬虫（完整版）
nodejs简单爬虫技巧
简单的 nodejs 爬虫
nodejs通过钉钉群机器人推送消息
NodeJs + Phantomjs 简易爬虫
简单NodeJS爬虫和使用cookie进行模拟登录
Go基础编程---web编程
nodeJS做一个简单的爬虫

1、安装 5.0 以上版本的 Node.js #推荐使用 nvm 管理并安装相应的 Node 版本，详见：http://bubkoo.com/2017/01/08/quick-tip-multiple-versions-node-nvm/

2、在项目根目录下执行 npm init，然后一直回车即可 #如果项目根目录下还没有 package.json，该命令会生成 package.json

3、安装所需模块

npm install async --save

npm install cheerio --save

npm install superagent --save

#async 是 Node 的异步流程控制模块，各种用法详见：https://github.com/ShaunLan/async

#cheerio 用于解析爬虫获取到的 HTML，你可以像使用 jQuery 一样使用它

#superagent 是一个 HTTP 请求库，可以发起 get、post 等请求

4、爬虫的过程分析：

① 使用 superagent 请求要爬取的网址

② 使用 cheerio 解析获取到的 HTML 内容，再按 jQuery 获取数据的方式，从解析结果中取出自己想要爬取的数据

③ 如果想要并发、异步地请求多个要爬取的网址，则使用 async

#可参考：http://blog.didispace.com/nodejspachong/

5、实践代码
var superagent = require('superagent');
var cheerio = require('cheerio');
var async = require('async');

console.log('爬虫程序开始运行......');

// Entry point of the script: fetch the index page, collect the navigation
// links it contains, then hand them to check_url_access() so that each link
// is requested with a bounded level of concurrency.
superagent
   // Page to crawl
   .get('http://www.shouce.ren/api/index')
   // Request header
   .set('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8')
   // Handle the response
   .end(function(err, res){
       if(err || !res){
           console.log('抓取数据失败');

           // Surface the underlying cause instead of discarding it
           if(err){
               console.log(err);
           }

           return ;
       }

       // Parse the returned HTML
       var $ = cheerio.load(res.text);
       var data = [];
       // Base URL with an explicit scheme, so every collected address is an
       // absolute URL and does not depend on the HTTP client guessing one
       var host = 'http://www.shouce.ren';

       // Walk the matched navigation links and collect title/address pairs
       $('#bs-navbar-collapse .width-134').each(function(key, item){
           var title = $(item).text();
           var href = $(item).attr('href');

           // attr() yields undefined when the attribute is missing; such
           // entries (and ones without a title) cannot be checked, so skip
           // them. Returning undefined keeps the each() iteration going.
           if(!title.trim() || !href || !href.trim()){
               return ;
           }

           href = href.trim();

           var address;

           if(/^https?:\/\//i.test(href)){
               // Already absolute: use it unchanged
               address = href;
           } else {
               // Site-relative: join with the host, adding the separating
               // slash when the href does not start with one
               address = host + (href.charAt(0) === '/' ? '' : '/') + href;
           }

           data.push({
               'title' : title,
               'address' : address
           });
       });

       // Maximum number of link checks allowed in flight at once
       var parallel_request_qty = 10;

       if(data.length > 0){
           check_url_access(parallel_request_qty, data);
       }
   });

/**
 * Concurrently check whether each collected document URL can be fetched.
 *
 * Every entry is requested with superagent through async.mapLimit, so at
 * most `parallel_request_qty` requests are in flight at once. The outcome
 * for each URL (reachable or not) is written to the console. A failing URL
 * does not stop the remaining URLs from being checked.
 *
 * @param {number} parallel_request_qty - concurrency limit handed to async.mapLimit
 * @param {Array<Object>} data - entries to check; each one is read for its
 *     `title` (used in the log line) and `address` (the URL requested)
 */
function check_url_access(parallel_request_qty, data){
   async.mapLimit(data, parallel_request_qty, function(item, callback){
       var addr = item.address;
       var name = item.title;

       superagent
           .get(addr)
           .set('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8')
           .end(function(err, res){
               if(err || !res){
                   // Report the failure here rather than passing it to
                   // callback(): an error would make mapLimit fail fast and
                   // stop scheduling the URLs that have not been tried yet.
                   console.log('访问该URL失败: ' + addr);

                   callback(null, false);
               } else {
                   console.log(
                       '文档名称为:' + name +
                       '，文档地址为:' + addr +
                       '，可以成功访问'
                   );

                   callback(null, true);
               }
           });

   }, function(err, result){
       // The iteratee above never reports an error itself; this guard only
       // covers an error raised by the mapLimit machinery.
       if(err){
           console.log(err);
       }
   });
}