-
Notifications
You must be signed in to change notification settings - Fork 25
/
Copy pathscrape.js
71 lines (57 loc) · 2.45 KB
/
scrape.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
// From http://blog.miguelgrinberg.com/post/easy-web-scraping-with-nodejs
var request = require('request');
var cheerio = require('cheerio');
var async = require('async');
var people = require(process.argv[2]);
/**
 * Turn the `src` attribute of the profile photo into an absolute URL.
 *
 * A `src` that already starts with a scheme (e.g. `https:`) or is
 * protocol-relative (`//host/...`) is returned untouched; prepending the site
 * origin to it would produce an invalid URL. Anything else is appended to the
 * Google Scholar origin, exactly as before.
 *
 * @param {string} src - Raw `src` attribute value.
 * @returns {string} URL to store in the scraped record.
 */
function absolutePhotoUrl(src) {
  if (/^(?:[a-z][a-z0-9+.\-]*:|\/\/)/i.test(src)) {
    return src;
  }
  return 'http://scholar.google.com' + src;
}

/**
 * Read one row of the statistics table.
 *
 * @param {Object} statsBody - Node whose children are the table rows.
 * @param {number} row - Index of the row to read.
 * @returns {Array} The text of the row's second and third cells, in that order.
 */
function statPair(statsBody, row) {
  var cells = statsBody.children[row].children;
  return [ cells[1].children[0].data, cells[2].children[0].data ];
}

/**
 * Extract one person's record from a loaded profile page.
 *
 * All lookups are positional, so any element missing from the page makes this
 * throw a TypeError; the caller treats that as a failed scrape.
 * NOTE(review): the child indexes assume the markup layout the script was
 * written against -- confirm against the current page structure.
 *
 * @param {Function} $ - Document returned by `cheerio.load`.
 * @param {string} person - Key of the person in `people`.
 * @param {string} url - Profile URL that was fetched.
 * @returns {Object} Record with name, url, photo, affiliation, keywords, stats and year.
 */
function parseProfile($, person, url) {
  var photo = $('#gsc_prf_pup-img')[0].attribs.src;
  var affiliation = $('.gsc_prf_il', '#gsc_prf_i').first().text();
  // One keyword per child element; the keyword is the child's first text node.
  var keywords = $('#gsc_prf_int')[0].children.map(function(node) {
    return node.children[0].data;
  });
  // Second child of the stats table holds the data rows (presumably the <tbody>).
  var statsBody = $('#gsc_rsb_st')[0].children[1];
  var rawYear = $('.gsc_md_hist_b');
  return {
    'name' : person,
    'url': url,
    'photo' : absolutePhotoUrl(photo),
    'affiliation' : affiliation,
    'keywords' : keywords,
    'stats' : {
      'citations' : statPair(statsBody, 0),
      'hindex' : statPair(statsBody, 1),
      'i10index' : statPair(statsBody, 2)
    },
    'year' : rawYear[0].children[0].children[0].data
  };
}

/**
 * Build the error raised when a person cannot be scraped.
 *
 * @param {string} person - Key of the person in `people`.
 * @param {Object} [resp] - HTTP response, if one was received.
 * @param {*} cause - The underlying network or parsing error.
 * @returns {Error} Error naming the person, the HTTP status (when known) and
 *     the underlying reason; the original error is kept on `.cause`.
 */
function scrapeFailure(person, resp, cause) {
  var status = (resp && resp.statusCode) ? ' (HTTP ' + resp.statusCode + ')' : '';
  var reason = (cause && cause.message) ? cause.message : String(cause);
  var failure = new Error('Failed to scrape ' + person + status + ': ' + reason);
  failure.cause = cause;
  return failure;
}

/**
 * Fetch and parse the profile page of one person.
 *
 * On success `doneCallback(null, record)` is invoked after a 5 second pause.
 * On failure (network error or unparseable page) the underlying error is
 * logged to stderr and an Error is thrown from inside the request callback;
 * `doneCallback` is never called in that case, so a failure aborts the run
 * instead of yielding an incomplete record.
 *
 * @param {string} person - Key into `people`; its value is the profile URL.
 * @param {Function} doneCallback - Node-style callback `(err, record)`.
 */
var scrapeEntry = function(person, doneCallback) {
  var url = people[person];
  // properly set the encoding, or we'll mangle accented characters:
  // http://stackoverflow.com/questions/8332500/module-request-how-to-properly-retrieve-accented-characters-%EF%BF%BD-%EF%BF%BD-%EF%BF%BD
  request({ encoding: 'binary', method: "GET", uri: url}, function(err, resp, body) {
    // Results are written to stdout, so progress goes to stderr.
    console.error("Scraping " + person + "...");

    // A transport-level failure leaves `body` undefined; report the real cause
    // instead of letting the parser trip over the missing document.
    if (err) {
      console.error(err);
      throw scrapeFailure(person, resp, err);
    }

    var data;
    try {
      data = parseProfile(cheerio.load(body), person, url);
    } catch (ex) {
      console.error(ex);
      throw scrapeFailure(person, resp, ex);
    }

    // Adding a timeout to regulate scraping speed.
    setTimeout(function() {
      doneCallback(null, data);
    }, 5000);
  });
};
// http://javascriptplayground.com/blog/2013/06/think-async/
// Scrape every entry of `people` in series and, once all of them are done,
// print the collected records to stdout as a JavaScript snippet that defines
// `date` (time of the scrape) and `data` (the array of records).
async.mapSeries(Object.keys(people), scrapeEntry, function (err, results) {
  if (err) {
    // Error-first callback: never emit an incomplete data file. Report the
    // problem on stderr and make the process exit with a failure status.
    console.error(err);
    process.exitCode = 1;
    return;
  }
  var date = new Date();
  console.log('var date = "' + date + '"');
  console.log('var data = ' + JSON.stringify(results, null, 2) + ';');
});