-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcrawler.js
115 lines (106 loc) · 3.54 KB
/
crawler.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
const _ = require('lodash');
const cheerio = require('cheerio');
const puppeteer = require('puppeteer');
// Default search keyword; the route handler replaces it when CLI arguments are given.
let keyword = '米森 蔓越莓麥片';

/**
 * Shopping sites to scrape. For each entry:
 *   - name:    site identifier, also passed to the parser as its `type`
 *   - url:     base search URL
 *   - params:  how the keyword is injected into `url`
 *                type 'query' -> appended as `&<name>=<keyword>`
 *                type 'param' -> the placeholder text `<name>` inside `url` is replaced
 *   - product: CSS selectors used by the parser (item count, image, title, price)
 */
const sourceList = [
  {
    name: 'momo',
    url: 'https://www.momoshop.com.tw/search/searchShop.jsp?searchType=1&curPage=1',
    params: { type: 'query', name: 'keyword' },
    product: {
      count: '.listArea ul li',
      img: '.listArea ul li .prdImg',
      title: '.listArea ul li .prdName',
      price: '.listArea ul li .money .price b',
    },
  },
  {
    name: 'yahoo',
    url: 'https://tw.search.buy.yahoo.com/search/shopping/product?qt=product',
    params: { type: 'query', name: 'p' },
    product: {
      count: '.list-type .item',
      img: '.srp-pdimage a img',
      title: '.srp-pdtitle a',
      price: '.srp-actprice em',
    },
  },
  {
    name: 'book',
    url: 'http://search.books.com.tw/search/query/key/keyword/cat/all',
    params: { type: 'param', name: 'keyword' },
    product: {
      count: '.searchbook .item',
      img: '.searchbook .item a img.itemcov',
      // Same node as `img`: for this site the parser reads the title from the image's alt attribute.
      title: '.searchbook .item a img.itemcov',
      price: '.searchbook .item .price',
    },
  },
];
// Upper bound on the number of products parsed per page.
const MAX_PARSED_ITEMS = 10;

// Returns the named attribute of a parsed node, or undefined when the node is missing.
const readAttribute = (node, name) => (node && node.attribs ? node.attribs[name] : undefined);

// Returns the text data of a node's first child, or undefined when the node or child is missing.
const readFirstChildText = node => {
  const firstChild = node && node.children ? node.children[0] : undefined;
  return firstChild ? firstChild.data : undefined;
};

/**
 * Extracts up to MAX_PARSED_ITEMS products from a loaded page.
 *
 * @param {Function} $ - Query function; `$(selector)` must return an object exposing
 *   `length` and `get(index)` (the subset of the cheerio API used here).
 * @param {Object} [params] - Selectors: `count` (one match per product), `img`, `title`, `price`.
 * @param {string} [type] - Source name; 'book' switches to that site's attribute-based layout.
 * @returns {Array<{img: (string|undefined), title: (string|undefined), price: (string|undefined)}>}
 *   One entry per product. A field is undefined when its node is absent from the page,
 *   so a single incomplete listing no longer aborts the whole parse with a TypeError.
 */
const crawler = ($, params = {}, type) => {
  const isBook = type === 'book';
  const itemCount = Math.min($(params.count).length, MAX_PARSED_ITEMS);

  // Run each page-wide selector once instead of once per item.
  const images = $(params.img);
  const titles = $(params.title);
  const prices = isBook ? null : $(params.price);

  const parseData = [];
  for (let index = 0; index < itemCount; index++) {
    const imageNode = images.get(index);
    const titleNode = titles.get(index);

    const img = readAttribute(imageNode, isBook ? 'data-original' : 'src');
    const title = isBook ? readAttribute(titleNode, 'alt') : readFirstChildText(titleNode);

    let price;
    if (isBook) {
      // The price is read from the <b> whose position matches the last <strong>
      // inside the index-th item's price area.
      const selector = `${params.count}:nth-child(${index + 1}) .price strong`;
      const strongCount = $(selector).length;
      const lastIndex = strongCount > 0 ? strongCount - 1 : 0;
      price = readFirstChildText($(`${selector} b`).get(lastIndex));
    } else {
      price = readFirstChildText(prices.get(index));
    }

    parseData.push({ img, title, price });
  }
  return parseData;
};
module.exports = api => {
api.get('/', async ctx => {
const argv = process.argv.slice(2);
if (!_.isEmpty(argv)) {
keyword = _.reduce(argv, (sum, val) => {
return sum + val;
}, '');
}
// console.info('------------ Program start ------------', keyword);
const browser = await puppeteer.launch();
// console.info('------------ start crawler ------------');
const data = await Promise.all(sourceList.map(source => {
return new Promise(async (resolve, reject) => {
try {
const page = await browser.newPage();
let url = source.url;
if (source.params.type === 'query') {
url = `${url}&${source.params.name}=${keyword}`;
} else if (source.params.type === 'param') {
url = url.replace(source.params.name, keyword);
}
// console.info('------------ url ------------', url);
await page.goto(url);
const content = await page.content();
const $ = cheerio.load(content);
const result = crawler($, source.product, source.name);
await page.close();
resolve({ data: result, website: source.name });
} catch (error) {
reject(error);
}
});
}));
await browser.close();
ctx.render('index', { keyword, data, title: 'Crawler' });
// console.error('------------ Program end ------------');
});
};