Merge pull request #106 from yujiosaka/add_denied_domains
feat: support deniedDomains option
yujiosaka authored Feb 21, 2018
2 parents 690e7d7 + c83977a commit ce24a6a
Showing 5 changed files with 64 additions and 10 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
@@ -9,6 +9,11 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
### Added

- Emit `newpage` event.
+ - Support `deniedDomains` for [crawler.queue()](https://github.com/yujiosaka/headless-chrome-crawler#crawlerqueueoptions)'s options.

+ ### Changed

+ - Allow `allowedDomains` option to accept a list of regular expressions.

## [1.3.2] - 2018-01-19

8 changes: 5 additions & 3 deletions README.md
@@ -180,7 +180,7 @@ browserWSEndpoint, ignoreHTTPSErrors
Also, the following options can be set as default values when [crawler.queue()](#crawlerqueueoptions) is executed.

```
- url, allowedDomains, timeout, priority, delay, retryCount, retryDelay, jQuery, device, username, password, evaluatePage
+ url, allowedDomains, deniedDomains, timeout, priority, delay, retryCount, retryDelay, jQuery, device, username, password, evaluatePage
```

> **Note**: In practice, setting the options every time you queue requests is redundant. Therefore, it's recommended to set default values once and override them as necessary.
@@ -220,7 +220,7 @@ ignoreHTTPSErrors, headless, executablePath, slowMo, args, ignoreDefaultArgs, ha
Also, the following options can be set as default values when [crawler.queue()](#crawlerqueueoptions) is executed.

```
- url, allowedDomains, timeout, priority, delay, retryCount, retryDelay, jQuery, device, username, password, evaluatePage
+ url, allowedDomains, deniedDomains, timeout, priority, delay, retryCount, retryDelay, jQuery, device, username, password, evaluatePage
```

> **Note**: In practice, setting the options every time you queue requests is redundant. Therefore, it's recommended to set default values once and override them as necessary.
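To make the note above concrete, here is a minimal sketch of that pattern, assuming the `HCCrawler.launch()`/`crawler.queue()` API this README documents (the URLs and option values are illustrative):

```js
const HCCrawler = require('headless-chrome-crawler');

HCCrawler.launch({
  maxConcurrency: 1, // required whenever delay is set
  delay: 1000,       // default applied to every queued request
  onSuccess: result => console.log(result),
})
  .then(crawler => {
    crawler.queue('https://example.com/');                    // inherits delay: 1000
    crawler.queue({ url: 'https://example.net/', delay: 0 }); // overrides the default
    return crawler.onIdle().then(() => crawler.close());
  });
```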
@@ -242,7 +242,8 @@ url, allowedDomains, timeout, priority, delay, retryCount, retryDelay, jQuery, d
* `skipDuplicates` <[boolean]> Whether to skip duplicate requests, defaults to `null`. A request is considered a duplicate if its `url`, `userAgent`, `device` and `extraHeaders` are all strictly the same.
* `obeyRobotsTxt` <[boolean]> Whether to obey [robots.txt](https://developers.google.com/search/reference/robots_txt), defaults to `true`.
* `followSitemapXml` <[boolean]> Whether to use [sitemap.xml](https://www.sitemaps.org/) to find locations, defaults to `false`.
- * `allowedDomains` <[Array]<[string]>> List of domains allowed to request. `www.example.com` will be allowed if `example.com` is listed.
+ * `allowedDomains` <[Array]<[string]|[RegExp]>> List of domains allowed to request. Pass `null` or leave default to skip checking allowed domains.
+ * `deniedDomains` <[Array]<[string]|[RegExp]>> List of domains not allowed to request. Pass `null` or leave default to skip checking denied domains.
* `delay` <[number]> Number of milliseconds to wait after each request, defaults to `0`. When `delay` is set, the `maxConcurrency` option must be `1`.
* `retryCount` <[number]> Maximum number of retries when a request fails, defaults to `3`.
* `retryDelay` <[number]> Number of milliseconds to wait after each failed retry, defaults to `10000`.
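Taken together, the two new options combine as in this hypothetical queue call (reusing the crawler from the sketch above; domains are illustrative — string entries must equal the hostname exactly, while regular expressions are tested against it):

```js
crawler.queue({
  url: 'https://example.com/',
  // Allow example.com and any of its subdomains...
  allowedDomains: ['example.com', /\.example\.com$/],
  // ...but skip requests to the admin subdomain even though it is allowed above.
  deniedDomains: ['admin.example.com'],
});
```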
@@ -548,6 +549,7 @@ Dynamic crawlers based on [PhantomJS](http://phantomjs.org) and [Selenium](http:
[Object]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Object "Object"
[Promise]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Promise "Promise"
[string]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Data_structures#String_type "String"
+ [RegExp]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/RegExp "RegExp"
[Serializable]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/JSON/stringify#Description "Serializable"
[Error]: https://nodejs.org/api/errors.html#errors_class_error "Error"
[HCCrawler]: #class-hccrawler "HCCrawler"
8 changes: 4 additions & 4 deletions lib/hccrawler.js
@@ -7,8 +7,6 @@ const {
map,
each,
includes,
- some,
- endsWith,
isString,
isArray,
} = require('lodash');
@@ -19,6 +17,7 @@ const devices = require('puppeteer/DeviceDescriptors');
const {
delay,
generateKey,
+ checkDomainMatch,
getRobotsUrl,
getSitemapUrls,
tracePublicAPI,
@@ -452,8 +451,9 @@ class HCCrawler extends EventEmitter {
*/
_checkAllowedDomains(options) {
const { hostname } = parse(options.url);
- if (!options.allowedDomains) return true;
- return some(options.allowedDomains, domain => endsWith(hostname, domain));
+ if (options.deniedDomains && checkDomainMatch(options.deniedDomains, hostname)) return false;
+ if (options.allowedDomains && !checkDomainMatch(options.allowedDomains, hostname)) return false;
+ return true;
}
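Note the precedence the new body establishes: `deniedDomains` is checked first, so a hostname matching both lists is skipped. A standalone re-statement of that logic (a sketch for illustration, not the library's export):

```js
const { some, isRegExp } = require('lodash');

// Mirrors Helper.checkDomainMatch below: exact string match or RegExp test.
const matches = (domains, hostname) =>
  some(domains, domain => (isRegExp(domain) ? domain.test(hostname) : domain === hostname));

function isAllowed(hostname, { allowedDomains, deniedDomains }) {
  if (deniedDomains && matches(deniedDomains, hostname)) return false; // denied always wins
  if (allowedDomains && !matches(allowedDomains, hostname)) return false;
  return true; // neither list set, or allowed and not denied
}

// A hostname present in both lists is denied:
console.log(isAllowed('example.com', {
  allowedDomains: ['example.com'],
  deniedDomains: ['example.com'],
})); // false
```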

/**
16 changes: 15 additions & 1 deletion lib/helper.js
@@ -3,12 +3,14 @@ const { parse, resolve, format } = require('url');
const crypto = require('crypto');
const {
pick,
- isPlainObject,
trim,
startsWith,
+ some,
includes,
+ isPlainObject,
isString,
isFunction,
+ isRegExp,
} = require('lodash');
const debug = require('debug');

@@ -121,6 +123,18 @@ class Helper {
return first;
}

+ /**
+ * @param {!Array<(!string|!RegExp)>} domains
+ * @param {!string} hostname
+ * @return {!boolean}
+ */
+ static checkDomainMatch(domains, hostname) {
+ return some(domains, domain => {
+ if (isRegExp(domain)) return domain.test(hostname);
+ return domain === hostname;
+ });
+ }
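This helper also changes matching semantics for plain strings: the removed `endsWith` check treated list entries as suffixes, whereas `checkDomainMatch` requires exact equality, so subdomains now need a regular expression. A quick sketch of the difference (hostnames are illustrative):

```js
const { endsWith, isRegExp, some } = require('lodash');

const hostname = 'www.example.com';

// Old behavior: 'example.com' also matched 'www.example.com' as a suffix.
console.log(endsWith(hostname, 'example.com')); // true

// New behavior: a string entry must equal the hostname exactly...
console.log(some(['example.com'], domain => domain === hostname)); // false

// ...so subdomains are matched with a RegExp entry instead.
console.log(some([/(^|\.)example\.com$/], domain => isRegExp(domain) && domain.test(hostname))); // true
```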

/**
* @param {!string} sitemapXml
* @return {!Array<!string>}
37 changes: 35 additions & 2 deletions test/hccrawler.test.js
@@ -191,7 +191,7 @@ describe('HCCrawler', () => {
});
});

- it('crawls when the requested domain is allowed', () => {
+ it('crawls when the requested domain exactly matches allowed domain', () => {
let requestskipped = 0;
crawler.on('requestskipped', () => { requestskipped += 1; });
crawler.queue({ url: INDEX_PAGE, allowedDomains: ['127.0.0.1'] });
@@ -202,7 +202,18 @@
});
});

- it('skips crawling when the requested domain is not allowed', () => {
+ it('crawls when the requested domain matches allowed domain by regular expression', () => {
+ let requestskipped = 0;
+ crawler.on('requestskipped', () => { requestskipped += 1; });
+ crawler.queue({ url: INDEX_PAGE, allowedDomains: [/\d+\.\d+\.\d+\.\d+/] });
+ return crawler.onIdle()
+ .then(() => {
+ assert.equal(requestskipped, 0);
+ assert.equal(onSuccess.callCount, 1);
+ });
+ });
+
+ it('skips crawling when the requested domain does not match allowed domain', () => {
let requestskipped = 0;
crawler.on('requestskipped', () => { requestskipped += 1; });
crawler.queue({ url: INDEX_PAGE, allowedDomains: ['0.0.0.0'] });
@@ -213,6 +224,28 @@
});
});

+ it('skips crawling when the requested domain exactly matches denied domain', () => {
+ let requestskipped = 0;
+ crawler.on('requestskipped', () => { requestskipped += 1; });
+ crawler.queue({ url: INDEX_PAGE, deniedDomains: ['127.0.0.1'] });
+ return crawler.onIdle()
+ .then(() => {
+ assert.equal(requestskipped, 1);
+ assert.equal(onSuccess.callCount, 0);
+ });
+ });
+
+ it('skips crawling when the requested domain matches denied domain by regular expression', () => {
+ let requestskipped = 0;
+ crawler.on('requestskipped', () => { requestskipped += 1; });
+ crawler.queue({ url: INDEX_PAGE, deniedDomains: [/\d+\.\d+\.\d+\.\d+/] });
+ return crawler.onIdle()
+ .then(() => {
+ assert.equal(requestskipped, 1);
+ assert.equal(onSuccess.callCount, 0);
+ });
+ });

it('follows links when maxDepth is set', () => {
let maxdepthreached = 0;
server.setContent('/1.html', `go to <a href="${PREFIX}/2.html">/2.html</a>`);
