From a29689b282e98ba5ff3f9e872ac82141eedd494b Mon Sep 17 00:00:00 2001 From: Mike Amaral Date: Thu, 30 Apr 2015 11:12:23 -0400 Subject: [PATCH 1/3] Added tests validating bug exists where stopword filtering was incorrectly case-sensitive. --- spec/porter_stemmer_spec.js | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/spec/porter_stemmer_spec.js b/spec/porter_stemmer_spec.js index dd413f0fb..37592a53a 100644 --- a/spec/porter_stemmer_spec.js +++ b/spec/porter_stemmer_spec.js @@ -21,6 +21,7 @@ THE SOFTWARE. */ var stemmer = require('../lib/natural/stemmers/porter_stemmer'); +var stopwords = require('../lib/natural/util/stopwords'); describe('porter_stemmer', function() { it('should categorizeGroups', function() { @@ -179,4 +180,13 @@ describe('porter_stemmer', function() { expect('scoring stinks'.tokenizeAndStem()).toEqual(['score', 'stink']); expect('SCORING STINKS'.tokenizeAndStem()).toEqual(['score', 'stink']); }); + + it('should tokenize and stem ignoring stopwords', function() { + expect('My dog is very fun TO play with And another thing, he is A poodle.'.tokenizeAndStem()).toEqual(['dog', 'fun', 'plai', 'thing', 'poodl']); + }); + + it('should tokenize and stem ignoring all capital stopwords', function() { + var allCapitalStopwords = stopwords.words.join(' ').toUpperCase(); + expect(allCapitalStopwords.tokenizeAndStem()).toEqual([]); + }); }); From 6eedbb16a2da6c72f739a7e96aeb23b970ed93b9 Mon Sep 17 00:00:00 2001 From: Mike Amaral Date: Thu, 30 Apr 2015 11:28:09 -0400 Subject: [PATCH 2/3] Ensure we lowercase all tokens before testing against the stopwords, and reduce the number of conditionals per function call. 
--- lib/natural/stemmers/stemmer.js | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/lib/natural/stemmers/stemmer.js b/lib/natural/stemmers/stemmer.js index f1c2d1777..12f4cca59 100644 --- a/lib/natural/stemmers/stemmer.js +++ b/lib/natural/stemmers/stemmer.js @@ -40,11 +40,21 @@ module.exports = function() { stemmer.tokenizeAndStem = function(text, keepStops) { var stemmedTokens = []; - - new Tokenizer().tokenize(text).forEach(function(token) { - if(keepStops || stopwords.words.indexOf(token) == -1) + var lowercaseText = text.toLowerCase(); + var tokens = new Tokenizer().tokenize(lowercaseText); + + if (keepStops) { + tokens.forEach(function(token) { stemmedTokens.push(stemmer.stem(token)); - }); + }); + } + + else { + tokens.forEach(function(token) { + if (stopwords.words.indexOf(token) == -1) + stemmedTokens.push(stemmer.stem(token)); + }); + } return stemmedTokens; }; From e99431ba9fc5f34b96d2968018f310e027b61875 Mon Sep 17 00:00:00 2001 From: Mike Amaral Date: Thu, 30 Apr 2015 11:29:23 -0400 Subject: [PATCH 3/3] Added test to ensure that, if a true flag is passed in indicating we want to keep stopwords, they are properly stemmed and included in the output. --- spec/porter_stemmer_spec.js | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/spec/porter_stemmer_spec.js b/spec/porter_stemmer_spec.js index 37592a53a..a2c90bdb9 100644 --- a/spec/porter_stemmer_spec.js +++ b/spec/porter_stemmer_spec.js @@ -189,4 +189,8 @@ describe('porter_stemmer', function() { var allCapitalStopwords = stopwords.words.join(' ').toUpperCase(); expect(allCapitalStopwords.tokenizeAndStem()).toEqual([]); }); + + it('should tokenize and stem including stopwords', function() { + expect('My dog is very fun TO play with And another thing, he is A poodle.'.tokenizeAndStem(true)).toEqual(['my', 'dog', 'is', 'veri', 'fun', 'to', 'plai', 'with', 'and', 'anoth', 'thing', 'he', 'is', 'a', 'poodl']); + }); });