diff --git a/lib/natural/stemmers/stemmer.js b/lib/natural/stemmers/stemmer.js index f1c2d1777..12f4cca59 100644 --- a/lib/natural/stemmers/stemmer.js +++ b/lib/natural/stemmers/stemmer.js @@ -40,11 +40,21 @@ module.exports = function() { stemmer.tokenizeAndStem = function(text, keepStops) { var stemmedTokens = []; - - new Tokenizer().tokenize(text).forEach(function(token) { - if(keepStops || stopwords.words.indexOf(token) == -1) + var lowercaseText = text.toLowerCase(); + var tokens = new Tokenizer().tokenize(lowercaseText); + + if (keepStops) { + tokens.forEach(function(token) { stemmedTokens.push(stemmer.stem(token)); - }); + }); + } + + else { + tokens.forEach(function(token) { + if (stopwords.words.indexOf(token) == -1) + stemmedTokens.push(stemmer.stem(token)); + }); + } return stemmedTokens; }; diff --git a/spec/porter_stemmer_spec.js b/spec/porter_stemmer_spec.js index dd413f0fb..a2c90bdb9 100644 --- a/spec/porter_stemmer_spec.js +++ b/spec/porter_stemmer_spec.js @@ -21,6 +21,7 @@ THE SOFTWARE. */ var stemmer = require('../lib/natural/stemmers/porter_stemmer'); +var stopwords = require('../lib/natural/util/stopwords'); describe('porter_stemmer', function() { it('should categorizeGroups', function() { @@ -179,4 +180,17 @@ describe('porter_stemmer', function() { expect('scoring stinks'.tokenizeAndStem()).toEqual(['score', 'stink']); expect('SCORING STINKS'.tokenizeAndStem()).toEqual(['score', 'stink']); }); + + it('should tokenize and stem ignoring stopwords', function() { + expect('My dog is very fun TO play with And another thing, he is A poodle.'.tokenizeAndStem()).toEqual(['dog', 'fun', 'plai', 'thing', 'poodl']); + }); + + it('should tokenize and stem ignoring all capital stopwords', function() { + var allCapitalStopwords = stopwords.words.join(' ').toUpperCase(); + expect(allCapitalStopwords.tokenizeAndStem()).toEqual([]); + }); + + it('should tokenize and stem including stopwords', function() { + expect('My dog is very fun TO play with And another thing, he is A poodle.'.tokenizeAndStem(true)).toEqual(['my', 'dog', 'is', 'veri', 'fun', 'to', 'plai', 'with', 'and', 'anoth', 'thing', 'he', 'is', 'a', 'poodl']); + }); });