Skip to content

Commit

Permalink
Merge pull request NaturalNode#235 from mamaral/tokenizeAndStem-bug
Browse files Browse the repository at this point in the history
Tokenize and stem bug
  • Loading branch information
kkoch986 committed Apr 30, 2015
2 parents fd0340c + e99431b commit 83dc200
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 4 deletions.
18 changes: 14 additions & 4 deletions lib/natural/stemmers/stemmer.js
Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,21 @@ module.exports = function() {

/**
 * Tokenizes `text` and returns the stem of every token that is kept.
 *
 * The text is lower-cased before tokenizing. Stopword filtering uses an
 * exact indexOf() match against stopwords.words, so without lower-casing an
 * input such as 'TO' or 'And' would not be recognised as a stopword
 * (assumes the entries of stopwords.words are lower-case -- confirm against
 * the stopwords module).
 *
 * @param {string} text - Text to tokenize and stem.
 * @param {boolean} [keepStops] - When truthy, stopwords are stemmed and kept;
 *     otherwise tokens found in stopwords.words are dropped.
 * @returns {Array} The stemmed tokens, in the order the tokenizer produced them.
 */
stemmer.tokenizeAndStem = function(text, keepStops) {
    var stemmedTokens = [];
    var tokens = new Tokenizer().tokenize(text.toLowerCase());

    tokens.forEach(function(token) {
        // A single loop covers both modes: keepStops short-circuits the
        // stopword lookup, otherwise only non-stopwords are stemmed.
        if (keepStops || stopwords.words.indexOf(token) === -1) {
            stemmedTokens.push(stemmer.stem(token));
        }
    });

    return stemmedTokens;
};
Expand Down
14 changes: 14 additions & 0 deletions spec/porter_stemmer_spec.js
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ THE SOFTWARE.
*/

var stemmer = require('../lib/natural/stemmers/porter_stemmer');
var stopwords = require('../lib/natural/util/stopwords');

describe('porter_stemmer', function() {
it('should categorizeGroups', function() {
Expand Down Expand Up @@ -179,4 +180,17 @@ describe('porter_stemmer', function() {
expect('scoring stinks'.tokenizeAndStem()).toEqual(['score', 'stink']);
expect('SCORING STINKS'.tokenizeAndStem()).toEqual(['score', 'stink']);
});

// Default call (no keepStops argument): mixed-case stopwords such as
// 'TO', 'And' and 'A' must not appear in the stemmed output.
it('should tokenize and stem ignoring stopwords', function() {
    var sentence = 'My dog is very fun TO play with And another thing, he is A poodle.';
    var stemsWithoutStopwords = ['dog', 'fun', 'plai', 'thing', 'poodl'];

    expect(sentence.tokenizeAndStem()).toEqual(stemsWithoutStopwords);
});

// Every entry of stopwords.words, upper-cased and space-separated, should
// be filtered out, leaving no stems at all.
it('should tokenize and stem ignoring all capital stopwords', function() {
    var joinedStopwords = stopwords.words.join(' ');
    var shoutedStopwords = joinedStopwords.toUpperCase();

    expect(shoutedStopwords.tokenizeAndStem()).toEqual([]);
});

// keepStops = true: stopwords are retained and stemmed like any other
// token, and the output is lower-cased.
it('should tokenize and stem including stopwords', function() {
    var sentence = 'My dog is very fun TO play with And another thing, he is A poodle.';
    var stemsWithStopwords = [
        'my', 'dog', 'is', 'veri', 'fun', 'to', 'plai', 'with',
        'and', 'anoth', 'thing', 'he', 'is', 'a', 'poodl'
    ];

    expect(sentence.tokenizeAndStem(true)).toEqual(stemsWithStopwords);
});
});

0 comments on commit 83dc200

Please sign in to comment.