-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathPreprocess.py
28 lines (23 loc) · 938 Bytes
/
Preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
# -*- coding: utf-8 -*-
"""
Created on Thu Nov 30 14:48:07 2017
@author: Vishnu
"""
from nltk.corpus import stopwords
import string
from nltk.stem.porter import PorterStemmer
# Shared Porter stemmer instance, created once at import time.
stemmer = PorterStemmer()
# English stopwords from the NLTK corpus, held in a set for membership tests.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be
# installed locally -- confirm for the deployment environment.
stopword_set = set(stopwords.words("english"))
# Characters of string.punctuation, held in a set.
exclude = set(string.punctuation)
# =============================================================================
# Module to preprocess queries and data (stemming, stopword removal, and punctuation removal)
# =============================================================================
def preprocess(raw_text):
    """Lowercase, stopword-filter, de-punctuate and stem ``raw_text``.

    Each whitespace-separated token is lowercased and has its leading and
    trailing punctuation trimmed *before* the stopword check, so tokens such
    as "The" or "the," are recognised as stopwords. Stemming is applied last
    and only to the surviving tokens: stemming first turns stopwords such as
    "this" into "thi", which no longer match the stopword list, and leaves
    words with attached punctuation (e.g. "running,") unstemmed.

    Args:
        raw_text: The query or document text to clean.

    Returns:
        A string of stemmed tokens separated by single spaces. Stopwords and
        tokens made up only of punctuation are omitted, so the result is the
        empty string when nothing meaningful remains.
    """
    cleaned_tokens = []
    for token in raw_text.lower().split():
        # Trim only the edges here so that contractions keep their apostrophe
        # and can still match stopword entries that contain one.
        core = token.strip(string.punctuation)
        if not core or core in stopword_set:
            continue
        # Drop punctuation remaining inside the token; this keeps the
        # character-level stripping the function has always performed.
        core = ''.join(ch for ch in core if ch not in exclude)
        cleaned_tokens.append(stemmer.stem(core))
    return ' '.join(cleaned_tokens)