-
Notifications
You must be signed in to change notification settings - Fork 73
/
Copy pathnlp.py
196 lines (174 loc) · 6.13 KB
/
nlp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
"""
@author: Aayush Agrawal
@Purpose - Re-usable code in Python 3 for Natural Language processing task
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import re
import string
import nltk
from nltk.util import ngrams
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
# Module-level cache of English stopwords used by remove_stopwords().
# NOTE: requires nltk's 'stopwords' corpus to be available locally
# (typically via nltk.download('stopwords')) — confirm in deployment env.
eng_stop = set(stopwords.words('english'))
def word_grams(text, min=1, max=4):
    '''
    Build every N-gram of *text* for each N between min and max (inclusive).
    Required Input -
        - text = sequence of tokens to build N-grams from
        - min = smallest N to generate
        - max = largest N to generate
    Expected Output -
        - list of N-gram strings, with tokens joined by single spaces
    '''
    grams = []
    for size in range(min, max + 1):
        grams.extend(' '.join(str(tok) for tok in gram)
                     for gram in ngrams(text, size))
    return grams
def make_worlcloud(df, column, bg_color='white', w=1200, h=1000, font_size_max=50, n_words=40, g_min=1, g_max=1):
    '''
    Render and display a word cloud from the text in one column of a DataFrame.
    Required Input -
        - df = Pandas DataFrame
        - column = name of column containing text
        - bg_color = background color of the image
        - w = image width in pixels
        - h = image height in pixels
        - font_size_max = maximum font size allowed
        - n_words = maximum number of words displayed
        - g_min = minimum n-gram size
        - g_max = maximum n-gram size
    Expected Output -
        - displays the word cloud image via matplotlib (returns None)
    '''
    # Concatenate the column's text; str.join avoids the quadratic cost of
    # repeated string += in a loop.
    text = ' '.join(row[column] for _, row in df.iterrows())
    tokens = text.strip().split(' ')
    grams = word_grams(tokens, g_min, g_max)
    # NOTE(review): this second pass re-grams the already-built n-grams with
    # N=1..2 and replaces spaces with underscores so multi-word grams render
    # as single tokens. Preserved verbatim from the original implementation —
    # confirm the double n-gram pass is intentional.
    grams = [g.replace(' ', '_') for g in word_grams(grams, 1, 2)]
    corpus = ' '.join(grams)
    wordcloud = WordCloud(background_color=bg_color,
                          width=w,
                          height=h,
                          max_font_size=font_size_max,
                          max_words=n_words).generate(corpus)
    wordcloud.recolor(random_state=1)  # fixed seed -> reproducible coloring
    plt.rcParams['figure.figsize'] = (20.0, 10.0)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()
def get_tokens(text):
    '''
    Split a text string into word tokens.
    Required Input -
        - text - string to be tokenized
    Expected Output -
        - list of tokens produced by nltk's word_tokenize
    '''
    tokens = word_tokenize(text)
    return tokens
def convert_lowercase(text):
    '''
    Convert a text string to lower case.
    Required Input -
        - text - string to be lower-cased
    Expected Output -
        - lower-cased copy of the input string
    '''
    lowered = text.lower()
    return lowered
def remove_punctuations(text):
    '''
    Strip all ASCII punctuation characters from a text string.
    Required Input -
        - text - text string
    Expected Output -
        - text string with punctuation removed
    '''
    # Bug fix: text.translate(None, string.punctuation) is Python 2 API and
    # raises TypeError on Python 3. Python 3 requires a translation table
    # mapping each punctuation character to None (deletion).
    return text.translate(str.maketrans('', '', string.punctuation))
def remove_stopwords(text):
    '''
    Drop English stopwords from a text string.
    Required Input -
        - text - text string
    Expected Output -
        - list of the remaining (non-stopword) whitespace-split tokens
    '''
    kept = []
    for token in text.split():
        if token not in eng_stop:
            kept.append(token)
    return kept
def convert_stemmer(word):
    '''
    Reduce a single word to its stem using the Porter stemmer.
    Required Input -
        - word - word to be stemmed
    Expected Output -
        - stemmed form of the word
    '''
    stemmer = PorterStemmer()
    stemmed = stemmer.stem(word)
    return stemmed
def convert_lemmatizer(word):
    '''
    Reduce a single word to its lemma using the WordNet lemmatizer.
    Required Input -
        - word - word to be lemmatized
    Expected Output -
        - lemmatized form of the word
    '''
    lemmatizer = WordNetLemmatizer()
    lemma = lemmatizer.lemmatize(word)
    return lemma
def create_tf_idf(df, column, train_df = None, test_df = None, n_features = None):
    '''
    Fit a TF-IDF vectorizer on a text column and transform train/test splits.
    Required Input -
        - df = Pandas DataFrame used to fit the vocabulary
        - column = name of column containing text
        - train_df(optional) = Train DataFrame, transformed with the fitted model
        - test_df(optional) = Test DataFrame, transformed with the fitted model
        - n_features(optional) = Maximum number of features kept
    Expected Output -
        - train_tfidf = train tf-idf sparse matrix (the fit result on df when
          no train_df is given)
        - test_tfidf = test tf-idf sparse matrix, or None when no test_df given
        - tfidf_obj = fitted TfidfVectorizer model
    '''
    tfidf_obj = TfidfVectorizer(ngram_range=(1,1), stop_words='english',
                                analyzer='word', max_features = n_features)
    # Bug fix: DataFrame.ix was deprecated in pandas 0.20 and removed in 1.0;
    # .loc is the label-based equivalent for selecting a named column.
    tfidf_text = tfidf_obj.fit_transform(df.loc[:, column].values)
    if train_df is not None:
        train_tfidf = tfidf_obj.transform(train_df.loc[:, column].values)
    else:
        train_tfidf = tfidf_text
    test_tfidf = None
    if test_df is not None:
        test_tfidf = tfidf_obj.transform(test_df.loc[:, column].values)
    return train_tfidf, test_tfidf, tfidf_obj
def create_countvector(df, column, train_df = None, test_df = None,n_features = None):
    '''
    Fit a count vectorizer on a text column and transform train/test splits.
    Required Input -
        - df = Pandas DataFrame used to fit the vocabulary
        - column = name of column containing text
        - train_df(optional) = Train DataFrame, transformed with the fitted model
        - test_df(optional) = Test DataFrame, transformed with the fitted model
        - n_features(optional) = Maximum number of features kept
    Expected Output -
        - train_cvect = train count-vectorized sparse matrix (the fit result on
          df when no train_df is given)
        - test_cvect = test count-vectorized sparse matrix, or None when no
          test_df given
        - cvect_obj = fitted CountVectorizer model
    '''
    cvect_obj = CountVectorizer(ngram_range=(1,1), stop_words='english',
                                analyzer='word', max_features = n_features)
    # Bug fix: DataFrame.ix was deprecated in pandas 0.20 and removed in 1.0;
    # .loc is the label-based equivalent for selecting a named column.
    cvect_text = cvect_obj.fit_transform(df.loc[:, column].values)
    if train_df is not None:
        train_cvect = cvect_obj.transform(train_df.loc[:, column].values)
    else:
        train_cvect = cvect_text
    test_cvect = None
    if test_df is not None:
        test_cvect = cvect_obj.transform(test_df.loc[:, column].values)
    return train_cvect, test_cvect, cvect_obj