-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathtext_age_prediction.py
46 lines (37 loc) · 1.59 KB
/
text_age_prediction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MaxAbsScaler
from sklearn.metrics import f1_score, accuracy_score
from sklearn.pipeline import Pipeline
import pickle
import os
import pandas as pd
def get_pipeline():
    """Build the unfitted text classification pipeline.

    The steps run in this order: TF-IDF vectorisation ('tfidf'),
    max-abs scaling ('scaler'), and a multinomial naive Bayes
    classifier ('nb'). Every estimator uses its default parameters.

    Returns:
        A new ``Pipeline`` instance that has not been fitted yet.
    """
    steps = [
        ('tfidf', TfidfVectorizer()),
        ('scaler', MaxAbsScaler()),
        ('nb', MultinomialNB()),
    ]
    return Pipeline(steps=steps)
# Single source of truth for where the fitted pipeline is cached on disk.
MODEL_PATH = 'pretrained-models/user-age/text_age_pipeline.pkl'

# Load the dataset: the 'text' column is the model input, 'age' the target.
data = pd.read_csv("data/user-age-dataset.csv")
X, y = data['text'], data['age']
# A fixed random_state makes the 70/30 split reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=0, train_size=0.7)
print("Loaded and split data")

if not os.path.exists(MODEL_PATH):
    # No cached model: train a fresh pipeline and persist it.
    pipeline = get_pipeline()
    pipeline.fit(x_train, y_train)
    # Make sure the target directory exists so the dump cannot fail with
    # FileNotFoundError after the training has already been done.
    os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
    with open(MODEL_PATH, 'wb') as model_file:
        pickle.dump(pipeline, model_file)
    print("Training completed")
else:
    # NOTE: pickle.load can execute arbitrary code; only load model files
    # that come from a trusted source.
    with open(MODEL_PATH, 'rb') as model_file:
        pipeline = pickle.load(model_file)
    print("Pipeline loaded from pickle file")

# Predict on both splits so training and testing scores can be compared.
y_train_pred = pipeline.predict(x_train)
y_pred = pipeline.predict(x_test)

# Macro-averaged F1 weights every class equally regardless of its frequency.
train_f1 = f1_score(y_train, y_train_pred, average='macro')
train_accuracy = accuracy_score(y_train, y_train_pred)
test_f1 = f1_score(y_test, y_pred, average='macro')
test_accuracy = accuracy_score(y_test, y_pred)

print("Training F1:", round(train_f1, 2))
print("Training accuracy:", round(train_accuracy, 2))
print("Testing F1:", round(test_f1, 2))
print("Testing accuracy:", round(test_accuracy, 2))