import os
import re
import numpy as np
import pandas as pd
import pickle
from nltk.tokenize import SyllableTokenizer
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')
# set random seed for reproducibility
SEED = 42
### INITIALIZATION
# load datasets
datasets_folder_path = os.path.join(os.getcwd(), "datasets")
hongkong_username_df = pd.read_csv(os.path.join(datasets_folder_path, "hongkong_username.csv"))
non_hongkong_username_df = pd.read_csv(os.path.join(datasets_folder_path, "non_hongkong_username.csv"))
# label 0 for non-HK and 1 for HK
non_hongkong_username_df["Hong Kong"] = 0
hongkong_username_df["Hong Kong"] = 1
# merge datasets
df = pd.concat([hongkong_username_df, non_hongkong_username_df], axis=0, ignore_index=True)
# PREPROCESSING PIPELINE
# drop duplicates
original_size = len(df)
df = df.drop_duplicates(subset=["IG Username"])
print(f"{original_size - len(df)} duplicated entries removed. {len(df)} entries retained.")
# strip digits, dots, and underscores so only letters remain
df["IG Username"] = df["IG Username"].astype(str)
df["IG Username"] = df["IG Username"].apply(lambda x: re.sub(r"[\d._]+", "", x))
# drop empty usernames
original_size = len(df)
df = df[df["IG Username"] != ""]
print(f"{original_size - len(df)} empty usernames removed. {len(df)} entries retained.")
# tokenize
tokenizer = SyllableTokenizer()
df["Tokenized IG Username"] = df["IG Username"].apply(tokenizer.tokenize)
# create a unique syllable vocabulary (sorted so the feature order is deterministic across runs)
syllable_vocab = sorted(set(df["Tokenized IG Username"].explode()))
print(f"Vocabulary size: {len(syllable_vocab)} unique syllables.")
# encoding
def manual_binarize(tokenized_username, syllable_vocab):
    binary_vector = [0] * len(syllable_vocab)
    for syllable in tokenized_username:
        if syllable in syllable_vocab:
            index = syllable_vocab.index(syllable)
            binary_vector[index] = 1
    return binary_vector
df["Encoded Username"] = df["Tokenized IG Username"].apply(lambda x: manual_binarize(x, syllable_vocab))
# MODEL TRAINING
# convert to appropriate data types
X = np.array(df["Encoded Username"].tolist())
y = df["Hong Kong"].values
# standardize
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
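# StandardScaler centres each syllable feature to zero mean and unit variance;
# the fitted scaler is saved below so inference can apply the identical transform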
# initialize SVM with the tuned hyperparameters
svm_model = SVC(C=1, probability=True, random_state=SEED)
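# scikit-learn's SVC uses an RBF kernel by default; probability=True enables
# predict_proba via internal cross-validated Platt scaling (slower to fit)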
# train the model
print("Training the SVM model...")
svm_model.fit(X_scaled, y)
print("Training complete.")
# save the trained model, syllable vocabulary, and scaler
output_folder = os.getcwd()
with open(os.path.join(output_folder, "svm_model.pkl"), "wb") as model_file:
pickle.dump(svm_model, model_file)
with open(os.path.join(output_folder, "syllable_vocab.pkl"), "wb") as vocab_file:
pickle.dump(syllable_vocab, vocab_file)
with open(os.path.join(output_folder, "scaler.pkl"), "wb") as scaler_file:
pickle.dump(scaler, scaler_file)
print("Trained model, syllable vocabulary, and scaler saved successfully.")