forked from tazeek/BullyDetect
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsoundex_dict.py
59 lines (39 loc) · 1.32 KB
/
soundex_dict.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
from contractions import soundex_dictionary
import pickle
def getSoundex(word):
    """Return a 4-character Soundex-style code for ``word``.

    The word is upper-cased and its first character is kept verbatim. Every
    later character except ``H`` and ``W`` is looked up in
    ``soundex_dictionary``; its code is appended unless it equals the last
    character of the code built so far. ``"."`` characters are then
    stripped and the result is truncated, or right-padded with ``"0"``, to
    exactly four characters.

    Args:
        word: The string to encode. An empty string yields ``"0000"``
            rather than raising ``IndexError``.

    Returns:
        The 4-character code as a ``str``.

    Raises:
        KeyError: If a character after the first (other than ``H``/``W``)
            has no entry in ``soundex_dictionary`` -- assuming that object
            is a plain dict; it is defined in ``contractions``, not here.
    """
    # Uppercase the word
    word = word.upper()

    # Keep the first character as-is. Slicing (instead of word[0]) lets an
    # empty input fall through to the zero padding below.
    soundex = word[:1]

    # H and W are dropped before encoding, so they never sit between two
    # letters that share a code.
    skipped_letters = "HW"
    remaining = "".join(
        letter for letter in word[1:] if letter not in skipped_letters
    )

    # NOTE(review): the very first comparison is against the leading letter
    # itself, not its code, so a second letter that shares the first
    # letter's code is still emitted (classic Soundex would drop it).
    # Left unchanged so previously generated codes stay stable -- confirm
    # whether this deviation is intended.
    for char in remaining:
        code = soundex_dictionary[char]

        # Collapse runs: only append a code that differs from the last
        # character emitted so far.
        if code != soundex[-1]:
            soundex += code

    # "." entries only served as separators between repeated codes
    # (presumably the vowels -- confirm in contractions); drop them now.
    soundex = soundex.replace(".", "")

    # Force a fixed width: cut to four characters or pad with zeros.
    return soundex[:4].ljust(4, "0")
# Load K-Means model here
# We use this over W2V model, due to memory constraints and loading time
FILE = "K-Means Models/full_500C.pk"

# NOTE: unpickling can execute arbitrary code -- only load model files that
# come from a trusted source.
# The context manager guarantees the handle is closed after loading.
with open(FILE, "rb") as model_file:
    word_centroid_map = pickle.load(model_file)

# Create dictionary mapping each word to its Soundex code
soundex_dict = {}

# Loop one word at a time
# Word is stored in key (the mapped value is not needed here)
counter = 0

for key in word_centroid_map:
    soundex_dict[key] = getSoundex(key)

    # Progress report every 10000 words
    if counter % 10000 == 0:
        print("%i words encoded" % (counter))

    counter += 1

# Save dictionary
FILE = "Word Dictionaries/soundex_dict.pk"

# Closing the handle explicitly ensures the pickle is fully flushed to disk.
with open(FILE, "wb") as output_file:
    pickle.dump(soundex_dict, output_file)