-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathSBD.py
101 lines (89 loc) · 4.38 KB
/
SBD.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
#Andrew Gasiorowski
import sys, re
from sklearn import tree
from sklearn.feature_extraction import DictVectorizer
#opens a file and returns its content as a list of whitespace-split lines (one token list per line) plus the number of lines in the file
def return_text_list(fileName):
    """Read *fileName* and split every line on whitespace.

    Returns a 2-tuple: a list holding one list of tokens per line (a blank
    line contributes an empty list) and the number of lines that were read.
    """
    with open(fileName) as handle:
        rows = [raw_line.split() for raw_line in handle]
    return rows, len(rows)
#takes the list of split lines and the line count; returns a list of feature dictionaries, a list of class labels, and a list of (word, label) pairs
def extract_features(text_list, num):
    """Turn labelled rows into feature dicts, labels and (word, label) pairs.

    Every row is read as ``row[1]`` (the word) and ``row[2]`` (its label).
    Rows whose label starts with ``'T'`` are skipped; every other row among
    the first ``num - 1`` becomes one instance, described by the word itself
    and by the words immediately to its left and right.

    Args:
        text_list: sequence of rows; only columns 1 and 2 of a row are read.
        num: number of rows to consider. The last of them is never turned
            into an instance because it has no right-hand neighbour.

    Returns:
        A 3-tuple ``(feature_list, class_list, output_list)`` of equal
        length: one feature dict per instance, the matching labels, and the
        matching ``(word, label)`` pairs.
    """
    feature_list = []
    class_list = []
    output_list = []
    #Instances of form like 'J.B.' keep their inner period: only the final
    #character of the left-hand word is stripped.
    for line in range(num - 1):
        lhs_word = text_list[line][1]
        label = text_list[line][2]
        if label[0] == 'T':
            continue
        rhs_word = text_list[line + 1][1]
        #The first row has no left neighbour. Indexing with line-1 == -1 would
        #silently wrap around to the last row of the file, so treat the
        #missing neighbour as an empty word instead.
        prev_word = text_list[line - 1][1] if line > 0 else ''
        features = {
            #left-hand word with its final character (the period) removed
            'lhs_word': lhs_word[:-1],
            #right-hand word, punctuation left untouched
            'rhs_word': rhs_word,
            #1 if the stripped left-hand word is shorter than 3 characters
            'lhs_lt_3': 1 if len(lhs_word[:-1]) < 3 else 0,
            #1 if the left-hand word starts with an uppercase character
            'lhs_uppercase': 1 if lhs_word[0].isupper() else 0,
            #1 if the right-hand word starts with an uppercase character
            'rhs_uppercase': 1 if rhs_word[0].isupper() else 0,
            #1 if the right-hand word ends with "." (the key says "upper" but
            #the value is a period test; the key is kept unchanged so the
            #feature names stay the same)
            'rhs_rhs_upper': 1 if rhs_word.endswith('.') else 0,
            #1 if the right-hand word minus its final character is shorter
            #than 3 characters
            'rhs_len': 1 if len(rhs_word[:-1]) < 3 else 0,
            #1 if the word before the left-hand word ends with "."
            'lhslhs_period': 1 if prev_word.endswith('.') else 0,
        }
        feature_list.append(features)
        class_list.append(label)
        output_list.append((lhs_word, label))
    return feature_list, class_list, output_list
#takes a list of feature dictionaries and a class list as input; returns a trained decision tree classifier and the fitted vectorizer
def train_classifier(x, y):
    """Fit a DictVectorizer on *x* and a decision tree on the encoded result.

    *x* is passed to ``DictVectorizer.fit_transform`` and *y* is used as the
    target of ``DecisionTreeClassifier.fit``. Returns the pair
    ``(fitted classifier, fitted vectorizer)``; the vectorizer is returned so
    later inputs can be encoded with the same feature mapping.
    """
    dict_encoder = DictVectorizer()
    encoded = dict_encoder.fit_transform(x)
    #fit() hands back the estimator it was called on
    model = tree.DecisionTreeClassifier().fit(encoded, y)
    return model, dict_encoder
#takes a set of input features that match the requirements of the classifier argument
#returns a list of predictions on the input
def test_classifier(x, classifier, vectorizer):
    """Encode *x* with *vectorizer* and return *classifier*'s predictions.

    Only ``vectorizer.transform`` is called (no re-fitting), and its result
    is handed straight to ``classifier.predict``.
    """
    return classifier.predict(vectorizer.transform(x))
#takes a set of predictions and a set of answers and computes accuracy
def compute_accuracy(y_actual, y_predict):
    """Return the fraction of positions where *y_predict* matches *y_actual*.

    Args:
        y_actual: sequence of expected labels; its length is the denominator.
        y_predict: indexable sequence of predicted labels, read at the same
            positions as *y_actual*.

    Returns:
        ``matches / len(y_actual)`` as a float, or ``0.0`` when *y_actual*
        is empty.

    Raises:
        IndexError: if *y_predict* is shorter than *y_actual*.
    """
    num_elements = len(y_actual)
    if num_elements == 0:
        #nothing to score: report 0.0 instead of dividing by zero
        return 0.0
    correct = sum(
        1 for i, actual in enumerate(y_actual) if actual == y_predict[i]
    )
    return correct / num_elements
#creates output file
def create_output_file(testing_data, prediction):
    """Write one fixed-width line per instance to ``SBD.test.out``.

    The file is created (or overwritten) in the current working directory.
    Each line holds ``row[0]`` padded to 15 characters, then ``row[1]`` and
    the prediction at the same position, each padded to 5 characters.

    Args:
        testing_data: sequence of rows; elements 0 and 1 of each row are
            written.
        prediction: indexable sequence read at the same positions as
            *testing_data*.

    Raises:
        IndexError: if *prediction* is shorter than *testing_data*; the lines
            written before the failure are kept in the file.
    """
    template = "{0:15}{1:5}{2:5}"
    #the context manager closes (and flushes) the file even when a row fails
    #part-way through, instead of leaking the open handle
    with open("SBD.test.out", "w") as f:
        for i, row in enumerate(testing_data):
            f.write(template.format(row[0], row[1], prediction[i]) + '\n')
#This block of code grabs the file names from the command line:
#    <script> <training_file> <testing_file>
#The arguments are used exactly as given. Rebuilding them from str(sys.argv)
#and deleting every character except letters and '.' corrupted any path
#containing digits, '_', '-', a directory separator or a comma.
if len(sys.argv) < 3:
    sys.exit("usage: " + sys.argv[0] + " <training_file> <testing_file>")
training_file = sys.argv[1]
testing_file = sys.argv[2]
#this block of code preprocesses training data and trains classifier
training_data, num_ele_train = return_text_list(training_file)
x_train, y_train, train_output_list = extract_features(training_data, num_ele_train)
decision_tree_classifier, vectorizer = train_classifier(x_train, y_train)
#this block of code preprocesses testing data and tests classifier
testing_data, num_ele_test = return_text_list(testing_file)
x_test, y_test, test_output_list = extract_features(testing_data, num_ele_test)
y_predict = test_classifier(x_test, decision_tree_classifier, vectorizer)
#accuracy is printed rounded to four decimal places
acc = compute_accuracy(y_test, y_predict)
print(str(round(acc, 4)))
#this outputs the results to a file
create_output_file(test_output_list, y_predict)