# plot_n-gram_TFIDF.py
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn import model_selection
from sklearn.metrics import confusion_matrix
import pandas as pd
import numpy as np
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
import matplotlib.pyplot as plt
import math
def avg(items):
    """Return the arithmetic mean of ``items`` as a float.

    ``items`` must be a non-empty sized iterable of numbers (a list or a
    1-D NumPy array both work); an empty input raises ``ZeroDivisionError``.
    """
    total = sum(items)
    return float(total) / len(items)


# Result accumulators: the evaluation loop below appends one value to each
# list per feature file it processes.
TPR, FPR, Score, Acc, Gmean = [], [], [], [], []
# Label table; it is merged with each feature file on the shared 'Id' column.
subtrainLabel = pd.read_csv('subtrainLabels.csv')

# Train and evaluate one random forest per feature file
# (TFIDF_top50_1feature.csv ... TFIDF_top50_5feature.csv).
for i in range(1, 6):
    subtrainfeature = pd.read_csv("TFIDF_top50_{0}feature.csv".format(i))
    subtrain = pd.merge(subtrainLabel, subtrainfeature, on='Id')
    labels = subtrain.Class
    # Everything except the target and the identifier is used as a feature.
    subtrain.drop(["Class", "Id"], axis=1, inplace=True)
    subtrain = subtrain.values

    # Develop test set and training set
    # NOTE(review): neither the split nor the forest is given a random_state,
    # so the reported numbers vary from run to run.
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        subtrain, labels, test_size=0.5)
    srf = RF(n_estimators=10, n_jobs=-1)
    srf.fit(X_train, y_train)
    score = srf.score(X_test, y_test)
    print(score)
    Score.append(score)

    y_pred = srf.predict(X_test)
    CM = confusion_matrix(y_test, y_pred)

    # One-vs-rest counts for every class, derived from the confusion matrix:
    # column sums are "predicted as class k", row sums are "actually class k".
    FP = CM.sum(axis=0) - np.diag(CM)
    FN = CM.sum(axis=1) - np.diag(CM)
    TP = np.diag(CM)
    TN = CM.sum() - (FP + FN + TP)
    FP = FP.astype(float)
    FN = FN.astype(float)
    TP = TP.astype(float)
    TN = TN.astype(float)

    # Only the rates that are actually reported are computed. The unused
    # precision / NPV / FNR / FDR vectors were dropped: they fed nothing and
    # could emit divide-by-zero warnings for classes that are never predicted.
    # Sensitivity or true positive rate
    tpr = TP / (TP + FN)
    # Specificity or true negative rate
    tnr = TN / (TN + FP)
    # Fall out or false positive rate
    fpr = FP / (FP + TN)
    # Overall accuracy
    acc = (TP + TN) / (TP + FP + FN + TN)

    # Store the unweighted mean over classes of each per-class rate.
    TPR.append(avg(tpr))
    FPR.append(avg(fpr))
    Acc.append(avg(acc))
    # G-mean: geometric mean of the class-averaged specificity and sensitivity.
    gmean = math.sqrt(avg(tnr) * avg(tpr))
    Gmean.append(gmean)

# Summary of every metric series once all feature files have been evaluated.
print(TPR, FPR, Score, Acc, Gmean)
# Metric series and their display names, in matching order.
array = [TPR, FPR, Acc, Gmean]
name = ['TPR', 'FPR', 'Accuracy', 'Gmean']
# X positions shared by every figure: one per feature file evaluated above.
x = [1, 2, 3, 4, 5]
import matplotlib.pyplot as plt

# Draw one figure per metric.
for i in range(4):
    plt.figure(i)
    plt.plot(x, array[i], label='TFIDF')
    plt.xlabel('Opcode n-gram size')
    plt.ylabel(name[i])
    plt.tick_params(axis='both', which='major', labelsize=14)
    # Annotate every data point with "<metric>_<value>".
    # The previous code zipped over name[i] itself, which iterates the
    # *characters* of the metric name: labels came out as 'T_..', 'P_..',
    # 'R_..' and three-letter names annotated only the first three points.
    for a, b in zip(x, array[i]):
        str_label = name[i] + '_' + str(b)
        plt.text(a, b, str_label, ha='center', va='bottom', fontsize=5)
    plt.legend(loc='best')

# Display all figures together once they have been built.
plt.show()