-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path3-gram-TF.py
77 lines (72 loc) · 2.29 KB
/
3-gram-TF.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import re
from collections import *
import os
import pandas as pd
from sklearn import feature_extraction
import sklearn
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
#3-gram
# Extract the opcode sequence from an .asm file
def getOpcodeSequence(filename):
    """Collect the opcode mnemonics found in the ``.text`` lines of a file.

    The file is read as gb18030 text with undecodable bytes ignored. Only
    lines that start with ``.text`` are inspected. On each such line the
    first regex match is used: a run of two-hex-digit tokens followed by a
    lowercase word, and that word is taken as the opcode. The pseudo-opcode
    ``align`` is skipped.

    Side effects: prints ``filename`` and overwrites ``opcode.txt`` in the
    current working directory with the collected opcodes followed by a tab.
    NOTE: the opcodes are written back-to-back with no separator between
    them.

    Args:
        filename: Path of the disassembly (.asm) file to scan.

    Returns:
        A list of opcode strings in the order they appear in the file.
    """
    print(filename)
    # Whitespace, one or more "XX " hex byte tokens, optional padding, then
    # a lowercase word captured as the second group.
    pattern = re.compile(r'\s([a-fA-F0-9]{2}\s)+\s*([a-z]+)')
    opcode_seq = []
    with open(filename, encoding='gb18030', errors='ignore') as asm_file:
        for text_line in asm_file:
            if not text_line.startswith(".text"):
                continue
            matches = pattern.findall(text_line)
            if not matches:
                continue
            # findall yields (last hex token, mnemonic) tuples; only the
            # first match on the line is used.
            mnemonic = matches[0][1]
            if mnemonic == "align":
                continue
            opcode_seq.append(mnemonic)
    # Dump of the most recently processed file; overwritten on every call.
    with open("opcode.txt", 'w') as dump_file:
        dump_file.write("".join(opcode_seq))
        dump_file.write("\t")
    return opcode_seq
#n-gram n=3
def getOpcodeNgram(ops, n=3):
    """Return the term frequency of every opcode n-gram in *ops*.

    Args:
        ops: Sequence of opcode strings in program order.
        n: Window size of each n-gram; defaults to 3.

    Returns:
        A ``Counter`` mapping each n-gram (a tuple of ``n`` consecutive
        opcodes) to its occurrence count divided by the total number of
        n-grams in the sequence. The result is empty when ``ops`` holds
        fewer than ``n`` opcodes.
    """
    # A sequence of length L contains L - n + 1 windows of size n. The
    # previous bound of L - n silently dropped the final n-gram (and produced
    # nothing at all for a sequence of exactly n opcodes).
    opngramlist = [tuple(ops[i:i + n]) for i in range(len(ops) - n + 1)]
    counts = Counter(opngramlist)
    # TF: normalise raw counts by the total number of n-grams. The division
    # only runs when at least one n-gram exists, so the total is never zero.
    values_sum = len(opngramlist)
    return Counter({gram: hits / values_sum for gram, hits in counts.items()})
def main():
    """Build 3-gram term-frequency feature tables for the listed files.

    Reads the file ids from the ``Id`` column of ``subtrainLabels.csv``,
    computes the 3-gram TF profile of each ``<id>.asm`` file under
    ``basepath``, ranks every 3-gram by the sum of its TF values across all
    files, and writes one CSV per feature count (50, 100 and 150). Each CSV
    has an ``Id`` column plus one column per selected 3-gram holding that
    file's TF value (0 when the 3-gram does not occur in the file).
    """
    # Same value as the original literal. The unescaped "\M" was an invalid
    # escape sequence that newer Python versions warn about, so every
    # backslash is now escaped explicitly.
    basepath = "D:\\MyDownloads\\train\\subtrain\\"
    subtrain = pd.read_csv('subtrainLabels.csv')

    # The per-file profiles do not depend on how many features are selected,
    # so they are computed once rather than once per output file.
    map3gram = {}
    for count, sid in enumerate(subtrain.Id, start=1):
        print("counting the 3-gram of the {0} file...".format(str(count)))
        filename = basepath + sid + ".asm"
        map3gram[sid] = getOpcodeNgram(getOpcodeSequence(filename))

    # Corpus-wide score of each 3-gram: the sum of its per-file TF values
    # (not a document frequency).
    cc = Counter()
    for profile in map3gram.values():
        cc += profile

    for i in range(1, 4):
        # The output name advertises 50*i features, so select that many.
        # Previously the top 50 were always used and the three files were
        # identical apart from their names.
        feature_count = 50 * i
        selectedfeatures = dict(cc.most_common(feature_count))
        for gram, score in selectedfeatures.items():
            print(gram, score)

        dataframelist = []
        for fid, op3gram in map3gram.items():
            standard = {"Id": fid}
            for feature in selectedfeatures:
                standard[feature] = op3gram.get(feature, 0)
            dataframelist.append(standard)

        df = pd.DataFrame(dataframelist)
        df.to_csv("TF_{0}feature.csv".format(feature_count), index=False)


if __name__ == "__main__":
    main()