-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlabel_malware.py
69 lines (57 loc) · 2.47 KB
/
label_malware.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import multiprocessing
import tqdm
import os
import json
import make_entropy_vis
import pandas as pd
import numpy as np
# Short labels for the sigcheck 'verified' status strings that
# parse_VT_file looks up in this table.
sigresults = {
    "Signed": "Valid",
    (
        "A required certificate is not within its validity period when "
        "verifying against the current system clock or the timestamp in "
        "the signed file."
    ): "Expired",
    "A certificate was explicitly revoked by its issuer.": "Revoked",
}
class ParseFailure(Exception):
    """Error type for a report that could not be parsed.

    NOTE(review): nothing in the visible code raises or catches this.
    """
def parse_VT_file(filename):
    """Turn one JSON scan report into a labelled row.

    The report is read from *filename* ("VT" presumably stands for
    VirusTotal -- confirm). The sample it describes is expected at the same
    path with the extension removed; when that file exists its entropy
    image is computed with ``make_entropy_vis.worker``.

    Args:
        filename: Path to the ``.json`` report.

    Returns:
        A list ``[sha256, name, signature, signers, detected, detectedname,
        link, image, exists]``:

        * ``signature`` is the short label from ``sigresults`` for the
          report's sigcheck status, the raw status string when it has no
          entry in ``sigresults``, or ``'unsigned'`` (with ``signers`` set
          to ``'none'``) when the sigcheck status or signers are missing.
        * ``detected`` / ``detectedname`` come from the Microsoft scan, or
          from the BitDefender scan when Microsoft is missing or reports
          ``False``; ``False`` / ``None`` when neither is usable.
        * ``image`` is a nested list: the ``entropy_norm`` array of the
          sample, or an 8x8 grid of zeros when the sample file is missing.
        * ``exists`` tells whether the sample file was found.

    Raises:
        KeyError: If ``submission.filename``, ``sha256`` or ``permalink``
            is missing from the report.
        OSError: If *filename* cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # JSON text is UTF-8; do not depend on the platform default encoding.
    with open(filename, encoding="utf-8") as json_file:
        report = json.load(json_file)

    name = report['submission']['filename']
    sha256 = report['sha256']

    # Prefer Microsoft's verdict; fall back to BitDefender when Microsoft
    # is absent or did not flag the sample.
    detected, detectedname = False, None
    for engine in ('Microsoft', 'BitDefender'):
        try:
            scan = report['scans'][engine]
            detected, detectedname = scan['detected'], scan['result']
        except KeyError:
            detected, detectedname = False, None
        if detected is not False:
            break

    try:
        sigcheck = report['additional_info']['sigcheck']
        sigstatus = sigcheck['verified']
        signers = sigcheck['signers']
    except KeyError:
        signature = 'unsigned'
        signers = 'none'
    else:
        # A status without a short label is kept verbatim. A plain
        # ``sigresults[sigstatus]`` raises KeyError for such a status, which
        # must not be mistaken for "no signature information".
        signature = sigresults.get(sigstatus, sigstatus)

    link = report['permalink']

    # The sample lives next to the report, minus the ".json" extension.
    sample_path = os.path.splitext(filename)[0]
    exists = os.path.exists(sample_path)
    if exists:
        image = make_entropy_vis.worker(sample_path)['entropy_norm'].tolist()
    else:
        # Placeholder image so every row has the same column layout.
        image = np.zeros((8, 8)).tolist()

    return [sha256, name, signature, signers, detected, detectedname, link, image, exists]
if __name__ == '__main__':
    # Collect every JSON report in the input directory.
    path_to_json = './malware'
    json_files = [
        os.path.join(path_to_json, entry)
        for entry in os.listdir(path_to_json)
        if entry.endswith('.json')
    ]

    # Parse the reports in parallel. Row order is arbitrary because of
    # imap_unordered. The context manager shuts the worker processes down
    # once all results are collected instead of leaving the pool open.
    with multiprocessing.Pool(processes=8) as pool:
        results = list(tqdm.tqdm(
            pool.imap_unordered(parse_VT_file, json_files),
            total=len(json_files),
        ))

    mal_data = pd.DataFrame(
        results,
        columns=['sha256', 'filename', 'signed', 'signers', 'detected',
                 'detectedname', 'link', 'image', 'exists'],
    )
    # The class label is the part of the detection name before the first ':'.
    mal_data['class'] = mal_data['detectedname'].str.split(':').str[0]

    # Make sure the output directory exists so the write cannot fail after
    # all the parsing work has been done.
    os.makedirs('./data', exist_ok=True)
    # Keep only the rows whose sample file was found; the helper column
    # 'exists' is not written out.
    mal_data[mal_data['exists']].drop('exists', axis=1).to_csv("./data/labelledVTEntropy2.csv")