-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathpoor_cluster_logic.py
60 lines (57 loc) · 1.65 KB
/
poor_cluster_logic.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
def cluster_sort(l):
    """Merge lists that share elements into disjoint sets.

    Lists are connected when they have at least one element in common;
    connection is transitive, so ``[a, b]``, ``[b, c]`` and ``[c, d]`` all
    end up in the same group.

    Source: https://stackoverflow.com/a/4842897

    Args:
        l: a sized sequence of iterables of hashable items.

    Returns:
        A list of sets, one per group, ordered by the position of the
        first input list that seeded each group.
    """
    merged = []
    pending = l
    while len(pending) > 0:
        seed, *others = pending
        group = set(seed)
        # Sweep the remaining lists repeatedly: absorbing one list can make
        # a previously unrelated list overlap. Stop once a full sweep no
        # longer grows the group.
        previous_size = -1
        while len(group) > previous_size:
            previous_size = len(group)
            untouched = []
            for candidate in others:
                members = set(candidate)
                if group & members:
                    group |= members
                else:
                    untouched.append(candidate)
            others = untouched
        merged.append(group)
        # Whatever never overlapped this group seeds the following groups.
        pending = others
    return merged
def cluster(export):
    """Group files whose hash values overlap, directly or transitively.

    Args:
        export: iterable of ``(file, attr)`` pairs. ``attr`` is a mapping
            that must provide the keys ``hash_import_all``,
            ``hash_import_no_main``, ``hash_import_main`` and
            ``hash_file_path``; falsy values are ignored. ``file`` must be
            hashable when the entry has at least one hash value.

    Returns:
        A dict mapping each cluster index (``0, 1, ...``) to the set of
        files in that cluster, plus the key ``"nope"`` mapping to the list
        of files that had no hash value at all.

    Raises:
        KeyError: if an ``attr`` mapping lacks one of the hash keys.
    """
    hash_keys = (
        "hash_import_all",
        "hash_import_no_main",
        "hash_import_main",
        "hash_file_path",
    )
    hashes = []
    # hash value -> set of files carrying it. Keyed on hash values only, so
    # a filename that happens to equal some hash can never cause a match.
    files_by_hash = {}
    nope = []
    for file, attr in export:
        go_hash = [attr[key] for key in hash_keys if attr[key]]
        # skip files that don't have hash values
        if not go_hash:
            nope.append(file)
            continue
        hashes.append(go_hash)
        for value in go_hash:
            files_by_hash.setdefault(value, set()).add(file)

    clusters = {}
    for index, match in enumerate(cluster_sort(hashes)):
        members = set()
        # Every hash in a merged group came from at least one indexed file.
        for value in match:
            members |= files_by_hash[value]
        clusters[index] = members
    clusters["nope"] = nope
    return clusters