-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmst.py
130 lines (109 loc) · 4.1 KB
/
mst.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
"""
Taken from https://github.com/ryan112358/private-pgm/blob/master/mechanisms/mst.py
This is a generalization of the winning mechanism from the
2018 NIST Differential Privacy Synthetic Data Competition.
Unlike the original implementation, this one can work for any discrete dataset,
and does not rely on public provisional data for measurement selection.
"""
import itertools
import networkx as nx
import numpy as np
from disjoint_set import DisjointSet
from mbi import Dataset, Domain, FactoredInference
from scipy import sparse
from scipy.special import logsumexp
def measure(data, cliques, sigma, weights=None):
    """Take noisy measurements of the marginals for the given cliques.

    Each clique's marginal counts are perturbed with Gaussian noise whose
    scale is ``sigma`` divided by that clique's normalized weight, so
    higher-weighted cliques receive proportionally less noise.

    :param data: Dataset to measure (must support .project(...).datavector())
    :param cliques: iterable of attribute tuples to measure
    :param sigma: base Gaussian noise scale
    :param weights: optional per-clique weights; defaults to uniform and is
        always normalized to unit L2 norm
    :return: list of (Q, y, noise_scale, clique) measurement tuples, where
        Q is an identity query matrix over the marginal's cells
    """
    if weights is None:
        weights = np.ones(len(cliques))
    weights = np.array(weights) / np.linalg.norm(weights)

    measurements = []
    for clique, weight in zip(cliques, weights):
        counts = data.project(clique).datavector()
        noise_scale = sigma / weight
        noisy = counts + np.random.normal(loc=0, scale=noise_scale, size=counts.size)
        measurements.append((sparse.eye(counts.size), noisy, noise_scale, clique))
    return measurements
def compress_domain(data, measurements):
    """Shrink each measured column's domain based on its noisy counts.

    Domain elements whose noisy count falls below ``3 * sigma`` are deemed
    statistically indistinguishable from zero and are folded into a single
    "everything else" cell; the measurement is re-expressed over the
    compressed domain accordingly.

    :param data: the Dataset whose domain should be compressed
    :param measurements: list of (Q, y, sigma, proj) single-column measurements
    :return: (compressed_dataset, new_measurements, undo_compress_fn), where
        ``undo_compress_fn`` maps a dataset over the compressed domain back
        to the original domain
    """
    supports = {}
    new_measurements = []
    for Q, y, sigma, proj in measurements:
        col = proj[0]
        # Keep only elements whose noisy count clears the 3-sigma threshold.
        sup = y >= 3 * sigma
        supports[col] = sup
        if supports[col].sum() == y.size:
            # Every element survives: the measurement is unchanged.
            new_measurements.append((Q, y, sigma, proj))
        else:  # need to re-express measurement over the new domain
            # Fold all low-count cells into one aggregate cell, and rescale
            # that cell so its effective noise level matches sigma (the sum
            # of k independent noisy cells has std sigma * sqrt(k)).
            y2 = np.append(y[sup], y[~sup].sum())
            I2 = np.ones(y2.size)
            I2[-1] = 1.0 / np.sqrt(y.size - y2.size + 1.0)
            y2[-1] /= np.sqrt(y.size - y2.size + 1.0)
            I2 = sparse.diags(I2)
            new_measurements.append((I2, y2, sigma, proj))

    # A named closure (rather than a lambda bound to a name) per PEP 8.
    def undo_compress_fn(d):
        """Map a dataset over the compressed domain back to the original."""
        return reverse_data(d, supports)

    return transform_data(data, supports), new_measurements, undo_compress_fn
def exponential_mechanism(q, eps, sensitivity, prng=np.random, monotonic=False):
    """Sample an index with probability proportional to exp(eps * q / (2 * sensitivity)).

    Implements the exponential mechanism for differentially private
    selection: higher-scoring candidates are exponentially more likely
    to be chosen.

    :param q: array-like of quality scores, one per candidate
        (generalized: any sequence convertible by np.asarray is accepted)
    :param eps: privacy parameter epsilon
    :param sensitivity: L1 sensitivity of the score function
    :param prng: random source exposing ``choice(n, p=...)``
    :param monotonic: if True, the score function is monotone in the data,
        allowing the 1/2 privacy factor to be dropped
    :return: the sampled candidate index (int)
    """
    q = np.asarray(q, dtype=float)
    coef = 1.0 if monotonic else 0.5
    scores = coef * eps / sensitivity * q
    # Subtracting logsumexp gives a numerically stable softmax.
    probas = np.exp(scores - logsumexp(scores))
    return prng.choice(q.size, p=probas)
def select(data, rho, measurement_log, cliques=None):
    """Privately select a spanning set of pairwise cliques (MST-style).

    Scores every attribute pair by the L1 error between its true marginal
    and the marginal estimated from ``measurement_log``, then repeatedly
    uses the exponential mechanism to pick high-error edges that join
    previously disconnected components, until the graph is spanning.

    :param data: the Dataset being modeled
    :param rho: zCDP privacy budget for the selection step
    :param measurement_log: prior measurements used to fit the estimate
    :param cliques: optional edges that must be included (default: none);
        a None sentinel replaces the mutable-default anti-pattern while
        preserving the original call-compatible behavior
    :return: list of selected edges (attribute pairs)
    """
    if cliques is None:
        cliques = []
    engine = FactoredInference(data.domain, iters=1000)
    est = engine.estimate(measurement_log)

    # Score each candidate pair by L1 distance between true and estimated marginals.
    weights = {}
    candidates = list(itertools.combinations(data.domain.attrs, 2))
    for a, b in candidates:
        xhat = est.project([a, b]).datavector()
        x = data.project([a, b]).datavector()
        weights[a, b] = np.linalg.norm(x - xhat, 1)

    T = nx.Graph()
    T.add_nodes_from(data.domain.attrs)
    ds = DisjointSet()
    for e in cliques:
        T.add_edge(*e)
        ds.union(*e)

    r = len(list(nx.connected_components(T)))
    if r <= 1:
        # Already spanning: nothing to select, and the epsilon formula
        # below would divide by zero.
        return list(T.edges)
    # Split the rho budget evenly across the r - 1 selections.
    epsilon = np.sqrt(8 * rho / (r - 1))
    for _ in range(r - 1):
        # Only edges joining two different components remain candidates.
        candidates = [e for e in candidates if not ds.connected(*e)]
        wgts = np.array([weights[e] for e in candidates])
        idx = exponential_mechanism(wgts, epsilon, sensitivity=1.0)
        e = candidates[idx]
        T.add_edge(*e)
        ds.union(*e)
    return list(T.edges)
def transform_data(data, supports):
    """Re-encode every column of ``data`` onto its compressed domain.

    Supported values are relabeled with consecutive codes 0..k-1 (in their
    original order); every unsupported value collapses into a single
    catch-all code k, which only exists when something was dropped.

    :param data: the Dataset to transform
    :param supports: dict mapping column name -> boolean support mask
    :return: a new Dataset over the compressed domain
    """
    df = data.df.copy()
    newdom = {}
    for col in data.domain:
        support = supports[col]
        size = support.sum()
        # One slot per surviving value, plus a catch-all slot if any value
        # was dropped.
        newdom[col] = int(size) + (1 if size < support.size else 0)
        mapping = {}
        idx = 0
        for original_value in range(support.size):
            if support[original_value]:
                mapping[original_value] = idx
                idx += 1
            else:
                mapping[original_value] = size
        assert idx == size
        df[col] = df[col].map(mapping)
    return Dataset(df, Domain.fromdict(newdom))
def reverse_data(data, supports):
    """Map a dataset over the compressed domain back to the original domain.

    Supported codes are translated back to the original values they stood
    for; rows landing in the catch-all bucket are assigned a uniformly
    random unsupported original value (the exact value was lost during
    compression).

    :param data: Dataset over the compressed domain
    :param supports: dict mapping column name -> boolean support mask
    :return: a new Dataset over the original (uncompressed) domain
    """
    df = data.df.copy()
    newdom = {}
    for col in data.domain:
        support = supports[col]
        mx = support.sum()  # code of the catch-all bucket in the compressed domain
        newdom[col] = int(support.size)
        kept = np.where(support)[0]
        dropped = np.where(~support)[0]
        mask = df[col] == mx
        if dropped.size > 0:
            # Rows in the bucket get a random one of the dropped values.
            df.loc[mask, col] = np.random.choice(dropped, mask.sum())
        df.loc[~mask, col] = kept[df.loc[~mask, col]]
    return Dataset(df, Domain.fromdict(newdom))