Skip to content

Commit

Permalink
Weights can now be provided to vdid
Browse files Browse the repository at this point in the history
  • Loading branch information
Julian Blank committed Aug 15, 2024
1 parent bbdd0bf commit a5fa843
Show file tree
Hide file tree
Showing 2 changed files with 72 additions and 37 deletions.
52 changes: 37 additions & 15 deletions azcausal/estimators/panel/vdid.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,21 +82,45 @@ def group_by_index(dx):
return dx.groupby(list(dx.index.names))


def vdid_avg_by(dx, label, col, dim=None):
def dot_by_columns(ds, columns, name, weight=None):
    """Collapse the columns of ``ds`` into one aggregated column per group.

    Parameters
    ----------
    ds
        Frame whose columns are the members to aggregate.
    columns
        Mapping from group key to the list of member column labels. It must
        support ``.map`` and ``.items`` (presumably a pandas Series; verify
        against callers).
    name
        Name given to the column axis of the returned frame.
    weight
        Optional mapping from group key to a dict-like ``member -> weight``.
        Groups without an entry fall back to the plain average.

    Returns
    -------
    A DataFrame indexed like ``ds`` with one column per group.
    """
    weight = {} if weight is None else weight

    # Group sizes are taken before filtering, so members that are absent from
    # ``ds`` still count in the denominator of the plain average.
    sizes = columns.map(len)

    aggregated = {}
    for group, members in columns.items():
        present = [m for m in members if m in ds.columns]

        if group not in weight:
            aggregated[group] = np.sum(ds[present], axis=1) / sizes[group]
            continue

        # Members without an explicit weight get 0.0; the remaining weights
        # are rescaled to sum to one over the columns actually present.
        raw = np.array([weight[group].get(m, 0.0) for m in present])
        aggregated[group] = ds[present].values @ (raw / raw.sum())

    frame = pd.DataFrame(aggregated, index=ds.index)
    return frame.rename_axis(name, axis=1)


def vdid_avg_by(dx, label, col, dim=None, weight=None):
    """Average ``dx`` over the ``col`` index level within each ``label`` group.

    Parameters
    ----------
    dx
        Indexed data containing both ``label`` and ``col`` as index levels.
    label
        Index level that identifies the group.
    col
        Index level that is averaged away.
    dim
        Optional mapping ``group -> members``; when omitted it is derived from
        the unique ``col`` values observed for each ``label``.
    weight
        Optional per-group member weights, forwarded to ``dot_by_columns``.

    Returns
    -------
    tuple
        ``(avg, counts)`` where ``counts`` holds the number of members per group.
    """
    if dim is None:
        dim = dx.reset_index().groupby(label)[col].unique()
    counts = dim.map(len)

    if weight is not None:
        # Weighted path: spread ``col`` into columns and let dot_by_columns
        # combine them, then fold the group axis back into the index.
        wide = dx.droplevel(label, axis='index').unstack(col)
        avg = dot_by_columns(wide, dim, label, weight=weight).stack()
        return avg, counts

    # Unweighted path: sum over the remaining index and divide by group size.
    totals = group_by_index(dx.droplevel(col, axis='index')).sum()
    return totals.divide(counts, axis='index', level=label), counts


def vdid_avg(dx, groups, dims=None, weights=None):
    """Apply ``vdid_avg_by`` successively for every ``(label, col)`` pair.

    Parameters
    ----------
    dx
        Data to aggregate; it is replaced by the averaged result after each step.
    groups
        Iterable of ``(label, col)`` pairs, processed in order.
    dims
        Optional mapping ``label -> group membership`` passed as ``dim``.
    weights
        Optional mapping ``label -> per-group member weights`` passed as
        ``weight``. Labels without an entry are averaged without weights.

    Returns
    -------
    tuple
        ``(dx, counts)`` with the aggregated data and a dict of the member
        counts returned for each label.
    """
    if dims is None:
        dims = {}
    # ``weights`` defaults to None, so it needs the same normalisation as
    # ``dims`` before ``.get`` is called on it.
    if weights is None:
        weights = {}

    counts = {}
    for label, col in groups:
        dx, counts[label] = vdid_avg_by(dx, label, col, dim=dims.get(label), weight=weights.get(label))
    return dx, counts


Expand Down Expand Up @@ -175,12 +199,6 @@ def sample(treatment: pd.Series) -> Iterator[pd.Series]:
return sample, vdid_se


def dot_by_columns(ds, columns, name):
    """Average the columns of ``ds`` within each group of ``columns``.

    ``columns`` maps a group key to a list of member column labels. Members
    that are not present in ``ds`` are ignored in the sum but still counted in
    the divisor, because the group size is taken before filtering. The result
    is indexed like ``ds`` and its column axis is named ``name``.
    """
    sizes = columns.map(len)

    averaged = {}
    for group, members in columns.items():
        present = [m for m in members if m in ds]
        averaged[group] = np.sum(ds[present].values, axis=1) / sizes[group]

    return pd.DataFrame(averaged, index=ds.index).rename_axis(name, axis=1)


def vdid_sign(row):
if row['lb'] < 0 and row['ub'] < 0:
return '-'
Expand Down Expand Up @@ -210,6 +228,7 @@ def vdid(dx: pd.DataFrame,
ratio_marginal=None,
fillna=None,
dims=None,
weights=None,
f: Callable = lambda dx: dx,
g: Callable = lambda dx: dx
):
Expand All @@ -221,6 +240,8 @@ def vdid(dx: pd.DataFrame,
randomize, _ = diffs[-1]
if dims is None:
dims = defaultdict(None)
if weights is None:
weights = defaultdict(None)

labels = {k: v for k, v in diffs}
did = list(labels.keys())
Expand All @@ -238,15 +259,16 @@ def vdid(dx: pd.DataFrame,
dx = pd.melt(dx, id_vars=index, var_name='target', value_name='value').set_index(index + ['target'])['value']

# grouping along the difference list that was provided
davg, counts = vdid_avg(dx, [(k, v) for (k, v) in diffs if k != randomize], dims=dims)
davg, counts = vdid_avg(dx, [(k, v) for (k, v) in diffs if k != randomize], dims=dims, weights=weights)

units = dims.get(randomize)
units = dims.get(randomize, None)
if units is None:
units = davg.reset_index().groupby(randomize)[labels[randomize]].unique()
counts[randomize] = units.map(lambda x: len(x))

matrix = davg.droplevel(axis='index', level=randomize).unstack(labels[randomize]).fillna(0.0)
dagg = f(vdid_ratio(dot_by_columns(matrix, units, randomize).stack(), ratio)).unstack(did)
weight = weights.get(randomize, None)
dagg = f(vdid_ratio(dot_by_columns(matrix, units, randomize, weight=weight).stack(), ratio)).unstack(did)

# calculate the differences from the aggregated data
dte_avg = g(vdid_ratio(vdid_did(dagg, fillna=fillna), ratio_marginal)).to_frame('te')
Expand All @@ -259,8 +281,8 @@ def vdid(dx: pd.DataFrame,
ci_sample, ci_fit = ci

# simulate based on the standard error method
ci_samp_dict = {sample: f(vdid_ratio(dot_by_columns(matrix, treatment_mod, randomize).stack(), ratio)) for sample, treatment_mod in
enumerate(ci_sample(units))}
ci_samp_dict = {sample: f(vdid_ratio(dot_by_columns(matrix, treatment_mod, randomize, weight=weight).stack(), ratio))
for sample, treatment_mod in enumerate(ci_sample(units))}

ci_samp = pd.DataFrame(ci_samp_dict).rename_axis('sample', axis=1).stack()

Expand Down
57 changes: 35 additions & 22 deletions tests/estimators/panel/test_vdid.py
Original file line number Diff line number Diff line change
@@ -1,34 +1,39 @@
import pytest
from numpy.testing import assert_almost_equal

from azcausal.core.error import JackKnife
from azcausal.data import CaliforniaProp99
from azcausal.estimators.panel.did import DID
from azcausal.estimators.panel.sdid import SDID
from azcausal.estimators.panel.vdid import vdid_panel

california99 = CaliforniaProp99().panel()


def test_vdid():
@pytest.fixture
def df():
    """California Prop 99 data with ``treatment``, ``post`` and ``total`` columns added.

    ``treatment`` flags every row of a state that has at least one treated
    observation, ``post`` flags every row of a year that has at least one
    treated observation, and ``total`` is the constant string ``'total'``.
    """
    raw = CaliforniaProp99().df()

    treated_rows = raw.query('treated == 1')
    treated_states = treated_rows['State'].unique()
    treated_years = treated_rows['Year'].unique()

    return (raw
            .assign(treatment=raw['State'].isin(treated_states))
            .assign(post=raw['Year'].isin(treated_years))
            .assign(total='total'))


def test_vdid(df):
    """vdid_panel's average effect and standard error match the DID estimator."""
    did = DID()
    fitted = did.fit(california99)
    did.error(fitted, JackKnife())

    # Pin the reference estimate itself before comparing the two methods.
    assert_almost_equal(-27.349111083614947, fitted.effect.value)

    table = vdid_panel(df, ['total'], 'PacksPerCapita', 'Year', 'State')
    row = table['avg'].loc['total', 'PacksPerCapita']

    assert_almost_equal(row['te'], fitted.effect.value)
    assert_almost_equal(row['se'], fitted.effect.se)


def test_vdid_zero_column():
def test_vdid_zero_column(df):
panel = california99
panel.data['outcome']['VOID'] = 0.0
panel.data['intervention']['VOID'] = 0
Expand All @@ -40,21 +45,29 @@ def test_vdid_zero_column():
result = estimator.fit(california99)
estimator.error(result, JackKnife())

df = (CaliforniaProp99()
.df()
.pipe(lambda dx: dx.assign(treatment=dx['State'].isin(dx.query('treated == 1')['State'].unique())))
.pipe(lambda dx: dx.assign(post=dx['Year'].isin(dx.query('treated == 1')['Year'].unique())))
.assign(total='total')
)

treatment = df.groupby('treatment')['State'].unique()
treatment[False] = list(treatment[False]) + ['VOID']
treatment[True] = list(treatment[True]) + ['VOID_T']
dims = dict(treatment=treatment)

dte = vdid_panel(df, ['total'], 'PacksPerCapita', 'Year', 'State', dims=dims)
te = dte['cum'].loc['total', 'PacksPerCapita']
te = dte['avg'].loc['total', 'PacksPerCapita']

assert_almost_equal(te['te'], result.effect.value)
assert_almost_equal(te['se'], result.effect.se)


def test_vdid_with_weights(df):
    """vdid_panel fed with the SDID unit and time weights reproduces the SDID estimate."""
    # Fit on a private panel: test_vdid_zero_column adds a 'VOID' unit to the
    # module-level california99 in place, which would otherwise make the SDID
    # weights used here depend on test execution order.
    panel = CaliforniaProp99().panel()

    estimator = SDID()
    result = estimator.fit(panel)
    estimator.error(result, JackKnife())

    # Weights are supplied only for the control units and the pre-period
    # (the ``False`` groups); the other groups use the plain average.
    weights = dict(treatment={False: result.effect['omega']},
                   post={False: result.effect['lambd']})

    dte = vdid_panel(df, ['total'], 'PacksPerCapita', 'Year', 'State', weights=weights)
    te = dte['avg'].loc['total', 'PacksPerCapita']

    assert_almost_equal(te['te'], result.effect.value)
    assert_almost_equal(te['se'], result.effect.se)

0 comments on commit a5fa843

Please sign in to comment.