Updates for qa4sm and ISMN paper (#201)
* Fix nwise apply to handle nans

* Fix doy import

* Add option to calc TC metrics for ref

* Update changelog and tests

* Add TC metrics calc option to find metrics for combis without ref

* Add option for intercomp between non-ref datasets

* Update changelog
wpreimes authored Sep 14, 2020
1 parent a49e709 commit 93412dd
Showing 8 changed files with 91 additions and 38 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -31,6 +31,7 @@ tags
# Unittest and coverage
htmlcov/*
.coverage
.coverage*
.tox
junit.xml
coverage.xml
9 changes: 9 additions & 0 deletions CHANGELOG.rst
@@ -7,6 +7,15 @@ Unreleased

-

Version 0.9.1, 2020-09-14
=========================

- Include more combinations in the validation framework; raise an error if n < n_datasets
- `nwise_apply` can now correctly handle (drop) NaNs in a passed data frame
- The TC metrics calculator now has an option to calculate metrics for the reference dataset
- Fix deprecation warnings in the anomaly adapter (Issue #198)
- Change the combinations created by the validation framework; catch cases where the scaling reference is not in the combinations (Issue #187)


Version 0.9, 2020-07-02
=======================
4 changes: 2 additions & 2 deletions src/pytesmo/df_metrics.py
@@ -40,6 +40,7 @@
import itertools
import pandas as pd
import warnings
from pytesmo.utils import array_dropna

def n_combinations(iterable, n, must_include=None, permutations=False):
"""
@@ -448,8 +449,7 @@ def nwise_apply(df, method, n=2, comm=False, as_df=False, ds_names=True,
# find out how many variables the applyf returns
result = []
# apply the method using the first data set to find out the shape of c,
# we add a bias (i) to avoid raising warnings.
c = applyf(*[mat[i] for i in range(n)])
c = applyf(*array_dropna(*[mat[i] for i in range(n)]))
for index, value in enumerate(np.atleast_1d(c)):
result.append(OrderedDict([(c, np.nan) for c in combs]))
result = np.array(result) # array of OrderedDicts
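
With this change, the number of values returned by `applyf` is probed on NaN-free input rather than on the raw first combination. A minimal sketch of exercising the fixed `nwise_apply` (the toy DataFrame and its values are made up for illustration):

    import numpy as np
    import pandas as pd
    import pytesmo.df_metrics as df_metrics
    from pytesmo.metrics import bias

    # toy frame where each column contains a NaN
    df = pd.DataFrame({'ds0': [0., np.nan, 0., 0.],
                       'ds1': [1., 1., np.nan, 1.]})
    # NaN rows are dropped pairwise before bias is computed for each combination
    bias_matrix = df_metrics.nwise_apply(df, bias, n=2, as_df=True)
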
57 changes: 29 additions & 28 deletions src/pytesmo/metrics.py
@@ -31,6 +31,7 @@
import numpy as np
import scipy.stats as sc_stats
from itertools import permutations,combinations
from pytesmo.utils import array_dropna

def bias(x, y):
"""
@@ -569,29 +570,29 @@ def RSS(o, p):


def pearsonr(x, y):
"""
Wrapper for scipy.stats.pearsonr. Calculates a Pearson correlation
coefficient and the p-value for testing non-correlation.
Parameters
----------
x : numpy.ndarray
First input vector.
y : numpy.ndarray
Second input vector.
Returns
-------
r : float
Pearson's correlation coefficent.
p-value : float
2 tailed p-value.
See Also
--------
scipy.stats.pearsonr
"""
return sc_stats.pearsonr(x, y)
"""
Wrapper for scipy.stats.pearsonr. Calculates a Pearson correlation
coefficient and the p-value for testing non-correlation.
Parameters
----------
x : numpy.ndarray
First input vector.
y : numpy.ndarray
Second input vector.
Returns
-------
r : float
Pearson's correlation coefficent.
p-value : float
2 tailed p-value.
See Also
--------
scipy.stats.pearsonr
"""
return sc_stats.pearsonr(x, y)

@np.errstate(invalid='ignore')
def pearsonr_recursive(x, y, n_old=0, sum_xi_yi=0,
@@ -695,9 +696,9 @@ def spearmanr(x, y):
Parameters
----------
x : numpy.ndarray
x : numpy.array
First input vector.
y : numpy.ndarray
y : numpy.array
Second input vector.
Returns
@@ -721,9 +722,9 @@ def kendalltau(x, y):
Parameters
----------
x : numpy.ndarray
x : numpy.array
First input vector.
y : numpy.ndarray
y : numpy.array
Second input vector.
Returns
@@ -738,7 +739,7 @@ def kendalltau(x, y):
--------
scipy.stats.kendalltau
"""
return sc_stats.kendalltau(x.tolist(), y.tolist())
return sc_stats.kendalltau(x, y)


def index_of_agreement(o, p):
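
Both wrappers now simply forward NumPy arrays to SciPy; in particular `kendalltau` no longer converts its inputs via `tolist()`. A short sketch (input values chosen arbitrarily):

    import numpy as np
    from pytesmo.metrics import pearsonr, kendalltau

    x = np.array([1.0, 2.0, 3.0, 4.0])
    y = np.array([1.1, 1.9, 3.2, 3.9])

    r, r_p = pearsonr(x, y)        # Pearson's r and two-tailed p-value
    tau, tau_p = kendalltau(x, y)  # Kendall's tau, computed on the arrays directly
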
2 changes: 1 addition & 1 deletion src/pytesmo/time_series/anomaly.py
@@ -4,7 +4,7 @@

import pandas as pd
import numpy as np
from pytesmo.timedate.julian import doy
from cadati.conv_doy import doy
from cadati.jd_date import julian2date
from pytesmo.time_series.filtering import moving_average

22 changes: 22 additions & 0 deletions src/pytesmo/utils.py
@@ -251,3 +251,25 @@ def ensure_iterable(el):
return [el]
else:
return el

def array_dropna(*arrs):
"""
Drop elements from input arrays where ANY array is NaN
Parameters
----------
*arrs : np.array(s)
One or multiple numpy arrays of the same length that contain nans
Returns
-------
arrs_dropna : np.array
Input arrays without NaNs
"""

idx = ~np.logical_or(*[np.isnan(x) for x in arrs])
arrs_dropna = [np.compress(idx, x) for x in arrs]

if len(arrs_dropna) == 1: arrs_dropna = arrs_dropna[0]

return tuple(arrs_dropna)
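
A hedged usage sketch for the new helper in the two-array case, which is what `nwise_apply` passes in (the numbers are made up):

    import numpy as np
    from pytesmo.utils import array_dropna

    x = np.array([1., np.nan, 3., 4.])
    y = np.array([2., 2., np.nan, 4.])

    # positions where either input is NaN are removed from both arrays
    x_d, y_d = array_dropna(x, y)  # -> array([1., 4.]), array([2., 4.])
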
27 changes: 21 additions & 6 deletions src/pytesmo/validation_framework/metric_calculators.py
@@ -642,6 +642,9 @@ class IntercomparisonMetrics(MetadataMetrics):
if True then also tau is calculated. This is set to False by default
since the calculation of Kendalls tau is rather slow and can significantly
impact performance of e.g. global validation studies
metrics_between_nonref : bool, optional (default: False)
Allow 2-dataset combinations where the ref is not included.
Warning: can lead to many combinations.
dataset_names : list, optional (default: None)
Names of the original datasets, that are used to find the lookup table
for the df cols.
@@ -651,6 +654,7 @@
"""

def __init__(self, other_names=('k1', 'k2', 'k3'), calc_tau=False,
metrics_between_nonref=False,
dataset_names=None, metadata_template=None):

other_names = list(other_names)
@@ -674,7 +678,9 @@ def __init__(self, other_names=('k1', 'k2', 'k3'), calc_tau=False,
for name, col in zip(self.ds_names, self.df_columns):
self.ds_names_lut[col] = name

combis = n_combinations(self.df_columns, 2, must_include='ref')
combis = n_combinations(self.df_columns, 2,
must_include='ref' if not metrics_between_nonref else None)

self.tds_names = []
for combi in combis:
self.tds_names.append("{1}{0}{2}".format(
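
A hedged sketch of constructing the calculator with the new flag enabled, so that pairings that skip the reference are evaluated as well (dataset names are placeholders):

    from pytesmo.validation_framework.metric_calculators import IntercomparisonMetrics

    # default: only ref-vs-other pairs; with the flag, e.g. the k1-vs-k2 pair is added too
    calc = IntercomparisonMetrics(other_names=('k1', 'k2', 'k3'),
                                  metrics_between_nonref=True)
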
@@ -833,6 +839,7 @@ class TCMetrics(MetadataMetrics):
"""

def __init__(self, other_names=('k1', 'k2'), calc_tau=False, dataset_names=None,
tc_metrics_for_ref=True, metrics_between_nonref=False,
metadata_template=None):
"""
Triple Collocation metrics as implemented in the QA4SM project.
@@ -846,6 +853,11 @@ def __init__(self, other_names=('k1', 'k2'), calc_tau=False, dataset_names=None,
dataset_names : tuple, optional (default: None)
List that maps the names of the satellite dataset columns to their
real name that will be used in the results file.
tc_metrics_for_ref : bool, optional (default: True)
Store TC metrics for the reference data set as well.
metrics_between_nonref : bool, optional (default: False)
Allow 2-dataset combinations where the ref is not included.
Warning: can lead to many combinations.
metadata_template: dictionary, optional
A dictionary containing additional fields (and types) of the form
dict = {'field': np.float32([np.nan]}. Allows users to specify
@@ -876,6 +888,7 @@ def __init__(self, other_names=('k1', 'k2'), calc_tau=False, dataset_names=None,
for name, col in zip(self.ds_names, self.df_columns):
self.ds_names_lut[col] = name

self.metrics_between_nonref = metrics_between_nonref
self.tds_names, self.thds_names = self._make_names()

# metrics that are equal for all datasets
@@ -888,8 +901,10 @@ def __init__(self, other_names=('k1', 'k2'), calc_tau=False, dataset_names=None,

metrics_common = _get_metric_template(metrics_common)
metrics_tds = _get_metric_template(metrics_tds)

ignore_ds = [self.ref_name] if not tc_metrics_for_ref else ()
metrics_thds = _get_tc_metric_template(metrics_thds,
[self.ds_names_lut[n] for n in self.df_columns if n != self.ref_name])
[self.ds_names_lut[n] for n in self.df_columns if n not in ignore_ds])

for metric in metrics_common.keys():
self.result_template[metric] = metrics_common[metric].copy()
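
A hedged sketch of requesting TC metrics for the reference column when building the calculator (dataset names are placeholders):

    from pytesmo.validation_framework.metric_calculators import TCMetrics

    # TC metrics are then templated (and later stored) for the reference dataset as well
    tc_calc = TCMetrics(other_names=('k1', 'k2'),
                        tc_metrics_for_ref=True)
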
@@ -923,8 +938,8 @@ def __init__(self, other_names=('k1', 'k2'), calc_tau=False, dataset_names=None,

def _make_names(self):
tds_names, thds_names = [], []
combis_2 = n_combinations(
self.df_columns, 2, must_include=[self.ref_name])
combis_2 = n_combinations(self.df_columns, 2,
must_include=[self.ref_name] if not self.metrics_between_nonref else None)
combis_3 = n_combinations(
self.df_columns, 3, must_include=[self.ref_name])
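
The effect of dropping `must_include` can be illustrated with `n_combinations` from `pytesmo.df_metrics`; the column names below are placeholders, and the listed outputs assume that `must_include` simply filters the pair combinations to those containing the given key:

    from pytesmo.df_metrics import n_combinations

    cols = ['ref', 'k1', 'k2']
    # pairs that must contain the reference column
    print(n_combinations(cols, 2, must_include='ref'))  # ('ref', 'k1'), ('ref', 'k2')
    # all pairs, as used when metrics_between_nonref is set
    print(n_combinations(cols, 2))                       # additionally ('k1', 'k2')
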

@@ -1264,7 +1279,7 @@ def get_dataset_names(ref_key, datasets, n=3):
return dataset_names

if __name__ == '__main__':
calc = IntercomparisonMetrics(other_names=('k1', 'k2', 'k3'),
calc = TCMetrics(other_names=('k1', 'k2', 'k3'),
calc_tau=False,
metadata_template=dict(meta1=np.array(['TBD']),
meta2=np.float32([np.nan])))
@@ -1278,4 +1293,4 @@ def get_dataset_names(ref_key, datasets, n=3):
'k2': np.random.rand(idx.size),
'k3': np.random.rand(idx.size)})

adapted.calc_metrics(df, (0,1,2,{'meta1':'meta', 'meta2':12}))
calc.calc_metrics(df, (0,1,2,{'meta1':'meta', 'meta2':12}))
7 changes: 6 additions & 1 deletion tests/test_df_metrics.py
@@ -6,7 +6,7 @@
"""

import pytesmo.df_metrics as df_metrics
from pytesmo.metrics import bias
from pytesmo.metrics import bias, pearsonr
import pandas as pd
import numpy as np

@@ -23,8 +23,11 @@ def test_apply():

df = pd.DataFrame(index=pd.date_range(start='2000-01-01', end='2000-12-31', freq='D'),
data={'ds0': np.repeat(0, 366), 'ds1': np.repeat(1, 366)})
df.loc[df.index[np.random.choice(range(366), 10)], 'ds0'] = np.nan
df.loc[df.index[np.random.choice(range(366), 10)], 'ds1'] = np.nan
bias_matrix_old = df_metrics.pairwise_apply(df, bias)
bias_matrix_new = df_metrics.nwise_apply(df, bias, n=2, as_df=True)
r_matrix_new = df_metrics.nwise_apply(df, pearsonr, n=2, as_df=True)
assert bias_matrix_old.equals(bias_matrix_new)

# check if dict implementation and matrix implementation have same result
@@ -38,3 +41,5 @@ def test_dict_to_namedtuple():
assert d_named._fields == ('a', 'b')
assert type(d_named).__name__ == 'name'

if __name__ == '__main__':
test_apply()
