Updates for qa4sm and ISMN paper (#201)
* Fix nwise apply to handle nans

* Fix doy import

* Add option to calc TC metrics for ref

* Update changelog and tests

* Add TC metrics calc option to find metrics for combis without ref

* Add option for intercomp between non-ref datasets

* Update changelog
wpreimes authored Sep 14, 2020
1 parent a49e709 commit 93412dd
Showing 8 changed files with 91 additions and 38 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -31,6 +31,7 @@ tags
# Unittest and coverage
htmlcov/*
.coverage
.coverage*
.tox
junit.xml
coverage.xml
9 changes: 9 additions & 0 deletions CHANGELOG.rst
@@ -7,6 +7,15 @@ Unreleased

-

Version 0.9.1, 2020-09-14
=========================

- Include more combinations in the validation framework; raise an error if n < n_datasets
- `nwise_apply` can now correctly handle (drop) NaNs in a passed data frame
- The TC metrics calculator now has an option to calculate metrics for the reference dataset
- Fix deprecation warnings in the anomaly adapter (Issue #198)
- Change the combinations created by the validation framework; catch cases where the scaling reference is not in the combinations (Issue #187)


Version 0.9, 2020-07-02
=======================
4 changes: 2 additions & 2 deletions src/pytesmo/df_metrics.py
@@ -40,6 +40,7 @@
import itertools
import pandas as pd
import warnings
from pytesmo.utils import array_dropna

def n_combinations(iterable, n, must_include=None, permutations=False):
"""
@@ -448,8 +449,7 @@ def nwise_apply(df, method, n=2, comm=False, as_df=False, ds_names=True,
# find out how many variables the applyf returns
result = []
# apply the method using the first data set to find out the shape of c,
# we add a bias (i) to avoid raising warnings.
c = applyf(*[mat[i] for i in range(n)])
c = applyf(*array_dropna(*[mat[i] for i in range(n)]))
for index, value in enumerate(np.atleast_1d(c)):
result.append(OrderedDict([(c, np.nan) for c in combs]))
result = np.array(result) # array of OrderedDicts
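
With this change, the number of values returned by `applyf` is probed on NaN-free input rather than on the raw first combination. A minimal sketch of exercising the fixed `nwise_apply` (the toy DataFrame and its values are made up for illustration):

    import numpy as np
    import pandas as pd
    import pytesmo.df_metrics as df_metrics
    from pytesmo.metrics import bias

    # toy frame where each column contains a NaN
    df = pd.DataFrame({'ds0': [0., np.nan, 0., 0.],
                       'ds1': [1., 1., np.nan, 1.]})
    # NaN rows are dropped pairwise before bias is computed for each combination
    bias_matrix = df_metrics.nwise_apply(df, bias, n=2, as_df=True)
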
57 changes: 29 additions & 28 deletions src/pytesmo/metrics.py
@@ -31,6 +31,7 @@
import numpy as np
import scipy.stats as sc_stats
from itertools import permutations,combinations
from pytesmo.utils import array_dropna

def bias(x, y):
"""
@@ -569,29 +570,29 @@ def RSS(o, p):


def pearsonr(x, y):
"""
Wrapper for scipy.stats.pearsonr. Calculates a Pearson correlation
coefficient and the p-value for testing non-correlation.
Parameters
----------
x : numpy.ndarray
First input vector.
y : numpy.ndarray
Second input vector.
Returns
-------
r : float
Pearson's correlation coefficent.
p-value : float
2 tailed p-value.
See Also
--------
scipy.stats.pearsonr
"""
return sc_stats.pearsonr(x, y)
"""
Wrapper for scipy.stats.pearsonr. Calculates a Pearson correlation
coefficient and the p-value for testing non-correlation.
Parameters
----------
x : numpy.ndarray
First input vector.
y : numpy.ndarray
Second input vector.
Returns
-------
r : float
Pearson's correlation coefficent.
p-value : float
2 tailed p-value.
See Also
--------
scipy.stats.pearsonr
"""
return sc_stats.pearsonr(x, y)

@np.errstate(invalid='ignore')
def pearsonr_recursive(x, y, n_old=0, sum_xi_yi=0,
@@ -695,9 +696,9 @@ def spearmanr(x, y):
Parameters
----------
x : numpy.ndarray
x : numpy.array
First input vector.
y : numpy.ndarray
y : numpy.array
Second input vector.
Returns
@@ -721,9 +722,9 @@ def kendalltau(x, y):
Parameters
----------
x : numpy.ndarray
x : numpy.array
First input vector.
y : numpy.ndarray
y : numpy.array
Second input vector.
Returns
@@ -738,7 +739,7 @@ def kendalltau(x, y):
--------
scipy.stats.kendalltau
"""
return sc_stats.kendalltau(x.tolist(), y.tolist())
return sc_stats.kendalltau(x, y)


def index_of_agreement(o, p):
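
Both wrappers now simply forward NumPy arrays to SciPy; in particular `kendalltau` no longer converts its inputs via `tolist()`. A short sketch (input values chosen arbitrarily):

    import numpy as np
    from pytesmo.metrics import pearsonr, kendalltau

    x = np.array([1.0, 2.0, 3.0, 4.0])
    y = np.array([1.1, 1.9, 3.2, 3.9])

    r, r_p = pearsonr(x, y)        # Pearson's r and two-tailed p-value
    tau, tau_p = kendalltau(x, y)  # Kendall's tau, computed on the arrays directly
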
2 changes: 1 addition & 1 deletion src/pytesmo/time_series/anomaly.py
@@ -4,7 +4,7 @@

import pandas as pd
import numpy as np
from pytesmo.timedate.julian import doy
from cadati.conv_doy import doy
from cadati.jd_date import julian2date
from pytesmo.time_series.filtering import moving_average

22 changes: 22 additions & 0 deletions src/pytesmo/utils.py
@@ -251,3 +251,25 @@ def ensure_iterable(el):
return [el]
else:
return el

def array_dropna(*arrs):
"""
Drop elements from input arrays where ANY array is NaN
Parameters
----------
*arrs : np.array(s)
One or multiple numpy arrays of the same length that contain nans
Returns
-------
arrs_dropna : np.array
Input arrays without NaNs
"""

idx = ~np.logical_or(*[np.isnan(x) for x in arrs])
arrs_dropna = [np.compress(idx, x) for x in arrs]

if len(arrs_dropna) == 1: arrs_dropna = arrs_dropna[0]

return tuple(arrs_dropna)
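
A hedged usage sketch for the new helper in the two-array case, which is what `nwise_apply` passes in (the numbers are made up):

    import numpy as np
    from pytesmo.utils import array_dropna

    x = np.array([1., np.nan, 3., 4.])
    y = np.array([2., 2., np.nan, 4.])

    # positions where either input is NaN are removed from both arrays
    x_d, y_d = array_dropna(x, y)  # -> array([1., 4.]), array([2., 4.])
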
27 changes: 21 additions & 6 deletions src/pytesmo/validation_framework/metric_calculators.py
@@ -642,6 +642,9 @@ class IntercomparisonMetrics(MetadataMetrics):
if True then also tau is calculated. This is set to False by default
since the calculation of Kendalls tau is rather slow and can significantly
impact performance of e.g. global validation studies
metrics_between_nonref : bool, optional (default: False)
Allow 2-dataset combinations where the ref is not included.
Warning: can lead to many combinations.
dataset_names : list, optional (default: None)
Names of the original datasets, that are used to find the lookup table
for the df cols.
@@ -651,6 +654,7 @@
"""

def __init__(self, other_names=('k1', 'k2', 'k3'), calc_tau=False,
metrics_between_nonref=False,
dataset_names=None, metadata_template=None):

other_names = list(other_names)
@@ -674,7 +678,9 @@ def __init__(self, other_names=('k1', 'k2', 'k3'), calc_tau=False,
for name, col in zip(self.ds_names, self.df_columns):
self.ds_names_lut[col] = name

combis = n_combinations(self.df_columns, 2, must_include='ref')
combis = n_combinations(self.df_columns, 2,
must_include='ref' if not metrics_between_nonref else None)

self.tds_names = []
for combi in combis:
self.tds_names.append("{1}{0}{2}".format(
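
A hedged sketch of constructing the calculator with the new flag enabled, so that pairings that skip the reference are evaluated as well (dataset names are placeholders):

    from pytesmo.validation_framework.metric_calculators import IntercomparisonMetrics

    # default: only ref-vs-other pairs; with the flag, e.g. the k1-vs-k2 pair is added too
    calc = IntercomparisonMetrics(other_names=('k1', 'k2', 'k3'),
                                  metrics_between_nonref=True)
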
@@ -833,6 +839,7 @@ class TCMetrics(MetadataMetrics):
"""

def __init__(self, other_names=('k1', 'k2'), calc_tau=False, dataset_names=None,
tc_metrics_for_ref=True, metrics_between_nonref=False,
metadata_template=None):
"""
Triple Collocation metrics as implemented in the QA4SM project.
@@ -846,6 +853,11 @@ def __init__(self, other_names=('k1', 'k2'), calc_tau=False, dataset_names=None,
dataset_names : tuple, optional (default: None)
List that maps the names of the satellite dataset columns to their
real name that will be used in the results file.
tc_metrics_for_ref : bool, optional (default: True)
Store TC metrics for the reference data set as well.
metrics_between_nonref : bool, optional (default: False)
Allow 2-dataset combinations where the ref is not included.
Warning: can lead to many combinations.
metadata_template: dictionary, optional
A dictionary containing additional fields (and types) of the form
dict = {'field': np.float32([np.nan]}. Allows users to specify
@@ -876,6 +888,7 @@ def __init__(self, other_names=('k1', 'k2'), calc_tau=False, dataset_names=None,
for name, col in zip(self.ds_names, self.df_columns):
self.ds_names_lut[col] = name

self.metrics_between_nonref = metrics_between_nonref
self.tds_names, self.thds_names = self._make_names()

# metrics that are equal for all datasets
@@ -888,8 +901,10 @@ def __init__(self, other_names=('k1', 'k2'), calc_tau=False, dataset_names=None,

metrics_common = _get_metric_template(metrics_common)
metrics_tds = _get_metric_template(metrics_tds)

ignore_ds = [self.ref_name] if not tc_metrics_for_ref else ()
metrics_thds = _get_tc_metric_template(metrics_thds,
[self.ds_names_lut[n] for n in self.df_columns if n != self.ref_name])
[self.ds_names_lut[n] for n in self.df_columns if n not in ignore_ds])

for metric in metrics_common.keys():
self.result_template[metric] = metrics_common[metric].copy()
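
A hedged sketch of requesting TC metrics for the reference column when building the calculator (dataset names are placeholders):

    from pytesmo.validation_framework.metric_calculators import TCMetrics

    # TC metrics are then templated (and later stored) for the reference dataset as well
    tc_calc = TCMetrics(other_names=('k1', 'k2'),
                        tc_metrics_for_ref=True)
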
@@ -923,8 +938,8 @@ def __init__(self, other_names=('k1', 'k2'), calc_tau=False, dataset_names=None,

def _make_names(self):
tds_names, thds_names = [], []
combis_2 = n_combinations(
self.df_columns, 2, must_include=[self.ref_name])
combis_2 = n_combinations(self.df_columns, 2,
must_include=[self.ref_name] if not self.metrics_between_nonref else None)
combis_3 = n_combinations(
self.df_columns, 3, must_include=[self.ref_name])
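
The effect of dropping `must_include` can be illustrated with `n_combinations` from `pytesmo.df_metrics`; the column names below are placeholders, and the listed outputs assume that `must_include` simply filters the pair combinations to those containing the given key:

    from pytesmo.df_metrics import n_combinations

    cols = ['ref', 'k1', 'k2']
    # pairs that must contain the reference column
    print(n_combinations(cols, 2, must_include='ref'))  # ('ref', 'k1'), ('ref', 'k2')
    # all pairs, as used when metrics_between_nonref is set
    print(n_combinations(cols, 2))                       # additionally ('k1', 'k2')
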

@@ -1264,7 +1279,7 @@ def get_dataset_names(ref_key, datasets, n=3):
return dataset_names

if __name__ == '__main__':
calc = IntercomparisonMetrics(other_names=('k1', 'k2', 'k3'),
calc = TCMetrics(other_names=('k1', 'k2', 'k3'),
calc_tau=False,
metadata_template=dict(meta1=np.array(['TBD']),
meta2=np.float32([np.nan])))
@@ -1278,4 +1293,4 @@ def get_dataset_names(ref_key, datasets, n=3):
'k2': np.random.rand(idx.size),
'k3': np.random.rand(idx.size)})

adapted.calc_metrics(df, (0,1,2,{'meta1':'meta', 'meta2':12}))
calc.calc_metrics(df, (0,1,2,{'meta1':'meta', 'meta2':12}))
7 changes: 6 additions & 1 deletion tests/test_df_metrics.py
@@ -6,7 +6,7 @@
"""

import pytesmo.df_metrics as df_metrics
from pytesmo.metrics import bias
from pytesmo.metrics import bias, pearsonr
import pandas as pd
import numpy as np

@@ -23,8 +23,11 @@ def test_apply():

df = pd.DataFrame(index=pd.date_range(start='2000-01-01', end='2000-12-31', freq='D'),
data={'ds0': np.repeat(0, 366), 'ds1': np.repeat(1, 366)})
df.loc[df.index[np.random.choice(range(366), 10)], 'ds0'] = np.nan
df.loc[df.index[np.random.choice(range(366), 10)], 'ds1'] = np.nan
bias_matrix_old = df_metrics.pairwise_apply(df, bias)
bias_matrix_new = df_metrics.nwise_apply(df, bias, n=2, as_df=True)
r_matrix_new = df_metrics.nwise_apply(df, pearsonr, n=2, as_df=True)
assert bias_matrix_old.equals(bias_matrix_new)

# check if dict implementation and matrix implementation have same result
@@ -38,3 +41,5 @@ def test_dict_to_namedtuple():
assert d_named._fields == ('a', 'b')
assert type(d_named).__name__ == 'name'

if __name__ == '__main__':
test_apply()
