From 907e87d0566ca2356f4671dc99c42a08fb12a439 Mon Sep 17 00:00:00 2001 From: TimAdams84 Date: Mon, 12 Aug 2024 15:05:01 +0200 Subject: [PATCH 1/8] doc: Change quality.py to scores.py and adjust README.md --- README.md | 13 +++++-- syndat/__init__.py | 4 +- syndat/{quality.py => scores.py} | 63 +++++++++++++++++++++----------- tests/test_correlation.py | 2 +- 4 files changed, 55 insertions(+), 27 deletions(-) rename syndat/{quality.py => scores.py} (83%) diff --git a/README.md b/README.md index c594952..297a708 100644 --- a/README.md +++ b/README.md @@ -25,11 +25,18 @@ import syndat real = pd.read_csv("real.csv") synthetic = pd.read_csv("synthetic.csv") -jsd = syndat.quality.jsd(real, synthetic) -auc = syndat.quality.auc(real, synthetic) -norm = syndat.quality.correlation(real, synthetic) +# How similar are the statistical distributions of real and synthetic features +distribution_similarity_score = syndat.scores.distribution(real, synthetic) + +# How hard is it for a classifier to discriminate real and synthetic data +discrimination_score = syndat.scores.discrimination(real, synthetic) + +# How well are pairwise feature correlations preserved +correlation_score = syndat.scores.correlation(real, synthetic) ``` +Scores are defined in a range of 0-100, with a higher score corresponding to better data fidelity. + ## Visualization Visualize real vs. synthetic data distributions and summary statistics for each feature: diff --git a/syndat/__init__.py b/syndat/__init__.py index 5ec6e93..a4778ef 100644 --- a/syndat/__init__.py +++ b/syndat/__init__.py @@ -1,3 +1,3 @@ from syndat import domain -from syndat import quality -from syndat import visualization \ No newline at end of file +from syndat import scores +from syndat import visualization diff --git a/syndat/quality.py b/syndat/scores.py similarity index 83% rename from syndat/quality.py rename to syndat/scores.py index 8641487..2171640 100644 --- a/syndat/quality.py +++ b/syndat/scores.py @@ -13,11 +13,13 @@ from syndat.domain import AggregationMethod +logger = logging.getLogger(__name__) + def auc(real: pandas.DataFrame, synthetic: pandas.DataFrame, n_folds=5, drop_na_threshold=0.9, score: bool = True) -> float: """ - Computes the Differentiation Complexity Score / ROC AUC score of a classifier trained to differentiate between real + Computes the Discrimination Complexity Score / ROC AUC score of a classifier trained to differentiate between real and synthetic data. :param real: The real data. @@ -27,18 +29,43 @@ def auc(real: pandas.DataFrame, synthetic: pandas.DataFrame, n_folds=5, :param score: Return result in a normalized score in [0,100]. Default is True. :return: Differentiation Complexity Score / AUC ROC Score """ + warnings.warn( - "old_function is deprecated and will be removed in a future version. Please use discrimination_score instead.", + "auc is deprecated and will be removed in a future version. Please use discrimination instead.", DeprecationWarning, stacklevel=2 ) - return discrimination_score(real, synthetic, n_folds=n_folds, drop_na_threshold=drop_na_threshold, score=score) + return discrimination(real, synthetic, n_folds=n_folds, drop_na_threshold=drop_na_threshold, score=score) -def discrimination_score(real: pandas.DataFrame, synthetic: pandas.DataFrame, n_folds=5, - drop_na_threshold=0.9, score: bool = True) -> float: +def jsd(real: pd.DataFrame, synthetic: pd.DataFrame, aggregate_results: bool = True, + aggregation_method: AggregationMethod = AggregationMethod.AVERAGE, score: bool = True, + n_unique_threshold=10) -> Union[List[float], float]: """ - Computes the Differentiation Complexity Score / ROC AUC score of a classifier trained to differentiate between real + Computes the feature distribution similarity using the Jensen-Shannon distance of real and synthetic data. + + :param real: The real data. + :param synthetic: The synthetic data. + :param aggregate_results: Compute a single aggregated score for all features. Default is True. + :param aggregation_method: How the scores are aggregated. Default is using the median of all feature scores. + :param score: Return result in a normalized score in [0,100]. Default is True. + :param n_unique_threshold: Threshold to determine at which number of unique values bins will span over several + values. + :return: Distribution Similarity / JSD + """ + + warnings.warn( + "auc is deprecated and will be removed in a future version. Please use discrimination instead.", + DeprecationWarning, + stacklevel=2 + ) + return distribution(real, synthetic, aggregate_results, aggregation_method, score, n_unique_threshold) + + +def discrimination(real: pandas.DataFrame, synthetic: pandas.DataFrame, n_folds=5, + drop_na_threshold=0.9, score: bool = True) -> float: + """ + Computes the Discrimination Complexity Score / ROC AUC score of a classifier trained to differentiate between real and synthetic data. :param real: The real data. @@ -52,10 +79,10 @@ def discrimination_score(real: pandas.DataFrame, synthetic: pandas.DataFrame, n_ real_filtered, synthetic_filtered = __filter_rows_with_common_categories(real, synthetic) # check for missing values in real data real_clean = real_filtered.dropna(thresh=int(drop_na_threshold * len(real_filtered)), axis=1) - logging.info(f'Dropped {real_clean.shape[1] - real_clean.shape[1]} ' + logger.info(f'Dropped {real_clean.shape[1] - real_clean.shape[1]} ' f'due to high missingness (threshold is {drop_na_threshold}).') real_clean = real_clean.dropna() - logging.info(f'Removed {len(real) - len(real_clean)} entries due to missing values.') + logger.info(f'Removed {len(real) - len(real_clean)} entries due to missing values.') # assert that both real and synthetic have same columns synthetic_clean = synthetic_filtered[real_clean.columns] # one-hot-encode categorical columns @@ -74,9 +101,9 @@ def discrimination_score(real: pandas.DataFrame, synthetic: pandas.DataFrame, n_ return auc_score -def jsd(real: pd.DataFrame, synthetic: pd.DataFrame, aggregate_results: bool = True, - aggregation_method: AggregationMethod = AggregationMethod.AVERAGE, score: bool = True, - n_unique_threshold=10) -> Union[List[float], float]: +def distribution(real: pd.DataFrame, synthetic: pd.DataFrame, aggregate_results: bool = True, + aggregation_method: AggregationMethod = AggregationMethod.AVERAGE, score: bool = True, + n_unique_threshold=10) -> Union[List[float], float]: """ Computes the feature distribution similarity using the Jensen-Shannon distance of real and synthetic data. @@ -100,7 +127,7 @@ def jsd(real: pd.DataFrame, synthetic: pd.DataFrame, aggregate_results: bool = T col_dtype_real = real[col].dtype col_dtype_synthetic = synthetic[col].dtype if col_dtype_real != col_dtype_synthetic: - logging.warning(f'Real data at col {col} is dtype {col_dtype_real} but synthetic is {col_dtype_synthetic}. ' + logger.warning(f'Real data at col {col} is dtype {col_dtype_real} but synthetic is {col_dtype_synthetic}. ' f'Evaluation will be done based on the assumed data type of the real data.') synthetic[col] = synthetic[col].astype(col_dtype_real) # categorical column @@ -216,21 +243,15 @@ def __filter_rows_with_common_categories(real: pd.DataFrame, synthetic: pd.DataF synthetic_categorical_cols = synthetic.select_dtypes(include=['object', 'category']).columns # Identify common categorical columns common_categorical_cols = set(real_categorical_cols) & set(synthetic_categorical_cols) - if not common_categorical_cols: - logging.warning("No common categorical columns found. Correlation will be computed on numeric data only.") # Filter rows with common categories in each column for col in common_categorical_cols: real_categories = set(real[col].unique()) synthetic_categories = set(synthetic[col].unique()) common_categories = real_categories & synthetic_categories if len(real_categories - common_categories) > 0: - logging.warning( - f"Categories {real_categories - common_categories} in column '{col}' " - f"are in real data but not in synthetic data and will be excluded.") - if len(synthetic_categories - common_categories) > 0: - logging.warning( - f"Categories {synthetic_categories - common_categories} in column '{col}' " - f"are in synthetic data but not in real data and will be excluded.") + logger.warning( + f"Categories {real_categories - common_categories} in column '{col}' are in real data but not in " + f"synthetic data. They will not be considered in the score computation.") # Filter rows to keep only common categories real = real[real[col].isin(common_categories)] synthetic = synthetic[synthetic[col].isin(common_categories)] diff --git a/tests/test_correlation.py b/tests/test_correlation.py index 965656e..ef4b55f 100644 --- a/tests/test_correlation.py +++ b/tests/test_correlation.py @@ -2,7 +2,7 @@ import pandas as pd -from syndat.quality import correlation +from syndat.scores import correlation class TestCorrelation(unittest.TestCase): From e172c87a4bc24a6f9633941fe5e832356c699bed Mon Sep 17 00:00:00 2001 From: TimAdams84 Date: Mon, 12 Aug 2024 15:11:33 +0200 Subject: [PATCH 2/8] build: Move requirements to setup file --- requirements.txt | 7 ------- setup.py | 27 +++++++++++++++++++++++++++ 2 files changed, 27 insertions(+), 7 deletions(-) delete mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index f18fe5f..0000000 --- a/requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -pandas~=2.1.4 -numpy~=1.26.2 -scipy~=1.11.4 -scikit-learn~=1.3.2 -matplotlib~=3.8.2 -seaborn~=0.13.0 -setuptools==69.0.2 \ No newline at end of file diff --git a/setup.py b/setup.py index 13edc7a..673b954 100644 --- a/setup.py +++ b/setup.py @@ -23,4 +23,31 @@ author_email='tim.adams@scai.fraunhofer.de', description=DESCRIPTION, long_description=DESCRIPTION, + long_description_content_type='text/markdown', + install_requires=[ + 'pandas~=2.1.4', + 'numpy~=1.26.2', + 'scipy~=1.11.4', + 'scikit-learn~=1.3.2', + 'matplotlib~=3.8.2', + 'seaborn~=0.13.0', + 'setuptools==69.0.2' + ], + classifiers=[ + 'Development Status :: 3 - Alpha', + 'Intended Audience :: Developers', + 'License :: OSI Approved :: Apache Software License', + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.9', + 'Programming Language :: Python :: 3.10', + 'Programming Language :: Python :: 3.11', + ], + include_package_data=True, # Ensure non-Python files are included + python_requires='>=3.9', # Specify minimum Python version + keywords='synthetic-data', + project_urls={ + 'Documentation': 'https://github.com/SCAI-BIO/syndat#readme', + 'Source': 'https://github.com/SCAI-BIO/syndat', + 'Tracker': 'https://github.com/SCAI-BIO/syndat/issues', + }, ) From b31222192ad922b4c6c1fc77867c26e44dbceaaf Mon Sep 17 00:00:00 2001 From: TimAdams84 Date: Mon, 12 Aug 2024 15:15:12 +0200 Subject: [PATCH 3/8] build: Add keywords --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 673b954..040f0f5 100644 --- a/setup.py +++ b/setup.py @@ -44,7 +44,7 @@ ], include_package_data=True, # Ensure non-Python files are included python_requires='>=3.9', # Specify minimum Python version - keywords='synthetic-data', + keywords='synthetic-data, data-quality, data-visualization', project_urls={ 'Documentation': 'https://github.com/SCAI-BIO/syndat#readme', 'Source': 'https://github.com/SCAI-BIO/syndat', From 7ff573f283d0f2d452d7977e20cf4eef974edf1c Mon Sep 17 00:00:00 2001 From: TimAdams84 Date: Mon, 12 Aug 2024 16:07:53 +0200 Subject: [PATCH 4/8] fix: Re-add requirements.txt to resolve conflicts --- requirements.txt | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 requirements.txt diff --git a/ requirements.txt b/ requirements.txt new file mode 100644 index 0000000..3327fc2 --- /dev/null +++ b/ requirements.txt @@ -0,0 +1,7 @@ +pandas~=2.1.4 +numpy~=1.26.2 +scipy~=1.11.4 +scikit-learn~=1.5.1 +matplotlib~=3.8.2 +seaborn~=0.13.0 +setuptools==70.0.0 \ No newline at end of file From e81e62963626bfec1fa926b1b3b2493ff2d33f09 Mon Sep 17 00:00:00 2001 From: TimAdams84 Date: Mon, 12 Aug 2024 16:09:35 +0200 Subject: [PATCH 5/8] fix: Re-add requirements.txt to resolve conflicts --- requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ requirements.txt b/ requirements.txt index 3327fc2..f18fe5f 100644 --- a/ requirements.txt +++ b/ requirements.txt @@ -1,7 +1,7 @@ pandas~=2.1.4 numpy~=1.26.2 scipy~=1.11.4 -scikit-learn~=1.5.1 +scikit-learn~=1.3.2 matplotlib~=3.8.2 seaborn~=0.13.0 -setuptools==70.0.0 \ No newline at end of file +setuptools==69.0.2 \ No newline at end of file From 3a4d71363727a80c0e1bfe770be4898c4d3c1925 Mon Sep 17 00:00:00 2001 From: TimAdams84 Date: Mon, 12 Aug 2024 16:15:11 +0200 Subject: [PATCH 6/8] chore: Update setup dependencies --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 040f0f5..a51e185 100644 --- a/setup.py +++ b/setup.py @@ -28,10 +28,10 @@ 'pandas~=2.1.4', 'numpy~=1.26.2', 'scipy~=1.11.4', - 'scikit-learn~=1.3.2', + 'scikit-learn~=1.5.1', 'matplotlib~=3.8.2', 'seaborn~=0.13.0', - 'setuptools==69.0.2' + 'setuptools==70.0.0' ], classifiers=[ 'Development Status :: 3 - Alpha', From b3d47cfa635337816ffc9ef65e924ee3ef8974c9 Mon Sep 17 00:00:00 2001 From: TimAdams84 Date: Mon, 12 Aug 2024 16:18:35 +0200 Subject: [PATCH 7/8] fix: tests --- tests/test_auc.py | 12 ++++++------ tests/test_jsd.py | 48 +++++++++++++++++++++++------------------------ 2 files changed, 30 insertions(+), 30 deletions(-) diff --git a/tests/test_auc.py b/tests/test_auc.py index 9ffd9ae..d6d6ee3 100644 --- a/tests/test_auc.py +++ b/tests/test_auc.py @@ -43,13 +43,13 @@ def preprocess_categorical_data(self, real_data, synthetic_data): return real_data_encoded, synthetic_data_encoded def test_auc_score(self): - auc_score = syndat.quality.auc(self.real_data, self.synthetic_data) + auc_score = syndat.scores.discrimination(self.real_data, self.synthetic_data) self.assertTrue(isinstance(auc_score, float)) self.assertGreaterEqual(auc_score, 0.0) self.assertLessEqual(auc_score, 100.0) def test_auc_score_normalized(self): - auc_score = syndat.quality.auc(self.real_data, self.synthetic_data) + auc_score = syndat.scores.discrimination(self.real_data, self.synthetic_data) self.assertTrue(isinstance(auc_score, float)) self.assertGreaterEqual(auc_score, 0.0) self.assertLessEqual(auc_score, 100.0) @@ -57,7 +57,7 @@ def test_auc_score_normalized(self): def test_auc_score_with_missing_values(self): # Introduce missing values in real data self.real_data.iloc[::10, 0] = np.nan # 10% missing data - auc_score = syndat.quality.auc(self.real_data, self.synthetic_data) + auc_score = syndat.scores.discrimination(self.real_data, self.synthetic_data) self.assertTrue(isinstance(auc_score, float)) self.assertGreaterEqual(auc_score, 0.0) self.assertLessEqual(auc_score, 100.0) @@ -65,19 +65,19 @@ def test_auc_score_with_missing_values(self): def test_auc_score_with_missing_values_drop_col(self): # Introduce missing values in real data self.real_data.iloc[::2, 0] = np.nan # 50% missing data -> col drop - auc_score = syndat.quality.auc(self.real_data, self.synthetic_data) + auc_score = syndat.scores.discrimination(self.real_data, self.synthetic_data) self.assertTrue(isinstance(auc_score, float)) self.assertGreaterEqual(auc_score, 0.0) self.assertLessEqual(auc_score, 100.0) def test_auc_score_with_custom_folds(self): - auc_score = syndat.quality.auc(self.real_data, self.synthetic_data, n_folds=5) + auc_score = syndat.scores.discrimination(self.real_data, self.synthetic_data, n_folds=5) self.assertTrue(isinstance(auc_score, float)) self.assertGreaterEqual(auc_score, 0.0) self.assertLessEqual(auc_score, 100.0) def test_auc_score_with_categorical_data(self): - auc_score = syndat.quality.auc(self.real_data_cat, self.synthetic_data_cat) + auc_score = syndat.scores.discrimination(self.real_data_cat, self.synthetic_data_cat) self.assertTrue(isinstance(auc_score, float)) self.assertGreaterEqual(auc_score, 0.0) self.assertLessEqual(auc_score, 100.0) diff --git a/tests/test_jsd.py b/tests/test_jsd.py index 60fbc0e..aa1de0e 100644 --- a/tests/test_jsd.py +++ b/tests/test_jsd.py @@ -17,8 +17,8 @@ def test_jsd_zero_int64(self): 'feature1': [6, 7, 8, 9, 10], 'feature2': [15, 16, 17, 18, 19] }) - jsd = syndat.quality.jsd(real, synthetic) - self.assertEqual(0, jsd) + distribution = syndat.scores.distribution(real, synthetic) + self.assertEqual(0, distribution) def test_jsd_zero_int64_float64(self): synthetic = pd.DataFrame({ @@ -30,8 +30,8 @@ def test_jsd_zero_int64_float64(self): 'feature1': [6, 7, 8, 9, 10], 'feature2': [0.6, 0.7, 0.8, 0.9, 1.0] }) - jsd = syndat.quality.jsd(real, synthetic) - self.assertEqual(jsd, 0) + distribution = syndat.scores.distribution(real, synthetic) + self.assertEqual(distribution, 0) def test_jsd_perfect_int64(self): synthetic = pd.DataFrame({ @@ -43,8 +43,8 @@ def test_jsd_perfect_int64(self): 'feature1': [1, 2, 1, 2, 3], 'feature2': [11, 12, 13, 14, 15] }) - jsd = syndat.quality.jsd(real, synthetic) - self.assertEqual(jsd, 100) + distribution = syndat.scores.distribution(real, synthetic) + self.assertEqual(distribution, 100) def test_jsd_perfect_int64_and_float64(self): synthetic = pd.DataFrame({ @@ -56,8 +56,8 @@ def test_jsd_perfect_int64_and_float64(self): 'feature1': [1, 2, 1, 2, 3], 'feature2': [0.1, 0.2, 0.3, 0.4, 0.5] }) - jsd = syndat.quality.jsd(real, synthetic) - self.assertEqual(jsd, 100) + distribution = syndat.scores.distribution(real, synthetic) + self.assertEqual(distribution, 100) def test_jsd_different_col_types(self): synthetic = pd.DataFrame({ @@ -69,7 +69,7 @@ def test_jsd_different_col_types(self): 'feature1': [1.2, 2.1, 1.1, 2.1, 3.1], 'feature2': [1, 2, 3, 4, 5] }) - jsd = syndat.quality.jsd(real, synthetic, score=False) + distribution = syndat.scores.distribution(real, synthetic, score=False) def test_jsd_negative_int64(self): synthetic = pd.DataFrame({ @@ -81,7 +81,7 @@ def test_jsd_negative_int64(self): 'feature1': [-1, 2, 3, 4, 5], 'feature2': [1, 2, 3, 4, 5] }) - jsd = syndat.quality.jsd(real, synthetic) + distribution = syndat.scores.distribution(real, synthetic) def test_jsd_single_outlier(self): synthetic = pd.DataFrame({ @@ -93,8 +93,8 @@ def test_jsd_single_outlier(self): 'feature1': [1, 1, 1, 1, 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9], 'feature2': [1, 1, 1, 1, 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 100], }) - jsd = syndat.quality.jsd(real, synthetic) - self.assertTrue(jsd < 100) + distribution = syndat.scores.distribution(real, synthetic) + self.assertTrue(distribution < 100) def test_jsd_categorical_equal(self): synthetic = pd.DataFrame({ @@ -106,8 +106,8 @@ def test_jsd_categorical_equal(self): 'feature1': ['A', 'B', 'A', 'B', 'C'], 'feature2': ['X', 'Y', 'Y', 'X', 'Z'] }) - jsd = syndat.quality.jsd(real, synthetic) - self.assertEqual(jsd, 100) + distribution = syndat.scores.distribution(real, synthetic) + self.assertEqual(distribution, 100) def test_jsd_categorical_different(self): synthetic = pd.DataFrame({ @@ -119,8 +119,8 @@ def test_jsd_categorical_different(self): 'feature1': ['A', 'B', 'A', 'B', 'D'], 'feature2': ['X', 'Y', 'Z', 'X', 'W'] }) - jsd = syndat.quality.jsd(real, synthetic) - self.assertTrue(jsd < 100) + distribution = syndat.scores.distribution(real, synthetic) + self.assertTrue(distribution < 100) def test_jsd_categorical_mixed(self): synthetic = pd.DataFrame({ @@ -132,8 +132,8 @@ def test_jsd_categorical_mixed(self): 'feature1': ['A', 'B', 'C', 'F', 'G'], 'feature2': [1.0, 2.0, 3.0, 6.0, 7.0] }) - jsd = syndat.quality.jsd(real, synthetic) - self.assertTrue(jsd < 100) + distribution = syndat.scores.distribution(real, synthetic) + self.assertTrue(distribution < 100) def test_jsd_categorical_with_numerical(self): synthetic = pd.DataFrame({ @@ -145,8 +145,8 @@ def test_jsd_categorical_with_numerical(self): 'feature1': ['A', 'B', 'C', 'A', 'D'], 'feature2': [1.0, 2.0, 3.0, 4.0, 6.0] }) - jsd = syndat.quality.jsd(real, synthetic) - self.assertTrue(jsd < 100) + distribution = syndat.scores.distribution(real, synthetic) + self.assertTrue(distribution < 100) def test_jsd_categorical_with_nan(self): synthetic = pd.DataFrame({ @@ -158,8 +158,8 @@ def test_jsd_categorical_with_nan(self): 'feature1': ['A', 'B', 'C', 'D', None], 'feature2': [1.0, 2.0, None, 4.0, 5.0] }) - jsd = syndat.quality.jsd(real, synthetic) - self.assertTrue(jsd < 100) + distribution = syndat.scores.distribution(real, synthetic) + self.assertTrue(distribution < 100) def test_jsd_categorical_all_nan(self): synthetic = pd.DataFrame({ @@ -171,5 +171,5 @@ def test_jsd_categorical_all_nan(self): 'feature1': [None, None, None, None, None], 'feature2': [None, None, None, None, None] }) - jsd = syndat.quality.jsd(real, synthetic) - self.assertEqual(jsd, 100) + distribution = syndat.scores.distribution(real, synthetic) + self.assertEqual(distribution, 100) From 178c152edfc87fce7d10323f0ee2dcb9ee941e1e Mon Sep 17 00:00:00 2001 From: TimAdams84 Date: Mon, 12 Aug 2024 16:22:56 +0200 Subject: [PATCH 8/8] fix: rename requirements --- requirements.txt => requirements.txt | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename requirements.txt => requirements.txt (100%) diff --git a/ requirements.txt b/requirements.txt similarity index 100% rename from requirements.txt rename to requirements.txt