
Commit

change envvar from PYSPARK_UNPERSIST to PYSPARK_KEEP_CACHE
Signed-off-by: Filipe Oliveira <[email protected]>
filipeo2-mck committed Nov 16, 2023
1 parent 952bef3 commit 08f0a25
Showing 4 changed files with 21 additions and 21 deletions.
4 changes: 2 additions & 2 deletions pandera/backends/pyspark/decorators.py
@@ -144,7 +144,7 @@ def cache_check_obj():
     entrypoint.
 
     The behavior of the resulting decorator depends on the `PANDERA_PYSPARK_CACHING` and
-    `PANDERA_PYSPARK_UNPERSIST` (optional) environment variables.
+    `PANDERA_PYSPARK_KEEP_CACHE` (optional) environment variables.
 
     Usage:
     @cache_check_obj()
@@ -186,7 +186,7 @@ def cached_check_obj():
 
         yield  # Execute the decorated function
 
-        if CONFIG.pyspark_unpersist:
+        if not CONFIG.pyspark_keep_cache:
             # If not cached, `.unpersist()` does nothing
             logger.debug("Unpersisting dataframe...")
             check_obj.unpersist()
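
The rename also flips the flag's polarity. Below is a minimal, hypothetical sketch of the contract the decorator now follows; only `CONFIG.pyspark_cache`, `CONFIG.pyspark_keep_cache`, and the PySpark `cache()`/`unpersist()` calls appear in the diff, while the helper name and structure are illustrative.

from contextlib import contextmanager

from pandera.config import CONFIG


@contextmanager
def _cache_scope(check_obj):
    """Illustrative only: cache the DataFrame for validation, release it unless kept."""
    if CONFIG.pyspark_cache:
        check_obj.cache()          # persist the DataFrame before checks run
    try:
        yield check_obj            # run the decorated validation
    finally:
        if not CONFIG.pyspark_keep_cache:
            check_obj.unpersist()  # no-op if the DataFrame was never cached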
10 changes: 5 additions & 5 deletions pandera/config.py
@@ -21,13 +21,13 @@ class PanderaConfig(BaseModel):
         export PANDERA_VALIDATION_ENABLED=False
         export PANDERA_VALIDATION_DEPTH=DATA_ONLY
         export PANDERA_PYSPARK_CACHE=True
-        export PANDERA_PYSPARK_UNPERSIST=False
+        export PANDERA_PYSPARK_KEEP_CACHE=True
     """
 
     validation_enabled: bool = True
     validation_depth: ValidationDepth = ValidationDepth.SCHEMA_AND_DATA
     pyspark_cache: bool = False
-    pyspark_unpersist: bool = True
+    pyspark_keep_cache: bool = False
 
 
 # this config variable should be accessible globally
@@ -43,8 +43,8 @@ class PanderaConfig(BaseModel):
         "PANDERA_PYSPARK_CACHE",
         False,
     ),
-    pyspark_unpersist=os.environ.get(
-        "PANDERA_PYSPARK_UNPERSIST",
-        True,
+    pyspark_keep_cache=os.environ.get(
+        "PANDERA_PYSPARK_KEEP_CACHE",
+        False,
     ),
 )
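
A short usage sketch of the new variable, assuming pandera has not been imported yet in the process, since `CONFIG` is built from the environment at import time:

import os

# Hypothetical session: cache DataFrames during validation and keep the cache afterwards.
os.environ["PANDERA_PYSPARK_CACHE"] = "True"
os.environ["PANDERA_PYSPARK_KEEP_CACHE"] = "True"

from pandera.config import CONFIG  # read after the variables are exported

print(CONFIG.pyspark_cache)       # True
print(CONFIG.pyspark_keep_cache)  # True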
18 changes: 9 additions & 9 deletions tests/pyspark/test_pyspark_config.py
@@ -43,7 +43,7 @@ class TestSchema(DataFrameModel):
             "validation_enabled": False,
             "validation_depth": ValidationDepth.SCHEMA_AND_DATA,
             "pyspark_cache": False,
-            "pyspark_unpersist": True,
+            "pyspark_keep_cache": False,
         }
 
         assert CONFIG.dict() == expected
@@ -67,7 +67,7 @@ def test_schema_only(self, spark, sample_spark_schema):
             "validation_enabled": True,
             "validation_depth": ValidationDepth.SCHEMA_ONLY,
             "pyspark_cache": False,
-            "pyspark_unpersist": True,
+            "pyspark_keep_cache": False,
         }
         assert CONFIG.dict() == expected
 
@@ -147,7 +147,7 @@ def test_data_only(self, spark, sample_spark_schema):
             "validation_enabled": True,
             "validation_depth": ValidationDepth.DATA_ONLY,
             "pyspark_cache": False,
-            "pyspark_unpersist": True,
+            "pyspark_keep_cache": False,
         }
         assert CONFIG.dict() == expected
 
@@ -234,7 +234,7 @@ def test_schema_and_data(self, spark, sample_spark_schema):
             "validation_enabled": True,
             "validation_depth": ValidationDepth.SCHEMA_AND_DATA,
             "pyspark_cache": False,
-            "pyspark_unpersist": True,
+            "pyspark_keep_cache": False,
         }
         assert CONFIG.dict() == expected
 
@@ -337,23 +337,23 @@ class TestSchema(DataFrameModel):
     )
 
     @pytest.mark.parametrize("cache_enabled", [True, False])
-    @pytest.mark.parametrize("unpersist_enabled", [True, False])
+    @pytest.mark.parametrize("keep_cache_enabled", [True, False])
     # pylint:disable=too-many-locals
     def test_pyspark_cache_settings(
         self,
         cache_enabled,
-        unpersist_enabled,
+        keep_cache_enabled,
     ):
-        """This function validates setter and getters of caching/unpersisting options."""
+        """This function validates setters and getters for cache/keep_cache options."""
        # Set expected properties in Config object
         CONFIG.pyspark_cache = cache_enabled
-        CONFIG.pyspark_unpersist = unpersist_enabled
+        CONFIG.pyspark_keep_cache = keep_cache_enabled
 
         # Evaluate expected Config
         expected = {
             "validation_enabled": True,
             "validation_depth": ValidationDepth.SCHEMA_AND_DATA,
             "pyspark_cache": cache_enabled,
-            "pyspark_unpersist": unpersist_enabled,
+            "pyspark_keep_cache": keep_cache_enabled,
         }
         assert CONFIG.dict() == expected
10 changes: 5 additions & 5 deletions tests/pyspark/test_pyspark_decorators.py
@@ -62,11 +62,11 @@ def func_wo_check_obj(self, message: str):
             _ = instance.func_wo_check_obj("wrong")
 
     @pytest.mark.parametrize(
-        "cache_enabled,unpersist_enabled,"
+        "cache_enabled,keep_cache_enabled,"
         "expected_caching_message,expected_unpersisting_message",
         [
-            (True, True, True, True),
-            (True, False, True, None),
+            (True, True, True, None),
+            (True, False, True, True),
             (False, True, None, None),
             (False, False, None, None),
         ],
@@ -79,15 +79,15 @@ def test_pyspark_cache_settings(
         spark,
         sample_spark_schema,
         cache_enabled,
-        unpersist_enabled,
+        keep_cache_enabled,
         expected_caching_message,
         expected_unpersisting_message,
         caplog,
     ):
         """This function validates that caching/unpersisting works as expected."""
         # Set expected properties in Config object
         CONFIG.pyspark_cache = cache_enabled
-        CONFIG.pyspark_unpersist = unpersist_enabled
+        CONFIG.pyspark_keep_cache = keep_cache_enabled
 
         # Prepare test data
         input_df = spark_df(spark, self.sample_data, sample_spark_schema)
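
Read as a truth table, the new parametrization says the second flag now means keeping the cache, so the unpersist log message is only expected when that flag is False. A hedged restatement, with the column names taken from the test signature:

# (cache_enabled, keep_cache_enabled, expected_caching_message, expected_unpersisting_message)
CASES = [
    (True,  True,  True, None),   # cached and kept  -> no unpersist message
    (True,  False, True, True),   # cached, not kept -> unpersist message expected
    (False, True,  None, None),   # nothing cached   -> neither message
    (False, False, None, None),
]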
