From cf6b5e45dfb0cd593f948b12a2a327bbf3699657 Mon Sep 17 00:00:00 2001 From: filipeo2-mck <110418479+filipeo2-mck@users.noreply.github.com> Date: Mon, 23 Oct 2023 15:21:48 -0300 Subject: [PATCH] move code from PR #1376 (#1387) Signed-off-by: Filipe Oliveira --- pandera/backends/pyspark/container.py | 2 +- tests/pyspark/test_pyspark_model.py | 52 +++++++++++++++++++++++---- 2 files changed, 46 insertions(+), 8 deletions(-) diff --git a/pandera/backends/pyspark/container.py b/pandera/backends/pyspark/container.py index fb04064d4..6d2ef2683 100644 --- a/pandera/backends/pyspark/container.py +++ b/pandera/backends/pyspark/container.py @@ -283,7 +283,7 @@ def collect_schema_components( schema_components = [] for col_name, column in schema.columns.items(): if ( - column.required or col_name in check_obj + column.required or col_name in check_obj.columns ) and col_name not in column_info.lazy_exclude_column_names: column = copy.deepcopy(column) if schema.dtype is not None: diff --git a/tests/pyspark/test_pyspark_model.py b/tests/pyspark/test_pyspark_model.py index 1bdbacd12..80cabe2be 100644 --- a/tests/pyspark/test_pyspark_model.py +++ b/tests/pyspark/test_pyspark_model.py @@ -300,18 +300,56 @@ def function_expected(): ) -def test_optional_column() -> None: - """Test that optional columns are not required.""" +# Define a fixture for the Schema +@pytest.fixture(scope="module", name="test_schema_optional_columns") +def test_schema(): + """Fixture containing DataFrameModel with optional columns.""" + + class Schema(pa.DataFrameModel): + """Simple DataFrameModel containing optional columns.""" - class Schema(DataFrameModel): # pylint:disable=missing-class-docstring a: Optional[str] b: Optional[str] = pa.Field(eq="b") c: Optional[str] # test pandera.typing alias - schema = Schema.to_schema() - assert not schema.columns["a"].required - assert not schema.columns["b"].required - assert not schema.columns["c"].required + return Schema + + +def test_optional_column(test_schema_optional_columns) -> None: + """Test that optional columns are not required.""" + + schema = test_schema_optional_columns.to_schema() + assert not schema.columns[ + "a" + ].required, "Optional column 'a' shouldn't be required" + assert not schema.columns[ + "b" + ].required, "Optional column 'b' shouldn't be required" + assert not schema.columns[ + "c" + ].required, "Optional column 'c' shouldn't be required" + + +def test_validation_succeeds_with_missing_optional_column( + spark, test_schema_optional_columns +) -> None: + """Test that validation succeeds even when an optional column is missing.""" + + data = [("5", "b"), ("15", "b")] + spark_schema = T.StructType( + [ + T.StructField("a", T.StringType(), False), + T.StructField("b", T.StringType(), False), + # 'c' column is missing, but it's optional + ], + ) + df = spark_df(spark, data, spark_schema) + df_out = test_schema_optional_columns.validate(check_obj=df) + + # `df_out.pandera.errors` should be empty if validation is successful. + assert ( + df_out.pandera.errors == {} + ), "No error should be raised in case of a missing optional column." def test_invalid_field() -> None: