From cf6b5e45dfb0cd593f948b12a2a327bbf3699657 Mon Sep 17 00:00:00 2001
From: filipeo2-mck <110418479+filipeo2-mck@users.noreply.github.com>
Date: Mon, 23 Oct 2023 15:21:48 -0300
Subject: [PATCH] move code from PR #1376 (#1387)

Signed-off-by: Filipe Oliveira <filipe_oliveira@mckinsey.com>
---
 pandera/backends/pyspark/container.py |  2 +-
 tests/pyspark/test_pyspark_model.py   | 52 +++++++++++++++++++++++----
 2 files changed, 46 insertions(+), 8 deletions(-)

diff --git a/pandera/backends/pyspark/container.py b/pandera/backends/pyspark/container.py
index fb04064d4..6d2ef2683 100644
--- a/pandera/backends/pyspark/container.py
+++ b/pandera/backends/pyspark/container.py
@@ -283,7 +283,7 @@ def collect_schema_components(
         schema_components = []
         for col_name, column in schema.columns.items():
             if (
-                column.required or col_name in check_obj
+                column.required or col_name in check_obj.columns
             ) and col_name not in column_info.lazy_exclude_column_names:
                 column = copy.deepcopy(column)
                 if schema.dtype is not None:
diff --git a/tests/pyspark/test_pyspark_model.py b/tests/pyspark/test_pyspark_model.py
index 1bdbacd12..80cabe2be 100644
--- a/tests/pyspark/test_pyspark_model.py
+++ b/tests/pyspark/test_pyspark_model.py
@@ -300,18 +300,56 @@ def function_expected():
     )
 
 
-def test_optional_column() -> None:
-    """Test that optional columns are not required."""
+# Module-scoped fixture: a DataFrameModel whose columns are all optional.
+@pytest.fixture(scope="module", name="test_schema_optional_columns")
+def test_schema():
+    """Fixture containing DataFrameModel with optional columns."""
+
+    class Schema(pa.DataFrameModel):
+        """Simple DataFrameModel containing optional columns."""
 
-    class Schema(DataFrameModel):  # pylint:disable=missing-class-docstring
         a: Optional[str]
         b: Optional[str] = pa.Field(eq="b")
         c: Optional[str]  # test pandera.typing alias
 
-    schema = Schema.to_schema()
-    assert not schema.columns["a"].required
-    assert not schema.columns["b"].required
-    assert not schema.columns["c"].required
+    return Schema
+
+
+def test_optional_column(test_schema_optional_columns) -> None:
+    """Test that optional columns are not required."""
+
+    schema = test_schema_optional_columns.to_schema()
+    assert not schema.columns[
+        "a"
+    ].required, "Optional column 'a' shouldn't be required"
+    assert not schema.columns[
+        "b"
+    ].required, "Optional column 'b' shouldn't be required"
+    assert not schema.columns[
+        "c"
+    ].required, "Optional column 'c' shouldn't be required"
+
+
+def test_validation_succeeds_with_missing_optional_column(
+    spark, test_schema_optional_columns
+) -> None:
+    """Test that validation succeeds even when an optional column is missing."""
+
+    data = [("5", "b"), ("15", "b")]
+    spark_schema = T.StructType(
+        [
+            T.StructField("a", T.StringType(), False),
+            T.StructField("b", T.StringType(), False),
+            # 'c' column is missing, but it's optional
+        ],
+    )
+    df = spark_df(spark, data, spark_schema)
+    df_out = test_schema_optional_columns.validate(check_obj=df)
+
+    # `df_out.pandera.errors` should be empty if validation is successful.
+    assert (
+        df_out.pandera.errors == {}
+    ), "No error should be raised in case of a missing optional column."
 
 
 def test_invalid_field() -> None: