
Commit

update test cases
Forsythia-olive committed Dec 14, 2024
1 parent 3939631 commit 6c2cfdc
Showing 4 changed files with 91 additions and 51 deletions.
12 changes: 10 additions & 2 deletions src/validate_train_data.py
@@ -17,14 +17,22 @@ def validate_category_distribution(y_train, age_group_thresholds, tolerance):
Returns:
- bool: True if the distribution meets the thresholds with tolerance, False otherwise.
"""
value_counts = y_train.value_counts(normalize=True) # Get proportions
if y_train.empty:
return False

# Get the proportions of categories in y_train
value_counts = y_train.value_counts(normalize=True)

# Loop through each category and its thresholds
for category, (min_threshold, max_threshold) in age_group_thresholds.items():
proportion = value_counts.get(category, 0) # Get proportion for the category

# If the category is missing in y_train (proportion == 0), return False
if proportion == 0:
return False # Missing category should cause failure

# Check if the proportion is within the threshold range with tolerance
if not (min_threshold - tolerance <= proportion <= max_threshold + tolerance):
return False # Return False if the proportion is out of the acceptable range

return True # Return True if all categories meet the criteria
return True
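For context, here is a minimal usage sketch of the updated validate_category_distribution (not part of the commit); the values are adapted from Case 3 of testcases/test_validate_train_data.py below, and the import path assumes the repository layout used in this commit.

# Minimal usage sketch (not part of the commit): values adapted from Case 3
# of testcases/test_validate_train_data.py.
import pandas as pd

from src.validate_train_data import validate_category_distribution

y_train = pd.Series(["A", "A", "B", "B", "B", "C", "C", "C"])
thresholds = {
    "A": (0.1, 0.4),  # "A" is 2/8 = 0.25, inside the band
    "B": (0.2, 0.4),  # "B" is 3/8 = 0.375, inside the band
    "C": (0.2, 0.5),  # "C" is 3/8 = 0.375, inside the band
}

# True: every expected category is present and within its (tolerance-widened) band
print(validate_category_distribution(y_train, thresholds, tolerance=0.05))

# False: category "D" is expected but missing from y_train
thresholds["D"] = (0.1, 0.3)
print(validate_category_distribution(y_train, thresholds, tolerance=0.05))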
71 changes: 44 additions & 27 deletions testcases/test_mean_cross_validation_score.py
@@ -3,6 +3,7 @@
# date: 2024-12-10

import pytest
import unittest
import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
@@ -15,38 +16,54 @@
from src.mean_cross_validation_score import mean_cross_val_scores


def test_mean_cross_val_scores():
# Generate a simple classification dataset
X, y = make_classification(
n_samples=200, n_features=10, n_informative=5, n_redundant=0, random_state=42
)
class test_mean_cross_val_scores(unittest.TestCase):

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
def setUp(self):
# Generate a simple classification dataset
self.X, self.y = make_classification(
n_samples=200, n_features=10, n_informative=5, n_redundant=0, random_state=42
)

# Initialize models to test
rf_model = RandomForestClassifier(random_state=42)
lr_model = LogisticRegression(random_state=42, max_iter=500)
# Split the dataset
self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
self.X, self.y, test_size=0.2, random_state=42
)

# Test RandomForestClassifier
rf_scores = mean_cross_val_scores(rf_model, X_train, y_train, cv=5, scoring="accuracy")
assert isinstance(rf_scores, pd.Series), "Output should be a pandas Series."
assert "test_score" in rf_scores.index, "Expected 'test_score' in Series index."
assert rf_scores["test_score"] > 0.5, "Mean accuracy should be above 0.5 for RandomForestClassifier."
# Initialize models to test
self.rf_model = RandomForestClassifier(random_state=42)
self.lr_model = LogisticRegression(random_state=42, max_iter=500)

# Test LogisticRegression
lr_scores = mean_cross_val_scores(lr_model, X_train, y_train, cv=5, scoring="accuracy")
assert isinstance(lr_scores, pd.Series), "Output should be a pandas Series."
assert "test_score" in lr_scores.index, "Expected 'test_score' in Series index."
assert lr_scores["test_score"] > 0.5, "Mean accuracy should be above 0.5 for LogisticRegression."
def test_rf_model_cross_val_scores(self):
# Test RandomForestClassifier
rf_scores = mean_cross_val_scores(self.rf_model, self.X_train, self.y_train, cv=5, scoring="accuracy")

self.assertIsInstance(rf_scores, pd.Series, "Output should be a pandas Series.")
self.assertIn("test_score", rf_scores.index, "Expected 'test_score' in Series index.")
self.assertGreater(rf_scores["test_score"], 0.5, "Mean accuracy should be above 0.5 for RandomForestClassifier.")

# Validate the standard deviation
assert "test_score" in lr_scores.index, "Expected 'test_score' in Series index."
assert "fit_time" in rf_scores.index, "Expected 'fit_time' in Series index."
def test_lr_model_cross_val_scores(self):
# Test LogisticRegression
lr_scores = mean_cross_val_scores(self.lr_model, self.X_train, self.y_train, cv=5, scoring="accuracy")

self.assertIsInstance(lr_scores, pd.Series, "Output should be a pandas Series.")
self.assertIn("test_score", lr_scores.index, "Expected 'test_score' in Series index.")
self.assertGreater(lr_scores["test_score"], 0.5, "Mean accuracy should be above 0.5 for LogisticRegression.")

# Check for consistent mean vs. computed values
assert np.isfinite(rf_scores["test_score"]), "Mean test_score should be finite."
assert np.isfinite(lr_scores["test_score"]), "Mean test_score should be finite."
def test_standard_deviation_inclusion(self):
# Validate that the returned Series includes the expected summary fields
rf_scores = mean_cross_val_scores(self.rf_model, self.X_train, self.y_train, cv=5, scoring="accuracy")
lr_scores = mean_cross_val_scores(self.lr_model, self.X_train, self.y_train, cv=5, scoring="accuracy")

self.assertIn("test_score", lr_scores.index, "Expected 'test_score' in Series index.")
self.assertIn("fit_time", rf_scores.index, "Expected 'fit_time' in Series index.")

def test_finite_test_scores(self):
# Check that the mean test scores are finite values
rf_scores = mean_cross_val_scores(self.rf_model, self.X_train, self.y_train, cv=5, scoring="accuracy")
lr_scores = mean_cross_val_scores(self.lr_model, self.X_train, self.y_train, cv=5, scoring="accuracy")

self.assertTrue(np.isfinite(rf_scores["test_score"]), "Mean test_score should be finite.")
self.assertTrue(np.isfinite(lr_scores["test_score"]), "Mean test_score should be finite.")

if __name__ == "__main__":
pytest.main(["-v", "testcases/test_mean_cross_validation_score.py"])
unittest.main()
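The assertions above only pin down the interface of mean_cross_val_scores: it returns a pandas Series whose index includes "fit_time" and "test_score", with finite mean test scores. A sketch that satisfies that interface, assuming (not confirming) that the repository's implementation simply averages the output of scikit-learn's cross_validate, looks like this:

# Hypothetical sketch of mean_cross_val_scores, inferred from the assertions
# above; the actual src/mean_cross_validation_score.py may differ.
import pandas as pd
from sklearn.model_selection import cross_validate


def mean_cross_val_scores(model, X_train, y_train, **kwargs):
    """Return the mean of each cross_validate metric as a pandas Series."""
    scores = cross_validate(model, X_train, y_train, **kwargs)
    # cross_validate returns arrays keyed by "fit_time", "score_time" and
    # "test_score"; the column-wise mean is a Series indexed by those names.
    return pd.DataFrame(scores).mean()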
14 changes: 13 additions & 1 deletion testcases/test_validate_train_data.py
@@ -3,7 +3,7 @@
# date: 2024-12-11


#import pytest
import pytest
import pandas as pd
import os
import sys
@@ -23,10 +23,22 @@ def test_validate_category_distribution():

# Case 2: Invalid category distribution (A proportion too high)
y_train = pd.Series(["A", "A", "A", "B", "B", "C", "C", "C"])
thresholds = {
"A": (0.1, 0.3),
"B": (0.2, 0.4),
"C": (0.2, 0.5)
}
assert validate_category_distribution(y_train, thresholds, tolerance) == False, "Distribution exceeding threshold should fail."

# Case 3: Missing category
thresholds = {
"A": (0.1, 0.4),
"B": (0.2, 0.4),
"C": (0.2, 0.5),
"D": (0.1, 0.3) # Adding a missing category 'D'
}
y_train = pd.Series(["A", "A", "B", "B", "B", "C", "C", "C"])
tolerance = 0.05
assert validate_category_distribution(y_train, thresholds, tolerance) == False, "Missing category should fail."

# Case 4: All categories meet thresholds with tolerance
45 changes: 24 additions & 21 deletions testcases/test_write_csv.py
@@ -2,7 +2,7 @@
# author: Forgive Agbesi
# date: 2024-12-11


import pytest
import os
import unittest
import pandas as pd
@@ -22,6 +22,12 @@ def setUp(self):
'Column2': [4, 5, 6]
}
self.sample_df = pd.DataFrame(self.sample_data)

def test_invalid_filename_extension(self):
"""Test that a ValueError is raised if the filename doesn't end with '.csv'."""
filename = 'test_file.txt'
with self.assertRaises(ValueError):
write_csv(self.sample_df, self.test_dir, filename)

def tearDown(self):
"""Clean up by removing the test directory and files created."""
@@ -31,38 +37,35 @@ def tearDown(self):
os.remove(file_path)
os.rmdir(self.test_dir)

def test_empty_dataframe(self):
"""Test that a ValueError is raised if the DataFrame is empty."""
empty_df = pd.DataFrame()
filename = 'test_file.csv'
with self.assertRaises(ValueError):
write_csv(empty_df, self.test_dir, filename)

def test_write_valid_csv(self):
"""Test that the DataFrame is correctly saved as a CSV file."""
filename = 'test_file.csv'
write_csv(self.sample_df, self.test_dir, filename)
self.assertTrue(os.path.exists(os.path.join(self.test_dir, filename)))

def test_invalid_filename_extension(self):
"""Test that a ValueError is raised if the filename doesn't end with '.csv'."""
filename = 'test_file.txt'
with self.assertRaises(ValueError):
write_csv(self.sample_df, self.test_dir, filename)

def test_non_existing_directory(self):
"""Test that a FileNotFoundError is raised if the directory doesn't exist."""
invalid_dir = 'invalid_dir'
filename = 'test_file.csv'
with self.assertRaises(FileNotFoundError):
write_csv(self.sample_df, invalid_dir, filename)

def test_invalid_dataframe(self):
"""Test that a TypeError is raised if the input is not a pandas DataFrame."""
invalid_df = "invalid"
filename = 'test_file.csv'
with self.assertRaises(TypeError):
write_csv(invalid_df, self.test_dir, filename)
def test_empty_dataframe(self):
"""Test that a ValueError is raised if the DataFrame is empty."""
empty_df = pd.DataFrame()
filename = 'test_file.csv'
with self.assertRaises(ValueError):
write_csv(empty_df, self.test_dir, filename)

def test_non_existing_directory(self):
"""Test that a FileNotFoundError is raised if the directory doesn't exist."""
invalid_dir = 'invalid_dir'
filename = 'test_file.csv'
with self.assertRaises(FileNotFoundError):
write_csv(self.sample_df, invalid_dir, filename)

if __name__ == '__main__':
if __name__ == "__main__":
unittest.main()
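Taken together, the reordered tests pin down write_csv's expected behaviour: TypeError for non-DataFrame input, ValueError for an empty DataFrame or a filename that does not end in '.csv', FileNotFoundError for a missing directory, and otherwise a CSV written into the target directory. A sketch consistent with those expectations follows; the repository's actual implementation may differ, for example in the index=False choice, which the tests do not constrain.

# Hypothetical sketch of write_csv, inferred from the behaviours asserted in
# the tests above; the repository's real implementation may differ.
import os
import pandas as pd


def write_csv(df, directory, filename):
    """Validate inputs and write df to <directory>/<filename> as a CSV file."""
    if not isinstance(df, pd.DataFrame):
        raise TypeError("df must be a pandas DataFrame.")
    if df.empty:
        raise ValueError("DataFrame must not be empty.")
    if not filename.endswith(".csv"):
        raise ValueError("filename must end with '.csv'.")
    if not os.path.isdir(directory):
        raise FileNotFoundError(f"Directory does not exist: {directory}")
    df.to_csv(os.path.join(directory, filename), index=False)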


