Skip to content

Commit

Permalink
Merge pull request #51 from UBC-MDS/validate-data
Browse files Browse the repository at this point in the history
Validate data updates
  • Loading branch information
jasonmlee authored Nov 30, 2024
2 parents a77030f + 9fd38f4 commit 5c698bc
Show file tree
Hide file tree
Showing 3 changed files with 115 additions and 63 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,5 @@ __pycache__/
.ipynb_checkpoints/


.github
.config
174 changes: 112 additions & 62 deletions notebooks/age_group_classification.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -994,6 +994,56 @@
{
"cell_type": "code",
"execution_count": 12,
"id": "330d0af2",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"True\n"
]
}
],
"source": [
"# Validation check: Target/response variable follows expected distribution\n",
"\n",
"def validate_category_distribution(y_train, age_group_thresholds, tolerance):\n",
" \"\"\"\n",
" Validate if a categorical variable's distribution meets specified thresholds with tolerance.\n",
"\n",
" Parameters:\n",
" - y_train (pd.Series): The categorical variable (target/response variable).\n",
" - age_group_thresholds (dict): Minimum and maximum proportion thresholds for each category.\n",
" - tolerance (float): The tolerance to apply when checking proportions.\n",
"\n",
" Returns:\n",
" - bool: True if the distribution meets the thresholds with tolerance, False otherwise.\n",
" \"\"\"\n",
" value_counts = y_train.value_counts(normalize=True) # Get proportions\n",
"\n",
" # Loop through each category and its thresholds\n",
" for category, (min_threshold, max_threshold) in age_group_thresholds.items():\n",
" proportion = value_counts.get(category, 0) # Get proportion for the category\n",
" \n",
" # Check if the proportion is within the threshold range with tolerance\n",
" if not (min_threshold - tolerance <= proportion <= max_threshold + tolerance):\n",
" return False # Return False if the proportion is out of the acceptable range\n",
" \n",
" return True # Return True if all categories meet the criteria\n",
"\n",
"\n",
"age_group_thresholds = {\"Adult\": (0.2, 0.9), \"Senior\": (0.2, 0.9)}\n",
"tolerance = 0.05\n",
"\n",
"# Validate the distribution\n",
"is_valid = validate_category_distribution(y_train, age_group_thresholds, tolerance)\n",
"print(is_valid)\n"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "376e656d",
"metadata": {},
"outputs": [],
Expand Down Expand Up @@ -1048,7 +1098,7 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 14,
"id": "3f082286-59b7-4e2e-b9ea-d9487243c73d",
"metadata": {},
"outputs": [
Expand All @@ -1059,7 +1109,7 @@
"alt.RepeatChart(...)"
]
},
"execution_count": 13,
"execution_count": 14,
"metadata": {
"image/png": {
"height": 1443,
Expand Down Expand Up @@ -1123,7 +1173,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 15,
"id": "567c1d92-5bf3-4d4e-ae96-d58b2c5bc56a",
"metadata": {},
"outputs": [],
Expand Down Expand Up @@ -1167,7 +1217,7 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 16,
"id": "e0c21311-d2a0-4ec1-ba98-7db4753f2b14",
"metadata": {},
"outputs": [
Expand Down Expand Up @@ -1208,36 +1258,36 @@
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0.003508</td>\n",
" <td>0.001606</td>\n",
" <td>0.012185</td>\n",
" <td>0.005086</td>\n",
" <td>0.847887</td>\n",
" <td>0.847887</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0.002565</td>\n",
" <td>0.001945</td>\n",
" <td>0.003144</td>\n",
" <td>0.001268</td>\n",
" <td>0.847887</td>\n",
" <td>0.847887</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0.002816</td>\n",
" <td>0.001523</td>\n",
" <td>0.002095</td>\n",
" <td>0.001168</td>\n",
" <td>0.847887</td>\n",
" <td>0.847887</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0.002603</td>\n",
" <td>0.001459</td>\n",
" <td>0.001958</td>\n",
" <td>0.001162</td>\n",
" <td>0.847887</td>\n",
" <td>0.847887</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0.002390</td>\n",
" <td>0.001417</td>\n",
" <td>0.001904</td>\n",
" <td>0.001143</td>\n",
" <td>0.847887</td>\n",
" <td>0.847887</td>\n",
" </tr>\n",
Expand All @@ -1247,14 +1297,14 @@
],
"text/plain": [
" fit_time score_time test_score train_score\n",
"0 0.003508 0.001606 0.847887 0.847887\n",
"1 0.002565 0.001945 0.847887 0.847887\n",
"2 0.002816 0.001523 0.847887 0.847887\n",
"3 0.002603 0.001459 0.847887 0.847887\n",
"4 0.002390 0.001417 0.847887 0.847887"
"0 0.012185 0.005086 0.847887 0.847887\n",
"1 0.003144 0.001268 0.847887 0.847887\n",
"2 0.002095 0.001168 0.847887 0.847887\n",
"3 0.001958 0.001162 0.847887 0.847887\n",
"4 0.001904 0.001143 0.847887 0.847887"
]
},
"execution_count": 15,
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
Expand Down Expand Up @@ -1282,7 +1332,7 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 17,
"id": "9665e1e4-be90-4e1c-95df-14b71e3dd8b8",
"metadata": {},
"outputs": [
Expand Down Expand Up @@ -1323,36 +1373,36 @@
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0.008831</td>\n",
" <td>0.001605</td>\n",
" <td>0.006177</td>\n",
" <td>0.001391</td>\n",
" <td>0.856338</td>\n",
" <td>0.842958</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0.006077</td>\n",
" <td>0.001480</td>\n",
" <td>0.004904</td>\n",
" <td>0.001316</td>\n",
" <td>0.839437</td>\n",
" <td>0.846479</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0.011404</td>\n",
" <td>0.001568</td>\n",
" <td>0.006408</td>\n",
" <td>0.002851</td>\n",
" <td>0.845070</td>\n",
" <td>0.846479</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0.006722</td>\n",
" <td>0.005142</td>\n",
" <td>0.010664</td>\n",
" <td>0.002238</td>\n",
" <td>0.842254</td>\n",
" <td>0.843662</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0.008283</td>\n",
" <td>0.001538</td>\n",
" <td>0.004707</td>\n",
" <td>0.002686</td>\n",
" <td>0.845070</td>\n",
" <td>0.845775</td>\n",
" </tr>\n",
Expand All @@ -1362,14 +1412,14 @@
],
"text/plain": [
" fit_time score_time test_score train_score\n",
"0 0.008831 0.001605 0.856338 0.842958\n",
"1 0.006077 0.001480 0.839437 0.846479\n",
"2 0.011404 0.001568 0.845070 0.846479\n",
"3 0.006722 0.005142 0.842254 0.843662\n",
"4 0.008283 0.001538 0.845070 0.845775"
"0 0.006177 0.001391 0.856338 0.842958\n",
"1 0.004904 0.001316 0.839437 0.846479\n",
"2 0.006408 0.002851 0.845070 0.846479\n",
"3 0.010664 0.002238 0.842254 0.843662\n",
"4 0.004707 0.002686 0.845070 0.845775"
]
},
"execution_count": 16,
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
Expand Down Expand Up @@ -1397,7 +1447,7 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 18,
"id": "886719d1-ce3a-43ed-b917-6d9110364a42",
"metadata": {},
"outputs": [
Expand Down Expand Up @@ -1438,36 +1488,36 @@
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0.026676</td>\n",
" <td>0.012726</td>\n",
" <td>0.039943</td>\n",
" <td>0.015553</td>\n",
" <td>0.853521</td>\n",
" <td>0.851408</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0.027358</td>\n",
" <td>0.017028</td>\n",
" <td>0.028877</td>\n",
" <td>0.006685</td>\n",
" <td>0.847887</td>\n",
" <td>0.854930</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0.029954</td>\n",
" <td>0.021583</td>\n",
" <td>0.034430</td>\n",
" <td>0.015849</td>\n",
" <td>0.850704</td>\n",
" <td>0.851408</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0.018470</td>\n",
" <td>0.011419</td>\n",
" <td>0.015580</td>\n",
" <td>0.005749</td>\n",
" <td>0.847887</td>\n",
" <td>0.856338</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0.018552</td>\n",
" <td>0.011388</td>\n",
" <td>0.015271</td>\n",
" <td>0.005909</td>\n",
" <td>0.839437</td>\n",
" <td>0.853521</td>\n",
" </tr>\n",
Expand All @@ -1477,14 +1527,14 @@
],
"text/plain": [
" fit_time score_time test_score train_score\n",
"0 0.026676 0.012726 0.853521 0.851408\n",
"1 0.027358 0.017028 0.847887 0.854930\n",
"2 0.029954 0.021583 0.850704 0.851408\n",
"3 0.018470 0.011419 0.847887 0.856338\n",
"4 0.018552 0.011388 0.839437 0.853521"
"0 0.039943 0.015553 0.853521 0.851408\n",
"1 0.028877 0.006685 0.847887 0.854930\n",
"2 0.034430 0.015849 0.850704 0.851408\n",
"3 0.015580 0.005749 0.847887 0.856338\n",
"4 0.015271 0.005909 0.839437 0.853521"
]
},
"execution_count": 17,
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
Expand Down Expand Up @@ -1512,7 +1562,7 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 19,
"id": "029a372d-e405-46d8-9dbf-ddbac571eb80",
"metadata": {},
"outputs": [
Expand Down Expand Up @@ -1556,7 +1606,7 @@
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": 20,
"id": "0a1ee050-4cae-4dbf-af98-da0669312e4f",
"metadata": {},
"outputs": [
Expand Down Expand Up @@ -1586,7 +1636,7 @@
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": 21,
"id": "33ad4c4e-8807-4946-89cc-b194fc49ee5b",
"metadata": {},
"outputs": [
Expand All @@ -1600,10 +1650,10 @@
{
"data": {
"text/plain": [
"<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x30bcc4890>"
"<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x340b16690>"
]
},
"execution_count": 20,
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
},
Expand Down Expand Up @@ -1641,7 +1691,7 @@
},
{
"cell_type": "code",
"execution_count": 21,
"execution_count": 22,
"id": "18d9f096-5f54-429c-9ae9-b0dbc1c7dde6",
"metadata": {},
"outputs": [
Expand Down Expand Up @@ -1729,9 +1779,9 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python [conda env:DSCI522-39-FMJ]",
"display_name": "DSCI522-39-FMJ",
"language": "python",
"name": "conda-env-DSCI522-39-FMJ-py"
"name": "python3"
},
"language_info": {
"codemirror_mode": {
Expand Down
2 changes: 1 addition & 1 deletion notebooks/validation_errors.log
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
2024-11-29 13:24:01,680 -
2024-11-29 17:37:31,943 -
{
"DATA": {
"DATAFRAME_CHECK": [
Expand Down

0 comments on commit 5c698bc

Please sign in to comment.