diff --git a/.bash_history b/.bash_history deleted file mode 100644 index e69de29..0000000 diff --git a/.github/workflows/docker-publish.yml b/.github/workflows/docker-publish.yml index d4b06bf..42ce9ba 100644 --- a/.github/workflows/docker-publish.yml +++ b/.github/workflows/docker-publish.yml @@ -11,6 +11,7 @@ on: paths: - 'Dockerfile' - 'conda-linux-64.lock' + - 'requirements.txt' jobs: push_to_registry: diff --git a/.local/share/jupyter/runtime/jpserver-7-open.html b/.local/share/jupyter/runtime/jpserver-7-open.html deleted file mode 100644 index 481090f..0000000 --- a/.local/share/jupyter/runtime/jpserver-7-open.html +++ /dev/null @@ -1,17 +0,0 @@ - - - - - - - Opening Jupyter Application - - - -

- This page should redirect you to a Jupyter application. If it doesn't, - click here to go to Jupyter. -

 - - - \ No newline at end of file diff --git a/.local/share/jupyter/runtime/jpserver-7.json b/.local/share/jupyter/runtime/jpserver-7.json deleted file mode 100644 index 80129ab..0000000 --- a/.local/share/jupyter/runtime/jpserver-7.json +++ /dev/null @@ -1,13 +0,0 @@ -{ - "base_url": "/", - "hostname": "0.0.0.0", - "password": false, - "pid": 7, - "port": 8888, - "root_dir": "/home/jovyan", - "secure": false, - "sock": "", - "token": "3f215dc744f7f6a7b9862bab25e384a4f945433e18bd349f", - "url": "http://323fcf1aabb3:8888/", - "version": "2.12.5" -} \ No newline at end of file diff --git a/.local/share/jupyter/runtime/jupyter_cookie_secret b/.local/share/jupyter/runtime/jupyter_cookie_secret deleted file mode 100644 index 4552dbb..0000000 --- a/.local/share/jupyter/runtime/jupyter_cookie_secret +++ /dev/null @@ -1 +0,0 @@ -1oxZ8IMfji6y/SsTsU05V6GcJDmjo85Rm4scaeQV8oY= diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..2e1bb4c --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,71 @@ +--- +editor_options: + markdown: + wrap: 72 +--- + +Revisions: + +Who: Merari Santana + +What was addressed: + +- Scripts in the README file were not running. Description of Revision: + I revised the instructions for running the Makefile. This runs all + the scripts correctly. Evidence: + + +- Improved accessibility to our report. Description of Revision: I + deployed GitHub Pages so that our README file has a direct link to + our HTML report. Evidence: + + +- Change acronyms in final report and delete bullet points. + Description of Revision: I changed the acronyms in our qmd file and + deleted bullet points. These changes were rendered to our pdf and + html files. Evidence: + + + +Who: Gurmehak Kaur + +What was addressed: + +- Improve the project folder structure. Description of Revision: I cleaned up and improved the project’s folder structure by organizing files into dedicated folders that were previously missing in our repo: `reports/` for generated summaries, `results/` with subfolders for tables and figures for visualizations, `scripts/` for executable workflows and `src/` for abstract functions. This streamlined structure improves clarity and project maintainability. + Evidence: + + + +Who: Ke Gao + +What was addressed: + +- Improve Automatic Numbering of Figures in the Report. Description of + Revision: I improved automatic numbering of figures in the report. + Evidence: + + +- Improve Automatic Numbering of Tables in the Report. Description of + Revision: I improved automatic numbering of tables in the report. + Evidence: + + +Who: Yuhan Fan + +What was addressed: + +- Updated README.md with the following: + + - Updated the 'About' section of README.md with the most recent + results metrics from our final report, and fixed any grammar + errors. + + - Deleted bullet point and capitalized "contributors" in + README.md. + + - Added GitHub repository link under 'Usage' - 'Setup' section. + + - Added example screenshot image to 'Running the analysis' + section. 
 + + - Evidence: + diff --git a/Dockerfile b/Dockerfile index 47fcdf4..f266623 100644 --- a/Dockerfile +++ b/Dockerfile @@ -9,6 +9,8 @@ USER root RUN sudo apt update \ && sudo apt install -y lmodern +RUN apt-get update && apt-get install -y build-essential make + USER $NB_UID RUN mamba update --quiet --file /tmp/conda-linux-64.lock @@ -17,8 +19,7 @@ RUN mamba clean --all -y -f RUN pip install --no-cache-dir -r /tmp/requirements.txt RUN pip cache purge + RUN fix-permissions "${CONDA_DIR}" RUN fix-permissions "/home/${NB_USER}" -RUN pip install deepchecks==0.18.1 - diff --git a/Makefile b/Makefile index 900f493..7102868 100644 --- a/Makefile +++ b/Makefile @@ -1,9 +1,16 @@ .PHONY: all clean -all: report/heart_failure_analysis.html report/heart_failure_analysis.pdf +all: data/raw/heart_failure_clinical_records.data \ + data/processed/heart_failure_train.csv \ + results/figures/correlation_heatmap.png \ + results/models/pipeline.pickle results/figures/training_plots \ + results/tables/confusion_matrix.csv \ + results/tables/test_scores.csv \ + reports/heart-failure-analysis.html \ + reports/heart-failure-analysis.pdf # Download and convert data -data/raw/heart_failure_clinical_records.data : scripts/download_and_convert.py +data/raw/heart_failure_clinical_records.data: scripts/download_and_convert.py python scripts/download_and_convert.py \ --url="https://archive.ics.uci.edu/static/public/519/heart+failure+clinical+records.zip" \ --write_to=data/raw @@ -11,47 +18,56 @@ data/raw/heart_failure_clinical_records.data : scripts/download_and_convert.py # Process and analyze data data/processed/heart_failure_train.csv data/processed/heart_failure_test.csv : scripts/process_and_analyze.py data/raw/heart_failure_clinical_records.data python scripts/process_and_analyze.py \ - --file_path=data/raw/heart_failure_clinical_records.data \ - --data-to=data/processed + --file_path="data/raw/heart_failure_clinical_records_dataset_converted.csv" \ + --output_dir=data/processed # Perform correlation analysis results/figures/correlation_heatmap.png : scripts/correlation_analysis.py data/processed/heart_failure_train.csv data/processed/heart_failure_test.csv python scripts/correlation_analysis.py \ --train_file=data/processed/heart_failure_train.csv \ --test_file=data/processed/heart_failure_test.csv \ - --output_file=results/figures/correlation_heatmap.png + --output_file="results/figures/correlation_heatmap.png" # Train and evaluate the model -results/models/pipeline.pickle results/figures/training_plots : scripts/modelling.py data/processed/heart_failure_train.csv - python scripts/modelling.py \ - --training-data=data/processed/heart_failure_train.csv \ - --pipeline-to=results/models \ - --plot-to=results/figures \ - --seed=123 - -results/tables/test_evaluation.csv : scripts/model_evaluation.py data/processed/heart_failure_test.csv results/models/pipeline.pickle +results/models/pipeline.pickle results/figures/training_plots: scripts/modelling.py data/processed/heart_failure_train.csv + python scripts/modelling.py \ + --training-data "./data/processed/heart_failure_train.csv" \ + --pipeline-to "results/models" \ + --plot-to "results/figures" \ + --table-to "results/tables" \ + --seed 123 + +results/tables/confusion_matrix.csv results/tables/test_scores.csv: scripts/model_evaluation.py data/processed/heart_failure_test.csv results/models/pipeline.pickle python scripts/model_evaluation.py \ - --scaled-test-data=data/processed/heart_failure_test.csv \ - --pipeline-from=results/models/pipeline.pickle \ - --results-to=results/tables 
 + --scaled-test-data "data/processed/heart_failure_test.csv" \ + --pipeline-from "results/models/pipeline.pickle" \ + --results-to "results/tables" # Build HTML and PDF reports -report/heart_failure_analysis.html report/heart_failure_analysis.pdf : report/heart_failure_analysis.qmd \ -results/models/pipeline.pickle \ -results/figures/heatmap.html \ -results/figures/training_plots \ -results/tables/test_evaluation.csv - quarto render report/heart_failure_analysis.qmd --to html - quarto render report/heart_failure_analysis.qmd --to pdf +# Rule to generate HTML +reports/heart-failure-analysis.html: reports/heart-failure-analysis.qmd + quarto render reports/heart-failure-analysis.qmd --to html --embed-resources --standalone + +# Rule to generate PDF +reports/heart-failure-analysis.pdf: reports/heart-failure-analysis.qmd + quarto render reports/heart-failure-analysis.qmd --to pdf + # Clean up analysis clean: - rm -rf data/raw/* - rm -f results/data/processed/heart_failure_train.csv \ - results/data/processed/heart_failure_test.csv \ - results/models/pipeline.pickle \ - results/figures/heatmap.html \ - results/figures/training_plots \ - results/tables/test_evaluation.csv \ - report/heart_failure_analysis.html \ - report/heart_failure_analysis.pdf + rm -rf \ + data/processed/* \ + results/figures/* \ + results/img/* \ + results/models/* \ + results/pipeline/* + + rm -f \ + results/tables/test_scores.csv \ + results/tables/confusion_matrix.csv \ + results/tables/logistic_regression_coefficients.csv \ + reports/heart-failure-analysis.html \ + reports/heart-failure-analysis.pdf + + diff --git a/README.md b/README.md index 3154043..97c4c08 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,17 @@ # Heart Failure Analysis -- contributors: Yuhan Fan, Gurmehak Kaur, Ke Gao, Merari Santana +Contributors: Yuhan Fan, Gurmehak Kaur, Ke Gao, Merari Santana ## About -In this project, we attempt to build a classification model using logistic regression algorithm to predict patient mortality risk after surviving a heart attack using their medical records. Using patient test results, the final classifier achieves an accuracy of 81.6%. The model’s precision of 70.0% suggests it is moderately conservative in predicting the positive class (death), minimizing false alarms.More importantly, the recall of 73.68% ensures the model identifies the majority of high-risk patients, reducing the likelihood of missing true positive cases, however, there is still room for a lot of improvement, particularly in aiming to maximise recall by minimising False Negatives. The F1-score of 0.71 reflects a good balance between precision and recall, highlighting the model’s robustness in survival prediction. While promising, further refinements are essential for more reliable predictions and effectively early intervention. +In this project, we attempt to build a classification model using the logistic regression algorithm to predict patient mortality risk after surviving a heart attack using their medical records. Using patient test results, the final classifier achieves an accuracy of 0.82. The model’s precision of 0.70 suggests it is moderately conservative in predicting the positive class (death), minimizing false alarms. More importantly, the recall of 0.74 ensures the model identifies the majority of high-risk patients, reducing the likelihood of missing true positive cases; however, there is still a lot of room for improvement, particularly in aiming to maximise recall by minimising False Negatives. 
 The F1-score of 0.72 reflects a good balance between precision and recall, highlighting the model’s robustness in survival prediction. While promising, further refinements are essential for more reliable predictions and effective early intervention. The data set used in this project was created by D. Chicco and G. Jurman in 2020. It was sourced from the UCI Machine Learning Repository and can be found [here](https://archive.ics.uci.edu/dataset/519/heart+failure+clinical+records). The data set contains the medical records of 299 patients who had heart failure, collected during their follow-up period, where each patient profile has 13 clinical features (age, anaemia, diabetes, platelets, etc.). +## Report + +The final report can be found [here](https://ubc-mds.github.io/heart-failure-analysis/reports/heart-failure-analysis.html). + ## Dependencies - Docker @@ -20,21 +24,30 @@ The data set used in this project was created by D. Chicco, Giuseppe Jurman in 2 > If you are using Windows or Mac, make sure Docker Desktop is running. -1. Clone this GitHub repository. +1. Clone this [GitHub repository](https://github.com/UBC-MDS/heart-failure-analysis/tree/main). ### Running the analysis -1. Navigate to the root of this project on your computer using the command line and enter the following command: +2. Navigate to the root of this project on your computer using your local terminal and then enter the following command: ``` docker compose up ``` -2. In the terminal, look for a URL that starts with [`http://127.0.0.1:8888/lab?token=`](http://127.0.0.1:8888/lab?token=) (for an example, see the highlighted text in the terminal below). Copy and paste that URL into your browser. +3. In the terminal output, look for a URL that starts with `http://127.0.0.1:8888/lab?token=` (for an example, see the highlighted text in the terminal below). Copy and paste that URL into your browser. + + +4. Navigate to the root of this project on your computer using the command line and enter the following command to reset the project to a clean state (i.e., remove all files generated by previous runs of the analysis): + +``` +make clean +``` - +5. To run the analysis in its entirety, enter the following command in the terminal in the project root: -3. To run the analysis, open `heart-failure-analysis.ipynb` in Jupyter Lab you just launched and under the "Kernel" menu click "Restart Kernel and Run All Cells...". +``` +make all +``` ### Clean up @@ -61,37 +74,6 @@ docker compose up 6. Send a pull request to merge the changes into the `main` branch. 
-### Calling scripts - -To run the analysis, open a terminal and run the following commands and their respective arguments: - -``` -python scripts/download_and_convert.py \ - --url "https://archive.ics.uci.edu/static/public/519/heart+failure+clinical+records.zip" - -python scripts/process_and_analyze.py \ - --file_path "../data/heart_failure_clinical_records_dataset_converted.csv" - -python scripts/correlation_analysis.py \ - --train_file "./data/processed/heart_failure_train.csv" \ - --test_file "./data/processed/heart_failure_test.csv" \ - --output_file "./results/figures/heatmap.html" - -python scripts/modelling.py \ - --training-data "./data/processed/heart_failure_train.csv" \ - --pipeline-to "results/pipeline" \ - --plot-to "results/figures" \ - --seed 123 - -python scripts/model_evaluation.py \ - --scaled-test-data=data/processed/heart_failure_test.csv \ - --pipeline-from=results/pipeline/heart_failure_model.pickle \ - --results-to=results/figures - -quarto render heart-failure-analysis.qmd --to html -quarto render heart-failure-analysis.qmd --to pdf -``` - ## License This dataset is licensed under a [Creative Commons Attribution 4.0 International (CC BY 4.0) license](https://creativecommons.org/licenses/by/4.0/legalcode). diff --git a/data/.gitkeep b/data/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/data/heart+failure+clinical+records.zip b/data/raw/heart+failure+clinical+records.zip similarity index 100% rename from data/heart+failure+clinical+records.zip rename to data/raw/heart+failure+clinical+records.zip diff --git a/data/heart_failure_clinical_records_dataset.csv b/data/raw/heart_failure_clinical_records_dataset.csv similarity index 100% rename from data/heart_failure_clinical_records_dataset.csv rename to data/raw/heart_failure_clinical_records_dataset.csv diff --git a/data/heart_failure_clinical_records_dataset_converted.csv b/data/raw/heart_failure_clinical_records_dataset_converted.csv similarity index 100% rename from data/heart_failure_clinical_records_dataset_converted.csv rename to data/raw/heart_failure_clinical_records_dataset_converted.csv diff --git a/docker-compose.yml b/docker-compose.yml index f3016f5..d2dfda4 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,6 +1,7 @@ services: jupyter-notebook: - image: gur5/heart-failure-prediction:7de3b28 + image: gur5/heart-failure-prediction:fe61672 + ports: - "8888:8888" volumes: diff --git a/environment.yml b/environment.yml index da16f79..6dbf554 100644 --- a/environment.yml +++ b/environment.yml @@ -13,9 +13,9 @@ dependencies: - joblib=1.3.1 - pip=24.0 - pytest=8.3.4 - - pip: - - altair-ally==0.1.1 - - vega-datasets==0.9.0 - - vegafusion==1.6.9 - - deepchecks==0.18.1 - - pandera==0.20.4 \ No newline at end of file + # - pip: + # - altair-ally==0.1.1 + # - vega-datasets==0.9.0 + # - vegafusion==1.6.9 + # - deepchecks==0.18.1 + # - pandera==0.20.4 \ No newline at end of file diff --git a/reports/heart-failure-analysis.html b/reports/heart-failure-analysis.html index 4725dc1..852803c 100644 --- a/reports/heart-failure-analysis.html +++ b/reports/heart-failure-analysis.html @@ -3060,7 +3060,7 @@ .bi-suitcase2::before { content: "\f902"; } .bi-vignette::before { content: "\f903"; } - + @@ -3116,8 +3116,8 @@

Heart Failure Survival Analysis

Summary

-

We built a classification model using the logistic regression algorithm to predict survival outcomes for patients with heart failure. Using patient test results, the final classifier achieves an accuracy of 0.82. The model’s precision of 0.7 suggests it is moderately conservative in predicting the positive class (death), minimizing false alarms. More importantly, the recall of 0.74 ensures the model identifies the majority of high-risk patients, reducing the likelihood of missing true positive cases, which could have serious consequences. The F1-score of 0.72 reflects a good balance between precision and recall, highlighting the model’s robustness in survival prediction, see Table 6.

-

From the confusion matrix, the model correctly identified 14 patients who passed away (true positives) and35 patients who survived (true negatives). However, it also predicted 6 false positives, incorrectly classifying some survivors as deceased, and missed 5 actual cases of death (false negatives). While these errors warrant consideration, the model’s performance demonstrates strong predictive capabilities for both positive and negative outcomes, see Table 5.

+

 We built a classification model using the logistic regression algorithm to predict survival outcomes for patients with heart failure. Using patient test results, the final classifier achieves an accuracy of 0.82. The model’s precision of 0.70 suggests it is moderately conservative in predicting the positive class (death), minimizing false alarms. More importantly, the recall of 0.74 ensures the model identifies the majority of high-risk patients, reducing the likelihood of missing true positive cases, which could have serious consequences. The F1-score of 0.72 reflects a good balance between precision and recall, highlighting the model’s robustness in survival prediction (see Table 7).

+

 From the confusion matrix, the model correctly identified 14 patients who passed away (true positives) and 35 patients who survived (true negatives). However, it also predicted 6 false positives, incorrectly classifying some survivors as deceased, and missed 5 actual cases of death (false negatives). While these errors warrant consideration, the model’s performance demonstrates strong predictive capabilities for both positive and negative outcomes (see Table 6).

Overall, the logistic regression classifier effectively leverages patient test results to support survival prediction, providing a valuable tool to aid clinical decision-making in heart failure management.

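 As a quick arithmetic check, the summary metrics quoted above follow directly from these confusion-matrix counts (14 true positives, 35 true negatives, 6 false positives, 5 false negatives). A minimal sketch in plain Python, independent of the project code: ``` # Confusion-matrix counts reported for the test set. tp, tn, fp, fn = 14, 35, 6, 5 accuracy = (tp + tn) / (tp + tn + fp + fn) # 49/60 ≈ 0.82 precision = tp / (tp + fp) # 14/20 = 0.70 recall = tp / (tp + fn) # 14/19 ≈ 0.74 f1 = 2 * precision * recall / (precision + recall) # ≈ 0.72 print(f"accuracy={accuracy:.2f} precision={precision:.2f} " f"recall={recall:.2f} f1={f1:.2f}") ``` 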
@@ -3137,86 +3137,86 @@

-
+
- +
- - + + - - - + + + - - - + + + - - - + + + - - - + + + - - - + + + - - - + + + - - - + + + - - - + + + - - - + + + - - - + + + - - - + + + - - - + + + - - - + + +
 Column NameDescriptionColumn NameDescription
0agePatient's age0agePatient's age
1anaemiaDecrease of red blood cells or hemoglobin1anaemiaDecrease of red blood cells or hemoglobin
2creatinine_phosphokinaseLevel of the CPK enzyme in the blood2creatinine_phosphokinaseLevel of the CPK enzyme in the blood
3diabetesIf the patient has diabetes3diabetesIf the patient has diabetes
4ejection_fractionPercentage of blood leaving the heart at each contraction4ejection_fractionPercentage of blood leaving the heart at each contraction
5high_blood_pressureIf the patient has hypertension5high_blood_pressureIf the patient has hypertension
6plateletsPlatelets in the blood6plateletsPlatelets in the blood
7serum_creatinineLevel of serum creatinine in the blood7serum_creatinineLevel of serum creatinine in the blood
8serum_sodiumLevel of serum sodium in the blood8serum_sodiumLevel of serum sodium in the blood
9sexWoman or man9sexWoman or man
10smokingIf the patient smokes or not10smokingIf the patient smokes or not
11timeFollow-up period11timeFollow-up period
12DEATH_EVENTWhether the patient died or not (target variable)12DEATH_EVENTWhether the patient died or not (target variable)
@@ -3225,10 +3225,15 @@

-
-

-
Correlation heatmap- Based on the correlation matrix graph below, all features have relatively low correlations between each other, the correlations are below 0.5, so there is no strong evidence to drop additional featues.
+

 Based on the correlation matrix in Figure 1, all features have relatively low correlations with each other (all below 0.5), so there is no strong evidence to drop additional features.

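 A minimal sketch of how such a heatmap can be produced with pandas and Altair; the input path mirrors the Makefile target, and the exact encoding used in `scripts/correlation_analysis.py` may differ: ``` import altair as alt import pandas as pd # Path taken from the Makefile; assumes the processed training split exists. train = pd.read_csv("data/processed/heart_failure_train.csv") # Pairwise Pearson correlations, reshaped to long form for plotting. corr = ( train.corr(numeric_only=True) .reset_index() .melt(id_vars="index", var_name="feature_y", value_name="correlation") .rename(columns={"index": "feature_x"}) ) heatmap = alt.Chart(corr).mark_rect().encode( x="feature_x:N", y="feature_y:N", color=alt.Color("correlation:Q", scale=alt.Scale(domain=[-1, 1])), tooltip=["feature_x", "feature_y", "correlation"], ) # Saving to .png requires the vl-convert-python package. heatmap.save("results/figures/correlation_heatmap.png") ``` 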
+
+
+
+ +
+
+Figure 1: Correlation heatmap +
@@ -3238,86 +3243,86 @@

-
+
- +
- - + + - - - + + + - - - + + + - - - + + + - - - + + + - - - + + + - - - + + + - - - + + + - - - + + + - - - + + + - - - + + + - - - + + + - - - + + + - - - + + +
 ColumnMissing ValuesColumnMissing Values
0age00age0
1anaemia01anaemia0
2creatinine_phosphokinase02creatinine_phosphokinase0
3diabetes03diabetes0
4ejection_fraction04ejection_fraction0
5high_blood_pressure05high_blood_pressure0
6platelets06platelets0
7serum_creatinine07serum_creatinine0
8serum_sodium08serum_sodium0
9sex09sex0
10smoking010smoking0
11time011time0
12DEATH_EVENT012DEATH_EVENT0
@@ -3334,7 +3339,7 @@

-
+
@@ -3373,12 +3378,18 @@

Model

 We compared Decision Tree, KNN, and Logistic Regression, and selected Logistic Regression due to its interpretability and its ability to handle both linear and non-linear relationships between features. Logistic Regression performed better than the other two models: it works well with fewer features and is less prone to overfitting compared to more complex models like Decision Trees or KNN, especially when the dataset is relatively small.

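 A minimal sketch of that comparison, assuming the processed training split from the Makefile and `DEATH_EVENT` as the target; the actual code in `scripts/modelling.py` may differ: ``` import pandas as pd from sklearn.linear_model import LogisticRegression from sklearn.model_selection import cross_val_score from sklearn.neighbors import KNeighborsClassifier from sklearn.pipeline import make_pipeline from sklearn.preprocessing import StandardScaler from sklearn.tree import DecisionTreeClassifier train = pd.read_csv("data/processed/heart_failure_train.csv") X, y = train.drop(columns=["DEATH_EVENT"]), train["DEATH_EVENT"] models = { "Decision Tree": DecisionTreeClassifier(random_state=123), "KNN": make_pipeline(StandardScaler(), KNeighborsClassifier()), "Logistic Regression": make_pipeline( StandardScaler(), LogisticRegression(max_iter=1000) ), } for name, model in models.items(): scores = cross_val_score(model, X, y, cv=5) print(f"{name}: mean CV accuracy = {scores.mean():.2f}") ``` 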
 Hyperparameter tuning to find the best Logistic Regression model:

-
-
+
+
+
+
+Table 4: Logistic Regression Scores +
+
+
- +
@@ -3424,26 +3435,30 @@

Model

+ + +

 The model performs well with C = 0.0001, with a CV score of 0.83 that is close to the train score, indicating that the model is generalising well.

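 A minimal sketch of the tuning step, assuming a scaled logistic-regression pipeline and a log-spaced grid for `C`; the grid actually searched in `scripts/modelling.py` may differ: ``` import numpy as np from sklearn.linear_model import LogisticRegression from sklearn.model_selection import GridSearchCV from sklearn.pipeline import make_pipeline from sklearn.preprocessing import StandardScaler pipe = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000)) param_grid = {"logisticregression__C": 10.0 ** np.arange(-4, 3)} # 1e-4 … 1e2 search = GridSearchCV(pipe, param_grid, cv=5, return_train_score=True) search.fit(X, y) # X, y as in the model-comparison sketch above print(search.best_params_, f"best CV score = {search.best_score_:.2f}") ``` 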
-
+
- +
-Figure 1: Cross-validation scores for Logistic Regression - Logistic regression performs better than Decision tree and KNN on the cross validation data, hence, we selected it as our final model. +Figure 2: Cross-validation scores for Logistic Regression
-

The best features to train our model are show in Table 4:

+

 Logistic regression performs better than Decision Tree and KNN on the cross-validation data; hence, we selected it as our final model.

+

 The best features to train our model are shown in Table 5:

 -Table 4: Top features for trainig the model. +Table 5: Top features for training the model. 
-
+
@@ -3547,19 +3562,19 @@

Confusion Matrix

-Table 5: Confusion matrix for the final model on the test dataset. +Table 6: Confusion matrix for the final model on the test dataset.
-
+
-
+
- - + + @@ -3569,14 +3584,14 @@

Confusion Matrix

- - - + + + - - - + + +
Predicted0101
Actual
03560356
15141514
@@ -3589,10 +3604,10 @@

Confusion Matrix

-Table 6: Evaluation metrics for the final model. +Table 7: Evaluation metrics for the final model.
-
+
@@ -3638,7 +3653,7 @@

Confusion Matrix

 Results and Conclusion

-

The analysis revealed that platelets and ejection_fraction are the most important features (see Table 4) in predicting the risk of patient mortality. These features significantly impact the model’s ability to assess patient risk, which is crucial for early intervention. Our model achieved a recall score of 0.74 (see Table 6), which is a good start, but there is room for improvement, particularly in reducing the number of high risk patients the model might miss, i.e., maximising recall by minimising False Negatives.

+

 The analysis revealed that platelets and ejection_fraction are the most important features (see Table 5) in predicting the risk of patient mortality. These features significantly impact the model’s ability to assess patient risk, which is crucial for early intervention. Our model achieved a recall score of 0.74 (see Table 7), which is a good start, but there is room for improvement, particularly in reducing the number of high-risk patients the model might miss, i.e., maximising recall by minimising False Negatives.

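 A minimal sketch of how the top features can be read off the fitted model; the CSV name comes from the Makefile's clean rule, and the column names used here are hypothetical: ``` import pandas as pd # Hypothetical schema: one row per feature with a "coefficient" column. coefs = pd.read_csv("results/tables/logistic_regression_coefficients.csv") coefs["abs_coefficient"] = coefs["coefficient"].abs() print(coefs.sort_values("abs_coefficient", ascending=False).head()) ``` 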
The main challenges in this project stem from class imbalance and limited data availability. With more diverse and comprehensive datasets, performance could be further enhanced. We would also like to explore other machine learning models to improve the overall accuracy.

In conclusion, while the current model shows potential, there is significant opportunity to enhance its effectiveness. With improvements in data quality and model optimization, this tool could become a crucial asset in predicting patient risk and saving lives.

diff --git a/reports/heart-failure-analysis.ipynb b/reports/heart-failure-analysis.ipynb index 73c6cc9..f3e7ebf 100644 --- a/reports/heart-failure-analysis.ipynb +++ b/reports/heart-failure-analysis.ipynb @@ -62,7 +62,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 44, "metadata": {}, "outputs": [ { @@ -70,74 +70,74 @@ "text/html": [ "\n", - "\n", + "
\n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", "
Column NameDescriptionColumn NameDescription
agePatient's ageagePatient's age
anaemiaDecrease of red blood cells or hemoglobinanaemiaDecrease of red blood cells or hemoglobin
creatinine_phosphokinaseLevel of the CPK enzyme in the bloodcreatinine_phosphokinaseLevel of the CPK enzyme in the blood
diabetesIf the patient has diabetesdiabetesIf the patient has diabetes
ejection_fractionPercentage of blood leaving the heart at each contractionejection_fractionPercentage of blood leaving the heart at each contraction
high_blood_pressureIf the patient has hypertensionhigh_blood_pressureIf the patient has hypertension
plateletsPlatelets in the bloodplateletsPlatelets in the blood
serum_creatinineLevel of serum creatinine in the bloodserum_creatinineLevel of serum creatinine in the blood
serum_sodiumLevel of serum sodium in the bloodserum_sodiumLevel of serum sodium in the blood
sexWoman or mansexWoman or man
smokingIf the patient smokes or notsmokingIf the patient smokes or not
timeFollow-up periodtimeFollow-up period
DEATH_EVENTWhether the patient died or not (target variable)DEATH_EVENTWhether the patient died or not (target variable)
\n" ], "text/plain": [ - "" + "" ] }, - "execution_count": 1, + "execution_count": 44, "metadata": {}, "output_type": "execute_result" } @@ -170,7 +170,7 @@ "Markdown(table_df.to_markdown(index=False))\n", "\n", "# Save the table as a CSV\n", - "table_df.to_csv('tables/patient_table.csv', index=False)\n", + "table_df.to_csv('../results/tables/patient_table.csv', index=False)\n", "\n", "# Display the DataFrame without the index\n", "table_df.style.hide(axis=\"index\")\n" @@ -224,17 +224,9 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 45, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "deepchecks - WARNING - You are using deepchecks version 0.18.1, however a newer version is available. Deepchecks is frequently updated with major improvements. You should consider upgrading via the \"python -m pip install --upgrade deepchecks\" command.\n" - ] - } - ], + "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", @@ -265,12 +257,12 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 46, "metadata": {}, "outputs": [], "source": [ "# Load the dataset\n", - "file_path = 'data/heart_failure_clinical_records_dataset.csv'\n", + "file_path = '../data/raw/heart_failure_clinical_records_dataset.csv'\n", "heart_failure_data = pd.read_csv(file_path)\n", "\n", "# List of binary columns\n", @@ -289,7 +281,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 47, "metadata": {}, "outputs": [ { @@ -553,7 +545,7 @@ "[299 rows x 13 columns]" ] }, - "execution_count": 4, + "execution_count": 47, "metadata": {}, "output_type": "execute_result" } @@ -587,7 +579,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 48, "metadata": {}, "outputs": [ { @@ -596,7 +588,7 @@ "(299, 13)" ] }, - "execution_count": 5, + "execution_count": 48, "metadata": {}, "output_type": "execute_result" } @@ -607,7 +599,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 49, "metadata": {}, "outputs": [ { @@ -643,7 +635,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 50, "metadata": {}, "outputs": [ { @@ -655,7 +647,7 @@ "Name: count, dtype: int64" ] }, - "execution_count": 7, + "execution_count": 50, "metadata": {}, "output_type": "execute_result" } @@ -674,7 +666,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 51, "metadata": {}, "outputs": [ { @@ -821,7 +813,7 @@ "max 9.40000 148.000000 285.000000 " ] }, - "execution_count": 8, + "execution_count": 51, "metadata": {}, "output_type": "execute_result" } @@ -834,7 +826,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 52, "metadata": {}, "outputs": [ { @@ -949,7 +941,7 @@ "12 DEATH_EVENT 0" ] }, - "execution_count": 9, + "execution_count": 52, "metadata": {}, "output_type": "execute_result" } @@ -977,7 +969,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 53, "metadata": {}, "outputs": [ { @@ -985,23 +977,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ "alt.VConcatChart(...)" ] }, - "execution_count": 12, + "execution_count": 55, "metadata": {}, "output_type": "execute_result" } @@ -1250,7 +1242,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 56, "metadata": {}, "outputs": [ { @@ -1258,23 +1250,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ "alt.Chart(...)" ] }, - "execution_count": 14, + "execution_count": 57, "metadata": {}, "output_type": "execute_result" } @@ -1426,7 +1418,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 58, "metadata": {}, "outputs": [ { @@ -1434,23 +1426,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "