diff --git a/.github/workflows/automatic_semantic_pr.yml b/.github/workflows/automatic_semantic_pr.yml deleted file mode 100644 index 7c9ccb96..00000000 --- a/.github/workflows/automatic_semantic_pr.yml +++ /dev/null @@ -1,19 +0,0 @@ -# for config, see here: https://github.com/amannn/action-semantic-pull-requests - -name: "Lint PR" - -on: - pull_request_target: - types: - - opened - - edited - - synchronize - -jobs: - main: - name: Validate PR title - runs-on: ubuntu-latest - steps: - - uses: amannn/action-semantic-pull-request@v4 - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} \ No newline at end of file diff --git a/.github/workflows/documentation.yml b/.github/workflows/documentation.yml new file mode 100644 index 00000000..887e6e65 --- /dev/null +++ b/.github/workflows/documentation.yml @@ -0,0 +1,29 @@ + +name: Documentation +on: + push: + branches: + - master +jobs: + build: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + fetch-depth: 0 # otherwise, you will fail to push refs to the dest repo + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + pip install -e . + - name: Build and Commit + uses: sphinx-notes/pages@v2 + with: + documentation_path: docs + install_requirements: "true" + - name: Push changes + uses: ad-m/github-push-action@v2 + with: + github_token: ${{ secrets.SPHINX_DOCUMENTATION }} + branch: gh-pages \ No newline at end of file diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7582c489..f411785a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,37 @@ +default_stages: [commit, push] + repos: -- repo: https://github.com/psf/black + - repo: https://github.com/pycqa/isort + rev: 5.10.1 + hooks: + - id: isort + name: isort (python) + args: ["--profile", "black", "--filter-files"] + + - repo: https://github.com/asottile/add-trailing-comma + rev: v2.2.3 + hooks: + - id: add-trailing-comma + + - repo: https://github.com/asottile/pyupgrade + rev: v2.34.0 + hooks: + - id: pyupgrade + + - repo: https://github.com/myint/docformatter + rev: v1.3.1 + hooks: + - id: docformatter + args: [--in-place] + + - repo: https://github.com/psf/black rev: 22.3.0 hooks: - - id: black - language_version: python3.8 \ No newline at end of file + - id: black + language_version: python3.8 + + - repo: https://github.com/PyCQA/flake8 + rev: 4.0.1 + hooks: + - id: flake8 + args: [--config, .flake8] diff --git a/README.md b/README.md index bf651b4f..e756eb73 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,11 @@ + +# PSYCOP Machine Learning Utilities + ![python versions](https://img.shields.io/badge/Python-%3E=3.7-blue) [![Code style: black](https://img.shields.io/badge/Code%20Style-Black-black)](https://black.readthedocs.io/en/stable/the_black_code_style/current_style.html) +[![github actions pytest](https://github.com/Aarhus-Psychiatry-Research/psycop-ml-utils/actions/workflows/pytest.yml/badge.svg)](https://github.com/Aarhus-Psychiatry-Research/psycop-ml-utils/actions) +[![github actions docs](https://github.com/Aarhus-Psychiatry-Research/psycop-ml-utils/actions/workflows/documentation.yml/badge.svg)](https://Aarhus-Psychiatry-Research.github.io/psycop-ml-utils/) +![coverage](https://img.shields.io/endpoint?url=https://gist.githubusercontent.com/martbern/d6c40a5b5a3169c079e8b8f778b8e517/raw/badge-psycop-ml-utils-pytest-coverage.json)
![badge](https://img.shields.io/endpoint?url=https://gist.githubusercontent.com/martbern/d6c40a5b5a3169c079e8b8f778b8e517/raw/badge-psycop-ml-utils-pytest-coverage.json) # Installation @@ -32,106 +39,43 @@ or # Usage - [ ] Update examples as API matures -## Loading data from SQL - -Currently only contains one function to load a view from SQL, `sql_load` -```py -from psycopmlutils.loaders.sql_load import sql_load +## 🔧 Installation +To get started using psycop-ml-utils simply install it using pip by running the following line in your terminal: -view = "[FOR_SFI_fritekst_resultat_udfoert_i_psykiatrien_aendret_2011]" -sql = "SELECT * FROM [fct]." + view -df = sql_load(sql, chunksize = None) ``` - -## Flattening time series -To train baseline models (logistic regression, elastic net, SVM, XGBoost/random forest etc.), we need to represent the longitudinal data in a tabular, flattened way. - -In essence, we need to generate a training example for each prediction time, where that example contains "latest_blood_pressure" (float), "X_diagnosis_within_n_hours" (boolean) etc. - -To generate this, I propose the time-series flattener class (`TimeSeriesFlattener`). It builds a dataset like described above. - -### TimeSeriesFlattener -```python -class FlattenedDataset: - def __init__(): - """Class containing a time-series flattened. - - Args: - prediction_times_df (DataFrame): Dataframe with prediction times. - prediction_timestamp_colname (str, optional): Colname for timestamps. Defaults to "timestamp". - id_colname (str, optional): Colname for patients ids. Defaults to "dw_ek_borger". - """ - - def add_outcome(): - """Adds an outcome-column to the dataset - - Args: - outcome_df (DataFrame): Cols: dw_ek_borger, datotid, value if relevant. - lookahead_days (float): How far ahead to look for an outcome in days. If none found, use fallback. - resolve_multiple (str): What to do with more than one value within the lookahead. - Suggestions: earliest, latest, mean, max, min. - fallback (List[str]): What to do if no value within the lookahead. - Suggestions: latest, mean_of_patient, mean_of_population, hardcode (qualified guess) - timestamp_colname (str): Column name for timestamps - values_colname (str): Colname for outcome values in outcome_df - id_colname (str): Column name for citizen id - new_col_name (str): Name to use for new col. Automatically generated as '{new_col_name}_within_{lookahead_days}_days'. - Defaults to using values_colname. - """ - - def add_predictor(): - """Adds a predictor-column to the dataset - - Args: - predictor_df (DataFrame): Cols: dw_ek_borger, datotid, value if relevant. - lookahead_days (float): How far ahead to look for an outcome in days. If none found, use fallback. - resolve_multiple (str): What to do with more than one value within the lookahead. - Suggestions: earliest, latest, mean, max, min. - fallback (List[str]): What to do if no value within the lookahead. - Suggestions: latest, mean_of_patient, mean_of_population, hardcode (qualified guess) - outcome_colname (str): What to name the column - id_colname (str): Column name for citizen id - timestamp_colname (str): Column name for timestamps - """ +pip install git+https://github.com/Aarhus-Psychiatry-Research/psycop-ml-utils.git ``` -Inspiration-code can be found in previous commits. +For more detailed instructions on installation, see the [installation instructions](https://Aarhus-Psychiatry-Research.github.io/psycop-ml-utils/installation). 
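+
+If you use poetry for dependency management, the same source install can be added as a project dependency (this mirrors the command given in the installation instructions):
+
+```
+poetry add git+https://github.com/Aarhus-Psychiatry-Research/psycop-ml-utils.git
+```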
-#### Example -- [ ] Update examples as API matures -```python -import FlattenedDataset - -dataset = FlattenedDataset(prediction_times_df = prediction_times, prediction_timestamp_colname = "timestamp", id_colname = "dw_ek_borger") - -dataset.add_outcome( - outcome_df=type_2_diabetes_df, - lookahead_days=730, - resolve_multiple="max", - fallback=[0], - name="t2d", -) - -dataset.add_predictor( - predictor=hba1c, - lookback_window=365, - resolve_multiple="max", - fallback=["latest", 40], - name="hba1c", -) -``` +## 📖 Documentation + +| Documentation | | +| -------------------------- | --------------------------------------------------------------------------- | +| 📚 **[Usage Guides]** | Guides and instructions on how to use the package and its features. | +| 📰 **[News and changelog]** | New additions, changes and version history. | +| 🎛 **[API References]** | The detailed reference for the psycop-ml-utils API, including function documentation | +| 🙋 **[FAQ]** | Frequently asked questions | + +[usage guides]: https://Aarhus-Psychiatry-Research.github.io/psycop-ml-utils/introduction.html +[api references]: https://Aarhus-Psychiatry-Research.github.io/psycop-ml-utils/ +[Augmenters]: https://Aarhus-Psychiatry-Research.github.io/psycop-ml-utils/augmenters.html +[Demo]: https://share.streamlit.io/Aarhus-Psychiatry-Research/psycop-ml-utils/dev/streamlit.py +[News and changelog]: https://Aarhus-Psychiatry-Research.github.io/psycop-ml-utils/news.html +[FAQ]: https://Aarhus-Psychiatry-Research.github.io/psycop-ml-utils/faq.html -Dataset now looks like this: +## 💬 Where to ask questions -| dw_ek_borger | datetime_prediction | outc_t2d_within_next_730_days | pred_max_hba1c_within_prev_365_days | -|--------------|---------------------|-------------------------------|-------------------------------------| -| 1 | yyyy-mm-dd hh:mm:ss | 0 | 48 | -| 2 | yyyy-mm-dd hh:mm:ss | 0 | 40 | -| 3 | yyyy-mm-dd hh:mm:ss | 1 | 44 | +| Type | | +| ------------------------------ | ---------------------- | +| 🚨 **Bug Reports** | [GitHub Issue Tracker] | +| 🎁 **Feature Requests & Ideas** | [GitHub Issue Tracker] | +| 👩‍💻 **Usage Questions** | [GitHub Discussions] | +| 🗯 **General Discussion** | [GitHub Discussions] | +[github issue tracker]: https://github.com/Aarhus-Psychiatry-Research/psycop-ml-utils/issues +[github discussions]: https://github.com/Aarhus-Psychiatry-Research/psycop-ml-utils/discussions -For binary outcomes, `add_predictor` with `fallback = [0]` would take a df with only the times where the event occurred, and then generate 0's for the rest. -I propose we create the above functionality on a just-in-time basis, building the features as we need them. diff --git a/citation.cff b/citation.cff new file mode 100644 index 00000000..82b7c46b --- /dev/null +++ b/citation.cff @@ -0,0 +1,15 @@ +cff-version: 1.2.0 +message: "If you use this software, please cite it as below." 
+authors: +- family-names: "Bernstorff" + given-names: "Martin" +- family-names: "Hansen" + given-names: "Lasse" +- family-names: "Enevoldsen" + given-names: "Kenneth" + orcid: "https://orcid.org/0000-0001-8733-0966" +title: "PSYCOP machine learning utilities" +version: 0.1.1 +# doi: 10.5281/zenodo.6675315 +date-released: 2022-06-21 +url: "https://github.com/Aarhus-Psychiatry-Research/psycop-ml-utils" \ No newline at end of file diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 00000000..d4bb2cbb --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/_static/favicon.ico b/docs/_static/favicon.ico new file mode 100644 index 00000000..292771a0 Binary files /dev/null and b/docs/_static/favicon.ico differ diff --git a/docs/_static/icon.png b/docs/_static/icon.png new file mode 100644 index 00000000..85126960 Binary files /dev/null and b/docs/_static/icon.png differ diff --git a/docs/_static/icon_with_title.png b/docs/_static/icon_with_title.png new file mode 100644 index 00000000..9ed41011 Binary files /dev/null and b/docs/_static/icon_with_title.png differ diff --git a/docs/api.model_performance.rst b/docs/api.model_performance.rst new file mode 100644 index 00000000..0cf36918 --- /dev/null +++ b/docs/api.model_performance.rst @@ -0,0 +1,21 @@ +Model Performance +-------------------------------------------------- + + +model_performance.model_performance +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. automodule:: psycopmlutils.model_performance.model_performance + :members: + :undoc-members: + :show-inheritance: + :exclude-members: + +model_performance.utils +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. automodule:: psycopmlutils.model_performance.utils + :members: + :undoc-members: + :show-inheritance: + :exclude-members: \ No newline at end of file diff --git a/docs/api.timeseriesflattener.rst b/docs/api.timeseriesflattener.rst new file mode 100644 index 00000000..255b973f --- /dev/null +++ b/docs/api.timeseriesflattener.rst @@ -0,0 +1,31 @@ +Time Series Flattener +-------------------------------------------------- + + +timeseriesflattener.create_feature_combinations +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. automodule:: psycopmlutils.timeseriesflattener.create_feature_combinations + :members: + :undoc-members: + :show-inheritance: + :exclude-members: + +timeseriesflattener.flattened_dataset +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. automodule:: psycopmlutils.timeseriesflattener.flattened_dataset + :members: + :undoc-members: + :show-inheritance: + :exclude-members: + + +timeseriesflattener.resolve_multiple_functions +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
automodule:: psycopmlutils.timeseriesflattener.resolve_multiple_functions + :members: + :undoc-members: + :show-inheritance: + :exclude-members: \ No newline at end of file diff --git a/docs/api.writers.rst b/docs/api.writers.rst new file mode 100644 index 00000000..a87b702c --- /dev/null +++ b/docs/api.writers.rst @@ -0,0 +1,12 @@ +Writers +-------------------------------------------------- + + +writers.sql_writer +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. automodule:: psycopmlutils.writers.sql_writer + :members: + :undoc-members: + :show-inheritance: + :exclude-members: diff --git a/docs/changelog.md b/docs/changelog.md new file mode 100644 index 00000000..9a1b94a5 --- /dev/null +++ b/docs/changelog.md @@ -0,0 +1,4 @@ +# News and Changelog + +- v. 0.1.1 (21 June 2022) + - Documentation was added \ No newline at end of file diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 00000000..0507d42d --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,93 @@ +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# + +from psycopmlutils import __version__ + +# -- Project information ----------------------------------------------------- + +project = "psycop-ml-utils" +author = "Martin Bernstorff, Lasse Hansen, and Kenneth Enevoldsen" + +# The full version, including alpha/beta/rc tags +release = __version__ + + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + "sphinx.ext.napoleon", + "sphinx.ext.viewcode", + "sphinxext.opengraph", + "sphinx_copybutton", + "sphinx.ext.githubpages", + "myst_parser", +] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ["_templates"] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] + + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = "furo" # "press", "sphinx_rtd_theme", "furo" + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". 
+html_static_path = ["_static"] + +html_show_sourcelink = True + +html_context = { + "display_github": True, # Add 'Edit on Github' link instead of 'View page source' + "github_user": "Aarhus-Psychiatry-Research", + "github_repo": project, + "github_version": "main", + "conf_py_path": "/docs/", +} + + +source_suffix = { + ".rst": "restructuredtext", + ".md": "markdown", +} + +html_static_path = ["_static"] +html_favicon = "_static/favicon.ico" + +html_theme_options = { + "light_logo": "icon_with_title.png", + "dark_logo": "icon_with_title.png", + "light_css_variables": { + "color-brand-primary": "#204279", + "color-brand-content": "#204279", + }, + "dark_css_variables": { + "color-brand-primary": "#4872b8", + "color-brand-content": "#4872b8", + }, + "sidebar_hide_name": True, + "navigation_with_keys": True, + "top_of_page_button": "edit", +} diff --git a/docs/faq.rst b/docs/faq.rst new file mode 100644 index 00000000..dad9d4ba --- /dev/null +++ b/docs/faq.rst @@ -0,0 +1,62 @@ +FAQ +------- + + +How do I test the code and run the test suite? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +psycop-ml-utils comes with an extensive test suite. To run the tests, you should clone the repository, then build psycop-ml-utils from source. +This will also install the required development dependencies and test utilities defined in requirements.txt. + + +.. code-block:: + + # install test dependencies + pip install -r requirements.txt + + python -m pytest + + +which will run all the tests in the :code:`tests` folder. + +Specific tests can be run using: + +.. code-block:: + + python -m pytest tests/desired_test.py + + +If you want to check code coverage, you can run the following: + +.. code-block:: + + pip install pytest-cov + + python -m pytest --cov=. + + +How is the documentation generated? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +psycop-ml-utils uses `Sphinx <https://www.sphinx-doc.org>`__ to generate documentation. It uses the `Furo <https://github.com/pradyunsg/furo>`__ theme with custom styling. + +To build the documentation, you can run: + +.. code-block:: + + # install sphinx, themes and extensions + pip install -r requirements.txt + + # generate html from the documentation + + make -C docs html + + +How do I cite this work? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +If you use this library in your research, please cite: + +.. code-block:: + + add citation here diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 00000000..6a2d9e71 --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,36 @@ +PSYCOP Machine Learning Utilities +================================= + +.. image:: https://img.shields.io/github/stars/Aarhus-Psychiatry-Research/psycop-ml-utils.svg?style=social&label=Star&maxAge=2592000 + :target: https://github.com/Aarhus-Psychiatry-Research/psycop-ml-utils + + +Contents +--------------------------------- + +.. toctree:: + :maxdepth: 3 + :caption: Getting started + + installation + changelog + faq + +.. toctree:: + :maxdepth: 3 + :caption: API references + + api.model_performance + api.timeseriesflattener + api.writers + + +.. toctree::
+ GitHub Repository <https://github.com/Aarhus-Psychiatry-Research/psycop-ml-utils> + + +Indices and search +================== + +* :ref:`genindex` +* :ref:`search` diff --git a/docs/installation.md b/docs/installation.md new file mode 100644 index 00000000..19ae461c --- /dev/null +++ b/docs/installation.md @@ -0,0 +1,40 @@ + +# Installation +To get started using psycop-ml-utils, simply install it by running one of the following lines in your terminal. + +Install using your preferred package manager, e.g.: +`pip install git+https://github.com/Aarhus-Psychiatry-Research/psycop-ml-utils.git` + +or using poetry by running + +`poetry add git+https://github.com/Aarhus-Psychiatry-Research/psycop-ml-utils.git` + + +## For development +We use poetry for dependency management. To install poetry, follow the instructions on their [website](https://python-poetry.org/docs/#osx--linux--bashonwindows-install-instructions). + + +Clone the repo, move into it, then run `poetry install`. I.e.: + +```bash +git clone https://github.com/Aarhus-Psychiatry-Research/psycop-ml-utils.git
cd psycop-ml-utils
poetry install
``` + +To increase the version: +`poetry version [patch|minor|major]` according to [semantic versioning](https://semver.org/). + +To add new dependencies: +`poetry add (--dev) [packagename]` + +No need to update a `requirements.txt`. It's replaced by `pyproject.toml`, and `poetry` manages it automatically. + + +## When using +Install using your preferred package manager, e.g.: +`pip install git+https://github.com/Aarhus-Psychiatry-Research/psycop-ml-utils.git` + +or + +`poetry add git+https://github.com/Aarhus-Psychiatry-Research/psycop-ml-utils.git` diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 00000000..2119f510 --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=. +set BUILDDIR=_build + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. 
+ echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/example/timeseriesflattener/generate_features_and_write_to_sql.py b/example/timeseriesflattener/generate_features_and_write_to_sql.py index 677849b8..eda9a9fe 100644 --- a/example/timeseriesflattener/generate_features_and_write_to_sql.py +++ b/example/timeseriesflattener/generate_features_and_write_to_sql.py @@ -1,15 +1,15 @@ import time -from pathlib import Path import numpy as np import pandas as pd +from wasabi import msg + import psycopmlutils.loaders # noqa from psycopmlutils.timeseriesflattener import ( FlattenedDataset, create_feature_combinations, ) from psycopmlutils.writers.sql_writer import write_df_to_sql -from wasabi import msg if __name__ == "__main__": RESOLVE_MULTIPLE = ["mean", "max", "min"] @@ -41,7 +41,7 @@ "resolve_multiple": RESOLVE_MULTIPLE, "fallback": np.nan, }, - ] + ], ) event_times = psycopmlutils.loaders.LoadOutcome.t2d() @@ -87,11 +87,11 @@ # Finish msg.good( - f"Finished adding {len(PREDICTOR_LIST)} predictors, took {round((end_time - start_time)/60, 1)} minutes" + f"Finished adding {len(PREDICTOR_LIST)} predictors, took {round((end_time - start_time)/60, 1)} minutes", ) msg.info( - f"Dataframe size is {flattened_df.df.memory_usage(index=True, deep=True).sum() / 1024 / 1024} MiB" + f"Dataframe size is {flattened_df.df.memory_usage(index=True, deep=True).sum() / 1024 / 1024} MiB", ) msg.good("Done!") @@ -117,7 +117,7 @@ ] msg.warn( - f"{dataset_name}: There are {len(ids_in_split_but_not_in_flattened_df)} ({round(len(ids_in_split_but_not_in_flattened_df)/len(split_ids)*100, 2)}%) ids which are in {dataset_name}_ids but not in flattened_df_ids, will get dropped during merge" + f"{dataset_name}: There are {len(ids_in_split_but_not_in_flattened_df)} ({round(len(ids_in_split_but_not_in_flattened_df)/len(split_ids)*100, 2)}%) ids which are in {dataset_name}_ids but not in flattened_df_ids, will get dropped during merge", ) split_df = pd.merge(flattened_df.df, df_split_ids, how="inner") diff --git a/poetry.lock b/poetry.lock index dbeb791c..fd640ce0 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,3 +1,11 @@ +[[package]] +name = "alabaster" +version = "0.7.12" +description = "A configurable sidebar-enabled Sphinx theme" +category = "dev" +optional = false +python-versions = "*" + [[package]] name = "atomicwrites" version = "1.4.0" @@ -20,11 +28,37 @@ docs = ["furo", "sphinx", "zope.interface", "sphinx-notfound-page"] tests = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins", "zope.interface", "cloudpickle"] tests_no_zope = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins", "cloudpickle"] +[[package]] +name = "babel" +version = "2.10.3" +description = "Internationalization utilities" +category = "dev" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +pytz = ">=2015.7" + +[[package]] +name = "beautifulsoup4" +version = "4.11.1" +description = "Screen-scraping library" +category = "dev" +optional = false +python-versions = ">=3.6.0" + +[package.dependencies] +soupsieve = ">1.2" + +[package.extras] +html5lib = ["html5lib"] +lxml = ["lxml"] + [[package]] name = "black" version = "22.3.0" description = "The uncompromising code formatter." 
-category = "main" +category = "dev" optional = false python-versions = ">=3.6.2" @@ -50,6 +84,14 @@ category = "main" optional = false python-versions = ">=3.6" +[[package]] +name = "certifi" +version = "2022.6.15" +description = "Python package for providing Mozilla's CA Bundle." +category = "dev" +optional = false +python-versions = ">=3.6" + [[package]] name = "cfgv" version = "3.3.1" @@ -58,11 +100,22 @@ category = "dev" optional = false python-versions = ">=3.6.1" +[[package]] +name = "charset-normalizer" +version = "2.0.12" +description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." +category = "dev" +optional = false +python-versions = ">=3.5.0" + +[package.extras] +unicode_backport = ["unicodedata2"] + [[package]] name = "click" version = "8.1.3" description = "Composable command line interface toolkit" -category = "main" +category = "dev" optional = false python-versions = ">=3.7" @@ -73,7 +126,7 @@ colorama = {version = "*", markers = "platform_system == \"Windows\""} name = "colorama" version = "0.4.5" description = "Cross-platform colored terminal text." -category = "main" +category = "dev" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" @@ -99,6 +152,25 @@ category = "dev" optional = false python-versions = "*" +[[package]] +name = "docformatter" +version = "1.4" +description = "Formats docstrings to follow PEP 257." +category = "dev" +optional = false +python-versions = "*" + +[package.dependencies] +untokenize = "*" + +[[package]] +name = "docutils" +version = "0.18.1" +description = "Docutils -- Python Documentation Utilities" +category = "dev" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" + [[package]] name = "filelock" version = "3.7.1" @@ -111,6 +183,33 @@ python-versions = ">=3.7" docs = ["furo (>=2021.8.17b43)", "sphinx (>=4.1)", "sphinx-autodoc-typehints (>=1.12)"] testing = ["covdefaults (>=1.2.0)", "coverage (>=4)", "pytest (>=4)", "pytest-cov", "pytest-timeout (>=1.4.2)"] +[[package]] +name = "flake8" +version = "4.0.1" +description = "the modular source code checker: pep8 pyflakes and co" +category = "dev" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +mccabe = ">=0.6.0,<0.7.0" +pycodestyle = ">=2.8.0,<2.9.0" +pyflakes = ">=2.4.0,<2.5.0" + +[[package]] +name = "furo" +version = "2022.6.21" +description = "A clean customisable Sphinx documentation theme." 
+category = "dev" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +beautifulsoup4 = "*" +pygments = "*" +sphinx = ">=4.0,<6.0" +sphinx-basic-ng = "*" + [[package]] name = "greenlet" version = "1.1.2" @@ -133,6 +232,38 @@ python-versions = ">=3.7" [package.extras] license = ["ukkonen"] +[[package]] +name = "idna" +version = "3.3" +description = "Internationalized Domain Names in Applications (IDNA)" +category = "dev" +optional = false +python-versions = ">=3.5" + +[[package]] +name = "imagesize" +version = "1.3.0" +description = "Getting image size from png/jpeg/jpeg2000/gif file" +category = "dev" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" + +[[package]] +name = "importlib-metadata" +version = "4.11.4" +description = "Read metadata from Python packages" +category = "dev" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +zipp = ">=0.5" + +[package.extras] +docs = ["sphinx", "jaraco.packaging (>=9)", "rst.linker (>=1.9)"] +perf = ["ipython"] +testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytest-cov", "pytest-enabler (>=1.0.1)", "packaging", "pyfakefs", "flufl.flake8", "pytest-perf (>=0.9.2)", "pytest-black (>=0.3.7)", "pytest-mypy (>=0.9.1)", "importlib-resources (>=1.3)"] + [[package]] name = "iniconfig" version = "1.1.1" @@ -141,6 +272,20 @@ category = "dev" optional = false python-versions = "*" +[[package]] +name = "jinja2" +version = "3.1.2" +description = "A very fast and expressive template engine." +category = "dev" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +MarkupSafe = ">=2.0" + +[package.extras] +i18n = ["Babel (>=2.7)"] + [[package]] name = "joblib" version = "1.1.0" @@ -149,14 +294,98 @@ category = "main" optional = false python-versions = ">=3.6" +[[package]] +name = "markdown-it-py" +version = "2.1.0" +description = "Python port of markdown-it. Markdown parsing, done right!" +category = "dev" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +mdurl = ">=0.1,<1.0" + +[package.extras] +benchmarking = ["psutil", "pytest", "pytest-benchmark (>=3.2,<4.0)"] +code_style = ["pre-commit (==2.6)"] +compare = ["commonmark (>=0.9.1,<0.10.0)", "markdown (>=3.3.6,<3.4.0)", "mistletoe (>=0.8.1,<0.9.0)", "mistune (>=2.0.2,<2.1.0)", "panflute (>=2.1.3,<2.2.0)"] +linkify = ["linkify-it-py (>=1.0,<2.0)"] +plugins = ["mdit-py-plugins"] +profiling = ["gprof2dot"] +rtd = ["attrs", "myst-parser", "pyyaml", "sphinx", "sphinx-copybutton", "sphinx-design", "sphinx-book-theme"] +testing = ["coverage", "pytest", "pytest-cov", "pytest-regressions"] + +[[package]] +name = "markupsafe" +version = "2.1.1" +description = "Safely add untrusted strings to HTML/XML markup." 
+category = "dev" +optional = false +python-versions = ">=3.7" + +[[package]] +name = "mccabe" +version = "0.6.1" +description = "McCabe checker, plugin for flake8" +category = "dev" +optional = false +python-versions = "*" + +[[package]] +name = "mdit-py-plugins" +version = "0.3.0" +description = "Collection of plugins for markdown-it-py" +category = "dev" +optional = false +python-versions = "~=3.6" + +[package.dependencies] +markdown-it-py = ">=1.0.0,<3.0.0" + +[package.extras] +code_style = ["pre-commit (==2.6)"] +rtd = ["myst-parser (>=0.14.0,<0.15.0)", "sphinx-book-theme (>=0.1.0,<0.2.0)"] +testing = ["coverage", "pytest (>=3.6,<4)", "pytest-cov", "pytest-regressions"] + +[[package]] +name = "mdurl" +version = "0.1.1" +description = "Markdown URL utilities" +category = "dev" +optional = false +python-versions = ">=3.7" + [[package]] name = "mypy-extensions" version = "0.4.3" description = "Experimental type system extensions for programs checked with the mypy typechecker." -category = "main" +category = "dev" optional = false python-versions = "*" +[[package]] +name = "myst-parser" +version = "0.18.0" +description = "An extended commonmark compliant parser, with bridges to docutils & sphinx." +category = "dev" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +docutils = ">=0.15,<0.19" +jinja2 = "*" +markdown-it-py = ">=1.0.0,<3.0.0" +mdit-py-plugins = ">=0.3.0,<0.4.0" +pyyaml = "*" +sphinx = ">=4,<6" +typing-extensions = "*" + +[package.extras] +code_style = ["pre-commit (>=2.12,<3.0)"] +linkify = ["linkify-it-py (>=1.0,<2.0)"] +rtd = ["ipython", "sphinx-book-theme", "sphinx-design", "sphinxext-rediraffe (>=0.2.7,<0.3.0)", "sphinxcontrib.mermaid (>=0.7.1,<0.8.0)", "sphinxext-opengraph (>=0.6.3,<0.7.0)"] +testing = ["beautifulsoup4", "coverage", "pytest (>=6,<7)", "pytest-cov", "pytest-regressions", "pytest-param-files (>=0.3.4,<0.4.0)", "sphinx-pytest"] + [[package]] name = "nodeenv" version = "1.6.0" @@ -209,7 +438,7 @@ test = ["hypothesis (>=5.5.3)", "pytest (>=6.0)", "pytest-xdist (>=1.31)"] name = "pathspec" version = "0.9.0" description = "Utility library for gitignore style pattern matching of file paths." -category = "main" +category = "dev" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,>=2.7" @@ -217,7 +446,7 @@ python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,>=2.7" name = "platformdirs" version = "2.5.2" description = "A small Python module for determining appropriate platform-specific dirs, e.g. a \"user data dir\"." -category = "main" +category = "dev" optional = false python-versions = ">=3.7" @@ -261,6 +490,30 @@ category = "dev" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +[[package]] +name = "pycodestyle" +version = "2.8.0" +description = "Python style guide checker" +category = "dev" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" + +[[package]] +name = "pyflakes" +version = "2.4.0" +description = "passive checker of Python programs" +category = "dev" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" + +[[package]] +name = "pygments" +version = "2.12.0" +description = "Pygments is a syntax highlighting package written in Python." 
+category = "dev" +optional = false +python-versions = ">=3.6" + [[package]] name = "pyodbc" version = "4.0.32" @@ -343,6 +596,24 @@ category = "dev" optional = false python-versions = ">=3.6" +[[package]] +name = "requests" +version = "2.28.0" +description = "Python HTTP for Humans." +category = "dev" +optional = false +python-versions = ">=3.7, <4" + +[package.dependencies] +certifi = ">=2017.4.17" +charset-normalizer = ">=2.0.0,<2.1.0" +idna = ">=2.5,<4" +urllib3 = ">=1.21.1,<1.27" + +[package.extras] +socks = ["PySocks (>=1.5.6,!=1.5.7)"] +use_chardet_on_py3 = ["chardet (>=3.0.2,<5)"] + [[package]] name = "scikit-learn" version = "1.1.1" @@ -382,6 +653,165 @@ category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" +[[package]] +name = "snowballstemmer" +version = "2.2.0" +description = "This package provides 29 stemmers for 28 languages generated from Snowball algorithms." +category = "dev" +optional = false +python-versions = "*" + +[[package]] +name = "soupsieve" +version = "2.3.2.post1" +description = "A modern CSS selector implementation for Beautiful Soup." +category = "dev" +optional = false +python-versions = ">=3.6" + +[[package]] +name = "sphinx" +version = "5.0.2" +description = "Python documentation generator" +category = "dev" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +alabaster = ">=0.7,<0.8" +babel = ">=1.3" +colorama = {version = ">=0.3.5", markers = "sys_platform == \"win32\""} +docutils = ">=0.14,<0.19" +imagesize = "*" +importlib-metadata = {version = ">=4.4", markers = "python_version < \"3.10\""} +Jinja2 = ">=2.3" +packaging = "*" +Pygments = ">=2.0" +requests = ">=2.5.0" +snowballstemmer = ">=1.1" +sphinxcontrib-applehelp = "*" +sphinxcontrib-devhelp = "*" +sphinxcontrib-htmlhelp = ">=2.0.0" +sphinxcontrib-jsmath = "*" +sphinxcontrib-qthelp = "*" +sphinxcontrib-serializinghtml = ">=1.1.5" + +[package.extras] +docs = ["sphinxcontrib-websupport"] +lint = ["flake8 (>=3.5.0)", "isort", "mypy (>=0.950)", "docutils-stubs", "types-typed-ast", "types-requests"] +test = ["pytest (>=4.6)", "html5lib", "cython", "typed-ast"] + +[[package]] +name = "sphinx-basic-ng" +version = "0.0.1a11" +description = "A modern skeleton for Sphinx themes." +category = "dev" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +sphinx = ">=4.0,<6.0" + +[package.extras] +docs = ["furo", "myst-parser", "sphinx-copybutton", "sphinx-inline-tabs", "ipython"] + +[[package]] +name = "sphinx-copybutton" +version = "0.5.0" +description = "Add a copy button to each of your code cells." +category = "dev" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +sphinx = ">=1.8" + +[package.extras] +code_style = ["pre-commit (==2.12.1)"] +rtd = ["sphinx", "ipython", "myst-nb", "sphinx-book-theme"] + +[[package]] +name = "sphinxcontrib-applehelp" +version = "1.0.2" +description = "sphinxcontrib-applehelp is a sphinx extension which outputs Apple help books" +category = "dev" +optional = false +python-versions = ">=3.5" + +[package.extras] +lint = ["flake8", "mypy", "docutils-stubs"] +test = ["pytest"] + +[[package]] +name = "sphinxcontrib-devhelp" +version = "1.0.2" +description = "sphinxcontrib-devhelp is a sphinx extension which outputs Devhelp document." 
+category = "dev" +optional = false +python-versions = ">=3.5" + +[package.extras] +lint = ["flake8", "mypy", "docutils-stubs"] +test = ["pytest"] + +[[package]] +name = "sphinxcontrib-htmlhelp" +version = "2.0.0" +description = "sphinxcontrib-htmlhelp is a sphinx extension which renders HTML help files" +category = "dev" +optional = false +python-versions = ">=3.6" + +[package.extras] +lint = ["flake8", "mypy", "docutils-stubs"] +test = ["pytest", "html5lib"] + +[[package]] +name = "sphinxcontrib-jsmath" +version = "1.0.1" +description = "A sphinx extension which renders display math in HTML via JavaScript" +category = "dev" +optional = false +python-versions = ">=3.5" + +[package.extras] +test = ["pytest", "flake8", "mypy"] + +[[package]] +name = "sphinxcontrib-qthelp" +version = "1.0.3" +description = "sphinxcontrib-qthelp is a sphinx extension which outputs QtHelp document." +category = "dev" +optional = false +python-versions = ">=3.5" + +[package.extras] +lint = ["flake8", "mypy", "docutils-stubs"] +test = ["pytest"] + +[[package]] +name = "sphinxcontrib-serializinghtml" +version = "1.1.5" +description = "sphinxcontrib-serializinghtml is a sphinx extension which outputs \"serialized\" HTML files (json and pickle)." +category = "dev" +optional = false +python-versions = ">=3.5" + +[package.extras] +lint = ["flake8", "mypy", "docutils-stubs"] +test = ["pytest"] + +[[package]] +name = "sphinxext-opengraph" +version = "0.6.3" +description = "Sphinx Extension to enable OGP support" +category = "dev" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +sphinx = ">=2.0" + [[package]] name = "sqlalchemy" version = "1.4.37" @@ -434,7 +864,7 @@ python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" name = "tomli" version = "2.0.1" description = "A lil' TOML parser" -category = "main" +category = "dev" optional = false python-versions = ">=3.7" @@ -442,10 +872,31 @@ python-versions = ">=3.7" name = "typing-extensions" version = "4.2.0" description = "Backported and Experimental Type Hints for Python 3.7+" -category = "main" +category = "dev" optional = false python-versions = ">=3.7" +[[package]] +name = "untokenize" +version = "0.1.1" +description = "Transforms tokens into original source code (while preserving whitespace)." +category = "dev" +optional = false +python-versions = "*" + +[[package]] +name = "urllib3" +version = "1.26.9" +description = "HTTP library with thread-safe connection pooling, file post, and more." 
+category = "dev" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, <4" + +[package.extras] +brotli = ["brotlicffi (>=0.8.0)", "brotli (>=1.0.9)", "brotlipy (>=0.6.0)"] +secure = ["pyOpenSSL (>=0.14)", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "certifi", "ipaddress"] +socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"] + [[package]] name = "virtualenv" version = "20.14.1" @@ -472,12 +923,28 @@ category = "main" optional = false python-versions = "*" +[[package]] +name = "zipp" +version = "3.8.0" +description = "Backport of pathlib-compatible object wrapper for zip files" +category = "dev" +optional = false +python-versions = ">=3.7" + +[package.extras] +docs = ["sphinx", "jaraco.packaging (>=9)", "rst.linker (>=1.9)"] +testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytest-cov", "pytest-enabler (>=1.0.1)", "jaraco.itertools", "func-timeout", "pytest-black (>=0.3.7)", "pytest-mypy (>=0.9.1)"] + [metadata] lock-version = "1.1" python-versions = ">=3.8,<3.11" -content-hash = "49d06b895f6a4d7c2c5e13c0ba283fa49d80deea3a6b9f6d170d253ffb41472b" +content-hash = "92f8e7f8c16c1f0d3f5718e62d8dbec5046929c2fbcc9be5b14d550ce0931e83" [metadata.files] +alabaster = [ + {file = "alabaster-0.7.12-py2.py3-none-any.whl", hash = "sha256:446438bdcca0e05bd45ea2de1668c1d9b032e1a9154c2c259092d77031ddd359"}, + {file = "alabaster-0.7.12.tar.gz", hash = "sha256:a661d72d58e6ea8a57f7a86e37d86716863ee5e92788398526d58b26a4e4dc02"}, +] atomicwrites = [ {file = "atomicwrites-1.4.0-py2.py3-none-any.whl", hash = "sha256:6d1784dea7c0c8d4a5172b6c620f40b6e4cbfdf96d783691f2e1302a7b88e197"}, {file = "atomicwrites-1.4.0.tar.gz", hash = "sha256:ae70396ad1a434f9c7046fd2dd196fc04b12f9e91ffb859164193be8b6168a7a"}, @@ -486,6 +953,14 @@ attrs = [ {file = "attrs-21.4.0-py2.py3-none-any.whl", hash = "sha256:2d27e3784d7a565d36ab851fe94887c5eccd6a463168875832a1be79c82828b4"}, {file = "attrs-21.4.0.tar.gz", hash = "sha256:626ba8234211db98e869df76230a137c4c40a12d72445c45d5f5b716f076e2fd"}, ] +babel = [ + {file = "Babel-2.10.3-py3-none-any.whl", hash = "sha256:ff56f4892c1c4bf0d814575ea23471c230d544203c7748e8c68f0089478d48eb"}, + {file = "Babel-2.10.3.tar.gz", hash = "sha256:7614553711ee97490f732126dc077f8d0ae084ebc6a96e23db1482afabdb2c51"}, +] +beautifulsoup4 = [ + {file = "beautifulsoup4-4.11.1-py3-none-any.whl", hash = "sha256:58d5c3d29f5a36ffeb94f02f0d786cd53014cf9b3b3951d42e0080d8a9498d30"}, + {file = "beautifulsoup4-4.11.1.tar.gz", hash = "sha256:ad9aa55b65ef2808eb405f46cf74df7fcb7044d5cbc26487f96eb2ef2e436693"}, +] black = [ {file = "black-22.3.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:2497f9c2386572e28921fa8bec7be3e51de6801f7459dffd6e62492531c47e09"}, {file = "black-22.3.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:5795a0375eb87bfe902e80e0c8cfaedf8af4d49694d69161e5bd3206c18618bb"}, @@ -515,10 +990,18 @@ catalogue = [ {file = "catalogue-2.0.7-py3-none-any.whl", hash = "sha256:cab4feda641fe05da1e6a1a9d123b0869d5ca324dcd93d4a5c384408ab62e7fb"}, {file = "catalogue-2.0.7.tar.gz", hash = "sha256:535d33ae79ebd21ca298551d85da186ae8b8e1df36b0fb0246da774163ec2d6b"}, ] +certifi = [ + {file = "certifi-2022.6.15-py3-none-any.whl", hash = "sha256:fe86415d55e84719d75f8b69414f6438ac3547d2078ab91b67e779ef69378412"}, + {file = "certifi-2022.6.15.tar.gz", hash = "sha256:84c85a9078b11105f04f3036a9482ae10e4621616db313fe045dd24743a0820d"}, +] cfgv = [ {file = "cfgv-3.3.1-py2.py3-none-any.whl", hash = "sha256:c6a0883f3917a037485059700b9e75da2464e6c27051014ad85ba6aaa5884426"}, 
{file = "cfgv-3.3.1.tar.gz", hash = "sha256:f5a830efb9ce7a445376bb66ec94c638a9787422f96264c98edc6bdeed8ab736"}, ] +charset-normalizer = [ + {file = "charset-normalizer-2.0.12.tar.gz", hash = "sha256:2857e29ff0d34db842cd7ca3230549d1a697f96ee6d3fb071cfa6c7393832597"}, + {file = "charset_normalizer-2.0.12-py3-none-any.whl", hash = "sha256:6881edbebdb17b39b4eaaa821b438bf6eddffb4468cf344f09f89def34a8b1df"}, +] click = [ {file = "click-8.1.3-py3-none-any.whl", hash = "sha256:bb4d8133cb15a609f44e8213d9b391b0809795062913b383c62be0ee95b1db48"}, {file = "click-8.1.3.tar.gz", hash = "sha256:7682dc8afb30297001674575ea00d1814d808d6a36af415a82bd481d37ba7b8e"}, @@ -574,10 +1057,25 @@ distlib = [ {file = "distlib-0.3.4-py2.py3-none-any.whl", hash = "sha256:6564fe0a8f51e734df6333d08b8b94d4ea8ee6b99b5ed50613f731fd4089f34b"}, {file = "distlib-0.3.4.zip", hash = "sha256:e4b58818180336dc9c529bfb9a0b58728ffc09ad92027a3f30b7cd91e3458579"}, ] +docformatter = [ + {file = "docformatter-1.4.tar.gz", hash = "sha256:064e6d81f04ac96bc0d176cbaae953a0332482b22d3ad70d47c8a7f2732eef6f"}, +] +docutils = [ + {file = "docutils-0.18.1-py2.py3-none-any.whl", hash = "sha256:23010f129180089fbcd3bc08cfefccb3b890b0050e1ca00c867036e9d161b98c"}, + {file = "docutils-0.18.1.tar.gz", hash = "sha256:679987caf361a7539d76e584cbeddc311e3aee937877c87346f31debc63e9d06"}, +] filelock = [ {file = "filelock-3.7.1-py3-none-any.whl", hash = "sha256:37def7b658813cda163b56fc564cdc75e86d338246458c4c28ae84cabefa2404"}, {file = "filelock-3.7.1.tar.gz", hash = "sha256:3a0fd85166ad9dbab54c9aec96737b744106dc5f15c0b09a6744a445299fcf04"}, ] +flake8 = [ + {file = "flake8-4.0.1-py2.py3-none-any.whl", hash = "sha256:479b1304f72536a55948cb40a32dce8bb0ffe3501e26eaf292c7e60eb5e0428d"}, + {file = "flake8-4.0.1.tar.gz", hash = "sha256:806e034dda44114815e23c16ef92f95c91e4c71100ff52813adf7132a6ad870d"}, +] +furo = [ + {file = "furo-2022.6.21-py3-none-any.whl", hash = "sha256:061b68e323345e27fcba024cf33a1e77f3dfd8d9987410be822749a706e2add6"}, + {file = "furo-2022.6.21.tar.gz", hash = "sha256:9aa983b7488a4601d13113884bfb7254502c8729942e073a0acb87a5512af223"}, +] greenlet = [ {file = "greenlet-1.1.2-cp27-cp27m-macosx_10_14_x86_64.whl", hash = "sha256:58df5c2a0e293bf665a51f8a100d3e9956febfbf1d9aaf8c0677cf70218910c6"}, {file = "greenlet-1.1.2-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:aec52725173bd3a7b56fe91bc56eccb26fbdff1386ef123abb63c84c5b43b63a"}, @@ -639,18 +1137,96 @@ identify = [ {file = "identify-2.5.1-py2.py3-none-any.whl", hash = "sha256:0dca2ea3e4381c435ef9c33ba100a78a9b40c0bab11189c7cf121f75815efeaa"}, {file = "identify-2.5.1.tar.gz", hash = "sha256:3d11b16f3fe19f52039fb7e39c9c884b21cb1b586988114fbe42671f03de3e82"}, ] +idna = [ + {file = "idna-3.3-py3-none-any.whl", hash = "sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff"}, + {file = "idna-3.3.tar.gz", hash = "sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"}, +] +imagesize = [ + {file = "imagesize-1.3.0-py2.py3-none-any.whl", hash = "sha256:1db2f82529e53c3e929e8926a1fa9235aa82d0bd0c580359c67ec31b2fddaa8c"}, + {file = "imagesize-1.3.0.tar.gz", hash = "sha256:cd1750d452385ca327479d45b64d9c7729ecf0b3969a58148298c77092261f9d"}, +] +importlib-metadata = [ + {file = "importlib_metadata-4.11.4-py3-none-any.whl", hash = "sha256:c58c8eb8a762858f49e18436ff552e83914778e50e9d2f1660535ffb364552ec"}, + {file = "importlib_metadata-4.11.4.tar.gz", hash = "sha256:5d26852efe48c0a32b0509ffbc583fda1a2266545a78d104a6f4aff3db17d700"}, +] iniconfig = [ {file = 
"iniconfig-1.1.1-py2.py3-none-any.whl", hash = "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3"}, {file = "iniconfig-1.1.1.tar.gz", hash = "sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"}, ] +jinja2 = [ + {file = "Jinja2-3.1.2-py3-none-any.whl", hash = "sha256:6088930bfe239f0e6710546ab9c19c9ef35e29792895fed6e6e31a023a182a61"}, + {file = "Jinja2-3.1.2.tar.gz", hash = "sha256:31351a702a408a9e7595a8fc6150fc3f43bb6bf7e319770cbc0db9df9437e852"}, +] joblib = [ {file = "joblib-1.1.0-py2.py3-none-any.whl", hash = "sha256:f21f109b3c7ff9d95f8387f752d0d9c34a02aa2f7060c2135f465da0e5160ff6"}, {file = "joblib-1.1.0.tar.gz", hash = "sha256:4158fcecd13733f8be669be0683b96ebdbbd38d23559f54dca7205aea1bf1e35"}, ] +markdown-it-py = [ + {file = "markdown-it-py-2.1.0.tar.gz", hash = "sha256:cf7e59fed14b5ae17c0006eff14a2d9a00ed5f3a846148153899a0224e2c07da"}, + {file = "markdown_it_py-2.1.0-py3-none-any.whl", hash = "sha256:93de681e5c021a432c63147656fe21790bc01231e0cd2da73626f1aa3ac0fe27"}, +] +markupsafe = [ + {file = "MarkupSafe-2.1.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:86b1f75c4e7c2ac2ccdaec2b9022845dbb81880ca318bb7a0a01fbf7813e3812"}, + {file = "MarkupSafe-2.1.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:f121a1420d4e173a5d96e47e9a0c0dcff965afdf1626d28de1460815f7c4ee7a"}, + {file = "MarkupSafe-2.1.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a49907dd8420c5685cfa064a1335b6754b74541bbb3706c259c02ed65b644b3e"}, + {file = "MarkupSafe-2.1.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:10c1bfff05d95783da83491be968e8fe789263689c02724e0c691933c52994f5"}, + {file = "MarkupSafe-2.1.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b7bd98b796e2b6553da7225aeb61f447f80a1ca64f41d83612e6139ca5213aa4"}, + {file = "MarkupSafe-2.1.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:b09bf97215625a311f669476f44b8b318b075847b49316d3e28c08e41a7a573f"}, + {file = "MarkupSafe-2.1.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:694deca8d702d5db21ec83983ce0bb4b26a578e71fbdbd4fdcd387daa90e4d5e"}, + {file = "MarkupSafe-2.1.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:efc1913fd2ca4f334418481c7e595c00aad186563bbc1ec76067848c7ca0a933"}, + {file = "MarkupSafe-2.1.1-cp310-cp310-win32.whl", hash = "sha256:4a33dea2b688b3190ee12bd7cfa29d39c9ed176bda40bfa11099a3ce5d3a7ac6"}, + {file = "MarkupSafe-2.1.1-cp310-cp310-win_amd64.whl", hash = "sha256:dda30ba7e87fbbb7eab1ec9f58678558fd9a6b8b853530e176eabd064da81417"}, + {file = "MarkupSafe-2.1.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:671cd1187ed5e62818414afe79ed29da836dde67166a9fac6d435873c44fdd02"}, + {file = "MarkupSafe-2.1.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3799351e2336dc91ea70b034983ee71cf2f9533cdff7c14c90ea126bfd95d65a"}, + {file = "MarkupSafe-2.1.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e72591e9ecd94d7feb70c1cbd7be7b3ebea3f548870aa91e2732960fa4d57a37"}, + {file = "MarkupSafe-2.1.1-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6fbf47b5d3728c6aea2abb0589b5d30459e369baa772e0f37a0320185e87c980"}, + {file = "MarkupSafe-2.1.1-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:d5ee4f386140395a2c818d149221149c54849dfcfcb9f1debfe07a8b8bd63f9a"}, + {file = "MarkupSafe-2.1.1-cp37-cp37m-musllinux_1_1_i686.whl", hash = 
"sha256:bcb3ed405ed3222f9904899563d6fc492ff75cce56cba05e32eff40e6acbeaa3"}, + {file = "MarkupSafe-2.1.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:e1c0b87e09fa55a220f058d1d49d3fb8df88fbfab58558f1198e08c1e1de842a"}, + {file = "MarkupSafe-2.1.1-cp37-cp37m-win32.whl", hash = "sha256:8dc1c72a69aa7e082593c4a203dcf94ddb74bb5c8a731e4e1eb68d031e8498ff"}, + {file = "MarkupSafe-2.1.1-cp37-cp37m-win_amd64.whl", hash = "sha256:97a68e6ada378df82bc9f16b800ab77cbf4b2fada0081794318520138c088e4a"}, + {file = "MarkupSafe-2.1.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:e8c843bbcda3a2f1e3c2ab25913c80a3c5376cd00c6e8c4a86a89a28c8dc5452"}, + {file = "MarkupSafe-2.1.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:0212a68688482dc52b2d45013df70d169f542b7394fc744c02a57374a4207003"}, + {file = "MarkupSafe-2.1.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8e576a51ad59e4bfaac456023a78f6b5e6e7651dcd383bcc3e18d06f9b55d6d1"}, + {file = "MarkupSafe-2.1.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4b9fe39a2ccc108a4accc2676e77da025ce383c108593d65cc909add5c3bd601"}, + {file = "MarkupSafe-2.1.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:96e37a3dc86e80bf81758c152fe66dbf60ed5eca3d26305edf01892257049925"}, + {file = "MarkupSafe-2.1.1-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:6d0072fea50feec76a4c418096652f2c3238eaa014b2f94aeb1d56a66b41403f"}, + {file = "MarkupSafe-2.1.1-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:089cf3dbf0cd6c100f02945abeb18484bd1ee57a079aefd52cffd17fba910b88"}, + {file = "MarkupSafe-2.1.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:6a074d34ee7a5ce3effbc526b7083ec9731bb3cbf921bbe1d3005d4d2bdb3a63"}, + {file = "MarkupSafe-2.1.1-cp38-cp38-win32.whl", hash = "sha256:421be9fbf0ffe9ffd7a378aafebbf6f4602d564d34be190fc19a193232fd12b1"}, + {file = "MarkupSafe-2.1.1-cp38-cp38-win_amd64.whl", hash = "sha256:fc7b548b17d238737688817ab67deebb30e8073c95749d55538ed473130ec0c7"}, + {file = "MarkupSafe-2.1.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:e04e26803c9c3851c931eac40c695602c6295b8d432cbe78609649ad9bd2da8a"}, + {file = "MarkupSafe-2.1.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b87db4360013327109564f0e591bd2a3b318547bcef31b468a92ee504d07ae4f"}, + {file = "MarkupSafe-2.1.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:99a2a507ed3ac881b975a2976d59f38c19386d128e7a9a18b7df6fff1fd4c1d6"}, + {file = "MarkupSafe-2.1.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:56442863ed2b06d19c37f94d999035e15ee982988920e12a5b4ba29b62ad1f77"}, + {file = "MarkupSafe-2.1.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3ce11ee3f23f79dbd06fb3d63e2f6af7b12db1d46932fe7bd8afa259a5996603"}, + {file = "MarkupSafe-2.1.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:33b74d289bd2f5e527beadcaa3f401e0df0a89927c1559c8566c066fa4248ab7"}, + {file = "MarkupSafe-2.1.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:43093fb83d8343aac0b1baa75516da6092f58f41200907ef92448ecab8825135"}, + {file = "MarkupSafe-2.1.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:8e3dcf21f367459434c18e71b2a9532d96547aef8a871872a5bd69a715c15f96"}, + {file = "MarkupSafe-2.1.1-cp39-cp39-win32.whl", hash = "sha256:d4306c36ca495956b6d568d276ac11fdd9c30a36f1b6eb928070dc5360b22e1c"}, + {file = "MarkupSafe-2.1.1-cp39-cp39-win_amd64.whl", hash = 
"sha256:46d00d6cfecdde84d40e572d63735ef81423ad31184100411e6e3388d405e247"}, + {file = "MarkupSafe-2.1.1.tar.gz", hash = "sha256:7f91197cc9e48f989d12e4e6fbc46495c446636dfc81b9ccf50bb0ec74b91d4b"}, +] +mccabe = [ + {file = "mccabe-0.6.1-py2.py3-none-any.whl", hash = "sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42"}, + {file = "mccabe-0.6.1.tar.gz", hash = "sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f"}, +] +mdit-py-plugins = [ + {file = "mdit-py-plugins-0.3.0.tar.gz", hash = "sha256:ecc24f51eeec6ab7eecc2f9724e8272c2fb191c2e93cf98109120c2cace69750"}, + {file = "mdit_py_plugins-0.3.0-py3-none-any.whl", hash = "sha256:b1279701cee2dbf50e188d3da5f51fee8d78d038cdf99be57c6b9d1aa93b4073"}, +] +mdurl = [ + {file = "mdurl-0.1.1-py3-none-any.whl", hash = "sha256:6a8f6804087b7128040b2fb2ebe242bdc2affaeaa034d5fc9feeed30b443651b"}, + {file = "mdurl-0.1.1.tar.gz", hash = "sha256:f79c9709944df218a4cdb0fcc0b0c7ead2f44594e3e84dc566606f04ad749c20"}, +] mypy-extensions = [ {file = "mypy_extensions-0.4.3-py2.py3-none-any.whl", hash = "sha256:090fedd75945a69ae91ce1303b5824f428daf5a028d2f6ab8a299250a846f15d"}, {file = "mypy_extensions-0.4.3.tar.gz", hash = "sha256:2d82818f5bb3e369420cb3c4060a7970edba416647068eb4c5343488a6c604a8"}, ] +myst-parser = [ + {file = "myst-parser-0.18.0.tar.gz", hash = "sha256:739a4d96773a8e55a2cacd3941ce46a446ee23dcd6b37e06f73f551ad7821d86"}, + {file = "myst_parser-0.18.0-py3-none-any.whl", hash = "sha256:4965e51918837c13bf1c6f6fe2c6bddddf193148360fbdaefe743a4981358f6a"}, +] nodeenv = [ {file = "nodeenv-1.6.0-py2.py3-none-any.whl", hash = "sha256:621e6b7076565ddcacd2db0294c0381e01fd28945ab36bcf00f41c5daf63bef7"}, {file = "nodeenv-1.6.0.tar.gz", hash = "sha256:3ef13ff90291ba2a4a7a4ff9a979b63ffdd00a464dbe04acf0ea6471517a4c2b"}, @@ -726,6 +1302,18 @@ py = [ {file = "py-1.11.0-py2.py3-none-any.whl", hash = "sha256:607c53218732647dff4acdfcd50cb62615cedf612e72d1724fb1a0cc6405b378"}, {file = "py-1.11.0.tar.gz", hash = "sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719"}, ] +pycodestyle = [ + {file = "pycodestyle-2.8.0-py2.py3-none-any.whl", hash = "sha256:720f8b39dde8b293825e7ff02c475f3077124006db4f440dcbc9a20b76548a20"}, + {file = "pycodestyle-2.8.0.tar.gz", hash = "sha256:eddd5847ef438ea1c7870ca7eb78a9d47ce0cdb4851a5523949f2601d0cbbe7f"}, +] +pyflakes = [ + {file = "pyflakes-2.4.0-py2.py3-none-any.whl", hash = "sha256:3bb3a3f256f4b7968c9c788781e4ff07dce46bdf12339dcda61053375426ee2e"}, + {file = "pyflakes-2.4.0.tar.gz", hash = "sha256:05a85c2872edf37a4ed30b0cce2f6093e1d0581f8c19d7393122da7e25b2b24c"}, +] +pygments = [ + {file = "Pygments-2.12.0-py3-none-any.whl", hash = "sha256:dc9c10fb40944260f6ed4c688ece0cd2048414940f1cea51b8b226318411c519"}, + {file = "Pygments-2.12.0.tar.gz", hash = "sha256:5eb116118f9612ff1ee89ac96437bb6b49e8f04d8a13b514ba26f620208e26eb"}, +] pyodbc = [ {file = "pyodbc-4.0.32-cp27-cp27m-win32.whl", hash = "sha256:2152ce6d5131d769ff5839aa762e12d844c95e9ec4bb2f666e8cd9dfa1ae2240"}, {file = "pyodbc-4.0.32-cp27-cp27m-win_amd64.whl", hash = "sha256:56ec4974096d40d6c62a228799122dbc2ade6c4045cc5d31860212a32cae95b1"}, @@ -797,6 +1385,10 @@ pyyaml = [ {file = "PyYAML-6.0-cp39-cp39-win_amd64.whl", hash = "sha256:b3d267842bf12586ba6c734f89d1f5b871df0273157918b0ccefa29deb05c21c"}, {file = "PyYAML-6.0.tar.gz", hash = "sha256:68fb519c14306fec9720a2a5b45bc9f0c8d1b9c72adf45c37baedfcd949c35a2"}, ] +requests = [ + {file = "requests-2.28.0-py3-none-any.whl", hash = 
"sha256:bc7861137fbce630f17b03d3ad02ad0bf978c844f3536d0edda6499dafce2b6f"}, + {file = "requests-2.28.0.tar.gz", hash = "sha256:d568723a7ebd25875d8d1eaf5dfa068cd2fc8194b2e483d7b1f7c81918dbec6b"}, +] scikit-learn = [ {file = "scikit-learn-1.1.1.tar.gz", hash = "sha256:3e77b71e8e644f86c8b5be7f1c285ef597de4c384961389ee3e9ca36c445b256"}, {file = "scikit_learn-1.1.1-cp310-cp310-macosx_10_13_x86_64.whl", hash = "sha256:102f51797cd8944bf44a038d106848ddf2804f2c1edf7aea45fba81a4fdc4d80"}, @@ -846,6 +1438,54 @@ six = [ {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, ] +snowballstemmer = [ + {file = "snowballstemmer-2.2.0-py2.py3-none-any.whl", hash = "sha256:c8e1716e83cc398ae16824e5572ae04e0d9fc2c6b985fb0f900f5f0c96ecba1a"}, + {file = "snowballstemmer-2.2.0.tar.gz", hash = "sha256:09b16deb8547d3412ad7b590689584cd0fe25ec8db3be37788be3810cbf19cb1"}, +] +soupsieve = [ + {file = "soupsieve-2.3.2.post1-py3-none-any.whl", hash = "sha256:3b2503d3c7084a42b1ebd08116e5f81aadfaea95863628c80a3b774a11b7c759"}, + {file = "soupsieve-2.3.2.post1.tar.gz", hash = "sha256:fc53893b3da2c33de295667a0e19f078c14bf86544af307354de5fcf12a3f30d"}, +] +sphinx = [ + {file = "Sphinx-5.0.2-py3-none-any.whl", hash = "sha256:d3e57663eed1d7c5c50895d191fdeda0b54ded6f44d5621b50709466c338d1e8"}, + {file = "Sphinx-5.0.2.tar.gz", hash = "sha256:b18e978ea7565720f26019c702cd85c84376e948370f1cd43d60265010e1c7b0"}, +] +sphinx-basic-ng = [ + {file = "sphinx_basic_ng-0.0.1a11-py3-none-any.whl", hash = "sha256:9aecb5345816998789ef76658a83e3c0a12aafa14b17d40e28cd4aaeb94d1517"}, + {file = "sphinx_basic_ng-0.0.1a11.tar.gz", hash = "sha256:bf9a8fda0379c7d2ab51c9543f2b18e014b77fb295b49d64f3c1a910c863b34f"}, +] +sphinx-copybutton = [ + {file = "sphinx-copybutton-0.5.0.tar.gz", hash = "sha256:a0c059daadd03c27ba750da534a92a63e7a36a7736dcf684f26ee346199787f6"}, + {file = "sphinx_copybutton-0.5.0-py3-none-any.whl", hash = "sha256:9684dec7434bd73f0eea58dda93f9bb879d24bff2d8b187b1f2ec08dfe7b5f48"}, +] +sphinxcontrib-applehelp = [ + {file = "sphinxcontrib-applehelp-1.0.2.tar.gz", hash = "sha256:a072735ec80e7675e3f432fcae8610ecf509c5f1869d17e2eecff44389cdbc58"}, + {file = "sphinxcontrib_applehelp-1.0.2-py2.py3-none-any.whl", hash = "sha256:806111e5e962be97c29ec4c1e7fe277bfd19e9652fb1a4392105b43e01af885a"}, +] +sphinxcontrib-devhelp = [ + {file = "sphinxcontrib-devhelp-1.0.2.tar.gz", hash = "sha256:ff7f1afa7b9642e7060379360a67e9c41e8f3121f2ce9164266f61b9f4b338e4"}, + {file = "sphinxcontrib_devhelp-1.0.2-py2.py3-none-any.whl", hash = "sha256:8165223f9a335cc1af7ffe1ed31d2871f325254c0423bc0c4c7cd1c1e4734a2e"}, +] +sphinxcontrib-htmlhelp = [ + {file = "sphinxcontrib-htmlhelp-2.0.0.tar.gz", hash = "sha256:f5f8bb2d0d629f398bf47d0d69c07bc13b65f75a81ad9e2f71a63d4b7a2f6db2"}, + {file = "sphinxcontrib_htmlhelp-2.0.0-py2.py3-none-any.whl", hash = "sha256:d412243dfb797ae3ec2b59eca0e52dac12e75a241bf0e4eb861e450d06c6ed07"}, +] +sphinxcontrib-jsmath = [ + {file = "sphinxcontrib-jsmath-1.0.1.tar.gz", hash = "sha256:a9925e4a4587247ed2191a22df5f6970656cb8ca2bd6284309578f2153e0c4b8"}, + {file = "sphinxcontrib_jsmath-1.0.1-py2.py3-none-any.whl", hash = "sha256:2ec2eaebfb78f3f2078e73666b1415417a116cc848b72e5172e596c871103178"}, +] +sphinxcontrib-qthelp = [ + {file = "sphinxcontrib-qthelp-1.0.3.tar.gz", hash = "sha256:4c33767ee058b70dba89a6fc5c1892c0d57a54be67ddd3e7875a18d14cba5a72"}, + {file = 
"sphinxcontrib_qthelp-1.0.3-py2.py3-none-any.whl", hash = "sha256:bd9fc24bcb748a8d51fd4ecaade681350aa63009a347a8c14e637895444dfab6"}, +] +sphinxcontrib-serializinghtml = [ + {file = "sphinxcontrib-serializinghtml-1.1.5.tar.gz", hash = "sha256:aa5f6de5dfdf809ef505c4895e51ef5c9eac17d0f287933eb49ec495280b6952"}, + {file = "sphinxcontrib_serializinghtml-1.1.5-py2.py3-none-any.whl", hash = "sha256:352a9a00ae864471d3a7ead8d7d79f5fc0b57e8b3f95e9867eb9eb28999b92fd"}, +] +sphinxext-opengraph = [ + {file = "sphinxext-opengraph-0.6.3.tar.gz", hash = "sha256:cd89e13cc7a44739f81b64ee57c1c20ef0c05dda5d1d8201d31ec2f34e4c29db"}, + {file = "sphinxext_opengraph-0.6.3-py3-none-any.whl", hash = "sha256:bf76017c105856b07edea6caf4942b6ae9bb168585dccfd6dbdb6e4161f6b03a"}, +] sqlalchemy = [ {file = "SQLAlchemy-1.4.37-cp27-cp27m-macosx_10_14_x86_64.whl", hash = "sha256:d9050b0c4a7f5538650c74aaba5c80cd64450e41c206f43ea6d194ae6d060ff9"}, {file = "SQLAlchemy-1.4.37-cp27-cp27m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:b4c92823889cf9846b972ee6db30c0e3a92c0ddfc76c6060a6cda467aa5fb694"}, @@ -900,6 +1540,13 @@ typing-extensions = [ {file = "typing_extensions-4.2.0-py3-none-any.whl", hash = "sha256:6657594ee297170d19f67d55c05852a874e7eb634f4f753dbd667855e07c1708"}, {file = "typing_extensions-4.2.0.tar.gz", hash = "sha256:f1c24655a0da0d1b67f07e17a5e6b2a105894e6824b92096378bb3668ef02376"}, ] +untokenize = [ + {file = "untokenize-0.1.1.tar.gz", hash = "sha256:3865dbbbb8efb4bb5eaa72f1be7f3e0be00ea8b7f125c69cbd1f5fda926f37a2"}, +] +urllib3 = [ + {file = "urllib3-1.26.9-py2.py3-none-any.whl", hash = "sha256:44ece4d53fb1706f667c9bd1c648f5469a2ec925fcf3a776667042d645472c14"}, + {file = "urllib3-1.26.9.tar.gz", hash = "sha256:aabaf16477806a5e1dd19aa41f8c2b7950dd3c746362d7e3223dbe6de6ac448e"}, +] virtualenv = [ {file = "virtualenv-20.14.1-py2.py3-none-any.whl", hash = "sha256:e617f16e25b42eb4f6e74096b9c9e37713cf10bf30168fb4a739f3fa8f898a3a"}, {file = "virtualenv-20.14.1.tar.gz", hash = "sha256:ef589a79795589aada0c1c5b319486797c03b67ac3984c48c669c0e4f50df3a5"}, @@ -908,3 +1555,7 @@ wasabi = [ {file = "wasabi-0.9.1-py3-none-any.whl", hash = "sha256:217edcb2850993c7931399e7419afccde13539d589e333bc85f9053cf0bb1772"}, {file = "wasabi-0.9.1.tar.gz", hash = "sha256:ada6f13e9b70ef26bf95fad0febdfdebe2005e29a08ad58f4bbae383a97298cf"}, ] +zipp = [ + {file = "zipp-3.8.0-py3-none-any.whl", hash = "sha256:c4f6e5bbf48e74f7a38e7cc5b0480ff42b0ae5178957d564d18932525d5cf099"}, + {file = "zipp-3.8.0.tar.gz", hash = "sha256:56bf8aadb83c24db6c4b577e13de374ccfb67da2078beba1d037c17980bf43ad"}, +] diff --git a/pyproject.toml b/pyproject.toml index 074fefb1..d7897257 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,7 +19,6 @@ classifiers = [ [tool.poetry.dependencies] python = ">=3.8,<3.11" -black = "^22.3.0" numpy = "^1.22.4" scipy = "^1.8.1" pandas = "^1.4.2" @@ -30,9 +29,18 @@ wasabi = "^0.9.1" scikit-learn = "^1.1.1" [tool.poetry.dev-dependencies] +black = "^22.3.0" pre-commit = "^2.19.0" pytest = "^7.1.2" pytest-cov = "^3.0.0" +Sphinx = "^5.0.2" +furo = "^2022.6.21" +sphinx-copybutton = "^0.5.0" +sphinxext-opengraph = "^0.6.3" +myst-parser = "^0.18.0" +flake8 = "^4.0.1" +docformatter = "^1.4" + [tool.coverage.run] omit = [ diff --git a/src/psycopmlutils/__init__.py b/src/psycopmlutils/__init__.py index e69de29b..bcfcb44d 100644 --- a/src/psycopmlutils/__init__.py +++ b/src/psycopmlutils/__init__.py @@ -0,0 +1 @@ +from .about import __download_url__, __title__, __version__ # noqa diff --git a/src/about.py 
b/src/psycopmlutils/about.py similarity index 69% rename from src/about.py rename to src/psycopmlutils/about.py index 1cd3cf55..8e6562a3 100644 --- a/src/about.py +++ b/src/psycopmlutils/about.py @@ -1,3 +1,3 @@ -__version__ = "0.1.0" # only source of version ID +__version__ = "0.1.2" # only source of version ID __title__ = "psycopmlutils" __download_url__ = "https://github.com/Aarhus-Psychiatry-Research/psycop-ml-utils.git" diff --git a/src/psycopmlutils/loaders/__init__.py b/src/psycopmlutils/loaders/__init__.py index 3ca6ce80..b3e16b24 100644 --- a/src/psycopmlutils/loaders/__init__.py +++ b/src/psycopmlutils/loaders/__init__.py @@ -1,8 +1,8 @@ -from .load_demographics import LoadDemographic -from .load_diagnoses import LoadDiagnoses -from .load_outcomes import LoadOutcome -from .load_lab_results import LoadLabResults -from .load_medications import LoadMedications -from .load_visits import LoadVisits -from .sql_load import sql_load -from .load_ids import LoadIDs +from .load_demographics import LoadDemographic # noqa +from .load_diagnoses import LoadDiagnoses # noqa +from .load_ids import LoadIDs # noqa +from .load_lab_results import LoadLabResults # noqa +from .load_medications import LoadMedications # noqa +from .load_outcomes import LoadOutcome # noqa +from .load_visits import LoadVisits # noqa +from .sql_load import sql_load # noqa diff --git a/src/psycopmlutils/loaders/load_demographics.py b/src/psycopmlutils/loaders/load_demographics.py index b11baa72..a02e0084 100644 --- a/src/psycopmlutils/loaders/load_demographics.py +++ b/src/psycopmlutils/loaders/load_demographics.py @@ -1,14 +1,12 @@ import pandas as pd + from psycopmlutils.loaders.sql_load import sql_load from psycopmlutils.utils import data_loaders -from wasabi import msg class LoadDemographic: @data_loaders.register("birthdays") def birthdays(): - # msg.info("Loading birthdays") - view = "[FOR_kohorte_demografi_inkl_2021_feb2022]" sql = f"SELECT dw_ek_borger, foedselsdato FROM [fct].{view}" @@ -25,8 +23,6 @@ def birthdays(): @data_loaders.register("male") def male(): - # msg.info("Loading sexes") - view = "[FOR_kohorte_demografi_inkl_2021_feb2022]" sql = f"SELECT dw_ek_borger, koennavn FROM [fct].{view}" @@ -40,5 +36,4 @@ def male(): inplace=True, ) - # msg.good("Loaded sexes") return df.reset_index(drop=True) diff --git a/src/psycopmlutils/loaders/load_diagnoses.py b/src/psycopmlutils/loaders/load_diagnoses.py index 7d4c3f38..ee076b3e 100644 --- a/src/psycopmlutils/loaders/load_diagnoses.py +++ b/src/psycopmlutils/loaders/load_diagnoses.py @@ -1,11 +1,9 @@ -from pathlib import Path from typing import List, Union -import catalogue import pandas as pd + from psycopmlutils.loaders.sql_load import sql_load from psycopmlutils.utils import data_loaders -from wasabi import msg class LoadDiagnoses: @@ -14,7 +12,8 @@ def aggregate_from_physical_visits( output_col_name: str, wildcard_icd_10_end: bool = False, ) -> pd.DataFrame: - """Load all diagnoses matching any icd_code in icd_codes. Create output_col_name and set to 1. + """Load all diagnoses matching any icd_code in icd_codes. Create + output_col_name and set to 1. Args: icd_codes (List[str]): List of icd_codes. 
@@ -24,8 +23,6 @@ def aggregate_from_physical_visits( Returns: pd.DataFrame """ - print_str = f"diagnoses matching any of {icd_codes}" - # msg.info(f"Loading {print_str}") diagnoses_source_table_info = { "lpr3": { @@ -55,8 +52,6 @@ def aggregate_from_physical_visits( ] df = pd.concat(dfs) - - # msg.good(f"Loaded {print_str}") return df.reset_index(drop=True) def from_physical_visits( @@ -64,7 +59,9 @@ def from_physical_visits( output_col_name: str = "value", wildcard_icd_10_end: bool = False, ) -> pd.DataFrame: - """Load diagnoses from all physical visits. If icd_code is a list, will aggregate as one column (e.g. ["E780", "E785"] into a ypercholesterolemia column). + """Load diagnoses from all physical visits. If icd_code is a list, will + aggregate as one column (e.g. ["E780", "E785"] into a + ypercholesterolemia column). Args: icd_code (str): Substring to match diagnoses for. Matches any diagnoses, whether a-diagnosis, b-diagnosis etc. @@ -74,9 +71,6 @@ def from_physical_visits( Returns: pd.DataFrame """ - print_str = f"diagnoses matching ICD-code {icd_code}" - # msg.info(f"Loading {print_str}") - diagnoses_source_table_info = { "lpr3": { "fct": "FOR_LPR3kontakter_psyk_somatik_inkl_2021", @@ -104,7 +98,6 @@ def from_physical_visits( df = pd.concat(dfs) - # msg.good(f"Loaded {print_str}") return df.reset_index(drop=True) def _load( @@ -114,18 +107,24 @@ def _load( output_col_name: str = None, wildcard_icd_10_end: bool = True, ) -> pd.DataFrame: - """Load the visits that have diagnoses that match icd_code from the beginning of their adiagnosekode string. - Aggregates all that match. + """Load the visits that have diagnoses that match icd_code from the + beginning of their adiagnosekode string. Aggregates all that match. Args: - icd_code (Union[List[str], str]): Substring(s) to match diagnoses for. Matches any diagnoses, whether a-diagnosis, b-diagnosis etc. - source_timestamp_col_name (str): Name of the timestamp column in the SQL table. - view (str): Which view to use, e.g. "FOR_Medicin_ordineret_inkl_2021_feb2022" - output_col_name (str, optional): Name of new column string. Defaults to None. - wildcard_icd_10_end (bool, optional): Whether to match on icd_code*. Defaults to true. + icd_code (Union[List[str], str]): Substring(s) to match diagnoses for. + Matches any diagnoses, whether a-diagnosis, b-diagnosis etc. + source_timestamp_col_name (str): Name of the timestamp column in the SQL + table. + view (str): Which view to use, e.g. + "FOR_Medicin_ordineret_inkl_2021_feb2022" + output_col_name (str, optional): Name of new column string. Defaults to + None. + wildcard_icd_10_end (bool, optional): Whether to match on icd_code*. + Defaults to true. 
Returns: - pd.DataFrame: A pandas dataframe with dw_ek_borger, timestamp and output_col_name = 1 + pd.DataFrame: A pandas dataframe with dw_ek_borger, timestamp and + output_col_name = 1 """ fct = f"[{fct}]" @@ -140,11 +139,14 @@ def _load( match_col_sql_str = " OR ".join(match_col_sql_strings) else: - match_col_sql_str = ( + match_col_sql_str = ( # noqa f"lower(diagnosegruppestreng) LIKE lower('%{icd_code}{sql_ending}')" ) - sql = f"SELECT dw_ek_borger, {source_timestamp_col_name}, diagnosegruppestreng FROM [fct].{fct} WHERE ({match_col_sql_str})" + sql = ( + f"SELECT dw_ek_borger, {source_timestamp_col_name}, diagnosegruppestreng" + + " FROM [fct].{fct} WHERE ({match_col_sql_str})" + ) df = sql_load(sql, database="USR_PS_FORSK", chunksize=None) @@ -158,7 +160,7 @@ def _load( return df.rename( columns={ source_timestamp_col_name: "timestamp", - } + }, ) @data_loaders.register("essential_hypertension") diff --git a/src/psycopmlutils/loaders/load_ids.py b/src/psycopmlutils/loaders/load_ids.py index 4536aa9f..185b1bf4 100644 --- a/src/psycopmlutils/loaders/load_ids.py +++ b/src/psycopmlutils/loaders/load_ids.py @@ -1,12 +1,11 @@ import pandas as pd + from psycopmlutils.loaders.sql_load import sql_load -from psycopmlutils.utils import data_loaders -from wasabi import msg class LoadIDs: def load(split: str) -> pd.DataFrame: - """Loads ids for a given split + """Loads ids for a given split. Args: split (str): Which split to load IDs from. Takes either "train", "test" or "val". diff --git a/src/psycopmlutils/loaders/load_lab_results.py b/src/psycopmlutils/loaders/load_lab_results.py index d5145a75..cab73d0f 100644 --- a/src/psycopmlutils/loaders/load_lab_results.py +++ b/src/psycopmlutils/loaders/load_lab_results.py @@ -1,8 +1,7 @@ -import catalogue import pandas as pd + from psycopmlutils.loaders.sql_load import sql_load from psycopmlutils.utils import data_loaders -from wasabi import msg class LoadLabResults: @@ -15,11 +14,11 @@ def blood_sample(blood_sample_id: str) -> pd.DataFrame: Returns: pd.DataFrame """ - print_str = f"blood samples matching NPU-code {blood_sample_id}" - # msg.info(f"Loading {print_str}") - view = "[FOR_labka_alle_blodprover_inkl_2021_feb2022]" - sql = f"SELECT dw_ek_borger, datotid_sidstesvar, numerisksvar FROM [fct].{view} WHERE NPUkode = '{blood_sample_id}'" + sql = ( + f"SELECT dw_ek_borger, datotid_sidstesvar, numerisksvar FROM [fct].{view}" + + " WHERE NPUkode = '{blood_sample_id}'" + ) df = sql_load(sql, database="USR_PS_FORSK", chunksize=None) @@ -32,7 +31,8 @@ def blood_sample(blood_sample_id: str) -> pd.DataFrame: return df.reset_index(drop=True) def _aggregate_blood_samples(blood_sample_ids: list) -> pd.DataFrame: - """Aggregate multiple blood_sample_ids (typically NPU-codes) into one column. + """Aggregate multiple blood_sample_ids (typically NPU-codes) into one + column. Args: blood_sample_ids (list): List of blood_sample_id, typically an NPU-codes. 
@@ -118,7 +118,7 @@ def unscheduled_p_glc(): blood_sample_ids += [f"DNK{suffix}" for suffix in dnk_suffixes] return LoadLabResults._aggregate_blood_samples( - blood_sample_ids=blood_sample_ids + blood_sample_ids=blood_sample_ids, ) @data_loaders.register("triglycerides") @@ -142,7 +142,7 @@ def ldl(): @data_loaders.register("fasting_ldl") def fasting_ldl(): return LoadLabResults._aggregate_blood_samples( - blood_sample_ids=["NPU10171", "AAB00102"] + blood_sample_ids=["NPU10171", "AAB00102"], ) @data_loaders.register("alat") @@ -168,13 +168,13 @@ def crp(): @data_loaders.register("creatinine") def creatinine(): return LoadLabResults._aggregate_blood_samples( - blood_sample_ids=["NPU18016", "ASS00355", "ASS00354"] + blood_sample_ids=["NPU18016", "ASS00355", "ASS00354"], ) @data_loaders.register("egfr") def egfr(): return LoadLabResults._aggregate_blood_samples( - blood_sample_ids=["DNK35302", "DNK35131", "AAB00345", "AAB00343"] + blood_sample_ids=["DNK35302", "DNK35131", "AAB00345", "AAB00343"], ) @data_loaders.register("albumine_creatinine_ratio") diff --git a/src/psycopmlutils/loaders/load_medications.py b/src/psycopmlutils/loaders/load_medications.py index 5a6383ef..a42082ef 100644 --- a/src/psycopmlutils/loaders/load_medications.py +++ b/src/psycopmlutils/loaders/load_medications.py @@ -1,14 +1,16 @@ import pandas as pd -from psycopmlutils.loaders.sql_load import sql_load -from psycopmlutils.utils import data_loaders from wasabi import msg +from psycopmlutils.loaders.sql_load import sql_load + class LoadMedications: def aggregate_medications( - output_col_name: str, atc_code_prefixes: list + output_col_name: str, + atc_code_prefixes: list, ) -> pd.DataFrame: - """Aggregate multiple blood_sample_ids (typically NPU-codes) into one column. + """Aggregate multiple blood_sample_ids (typically NPU-codes) into one + column. Args: output_col_name (str): Name for new column. @@ -19,7 +21,8 @@ def aggregate_medications( """ dfs = [ LoadMedications.load( - blood_sample_id=f"{id}", output_col_name=output_col_name + blood_sample_id=f"{id}", + output_col_name=output_col_name, ) for id in atc_code_prefixes ] @@ -33,25 +36,32 @@ def load( load_administered: bool = True, wildcard_at_end: bool = True, ) -> pd.DataFrame: - """Load medications. Aggregates prescribed/administered if both true. If wildcard_atc_at_end, match from atc_code*. - Aggregates all that match. Beware that data is incomplete prior to sep. 2016 for prescribed medications. + """Load medications. Aggregates prescribed/administered if both true. + If wildcard_atc_at_end, match from atc_code*. Aggregates all that + match. Beware that data is incomplete prior to sep. 2016 for prescribed + medications. Args: - atc_code (str): ATC-code prefix to load. Matches atc_code_prefix*. Aggregates all. - output_col_name (str, optional): Name of output_col_name. Contains 1 if atc_code matches atc_code_prefix, 0 if not.Defaults to {atc_code_prefix}_value. - load_prescribed (bool, optional): Whether to load prescriptions. Defaults to True. Beware incomplete until sep 2016. - load_administered (bool, optional): Whether to load administrations. Defaults to True. - wildcard_atc_at_end (bool, optional): Whether to match on atc_code* or atc_code. + atc_code (str): ATC-code prefix to load. Matches atc_code_prefix*. + Aggregates all. + output_col_name (str, optional): Name of output_col_name. Contains 1 if + atc_code matches atc_code_prefix, 0 if not.Defaults to + {atc_code_prefix}_value. + load_prescribed (bool, optional): Whether to load prescriptions. 
Defaults to + True. Beware incomplete until sep 2016. + load_administered (bool, optional): Whether to load administrations. + Defaults to True. + wildcard_atc_at_end (bool, optional): Whether to match on atc_code* or + atc_code. Returns: pd.DataFrame: Cols: dw_ek_borger, timestamp, {atc_code_prefix}_value = 1 """ - print_str = f"medications matching NPU-code {atc_code}" - # msg.info(f"Loading {print_str}") if load_prescribed: msg.warn( - "Beware, there are missing prescriptions until september 2019. Hereafter, data is complete." + "Beware, there are missing prescriptions until september 2019. " + "Hereafter, data is complete.", ) df = pd.DataFrame() @@ -76,17 +86,16 @@ def load( ) df = pd.concat([df, df_medication_administered]) - if output_col_name == None: + if output_col_name is None: output_col_name = atc_code df.rename( columns={ - atc_code: f"value", + atc_code: "value", }, inplace=True, ) - # msg.good(f"Loaded {print_str}") return df.reset_index(drop=True) def _load_one_source( @@ -96,27 +105,37 @@ def _load_one_source( output_col_name: str = None, wildcard_atc_at_end: bool = False, ) -> pd.DataFrame: - """Load the prescribed medications that match atc. If wildcard_atc_at_end, match from atc_code*. - Aggregates all that match. Beware that data is incomplete prior to sep. 2016 for prescribed medications. + """Load the prescribed medications that match atc. If + wildcard_atc_at_end, match from atc_code*. Aggregates all that match. + Beware that data is incomplete prior to sep. 2016 for prescribed + medications. Args: atc_code (str): ATC string to match on. - source_timestamp_col_name (str): Name of the timestamp column in the SQL table. - view (str): Which view to use, e.g. "FOR_Medicin_ordineret_inkl_2021_feb2022" - output_col_name (str, optional): Name of new column string. Defaults to None. - wildcard_atc_at_end (bool, optional): Whether to match on atc_code* or atc_code. + source_timestamp_col_name (str): Name of the timestamp column in the SQL + table. + view (str): Which view to use, e.g. + "FOR_Medicin_ordineret_inkl_2021_feb2022" + output_col_name (str, optional): Name of new column string. Defaults to + None. + wildcard_atc_at_end (bool, optional): Whether to match on atc_code* or + atc_code. 
Returns: - pd.DataFrame: A pandas dataframe with dw_ek_borger, timestamp and output_col_name = 1 + pd.DataFrame: A pandas dataframe with dw_ek_borger, timestamp and + output_col_name = 1 """ if wildcard_atc_at_end: end_of_sql = "%" else: - end_of_sql = "" + end_of_sql = "" # noqa view = f"[{view}]" - sql = f"SELECT dw_ek_borger, {source_timestamp_col_name}, atc FROM [fct].{view} WHERE (lower(atc)) LIKE lower('{atc_code}{end_of_sql}')" + sql = ( + f"SELECT dw_ek_borger, {source_timestamp_col_name}, atc FROM [fct].{view}" + + " WHERE (lower(atc)) LIKE lower('{atc_code}{end_of_sql}')" + ) df = sql_load(sql, database="USR_PS_FORSK", chunksize=None) @@ -130,5 +149,5 @@ def _load_one_source( return df.rename( columns={ source_timestamp_col_name: "timestamp", - } + }, ) diff --git a/src/psycopmlutils/loaders/load_outcomes.py b/src/psycopmlutils/loaders/load_outcomes.py index 806e9ddb..325cb3d2 100644 --- a/src/psycopmlutils/loaders/load_outcomes.py +++ b/src/psycopmlutils/loaders/load_outcomes.py @@ -1,7 +1,8 @@ +import pandas as pd from wasabi import msg + from psycopmlutils.loaders.sql_load import sql_load from psycopmlutils.utils import data_loaders -import pandas as pd class LoadOutcome: diff --git a/src/psycopmlutils/loaders/load_visits.py b/src/psycopmlutils/loaders/load_visits.py index f477b870..6feca2be 100644 --- a/src/psycopmlutils/loaders/load_visits.py +++ b/src/psycopmlutils/loaders/load_visits.py @@ -1,6 +1,7 @@ -from psycopmlutils.loaders.sql_load import sql_load from wasabi import msg +from psycopmlutils.loaders.sql_load import sql_load + class LoadVisits: def physical_visits_to_psychiatry(): diff --git a/src/psycopmlutils/loaders/sql_load.py b/src/psycopmlutils/loaders/sql_load.py index c34f0815..b6cec102 100644 --- a/src/psycopmlutils/loaders/sql_load.py +++ b/src/psycopmlutils/loaders/sql_load.py @@ -4,7 +4,6 @@ import pandas as pd from sqlalchemy import create_engine -from sqlalchemy.pool import NullPool def sql_load( @@ -14,15 +13,17 @@ def sql_load( chunksize: Optional[int] = None, format_timestamp_cols_to_datetime: bool = True, ) -> Union[pd.DataFrame, Generator[pd.DataFrame, None, None]]: - """Function to load a SQL query. If chunksize is None, all data will be loaded into memory. - Otherwise, will stream the data in chunks of chunksize as a generator + """Function to load a SQL query. If chunksize is None, all data will be + loaded into memory. Otherwise, will stream the data in chunks of chunksize + as a generator. Args: query (str): The SQL query server (str): The BI server database (str): The BI database chunksize (int, optional): Defaults to 1000. - format_timestamp_cols_to_datetime (bool, optional): Whether to format all columns with "datotid" in their name as pandas datetime. Defaults to true. + format_timestamp_cols_to_datetime (bool, optional): Whether to format all + columns with "datotid" in their name as pandas datetime. Defaults to true. Returns: Union[pd.DataFrame, Generator[pd.DataFrame]]: DataFrame or generator of DataFrames @@ -32,19 +33,21 @@ def sql_load( >>> view = "[FOR_SFI_fritekst_resultat_udfoert_i_psykiatrien_aendret_2011]" >>> sql = "SELECT * FROM [fct]." 
+ view >>> df = sql_load(sql, chunksize = None) - """ driver = "SQL Server" params = urllib.parse.quote( - "DRIVER={0};SERVER={1};DATABASE={2};Trusted_Connection=yes".format( - driver, server, database - ) + "DRIVER={};SERVER={};DATABASE={};Trusted_Connection=yes".format( + driver, + server, + database, + ), ) engine = create_engine(f"mssql+pyodbc:///?odbc_connect={params}") conn = engine.connect().execution_options( - stream_results=True, fast_executemany=True + stream_results=True, + fast_executemany=True, ) df = pd.read_sql(query, conn, chunksize=chunksize) diff --git a/src/psycopmlutils/model_performance/__init__.py b/src/psycopmlutils/model_performance/__init__.py index 6fd36429..c7de8188 100644 --- a/src/psycopmlutils/model_performance/__init__.py +++ b/src/psycopmlutils/model_performance/__init__.py @@ -1 +1 @@ -from .model_performance import ModelPerformance +from .model_performance import ModelPerformance # noqa diff --git a/src/psycopmlutils/model_performance/model_performance.py b/src/psycopmlutils/model_performance/model_performance.py index d6ae048d..95990c1c 100644 --- a/src/psycopmlutils/model_performance/model_performance.py +++ b/src/psycopmlutils/model_performance/model_performance.py @@ -5,14 +5,6 @@ import numpy as np import pandas as pd -from psycopmlutils.model_performance.utils import ( - add_metadata_cols, - aggregate_predictions, - get_metadata_cols, - idx_to_class, - labels_to_int, - scores_to_probs, -) from sklearn.metrics import ( accuracy_score, confusion_matrix, @@ -22,6 +14,15 @@ roc_auc_score, ) +from psycopmlutils.model_performance.utils import ( + add_metadata_cols, + aggregate_predictions, + get_metadata_cols, + idx_to_class, + labels_to_int, + scores_to_probs, +) + class ModelPerformance: """Class to generate model performances.""" @@ -87,7 +88,9 @@ def performance_metrics_from_df( if metadata_col_names: # Add metadata if specified metadata = get_metadata_cols( - df, metadata_col_names, skip=[prediction_col_name, label_col_name] + df, + metadata_col_names, + skip=[prediction_col_name, label_col_name], ) performance = add_metadata_cols(performance, metadata) return performance @@ -123,7 +126,7 @@ def performance_metrics_from_file( path = Path(path) if path.suffix != ".jsonl": raise ValueError( - f"Only .jsonl files are supported for import, not {path.suffix}" + f"Only .jsonl files are supported for import, not {path.suffix}", ) df = pd.read_json(path, orient="records", lines=True) return ModelPerformance.performance_metrics_from_df( @@ -147,7 +150,8 @@ def performance_metrics_from_folder( id2label: Optional[Dict[int, str]] = None, to_wide=False, ) -> pd.DataFrame: - """Load and calculates performance metrics for all files matching a pattern in a folder. + """Load and calculates performance metrics for all files matching a + pattern in a folder. Only supports jsonl for now. @@ -187,7 +191,8 @@ def _evaluate_single_model( to_wide: bool, id2label: Dict[int, str] = None, ) -> pd.DataFrame: - """Calculate performance metrics from a dataframe. Optionally adds aggregated performance by id. + """Calculate performance metrics from a dataframe. Optionally adds + aggregated performance by id. 
Args: df (pd.DataFrame): Dataframe with one prediction per row @@ -205,7 +210,10 @@ def _evaluate_single_model( """ if aggregate_by_id: df = aggregate_predictions( - df, id_col_name, prediction_col_name, label_col_name + df, + id_col_name, + prediction_col_name, + label_col_name, ) # get predicted labels @@ -216,7 +224,9 @@ def _evaluate_single_model( predictions = np.round(df[prediction_col_name]) metrics = ModelPerformance.compute_metrics( - df[label_col_name], predictions, to_wide + df[label_col_name], + predictions, + to_wide, ) # calculate roc if binary model @@ -238,9 +248,12 @@ def _evaluate_single_model( @staticmethod def calculate_roc_auc( - labels: Union[pd.Series, List], predicted: Union[pd.Series, List], to_wide: bool + labels: Union[pd.Series, List], + predicted: Union[pd.Series, List], + to_wide: bool, ) -> pd.DataFrame: - """Calculate the area under the receiver operating characteristic curve. + """Calculate the area under the receiver operating characteristic + curve. Potentially extendable to calculate other metrics that require probabilities instead of label predictions @@ -257,7 +270,7 @@ def calculate_roc_auc( return pd.DataFrame([{"auc-overall": roc_auc}]) else: return pd.DataFrame( - [{"class": "overall", "score_type": "auc", "value": roc_auc}] + [{"class": "overall", "score_type": "auc", "value": roc_auc}], ) @staticmethod @@ -266,7 +279,8 @@ def compute_metrics( predicted: Union[pd.Series, List[Union[str, int]]], to_wide: bool, ) -> pd.DataFrame: - """Compute a whole bunch of performance metrics for both binary and multiclass tasks. + """Compute a whole bunch of performance metrics for both binary and + multiclass tasks. Args: labels (Union[pd.Series, List]): True class @@ -284,16 +298,24 @@ def compute_metrics( performance["f1_macro-overall"] = f1_score(labels, predicted, average="macro") performance["f1_micro-overall"] = f1_score(labels, predicted, average="micro") performance["precision_macro-overall"] = precision_score( - labels, predicted, average="macro" + labels, + predicted, + average="macro", ) performance["precision_micro-overall"] = precision_score( - labels, predicted, average="micro" + labels, + predicted, + average="micro", ) performance["recall_macro-overall"] = recall_score( - labels, predicted, average="macro" + labels, + predicted, + average="macro", ) performance["recall_micro-overall"] = recall_score( - labels, predicted, average="micro" + labels, + predicted, + average="micro", ) performance["confusion_matrix-overall"] = confusion_matrix(labels, predicted) @@ -315,7 +337,9 @@ def compute_metrics( performance = pd.melt(performance) # split score and class into two columns performance[["score_type", "class"]] = performance["variable"].str.split( - "-", 1, expand=True + "-", + 1, + expand=True, ) # drop unused columns and rearrange performance = performance[["class", "score_type", "value"]] @@ -343,7 +367,7 @@ def compute_metrics( ], "label": ["ASD", "ASD", "DEPR", "DEPR", "TD", "TD", "SCHZ", "SCHZ"], "model_name": ["test"] * 8, - } + }, ) id2label = {0: "ASD", 1: "DEPR", 2: "TD", 3: "SCHZ"} @@ -364,7 +388,7 @@ def compute_metrics( "label": ["TD", "TD", "DEPR", "DEPR"], "optional_grouping1": ["grouping1"] * 4, "optional_grouping2": ["grouping2"] * 4, - } + }, ) binary_res = ModelPerformance.performance_metrics_from_df( diff --git a/src/psycopmlutils/model_performance/utils.py b/src/psycopmlutils/model_performance/utils.py index 50044049..94e7d360 100644 --- a/src/psycopmlutils/model_performance/utils.py +++ 
b/src/psycopmlutils/model_performance/utils.py @@ -1,4 +1,4 @@ -from typing import List, TypeVar, Union, Dict +from typing import Dict, List, TypeVar, Union import numpy as np import pandas as pd @@ -12,7 +12,8 @@ def scores_to_probs(scores: Union[SeriesListOfFloats, SeriesOfFloats]) -> Series: - """Converts a series of lists of probabilities for each class or a list of floats for binary classification a list of floats of maximum length 2. + """Converts a series of lists of probabilities for each class or a list of + floats for binary classification a list of floats of maximum length 2. Args: scores (Union[Series[List[float]], Series[float]]): Series containing probabilities for each class or a list of floats for binary classification. @@ -28,7 +29,8 @@ def scores_to_probs(scores: Union[SeriesListOfFloats, SeriesOfFloats]) -> Series def labels_to_int( - labels: Union[SeriesOfStr, SeriesOfInt], label2id: Dict[str, int] + labels: Union[SeriesOfStr, SeriesOfInt], + label2id: Dict[str, int], ) -> Series: """Converts label to int mapping. Only makes sense for binary models. If already int will return as is. @@ -48,7 +50,10 @@ def labels_to_int( def aggregate_predictions( - df: pd.DataFrame, id_col: str, predictions_col: str, label_col: str + df: pd.DataFrame, + id_col: str, + predictions_col: str, + label_col: str, ): """Calculates the mean prediction by a grouping col (id_col). @@ -67,12 +72,12 @@ def get_first_entry(x: pd.Series): return x.unique()[0] return df.groupby(id_col).agg( - {predictions_col: mean_scores, label_col: get_first_entry} + {predictions_col: mean_scores, label_col: get_first_entry}, ) def idx_to_class(idx: List[int], mapping: dict) -> List[str]: - """Returns the label from an id2label mapping + """Returns the label from an id2label mapping. Args: idx (List[int]): index @@ -85,9 +90,11 @@ def idx_to_class(idx: List[int], mapping: dict) -> List[str]: def get_metadata_cols( - df: pd.DataFrame, cols: List[str], skip: List[str] + df: pd.DataFrame, + cols: List[str], + skip: List[str], ) -> pd.DataFrame: - """Extracts model metadata to a 1 row dataframe + """Extracts model metadata to a 1 row dataframe. Args: df (pd.DataFrame): Dataframe with predictions and metadata. @@ -125,18 +132,19 @@ def get_metadata_cols( val = df[col].unique() if len(val) > 1: raise ValueError( - f"The column '{col}' contains more than one unique value." + f"The column '{col}' contains more than one unique value.", ) metadata[col] = val[0] else: raise ValueError( - f"The metadata column '{col}' is not contained in the data" + f"The metadata column '{col}' is not contained in the data", ) return pd.DataFrame.from_records([metadata]) def add_metadata_cols(df: pd.DataFrame, metadata: pd.DataFrame) -> pd.DataFrame: - """Adds 1 row dataframe with metadata to the long format performance dataframe + """Adds 1 row dataframe with metadata to the long format performance + dataframe. Args: df (pd.DataFrame): Dataframe to add metadata to. 
diff --git a/src/psycopmlutils/timeseriesflattener/__init__.py b/src/psycopmlutils/timeseriesflattener/__init__.py index 496e2ae4..a0bbe874 100644 --- a/src/psycopmlutils/timeseriesflattener/__init__.py +++ b/src/psycopmlutils/timeseriesflattener/__init__.py @@ -1,2 +1,2 @@ -from .create_feature_combinations import create_feature_combinations -from .flattened_dataset import FlattenedDataset +from .create_feature_combinations import create_feature_combinations # noqa +from .flattened_dataset import FlattenedDataset # noqa diff --git a/src/psycopmlutils/timeseriesflattener/create_feature_combinations.py b/src/psycopmlutils/timeseriesflattener/create_feature_combinations.py index 5528f3c8..59555a4f 100644 --- a/src/psycopmlutils/timeseriesflattener/create_feature_combinations.py +++ b/src/psycopmlutils/timeseriesflattener/create_feature_combinations.py @@ -2,7 +2,7 @@ def list_has_dict_with_list_as_value( - list_of_dicts: List[Dict[str, Union[str, list]]] + list_of_dicts: List[Dict[str, Union[str, list]]], ) -> bool: """Checks if any dict in a list of dicts has a value that is a list. @@ -20,9 +20,7 @@ def list_has_dict_with_list_as_value( def dict_has_list_in_any_value(dict: Dict[str, Union[str, list]]) -> bool: - """ - Checks if a dict has any values that are lists - """ + """Checks if a dict has any values that are lists.""" for value in dict.values(): if type(value) == list: return True @@ -32,7 +30,8 @@ def dict_has_list_in_any_value(dict: Dict[str, Union[str, list]]) -> bool: def create_feature_combinations( arg_sets: List[Dict[str, Union[str, list]]], ) -> List[Dict[str, Union[str, float, int]]]: - """Create feature combinations from a dictionary of feature specifications. See example for shape. + """Create feature combinations from a dictionary of feature specifications. + See example for shape. Args: arg_sets (List[Dict[str, Union[str, list]]]): A set of argument sets for .add_predictor. See example for shape. diff --git a/src/psycopmlutils/timeseriesflattener/flattened_dataset.py b/src/psycopmlutils/timeseriesflattener/flattened_dataset.py index 9d6d4631..eb606a37 100644 --- a/src/psycopmlutils/timeseriesflattener/flattened_dataset.py +++ b/src/psycopmlutils/timeseriesflattener/flattened_dataset.py @@ -6,9 +6,10 @@ import pandas as pd from catalogue import Registry # noqa from pandas import DataFrame +from wasabi import msg + from psycopmlutils.timeseriesflattener.resolve_multiple_functions import resolve_fns from psycopmlutils.utils import data_loaders -from wasabi import msg class FlattenedDataset: @@ -21,7 +22,8 @@ def __init__( timestamp_col_name: str = "timestamp", n_workers: int = 60, ): - """Class containing a time-series, flattened. A 'flattened' version is a tabular representation for each prediction time. + """Class containing a time-series, flattened. A 'flattened' version is + a tabular representation for each prediction time. A prediction time is every timestamp where you want your model to issue a prediction. 
@@ -60,7 +62,7 @@ def __init__( for col_name in [self.timestamp_col_name, self.id_col_name]: if col_name not in self.df.columns: raise ValueError( - f"{col_name} does not exist in prediction_times_df, change the df or set another argument" + f"{col_name} does not exist in prediction_times_df, change the df or set another argument", ) # Check timestamp col type @@ -69,16 +71,16 @@ def __init__( if timestamp_col_type not in ["Timestamp"]: try: self.df[self.timestamp_col_name] = pd.to_datetime( - self.df[self.timestamp_col_name] + self.df[self.timestamp_col_name], ) - except: + except Exception: raise ValueError( - f"prediction_times_df: {self.timestamp_col_name} is of type {timestamp_col_type}, and could not be converted to 'Timestamp' from Pandas. Will cause problems. Convert before initialising FlattenedDataset." + f"prediction_times_df: {self.timestamp_col_name} is of type {timestamp_col_type}, and could not be converted to 'Timestamp' from Pandas. Will cause problems. Convert before initialising FlattenedDataset.", ) # Create pred_time_uuid_columne self.df[self.pred_time_uuid_col_name] = self.df[self.id_col_name].astype( - str + str, ) + self.df[self.timestamp_col_name].dt.strftime("-%Y-%m-%d-%H-%M-%S") self.loaders_catalogue = data_loaders @@ -141,19 +143,19 @@ def add_temporal_predictors_from_list_of_argument_dictionaries( if resolve_multiple_fns is not None: try: resolved_func = resolve_multiple_fns.get( - [arg_dict["resolve_multiple"]] + [arg_dict["resolve_multiple"]], ) - except: + except Exception: pass try: resolved_func = resolve_fns.get(arg_dict["resolve_multiple"]) - except: + except Exception: pass if not isinstance(resolved_func, Callable): raise ValueError( - "resolve_function neither is nor resolved to a Callable" + "resolve_function neither is nor resolved to a Callable", ) # Rename arguments for create_flattened_df_for_val @@ -181,7 +183,7 @@ def add_temporal_predictors_from_list_of_argument_dictionaries( try: arg_dict["values_df"] = predictor_dfs[arg_dict["values_df"]] - except: + except Exception: # Error handling in _validate_processed_arg_dicts # to handle in bulk pass @@ -197,7 +199,7 @@ def add_temporal_predictors_from_list_of_argument_dictionaries( ] processed_arg_dicts.append( - select_and_assert_keys(dictionary=arg_dict, key_list=required_keys) + select_and_assert_keys(dictionary=arg_dict, key_list=required_keys), ) # Validate dicts before starting pool, saves time if errors! @@ -206,7 +208,8 @@ def add_temporal_predictors_from_list_of_argument_dictionaries( pool = Pool(self.n_workers) flattened_predictor_dfs = pool.map( - self._flatten_temporal_values_to_df_wrapper, processed_arg_dicts + self._flatten_temporal_values_to_df_wrapper, + processed_arg_dicts, ) flattened_predictor_dfs = [ @@ -235,7 +238,7 @@ def _validate_processed_arg_dicts(self, arg_dicts: list): for d in arg_dicts: if not isinstance(d["values_df"], (DataFrame, Callable)): msg.warn( - f"values_df resolves to neither a Callable nor a DataFrame in {d}" + f"values_df resolves to neither a Callable nor a DataFrame in {d}", ) warn = True @@ -249,11 +252,12 @@ def _validate_processed_arg_dicts(self, arg_dicts: list): if warn: raise ValueError( - "Errors in argument dictionaries, didn't generate any features." + "Errors in argument dictionaries, didn't generate any features.", ) def _flatten_temporal_values_to_df_wrapper(self, kwargs_dict: Dict) -> DataFrame: - """Wrap flatten_temporal_values_to_df with kwargs for multithreading pool. 
+ """Wrap flatten_temporal_values_to_df with kwargs for multithreading + pool. Args: kwargs_dict (Dict): Dictionary of kwargs @@ -290,9 +294,9 @@ def add_age( id_to_date_of_birth_mapping[date_of_birth_col_name], format="%Y-%m-%d", ) - except: + except Exception: raise ValueError( - f"Conversion of {date_of_birth_col_name} to datetime failed, doesn't match format %Y-%m-%d. Recommend converting to datetime before adding." + f"Conversion of {date_of_birth_col_name} to datetime failed, doesn't match format %Y-%m-%d. Recommend converting to datetime before adding.", ) self.add_static_predictor(id_to_date_of_birth_mapping) @@ -397,7 +401,8 @@ def add_temporal_predictor( source_values_col_name: str = "value", new_col_name: str = None, ): - """Add a column with predictor values to the flattened dataset (e.g. "average value of bloodsample within n days"). + """Add a column with predictor values to the flattened dataset (e.g. + "average value of bloodsample within n days"). Args: predictor_df (DataFrame): A table in wide format. Required columns: patient_id, timestamp, value. @@ -429,7 +434,8 @@ def add_temporal_col_to_flattened_dataset( is_fallback_prop_warning_threshold: float = 0.9, keep_val_timestamp: bool = False, ): - """Add a column to the dataset (either predictor or outcome depending on the value of "direction"). + """Add a column to the dataset (either predictor or outcome depending + on the value of "direction"). Args: values_df (DataFrame): A table in wide format. Required columns: patient_id, timestamp, value. @@ -448,7 +454,7 @@ def add_temporal_col_to_flattened_dataset( if timestamp_col_type not in ["Timestamp"]: raise ValueError( - f"{self.timestamp_col_name} is of type {timestamp_col_type}, not 'Timestamp' from Pandas. Will cause problems. Convert before initialising FlattenedDataset." + f"{self.timestamp_col_name} is of type {timestamp_col_type}, not 'Timestamp' from Pandas. Will cause problems. Convert before initialising FlattenedDataset.", ) df = FlattenedDataset.flatten_temporal_values_to_df( @@ -487,32 +493,42 @@ def flatten_temporal_values_to_df( keep_val_timestamp: bool = False, ) -> DataFrame: - """Create a dataframe with flattened values (either predictor or outcome depending on the value of "direction"). + """Create a dataframe with flattened values (either predictor or + outcome depending on the value of "direction"). Args: - prediction_times_with_uuid_df (DataFrame): Dataframe with id_col and timestamps for each prediction time. - values_df (Union[Callable, DataFrame]): A dataframe or callable resolving to a dataframe containing id_col, timestamp and value cols. + prediction_times_with_uuid_df (DataFrame): Dataframe with id_col and + timestamps for each prediction time. + values_df (Union[Callable, DataFrame]): A dataframe or callable resolving to + a dataframe containing id_col, timestamp and value cols. direction (str): Whether to look "ahead" or "behind" the prediction time. interval_days (float): How far to look in each direction. - resolve_multiple (Union[Callable, str]): How to handle multiple values within interval_days. Takes either + resolve_multiple (Union[Callable, str]): How to handle multiple values + within interval_days. Takes either i) a function that takes a list as an argument and returns a float, or ii) a str mapping to a callable from the resolve_multiple_fn catalogue. - fallback (Union[float, str]): Which value to put if no value within the lookahead. "NaN" for Pandas NA. 
- id_col_name (str): Name of id_column in prediction_times_with_uuid_df and values_df. - Required because this is a static method. - timestamp_col_name (str): Name of timestamp column in prediction_times_with_uuid_df and values_df. - Required because this is a static method. - pred_time_uuid_col_name (str): Name of uuid column in prediction_times_with_uuid_df. - Required because this is a static method. - new_col_name (Optional[str], optional): Name of new column in returned dataframe. . - source_values_col_name (str, optional): Name of column containing values in values_df. Defaults to "value". - is_fallback_prop_warning_threshold (float, optional): Triggers a ValueError if proportion of - prediction_times that receive fallback is larger than threshold. - Indicates unlikely to be a learnable feature. Defaults to 0.9. - keep_val_timestamp (bool, optional): Whether to keep the timestamp for the temporal value as a separate column. Defaults to False. + fallback (Union[float, str]): Which value to put if no value within the + lookahead. "NaN" for Pandas NA. + id_col_name (str): Name of id_column in prediction_times_with_uuid_df and + values_df. Required because this is a static method. + timestamp_col_name (str): Name of timestamp column in + prediction_times_with_uuid_df and values_df. Required because this is a + static method. + pred_time_uuid_col_name (str): Name of uuid column in + prediction_times_with_uuid_df. Required because this is a static method. + new_col_name (Optional[str], optional): Name of new column in returned + dataframe. + source_values_col_name (str, optional): Name of column containing values in + values_df. Defaults to "value". + is_fallback_prop_warning_threshold (float, optional): Triggers a ValueError + if proportion of prediction_times that receive fallback is larger than + threshold. Indicates unlikely to be a learnable feature. Defaults to + 0.9. + keep_val_timestamp (bool, optional): Whether to keep the timestamp for the + temporal value as a separate column. Defaults to False. Returns: - DataFrame: + DataFrame """ # Resolve values_df if not already a dataframe. @@ -525,7 +541,7 @@ def flatten_temporal_values_to_df( for col_name in [timestamp_col_name, id_col_name]: if col_name not in values_df.columns: raise ValueError( - f"{col_name} does not exist in df_prediction_times, change the df or set another argument" + f"{col_name} does not exist in df_prediction_times, change the df or set another argument", ) # Rename column @@ -584,7 +600,7 @@ def flatten_temporal_values_to_df( > is_fallback_prop_warning_threshold ): msg.warn( - f"""{full_col_str}: Beware, {prop_of_values_that_are_fallback*100}% of rows contain the fallback value, indicating that it is unlikely to be a learnable feature. Consider redefining. You can generate the feature anyway by passing an is_fallback_prop_warning_threshold argument with a higher threshold or None.""" + f"""{full_col_str}: Beware, {prop_of_values_that_are_fallback*100}% of rows contain the fallback value, indicating that it is unlikely to be a learnable feature. Consider redefining. 
You can generate the feature anyway by passing an is_fallback_prop_warning_threshold argument with a higher threshold or None.""", ) if low_variance_threshold is not None: @@ -593,7 +609,7 @@ def flatten_temporal_values_to_df( ) if variance_as_fraction_of_mean < low_variance_threshold: msg.warn( - f"""{full_col_str}: Beware, variance / mean < low_variance_threshold ({variance_as_fraction_of_mean} < {low_variance_threshold}), indicating high risk of overfitting. Consider redefining. You can generate the feature anyway by passing an low_variance_threshold argument with a lower threshold or None.""" + f"""{full_col_str}: Beware, variance / mean < low_variance_threshold ({variance_as_fraction_of_mean} < {low_variance_threshold}), indicating high risk of overfitting. Consider redefining. You can generate the feature anyway by passing an low_variance_threshold argument with a lower threshold or None.""", ) msg.good(f"Returning flattened dataframe with {full_col_str}") @@ -619,7 +635,8 @@ def add_back_prediction_times_without_value( pred_times_with_uuid: DataFrame, pred_time_uuid_colname: str, ) -> DataFrame: - """Ensure all prediction times are represented in the returned dataframe. + """Ensure all prediction times are represented in the returned + dataframe. Args: df (DataFrame): @@ -642,7 +659,8 @@ def resolve_multiple_values_within_interval_days( timestamp_col_name: str, pred_time_uuid_colname: str, ) -> DataFrame: - """Apply the resolve_multiple function to prediction_times where there are multiple values within the interval_days lookahead. + """Apply the resolve_multiple function to prediction_times where there + are multiple values within the interval_days lookahead. Args: resolve_multiple (Callable): Takes a grouped df and collapses each group to one record (e.g. sum, count etc.). @@ -672,7 +690,8 @@ def drop_records_outside_interval_days( timestamp_pred_colname: str, timestamp_value_colname: str, ) -> DataFrame: - """Keep only rows where timestamp_value is within interval_days in direction of timestamp_pred. + """Keep only rows where timestamp_value is within interval_days in + direction of timestamp_pred. Args: direction (str): Whether to look ahead or behind. @@ -705,13 +724,15 @@ def drop_records_outside_interval_days( else: raise ValueError("direction can only be 'ahead' or 'behind'") - return df[df["is_in_interval"] == True].drop( - ["is_in_interval", "time_from_pred_to_val_in_days"], axis=1 + return df[df["is_in_interval"]].drop( + ["is_in_interval", "time_from_pred_to_val_in_days"], + axis=1, ) def select_and_assert_keys(dictionary: Dict, key_list: List[str]) -> Dict: - """Keep only the keys in the dictionary that are in key_order, and orders them as in the lsit. + """Keep only the keys in the dictionary that are in key_order, and orders + them as in the lsit. 
Args: dict (Dict): Dictionary to process diff --git a/src/psycopmlutils/writers/sql_writer.py b/src/psycopmlutils/writers/sql_writer.py index 43e31392..17150c55 100644 --- a/src/psycopmlutils/writers/sql_writer.py +++ b/src/psycopmlutils/writers/sql_writer.py @@ -1,12 +1,9 @@ import urllib import urllib.parse -from multiprocessing.sharedctypes import Value import pandas as pd from sqlalchemy import create_engine -from sqlalchemy.pool import NullPool from tqdm import tqdm -from wasabi import msg def chunker(seq, size): @@ -15,7 +12,11 @@ def chunker(seq, size): def insert_with_progress( - df: pd.DataFrame, table_name: str, conn, rows_per_chunk: int, if_exists: str + df: pd.DataFrame, + table_name: str, + conn, + rows_per_chunk: int, + if_exists: str, ): """Chunk dataframe and insert each chunk, showing a progress bar. @@ -62,7 +63,7 @@ def write_df_to_sql( driver = "SQL Server" params = urllib.parse.quote( - f"DRIVER={driver};SERVER={server};DATABASE={database};Trusted_Connection=yes" + f"DRIVER={driver};SERVER={server};DATABASE={database};Trusted_Connection=yes", ) url = f"mssql+pyodbc:///?odbc_connect={params}" diff --git a/tests/test_model_performance/test_model_performance.py b/tests/test_model_performance/test_model_performance.py index 439c849e..b06a2eb4 100644 --- a/tests/test_model_performance/test_model_performance.py +++ b/tests/test_model_performance/test_model_performance.py @@ -2,8 +2,8 @@ import pandas as pd import pytest + from psycopmlutils.model_performance import ModelPerformance -from sklearn.model_selection import PredefinedSplit @pytest.fixture(scope="function") @@ -27,7 +27,7 @@ def multiclass_df(): ], "label": ["ASD", "ASD", "DEPR", "DEPR", "TD", "TD", "SCHZ", "SCHZ"], "model_name": ["test"] * 8, - } + }, ) @@ -40,7 +40,7 @@ def binary_df(): "label": ["TD", "TD", "DEPR", "DEPR"], "optional_grouping1": ["grouping1"] * 4, "optional_grouping2": ["grouping2"] * 4, - } + }, ) @@ -97,7 +97,8 @@ def test_binary_transform_from_dataframe(binary_df, binary_score_mapping): def test_binary_transform_from_dataframe_with_float( - binary_float_df, binary_score_mapping + binary_float_df, + binary_score_mapping, ): res = ModelPerformance.performance_metrics_from_df( diff --git a/tests/test_timeseriesflattener/test_add_values.py b/tests/test_timeseriesflattener/test_add_values.py index da524a71..023c4856 100644 --- a/tests/test_timeseriesflattener/test_add_values.py +++ b/tests/test_timeseriesflattener/test_add_values.py @@ -1,20 +1,17 @@ import numpy as np import pandas as pd import pytest -from psycopmlutils.timeseriesflattener import ( - FlattenedDataset, - create_feature_combinations, -) -from psycopmlutils.timeseriesflattener.resolve_multiple_functions import ( - get_max_in_group, -) - from utils_for_testing import ( assert_flattened_outcome_as_expected, assert_flattened_predictor_as_expected, str_to_df, ) +from psycopmlutils.timeseriesflattener import ( + FlattenedDataset, + create_feature_combinations, +) + # Predictors def test_predictor_after_prediction_time(): @@ -124,7 +121,8 @@ def test_raise_error_if_timestamp_col_not_timestamp_type(): """ df_prediction_times = str_to_df( - prediction_times_df_str, convert_timestamp_to_datetime=True + prediction_times_df_str, + convert_timestamp_to_datetime=True, ) df_event_times = str_to_df(outcome_df_str, convert_timestamp_to_datetime=False) @@ -136,7 +134,10 @@ def test_raise_error_if_timestamp_col_not_timestamp_type(): with pytest.raises(ValueError): dataset.add_temporal_outcome( - df_event_times, lookahead_days=5, 
resolve_multiple="max", fallback=0 + df_event_times, + lookahead_days=5, + resolve_multiple="max", + fallback=0, ) @@ -201,8 +202,8 @@ def test_static_predictor(): "1994-12-31 00:00:01", "1994-12-31 00:00:01", "1994-12-31 00:00:01", - ] - } + ], + }, ) pd.testing.assert_series_equal( @@ -234,8 +235,8 @@ def test_add_age(): 0.0, 27.0, 27.0, - ] - } + ], + }, ) pd.testing.assert_series_equal( @@ -417,11 +418,12 @@ def test_add_temporal_predictors_then_temporal_outcome(): "resolve_multiple": "min", "fallback": np.nan, }, - ] + ], ) flattened_dataset.add_temporal_predictors_from_list_of_argument_dictionaries( - predictors=PREDICTOR_LIST, predictor_dfs={"predictors": predictors_df} + predictors=PREDICTOR_LIST, + predictor_dfs={"predictors": predictors_df}, ) flattened_dataset.add_temporal_outcome( diff --git a/tests/test_timeseriesflattener/test_create_feature_combinations.py b/tests/test_timeseriesflattener/test_create_feature_combinations.py index e619d3ed..18fc3085 100644 --- a/tests/test_timeseriesflattener/test_create_feature_combinations.py +++ b/tests/test_timeseriesflattener/test_create_feature_combinations.py @@ -13,7 +13,7 @@ def test_skip_all_if_no_need_to_process(): "lookbehind_days": 1, "resolve_multiple": "max", "fallback": 0, - } + }, ] assert create_feature_combinations(input) == input @@ -64,7 +64,7 @@ def test_list_has_dict_with_list_as_val(): "resolve_multiple": "max", "fallback": 0, "source_values_col_name": "val", - } + }, ] assert list_has_dict_with_list_as_value(test_pos_dataset) @@ -75,7 +75,7 @@ def test_list_has_dict_with_list_as_val(): "resolve_multiple": "max", "fallback": 0, "source_values_col_name": "val", - } + }, ] assert not list_has_dict_with_list_as_value(test_neg_dataset) @@ -109,7 +109,7 @@ def test_create_feature_combinations(): "lookbehind_days": [1, 30], "resolve_multiple": "max", "fallback": 0, - } + }, ] expected_output = [ diff --git a/tests/test_timeseriesflattener/test_errors.py b/tests/test_timeseriesflattener/test_errors.py index 62cc3b66..61723dfb 100644 --- a/tests/test_timeseriesflattener/test_errors.py +++ b/tests/test_timeseriesflattener/test_errors.py @@ -1,8 +1,8 @@ import pytest -from psycopmlutils.timeseriesflattener.flattened_dataset import FlattenedDataset - from utils_for_testing import str_to_df +from psycopmlutils.timeseriesflattener.flattened_dataset import FlattenedDataset + def test_col_does_not_exist_in_prediction_times(): prediction_times_str = """dw_ek_borger, @@ -12,7 +12,7 @@ def test_col_does_not_exist_in_prediction_times(): prediction_times_df = str_to_df(prediction_times_str) with pytest.raises(ValueError): - flattened_df = FlattenedDataset( + flattened_df = FlattenedDataset( # noqa prediction_times_df=prediction_times_df, timestamp_col_name="timestamp", id_col_name="dw_ek_borger", diff --git a/tests/test_timeseriesflattener/test_resolve_multiple.py b/tests/test_timeseriesflattener/test_resolve_multiple.py index 5161f1b8..c2530b5c 100644 --- a/tests/test_timeseriesflattener/test_resolve_multiple.py +++ b/tests/test_timeseriesflattener/test_resolve_multiple.py @@ -1,4 +1,9 @@ -from psycopmlutils.timeseriesflattener.resolve_multiple_functions import ( +from utils_for_testing import ( + assert_flattened_outcome_as_expected, + assert_flattened_predictor_as_expected, +) + +from psycopmlutils.timeseriesflattener.resolve_multiple_functions import ( # noqa get_earliest_value_in_group, get_latest_value_in_group, get_max_in_group, @@ -6,11 +11,6 @@ get_min_in_group, ) -from utils_for_testing import ( - 
assert_flattened_outcome_as_expected, - assert_flattened_predictor_as_expected, -) - def test_resolve_multiple_catalogue(): prediction_times_str = """dw_ek_borger,timestamp, diff --git a/tests/test_timeseriesflattener/test_wrapper.py b/tests/test_timeseriesflattener/test_wrapper.py index cee7a604..3dbce6f6 100644 --- a/tests/test_timeseriesflattener/test_wrapper.py +++ b/tests/test_timeseriesflattener/test_wrapper.py @@ -1,13 +1,12 @@ import pandas as pd import pytest from pandas.testing import assert_frame_equal +from utils_for_testing import str_to_df + from psycopmlutils.timeseriesflattener.create_feature_combinations import ( create_feature_combinations, ) from psycopmlutils.timeseriesflattener.flattened_dataset import FlattenedDataset -from psycopmlutils.utils import data_loaders - -from utils_for_testing import load_event_times, str_to_df def test_generate_two_features_from_dict(): @@ -23,7 +22,7 @@ def test_generate_two_features_from_dict(): """ expected_df_str = """dw_ek_borger,timestamp,event_times_df_within_1_days_max_fallback_0,event_times_df_within_2_days_max_fallback_0,event_times_df_within_3_days_max_fallback_0,event_times_df_within_4_days_max_fallback_0 - 1,2021-12-31 00:00:00,1,2,2,2 + 1,2021-12-31 00:00:00,1,2,2,2 """ prediction_times_df = str_to_df(prediction_times_str) @@ -46,7 +45,7 @@ def test_generate_two_features_from_dict(): "fallback": 0, "source_values_col_name": "val", }, - ] + ], ) flattened_dataset.add_temporal_predictors_from_list_of_argument_dictionaries( @@ -106,7 +105,7 @@ def test_output_independent_of_order_of_input(): "fallback": 0, "source_values_col_name": "val", }, - ] + ], ) predictor_list2 = create_feature_combinations( @@ -118,7 +117,7 @@ def test_output_independent_of_order_of_input(): "fallback": 0, "source_values_col_name": "val", }, - ] + ], ) predictor_str = """dw_ek_borger,timestamp,value, @@ -143,10 +142,10 @@ def test_output_independent_of_order_of_input(): # We don't care about indeces. Sort to match the ordering. 
assert_frame_equal( flattened_dataset1.df.sort_values(["dw_ek_borger", "timestamp"]).reset_index( - drop=True + drop=True, ), flattened_dataset2.df.sort_values(["dw_ek_borger", "timestamp"]).reset_index( - drop=True + drop=True, ), check_index_type=False, check_like=True, @@ -161,7 +160,7 @@ def test_add_df_from_catalogue(): """ expected_df_str = """dw_ek_borger,timestamp,load_event_times_within_1_days_max_fallback_0,load_event_times_within_2_days_max_fallback_0,load_event_times_within_3_days_max_fallback_0,load_event_times_within_4_days_max_fallback_0 - 1,2021-12-31 00:00:00,1,2,2,2 + 1,2021-12-31 00:00:00,1,2,2,2 """ prediction_times_df = str_to_df(prediction_times_str) @@ -183,7 +182,7 @@ def test_add_df_from_catalogue(): "fallback": 0, "source_values_col_name": "val", }, - ] + ], ) flattened_dataset.add_temporal_predictors_from_list_of_argument_dictionaries( @@ -206,12 +205,7 @@ def test_wrong_formatting(): 1,2021-12-31 00:00:00 """ - expected_df_str = """dw_ek_borger,timestamp,event_times_df_within_1_days_max_fallback_0,event_times_df_within_2_days_max_fallback_0,event_times_df_within_3_days_max_fallback_0,event_times_df_within_4_days_max_fallback_0 - 1,2021-12-31 00:00:00,1,2,2,2 - """ - prediction_times_df = str_to_df(prediction_times_str) - expected_df = str_to_df(expected_df_str) predictor_str = """dw_ek_borger,timestamp,value, 1,2021-12-30 00:00:01, 1 diff --git a/tests/test_timeseriesflattener/utils_for_testing.py b/tests/test_timeseriesflattener/utils_for_testing.py index 9314b6a3..820759e4 100644 --- a/tests/test_timeseriesflattener/utils_for_testing.py +++ b/tests/test_timeseriesflattener/utils_for_testing.py @@ -2,6 +2,7 @@ import pandas as pd from pandas import DataFrame + from psycopmlutils.timeseriesflattener.flattened_dataset import FlattenedDataset from psycopmlutils.utils import data_loaders @@ -19,7 +20,8 @@ def str_to_df(str, convert_timestamp_to_datetime: bool = True) -> DataFrame: def convert_cols_with_matching_colnames_to_datetime( - df: DataFrame, colname_substr: str + df: DataFrame, + colname_substr: str, ) -> DataFrame: """Convert columns that contain colname_substr in their name to datetimes Args: @@ -143,6 +145,7 @@ def assert_flattened_values_as_expected( is_fallback_prop_warning_threshold: float = 0.9, ): """Run tests from string representations of dataframes. + Args: Args: prediction_times_df_str (str): A string-representation of prediction-times df @@ -190,13 +193,13 @@ def assert_flattened_values_as_expected( flattened_values_colname = f"{values_colname}_within_{interval_days}_days_{resolve_multiple}_fallback_{fallback}" expected_flattened_values = pd.DataFrame( - {flattened_values_colname: expected_flattened_values} + {flattened_values_colname: expected_flattened_values}, ) pd.testing.assert_series_equal( left=dataset.df[flattened_values_colname].reset_index(drop=True), right=expected_flattened_values[flattened_values_colname].reset_index( - drop=True + drop=True, ), check_dtype=False, )
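The `sql_writer.py` hunk above only shows the signatures of `chunker` and `insert_with_progress` together with the ODBC connection string built in `write_df_to_sql`. For readers unfamiliar with that pattern, here is a minimal, self-contained sketch of chunked SQL insertion with a progress bar. The chunking logic, the handling of `if_exists` for chunks after the first, and the server/database/table names are assumptions for illustration only, not the library's exact implementation.

```python
# Sketch of the chunked-insert pattern suggested by sql_writer.py.
# Server, database and table names are placeholders; the real
# write_df_to_sql / insert_with_progress may differ in detail.
import urllib.parse

import pandas as pd
from sqlalchemy import create_engine
from tqdm import tqdm


def chunker(seq: pd.DataFrame, size: int):
    """Yield successive row-chunks of `seq` with at most `size` rows each."""
    return (seq[pos : pos + size] for pos in range(0, len(seq), size))


def insert_with_progress(
    df: pd.DataFrame,
    table_name: str,
    conn,
    rows_per_chunk: int,
    if_exists: str,
):
    """Insert each chunk with to_sql, updating a tqdm progress bar."""
    with tqdm(total=len(df)) as pbar:
        for i, chunk in enumerate(chunker(df, rows_per_chunk)):
            # Assumption: only the first chunk honours `if_exists`;
            # subsequent chunks must append to the freshly written table.
            mode = if_exists if i == 0 else "append"
            chunk.to_sql(table_name, conn, if_exists=mode, index=False)
            pbar.update(len(chunk))


# Build the pyodbc connection URL the same way as in the diff above,
# with placeholder server/database names.
params = urllib.parse.quote(
    "DRIVER=SQL Server;SERVER=my_server;DATABASE=my_db;Trusted_Connection=yes",
)
engine = create_engine(f"mssql+pyodbc:///?odbc_connect={params}")

insert_with_progress(
    pd.DataFrame({"a": range(10)}),
    table_name="my_table",
    conn=engine,
    rows_per_chunk=5,
    if_exists="replace",
)
```

Running the sketch end to end requires a reachable SQL Server instance; the point is the structure: URL-quote the ODBC string, build an `mssql+pyodbc` engine, and let `to_sql` write one chunk at a time while `tqdm` tracks progress.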