From 97efed3db5df13dde52694c2cf21952f7c12233b Mon Sep 17 00:00:00 2001 From: drkostas Date: Mon, 9 May 2022 18:53:16 -0400 Subject: [PATCH] Initial Commit --- .gitignore | 150 ++++++++++++++++++++++++++++++++ LICENSE | 201 +++++++++++++++++++++++++++++++++++++++++++ MANIFEST.in | 5 ++ Makefile | 90 +++++++++++++++++++ README.md | 103 ++++++++++++++++++++++ data/._ | 0 logs/._ | 0 main.py | 59 +++++++++++++ models/._ | 0 plots/._ | 0 requirements.txt | 9 ++ settings.ini | 39 +++++++++ src/__init__.py | 5 ++ src/data_loader.py | 160 ++++++++++++++++++++++++++++++++++ src/plotter.py | 6 ++ src/preprocessing.py | 4 + 16 files changed, 831 insertions(+) create mode 100644 .gitignore create mode 100644 LICENSE create mode 100644 MANIFEST.in create mode 100644 Makefile create mode 100644 README.md create mode 100644 data/._ create mode 100644 logs/._ create mode 100644 main.py create mode 100644 models/._ create mode 100644 plots/._ create mode 100644 requirements.txt create mode 100644 settings.ini create mode 100644 src/__init__.py create mode 100644 src/data_loader.py create mode 100644 src/plotter.py create mode 100644 src/preprocessing.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..5e1c5a0 --- /dev/null +++ b/.gitignore @@ -0,0 +1,150 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow +__pypackages__/ + +# celery beat schedule file +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# dotenv +.env + +# virtualenv +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# IDE settings +.vscode/ +/.idea + +# Tmp files +*tmp.* + +# Tars +*.gz +*.tar +*.bz2 +*.zip +*.7z + +# Custom +*.DS_Store +logs/*.log +/bck/ \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..261eeb9 --- /dev/null +++ b/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. 
The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000..5c0e7ce
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,5 @@
+include settings.ini
+include LICENSE
+include CONTRIBUTING.md
+include README.md
+recursive-exclude * __pycache__
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..273f46a
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,90 @@
+# Makefile for COSC525-FinalProject
+.ONESHELL:
+SHELL=/bin/bash
+PYTHON_VERSION=3.9
+
+# You can use either a virtualenv (env=venv) or a conda env (the default)
+# by specifying the corresponding argument (env=<conda|venv>)
+ifeq ($(env),venv)
+    # Use venv
+    BASE=venv
+    BIN=$(BASE)/bin
+    CREATE_COMMAND="python$(PYTHON_VERSION) -m venv $(BASE)"
+    DELETE_COMMAND="rm -rf $(BASE)"
+    ACTIVATE_COMMAND="source venv/bin/activate"
+    DEACTIVATE_COMMAND="deactivate"
+else
+    # Use Conda
+    BASE=~/anaconda3/envs/cosc525_finalproject
+    BIN=$(BASE)/bin
+    CREATE_COMMAND="conda create --prefix $(BASE) python=$(PYTHON_VERSION) -y"
+    DELETE_COMMAND="conda env remove -p $(BASE)"
+    ACTIVATE_COMMAND="conda activate $(BASE)"
+    DEACTIVATE_COMMAND="conda deactivate"
+endif
+
+# To load an env file, pass env_file=<path>, e.g. `make install env_file=.env`
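+# Typical invocations (the default conda path assumes Anaconda is installed under ~/anaconda3):
+#   make install              # set everything up inside a conda env (default)
+#   make install env=venv     # use a python venv instead of conda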
+ifneq ($(env_file),)
+    include $(env_file)
+#    export
+endif
+
+all:
+	$(MAKE) help
+help:
+	@echo
+	@echo "-----------------------------------------------------------------------------------------------------------"
+	@echo "                                              DISPLAYING HELP                                               "
+	@echo "-----------------------------------------------------------------------------------------------------------"
+	@echo "Use make <recipe> [env=<conda|venv>] [env_file=<path>]"
+	@echo
+	@echo "make help"
+	@echo "       Display this message"
+	@echo "make install [env=<conda|venv>] [env_file=<path>]"
+	@echo "       Call delete_env, create_env, clean, requirements and setup"
+	@echo "make clean [env=<conda|venv>] [env_file=<path>]"
+	@echo "       Delete all './build ./dist ./*.pyc ./*.tgz ./*.egg-info' files"
+	@echo "make create_env [env=<conda|venv>] [env_file=<path>]"
+	@echo "       Create a new conda env or virtualenv for the specified python version"
+	@echo "make delete_env [env=<conda|venv>] [env_file=<path>]"
+	@echo "       Delete the current conda env or virtualenv"
+	@echo "make setup [env=<conda|venv>] [env_file=<path>]"
+	@echo "       Call setup.py install"
+	@echo "-----------------------------------------------------------------------------------------------------------"
+install:
+	$(MAKE) delete_env
+	$(MAKE) create_env
+	$(MAKE) clean
+	$(MAKE) requirements
+	$(MAKE) setup
+	@echo -e "\033[0;31m############################################"
+	@echo
+	@echo "Installation Successful!"
+	@echo "To activate the conda environment run:"
+	@echo '    conda activate cosc525_finalproject'
+setup:
+	$(BIN)/pip install setuptools
+	$(BIN)/python setup.py install
+setup_dev:
+	$(BIN)/pip install setuptools
+	$(BIN)/python setup.py install --dev
+m1_requirements:
+	conda install -c apple tensorflow-deps -y
+	python -m pip install tensorflow-macos
+	pip install tensorflow-metal
+	conda install -c conda-forge jupyter jupyterlab -y
+	pip install matplotlib numpy tensorboard pandas Pillow scikit-learn keras-tuner
+requirements:
+	pip install -r requirements.txt
+	#conda install --file requirements.txt -y
+clean:
+	$(BIN)/python setup.py clean
+create_env:
+	@echo "Creating virtual environment.."
+	@eval $(CREATE_COMMAND)
+delete_env:
+	@echo "Deleting virtual environment.."
+	@eval $(DELETE_COMMAND)
+
+.PHONY: help install clean delete_env create_env setup requirements setup_dev
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..2697424
--- /dev/null
+++ b/README.md
@@ -0,0 +1,103 @@
+# COSC525: Final Project: Semantic Segmentation with Transformers on 3D Medical Images
+
+[![GitHub license](https://img.shields.io/badge/license-Apache-blue.svg)](
+https://github.com/drkostas/COSC525-Project1/blob/master/LICENSE)
+
+## Table of Contents
+
++ [About](#about)
++ [Getting Started](#getting_started)
+    + [Prerequisites](#prerequisites)
++ [Installing the requirements](#installing)
++ [Running the code](#run_locally)
+    + [Execution Options](#execution_options)
+        + [main.py](#src_main)
++ [Todo](#todo)
++ [License](#license)
+
+## About <a name = "about"></a>
+
+Final Project for the Deep Learning course (COSC 525). Involves the development of a semantic
+segmentation model with transformers on 3D medical images.
+
+The main code is located in the [main.py](main.py) file. All the other code is located
+in the [src folder](src).
+
+## Getting Started <a name = "getting_started"></a>
+
+These instructions will get you a copy of the project up and running on your local machine.
+
+### Prerequisites <a name = "prerequisites"></a>
+
+You need to have a machine with Python >= 3.6 and any Bash-based shell (e.g. zsh) installed.
+
+```ShellSession
+$ python3.9 -V
+Python 3.9.1
+
+$ echo $SHELL
+/usr/bin/zsh
+```
+
+## Installing the requirements <a name = "installing"></a>
+
+All the installation steps are handled by the [Makefile](Makefile). You can use either conda
+(the default) or venv by setting the flag `env=<conda|venv>`. To load an env file, use the
+flag `env_file=<path to env file>`.
+
+Before installing everything, make any changes needed in the [settings.ini](settings.ini) file.
+
+Then, to create the environment, install the requirements and set up the library, execute the
+following command:
+
+```ShellSession
+$ make install
+```
+
+## Running the code <a name = "run_locally"></a>
+
+To run the code, adjust the yml configuration file if needed, and then run
+[main.py](main.py) directly.
+
+### Execution Options <a name = "execution_options"></a>
+
+First, make sure you are in the correct virtual environment:
+
+```ShellSession
+$ conda activate cosc525_finalproject
+
+$ which python
+/home/<user>/anaconda3/envs/cosc525_finalproject/bin/python
+```
+
+#### main.py <a name = "src_main"></a>
+
+Now you can call [main.py](main.py) directly:
+
+```ShellSession
+$ python main.py -h
+usage: main.py -d DATASET -n NETWORK -c CONFIG_FILE [-l LOG] [-h]
+
+Final Project for the Deep Learning class (COSC 525). Involves the development of a semantic
+segmentation model with transformers on 3D medical images.
+
+Required Arguments:
+  -d DATASET, --dataset DATASET
+                        The datasets to train the network on. Options (defined in yml): [and, xor, class_example]
+  -n NETWORK, --network NETWORK
+                        The network configuration to use. Options (defined in yml): [1x1_net, 2x1_net, 2x2_net]
+  -c CONFIG_FILE, --config-file CONFIG_FILE
+                        The path to the yaml configuration file.
+
+Optional Arguments:
+  -l LOG, --log LOG     Name of the output log file
+  -h, --help            Show this help message and exit
+```
+
+## TODO <a name = "todo"></a>
+
+Read the [TODO](TODO.md) to see the current task list.
+
+## License <a name = "license"></a>
+
+This project is licensed under the Apache License - see the [LICENSE](LICENSE) file for details.
diff --git a/data/._ b/data/._
new file mode 100644
index 0000000..e69de29
diff --git a/logs/._ b/logs/._
new file mode 100644
index 0000000..e69de29
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..1f17840
--- /dev/null
+++ b/main.py
@@ -0,0 +1,59 @@
+import traceback
+import argparse
+import numpy as np
+from src import *
+from typing import *
+
+# Color-logger is used to print colored messages to the console
+logger = ColorLogger(logger_name='Main', color='yellow')
+
+
+def get_args() -> argparse.Namespace:
+    """Set up the argument parser
+
+    Returns:
+        argparse.Namespace: The parsed command-line arguments
+    """
+    parser = argparse.ArgumentParser(
+        description='Final Project for the Deep Learning class (COSC 525). '
+                    'Involves the development of a semantic segmentation model '
+                    'with transformers on 3D medical images.',
+        add_help=False)
+    # Required Args
+    required_args = parser.add_argument_group('Required Arguments')
+    config_file_params = {
+        'type': argparse.FileType('r'),
+        'required': True,
+        'help': "The path to the yaml configuration file."
+    }
+    required_args.add_argument('-d', '--dataset', required=True,
+                               help="The datasets to train the network on. "
+                                    "Options (defined in yml): [and, xor, class_example]")
+    required_args.add_argument('-n', '--network', required=True,
+                               help="The network configuration to use. "
+                                    "Options (defined in yml): [1x1_net, 2x1_net, 2x2_net]")
+    required_args.add_argument('-c', '--config-file', **config_file_params)
+    # Optional args
+    optional_args = parser.add_argument_group('Optional Arguments')
+    optional_args.add_argument('-l', '--log', required=False, default='out.log',
+                               help="Name of the output log file")
+    optional_args.add_argument("-h", "--help", action="help", help="Show this help message and exit")
+
+    return parser.parse_args()
+
+
+def main():
+    """This is the main function of main.py
+
+    Example:
+        python main.py --dataset xor --network 2x1_net --config confs/main_conf.yml
+    """
+
+
+if __name__ == '__main__':
+    try:
+        main()
+    except Exception as e:
+        logger.error(str(e) + '\n' + str(traceback.format_exc()))
+        raise e
diff --git a/models/._ b/models/._
new file mode 100644
index 0000000..e69de29
diff --git a/plots/._ b/plots/._
new file mode 100644
index 0000000..e69de29
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..d5cbd58
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,9 @@
+matplotlib
+numpy
+tensorflow
+tensorboard
+pandas
+Pillow
+scikit-learn
+keras
+keras-tuner
\ No newline at end of file
diff --git a/settings.ini b/settings.ini
new file mode 100644
index 0000000..5e011bf
--- /dev/null
+++ b/settings.ini
@@ -0,0 +1,39 @@
+[DEFAULT]
+# All sections below are required unless otherwise specified
+host = github
+lib_name = src
+repo_name = COSC525-FinalProject
+author = drkostas, Gcantral
+description = Semantic Segmentation with Transformers on 3D Medical Images
+keywords = python, beatles, RNN, Tensorflow, Deep Learning, LSTM
+user = Kostas Georgiou, Greg Cantrall
+author_email = kgeorgio@vols.utk.edu, gcantral@vols.utk.edu
+copyright = apache2
+branch = master
+version = 1.0.0
+testing_version = 1.0.0
+min_python = 3.6
+audience = Developers
+language = English
+# Add licenses and see current list in `setup.py`
+license = apache2
+# From 0-6: Planning Pre-Alpha Alpha Beta Production Mature Inactive
+status = 3
+
+# Optional. Same format as setuptools requirements
+requirements = requirements.txt
+data_files =
+# Optional. Same format as setuptools console_scripts
+# console_scripts =
+# Optional. Same format as setuptools dependency-links
+# dep_links =
+
+# Values of the form `%(foo)s` are automatically replaced with the value of `foo`
+lib_path = %(lib_name)s
+
+git_url = https://github.com/%(user)s/%(repo_name)s/tree/%(branch)s/
+# For Enterprise Github use:
+# repo_name = your-repo
+# company_name = your-company
+# git_url = https://github.%(company_name)s.com/%(repo_name)s/%(lib_name)s/tree/%(branch)s/
+
diff --git a/src/__init__.py b/src/__init__.py
new file mode 100644
index 0000000..64ab6eb
--- /dev/null
+++ b/src/__init__.py
@@ -0,0 +1,5 @@
+"""Top-level package for COSC525-FinalProject."""
+
+from .data_loader import *
+from .preprocessing import *
+from .plotter import *
diff --git a/src/data_loader.py b/src/data_loader.py
new file mode 100644
index 0000000..e91d0c2
--- /dev/null
+++ b/src/data_loader.py
@@ -0,0 +1,160 @@
+import os
+import numpy as np
+import re
+from typing import *
+import pickle
+
+model_path = os.path.join(os.path.dirname(__file__), '..', 'models')
+
+
+def create_train_data(file_name: str = 'beatles.txt', window_size: int = 10, stride: int = 5,
+                      debug: bool = False) -> Tuple[np.ndarray, np.ndarray, int]:
+    """
+    Creates training data from a file.
+    :param file_name: The name of the file to load (from the data/ directory).
+    :param window_size: The size of the sliding window, i.e. the number of consecutive
+                        characters per training sample.
+    :param stride: The stride, i.e. the step in characters between consecutive windows.
+    :param debug: Whether to print debug information.
+    :return: A tuple containing the one-hot encoded training data, the labels, and the
+             vocabulary size.
+    """
+    dl = DataLoader(file_name, window_size, stride)
+    dl.load()
+    dl.sanitize()
+    dl.tokenize()
+    data = dl.create_x_y(debug=debug)
+    if debug:
+        x, y = data[2], data[3]
+        for a, b in zip(x[:10], y[:10]):
+            print(a, " | ", b)
+        for a, b in zip(x[-10:], y[-10:]):
+            print(a, " | ", b)
+    else:
+        x, y = data[0], data[1]
+    x_one_hot, y_one_hot = dl.one_hot_encode()
+    # Trim x and y to the same number of windows
+    if x.shape[0] > y.shape[0]:
+        x = x[:y.shape[0], :]
+        x_one_hot = x_one_hot[:y.shape[0], :]
+    elif x.shape[0] < y.shape[0]:
+        y = y[:x.shape[0], :]
+        y_one_hot = y_one_hot[:x.shape[0], :]
+    print("X shape: ", x.shape)
+    print("Y shape: ", y.shape)
+    print("x_one_hot shape: ", x_one_hot.shape)
+    print("y_one_hot shape: ", y_one_hot.shape)
+    return x_one_hot, y_one_hot, dl.vocab_size
+
+
+class DataLoader:
+    data_path: str = os.path.join(os.path.dirname(__file__), '..', 'data')
+    data_str: str
+    data_lst: List[str]
+    one_hot_dict: Dict[str, List]
+    extra_characters: List
+    tokenized_data_lst: List
+    vocab_size: int
+    x: np.ndarray
+    y: np.ndarray
+    x_onehot: np.ndarray
+    y_onehot: np.ndarray
+
+    def __init__(self, file_name: str, window_size: int, stride: int):
+        self.file_path = os.path.join(self.data_path, file_name)
+        self.window_size = window_size
+        self.stride = stride
+        self.is_encoded = False
+
+    def load(self, n_rows: int = -1):
+        raw_data_np = np.genfromtxt(self.file_path, dtype='str', delimiter='\n',
+                                    max_rows=n_rows if n_rows != -1 else None)
+        self.data_str = ' '.join(raw_data_np.tolist())
+        self.data_lst = list(self.data_str)
+        self.vocab_size = self._create_dict()
+        return self.data_str
+
+    def _create_dict(self) -> int:
+        """
+        Creates a dictionary of all the characters in the data set.
+        :return: The vocabulary size.
+        """
+        vocab = set(self.data_lst)
+        one_hot_vocab = np.zeros((len(vocab), len(vocab)))
+        for i, letter in enumerate(vocab):
+            one_hot_vocab[i, i] = 1
+        one_hot_vocab = one_hot_vocab.tolist()
+        self.one_hot_dict = {letter: one_hot for letter, one_hot in zip(vocab, one_hot_vocab)}
+        save_pickle(self.one_hot_dict, 'one_hot_dict.pkl')
+        return len(vocab)
+
+    def sanitize(self) -> List[str]:
+        """
+        Sanitizes the data set (lower-cases it and keeps only [a-z0-9 ] characters).
+        """
+        pattern = re.compile(r'[^a-z0-9 ]+')
+        self.data_str = pattern.sub('', self.data_str.lower())
+        self.data_lst = list(self.data_str)
+        self.vocab_size = self._create_dict()
+        return self.data_lst
+
+    def tokenize(self) -> Tuple[List, List]:
+        """
+        Tokenizes the data set into windows of `window_size` characters, `stride` apart.
+        """
+        self.tokenized_data_lst = []
+        letter_ind = 0
+        for letter_ind in range(0, len(self.data_lst) - self.window_size + 1, self.stride):
+            self.tokenized_data_lst.append(self.data_lst[letter_ind:letter_ind + self.window_size])
+        self.extra_characters = self.data_lst[letter_ind + self.window_size:]
+        return self.tokenized_data_lst, self.extra_characters
+
+    def create_x_y(self, debug: bool = False) -> Union[Tuple[np.ndarray, np.ndarray],
+                                                       Tuple[np.ndarray, np.ndarray, List, List]]:
+        """
+        Creates the x and y values for the data set.
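+        :param debug: If True, also return the windows and their targets joined back into
+                      strings, for manual inspection.
+        :return: The x and y arrays, plus their string versions when debug is True.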
+ """ + y = [] + less_windows = 0 if len(self.extra_characters) > 0 else 1 + for i in range(len(self.tokenized_data_lst) - less_windows): + if i + 1 < len(self.tokenized_data_lst): + extra_char = self.data_lst[i * self.stride + self.window_size] + else: + extra_char = self.extra_characters[0] + y.append(self.tokenized_data_lst[i][1:] + [extra_char]) + self.x = np.array(self.tokenized_data_lst) + self.y = np.array(y) + if debug: + x_debug = [''.join(i) for i in self.tokenized_data_lst] + y_debug = [''.join(i) for i in y] + return self.x, self.y, x_debug, y_debug + else: + return self.x, self.y + + def one_hot_encode(self) -> Tuple[np.ndarray, np.ndarray]: + """ + Encodes the data set. + """ + x_one_hot = np.array([list(map(self.one_hot_dict.__getitem__, row)) for row in self.x]) + y_one_hot = np.array([list(map(self.one_hot_dict.__getitem__, row)) for row in self.y]) + return x_one_hot, y_one_hot + + +def save_pickle(data, file_name: str, + protocol=pickle.HIGHEST_PROTOCOL): + """ + Saves a pickle file. + """ + file_path = os.path.join(model_path, f'encodings') + os.makedirs(file_path, exist_ok=True) + file_path = os.path.join(file_path, file_name) + with open(file_path, 'wb') as f: + pickle.dump(data, f, protocol=protocol) + + +def load_pickle(file_name: str): + """ + Loads a pickle file. + """ + file_path = os.path.join(model_path, f'encodings') + file_path = os.path.join(file_path, file_name) + with open(file_path, 'rb') as f: + data = pickle.load(f) + return data diff --git a/src/plotter.py b/src/plotter.py new file mode 100644 index 0000000..ffbf9b7 --- /dev/null +++ b/src/plotter.py @@ -0,0 +1,6 @@ +import matplotlib.pyplot as plt +import io +import numpy as np +import itertools + + diff --git a/src/preprocessing.py b/src/preprocessing.py new file mode 100644 index 0000000..663b915 --- /dev/null +++ b/src/preprocessing.py @@ -0,0 +1,4 @@ +import numpy as np +from sklearn.preprocessing import MinMaxScaler +from sklearn.model_selection import train_test_split +from typing import *
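
For quick reference, here is a minimal usage sketch of the new `src.data_loader` module (not part of the patch itself). It assumes the package has been installed (e.g. via `make install`) and that a text corpus exists at `data/beatles.txt`, the default `file_name`; the window and stride values shown are simply the defaults.

```python
# Minimal usage sketch for the data loader introduced in this commit.
# Assumptions: the package is installed (e.g. via `make install`) and a text
# corpus exists at data/beatles.txt (the default file_name).
from src.data_loader import create_train_data, load_pickle

# Build the one-hot encoded training windows and their shifted targets.
x_one_hot, y_one_hot, vocab_size = create_train_data(
    file_name='beatles.txt',  # read from the data/ directory
    window_size=10,           # characters per training window (default)
    stride=5,                 # step, in characters, between windows (default)
    debug=False,
)
# x_one_hot and y_one_hot both have shape (num_windows, window_size, vocab_size);
# each y window is the matching x window shifted forward by one character.

# The character -> one-hot mapping is pickled under models/encodings/ by the
# DataLoader and can be reloaded later for decoding predictions:
one_hot_dict = load_pickle('one_hot_dict.pkl')
print(len(one_hot_dict) == vocab_size)  # True
```

The targets come from `DataLoader.create_x_y`, which drops the first character of each window and appends the character that follows it, so a model trained on these pairs learns next-character prediction over the sanitized corpus.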