Merge pull request #2 from JeremyBrent/jb/fsa_exp_refactor

FSA Experiment refactor
JeremyBrent · Sep 26, 2024 · 154f906 · 154f906
2 parents ca80311 + 260ca84
commit 154f906
Show file tree

Hide file tree

Showing 5 changed files with 171 additions and 77 deletions.
diff --git a/README.md b/README.md
@@ -8,6 +8,22 @@ cd stock_market_analyzer
 make install
 ```
 
+# The Code Base
+Given the timeframe of the project, I put together a small, end-to-end project. Some of these
+end-to-end features include unit tests, CI/CD with GitHub Actions, environment creation with Make and
+requirements.txt, and GitHub branch protection rules found
+[here](https://github.com/JeremyBrent/stock_market_analyzer/settings/branch_protection_rules/54816872)
+which require (1) PRs and (2) passing GitHub Actions in order to update the main branch. Note that
+I didn't require approvers in the branch protection rule because there is no one else
+to review my code. This would not be the case in a production environment, where required
+approvals would be part of the rule.
+
+With more time, some things I would build upon would be:
+1. Adding comprehensive logging functionality; this is critical to production-worthy code
+2. Expanding the unit test portfolio, which still needs to be built out
+3. Further developing the GitHub Actions if we were deploying this model as a service
+
+
 # FSA
 
 ## Data
@@ -55,26 +71,21 @@ from src.experiment import Experiment
 experimenter = Experiment()
 experimenter.fsa_experiment()
 ```
-This will run the models defined in `experimenter.fsa_models_to_test`. 
+This will run the models defined in `experimenter.models`. 
+
+I have implemented functionality that can run FSA experiments using CUDA,
+MPS (if on an Apple Silicon Mac), or parallel compute if on CPU. This allows the software to decide
+the most efficient way to run FSA experimentation. Run-time experimentation that I conducted gave
+general estimates that parallel compute (on CPU) would complete in about 1 hour, and an MPS device
+would complete in about 30 minutes.
 
 ### Future Directions
 More FSA models can be experimented on. To include more models in the `Experiment` class, simply 
-add the model to `experimenter.fsa_models_to_test` and any new methods that are needed to run 
+add the model to `experimenter.models` and any new key-value pairs that are needed to run 
 inference with the new model. 
 
 Any new models should be replicated based on existing research found 
 [here](https://dl.acm.org/doi/10.1145/3649451#sec-4-4).
 
 We should also implement a more sophisticated metric for 
-measuring the performance of the FSA models. Currently, we are only using a raw accuracy. 
-
-# The Code Base
-Given the timeframe of the project, I put together a small, end-to-end project. Some of these end 
-to end features include unittests, CICD with Github actions, environment creation with Make and
-requirements.txt, and github branch protection rules found 
-[here](https://github.com/JeremyBrent/stock_market_analyzer/settings/branch_protection_rules/54816872)
-which require 1. PRs and 2. passing Github actions in order to update the main branch.
-
-With more time, some things I would build upon would be, 
-expanding unittest portfolio would need to build out, and further developing the Github actions
-if we were deploying this model as a service
+measuring the performance of the FSA models. Currently, we are only using a raw accuracy.
diff --git a/requirements.txt b/requirements.txt
@@ -3,7 +3,7 @@ nltk
 textblob
 transformers
 torch==2.2.2
-scipy
+scipy==1.13.1
 numpy==1.26.4
-ipython
-kaggle
+ipython==8.18.1
+pandas_ta==0.3.14b
diff --git a/src/consts.py b/src/consts.py
@@ -1,3 +1,7 @@
+import os
+
+PROJECT_ROOT_PATH = os.path.abspath(os.path.join(__file__, os.pardir, os.pardir))
+
 CPU_COUNT: int = 8
 PARALLEL_CHUNK_SIZE: int = 5
 DATE_FORMAT: str = '%Y-%m-%d %H:%M:%S'
diff --git a/src/experiment.py b/src/experiment.py
@@ -1,132 +1,159 @@
+import os
 import torch
 import nltk
 import collections
 import pandas as pd
 from tqdm import tqdm
 from textblob import TextBlob
-nltk.download('vader_lexicon')
 from multiprocessing import Pool
-from src.consts import CPU_COUNT, PARALLEL_CHUNK_SIZE
 from nltk.sentiment.vader import SentimentIntensityAnalyzer
+from src.consts import CPU_COUNT, PARALLEL_CHUNK_SIZE, PROJECT_ROOT_PATH
 from transformers import AutoTokenizer, AutoModelForSequenceClassification
 
 
 class Experiment:
 
     def __init__(self):
 
+        self.device: torch.device = self._get_device()
+
         # Storing models here so we do not have to instantiate them every time we make a prediction
         self.models = {
             'finbert': {
+                'method': self._fsa_bert_model,
                 'tokenizer': AutoTokenizer.from_pretrained('ProsusAI/finbert'),
-                'model': AutoModelForSequenceClassification.from_pretrained('ProsusAI/finbert')
+                'model': AutoModelForSequenceClassification.from_pretrained('ProsusAI/finbert').to(self.device)
             },
-            'textblob': ...,
-            'nltk': SentimentIntensityAnalyzer(),
             'roberta': {
+                'method': self._fsa_bert_model,
                 'tokenizer': AutoTokenizer.from_pretrained(
                     'cardiffnlp/twitter-roberta-base-sentiment-latest'),
                 'model': AutoModelForSequenceClassification.from_pretrained(
-                    'cardiffnlp/twitter-roberta-base-sentiment-latest')
+                    'cardiffnlp/twitter-roberta-base-sentiment-latest').to(self.device)
             },
             'fin_roberta': {
+                'method': self._fsa_bert_model,
                 'tokenizer': AutoTokenizer.from_pretrained(
                     'soleimanian/financial-roberta-large-sentiment'),
                 'model': AutoModelForSequenceClassification.from_pretrained(
-                    'soleimanian/financial-roberta-large-sentiment')
+                    'soleimanian/financial-roberta-large-sentiment').to(self.device)
+            },
+            'textblob': {
+                'method': self._fsa_text_blob
+            },
+            'nltk': {
+                'method': self._fsa_nltk
             }
         }
 
-        # The first index in the below list needs to be a method with 2 input parameters,
-        # text, model
-        self.fsa_models_to_test = \
-            [('finbert', self._fsa_bert_model),
-             ('fin_roberta', self._fsa_bert_model),
-             ('roberta', self._fsa_bert_model),
-             ('nltk', self._fsa_nltk)]
-
         self.fsa_ground_truth_data = self._get_fsa_ground_truth_data()
 
+        # Download vader lexicon if we do not have it
+        try:
+            nltk.data.find('vader_lexicon')
+            print('Finding vader')
+        except LookupError:
+            nltk.download('vader_lexicon')
+            print('Downloading vader')
+
+    @staticmethod
+    def _get_device() -> torch.device:
+        """
+
+        :return:
+        """
+        # If on an Apple Silicon Mac (M1/M2), PyTorch supports the MPS backend (Metal Performance Shaders)
+        if torch.backends.mps.is_available():
+            device = torch.device('mps')
+
+        # If the machine has access to a traditional (CUDA) GPU
+        elif torch.cuda.is_available():
+            device = torch.device('cuda')
+
+        else:
+            device = torch.device('cpu')
+
+        return device
+
     @staticmethod
     def _get_fsa_ground_truth_data() -> list[dict[str, str]]:
         """
 
         :return:
         """
-        df: pd.DataFrame = pd.read_csv('./data/fsa_ground_truth.csv')
+        df: pd.DataFrame = pd.read_csv(os.path.join(PROJECT_ROOT_PATH,
+                                                    "data",
+                                                    "fsa_ground_truth.csv"))
+
         return [{"text": x[0], 'sentiment': x[1]} for x in df.values.tolist()]
 
-    def _fsa_bert_model(self, text: str, model: str) -> str:
+    @staticmethod
+    def _categorize_sentiment_range(value: float) -> str:
+        """
+        Given a sentiment float, return the sentiment.
+
+        Thresholds were determined here:
+            https://github.com/cjhutto/vaderSentiment?tab=readme-ov-file#about-the-scoring
+
+        :param value:
+        :return:
+        """
+        if value >= 0.05:
+            return 'positive'
+        elif -.05 < value < .05:
+            return 'neutral'
+        else:
+            return 'negative'
+
+    def _fsa_bert_model(self, text: str, model_name: str) -> str:
         """
 
         :param text:
         :param model_name:
         :return:
         """
-        tokens = self.models[model]['tokenizer'](text,
-                                                 padding=True,
-                                                 truncation=True,
-                                                 return_tensors='pt')
-        output = self.models[model]['model'](**tokens)
+        tokens = self.models[model_name]['tokenizer'](text,
+                                                      padding=True,
+                                                      truncation=True,
+                                                      return_tensors='pt',
+                                                      max_length=512).to(self.device)
+        output = self.models[model_name]['model'](**tokens)
 
         # self.models[model_name]['model'].config.id2label.values() ~= ['pos', 'neg', 'neu']
         # torch.nn.functional.softmax(output.logits, dim=-1).tolist()[0] ~= [.083, .034, .883]
         final_output = {label: value
                         for label, value in
-                        zip(self.models[model]['model'].config.id2label.values(),
+                        zip(self.models[model_name]['model'].config.id2label.values(),
                             torch.nn.functional.softmax(output.logits, dim=-1).tolist()[0])
                         }
-        print(final_output)
+
         # Only get the sentiment with the highest score
         return max(final_output, key=final_output.get)
 
-    def _fsa_text_blob(self, text: str):
+    def _fsa_text_blob(self, text: str, model_name: str) -> str:
         """
 
         :param text:
+        :param model_name: Not used in this method, but needed for design pattern
         :return:
         """
-        sentiment = TextBlob(text).sentiment.polarity
+        polarity: float = TextBlob(text).sentiment.polarity
+        sentiment: str = self._categorize_sentiment_range(polarity)
         return sentiment
 
-    def _fsa_nltk(self, text: str, model: str):
+    def _fsa_nltk(self, text: str, model_name: str) -> str:
         """
         This NLTK class uses VADER (Valence Aware Dictionary and sEntiment Reasoner),
         "a lexicon and rule-based sentiment analysis tool that is specifically attuned
         to sentiments expressed in social media" (https://github.com/cjhutto/vaderSentiment)
 
         :param text:
-        :param model:
+        :param model_name: Not used in this method, but needed for design pattern
         :return:
         """
-        results = self.models[model].polarity_scores(text)
-
-        # Threshold were determined here:
-        # https://github.com/cjhutto/vaderSentiment?tab=readme-ov-file#about-the-scoring
-        if results['compound'] >= 0.05:
-            return 'positive'
-        elif -.05 < results['compound'] < .05:
-            return 'neutral'
-        else:
-            return 'negative'
-
-    def fsa_experiment(self) -> dict[str, int]:
-        """
-
-        :return:
-        """
-        result_dict = collections.defaultdict(int)
-
-        with Pool(CPU_COUNT) as p:
-            for data in tqdm(
-                p.imap(self._fsa_experiment_inner,
-                       self.fsa_ground_truth_data[:100],  # TODO: remove limit
-                       chunksize=PARALLEL_CHUNK_SIZE), total=len(self.fsa_ground_truth_data)
-            ):
-                for model, count in data.items():
-                    result_dict[model] += count
-
-        return dict(result_dict)
+        result: dict[str, float] = SentimentIntensityAnalyzer().polarity_scores(text)
+        sentiment: str = self._categorize_sentiment_range(result['compound'])
+        return sentiment
 
     def _fsa_experiment_inner(self, data: dict[str, str]) -> dict:
         """
@@ -136,11 +163,39 @@ def _fsa_experiment_inner(self, data: dict[str, str]) -> dict:
         """
         data_dict = collections.defaultdict(int)
 
-        for model, predict in self.fsa_models_to_test:
-
-            actual = predict(data['text'], model)
+        for model_name, model_info in self.models.items():
+            actual = model_info['method'](text=data['text'], model_name=model_name)
             if actual == data['sentiment']:
-                data_dict[model] += 1
+                data_dict[model_name] += 1
 
         return data_dict
 
+    def fsa_experiment(self) -> dict[str, int]:
+        """
+
+        :return:
+        """
+        results_dict = collections.defaultdict(int)
+
+        # If we only have cpu access, run experiments using parallel processing
+        if self.device.type == 'cpu':
+            with Pool(CPU_COUNT) as p:
+                for results in tqdm(
+                    p.imap(self._fsa_experiment_inner,
+                           self.fsa_ground_truth_data[:100],  # TODO: remove limit
+                           chunksize=PARALLEL_CHUNK_SIZE), total=len(self.fsa_ground_truth_data)
+                ):
+                    for model, count in results.items():
+                        results_dict[model] += count
+
+            return dict(results_dict)
+
+        # If we are on cuda or mps
+        # TODO: remove limit
+        for data in tqdm(self.fsa_ground_truth_data[:100], total=len(self.fsa_ground_truth_data)):
+            results = self._fsa_experiment_inner(data=data)
+            for model, count in results.items():
+                results_dict[model] += count
+
+        return dict(results_dict)
+
diff --git a/tests/test_experiment.py b/tests/test_experiment.py
@@ -0,0 +1,24 @@
+import unittest
+from src.experiment import Experiment
+
+
+class TestUtils(unittest.TestCase):
+
+    @classmethod
+    def setUpClass(cls):
+        cls.experiment = Experiment()
+
+    def test_convert_datetime(self):
+        test_data = {
+            0.0: "neutral",
+            .92348: "positive",
+            -.92348:  "negative",
+            0.043: "neutral",
+            -0.043: "neutral",
+            .051: "positive",
+            -.051: "negative"
+        }
+
+        for input, expected_data in test_data.items():
+            actual_output = self.experiment._categorize_sentiment_range(input)
+            self.assertEqual(actual_output, expected_data)