fix some bugs; change model smoothing to 0 to increase performance, …

…perhaps
Dechrissen · May 7, 2024 · cb01d0b · cb01d0b
1 parent 2ce9642
commit cb01d0b
Show file tree

Hide file tree

Showing 4 changed files with 6 additions and 6 deletions.
diff --git a/README.md b/README.md
@@ -79,7 +79,7 @@ command | flag | description
 add | `-a` | adds the current contents of the files in `/data/src/text` (newline-separated) to the database
 total | `-t` | output the total number of Headline Snaps in the database
 random | `-r` | print a random Headline Snap from the database
-convert | `-c` | convert the Headline Snap image files in `/data/src/raw` to text via OCR and output them to `/data/text/ocr_output.txt`
+convert | `-c` | convert the Headline Snap image files in `/data/src/raw` to text via OCR and output them to `/data/text/ocr_output.txt`, then add all contents of the `/data/text` directory to the database
 export | `-x` | dump all Headline Snaps from the database to a text file at `/data/dump.txt`
 search | `-s` | query the database for Headline Snaps containing a provided search phrase
 delete | `-d` | delete all data from the Headline Snap and token databases

diff --git a/Tokenizer.py b/Tokenizer.py
@@ -25,7 +25,7 @@ def updateTokens(token_db_path):
     # recreate an empty token db
     createTokenDatabase(token_db_path)
 
-    print("Updating token database with counts ...")
+    print("Updating token database with counts ... this may take a while.")
     sleep(1)
 
     # then iterate over every snap in the corpus and tokenize them and add them to the token db

diff --git a/Trigrams.py b/Trigrams.py
@@ -13,9 +13,9 @@ def trainTrigramModel(corpus_path):
     returns
         model : a dictionary containing the trigram model
     '''
-    # TODO: try making this 0?
-    # Smoothing of 0.01 to handle unattested words in test data 
-    model = defaultdict(lambda: defaultdict(lambda: 0.01))
+    # smoothing value for unattested words in test data (kept at zero, which I think gives the best model quality)
+    smoothing = 0.0
+    model = defaultdict(lambda: defaultdict(lambda: smoothing))
 
     # TODO make this use headline snap data somehow, not iterate through txt files in a directory
 

diff --git a/Visualizer.py b/Visualizer.py
@@ -28,7 +28,7 @@ def generateWordCloud():
     wl = WordNetLemmatizer()
 
     corpus = ''
-    with open("./data/corpus.txt", "r") as f:
+    with open("./data/corpus.txt", "r", encoding='utf-8') as f:
         corpus = f.readlines()
 
     text_data = ''