update prompts

LibraryOfCongress · Oct 20, 2023 · 8d65b78 · 8d65b78
1 parent 8ac86a7
commit 8d65b78
Show file tree

Hide file tree

Showing 4 changed files with 9 additions and 4 deletions.
diff --git a/public/cloud/index.html b/public/cloud/index.html
@@ -56,11 +56,12 @@
     </div>
     <div class="intro active">
         <p>The <a href="https://www.loc.gov/collections/mary-church-terrell-papers/about-this-collection/">Mary Church Terrell Papers</a> at the <a href="https://www.loc.gov/">Library of Congress</a> consist of approximately 13,000 documents, comprising 25,323 images, all of which were digitized and <a href="https://www.loc.gov/item/2021387726/">transcribed</a> by <a href="https://crowd.loc.gov/campaigns/mary-church-terrell-advocate-for-african-americans-and-women/">volunteers</a> participating in the Library's <a href="https://crowd.loc.gov/">By The People</a> program.</p>
+        <p>Using natural language processing, this interface shows the most frequently used words in this collection.</p>
         <div class="loading">
             <p><button disabled>Loading transcript data...</button></p>
         </div>
         <div class="loaded">
-            <p><button class="start">Explore this collection</button></p>
+            <p><button class="start">Explore the data</button></p>
         </div>
     </div>
   </main>

diff --git a/public/data/mary-church-terrell/prompts-docs.json b/public/data/mary-church-terrell/prompts-docs.json
diff --git a/public/data/mary-church-terrell/prompts.json b/public/data/mary-church-terrell/prompts.json
diff --git a/scripts/get_prompts.py b/scripts/get_prompts.py
@@ -102,7 +102,7 @@ def normalizeText(text):
     text = re.sub(r"[^a-zA-Z0-9\.!?]+$", "", text) # strip trailing characters that are not letters, digits, or sentence-ending punctuation (. ! ?)
     return text
 
-def getSentences(nlp, transcript, minWords=3, maxWords=60):
+def getSentences(nlp, transcript, minWords=3, maxWords=36):
     """Retrieve a list of sentences from a text"""
     doc = nlp(transcript)
     types=["imperative"]
@@ -143,8 +143,12 @@ def getSentences(nlp, transcript, minWords=3, maxWords=60):
             if isImperativeValue is True:
                 sentenceType = "imperative"
                 break
+            # False value only applies to first clause
             elif isImperativeValue is False and j == 0:
                 break
+            # only check the first two clauses for imperative
+            elif j >= 1:
+                break
         if types is not False and sentenceType not in types:
             continue