From 8191fe5744863a2d093e9ffd038d08d917235ebf Mon Sep 17 00:00:00 2001 From: Juan Rodriguez Date: Tue, 27 Aug 2024 10:57:16 -0500 Subject: [PATCH 1/2] added dolch word list --- vocab/dolch-220-95.txt | 315 +++++++++++++++++++++++++++++++++++++++++ vocab/notes/note.md | 3 +- 2 files changed, 317 insertions(+), 1 deletion(-) create mode 100644 vocab/dolch-220-95.txt diff --git a/vocab/dolch-220-95.txt b/vocab/dolch-220-95.txt new file mode 100644 index 0000000..efbd459 --- /dev/null +++ b/vocab/dolch-220-95.txt @@ -0,0 +1,315 @@ +and +as +because +but +if +or +about +after +at +by +down +for +from +in +into +of +on +over +to +under +upon +with +he +her +him +his +i +it +its +me +my +myself +our +she +that +their +them +these +they +this +those +us +we +what +which +who +you +your +again +always +around +away +before +far +fast +first +here +how +just +much +never +no +not +now +off +once +only +out +so +soon +then +there +today +together +too +up +very +well +when +where +why +yes +a +all +an +any +best +better +big +black +blue +both +brown +clean +cold +eight +every +five +four +full +funny +good +green +hot +kind +light +little +long +many +new +old +one +own +pretty +red +right +round +seven +six +small +some +nine +ten +the +three +two +warm +white +yellow +am +are +ask +ate +be +been +bring +buy +call +came +can +carry +come +could +cut +did +do +does +done +don't +draw +drink +eat +fall +find +fly +found +gave +get +give +go +goes +going +got +grow +had +has +have +help +hold +hurt +is +jump +keep +know +laugh +let +like +live +look +made +make +may +must +open +pick +play +please +pull +put +ran +read +ride +run +said +saw +say +see +shall +show +sing +sit +sleep +start +stop +take +tell +thank +think +try +use +walk +want +was +wash +went +were +will +wish +work +would +write +apple +baby +back +ball +bear +bed +bell +bird +birthday +boat +box +boy +bread +brother +cake +car +cat +chair +chicken +children +christmas +coat +corn +cow +day +dog +doll +door 
+duck +egg +eye +farm +farmer +father +feet +fire +fish +floor +flower +game +garden +girl +goodbye +grass +ground +hand +head +hill +home +horse +house +kitty +leg +letter +man +men +milk +money +morning +mother +name +nest +night +paper +party +picture +pig +rabbit +rain +ring +robin +school +seed +sheep +shoe +sister +snow +song +squirrel +stick +street +sun +table +thing +time +top +toy +tree +watch +water +way +wind +window +wood diff --git a/vocab/notes/note.md b/vocab/notes/note.md index 060a3bb..248dbf7 100644 --- a/vocab/notes/note.md +++ b/vocab/notes/note.md @@ -11,4 +11,5 @@ CH_HF: https://github.com/lpmi-13/machine_readable_wordlists/tree/master List originally from Macalister and Webb, "Can L1 children's literature be used in the English language classroom? High frequency words in writing for children" - +Dolch: + 220 words (conjunctions, prepositions, pronouns, adverbs, adjectives, verbs) and 95 nouns from Dolch, Edward W. "A basic sight vocabulary." The Elementary School Journal 36.6 (1936): 456-460. 
From 08a32226ac82598c4ceac1dfc10e5f31c3a01466 Mon Sep 17 00:00:00 2001 From: Juan Rodriguez Date: Tue, 27 Aug 2024 19:52:47 -0500 Subject: [PATCH 2/2] added a few extra characters to regex (colon, semicolon, dash, asterisk) --- make_hf_tokenizer.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/make_hf_tokenizer.py b/make_hf_tokenizer.py index 3a691b2..950e1f9 100644 --- a/make_hf_tokenizer.py +++ b/make_hf_tokenizer.py @@ -36,7 +36,7 @@ from tokenizers.pre_tokenizers import PreTokenizer #NOTE: this function from https://github.com/noanabeshima/tiny_tokenizer/tree/main -single_character_toks = re.compile(r'[ ,."\'0-9?!()\n-]') +single_character_toks = re.compile(r'[ ,—\*;:\."\'0-9?!\(\)\n-]') def word_tokenize(text, known_toks=None, add_bos=True): """The function used for pre-tokenization.""" @@ -121,6 +121,8 @@ def make_tokenizer(vocab_files): normalizers.Replace("“", '"'), normalizers.Replace("”", '"'), normalizers.Replace("’", "'"), + normalizers.Replace("‘", "'"), + normalizers.Replace("–", "—"), normalizers.NFKD(), normalizers.Lowercase(), normalizers.StripAccents(),