-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathspec-part2.ctr
58 lines (47 loc) · 1.72 KB
/
spec-part2.ctr
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
# Raise the memory limit of Broom before the spec runs.
# The argument evaluates to 8 * 1024^3 (8589934592); presumably a byte count, i.e. 8 GiB - TODO confirm the unit.
Broom memoryLimit: 1024 * 1024 * 1024 * 8.
import main: \*.
import
Library/Test: 'describe'
.
#:language XFrozen
# Spec for TokenAnalyser: one existence check plus one end-to-end run that
# computes n-gram counts over sample files and writes the results to disk.
# The suite block receives the spec helpers as it (declare a case) and let (declare a fixture).
describe TokenAnalyser do: {:it:let
# Case 1: merely referencing TokenAnalyser must not raise a String error.
it exists do: {
{ TokenAnalyser. } should not raiseError: String.
}.
# Fixtures. Each let names a value produced by the block that follows it;
# presumably the name is bound in the context passed in brackets - TODO confirm against Library/Test.
let[Reflect thisContext] lexer { Lexer }.
let[Reflect thisContext] analyser { NGramHelper }.
let[Reflect thisContext] glblctx { Reflect thisContext }.
# NOTE(review): tokenmap is opened with mode r here but receives write: at the end of the
# n-gram case below - confirm that the mode is intended.
let[Reflect thisContext] tokenmap { File new: 'tokenMap', open: 'r' }.
# limit is the upper bound of the sample range and feeds the progress bound in the n-gram case.
let[Reflect thisContext] limit { 5000 }.
# ngram-N is the n-gram order used throughout the n-gram case.
let[Reflect thisContext] ngram-N { 3 }.
# fancy-name maps a number to a prefix string: 1 uni, 2 bi, 3 tri, 4 quadri, anything else insane.
# It is only used below to build the label of the n-gram case.
var fancy-name is {\:ex
(frozen _ is Number
case: 1 do: {\ 'uni'},
case: 2 do: {\ 'bi'},
case: 3 do: {\ 'tri'},
case: 4 do: {\ 'quadri'},
default: {\ 'insane'}) switch: ex.
}.
# Case 2: run the analyser over the sample texts and persist count, inverse count,
# probability and the token table.
it can tokenise and calculate NGram do: {:thisTest
# One File read per x produced by the generator, using the path samples/sample followed by x.
# NOTE(review): the range starts at 2, not 1 - confirm the sample numbering and whether
# the to: bound is inclusive.
var allTargetTexts is (Generator from: 2 to: limit) fmap: (\:x File new: 'samples/sample$$x', read).
# Feed the lexer the contents of the collation file: split into lines, then each line
# split on a single space with max: 1.
lexer collate: (File new: 'collation', read split: '\n', fmap: {:x ^x split: ' ' max: 1. }).
# Label this case using the fancy-name prefix for ngram-N followed by gram.
thisTest tests: '${{fancy-name[ngram-N]}}$gram'.
# Declare the progress total as limit - ngram-N + 1.
# NOTE(review): verify this matches the number of pre: callbacks the analyser issues.
thisTest progressesUpTo: limit - ngram-N + 1.
# Run the analysis. The pre: callback forwards its argument to thisTest progress:,
# the post: callback calls thisTest show.
var res is analyser withN: ngram-N,
ngramCountOnTexts: allTargetTexts
withLexer: lexer
pre: {:i thisTest progress: i.}
post: {:i thisTest show.}.
# The result is taken apart positionally: last element is the token table, head is the count.
var tokenTable is res last.
var ngramCount is res head.
# Take dimension ngram-N - 1 at index 0 of the count, invert it elementwise
# (multiplicativeInverse) and save it as countinv.
var ngramCountVec is ngramCount extractDimension: ngram-N - 1 atIndex: 0, multiplicativeInverse.
ngramCountVec writeInto: 'countinv'.
# Multiply the count by the inverted vector through the same dimension;
# presumably this normalises counts into probabilities - TODO confirm with NGramHelper.
var ngramProb is ngramCount multiply: ngramCountVec throughDimension: ngram-N - 1.
ngramCount writeInto: 'count'.
ngramProb writeInto: 'prob'.
# Dump every key and value of the token table as one line each into tokenmap, then close it.
# NOTE(review): if File write: replaces the file contents on every call, only the last
# pair survives - confirm whether append: is what is wanted here.
tokenTable each: {:k:v
tokenmap write: '$$k : $$v \n'.
}.
tokenmap close.
}.
}.