forked from nfagerlund/fragment-mangler
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsentence_segmenter.rb
executable file
·49 lines (36 loc) · 1.31 KB
/
sentence_segmenter.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
#!/usr/bin/env ruby
# Usage: sentence_segmenter.rb ./file1.html ./file2.html ./file3.html
# Sentence segmenter -- splits <p> elements into something like
# <div class="real-paragraph">
# <p class="temp-sentence"> ... </p>
# <p class="temp-sentence"> ... </p>
# </div>
# This script expects any number of HTML fragment files as its argument(s). It
# will mangle the content and update it in-place, destroying the previous
# content of the files.
require 'nokogiri'
require 'punkt-segmenter'
# Shared sentence tokenizer. It starts out unset; segment_on_sentences assigns
# it and call_punkt_segmenter reads it.
@tokenizer = nil
# Hands +text+ to the shared @tokenizer and returns whatever its
# sentences_from_text call yields for the :sentences_text output option.
#
# @tokenizer must already have been assigned (segment_on_sentences does this);
# this method does not create one itself.
#
# @param text [String] markup or text to split into sentences
# @return the tokenizer's result; callers join it as a list of strings
def call_punkt_segmenter(text)
  @tokenizer.sentences_from_text(text, output: :sentences_text)
end
# Rewrites every <p> element in an HTML fragment as a
# <div class="real-paragraph"> that holds one <p class="temp-sentence">
# per sentence, and returns the resulting HTML.
#
# Side effect: replaces the shared @tokenizer with a new
# Punkt::SentenceTokenizer built from the complete input +text+.
#
# @param text [String] HTML fragment source
# @return [String] the fragment serialized back to HTML after rewriting
def segment_on_sentences(text)
  fragment = Nokogiri::HTML::DocumentFragment.parse(text)
  # The tokenizer is constructed from the whole input, not per paragraph.
  @tokenizer = Punkt::SentenceTokenizer.new(text)

  sentence_break = '</p> <p class="temp-sentence">'
  fragment.css('p').each do |paragraph|
    # Split the paragraph's inner markup (not just its text) into sentences.
    pieces = call_punkt_segmenter(paragraph.inner_html)
    wrapper = "<div class=\"real-paragraph\"> <p class=\"temp-sentence\">#{pieces.join(sentence_break)}</p></div>"
    paragraph.replace(wrapper)
  end

  fragment.to_html
end
# Process each file named on the command line: read it as UTF-8, run it through
# segment_on_sentences, and overwrite the same file with the result.
ARGV.each do |filename|
  target = File.expand_path(filename)
  print "Mangling #{target}... "

  # Compute the replacement content before the file is opened for writing,
  # so a failure while segmenting does not leave the file truncated.
  source_html = File.read(target, encoding: 'utf-8')
  rewritten = segment_on_sentences(source_html)
  File.write(target, rewritten)

  print " done.\n"
end