This repository has been archived by the owner on Jun 28, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 195
/
Copy pathkata-spell-check.sh
executable file
·336 lines (269 loc) · 8.55 KB
/
kata-spell-check.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
#!/bin/bash
# Copyright (c) 2019 Intel Corporation
#
# SPDX-License-Identifier: Apache-2.0
#
# Description: spell-check utility.

# Enable command tracing when DEBUG is set to a non-empty value.
# The ":-" default keeps this line safe even if the shell was started
# with 'nounset' already enabled.
[ -n "${DEBUG:-}" ] && set -x

set -o errexit
set -o pipefail
set -o nounset

# Ensure we spell check in English.
#
# The variables are exported so that the external tools run by this
# script see the same locale as the shell itself, even when they were
# not already present in the environment.
export LANG=C
export LC_ALL=C

script_name=${0##*/}

if [ "$(uname -s)" == "Darwin" ]
then
    # Hunspell dictionaries are not easily available
    # on this platform it seems.
    echo "INFO: $script_name: OSX not supported - exiting"
    exit 0
fi

self_dir=$(dirname "$(readlink -f "$0")")
cidir="${self_dir}/../../.ci"

source "${cidir}/lib.sh"

# Directory containing word lists.
#
# Each file in this directory must:
#
# - Have the ".txt" extension.
# - Contain one word per line.
#
# Additionally, the files may contain blank lines and comments
# (lines beginning with '#').
KATA_DICT_FRAGMENT_DIR=${KATA_DICT_FRAGMENT_DIR:-data}

KATA_DICT_NAME="${KATA_DICT_NAME:-kata-dictionary}"

# Name of dictionary file suitable for using with hunspell(1)
# as a personal dictionary.
KATA_DICT_FILE="${KATA_DICT_FILE:-${KATA_DICT_NAME}.dic}"

# Name of the rules file that accompanies KATA_DICT_FILE.
#
# By default it is the dictionary file name with its ".dic" suffix
# replaced by ".aff". Only a trailing ".dic" is removed, so names that
# merely contain ".dic" somewhere in the middle are not corrupted, and
# the result can never be identical to the dictionary file name.
KATA_RULES_FILE="${KATA_RULES_FILE:-${KATA_DICT_FILE%.dic}.aff}"

# command to remove code from markdown (inline and blocks)
strip_cmd="${cidir}/kata-doc-to-script.sh"

fragment_dir="${self_dir}/${KATA_DICT_FRAGMENT_DIR}"

# Name of file containing dictionary rules that apply to the
# KATA_DICT_FILE word list.
rules_file_name="rules.aff"

# Command to spell check a file
spell_check_cmd="${KATA_SPELL_CHECK_CMD:-hunspell}"

# Command to convert a markdown file into plain text
md_convert_tool="${KATA_MARKDOWN_CONVERT_TOOL:-pandoc}"

# Directory the generated dictionary and rules files are written to.
KATA_DICT_DIR="${KATA_DICT_DIR:-${self_dir}}"
dict_file="${KATA_DICT_DIR}/${KATA_DICT_FILE}"
rules_file="${KATA_DICT_DIR}/${KATA_RULES_FILE}"

# Hunspell refers to custom dictionary by their path followed by the name of
# the dictionary (without the file extension).
kata_dict_ref="${KATA_DICT_DIR}/${KATA_DICT_NAME}"

# All project documentation must be written in English,
# with American English taking priority.
#
# We also use a custom dictionary which has to be specified by its
# "directory and name prefix" and which must also be the first specified
# dictionary.
dict_languages="${kata_dict_ref},en_US,en_GB"
# Build the hunspell personal dictionary from the word-list fragments.
#
# Globals read:
#   fragment_dir    - directory holding the "*.txt" word lists and the
#                     rules file.
#   rules_file_name - name of the rules file inside fragment_dir.
#   dict_file       - path of the dictionary file to create.
#   rules_file      - path the rules file is copied to.
#
# Calls die() (not defined in this file) if the fragment directory is
# missing or the resulting word list is empty.
make_dictionary()
{
    [ -d "$fragment_dir" ] || die "invalid fragment directory"
    [ -z "$dict_file" ] && die "missing dictionary output file name"

    # Note: only the first field of each line is extracted to allow for
    # inline comments in each fragment. For example:
    #
    #   word # this text describes why the word is in the dictionary.
    #
    # Lines that are empty, contain only whitespace, or whose first field
    # starts with '#' (comments, indented or not) are skipped so that they
    # can never end up as bogus dictionary entries.
    local dict

    dict=$(cat "$fragment_dir"/*.txt |\
        awk 'NF && $1 !~ /^#/ {print $1}' |\
        sort -u || true)

    [ -z "$dict" ] && die "generated dictionary is empty"

    # Now, add in the number of words as a header (required by Hunspell)
    local count

    count=$(printf '%s\n' "$dict" | wc -l | awk '{print $1}' || true)
    [ -z "$count" ] && die "cannot determine dictionary length"
    [ "$count" -eq 0 ] && die "invalid dictionary length"

    # Construct the dictionary: the word count followed by the words.
    printf '%s\n%s\n' "$count" "$dict" > "$dict_file"

    cp "${fragment_dir}/${rules_file_name}" "${rules_file}"
}
# Spell check a single markdown file.
#
# Arguments:
#   $1 - path of the file to check.
#
# Globals read:
#   dict_file, strip_cmd, md_convert_tool, spell_check_cmd, dict_languages.
#
# The dictionary is generated first (via make_dictionary) if "$dict_file"
# does not exist. Each suspect word is reported with warn() and die() is
# called if at least one was found; otherwise a success message is logged
# with info(). (die, info and warn are not defined in this file.)
spell_check_file()
{
    # Default to the empty string so that a missing argument results in
    # the explicit error below rather than a 'nounset' failure.
    local file="${1:-}"

    [ -z "$file" ] && die "need file to check"
    [ -e "$file" ] || die "file does not exist: '$file'"

    [ -e "$dict_file" ] || make_dictionary

    info "Spell checking file '$file'"

    # Determine the pandoc input format.
    #
    # The configured conversion tool is queried (rather than a hard-coded
    # "pandoc") since that is the command setup() validates and the one
    # used for the conversion below.
    local pandoc_input_fmts
    local pandoc_input_fmt

    pandoc_input_fmts=$("$md_convert_tool" --list-input-formats 2>/dev/null || true)

    if [ -z "$pandoc_input_fmts" ]
    then
        # We're using a very old version of pandoc that doesn't
        # support listing its available input formats, so
        # specify a default.
        pandoc_input_fmt="markdown_github"
    else
        # Pandoc has multiple names for the gfm parser so find one of them
        pandoc_input_fmt=$(echo "$pandoc_input_fmts" |\
            grep -E "gfm|github" |\
            head -n 1 || true)
    fi

    [ -z "$pandoc_input_fmt" ] && die "cannot find usable pandoc input format"

    local stripped_doc
    local pandoc_doc
    local utf8_free_doc
    local pre_hunspell_doc
    local hunspell_results
    local final_results

    # First, run the file through the code-stripping command.
    stripped_doc=$("$strip_cmd" -i "$file" -)

    # Next, convert the remainder into plain text to remove the
    # remaining markdown syntax.
    #
    # Before pandoc gets hold of it:
    #
    # - Replace pipes with spaces. This
    #   fixes an issue with old versions of pandoc (Ubuntu 16.04)
    #   which completely mangle tables into nonsense.
    #
    # - Remove empty reference links.
    #
    #   For example, this markdown
    #
    #     blah [`qemu-lite`][qemu-lite] blah.
    #       :
    #     [qemu-lite]: https://...
    #
    #   Gets converted into
    #
    #     blah [][qemu-lite] blah.
    #       :
    #     [qemu-lite]: https://...
    #
    #   And the empty set of square brackets confuses pandoc.
    #
    # After pandoc has processed the data, remove any remaining
    # "inline links" in this format:
    #
    #   [link name](#link-address)
    #
    # This is strictly only required for old versions of pandoc.
    pandoc_doc=$(echo "$stripped_doc" |\
        tr '|' ' ' |\
        sed 's/\[\]\[[^]]*\]//g' |\
        "$md_convert_tool" -f "${pandoc_input_fmt}" -t plain - |\
        sed 's/\[[^]]*\]([^\)]*)//g' || true)

    # Convert the file into "pure ASCII" by removing all awkward
    # Unicode characters that won't spell check.
    #
    # Necessary since pandoc is "clever" and will convert things like
    # GitHub's colon emojis (such as ":smile:") into the actual utf8
    # character where possible.
    utf8_free_doc=$(echo "$pandoc_doc" | iconv -c -f utf-8 -t ascii)

    # Next, perform the following simplifications:
    #
    # - Remove URLs.
    # - Remove email addresses.
    # - Replace most punctuation symbols with a space
    #   (excluding a dash (aka hyphen!)
    # - Carefully remove non-hyphen dashes.
    # - Remove GitHub @userids.
    pre_hunspell_doc=$(echo "$utf8_free_doc" |\
        sed 's,https*://[^[:space:]()][^[:space:]()]*,,g' |\
        sed -r 's/[a-zA-Z0-9.-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9.-]+//g' |\
        tr '[,\[\]()\*\\/\|=]' ' ' |\
        sed -e 's/^ *-//g' -e 's/- $//g' -e 's/ -//g' |\
        sed 's/@[a-zA-Z0-9][a-zA-Z0-9]*\b//g')

    # Call the spell checker
    hunspell_results=$(echo "$pre_hunspell_doc" | "$spell_check_cmd" -d "${dict_languages}")

    # Finally, post-process the hunspell output:
    #
    # - Parse the output to ignore:
    #   - Hunspell banner.
    #   - Correctly spelt words (lines starting with '*', '+' or '-').
    #   - All words containing numbers (like "100MB").
    #   - All words that appear to be acronyms / abbreviations
    #     (at least two upper-case letters and which may be plural or
    #     possessive).
    #   - All words that appear to be numbers.
    #   - All possessives and the dreaded isolated "'s" which occurs
    #     for input like this:
    #
    #       `kata-shim`'s
    #
    #     which gets converted by $strip_cmd into simply:
    #
    #       's
    #
    # - Sort output.
    final_results=$(echo "$hunspell_results" |\
        grep -Evi "(ispell|hunspell)" |\
        grep -Ev '^(\*|\+|-)' |\
        grep -Evi "^(&|#) [^ ]*[0-9][^ ]*" |\
        grep -Ev "^. [A-Z][A-Z][A-Z]*(s|'s)*" |\
        grep -Ev "^. 's" |\
        sort -u || true)

    local line
    local incorrects
    local near_misses

    # Lines starting with '&' carry a list of suggestions after a colon;
    # for lines starting with '#' only the word itself (field 2) is kept.
    near_misses=$(echo "$final_results" | grep '^&' || true)
    incorrects=$(echo "$final_results" | grep '^\#' | awk '{print $2}' || true)

    local -i failed=0

    [ -n "$near_misses" ] && failed+=1
    [ -n "$incorrects" ] && failed+=1

    echo "$near_misses" | while read -r line
    do
        [ "$line" = "" ] && continue

        local word
        local possibles

        word=$(echo "$line" | awk '{print $2}')
        possibles=$(echo "$line" | cut -d: -f2- | sed 's/^ *//g')

        warn "Word '${word}': did you mean one of the following?: ${possibles}"
    done

    local incorrect
    for incorrect in $incorrects
    do
        warn "Incorrect word: '$incorrect'"
    done

    [ "$failed" -gt 0 ] && die "Spell check failed for file: '$file'"

    info "Spell check successful for file: '$file'"
}
# Delete the generated dictionary and its rules file.
#
# Globals read:
#   dict_file  - full path of the generated dictionary file.
#   rules_file - full path of the generated rules file.
#
# The full paths are used (not the bare file names) so that the files
# written by make_dictionary are removed regardless of the directory the
# script is run from. Missing files are not an error.
delete_dictionary()
{
    rm -f -- "${dict_file}" "${rules_file}"
}
# Check that the external commands this script depends on are available.
#
# Globals read:
#   spell_check_cmd - the spell checker command.
#   md_convert_tool - the markdown conversion command.
#
# Calls die() naming the first command that cannot be found.
setup()
{
    local -a required_tools=("$spell_check_cmd" "$md_convert_tool")
    local tool

    for tool in "${required_tools[@]}"
    do
        if ! command -v "$tool" >/dev/null 2>&1
        then
            die "Need $tool command"
        fi
    done
}
# Print the command-line usage summary to stdout.
#
# The text names the script using ${script_name} and lists the supported
# commands: check, delete-dict, help and make-dict.
#
# Note: the here-document is introduced with '<<-', so any leading tab
# characters (tabs only, not spaces) on its lines and on the terminator
# are stripped by the shell.
usage()
{
cat <<-EOF
Usage: ${script_name} <command> [arguments]
Description: Spell-checking utility.
Commands:
check <file> : Spell check the specified file
(implies 'make-dict').
delete-dict : Delete the dictionary.
help : Show this usage.
make-dict : Create the dictionary.
EOF
}
# Script entry point: validate the environment and dispatch the command.
#
# Arguments:
#   $1 - command name (check, delete-dict, help, -h, --help, make-dict).
#   $2 - for "check" only: the file to spell check.
#
# Calls die() if no command or an unknown command is given.
main()
{
    setup

    [ -z "${1:-}" ] && usage && echo && die "need command"

    case "$1" in
        # The file argument is passed with an empty default so that a
        # missing file is reported by spell_check_file itself instead of
        # aborting with an "unbound variable" error under 'nounset'.
        check) shift && spell_check_file "${1:-}" ;;
        delete-dict) delete_dictionary ;;
        help|-h|--help) usage && exit 0 ;;
        make-dict) make_dictionary ;;
        *) die "invalid command: '$1'" ;;
    esac
}
# Run the script, forwarding all command-line arguments unchanged to main.
main "$@"