From 1e12afeaabf98549caaabfe58f2fa4391cd17c6c Mon Sep 17 00:00:00 2001 From: c42f Date: Wed, 25 Jan 2023 18:04:48 +1000 Subject: [PATCH] Parser fuzz testing tools and fixes (#185) Some fuzz testing tooling to check that the parser doesn't crash on randomly generated source strings. Fix several problems found with this: * `?` shouldn't be special in `parse_unary`. This was inherited from a syntax hack used to support the ancient and questionable `@windows?` and other platform test macros in osutils.jl. This is long since gone and we shouldn't continue supporting this. * `<:` may be unary so `<: <: x` should parse as `(<: (<: x))`, even though this is kind of nonsense semantically. * Constructing a SyntaxNode tree shouldn't fail when there are malformed literals and we've parsed using `ignore_errors=true`. Instead we use ErrorVal() for the leaf values in that tree. * The tokenizer should not crash when overlong UTF-8 character literals are encountered. --- src/parser.jl | 13 ++++---- src/syntax_tree.jl | 31 ++++++++++--------- src/tokenize.jl | 2 +- src/tokenize_utils.jl | 12 ++++---- test/fuzz_test.jl | 71 +++++++++++++++++++++++++++++++++++++++++++ test/parser.jl | 4 +++ test/parser_api.jl | 6 ++++ test/tokenize.jl | 29 ++++++++++++------ 8 files changed, 130 insertions(+), 38 deletions(-) create mode 100644 test/fuzz_test.jl diff --git a/src/parser.jl b/src/parser.jl index d7722b6a..e2b28251 100644 --- a/src/parser.jl +++ b/src/parser.jl @@ -1025,9 +1025,10 @@ function parse_unary_subtype(ps::ParseState) else # <: x ==> (<:-pre x) # <: A where B ==> (<:-pre (where A B)) + # <: <: x ==> (<:-pre (<:-pre x)) mark = position(ps) bump(ps, TRIVIA_FLAG) - parse_where(ps, parse_juxtapose) + parse_unary_subtype(ps) # Flisp parser handled this, but I don't know how it can happen... 
@check peek_behind(ps).kind != K"tuple" emit(ps, mark, kind(t), PREFIX_OP_FLAG) @@ -1162,9 +1163,7 @@ function parse_unary(ps::ParseState) if ( !is_operator(op_k) || is_word_operator(op_k) || - # TODO(jb): `?` should probably not be listed here - # except for the syntax hack in osutils.jl - (op_k in KSet": ' .' ?") || + (op_k in KSet": ' .'") || (is_syntactic_unary_op(op_k) && !is_dotted(op_t)) || is_syntactic_operator(op_k) ) @@ -1693,8 +1692,10 @@ function parse_call_chain(ps::ParseState, mark, is_macrocall=false) emit(ps, mark, K"curly") end elseif k in KSet" \" \"\"\" ` ``` " && - !preceding_whitespace(t) && - maybe_strmac && peek_behind(ps, macro_name_position).kind == K"Identifier" + !preceding_whitespace(t) && maybe_strmac && + (# Must mirror the logic in lex_quote() for consistency + origk = peek_behind(ps, macro_name_position).orig_kind; + origk == K"Identifier" || is_contextual_keyword(origk) || is_word_operator(origk)) # Custom string and command literals # x"str" ==> (macrocall @x_str (string-r "str")) # x`str` ==> (macrocall @x_cmd (cmdstring-r "str")) diff --git a/src/syntax_tree.jl b/src/syntax_tree.jl index 293403e2..733dd750 100644 --- a/src/syntax_tree.jl +++ b/src/syntax_tree.jl @@ -29,18 +29,19 @@ function SyntaxNode(source::SourceFile, raw::GreenNode{SyntaxHead}, position::In val_str = view(source, val_range) # Here we parse the values eagerly rather than representing them as # strings. Maybe this is good. Maybe not. + # + # Any errors parsing literals are represented as ErrorVal() - this can + # happen when the user sets `ignore_errors=true` during parsing. val = if k == K"Integer" parse_int_literal(val_str) elseif k == K"Float" v, code = parse_float_literal(Float64, source.code, position, position+span(raw)) - @check code == :ok || code == :underflow - v + (code == :ok || code == :underflow) ? 
v : ErrorVal() elseif k == K"Float32" v, code = parse_float_literal(Float32, source.code, position, position+span(raw)) - @check code == :ok || code == :underflow - v + (code == :ok || code == :underflow) ? v : ErrorVal() elseif k in KSet"BinInt OctInt HexInt" parse_uint_literal(val_str, k) elseif k == K"true" @@ -49,14 +50,15 @@ function SyntaxNode(source::SourceFile, raw::GreenNode{SyntaxHead}, position::In false elseif k == K"Char" io = IOBuffer() - ds = Diagnostic[] had_error = unescape_julia_string(io, source.code, position, - position+span(raw), ds) - @check !had_error && isempty(ds) - seek(io, 0) - c = read(io, Char) - @check eof(io) - c + position+span(raw), Diagnostic[]) + if had_error + ErrorVal() + else + seek(io, 0) + c = read(io, Char) + eof(io) ? c : ErrorVal() + end elseif k == K"Identifier" if has_flags(head(raw), RAW_STRING_FLAG) io = IOBuffer() @@ -70,15 +72,14 @@ function SyntaxNode(source::SourceFile, raw::GreenNode{SyntaxHead}, position::In Symbol(val_str) elseif k in KSet"String CmdString" io = IOBuffer() + had_error = false if has_flags(head(raw), RAW_STRING_FLAG) unescape_raw_string(io, val_str, k == K"CmdString") else - ds = Diagnostic[] had_error = unescape_julia_string(io, source.code, position, - position+span(raw), ds) - @check !had_error && isempty(ds) + position+span(raw), Diagnostic[]) end - String(take!(io)) + had_error ? ErrorVal() : String(take!(io)) elseif is_operator(k) isempty(val_range) ? 
Symbol(untokenize(k)) : # synthetic invisible tokens diff --git a/src/tokenize.jl b/src/tokenize.jl index 974062af..70163010 100644 --- a/src/tokenize.jl +++ b/src/tokenize.jl @@ -48,7 +48,7 @@ end @inline ishex(c::Char) = isdigit(c) || ('a' <= c <= 'f') || ('A' <= c <= 'F') @inline isbinary(c::Char) = c == '0' || c == '1' @inline isoctal(c::Char) = '0' ≤ c ≤ '7' -@inline iswhitespace(c::Char) = Base.isspace(c) || c === '\ufeff' +@inline iswhitespace(c::Char) = (Base.isvalid(c) && Base.isspace(c)) || c === '\ufeff' struct StringState triplestr::Bool diff --git a/src/tokenize_utils.jl b/src/tokenize_utils.jl index 6da41683..1767cc52 100644 --- a/src/tokenize_utils.jl +++ b/src/tokenize_utils.jl @@ -4,19 +4,19 @@ const EOF_CHAR = typemax(Char) function is_identifier_char(c::Char) c == EOF_CHAR && return false - Base.ismalformed(c) && return false + Base.isvalid(c) || return false return Base.is_id_char(c) end function is_identifier_start_char(c::Char) c == EOF_CHAR && return false - Base.ismalformed(c) && return false + Base.isvalid(c) || return false return Base.is_id_start_char(c) end # Chars that we will never allow to be part of a valid non-operator identifier function is_never_id_char(ch::Char) - Base.ismalformed(ch) && return true + Base.isvalid(ch) || return true cat = Unicode.category_code(ch) c = UInt32(ch) return ( @@ -50,7 +50,7 @@ readchar(io::IO) = eof(io) ? 
EOF_CHAR : read(io, Char) # `a .(op) b` or `.(op)a` and where `length(string(op)) == 1` @inline function dotop1(c1::Char) c1 == EOF_CHAR && return false - Base.ismalformed(c1) && return false + Base.isvalid(c1) || return false c = UInt32(c1) c == 0x00000021 || c == 0x000000a6 || @@ -173,7 +173,7 @@ end @inline function isopsuffix(c1::Char) c1 == EOF_CHAR && return false - Base.ismalformed(c1) && return false + Base.isvalid(c1) || return false c = UInt32(c1) if (c < 0xa1 || c > 0x10ffff) return false @@ -252,7 +252,7 @@ end function is_operator_start_char(c::Char) c == EOF_CHAR && return false - Base.ismalformed(c) && return false + Base.isvalid(c) || return false is_operator_start_char(UInt32(c)) end is_operator_start_char(u::UInt32) = u == 0x00000021 || (u == 0x00000024 || (u == 0x00000025 || (u == 0x00000026 || (u == 0x00000027 || (u == 0x0000002a || (u == 0x0000002b || (u == 0x0000002d || (u == 0x0000002e || (u == 0x0000002f || (u == 0x0000003a || (u == 0x0000003c || (u == 0x0000003d || (u == 0x0000003e || (u == 0x0000003f || (u == 0x0000005c || (u == 0x0000005e || (u == 0x00000069 || (u == 0x00000077 || (u == 0x0000007c || (u == 0x0000007e || (u == 0x000000ac || (u == 0x000000b1 || (u == 0x000000d7 || (u == 0x000000f7 || (u == 0x00002026 || (u == 0x0000205d || (u == 0x0000214b || (u == 0x00002190 || (u == 0x00002191 || (u == 0x00002192 || (u == 0x00002193 || (u == 0x00002194 || (u == 0x0000219a || (u == 0x0000219b || (u == 0x000021a0 || (u == 0x000021a3 || (u == 0x000021a6 || (u == 0x000021ae || (u == 0x000021ce || (u == 0x000021cf || (u == 0x000021d2 || (u == 0x000021d4 || (u == 0x000021f4 || (u == 0x000021f5 || (u == 0x000021f6 || (u == 0x000021f7 || (u == 0x000021f8 || (u == 0x000021f9 || (u == 0x000021fa || (u == 0x000021fb || (u == 0x000021fc || (u == 0x000021fd || (u == 0x000021fe || (u == 0x000021ff || (u == 0x00002208 || (u == 0x00002209 || (u == 0x0000220a || (u == 0x0000220b || (u == 0x0000220c || (u == 0x0000220d || (u == 0x00002213 || (u == 
0x00002214 || (u == 0x00002217 || (u == 0x00002218 || (u == 0x00002219 || (u == 0x0000221a || (u == 0x0000221b || (u == 0x0000221c || (u == 0x0000221d || (u == 0x00002224 || (u == 0x00002225 || (u == 0x00002226 || (u == 0x00002227 || (u == 0x00002228 || (u == 0x00002229 || (u == 0x0000222a || (u == 0x00002237 || (u == 0x00002238 || (u == 0x0000223a || (u == 0x0000223b || (u == 0x0000223d || (u == 0x0000223e || (u == 0x00002240 || (u == 0x00002241 || (u == 0x00002242 || (u == 0x00002243 || (u == 0x00002244 || (u == 0x00002245 || (u == 0x00002246 || (u == 0x00002247 || (u == 0x00002248 || (u == 0x00002249 || (u == 0x0000224a || (u == 0x0000224b || (u == 0x0000224c || (u == 0x0000224d || (u == 0x0000224e || (u == 0x0000224f || (u == 0x00002250 || (u == 0x00002251 || (u == 0x00002252 || (u == 0x00002253 || (u == 0x00002254 || (u == 0x00002255 || (u == 0x00002256 || (u == 0x00002257 || (u == 0x00002258 || (u == 0x00002259 || (u == 0x0000225a || (u == 0x0000225b || (u == 0x0000225c || (u == 0x0000225d || (u == 0x0000225e || (u == 0x0000225f || (u == 0x00002260 || (u == 0x00002261 || (u == 0x00002262 || (u == 0x00002263 || (u == 0x00002264 || (u == 0x00002265 || (u == 0x00002266 || (u == 0x00002267 || (u == 0x00002268 || (u == 0x00002269 || (u == 0x0000226a || (u == 0x0000226b || (u == 0x0000226c || (u == 0x0000226d || (u == 0x0000226e || (u == 0x0000226f || (u == 0x00002270 || (u == 0x00002271 || (u == 0x00002272 || (u == 0x00002273 || (u == 0x00002274 || (u == 0x00002275 || (u == 0x00002276 || (u == 0x00002277 || (u == 0x00002278 || (u == 0x00002279 || (u == 0x0000227a || (u == 0x0000227b || (u == 0x0000227c || (u == 0x0000227d || (u == 0x0000227e || (u == 0x0000227f || (u == 0x00002280 || (u == 0x00002281 || (u == 0x00002282 || (u == 0x00002283 || (u == 0x00002284 || (u == 0x00002285 || (u == 0x00002286 || (u == 0x00002287 || (u == 0x00002288 || (u == 0x00002289 || (u == 0x0000228a || (u == 0x0000228b || (u == 0x0000228d || (u == 0x0000228e || (u == 0x0000228f || (u == 
0x00002290 || (u == 0x00002291 || (u == 0x00002292 || (u == 0x00002293 || (u == 0x00002294 || (u == 0x00002295 || (u == 0x00002296 || (u == 0x00002297 || (u == 0x00002298 || (u == 0x00002299 || (u == 0x0000229a || (u == 0x0000229b || (u == 0x0000229c || (u == 0x0000229e || (u == 0x0000229f || (u == 0x000022a0 || (u == 0x000022a1 || (u == 0x000022a2 || (u == 0x000022a3 || (u == 0x000022a9 || (u == 0x000022ac || (u == 0x000022ae || (u == 0x000022b0 || (u == 0x000022b1 || (u == 0x000022b2 || (u == 0x000022b3 || (u == 0x000022b4 || (u == 0x000022b5 || (u == 0x000022b6 || (u == 0x000022b7 || (u == 0x000022bb || (u == 0x000022bc || (u == 0x000022bd || (u == 0x000022c4 || (u == 0x000022c5 || (u == 0x000022c6 || (u == 0x000022c7 || (u == 0x000022c9 || (u == 0x000022ca || (u == 0x000022cb || (u == 0x000022cc || (u == 0x000022cd || (u == 0x000022ce || (u == 0x000022cf || (u == 0x000022d0 || (u == 0x000022d1 || (u == 0x000022d2 || (u == 0x000022d3 || (u == 0x000022d5 || (u == 0x000022d6 || (u == 0x000022d7 || (u == 0x000022d8 || (u == 0x000022d9 || (u == 0x000022da || (u == 0x000022db || (u == 0x000022dc || (u == 0x000022dd || (u == 0x000022de || (u == 0x000022df || (u == 0x000022e0 || (u == 0x000022e1 || (u == 0x000022e2 || (u == 0x000022e3 || (u == 0x000022e4 || (u == 0x000022e5 || (u == 0x000022e6 || (u == 0x000022e7 || (u == 0x000022e8 || (u == 0x000022e9 || (u == 0x000022ea || (u == 0x000022eb || (u == 0x000022ec || (u == 0x000022ed || (u == 0x000022ee || (u == 0x000022ef || (u == 0x000022f0 || (u == 0x000022f1 || (u == 0x000022f2 || (u == 0x000022f3 || (u == 0x000022f4 || (u == 0x000022f5 || (u == 0x000022f6 || (u == 0x000022f7 || (u == 0x000022f8 || (u == 0x000022f9 || (u == 0x000022fa || (u == 0x000022fb || (u == 0x000022fc || (u == 0x000022fd || (u == 0x000022fe || (u == 0x000022ff || (u == 0x000025b7 || (u == 0x000027c2 || (u == 0x000027c8 || (u == 0x000027c9 || (u == 0x000027d1 || (u == 0x000027d2 || (u == 0x000027d5 || (u == 0x000027d6 || (u == 0x000027d7 || (u == 
0x000027f0 || (u == 0x000027f1 || (u == 0x000027f5 || (u == 0x000027f6 || (u == 0x000027f7 || (u == 0x000027f9 || (u == 0x000027fa || (u == 0x000027fb || (u == 0x000027fc || (u == 0x000027fd || (u == 0x000027fe || (u == 0x000027ff || (u == 0x00002900 || (u == 0x00002901 || (u == 0x00002902 || (u == 0x00002903 || (u == 0x00002904 || (u == 0x00002905 || (u == 0x00002906 || (u == 0x00002907 || (u == 0x00002908 || (u == 0x00002909 || (u == 0x0000290a || (u == 0x0000290b || (u == 0x0000290c || (u == 0x0000290d || (u == 0x0000290e || (u == 0x0000290f || (u == 0x00002910 || (u == 0x00002911 || (u == 0x00002912 || (u == 0x00002913 || (u == 0x00002914 || (u == 0x00002915 || (u == 0x00002916 || (u == 0x00002917 || (u == 0x00002918 || (u == 0x0000291d || (u == 0x0000291e || (u == 0x0000291f || (u == 0x00002920 || (u == 0x00002944 || (u == 0x00002945 || (u == 0x00002946 || (u == 0x00002947 || (u == 0x00002948 || (u == 0x00002949 || (u == 0x0000294a || (u == 0x0000294b || (u == 0x0000294c || (u == 0x0000294d || (u == 0x0000294e || (u == 0x0000294f || (u == 0x00002950 || (u == 0x00002951 || (u == 0x00002952 || (u == 0x00002953 || (u == 0x00002954 || (u == 0x00002955 || (u == 0x00002956 || (u == 0x00002957 || (u == 0x00002958 || (u == 0x00002959 || (u == 0x0000295a || (u == 0x0000295b || (u == 0x0000295c || (u == 0x0000295d || (u == 0x0000295e || (u == 0x0000295f || (u == 0x00002960 || (u == 0x00002961 || (u == 0x00002962 || (u == 0x00002963 || (u == 0x00002964 || (u == 0x00002965 || (u == 0x00002966 || (u == 0x00002967 || (u == 0x00002968 || (u == 0x00002969 || (u == 0x0000296a || (u == 0x0000296b || (u == 0x0000296c || (u == 0x0000296d || (u == 0x0000296e || (u == 0x0000296f || (u == 0x00002970 || (u == 0x000029b7 || (u == 0x000029b8 || (u == 0x000029bc || (u == 0x000029be || (u == 0x000029bf || (u == 0x000029c0 || (u == 0x000029c1 || (u == 0x000029e1 || (u == 0x000029e3 || (u == 0x000029e4 || (u == 0x000029e5 || (u == 0x000029f4 || (u == 0x000029f6 || (u == 0x000029f7 || (u == 
0x000029fa || (u == 0x000029fb || (u == 0x00002a07 || (u == 0x00002a08 || (u == 0x00002a1d || (u == 0x00002a22 || (u == 0x00002a23 || (u == 0x00002a24 || (u == 0x00002a25 || (u == 0x00002a26 || (u == 0x00002a27 || (u == 0x00002a28 || (u == 0x00002a29 || (u == 0x00002a2a || (u == 0x00002a2b || (u == 0x00002a2c || (u == 0x00002a2d || (u == 0x00002a2e || (u == 0x00002a30 || (u == 0x00002a31 || (u == 0x00002a32 || (u == 0x00002a33 || (u == 0x00002a34 || (u == 0x00002a35 || (u == 0x00002a36 || (u == 0x00002a37 || (u == 0x00002a38 || (u == 0x00002a39 || (u == 0x00002a3a || (u == 0x00002a3b || (u == 0x00002a3c || (u == 0x00002a3d || (u == 0x00002a40 || (u == 0x00002a41 || (u == 0x00002a42 || (u == 0x00002a43 || (u == 0x00002a44 || (u == 0x00002a45 || (u == 0x00002a4a || (u == 0x00002a4b || (u == 0x00002a4c || (u == 0x00002a4d || (u == 0x00002a4e || (u == 0x00002a4f || (u == 0x00002a50 || (u == 0x00002a51 || (u == 0x00002a52 || (u == 0x00002a53 || (u == 0x00002a54 || (u == 0x00002a55 || (u == 0x00002a56 || (u == 0x00002a57 || (u == 0x00002a58 || (u == 0x00002a5a || (u == 0x00002a5b || (u == 0x00002a5c || (u == 0x00002a5d || (u == 0x00002a5e || (u == 0x00002a5f || (u == 0x00002a60 || (u == 0x00002a61 || (u == 0x00002a62 || (u == 0x00002a63 || (u == 0x00002a66 || (u == 0x00002a67 || (u == 0x00002a6a || (u == 0x00002a6b || (u == 0x00002a6c || (u == 0x00002a6d || (u == 0x00002a6e || (u == 0x00002a6f || (u == 0x00002a70 || (u == 0x00002a71 || (u == 0x00002a72 || (u == 0x00002a73 || (u == 0x00002a74 || (u == 0x00002a75 || (u == 0x00002a76 || (u == 0x00002a77 || (u == 0x00002a78 || (u == 0x00002a79 || (u == 0x00002a7a || (u == 0x00002a7b || (u == 0x00002a7c || (u == 0x00002a7d || (u == 0x00002a7e || (u == 0x00002a7f || (u == 0x00002a80 || (u == 0x00002a81 || (u == 0x00002a82 || (u == 0x00002a83 || (u == 0x00002a84 || (u == 0x00002a85 || (u == 0x00002a86 || (u == 0x00002a87 || (u == 0x00002a88 || (u == 0x00002a89 || (u == 0x00002a8a || (u == 0x00002a8b || (u == 0x00002a8c || (u == 
0x00002a8d || (u == 0x00002a8e || (u == 0x00002a8f || (u == 0x00002a90 || (u == 0x00002a91 || (u == 0x00002a92 || (u == 0x00002a93 || (u == 0x00002a94 || (u == 0x00002a95 || (u == 0x00002a96 || (u == 0x00002a97 || (u == 0x00002a98 || (u == 0x00002a99 || (u == 0x00002a9a || (u == 0x00002a9b || (u == 0x00002a9c || (u == 0x00002a9d || (u == 0x00002a9e || (u == 0x00002a9f || (u == 0x00002aa0 || (u == 0x00002aa1 || (u == 0x00002aa2 || (u == 0x00002aa3 || (u == 0x00002aa4 || (u == 0x00002aa5 || (u == 0x00002aa6 || (u == 0x00002aa7 || (u == 0x00002aa8 || (u == 0x00002aa9 || (u == 0x00002aaa || (u == 0x00002aab || (u == 0x00002aac || (u == 0x00002aad || (u == 0x00002aae || (u == 0x00002aaf || (u == 0x00002ab0 || (u == 0x00002ab1 || (u == 0x00002ab2 || (u == 0x00002ab3 || (u == 0x00002ab4 || (u == 0x00002ab5 || (u == 0x00002ab6 || (u == 0x00002ab7 || (u == 0x00002ab8 || (u == 0x00002ab9 || (u == 0x00002aba || (u == 0x00002abb || (u == 0x00002abc || (u == 0x00002abd || (u == 0x00002abe || (u == 0x00002abf || (u == 0x00002ac0 || (u == 0x00002ac1 || (u == 0x00002ac2 || (u == 0x00002ac3 || (u == 0x00002ac4 || (u == 0x00002ac5 || (u == 0x00002ac6 || (u == 0x00002ac7 || (u == 0x00002ac8 || (u == 0x00002ac9 || (u == 0x00002aca || (u == 0x00002acb || (u == 0x00002acc || (u == 0x00002acd || (u == 0x00002ace || (u == 0x00002acf || (u == 0x00002ad0 || (u == 0x00002ad1 || (u == 0x00002ad2 || (u == 0x00002ad3 || (u == 0x00002ad4 || (u == 0x00002ad5 || (u == 0x00002ad6 || (u == 0x00002ad7 || (u == 0x00002ad8 || (u == 0x00002ad9 || (u == 0x00002adb || (u == 0x00002af7 || (u == 0x00002af8 || (u == 0x00002af9 || (u == 0x00002afa || (u == 0x00002b30 || (u == 0x00002b31 || (u == 0x00002b32 || (u == 0x00002b33 || (u == 0x00002b34 || (u == 0x00002b35 || (u == 0x00002b36 || (u == 0x00002b37 || (u == 0x00002b38 || (u == 0x00002b39 || (u == 0x00002b3a || (u == 0x00002b3b || (u == 0x00002b3c || (u == 0x00002b3d || (u == 0x00002b3e || (u == 0x00002b3f || (u == 0x00002b40 || (u == 0x00002b41 || (u == 
0x00002b42 || (u == 0x00002b43 || (u == 0x00002b44 || (u == 0x00002b47 || (u == 0x00002b48 || (u == 0x00002b49 || (u == 0x00002b4a || (u == 0x00002b4b || (u == 0x00002b4c || (u == 0x0000ffe9 || (u == 0x0000ffea || (u == 0x0000ffeb || u == 0x0000ffec))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))) diff --git a/test/fuzz_test.jl b/test/fuzz_test.jl new file mode 100644 index 00000000..49dc11a2 --- /dev/null +++ b/test/fuzz_test.jl @@ -0,0 +1,71 @@ +using JuliaSyntax + +# Parser fuzz testing tools. + +function parser_exception(str) + try + JuliaSyntax.parseall(JuliaSyntax.SyntaxNode, str, ignore_errors=true) + false + catch + true + end +end + +""" +Reduce test case via combination of bisection and random deletion. + +This is suited to randomly generated strings. It might work with more code-like +strings too? +""" +function rand_reduce(str) + while true + if length(str) <= 1 + return str + end + m1 = thisind(str, length(str)÷2) + m2 = nextind(str, m1) + if parser_exception(str[1:m1]) + str = str[1:m1] + elseif parser_exception(str[m2:end]) + str = str[m2:end] + else + chunklen = 10 + reduced = false + if length(str) > chunklen + for i = 1:100 + m = thisind(str, rand(1:length(str)-chunklen)) + s = str[1:m]*str[prevind(str, m+chunklen):end] + if parser_exception(s) + str = s + reduced = true + break + end + end + end + if !reduced + return str + end + end + end +end + +# The parser should never throw an exception. 
To test whether this is true, +# try passing randomly generated bad input data into it. +function fuzz_test(gen_bad_input, N) + for i=1:N + str = gen_bad_input() + try + JuliaSyntax.parseall(JuliaSyntax.SyntaxNode, str, ignore_errors=true); + catch + @error "Parser threw exception" exception=current_exceptions() + return str + end + end + return nothing +end + +function fuzz_binary(N) + fuzz_test(N) do + String(rand(UInt8, 1_000_000)) + end +end diff --git a/test/parser.jl b/test/parser.jl index dbf0ae0f..a49e85c7 100644 --- a/test/parser.jl +++ b/test/parser.jl @@ -238,6 +238,7 @@ tests = [ "/x" => "(call-pre (error /) x)" "+₁ x" => "(call-pre (error +₁) x)" ".<: x" => "(dotcall-pre (error .<:) x)" + "?\"str\"" => """(call-pre (error ?) (string "str"))""" ], JuliaSyntax.parse_factor => [ "x^y" => "(call-i x ^ y)" @@ -257,6 +258,7 @@ tests = [ "<:{T}(x::T)" => "(call (curly <: T) (::-i x T))" "<:(x::T)" => "(<:-pre (::-i x T))" "<: x" => "(<:-pre x)" + "<: <: x" => "(<:-pre (<:-pre x))" "<: A where B" => "(<:-pre (where A B))" # Really for parse_where "x where \n {T}" => "(where x T)" @@ -388,6 +390,8 @@ tests = [ "x`str`" => """(macrocall @x_cmd (cmdstring-r "str"))""" "x\"\"" => """(macrocall @x_str (string-r ""))""" "x``" => """(macrocall @x_cmd (cmdstring-r ""))""" + "in\"str\"" => """(macrocall @in_str (string-r "str"))""" + "outer\"str\"" => """(macrocall @outer_str (string-r "str"))""" # Triple quoted procesing for custom strings "r\"\"\"\nx\"\"\"" => raw"""(macrocall @r_str (string-sr "x"))""" "r\"\"\"\n x\n y\"\"\"" => raw"""(macrocall @r_str (string-sr "x\n" "y"))""" diff --git a/test/parser_api.jl b/test/parser_api.jl index b6c7752c..e6f8d26c 100644 --- a/test/parser_api.jl +++ b/test/parser_api.jl @@ -90,5 +90,11 @@ @test_throws JuliaSyntax.ParseError parseshow("[a; b, c]") @test_throws JuliaSyntax.ParseError parseshow("[a; b, c]", ignore_warnings=true) @test parseshow("[a; b, c]", ignore_errors=true) == "(vcat a b (error-t) c)" + # errors in literals + 
@test parseshow("\"\\z\"", ignore_errors=true) == "(string (ErrorInvalidEscapeSequence))" + @test parseshow("'\\z'", ignore_errors=true) == "(char (ErrorInvalidEscapeSequence))" + @test parseshow("'abc'", ignore_errors=true) == "(char (ErrorOverLongCharacter))" + @test parseshow("1e1000", ignore_errors=true) == "(ErrorNumericOverflow)" + @test parseshow("1f1000", ignore_errors=true) == "(ErrorNumericOverflow)" end end diff --git a/test/tokenize.jl b/test/tokenize.jl index 4ea295c6..528ef7c1 100644 --- a/test/tokenize.jl +++ b/test/tokenize.jl @@ -921,6 +921,25 @@ end @test (t = last(collect(tokenize("+*"))); (t.startbyte, t.endbyte)) == (2,1) end +@testset "invalid UTF-8 characters" begin + bad_chars = [ + first("\xe2") # malformed + first("\xc0\x9b") # overlong + first("\xf0\x83\x99\xae") # overlong + ] + + @testset "bad char $(repr(c))" for c in bad_chars + @test Tokenize.is_identifier_char(c) == false + @test Tokenize.is_identifier_start_char(c) == false + @test Tokenize.is_never_id_char(c) == true + @test Tokenize.dotop1(c) == false + @test Tokenize.isopsuffix(c) == false + @test Tokenize.is_operator_start_char(c) == false + @test Tokenize.iswhitespace(c) == false + @test Tokenize.ishex(c) == false + end +end + @testset "dotop miscellanea" begin @test strtok("a .-> b") == ["a", " ", ".-", ">", " ", "b", ""] @test strtok(".>: b") == [".>:", " ", "b", ""] @@ -933,14 +952,4 @@ end @test strtok("a .&&₁ b") == ["a", " ", ".&&", "₁", " ", "b", ""] end -@testset "malformed strings" begin - malformed = first("\xe2") - @test Tokenize.is_identifier_char(malformed) == false - @test Tokenize.is_identifier_start_char(malformed) == false - @test Tokenize.is_never_id_char(malformed) == true - @test Tokenize.dotop1(malformed) == false - @test Tokenize.isopsuffix(malformed) == false - @test Tokenize.is_operator_start_char(malformed) == false -end - end