Fix Tokenizer::int64() parsing of "0" when guessing base (#1842)

Known bug victims in current code were tcp_outgoing_mark, mark_client_packet, clientside_mark, and mark_client_connection directives as well as client_connection_mark and (deprecated) clientside_mark ACLs if they were configured to match a zero mark using "0" or "0/..." syntax: ERROR: configuration failure: NfMarkConfig: invalid value '0/10'... exception location: NfMarkConfig.cc(23) getNfmark Probably broken since 2014 commit 01f2137.
squid-cache · Jul 7, 2024 · 8cc5e88 · 8cc5e88
1 parent 314e430
commit 8cc5e88
Show file tree

Hide file tree

Showing 2 changed files with 309 additions and 1 deletion.
diff --git a/src/parser/Tokenizer.cc b/src/parser/Tokenizer.cc
@@ -264,7 +264,6 @@ Parser::Tokenizer::int64(int64_t & result, int base, bool allowSign, const SBuf:
     if (base == 0) {
         if ( *s == '0') {
             base = 8;
-            ++s;
         } else {
             base = 10;
         }

diff --git a/src/tests/testTokenizer.cc b/src/tests/testTokenizer.cc
@@ -248,6 +248,285 @@ TestTokenizer::testTokenizerInt64()
         CPPUNIT_ASSERT(t.buf().isEmpty());
     }
 
+    // When interpreting octal numbers, standard strtol() and Tokenizer::int64()
+    // treat leading zero as a part of sequence of digits rather than a
+    // character used _exclusively_ as base indicator. Thus, it is not possible
+    // to create an invalid octal number with an explicit octal base -- the
+    // first invalid character after the base will be successfully ignored. This
+    // treatment also makes it difficult to define "shortest valid octal input".
+    // Here, we are just enumerating interesting "short input" octal cases in
+    // four dimensions:
+    // 1. int64(base) argument: forced or auto-detected;
+    // 2. base character ("0") in input: absent or present;
+    // 3. post-base digits in input: absent, valid, or invalid;
+    // 4. input length limits via int64(length) argument: unlimited or limited.
+
+    // forced base; input: no base, no post-base digits, unlimited
+    {
+        int64_t rv;
+        Parser::Tokenizer t(SBuf(""));
+        CPPUNIT_ASSERT(!t.int64(rv, 8));
+        CPPUNIT_ASSERT_EQUAL(SBuf(""), t.buf());
+    }
+
+    // forced base; input: no base, no post-base digits, limited
+    {
+        int64_t rv;
+        Parser::Tokenizer t(SBuf("7"));
+        CPPUNIT_ASSERT(!t.int64(rv, 8, false, 0));
+        CPPUNIT_ASSERT_EQUAL(SBuf("7"), t.buf());
+    }
+
+    // forced base; input: no base, one valid post-base digit, unlimited
+    {
+        int64_t rv;
+        Parser::Tokenizer t(SBuf("4"));
+        const int64_t benchmark = 04;
+        CPPUNIT_ASSERT(t.int64(rv, 8));
+        CPPUNIT_ASSERT_EQUAL(benchmark, rv);
+        CPPUNIT_ASSERT_EQUAL(SBuf(""), t.buf());
+    }
+
+    // forced base; input: no base, one valid post-base digit, limited
+    {
+        int64_t rv;
+        Parser::Tokenizer t(SBuf("46"));
+        const int64_t benchmark = 04;
+        CPPUNIT_ASSERT(t.int64(rv, 8, false, 1));
+        CPPUNIT_ASSERT_EQUAL(benchmark, rv);
+        CPPUNIT_ASSERT_EQUAL(SBuf("6"), t.buf());
+    }
+
+    // forced base; input: no base, one invalid post-base digit, unlimited
+    {
+        int64_t rv;
+        Parser::Tokenizer t(SBuf("8"));
+        CPPUNIT_ASSERT(!t.int64(rv, 8));
+        CPPUNIT_ASSERT_EQUAL(SBuf("8"), t.buf());
+    }
+
+    // forced base; input: no base, one invalid post-base digit, limited
+    {
+        int64_t rv;
+        Parser::Tokenizer t(SBuf("80"));
+        CPPUNIT_ASSERT(!t.int64(rv, 8, false, 1));
+        CPPUNIT_ASSERT_EQUAL(SBuf("80"), t.buf());
+    }
+
+    // repeat the above six octal cases, but now with base character in input
+
+    // forced base; input: base, no post-base digits, unlimited
+    {
+        int64_t rv;
+        Parser::Tokenizer t(SBuf("0"));
+        const int64_t benchmark = 0;
+        CPPUNIT_ASSERT(t.int64(rv, 8));
+        CPPUNIT_ASSERT_EQUAL(benchmark, rv);
+        CPPUNIT_ASSERT_EQUAL(SBuf(""), t.buf());
+    }
+
+    // forced base; input: base, no post-base digits, limited
+    {
+        int64_t rv;
+        Parser::Tokenizer t(SBuf("07"));
+        const int64_t benchmark = 0;
+        CPPUNIT_ASSERT(t.int64(rv, 8, false, 1));
+        CPPUNIT_ASSERT_EQUAL(benchmark, rv);
+        CPPUNIT_ASSERT_EQUAL(SBuf("7"), t.buf());
+    }
+
+    // forced base; input: base, one valid post-base digit, unlimited
+    {
+        int64_t rv;
+        Parser::Tokenizer t(SBuf("04"));
+        const int64_t benchmark = 04;
+        CPPUNIT_ASSERT(t.int64(rv, 8));
+        CPPUNIT_ASSERT_EQUAL(benchmark, rv);
+        CPPUNIT_ASSERT_EQUAL(SBuf(""), t.buf());
+    }
+
+    // forced base; input: base, one valid post-base digit, limited
+    {
+        int64_t rv;
+        Parser::Tokenizer t(SBuf("046"));
+        const int64_t benchmark = 04;
+        CPPUNIT_ASSERT(t.int64(rv, 8, false, 2));
+        CPPUNIT_ASSERT_EQUAL(benchmark, rv);
+        CPPUNIT_ASSERT_EQUAL(SBuf("6"), t.buf());
+    }
+
+    // forced base; input: base, one invalid post-base digit, unlimited
+    {
+        int64_t rv;
+        Parser::Tokenizer t(SBuf("08"));
+        const int64_t benchmark = 00;
+        CPPUNIT_ASSERT(t.int64(rv, 8));
+        CPPUNIT_ASSERT_EQUAL(benchmark, rv);
+        CPPUNIT_ASSERT_EQUAL(SBuf("8"), t.buf());
+    }
+
+    // forced base; input: base, one invalid post-base digit, limited
+    {
+        int64_t rv;
+        Parser::Tokenizer t(SBuf("08"));
+        const int64_t benchmark = 00;
+        CPPUNIT_ASSERT(t.int64(rv, 8, false, 2));
+        CPPUNIT_ASSERT_EQUAL(benchmark, rv);
+        CPPUNIT_ASSERT_EQUAL(SBuf("8"), t.buf());
+    }
+
+    // And now repeat six "with base character in input" octal cases but with
+    // auto-detected base. When octal cases below say "auto-detected base", they
+    // describe int64() base=0 parameter value. Current int64() implementation
+    // does auto-detect base as octal in all of these cases, but that might
+    // change, and some of these cases (e.g., "0") can also be viewed as a
+    // non-octal input case as well. These cases do not attempt to test base
+    // detection. They focus on other potential problems.
+
+    // auto-detected base; input: base, no post-base digits, unlimited
+    {
+        int64_t rv;
+        Parser::Tokenizer t(SBuf("0"));
+        const int64_t benchmark = 00;
+        CPPUNIT_ASSERT(t.int64(rv, 0));
+        CPPUNIT_ASSERT_EQUAL(benchmark, rv);
+        CPPUNIT_ASSERT_EQUAL(SBuf(""), t.buf());
+    }
+
+    // auto-detected base; input: base, no post-base digits, limited
+    {
+        int64_t rv;
+        Parser::Tokenizer t(SBuf("07"));
+        const int64_t benchmark = 0;
+        CPPUNIT_ASSERT(t.int64(rv, 0, false, 1));
+        CPPUNIT_ASSERT_EQUAL(benchmark, rv);
+        CPPUNIT_ASSERT_EQUAL(SBuf("7"), t.buf());
+    }
+
+    // auto-detected base; input: base, one valid post-base digit, unlimited
+    {
+        int64_t rv;
+        Parser::Tokenizer t(SBuf("04"));
+        const int64_t benchmark = 04;
+        CPPUNIT_ASSERT(t.int64(rv, 0));
+        CPPUNIT_ASSERT_EQUAL(benchmark, rv);
+        CPPUNIT_ASSERT_EQUAL(SBuf(""), t.buf());
+    }
+
+    // auto-detected base; input: base, one valid post-base digit, limited
+    {
+        int64_t rv;
+        Parser::Tokenizer t(SBuf("046"));
+        const int64_t benchmark = 04;
+        CPPUNIT_ASSERT(t.int64(rv, 0, false, 2));
+        CPPUNIT_ASSERT_EQUAL(benchmark, rv);
+        CPPUNIT_ASSERT_EQUAL(SBuf("6"), t.buf());
+    }
+
+    // auto-detected base; input: base, one invalid post-base digit, unlimited
+    {
+        int64_t rv;
+        Parser::Tokenizer t(SBuf("08"));
+        const int64_t benchmark = 00;
+        CPPUNIT_ASSERT(t.int64(rv, 0));
+        CPPUNIT_ASSERT_EQUAL(benchmark, rv);
+        CPPUNIT_ASSERT_EQUAL(SBuf("8"), t.buf());
+    }
+
+    // auto-detected base; input: base, one invalid post-base digit, limited
+    {
+        int64_t rv;
+        Parser::Tokenizer t(SBuf("08"));
+        const int64_t benchmark = 00;
+        CPPUNIT_ASSERT(t.int64(rv, 0, false, 2));
+        CPPUNIT_ASSERT_EQUAL(benchmark, rv);
+        CPPUNIT_ASSERT_EQUAL(SBuf("8"), t.buf());
+    }
+
+    // this ends four-dimensional enumeration of octal cases described earlier
+
+    // check octal base auto-detection
+    {
+        int64_t rv;
+        Parser::Tokenizer t(SBuf("0128"));
+        const int64_t benchmark = 012;
+        CPPUNIT_ASSERT(t.int64(rv, 0));
+        CPPUNIT_ASSERT_EQUAL(benchmark, rv);
+        CPPUNIT_ASSERT_EQUAL(SBuf("8"), t.buf());
+    }
+
+    // check that octal base auto-detection is not confused by repeated zeros
+    {
+        int64_t rv;
+        Parser::Tokenizer t(SBuf("00000000071"));
+        const int64_t benchmark = 00000000071;
+        CPPUNIT_ASSERT(t.int64(rv));
+        CPPUNIT_ASSERT_EQUAL(benchmark,rv);
+        CPPUNIT_ASSERT_EQUAL(SBuf(""), t.buf());
+    }
+
+    // check that forced octal base is not confused by hex prefix
+    {
+        int64_t rv;
+        Parser::Tokenizer t(SBuf("0x5"));
+        const int64_t benchmark = 0;
+        CPPUNIT_ASSERT(t.int64(rv, 8));
+        CPPUNIT_ASSERT_EQUAL(benchmark, rv);
+        CPPUNIT_ASSERT_EQUAL(SBuf("x5"), t.buf());
+    }
+
+    // autodetect decimal base in shortest valid input
+    {
+        int64_t rv;
+        Parser::Tokenizer t(SBuf("1"));
+        const int64_t benchmark = 1;
+        CPPUNIT_ASSERT(t.int64(rv));
+        CPPUNIT_ASSERT_EQUAL(benchmark,rv);
+        CPPUNIT_ASSERT(t.buf().isEmpty());
+    }
+
+    // autodetect hex base in shortest valid input
+    {
+        int64_t rv;
+        Parser::Tokenizer t(SBuf("0X1"));
+        const int64_t benchmark = 0X1;
+        CPPUNIT_ASSERT(t.int64(rv));
+        CPPUNIT_ASSERT_EQUAL(benchmark,rv);
+        CPPUNIT_ASSERT(t.buf().isEmpty());
+    }
+
+    // invalid (when autodetecting base) input matching hex base
+    {
+        int64_t rv;
+        Parser::Tokenizer t(SBuf("0x"));
+        CPPUNIT_ASSERT(!t.int64(rv));
+        CPPUNIT_ASSERT_EQUAL(SBuf("0x"), t.buf());
+    }
+
+    // invalid (when forcing hex base) input matching hex base
+    {
+        int64_t rv;
+        Parser::Tokenizer t(SBuf("0x"));
+        CPPUNIT_ASSERT(!t.int64(rv, 16));
+        CPPUNIT_ASSERT_EQUAL(SBuf("0x"), t.buf());
+    }
+
+    // invalid (when autodetecting base and limiting) input matching hex base
+    {
+        int64_t rv;
+        Parser::Tokenizer t(SBuf("0x2"));
+        CPPUNIT_ASSERT(!t.int64(rv, 0, true, 2));
+        CPPUNIT_ASSERT_EQUAL(SBuf("0x2"), t.buf());
+    }
+
+    // invalid (when forcing hex base and limiting) input matching hex base
+    {
+        int64_t rv;
+        Parser::Tokenizer t(SBuf("0x3"));
+        CPPUNIT_ASSERT(!t.int64(rv, 16, false, 2));
+        CPPUNIT_ASSERT_EQUAL(SBuf("0x3"), t.buf());
+    }
+
     // API mismatch: don't eat leading space
     {
         int64_t rv;
@@ -264,6 +543,36 @@ TestTokenizer::testTokenizerInt64()
         CPPUNIT_ASSERT_EQUAL(SBuf("  1234"), t.buf());
     }
 
+    // zero corner case: repeated zeros
+    {
+        int64_t rv;
+        Parser::Tokenizer t(SBuf("00"));
+        const int64_t benchmark = 00;
+        CPPUNIT_ASSERT(t.int64(rv));
+        CPPUNIT_ASSERT_EQUAL(benchmark,rv);
+        CPPUNIT_ASSERT_EQUAL(SBuf(""), t.buf());
+    }
+
+    // zero corner case: "positive" zero
+    {
+        int64_t rv;
+        Parser::Tokenizer t(SBuf("+0"));
+        const int64_t benchmark = +0;
+        CPPUNIT_ASSERT(t.int64(rv));
+        CPPUNIT_ASSERT_EQUAL(benchmark,rv);
+        CPPUNIT_ASSERT_EQUAL(SBuf(""), t.buf());
+    }
+
+    // zero corner case: "negative" zero
+    {
+        int64_t rv;
+        Parser::Tokenizer t(SBuf("-0"));
+        const int64_t benchmark = -0;
+        CPPUNIT_ASSERT(t.int64(rv));
+        CPPUNIT_ASSERT_EQUAL(benchmark,rv);
+        CPPUNIT_ASSERT_EQUAL(SBuf(""), t.buf());
+    }
+
     // trailing spaces
     {
         int64_t rv;