fix: rare decoding error on UTF-8 documents

On rare occasions, decoding UTF-8 documents caused a fatal error RSC-016 (`Invalid byte 2 of 4-byte UTF-8 sequence.`). This was likely due to a bug in the decoding component of the Xerces XML parser; see https://issues.apache.org/jira/browse/XERCESJ-1668. As a workaround, we now read documents using Java's built-in UTF-8 decoder instead of Xerces's own decoder, by creating the SAX parser's InputSource from an InputStreamReader instead of the raw InputStream. Fixes #1548
w3c · Dec 23, 2024 · aebc651 · aebc651
1 parent 4ad738c
commit aebc651
Show file tree

Hide file tree

Showing 2 changed files with 43 additions and 7 deletions.
diff --git a/src/main/java/com/adobe/epubcheck/xml/XMLEncodingSniffer.java b/src/main/java/com/adobe/epubcheck/xml/XMLEncodingSniffer.java
@@ -108,8 +108,28 @@ public static String sniffEncoding(InputStream in)
     return encoding.toUpperCase(Locale.ROOT);
   }
 
+  /**
+   * Checks if the parameter input stream starts with a UTF-8 byte order mark.
+   *
+   * @param in
+   *        an input stream; it is marked, read and reset, so it is expected to support mark/reset
+   * @return <code>true</code> if and only if a single read of the stream
+   *           yields 3 bytes that match <code>UTF8_MAGIC</code>
+   * @throws IOException if reading from or resetting the stream fails
+   */
+  public static boolean hasUTF8BOM(InputStream in)
+    throws IOException
+  {
+    byte[] buffer = new byte[3];
+    in.mark(buffer.length); // remember the current position so the peeked bytes can be restored
+    int len = in.read(buffer); // a single read; it may return fewer than 3 bytes, or -1 at end of stream
+    in.reset(); // rewind to the marked position so the caller sees the stream unchanged
+    return (len == 3 && matchesMagic(UTF8_MAGIC, buffer));
+  }
+
   private XMLEncodingSniffer()
   {
     // Not instanciable.
   }
+
 }
diff --git a/src/main/java/com/adobe/epubcheck/xml/XMLParser.java b/src/main/java/com/adobe/epubcheck/xml/XMLParser.java
@@ -25,6 +25,8 @@
 import java.io.BufferedInputStream;
 import java.io.IOException;
 import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.nio.charset.StandardCharsets;
 
 import javax.xml.parsers.SAXParser;
 import javax.xml.parsers.SAXParserFactory;
@@ -136,8 +138,8 @@ public void process()
         return;
       }
 
-      // Check encoding
-      // If the result is null, the XML parser will must parse it as UTF-8
+      // Create the InputSource based on the encoding
+      final InputSource source;
       String encoding = XMLEncodingSniffer.sniffEncoding(buffered);
       if (encoding != null && !encoding.equals("UTF-8"))
       {
@@ -158,13 +160,27 @@ public void process()
         {
           report.message(MessageId.RSC_028, EPUBLocation.of(context), encoding);
         }
+
+        // We do not set the source encoding name, but instead let the SAXParser
+        // apply its own encoding-sniffing logic, as it can report useful errors
+        // (for instance a mismatch between a BOM and the XML declaration)
+        source = new InputSource(buffered);
+
+      }
+      else
+      {
+        // Decode the UTF-8 stream with java.io instead of letting Xerces
+        // do it, to work around Xerces issue #1668
+        // (see https://issues.apache.org/jira/browse/XERCESJ-1668),
+        // skipping any UTF-8 BOM first (disallowed by that constructor)
+        if (XMLEncodingSniffer.hasUTF8BOM(buffered))
+        {
+          buffered.skip(3);
+        }
+        source = new InputSource(new InputStreamReader(buffered, StandardCharsets.UTF_8));
       }
 
-      // Build the input source
-      // We do not set the source encoding name, but instead let the SAXParser
-      // apply its own encoding-sniffing logic, as it can report useful errors
-      // (for instance a mismatch between a BOM and the XML declaration)
-      InputSource source = new InputSource(buffered);
+      // Set the source's system ID
       source.setSystemId(url.toString());
 
       // Set the error handler