Try if a custom text rule results in better performance

hukkin · Dec 12, 2024 · dc4f7b5 · dc4f7b5
1 parent 0314b4f
commit dc4f7b5
Show file tree

Hide file tree

Showing 2 changed files with 95 additions and 5 deletions.
diff --git a/src/mdformat_gfm/_mdit_gfm_autolink_plugin.py b/src/mdformat_gfm/_mdit_gfm_autolink_plugin.py
@@ -3,15 +3,18 @@
 from markdown_it import MarkdownIt
 from markdown_it.rules_inline import StateInline
 
+from mdformat_gfm._text_inline_rule import text_rule
+
 
 def gfm_autolink_plugin(md: MarkdownIt) -> None:
     """Markdown-it plugin to parse GFM autolinks."""
+    # Register the "gfm_autolink" inline rule ahead of the "linkify" rule.
     md.inline.ruler.before("linkify", "gfm_autolink", gfm_autolink)
-    # "text" inline rule will skip "www." prefixed links, so needs to be
-    # disabled. This is probably disastrous for performance. An alternative, I think,
-    # would be to override the "text" inline rule with one that stops at a "."
-    # prefixed by "www".
-    md.inline.ruler.disable("text")
+
+    # The default "text" inline rule will skip starting characters of GFM
+    # autolinks. It can be disabled, but that is disastrous for performance.
+    # Instead, we replace it with a custom "text" inline rule that yields at
+    # locations that can potentially be the beginning of a GFM autolink.
+    md.inline.ruler.at("text", text_rule)
 
 
 # A string that matches this must still be invalidated if it ends with "_" or "-"

diff --git a/src/mdformat_gfm/_text_inline_rule.py b/src/mdformat_gfm/_text_inline_rule.py
@@ -0,0 +1,87 @@
+"""A replacement for the "text" inline rule in markdown-it.
+
+The default "text" rule will skip until the next character in
+`_TerminatorChars` is found. This extends the set of termination points
+to those that can potentially be the beginning of a GFM autolink. The
+GFM autolink plugin also works with "text" inline rule disabled, but
+this should (at least partially) bring back the performance boost that
+"text" inline rule provides.
+"""
+
+import re
+
+from markdown_it.rules_inline import StateInline
+
+# Space, tab, newline, vertical tab, form feed and carriage return.
+GFM_WHITESPACE = frozenset(" \t\n\v\f\r")
+# Characters that, when directly followed by an autolink prefix, make the
+# custom "text" rule stop (see `_RE_TERMINATOR_NON_FIRST_CHAR` below).
+BEFORE_VALID_AUTOLINK_CHARS = GFM_WHITESPACE | {"*", "_", "~", "("}
+
+# The default set of terminator characters
+_TerminatorChars = {
+    "\n",
+    "!",
+    "#",
+    "$",
+    "%",
+    "&",
+    "*",
+    "+",
+    "-",
+    ":",
+    "<",
+    "=",
+    ">",
+    "@",
+    "[",
+    "\\",
+    "]",
+    "^",
+    "_",
+    "`",
+    "{",
+    "}",
+    "~",
+}
+
+# Regex source: a character class matching any single default terminator.
+# The characters are escaped, so their order within the class is irrelevant.
+_default_terminator = "[" + re.escape("".join(_TerminatorChars)) + "]"
+# Regex source: text that can begin an autolink candidate, i.e. "www.",
+# "http", "mailto:", "xmpp:", or a run of [a-zA-Z0-9._+-] followed by "@".
+_gfm_autolink_terminator = (
+    r"(?:" r"www\." "|" "http" "|" "mailto:" "|" "xmpp:" "|" r"[a-zA-Z0-9._+-]+@" r")"
+)
+# Regex source: a character class matching any single character of
+# `BEFORE_VALID_AUTOLINK_CHARS`.
+_before_autolink_terminator = (
+    "[" + re.escape("".join(BEFORE_VALID_AUTOLINK_CHARS)) + "]"
+)
+
+# Used by `text_rule` at position 0, where there is no preceding character:
+# matches a default terminator or an autolink prefix.
+_RE_TERMINATOR_FIRST_CHAR = re.compile(
+    _default_terminator + "|" + _gfm_autolink_terminator
+)
+# Used by `text_rule` everywhere else. Each alternative consumes exactly one
+# character before the actual termination point (any character before a
+# default terminator, or a "before autolink" character before an autolink
+# prefix), so the termination point is always `match.start() + 1`.
+_RE_TERMINATOR_NON_FIRST_CHAR = re.compile(
+    r"(?s:.)"  # match any character (also newline)
+    + _default_terminator
+    + "|"
+    + _before_autolink_terminator
+    + _gfm_autolink_terminator
+)
+
+
+def text_rule(state: StateInline, silent: bool) -> bool:
+    pos = state.pos
+
+    if not pos:
+        if _RE_TERMINATOR_FIRST_CHAR.match(state.src):
+            return False
+        pos += 1
+
+    terminator_match = _RE_TERMINATOR_NON_FIRST_CHAR.search(state.src, pos - 1)
+    if terminator_match:
+        pos = terminator_match.start() + 1
+    else:
+        pos = state.posMax
+
+    if pos == state.pos:
+        return False
+
+    if not silent:
+        state.pending += state.src[state.pos : pos]
+
+    state.pos = pos
+
+    return True