Skip to content

Commit

Permalink
Try if a custom text rule results in better performance
Browse files Browse the repository at this point in the history
  • Loading branch information
hukkin committed Dec 12, 2024
1 parent 0314b4f commit dc4f7b5
Show file tree
Hide file tree
Showing 2 changed files with 95 additions and 5 deletions.
13 changes: 8 additions & 5 deletions src/mdformat_gfm/_mdit_gfm_autolink_plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,18 @@
from markdown_it import MarkdownIt
from markdown_it.rules_inline import StateInline

from mdformat_gfm._text_inline_rule import text_rule


def gfm_autolink_plugin(md: MarkdownIt) -> None:
    """Markdown-it plugin to parse GFM autolinks.

    Registers the ``gfm_autolink`` inline rule before ``linkify`` and
    replaces the built-in ``text`` inline rule with ``text_rule``.
    """
    md.inline.ruler.before("linkify", "gfm_autolink", gfm_autolink)

    # The default "text" inline rule will skip starting characters of GFM
    # autolinks. It can be disabled, but that is disastrous for performance.
    # Instead, we replace it with a custom "text" inline rule that yields at
    # locations that can potentially be the beginning of a GFM autolink.
    #
    # NOTE: the "text" rule must not be disabled before this call; the
    # replacement is only useful while the rule stays enabled.
    md.inline.ruler.at("text", text_rule)


# A string that matches this must still be invalidated if it ends with "_" or "-"
Expand Down
87 changes: 87 additions & 0 deletions src/mdformat_gfm/_text_inline_rule.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
"""A replacement for the "text" inline rule in markdown-it.
The default "text" rule will skip until the next character in
`_TerminatorChars` is found. This extends the set of termination points
to those that can potentially be the beginning of a GFM autolink. The
GFM autolink plugin also works with "text" inline rule disabled, but
this should (at least partially) bring back the performance boost that
"text" inline rule provides.
"""

import re

from markdown_it.rules_inline import StateInline

# Whitespace characters as understood by the GFM autolink handling.
GFM_WHITESPACE = frozenset({" ", "\t", "\n", "\v", "\f", "\r"})
# Characters that may directly precede a GFM autolink.
BEFORE_VALID_AUTOLINK_CHARS = GFM_WHITESPACE.union("*_~(")

# The default set of terminator characters
_TerminatorChars = set("\n!#$%&*+-:<=>@[\\]^_`{}~")

# Character class matching any single default terminator character.
_default_terminator = f"[{re.escape(''.join(_TerminatorChars))}]"

# Non-capturing group matching text that may start a GFM autolink.
_gfm_autolink_terminator = (
    "(?:"
    + "|".join(
        (
            r"www\.",
            "http",
            "mailto:",
            "xmpp:",
            r"[a-zA-Z0-9._+-]+@",
        )
    )
    + ")"
)

# Character class matching any single character allowed before an autolink.
_before_autolink_terminator = (
    f"[{re.escape(''.join(BEFORE_VALID_AUTOLINK_CHARS))}]"
)

# Used at position zero, where there is no preceding character to inspect.
_RE_TERMINATOR_FIRST_CHAR = re.compile(
    f"{_default_terminator}|{_gfm_autolink_terminator}"
)

# Used everywhere else. Both alternatives consume exactly one character
# before the termination point, so the termination point is always
# `match.start() + 1`.
_RE_TERMINATOR_NON_FIRST_CHAR = re.compile(
    "|".join(
        (
            # match any character (also newline), then a default terminator
            r"(?s:.)" + _default_terminator,
            # a valid "before autolink" character, then an autolink start
            _before_autolink_terminator + _gfm_autolink_terminator,
        )
    )
)


def text_rule(state: StateInline, silent: bool) -> bool:
    """Consume plain text up to the next potential rule starting point.

    Starting at ``state.pos``, advance to the next position in
    ``state.src`` that is either a default terminator character or a
    possible beginning of a GFM autolink, never going past
    ``state.posMax``.

    :param state: inline parser state; ``src``, ``pos``, ``posMax`` and
        ``pending`` are read, and ``pos``/``pending`` are updated.
    :param silent: when true, only advance ``state.pos`` without
        appending the skipped text to ``state.pending``.
    :return: ``True`` if at least one character was consumed, ``False``
        if the current position is itself a termination point.
    """
    start = state.pos
    pos = start

    # Handle the special case where there is no preceding character.
    if not pos:
        if _RE_TERMINATOR_FIRST_CHAR.match(state.src):
            return False
        pos = 1

    # `pos` is now at least one, so the regex can also inspect the
    # character that precedes a candidate termination point. Every match
    # starts one character before the termination point.
    terminator_match = _RE_TERMINATOR_NON_FIRST_CHAR.search(state.src, pos - 1)
    if terminator_match:
        # The search runs over the whole source string, so clamp the
        # result to keep the rule inside the range being tokenized.
        pos = min(terminator_match.start() + 1, state.posMax)
    else:
        pos = state.posMax

    if pos == start:
        return False

    if not silent:
        state.pending += state.src[start:pos]

    state.pos = pos

    return True

0 comments on commit dc4f7b5

Please sign in to comment.