from markdown_it import MarkdownIt
from markdown_it.rules_inline import StateInline

from mdformat_gfm._text_inline_rule import text_rule


def gfm_autolink_plugin(md: MarkdownIt) -> None:
    """Enable GFM autolink parsing on the given markdown-it parser.

    Registers the "gfm_autolink" inline rule ahead of "linkify" and
    substitutes `text_rule` for the stock "text" inline rule.
    """
    inline_ruler = md.inline.ruler
    inline_ruler.before("linkify", "gfm_autolink", gfm_autolink)

    # The stock "text" inline rule skips over the starting characters of GFM
    # autolinks. Disabling it would work, but that is disastrous for
    # performance. It is therefore replaced with a custom "text" rule that
    # yields at every location that could be the beginning of a GFM autolink.
    inline_ruler.at("text", text_rule)


# A string that matches this must still be invalidated if it ends with "_" or "-"
"""A replacement for the "text" inline rule in markdown-it.

The default "text" rule will skip until the next character in
`_TerminatorChars` is found. This extends the set of termination points
to those that can potentially be the beginning of a GFM autolink. The
GFM autolink plugin also works with "text" inline rule disabled, but
this should (at least partially) bring back the performance boost that
"text" inline rule provides.
"""

import re
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Only needed for the annotation on `text_rule`.
    from markdown_it.rules_inline import StateInline

GFM_WHITESPACE = frozenset(" \t\n\v\f\r")
# Characters after which a GFM autolink may start.
BEFORE_VALID_AUTOLINK_CHARS = GFM_WHITESPACE | {"*", "_", "~", "("}

# The default set of terminator characters
_TerminatorChars = {
    "\n",
    "!",
    "#",
    "$",
    "%",
    "&",
    "*",
    "+",
    "-",
    ":",
    "<",
    "=",
    ">",
    "@",
    "[",
    "\\",
    "]",
    "^",
    "_",
    "`",
    "{",
    "}",
    "~",
}

# Character class matching any single default terminator character.
_default_terminator = "[" + re.escape("".join(_TerminatorChars)) + "]"
# Prefixes that can potentially begin a GFM autolink.
_gfm_autolink_terminator = (
    r"(?:" r"www\." "|" "http" "|" "mailto:" "|" "xmpp:" "|" r"[a-zA-Z0-9._+-]+@" r")"
)
# Character class matching any single character that may precede an autolink.
_before_autolink_terminator = (
    "[" + re.escape("".join(BEFORE_VALID_AUTOLINK_CHARS)) + "]"
)

# Used at offset 0, where there is no preceding character to inspect.
_RE_TERMINATOR_FIRST_CHAR = re.compile(
    _default_terminator + "|" + _gfm_autolink_terminator
)
# Used everywhere else. Both alternatives consume exactly one character
# *before* the stop position, so the stop position is `match.start() + 1`.
_RE_TERMINATOR_NON_FIRST_CHAR = re.compile(
    r"(?s:.)"  # match any character (also newline)
    + _default_terminator
    + "|"
    + _before_autolink_terminator
    + _gfm_autolink_terminator
)


def text_rule(state: "StateInline", silent: bool) -> bool:
    """Consume plain text up to the next possible inline construct.

    Scanning stops at the next default terminator character, at the next
    position where a GFM autolink prefix follows one of
    `BEFORE_VALID_AUTOLINK_CHARS`, or at `state.posMax`, whichever comes
    first.

    Args:
        state: Inline parser state. `src`, `pos` and `posMax` are read;
            `pos` is advanced and, unless `silent`, the consumed text is
            appended to `pending`.
        silent: If true, advance `pos` without modifying `pending`.

    Returns:
        True if at least one character was consumed, otherwise False (in
        which case `state` is left untouched).
    """
    start = state.pos
    stop = start

    # Offset 0 has no preceding character, so the one-character-behind
    # search below cannot be used for it; test it with a dedicated pattern.
    if not start:
        if _RE_TERMINATOR_FIRST_CHAR.match(state.src):
            return False
        stop = 1

    # `stop` is now at least 1, so begin one character earlier to let the
    # pattern consume the character preceding the candidate stop position.
    terminator_match = _RE_TERMINATOR_NON_FIRST_CHAR.search(state.src, stop - 1)
    if terminator_match:
        # The search runs over all of `state.src`, so a hit may lie beyond
        # `state.posMax`. Never advance past it, same as the no-match branch.
        stop = min(terminator_match.start() + 1, state.posMax)
    else:
        stop = state.posMax

    if stop == start:
        return False

    if not silent:
        state.pending += state.src[start:stop]

    state.pos = stop

    return True