Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix corner case regex colorization #678

Open
wants to merge 18 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 17 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
97 changes: 78 additions & 19 deletions pydoctor/epydoc/markup/_pyval_repr.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@
from pydoctor.epydoc.markup.restructuredtext import ParsedRstDocstring
from pydoctor.epydoc.docutils import set_node_attributes, wbr, obj_reference, new_document
from pydoctor.astutils import node2dottedname, bind_args
from pydoctor.node2stan import gettext


def decode_with_backslashreplace(s: bytes) -> str:
r"""
Expand All @@ -70,13 +72,14 @@ def decode_with_backslashreplace(s: bytes) -> str:
.encode('ascii', 'backslashreplace')
.decode('ascii'))

@attr.s(auto_attribs=True)
@attr.s(auto_attribs=True, frozen=True)
class _MarkedColorizerState:
length: int
charpos: int
lineno: int
linebreakok: bool

@attr.s(auto_attribs=True)
class _ColorizerState:
"""
An object uesd to keep track of the current state of the pyval
Expand All @@ -86,12 +89,15 @@ class _ColorizerState:
their object on a single line (setting linebreakok=False); and
then fall back on a multi-line output if that fails.
"""
def __init__(self) -> None:
self.result: List[nodes.Node] = []
self.charpos = 0
self.lineno = 1
self.linebreakok = True
self.warnings: List[str] = []
result: List[nodes.Node] = attr.ib(factory=list, init=False)
charpos:int = attr.ib(default=0, init=False)
lineno:int = attr.ib(default=1, init=False)
linebreakok:bool = attr.ib(default=False, init=False)
warnings: List[str] = attr.ib(factory=list)

# state linked to regex colorization
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These state trackers should be moved into the state object.

_regex_begins: Optional[_MarkedColorizerState] = attr.ib(default=None, init=False)
_regex_pattern: Optional[Union[str, bytes]] = attr.ib(default=None, init=False)

def mark(self) -> _MarkedColorizerState:
return _MarkedColorizerState(
Expand Down Expand Up @@ -727,7 +733,9 @@ def _colorize_ast_re(self, node:ast.Call, state: _ColorizerState) -> None:
# Make sure not to swallow control flow errors.
# Colorize the ast.Call as any other node if the pattern parsing fails.
state.restore(mark)
state.warnings.append(f"Cannot colorize regular expression, error: {str(e)}")
# We do not log the error since we know our colorizer is not perfect, no spamming for issues
# that the developpers can't fix.

self._colorize_ast_call_generic(node, state)
return

Expand Down Expand Up @@ -792,16 +800,40 @@ def _colorize_re_pattern(self, pat: AnyStr, state: _ColorizerState, prefix: AnyS
self._output(prefix, None, state)
self._output(quote, self.QUOTE_TAG, state)

if flags != sre_constants.SRE_FLAG_UNICODE:
# If developers included flags in the regex string, display them.
# By default, do not display the '(?u)'
self._colorize_re_flags(flags, state)

# Colorize it!
self._colorize_re_tree(tree.data, state, True, groups)
# init the regex specifics state
state._regex_begins = marked = state.mark()
state._regex_pattern = pat

# Close quote.
self._output(quote, self.QUOTE_TAG, state)
try:

if flags != sre_constants.SRE_FLAG_UNICODE:
# If developers included flags in the regex string, display them.
# By default, do not display the '(?u)'
# the usage of flags might cause regex to not round-trip, but that's not really bad.
self._colorize_re_flags(flags, state)

# Colorize it!
self._colorize_re_tree(tree.data, state, True, groups)

# Our regex understanding is not up to date with python's, we use python 3.6 engine.
# This causes some regular expression to be falsely interpreted, so we check if the
# colorized regex round trips, and if it doesn't, then use the default string colorization.
colorized_regex_text: Union[str, bytes] = ''.join(gettext(state.result[marked.length:]))
if isinstance(pat, bytes):
try:
assert isinstance(colorized_regex_text, str)
colorized_regex_text = bytes(colorized_regex_text, encoding='utf-8')
except Exception:
raise ValueError("cannot encode regular expression as utf-8")
if colorized_regex_text != pat:
raise ValueError("regex doesn't round-trips")

# Close quote.
self._output(quote, self.QUOTE_TAG, state)

finally:
# Close regex state
state._regex_pattern = state._regex_begins = None

def _colorize_re_flags(self, flags: int, state: _ColorizerState) -> None:
if flags:
Expand All @@ -823,6 +855,7 @@ def _colorize_re_tree(self, tree: Sequence[Tuple[sre_constants._NamedIntConstant
if op == sre_constants.LITERAL: #type:ignore[attr-defined]
c = chr(cast(int, args))
# Add any appropriate escaping.
escaping = True
if c in '.^$\\*+?{}[]|()\'':
c = '\\' + c
elif c == '\t':
Expand All @@ -840,6 +873,29 @@ def _colorize_re_tree(self, tree: Sequence[Tuple[sre_constants._NamedIntConstant
c = rb'\u%04x' % ord(c) # type:ignore[assignment]
elif (ord(c)<32 or ord(c)>=127) and ord(c) <= 65535:
c = rb'\x%02x' % ord(c) # type:ignore[assignment]
else:
escaping = False

# Maybe the developper added backslash for a caracter that doesn't need escaping
# so we check the original regex string and output a backslash if that's
# what the developper did, to keep the round-trip possible.
# This rely on the fact that we don't break regex patterns in several segments,
# so there is no '↵' in the regex pattern.
if not escaping:
if state._regex_pattern is None or state._regex_begins is None:
raise RuntimeError(f'inconsistent colorizer state: {state!r}')
# this adds a lot of computing only used for roud-tripping issues...
# but it does not seem to really impact performance
try:
outputed_re = ''.join(gettext(state.result[state._regex_begins.length:]))
current_caracter = state._regex_pattern[len(outputed_re)]
next_caracter = state._regex_pattern[len(outputed_re)+1]
except IndexError:
pass
else:
if current_caracter == '\\' and next_caracter == c:
self._output('\\', self.RE_CHAR_TAG, state)

self._output(c, self.RE_CHAR_TAG, state)

elif op == sre_constants.ANY: #type:ignore[attr-defined]
Expand Down Expand Up @@ -991,15 +1047,18 @@ def _output(self, s: AnyStr, css_class: Optional[str],
state.lineno += 1
state.charpos = 0

# segments_len = len(segments)
# remaining_segments = segments_len - 1 - i
segment_len = len(segment)

# If the segment fits on the current line, then just call
# markup to tag it, and store the result.
# Don't break links into separate segments, neither quotes.
# Don't break links into separate segments, neither quotes, neither regex patterns.
if (self.linelen is None or
state.charpos + segment_len <= self.linelen
or link is True
or css_class in ('variable-quote',)):
or css_class in ('variable-quote',)
or state._regex_pattern is not None):

state.charpos += segment_len

Expand Down
20 changes: 3 additions & 17 deletions pydoctor/epydoc/sre_parse36.py
Original file line number Diff line number Diff line change
Expand Up @@ -504,15 +504,7 @@ def _parse_sub(source, state, verbose, nested):
continue # check next one
break

# check if the branch can be replaced by a character set
for item in items:
if len(item) != 1 or item[0][0] is not LITERAL:
break
else:
# we can store this as a character set instead of a
# branch (the compiler may optimize this even more)
subpatternappend((IN, [item[0] for item in items]))
return subpattern
# pydoctor: remove all optimizations for round-tripping issues
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need to edit sre_parse at all ?
I’m definitely not sure of what I’m doing when editing this file…

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We might indeed need this to remove some optimizations, also we might want to add support for latest python regex features. So there are good reasons to have our own fork of sre_parse.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there any regex guru in twisted contributors that could give me some feedback on how to remove all opmizations of our sre_parse fork and ensure we don’t break the logic ? @adiroiban

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@adiroiban @glyph can you help me on that ?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@tristanlatr what do you need here exactly? I'm not super familiar with this code, and I am not sure what the relevant optimizations are. What would give you confidence the logic isn't broken, beyond the existing test suite?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok so basically there are several issues here. We vendor the sre_parse module from the python 3.6 standard library. First, why the 3.6 version? It is because the non capturing groups are optimized from 3.7 onwards such that the SubPattern instances cannot be converted back to the same string. Then, I suspect there are other optimizations that’ are preventing some regex to roundtrip, the changes up there is one of them. Lastly, some regex feature have been added recently to python 3.11 I believe. So we need to add support for those in our sre_parse module to provide complete colorizing for regexes.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What would give you confidence the logic isn't broken, beyond the existing test suite?

Its not broken from a regex matching perspective, but it is from a regex colorizing perspective (since the roundtrip is impossible for some cases)
And no, we just have our test suite to check that.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Its not broken from a regex matching perspective, but it is from a regex colorizing perspective (since the roundtrip is impossible for some cases)

gotcha, this makes a bit more sense. My plate is honestly kind of full right now, so I don't know how much time I can dedicate to this. Maybe ping the mailing list to see if anyone is interested? Sustainable open source infrastructure is hard :(.


subpattern.append((BRANCH, (None, items)))
return subpattern
Expand Down Expand Up @@ -619,14 +611,8 @@ def _parse(source, state, verbose, nested, first=False):
code1 = code1[1][0]
setappend(code1)

# XXX: <fl> should move set optimization to compiler!
if _len(set)==1 and set[0][0] is LITERAL:
subpatternappend(set[0]) # optimization
elif _len(set)==2 and set[0][0] is NEGATE and set[1][0] is LITERAL:
subpatternappend((NOT_LITERAL, set[1][1])) # optimization
else:
# XXX: <fl> should add charmap optimization here
subpatternappend((IN, set))
# pydoctor: remove all optimizations for round-tripping issues
subpatternappend((IN, set))

elif this in REPEAT_CHARS:
# repeat previous item
Expand Down
115 changes: 75 additions & 40 deletions pydoctor/test/epydoc/test_pyval_repr.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import ast
import sys
from textwrap import dedent
from typing import Any, Union
from typing import Any, Union, Optional
import xml.sax

import pytest
Expand Down Expand Up @@ -1216,46 +1216,57 @@ def test_ast_regex() -> None:
re.X
)\n"""

def color_re(s: Union[bytes, str],
check_roundtrip:bool=True) -> str:
def color_re(s: Union[bytes, str], *,
expect_failure:Optional[bool]=None) -> str:

colorizer = PyvalColorizer(linelen=55, maxlines=5)
val = colorizer.colorize(extract_expr(ast.parse(f"re.compile({repr(s)})")))

if check_roundtrip:
raw_text = ''.join(gettext(val.to_node()))
re_begin = 13
raw_string = True

if raw_text[11] != 'r':
# the regex has failed to be colorized since we can't find the r prefix
# meaning the string has been rendered as plaintext instead.
raw_string = False
re_begin -= 1
if isinstance(s, bytes):
re_begin += 1
re_end = -2

round_trip: Union[bytes, str] = raw_text[re_begin:re_end]
if isinstance(s, bytes):
assert isinstance(round_trip, str)
round_trip = bytes(round_trip, encoding='utf-8')
expected = s
if not raw_string:
assert isinstance(expected, str)
# we only test invalid regexes with strings currently
raw_text = ''.join(gettext(val.to_node()))
re_begin = 13
raw_string = True

if raw_text[11] != 'r':
# the regex has failed to be colorized since we can't find the r prefix
# meaning the string has been rendered as plaintext instead.
raw_string = False
re_begin -= 1

if isinstance(s, bytes):
re_begin += 1
re_end = -2

round_trip: Union[bytes, str] = raw_text[re_begin:re_end]
if isinstance(s, bytes):
assert isinstance(round_trip, str)
round_trip = bytes(round_trip, encoding='utf-8')

expected = s
if not raw_string:
if isinstance(expected, bytes):
expected = expected.replace(b'\\', b'\\\\')
else:
expected = expected.replace('\\', '\\\\')
try:
assert round_trip == expected, "%s != %s" % (repr(round_trip), repr(s))
except AssertionError as e:
if not raw_string and expect_failure is False:
raise AssertionError(f'regex colorization failed for {s!r} (and did not round-trip!), warnings: {val.warnings!r}') from e
elif raw_string and expect_failure is True:
raise AssertionError(f'regex did not did not round-trip and was expected to failed for {s!r} (but it succeeded)') from e
raise

if not raw_string and expect_failure is False:
raise AssertionError(f'regex colorization failed for {s!r}, warnings: {val.warnings!r}')
elif raw_string and expect_failure is True:
raise AssertionError(f'regex colorization was expected to failed for {s!r} (but it succeeded)')

return flatten(val.to_stan(NotFoundLinker()))[17:-8]


def test_re_literals() -> None:
# Literal characters
assert color_re(r'abc \t\r\n\f\v \xff \uffff', False) == r"""r<span class="rst-variable-quote">'</span>abc \t\r\n\f\v \xff \uffff<span class="rst-variable-quote">'</span>"""
assert color_re(r'abc \t\r\n\f\v \xff \uffff') == r"""r<span class="rst-variable-quote">'</span>abc \t\r\n\f\v \xff \uffff<span class="rst-variable-quote">'</span>"""

assert color_re(r'\.\^\$\\\*\+\?\{\}\[\]\|\(\)\'') == r"""r<span class="rst-variable-quote">'</span>\.\^\$\\\*\+\?\{\}\[\]\|\(\)\'<span class="rst-variable-quote">'</span>"""

Expand All @@ -1264,7 +1275,7 @@ def test_re_literals() -> None:

def test_re_branching() -> None:
# Branching
assert color_re(r"foo|bar") == """r<span class="rst-variable-quote">'</span>foo<span class="rst-re-op">|</span>bar<span class="rst-variable-quote">'</span>"""
assert color_re(r"foo|bar", expect_failure=False) == """r<span class="rst-variable-quote">'</span>foo<span class="rst-re-op">|</span>bar<span class="rst-variable-quote">'</span>"""

def test_re_char_classes() -> None:
# Character classes
Expand Down Expand Up @@ -1340,7 +1351,7 @@ def test_re_lookahead_behinds() -> None:
assert color_re(r"foo(?!bar)") == ("""r<span class="rst-variable-quote">'</span>foo<span class="rst-re-group">(?!</span>"""
"""bar<span class="rst-re-group">)</span><span class="rst-variable-quote">'</span>""")

assert color_re(r"(?<=bar)foo") == ("""r<span class="rst-variable-quote">'</span><span class="rst-re-group">(?&lt;=</span>"""
assert color_re(r"(?<=bar)foo", expect_failure=False) == ("""r<span class="rst-variable-quote">'</span><span class="rst-re-group">(?&lt;=</span>"""
"""bar<span class="rst-re-group">)</span>foo<span class="rst-variable-quote">'</span>""")

assert color_re(r"(?<!bar)foo") == ("""r<span class="rst-variable-quote">'</span><span class="rst-re-group">(?&lt;!</span>"""
Expand All @@ -1349,15 +1360,15 @@ def test_re_lookahead_behinds() -> None:

def test_re_flags() -> None:
# Flags
assert color_re(r"(?imu)^Food") == """r<span class="rst-variable-quote">'</span><span class="rst-re-flags">(?imu)</span>^Food<span class="rst-variable-quote">'</span>"""
assert color_re(r"(?imu)^Food", expect_failure=False) == """r<span class="rst-variable-quote">'</span><span class="rst-re-flags">(?imu)</span>^Food<span class="rst-variable-quote">'</span>"""

assert color_re(b"(?Limsx)^Food") == """rb<span class="rst-variable-quote">'</span><span class="rst-re-flags">(?Limsx)</span>^Food<span class="rst-variable-quote">'</span>"""

assert color_re(b"(?Limstx)^Food") == """rb<span class="rst-variable-quote">'</span><span class="rst-re-flags">(?Limstx)</span>^Food<span class="rst-variable-quote">'</span>"""

assert color_re(r"(?imstux)^Food") == """r<span class="rst-variable-quote">'</span><span class="rst-re-flags">(?imstux)</span>^Food<span class="rst-variable-quote">'</span>"""

assert color_re(r"(?x)This is verbose", False) == """r<span class="rst-variable-quote">'</span><span class="rst-re-flags">(?ux)</span>Thisisverbose<span class="rst-variable-quote">'</span>"""
# assert color_re(r"(?x)This is verbose") == """r<span class="rst-variable-quote">'</span><span class="rst-re-flags">(?ux)</span>Thisisverbose<span class="rst-variable-quote">'</span>"""

def test_unsupported_regex_features() -> None:
"""
Expand All @@ -1369,7 +1380,6 @@ def test_unsupported_regex_features() -> None:
regexes = ['e*+e',
'(e?){2,4}+a',
r"^(\w){1,2}+$",
# "^x{}+$", this one fails to round-trip :/
r'a++',
r'(?:ab)++',
r'(?:ab){1,3}+',
Expand All @@ -1380,17 +1390,41 @@ def test_unsupported_regex_features() -> None:
for r in regexes:
color_re(r)

def test_regex_corner_case_roudtrips() -> None:
color_re("^x{}+$", expect_failure=True)
color_re(r"(?x)This is verbose")
color_re(r'abc \t\r\n\f\v \xff \uffff')

color_re(b"^x{}+$")
color_re(rb"(?x)This is verbose")
color_re(rb'abc \t\r\n\f\v \xff \uffff')

# found in epytext.py
color_re(r'{|}', expect_failure=True)

# found in _configparser.py
color_re(r'(^\"(?:\\.|[^\"\\])*\"$)', expect_failure=False)

# found in node2stan.py
color_re(r'^(.*?)\s*<(?:URI:|URL:)?([^<>]+)>$', expect_failure=True)

# found in options.py
color_re(r'(^https?:\/\/sourceforge\.net\/)', expect_failure=False)
color_re(r'(.*)?', expect_failure=False)

def test_re_not_literal() -> None:

assert color_re(r"[^0-9]") == """r<span class="rst-variable-quote">'</span><span class="rst-re-group">[</span><span class="rst-re-op">^</span>0<span class="rst-re-op">-</span>9<span class="rst-re-group">]</span><span class="rst-variable-quote">'</span>"""

def test_re_named_groups() -> None:
# This regex triggers some weird behaviour: it adds the &crarr; element at the end where it should not be...
# The regex is 42 caracters long, so more than 40, maybe that's why?
# assert color_re(r'^<(?P<descr>.*) at (?P<addr>0x[0-9a-f]+)>$') == """"""

assert color_re(r'^<(?P<descr>.*)>$') == """r<span class="rst-variable-quote">'</span>^&lt;<span class="rst-re-group">(?P&lt;</span><span class="rst-re-ref">descr</span><span class="rst-re-group">&gt;</span>.<span class="rst-re-op">*</span><span class="rst-re-group">)</span>&gt;$<span class="rst-variable-quote">'</span>"""

@pytest.mark.xfail
def test_re_named_groups_weird() -> None:
# This regex triggers some weird behaviour: it adds the &crarr; element at the end where it should not be...
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This test should be be fixed

# The regex is 42 caracters long, re.compile(r' is 13 caracter long, the color_re function uses linelen=55.
assert '&crarr;' not in color_re(r'^<(?P<descr>.*) at (?P<addr>0x[0-9a-f]+)>$')

def test_re_multiline() -> None:

assert color(extract_expr(ast.parse(dedent(r'''re.compile(r"""\d + # the integral part
Expand Down Expand Up @@ -1505,9 +1539,10 @@ def test_crash_surrogates_not_allowed() -> None:

def test_surrogates_cars_in_re() -> None:
"""
Regex string are escaped their own way. See https://github.com/twisted/pydoctor/pull/493
Original string is used when the regex doesn't round-trips.
See https://github.com/twisted/pydoctor/pull/493 and https://github.com/twisted/pydoctor/pull/678 for later modification of the test.
"""
assert color2(extract_expr(ast.parse("re.compile('surrogates:\\udc80\\udcff')"))) == "re.compile(r'surrogates:\\udc80\\udcff')"
assert color2(extract_expr(ast.parse("re.compile('surrogates:\\udc80\\udcff')"))) == "re.compile('surrogates:\\udc80\\udcff')"

def test_repr_text() -> None:
"""Test a few representations, with a plain text version.
Expand Down