twisted · tristanlatr · Jan 23, 2023 · Jan 23, 2023 · Jan 23, 2023 · Jan 23, 2023
diff --git a/pydoctor/epydoc/markup/_pyval_repr.py b/pydoctor/epydoc/markup/_pyval_repr.py
@@ -52,6 +52,8 @@
 from pydoctor.epydoc.markup.restructuredtext import ParsedRstDocstring
 from pydoctor.epydoc.docutils import set_node_attributes, wbr, obj_reference, new_document
 from pydoctor.astutils import node2dottedname, bind_args
+from pydoctor.node2stan import gettext
+
 
 def decode_with_backslashreplace(s: bytes) -> str:
     r"""
@@ -70,13 +72,14 @@ def decode_with_backslashreplace(s: bytes) -> str:
             .encode('ascii', 'backslashreplace')
             .decode('ascii'))
 
-@attr.s(auto_attribs=True)
+@attr.s(auto_attribs=True, frozen=True)
 class _MarkedColorizerState:
     length: int
     charpos: int
     lineno: int
     linebreakok: bool
 
+@attr.s(auto_attribs=True)
 class _ColorizerState:
     """
     An object uesd to keep track of the current state of the pyval
@@ -86,12 +89,15 @@ class _ColorizerState:
     their object on a single line (setting linebreakok=False); and
     then fall back on a multi-line output if that fails.  
     """
-    def __init__(self) -> None:
-        self.result: List[nodes.Node] = []
-        self.charpos = 0
-        self.lineno = 1
-        self.linebreakok = True
-        self.warnings: List[str] = []
+    result: List[nodes.Node] = attr.ib(factory=list, init=False)
+    charpos: int = attr.ib(default=0, init=False)
+    lineno: int = attr.ib(default=1, init=False)
+    linebreakok: bool = attr.ib(default=True, init=False)  # callers set this to False to attempt single-line output
+    warnings: List[str] = attr.ib(factory=list)
+
+    # state linked to regex colorization
+    _regex_begins: Optional[_MarkedColorizerState] = attr.ib(default=None, init=False)
+    _regex_pattern: Optional[Union[str, bytes]] = attr.ib(default=None, init=False)
 
     def mark(self) -> _MarkedColorizerState:
         return _MarkedColorizerState(
@@ -727,7 +733,9 @@ def _colorize_ast_re(self, node:ast.Call, state: _ColorizerState) -> None:
             # Make sure not to swallow control flow errors.
             # Colorize the ast.Call as any other node if the pattern parsing fails.
             state.restore(mark)
-            state.warnings.append(f"Cannot colorize regular expression, error: {str(e)}")
+            # We do not log the error since we know our colorizer is not perfect: no spamming about issues
+            # that the developers can't fix.
+
             self._colorize_ast_call_generic(node, state)
             return
 
@@ -792,16 +800,40 @@ def _colorize_re_pattern(self, pat: AnyStr, state: _ColorizerState, prefix: AnyS
         self._output(prefix, None, state)
         self._output(quote, self.QUOTE_TAG, state)
 
-        if flags != sre_constants.SRE_FLAG_UNICODE:
-            # If developers included flags in the regex string, display them.
-            # By default, do not display the '(?u)'
-            self._colorize_re_flags(flags, state)
-
-        # Colorize it!
-        self._colorize_re_tree(tree.data, state, True, groups)
+        # init the regex specifics state
+        state._regex_begins = marked = state.mark()
+        state._regex_pattern = pat
 
-        # Close quote.
-        self._output(quote, self.QUOTE_TAG, state)
+        try:
+
+            if flags != sre_constants.SRE_FLAG_UNICODE:
+                # If developers included flags in the regex string, display them.
+                # By default, do not display the '(?u)'
+                # the usage of flags might cause regex to not round-trip, but that's not really bad.
+                self._colorize_re_flags(flags, state)
+
+            # Colorize it!
+            self._colorize_re_tree(tree.data, state, True, groups)
+
+            # Our regex understanding is not up to date with python's, we use python 3.6 engine.
+            # This causes some regular expressions to be falsely interpreted, so we check if the
+            # colorized regex round-trips, and if it doesn't, then use the default string colorization.
+            colorized_regex_text: Union[str, bytes] = ''.join(gettext(state.result[marked.length:]))
+            if isinstance(pat, bytes):
+                try:
+                    assert isinstance(colorized_regex_text, str)
+                    colorized_regex_text = bytes(colorized_regex_text, encoding='utf-8')
+                except Exception:
+                    raise ValueError("cannot encode regular expression as utf-8")
+            if colorized_regex_text != pat:
+                raise ValueError("regex doesn't round-trips")
+
+            # Close quote.
+            self._output(quote, self.QUOTE_TAG, state)
+
+        finally:
+            # Close regex state
+            state._regex_pattern = state._regex_begins = None
 
     def _colorize_re_flags(self, flags: int, state: _ColorizerState) -> None:
         if flags:
@@ -823,6 +855,7 @@ def _colorize_re_tree(self, tree: Sequence[Tuple[sre_constants._NamedIntConstant
             if op == sre_constants.LITERAL: #type:ignore[attr-defined]
                 c = chr(cast(int, args))
                 # Add any appropriate escaping.
+                escaping = True
                 if c in '.^$\\*+?{}[]|()\'': 
                     c = '\\' + c
                 elif c == '\t': 
@@ -840,6 +873,29 @@ def _colorize_re_tree(self, tree: Sequence[Tuple[sre_constants._NamedIntConstant
                    c = rb'\u%04x' % ord(c) # type:ignore[assignment]
                 elif (ord(c)<32 or ord(c)>=127) and ord(c) <= 65535: 
                     c = rb'\x%02x' % ord(c) # type:ignore[assignment]
+                else:
+                    escaping = False
+
+                # The developer may have added a backslash before a character that doesn't need
+                # escaping, so we check the original regex string and output a backslash if that's
+                # what the developer did, to keep the round-trip possible.
+                # This relies on the fact that we don't break regex patterns into several segments,
+                # so there is no '↵' in the regex pattern.
+                if not escaping:
+                    if state._regex_pattern is None or state._regex_begins is None:
+                        raise RuntimeError(f'inconsistent colorizer state: {state!r}')
+                    # This re-reads the text emitted so far for every unescaped literal; it is
+                    # only needed for round-tripping and does not seem to really impact performance.
+                    outputed_re = ''.join(gettext(state.result[state._regex_begins.length:]))
+                    # Slice instead of indexing: indexing bytes yields ints, which never compare
+                    # equal to a str, and a slice past the end is just shorter (no IndexError).
+                    upcoming = state._regex_pattern[len(outputed_re):len(outputed_re) + 2]
+                    if isinstance(upcoming, bytes):
+                        # latin-1 maps each byte to the code point chr() produced for c above.
+                        upcoming = upcoming.decode('latin-1')
+                    if upcoming == '\\' + c:
+                        self._output('\\', self.RE_CHAR_TAG, state)
+
                 self._output(c, self.RE_CHAR_TAG, state)
 
             elif op == sre_constants.ANY: #type:ignore[attr-defined]
@@ -991,15 +1047,18 @@ def _output(self, s: AnyStr, css_class: Optional[str],
                 state.lineno += 1
                 state.charpos = 0
 
+            # segments_len = len(segments)
+            # remaining_segments = segments_len - 1 - i
             segment_len = len(segment) 
 
             # If the segment fits on the current line, then just call
             # markup to tag it, and store the result.
-            # Don't break links into separate segments, neither quotes.
+            # Don't break links into separate segments, neither quotes, neither regex patterns.
             if (self.linelen is None or 
                 state.charpos + segment_len <= self.linelen 
                 or link is True 
-                or css_class in ('variable-quote',)):
+                or css_class in ('variable-quote',)
+                or state._regex_pattern is not None):
 
                 state.charpos += segment_len
 

diff --git a/pydoctor/epydoc/sre_parse36.py b/pydoctor/epydoc/sre_parse36.py
@@ -504,15 +504,7 @@ def _parse_sub(source, state, verbose, nested):
             continue # check next one
         break
 
-    # check if the branch can be replaced by a character set
-    for item in items:
-        if len(item) != 1 or item[0][0] is not LITERAL:
-            break
-    else:
-        # we can store this as a character set instead of a
-        # branch (the compiler may optimize this even more)
-        subpatternappend((IN, [item[0] for item in items]))
-        return subpattern
+    # pydoctor: remove all optimizations for round-tripping issues
 
     subpattern.append((BRANCH, (None, items)))
     return subpattern
@@ -619,14 +611,8 @@ def _parse(source, state, verbose, nested, first=False):
                         code1 = code1[1][0]
                     setappend(code1)
 
-            # XXX: <fl> should move set optimization to compiler!
-            if _len(set)==1 and set[0][0] is LITERAL:
-                subpatternappend(set[0]) # optimization
-            elif _len(set)==2 and set[0][0] is NEGATE and set[1][0] is LITERAL:
-                subpatternappend((NOT_LITERAL, set[1][1])) # optimization
-            else:
-                # XXX: <fl> should add charmap optimization here
-                subpatternappend((IN, set))
+            # pydoctor: remove all optimizations for round-tripping issues
+            subpatternappend((IN, set))
 
         elif this in REPEAT_CHARS:
             # repeat previous item

diff --git a/pydoctor/test/epydoc/test_pyval_repr.py b/pydoctor/test/epydoc/test_pyval_repr.py
@@ -1,7 +1,7 @@
 import ast
 import sys
 from textwrap import dedent
-from typing import Any, Union
+from typing import Any, Union, Optional
 import xml.sax
 
 import pytest
@@ -1216,46 +1216,57 @@ def test_ast_regex() -> None:
         re.X
     )\n"""
 
-def color_re(s: Union[bytes, str], 
-             check_roundtrip:bool=True) -> str:
+def color_re(s: Union[bytes, str], *,
+             expect_failure:Optional[bool]=None) -> str:
 
     colorizer = PyvalColorizer(linelen=55, maxlines=5)
     val = colorizer.colorize(extract_expr(ast.parse(f"re.compile({repr(s)})")))
 
-    if check_roundtrip:
-        raw_text = ''.join(gettext(val.to_node()))
-        re_begin = 13
-        raw_string = True
-
-        if raw_text[11] != 'r':
-            # the regex has failed to be colorized since we can't find the r prefix
-            # meaning the string has been rendered as plaintext instead.
-            raw_string = False
-            re_begin -= 1
-        
-        if isinstance(s, bytes):
-            re_begin += 1
-        re_end = -2
-
-        round_trip: Union[bytes, str] = raw_text[re_begin:re_end]
-        if isinstance(s, bytes):
-            assert isinstance(round_trip, str)
-            round_trip = bytes(round_trip, encoding='utf-8')
-        
-        expected = s
-        if not raw_string:
-            assert isinstance(expected, str) 
-            # we only test invalid regexes with strings currently
+    raw_text = ''.join(gettext(val.to_node()))
+    re_begin = 13
+    raw_string = True
+
+    if raw_text[11] != 'r':
+        # the regex has failed to be colorized since we can't find the r prefix
+        # meaning the string has been rendered as plaintext instead.
+        raw_string = False
+        re_begin -= 1
+
+    if isinstance(s, bytes):
+        re_begin += 1
+    re_end = -2
+
+    round_trip: Union[bytes, str] = raw_text[re_begin:re_end]
+    if isinstance(s, bytes):
+        assert isinstance(round_trip, str)
+        round_trip = bytes(round_trip, encoding='utf-8')
+
+    expected = s
+    if not raw_string:
+        if isinstance(expected, bytes):
+            expected = expected.replace(b'\\', b'\\\\')
+        else:
             expected = expected.replace('\\', '\\\\')
-        
+    try:
+        assert round_trip == expected, "%s != %s" % (repr(round_trip), repr(expected))  # report the value actually compared
+    except AssertionError as e:
+        if not raw_string and expect_failure is False:
+            raise AssertionError(f'regex colorization failed for {s!r} (and did not round-trip!), warnings: {val.warnings!r}') from e
+        elif raw_string and expect_failure is True:
+            raise AssertionError(f'regex did not round-trip and colorization was expected to fail for {s!r} (but it succeeded)') from e
+        raise
+    # The text round-tripped; now check that the outcome matches what the caller expected.
+    if not raw_string and expect_failure is False:
+        raise AssertionError(f'regex colorization failed for {s!r}, warnings: {val.warnings!r}')
+    elif raw_string and expect_failure is True:
+        raise AssertionError(f'regex colorization was expected to fail for {s!r} (but it succeeded)')
 
     return flatten(val.to_stan(NotFoundLinker()))[17:-8]
 
 
 def test_re_literals() -> None:
     # Literal characters
-    assert color_re(r'abc \t\r\n\f\v \xff \uffff', False) == r"""r<span class="rst-variable-quote">'</span>abc \t\r\n\f\v \xff \uffff<span class="rst-variable-quote">'</span>"""
+    assert color_re(r'abc \t\r\n\f\v \xff \uffff') == r"""r<span class="rst-variable-quote">'</span>abc \t\r\n\f\v \xff \uffff<span class="rst-variable-quote">'</span>"""
 
     assert color_re(r'\.\^\$\\\*\+\?\{\}\[\]\|\(\)\'') == r"""r<span class="rst-variable-quote">'</span>\.\^\$\\\*\+\?\{\}\[\]\|\(\)\'<span class="rst-variable-quote">'</span>"""
 
@@ -1264,7 +1275,7 @@ def test_re_literals() -> None:
 
 def test_re_branching() -> None:
     # Branching
-    assert color_re(r"foo|bar") == """r<span class="rst-variable-quote">'</span>foo<span class="rst-re-op">|</span>bar<span class="rst-variable-quote">'</span>"""
+    assert color_re(r"foo|bar", expect_failure=False) == """r<span class="rst-variable-quote">'</span>foo<span class="rst-re-op">|</span>bar<span class="rst-variable-quote">'</span>"""
 
 def test_re_char_classes() -> None:
     # Character classes
@@ -1340,7 +1351,7 @@ def test_re_lookahead_behinds() -> None:
     assert color_re(r"foo(?!bar)") == ("""r<span class="rst-variable-quote">'</span>foo<span class="rst-re-group">(?!</span>"""
                                        """bar<span class="rst-re-group">)</span><span class="rst-variable-quote">'</span>""")
 
-    assert color_re(r"(?<=bar)foo") == ("""r<span class="rst-variable-quote">'</span><span class="rst-re-group">(?&lt;=</span>"""
+    assert color_re(r"(?<=bar)foo", expect_failure=False) == ("""r<span class="rst-variable-quote">'</span><span class="rst-re-group">(?&lt;=</span>"""
                                         """bar<span class="rst-re-group">)</span>foo<span class="rst-variable-quote">'</span>""")
 
     assert color_re(r"(?<!bar)foo") == ("""r<span class="rst-variable-quote">'</span><span class="rst-re-group">(?&lt;!</span>"""
@@ -1349,15 +1360,15 @@ def test_re_lookahead_behinds() -> None:
 
 def test_re_flags() -> None:
     # Flags
-    assert color_re(r"(?imu)^Food") == """r<span class="rst-variable-quote">'</span><span class="rst-re-flags">(?imu)</span>^Food<span class="rst-variable-quote">'</span>"""
+    assert color_re(r"(?imu)^Food", expect_failure=False) == """r<span class="rst-variable-quote">'</span><span class="rst-re-flags">(?imu)</span>^Food<span class="rst-variable-quote">'</span>"""
 
     assert color_re(b"(?Limsx)^Food") == """rb<span class="rst-variable-quote">'</span><span class="rst-re-flags">(?Limsx)</span>^Food<span class="rst-variable-quote">'</span>"""
 
     assert color_re(b"(?Limstx)^Food") == """rb<span class="rst-variable-quote">'</span><span class="rst-re-flags">(?Limstx)</span>^Food<span class="rst-variable-quote">'</span>"""
 
     assert color_re(r"(?imstux)^Food") == """r<span class="rst-variable-quote">'</span><span class="rst-re-flags">(?imstux)</span>^Food<span class="rst-variable-quote">'</span>"""
 
-    assert color_re(r"(?x)This   is   verbose", False) == """r<span class="rst-variable-quote">'</span><span class="rst-re-flags">(?ux)</span>Thisisverbose<span class="rst-variable-quote">'</span>"""
+    # assert color_re(r"(?x)This   is   verbose") == """r<span class="rst-variable-quote">'</span><span class="rst-re-flags">(?ux)</span>Thisisverbose<span class="rst-variable-quote">'</span>"""
 
 def test_unsupported_regex_features() -> None:
     """
@@ -1369,7 +1380,6 @@ def test_unsupported_regex_features() -> None:
     regexes = ['e*+e',
         '(e?){2,4}+a',
         r"^(\w){1,2}+$",
-        # "^x{}+$", this one fails to round-trip :/
         r'a++',
         r'(?:ab)++',
         r'(?:ab){1,3}+',
@@ -1380,17 +1390,41 @@ def test_unsupported_regex_features() -> None:
     for r in regexes:
         color_re(r)
 
+def test_regex_corner_case_roudtrips() -> None:
+    color_re("^x{}+$", expect_failure=True)
+    color_re(r"(?x)This   is   verbose")
+    color_re(r'abc \t\r\n\f\v \xff \uffff')
+
+    color_re(b"^x{}+$")
+    color_re(rb"(?x)This   is   verbose")
+    color_re(rb'abc \t\r\n\f\v \xff \uffff')
+
+    # found in epytext.py
+    color_re(r'{|}', expect_failure=True)
+
+    # found in _configparser.py
+    color_re(r'(^\"(?:\\.|[^\"\\])*\"$)', expect_failure=False)
+
+    # found in node2stan.py
+    color_re(r'^(.*?)\s*<(?:URI:|URL:)?([^<>]+)>$', expect_failure=True)
+
+    # found in options.py
+    color_re(r'(^https?:\/\/sourceforge\.net\/)',  expect_failure=False)
+    color_re(r'(.*)?', expect_failure=False)
+
 def test_re_not_literal() -> None:
 
     assert color_re(r"[^0-9]") == """r<span class="rst-variable-quote">'</span><span class="rst-re-group">[</span><span class="rst-re-op">^</span>0<span class="rst-re-op">-</span>9<span class="rst-re-group">]</span><span class="rst-variable-quote">'</span>"""
 
 def test_re_named_groups() -> None:
-    # This regex triggers some weird behaviour: it adds the &crarr; element at the end where it should not be...
-    # The regex is 42 caracters long, so more than 40, maybe that's why?
-    # assert color_re(r'^<(?P<descr>.*) at (?P<addr>0x[0-9a-f]+)>$') == """"""
-
     assert color_re(r'^<(?P<descr>.*)>$') == """r<span class="rst-variable-quote">'</span>^&lt;<span class="rst-re-group">(?P&lt;</span><span class="rst-re-ref">descr</span><span class="rst-re-group">&gt;</span>.<span class="rst-re-op">*</span><span class="rst-re-group">)</span>&gt;$<span class="rst-variable-quote">'</span>"""
 
+@pytest.mark.xfail
+def test_re_named_groups_weird() -> None:
+    # This regex triggers some weird behaviour: it adds the &crarr; element at the end where it should not be...
+    # The regex is 42 characters long, re.compile(r' is 13 characters long, the color_re function uses linelen=55.
+    assert '&crarr;' not in color_re(r'^<(?P<descr>.*) at (?P<addr>0x[0-9a-f]+)>$')
+
 def test_re_multiline() -> None:
 
     assert color(extract_expr(ast.parse(dedent(r'''re.compile(r"""\d +  # the integral part
@@ -1505,9 +1539,10 @@ def test_crash_surrogates_not_allowed() -> None:
 
 def test_surrogates_cars_in_re() -> None:
     """
-    Regex string are escaped their own way. See https://github.com/twisted/pydoctor/pull/493
+    The original string is used when the regex doesn't round-trip.
+    See https://github.com/twisted/pydoctor/pull/493 and https://github.com/twisted/pydoctor/pull/678 for later modification of the test.
     """
-    assert color2(extract_expr(ast.parse("re.compile('surrogates:\\udc80\\udcff')"))) == "re.compile(r'surrogates:\\udc80\\udcff')"
+    assert color2(extract_expr(ast.parse("re.compile('surrogates:\\udc80\\udcff')"))) == "re.compile('surrogates:\\udc80\\udcff')"
 
 def test_repr_text() -> None:
     """Test a few representations, with a plain text version.