Sub-line page sequence calculation

I noticed that the caret test file, which has many annotations on the same line, was returning them in an unexpected order. This changes the pageseq calculation to use finer-grained (character-level) distances to disambiguate conflicting positions.
0xabu · Dec 30, 2024 · 03190b1 · 03190b1
1 parent c7e3612
commit 03190b1
Show file tree

Hide file tree

Showing 3 changed files with 57 additions and 22 deletions.
diff --git a/pdfannots/__init__.py b/pdfannots/__init__.py
@@ -22,7 +22,7 @@
 import pdfminer.settings
 import pdfminer.utils
 
-from .types import Page, Outline, AnnotationType, Annotation, Document, RGB
+from .types import Page, ObjectWithPos, Outline, AnnotationType, Annotation, Document, RGB
 from .utils import cleanup_text, decode_datetime
 
 pdfminer.settings.STRICT = False
@@ -217,13 +217,30 @@ def receive_layout(self, ltpage: LTPage) -> None:
 
         self.page = None
 
-    def update_pageseq(self, component: LTComponent) -> None:
-        """Assign sequence numbers for objects on the page based on the nearest line of text."""
+    def update_pageseq(self, component: LTComponent) -> bool:
+        """Assign sequence numbers for objects on the page based on the nearest line of text.
+        Returns True if we need to recurse on smaller sub-components (e.g. characters)."""
         assert self.page is not None
         self.compseq += 1
 
+        hits = 0
         for x in itertools.chain(self.page.annots, self.page.outlines):
-            x.update_pageseq(component, self.compseq)
+            if x.update_pageseq(component, self.compseq):
+                hits += 1
+
+        # If we have assigned the same sequence number to multiple objects, and there exist smaller
+        # sub-components (e.g. characters within a line), we'll recurse on those assigning sequence
+        # numbers to sub-components to disambiguate the hits, but first we must forget about the
+        # current sequence number.
+        # NB: This could be done more efficiently -- we really only need to disambiguate conflicts
+        # that still exist after processing *all* the line-level components on the same page, but
+        # that would require multiple rendering passes.
+        if hits > 1 and isinstance(component, LTContainer) and len(component) > 1:
+            for x in itertools.chain(self.page.annots, self.page.outlines):
+                x.discard_pageseq(self.compseq)
+            return True
+
+        return False
 
     def test_boxes(self, item: LTComponent) -> None:
         """Update the set of annotations whose boxes intersect with the area of the given item."""
@@ -288,21 +305,22 @@ def capture_char(self, text: str) -> None:
                     # Subscribe this annotation for post-context.
                     self.context_subscribers.append((self.charseq, a))
 
-    def render(self, item: LTItem) -> None:
+    def render(self, item: LTItem, pageseq_nested: bool = False) -> None:
         """
         Helper for receive_layout, called recursively for every item on a page, in layout order.
 
         Ref: https://pdfminersix.readthedocs.io/en/latest/topic/converting_pdf_to_text.html
         """
         # Assign sequence numbers to items on the page based on their proximity to lines of text or
         # to figures (which may contain bare LTChar elements).
-        if isinstance(item, (LTTextLine, LTFigure)):
-            self.update_pageseq(item)
+        if isinstance(item, (LTTextLine, LTFigure)) or (
+                pageseq_nested and isinstance(item, LTComponent)):
+            pageseq_nested = self.update_pageseq(item)
 
         # If it's a container, recurse on nested items.
         if isinstance(item, LTContainer):
             for child in item:
-                self.render(child)
+                self.render(child, pageseq_nested)
 
             # After the children of a text box, capture the end of the final
             # line (logic derived from pdfminer.converter.TextConverter).

diff --git a/pdfannots/types.py b/pdfannots/types.py
@@ -59,6 +59,10 @@ def get_height(self) -> float:
         """Return the height of the box."""
         return self.y1 - self.y0
 
+    def get_area(self) -> float:
+        """Return the area of the box."""
+        return self.get_height() * self.get_width()
+
     def get_overlap(self, other: Box) -> float:
         """Compute the overlapping area (if any) with the provided box."""
         x_overlap = max(0, min(other.x1, self.x1) - max(other.x0, self.x0))
@@ -218,18 +222,27 @@ def item_hit(self, item: LTComponent) -> bool:
                 and self.y >= item.y0
                 and self.y <= item.y1)
 
-    def update_pageseq(self, component: LTComponent, pageseq: int) -> None:
-        """If close-enough to the given component, adopt its sequence number."""
+    def update_pageseq(self, component: LTComponent, pageseq: int) -> bool:
+        """If close-enough to the given component, adopt its sequence number and return True."""
         assert pageseq > 0
         if self.item_hit(component):
             # This pos is inside the component area
             self._pageseq = pageseq
             self._pageseq_distance = 0
+            return True
         else:
             d = Box.from_item(component).square_of_distance_to_closest_point((self.x, self.y))
             if self._pageseq == 0 or self._pageseq_distance > d:
                 self._pageseq = pageseq
                 self._pageseq_distance = d
+                return True
+            return False
+
+    def discard_pageseq(self, pageseq: int) -> None:
+        """If we have been assigned the specified pageseq, forget about it."""
+        if self._pageseq == pageseq:
+            self._pageseq = 0
+            self._pageseq_distance = 0.0
 
 
 @functools.total_ordering
@@ -246,10 +259,14 @@ def __lt__(self, other: object) -> bool:
             return self.pos < other.pos
         return NotImplemented
 
-    def update_pageseq(self, component: LTComponent, pageseq: int) -> None:
+    def update_pageseq(self, component: LTComponent, pageseq: int) -> bool:
         """Delegates to Pos.update_pageseq"""
+        return False if self.pos is None else self.pos.update_pageseq(component, pageseq)
+
+    def discard_pageseq(self, pageseq: int) -> None:
+        """Delegates to Pos.discard_pageseq"""
         if self.pos is not None:
-            self.pos.update_pageseq(component, pageseq)
+            self.pos.discard_pageseq(pageseq)
 
 
 class AnnotationType(enum.Enum):

diff --git a/tests.py b/tests.py
@@ -173,8 +173,8 @@ def test(self) -> None:
             (AnnotationType.Highlight, 'short highlight', 'not working'),
             (AnnotationType.Text, None, None),
             (AnnotationType.Highlight, None, 'Some more text'),
-            (AnnotationType.Text, 's', None),
-            (AnnotationType.Text, 'dual\n\npara note', None)]
+            (AnnotationType.Text, 'dual\n\npara note', None),
+            (AnnotationType.Text, 's', None)]
         self.assertEqual(len(self.annots), len(EXPECTED))
         for a, expected in zip(self.annots, EXPECTED):
             self.assertEqual((a.subtype, a.contents, a.gettext()), expected)
@@ -270,14 +270,14 @@ class CaretAnnotations(ExtractionTestBase):
 
     def test(self) -> None:
         self.assertEqual(len(self.annots), 5)
-        self.assertEqual(self.annots[1].subtype, AnnotationType.StrikeOut)
-        self.assertEqual(self.annots[1].gettext(), 'Adobe Acrobat Reader')
-        self.assertEqual(self.annots[4].subtype, AnnotationType.Caret)
-        self.assertEqual(self.annots[4].contents, 'Google Chrome')
-        self.assertEqual(self.annots[1].in_reply_to, self.annots[4])
-        self.assertEqual(self.annots[4].replies, [self.annots[1]])
-        self.assertEqual(self.annots[1].replies, [])
-        self.assertEqual(self.annots[4].in_reply_to, None)
+        self.assertEqual(self.annots[0].subtype, AnnotationType.StrikeOut)
+        self.assertEqual(self.annots[0].gettext(), 'Adobe Acrobat Reader')
+        self.assertEqual(self.annots[3].subtype, AnnotationType.Caret)
+        self.assertEqual(self.annots[3].contents, 'Google Chrome')
+        self.assertEqual(self.annots[0].in_reply_to, self.annots[3])
+        self.assertEqual(self.annots[3].replies, [self.annots[0]])
+        self.assertEqual(self.annots[0].replies, [])
+        self.assertEqual(self.annots[3].in_reply_to, None)
 
 
 class PrinterTestBase(unittest.TestCase):