From 03190b168b6a208c8d350dce09b3aa5fcd4199c6 Mon Sep 17 00:00:00 2001 From: Andrew Baumann Date: Mon, 30 Dec 2024 10:30:30 +0100 Subject: [PATCH] Sub-line page sequence calculation I noticed that the caret test file with many annotations on the same line was returning them in a really wonky order. This changes the pageseq calculation to use finer-grained (character-level) distances to disambiguate conflicting positions. --- pdfannots/__init__.py | 34 ++++++++++++++++++++++++++-------- pdfannots/types.py | 25 +++++++++++++++++++++---- tests.py | 20 ++++++++++---------- 3 files changed, 57 insertions(+), 22 deletions(-) diff --git a/pdfannots/__init__.py b/pdfannots/__init__.py index 575b1b6..6a6d9c7 100644 --- a/pdfannots/__init__.py +++ b/pdfannots/__init__.py @@ -22,7 +22,7 @@ import pdfminer.settings import pdfminer.utils -from .types import Page, Outline, AnnotationType, Annotation, Document, RGB +from .types import Page, ObjectWithPos, Outline, AnnotationType, Annotation, Document, RGB from .utils import cleanup_text, decode_datetime pdfminer.settings.STRICT = False @@ -217,13 +217,30 @@ def receive_layout(self, ltpage: LTPage) -> None: self.page = None - def update_pageseq(self, component: LTComponent) -> None: - """Assign sequence numbers for objects on the page based on the nearest line of text.""" + def update_pageseq(self, component: LTComponent) -> bool: + """Assign sequence numbers for objects on the page based on the nearest line of text. + Returns True if we need to recurse on smaller sub-components (e.g. characters).""" assert self.page is not None self.compseq += 1 + hits = 0 for x in itertools.chain(self.page.annots, self.page.outlines): - x.update_pageseq(component, self.compseq) + if x.update_pageseq(component, self.compseq): + hits += 1 + + # If we have assigned the same sequence number to multiple objects, and there exist smaller + # sub-components (e.g. 
characters within a line), we'll recurse on those assigning sequence + # numbers to sub-components to disambiguate the hits, but first we must forget about the + # current sequence number. + # NB: This could be done more efficiently -- we really only need to disambiguate conflicts + # that still exist after processing *all* the line-level components on the same page, but + # that would require multiple rendering passes. + if hits > 1 and isinstance(component, LTContainer) and len(component) > 1: + for x in itertools.chain(self.page.annots, self.page.outlines): + x.discard_pageseq(self.compseq) + return True + + return False def test_boxes(self, item: LTComponent) -> None: """Update the set of annotations whose boxes intersect with the area of the given item.""" @@ -288,7 +305,7 @@ def capture_char(self, text: str) -> None: # Subscribe this annotation for post-context. self.context_subscribers.append((self.charseq, a)) - def render(self, item: LTItem) -> None: + def render(self, item: LTItem, pageseq_nested: bool = False) -> None: """ Helper for receive_layout, called recursively for every item on a page, in layout order. @@ -296,13 +313,14 @@ def render(self, item: LTItem) -> None: """ # Assign sequence numbers to items on the page based on their proximity to lines of text or # to figures (which may contain bare LTChar elements). - if isinstance(item, (LTTextLine, LTFigure)): - self.update_pageseq(item) + if isinstance(item, (LTTextLine, LTFigure)) or ( + pageseq_nested and isinstance(item, LTComponent)): + pageseq_nested = self.update_pageseq(item) # If it's a container, recurse on nested items. if isinstance(item, LTContainer): for child in item: - self.render(child) + self.render(child, pageseq_nested) # After the children of a text box, capture the end of the final # line (logic derived from pdfminer.converter.TextConverter). 
diff --git a/pdfannots/types.py b/pdfannots/types.py index 1bbccd8..25d4d9b 100644 --- a/pdfannots/types.py +++ b/pdfannots/types.py @@ -59,6 +59,10 @@ def get_height(self) -> float: """Return the height of the box.""" return self.y1 - self.y0 + def get_area(self) -> float: + """Return the area of the box.""" + return self.get_height() * self.get_width() + def get_overlap(self, other: Box) -> float: """Compute the overlapping area (if any) with the provided box.""" x_overlap = max(0, min(other.x1, self.x1) - max(other.x0, self.x0)) @@ -218,18 +222,27 @@ def item_hit(self, item: LTComponent) -> bool: and self.y >= item.y0 and self.y <= item.y1) - def update_pageseq(self, component: LTComponent, pageseq: int) -> None: - """If close-enough to the given component, adopt its sequence number.""" + def update_pageseq(self, component: LTComponent, pageseq: int) -> bool: + """If close-enough to the given component, adopt its sequence number and return True.""" assert pageseq > 0 if self.item_hit(component): # This pos is inside the component area self._pageseq = pageseq self._pageseq_distance = 0 + return True else: d = Box.from_item(component).square_of_distance_to_closest_point((self.x, self.y)) if self._pageseq == 0 or self._pageseq_distance > d: self._pageseq = pageseq self._pageseq_distance = d + return True + return False + + def discard_pageseq(self, pageseq: int) -> None: + """If we have been assigned the specified pageseq, forget about it.""" + if self._pageseq == pageseq: + self._pageseq = 0 + self._pageseq_distance = 0.0 @functools.total_ordering @@ -246,10 +259,14 @@ def __lt__(self, other: object) -> bool: return self.pos < other.pos return NotImplemented - def update_pageseq(self, component: LTComponent, pageseq: int) -> None: + def update_pageseq(self, component: LTComponent, pageseq: int) -> bool: """Delegates to Pos.update_pageseq""" + return False if self.pos is None else self.pos.update_pageseq(component, pageseq) + + def discard_pageseq(self, pageseq: 
int) -> None: + """Delegates to Pos.discard_pageseq""" if self.pos is not None: - self.pos.update_pageseq(component, pageseq) + self.pos.discard_pageseq(pageseq) class AnnotationType(enum.Enum): diff --git a/tests.py b/tests.py index 2ac981f..affb0b9 100755 --- a/tests.py +++ b/tests.py @@ -173,8 +173,8 @@ def test(self) -> None: (AnnotationType.Highlight, 'short highlight', 'not working'), (AnnotationType.Text, None, None), (AnnotationType.Highlight, None, 'Some more text'), - (AnnotationType.Text, 's', None), - (AnnotationType.Text, 'dual\n\npara note', None)] + (AnnotationType.Text, 'dual\n\npara note', None), + (AnnotationType.Text, 's', None)] self.assertEqual(len(self.annots), len(EXPECTED)) for a, expected in zip(self.annots, EXPECTED): self.assertEqual((a.subtype, a.contents, a.gettext()), expected) @@ -270,14 +270,14 @@ class CaretAnnotations(ExtractionTestBase): def test(self) -> None: self.assertEqual(len(self.annots), 5) - self.assertEqual(self.annots[1].subtype, AnnotationType.StrikeOut) - self.assertEqual(self.annots[1].gettext(), 'Adobe Acrobat Reader') - self.assertEqual(self.annots[4].subtype, AnnotationType.Caret) - self.assertEqual(self.annots[4].contents, 'Google Chrome') - self.assertEqual(self.annots[1].in_reply_to, self.annots[4]) - self.assertEqual(self.annots[4].replies, [self.annots[1]]) - self.assertEqual(self.annots[1].replies, []) - self.assertEqual(self.annots[4].in_reply_to, None) + self.assertEqual(self.annots[0].subtype, AnnotationType.StrikeOut) + self.assertEqual(self.annots[0].gettext(), 'Adobe Acrobat Reader') + self.assertEqual(self.annots[3].subtype, AnnotationType.Caret) + self.assertEqual(self.annots[3].contents, 'Google Chrome') + self.assertEqual(self.annots[0].in_reply_to, self.annots[3]) + self.assertEqual(self.annots[3].replies, [self.annots[0]]) + self.assertEqual(self.annots[0].replies, []) + self.assertEqual(self.annots[3].in_reply_to, None) class PrinterTestBase(unittest.TestCase):