Skip to content

Commit

Permalink
Sub-line page sequence calculation
Browse files Browse the repository at this point in the history
I noticed that the caret test file with many annotations on the same line was
returning them in a really wonky order. This changes the pageseq calculation
to use finer-grained (character-level) distances to disambiguate conflicting
positions.
  • Loading branch information
0xabu committed Dec 30, 2024
1 parent c7e3612 commit 03190b1
Show file tree
Hide file tree
Showing 3 changed files with 57 additions and 22 deletions.
34 changes: 26 additions & 8 deletions pdfannots/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
import pdfminer.settings
import pdfminer.utils

from .types import Page, Outline, AnnotationType, Annotation, Document, RGB
from .types import Page, ObjectWithPos, Outline, AnnotationType, Annotation, Document, RGB
from .utils import cleanup_text, decode_datetime

pdfminer.settings.STRICT = False
Expand Down Expand Up @@ -217,13 +217,30 @@ def receive_layout(self, ltpage: LTPage) -> None:

self.page = None

def update_pageseq(self, component: LTComponent) -> None:
"""Assign sequence numbers for objects on the page based on the nearest line of text."""
def update_pageseq(self, component: LTComponent) -> bool:
"""Assign sequence numbers for objects on the page based on the nearest line of text.
Returns True if we need to recurse on smaller sub-components (e.g. characters)."""
assert self.page is not None
self.compseq += 1

hits = 0
for x in itertools.chain(self.page.annots, self.page.outlines):
x.update_pageseq(component, self.compseq)
if x.update_pageseq(component, self.compseq):
hits += 1

# If we have assigned the same sequence number to multiple objects, and there exist smaller
# sub-components (e.g. characters within a line), we'll recurse on those assigning sequence
# numbers to sub-components to disambiguate the hits, but first we must forget about the
# current sequence number.
# NB: This could be done more efficiently -- we really only need to disambiguate conflicts
# that still exist after processing *all* the line-level components on the same page, but
# that would require multiple rendering passes.
if hits > 1 and isinstance(component, LTContainer) and len(component) > 1:
for x in itertools.chain(self.page.annots, self.page.outlines):
x.discard_pageseq(self.compseq)
return True

return False

def test_boxes(self, item: LTComponent) -> None:
"""Update the set of annotations whose boxes intersect with the area of the given item."""
Expand Down Expand Up @@ -288,21 +305,22 @@ def capture_char(self, text: str) -> None:
# Subscribe this annotation for post-context.
self.context_subscribers.append((self.charseq, a))

def render(self, item: LTItem) -> None:
def render(self, item: LTItem, pageseq_nested: bool = False) -> None:
"""
Helper for receive_layout, called recursively for every item on a page, in layout order.
Ref: https://pdfminersix.readthedocs.io/en/latest/topic/converting_pdf_to_text.html
"""
# Assign sequence numbers to items on the page based on their proximity to lines of text or
# to figures (which may contain bare LTChar elements).
if isinstance(item, (LTTextLine, LTFigure)):
self.update_pageseq(item)
if isinstance(item, (LTTextLine, LTFigure)) or (
pageseq_nested and isinstance(item, LTComponent)):
pageseq_nested = self.update_pageseq(item)

# If it's a container, recurse on nested items.
if isinstance(item, LTContainer):
for child in item:
self.render(child)
self.render(child, pageseq_nested)

# After the children of a text box, capture the end of the final
# line (logic derived from pdfminer.converter.TextConverter).
Expand Down
25 changes: 21 additions & 4 deletions pdfannots/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,10 @@ def get_height(self) -> float:
"""Return the height of the box."""
return self.y1 - self.y0

def get_area(self) -> float:
"""Return the area of the box."""
return self.get_height() * self.get_width()

def get_overlap(self, other: Box) -> float:
"""Compute the overlapping area (if any) with the provided box."""
x_overlap = max(0, min(other.x1, self.x1) - max(other.x0, self.x0))
Expand Down Expand Up @@ -218,18 +222,27 @@ def item_hit(self, item: LTComponent) -> bool:
and self.y >= item.y0
and self.y <= item.y1)

def update_pageseq(self, component: LTComponent, pageseq: int) -> None:
"""If close-enough to the given component, adopt its sequence number."""
def update_pageseq(self, component: LTComponent, pageseq: int) -> bool:
"""If close-enough to the given component, adopt its sequence number and return True."""
assert pageseq > 0
if self.item_hit(component):
# This pos is inside the component area
self._pageseq = pageseq
self._pageseq_distance = 0
return True
else:
d = Box.from_item(component).square_of_distance_to_closest_point((self.x, self.y))
if self._pageseq == 0 or self._pageseq_distance > d:
self._pageseq = pageseq
self._pageseq_distance = d
return True
return False

def discard_pageseq(self, pageseq: int) -> None:
"""If we have been assigned the specified pageseq, forget about it."""
if self._pageseq == pageseq:
self._pageseq = 0
self._pageseq_distance = 0.0


@functools.total_ordering
Expand All @@ -246,10 +259,14 @@ def __lt__(self, other: object) -> bool:
return self.pos < other.pos
return NotImplemented

def update_pageseq(self, component: LTComponent, pageseq: int) -> None:
def update_pageseq(self, component: LTComponent, pageseq: int) -> bool:
"""Delegates to Pos.update_pageseq"""
return False if self.pos is None else self.pos.update_pageseq(component, pageseq)

def discard_pageseq(self, pageseq: int) -> None:
"""Delegates to Pos.discard_pageseq"""
if self.pos is not None:
self.pos.update_pageseq(component, pageseq)
self.pos.discard_pageseq(pageseq)


class AnnotationType(enum.Enum):
Expand Down
20 changes: 10 additions & 10 deletions tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,8 +173,8 @@ def test(self) -> None:
(AnnotationType.Highlight, 'short highlight', 'not working'),
(AnnotationType.Text, None, None),
(AnnotationType.Highlight, None, 'Some more text'),
(AnnotationType.Text, 's', None),
(AnnotationType.Text, 'dual\n\npara note', None)]
(AnnotationType.Text, 'dual\n\npara note', None),
(AnnotationType.Text, 's', None)]
self.assertEqual(len(self.annots), len(EXPECTED))
for a, expected in zip(self.annots, EXPECTED):
self.assertEqual((a.subtype, a.contents, a.gettext()), expected)
Expand Down Expand Up @@ -270,14 +270,14 @@ class CaretAnnotations(ExtractionTestBase):

def test(self) -> None:
self.assertEqual(len(self.annots), 5)
self.assertEqual(self.annots[1].subtype, AnnotationType.StrikeOut)
self.assertEqual(self.annots[1].gettext(), 'Adobe Acrobat Reader')
self.assertEqual(self.annots[4].subtype, AnnotationType.Caret)
self.assertEqual(self.annots[4].contents, 'Google Chrome')
self.assertEqual(self.annots[1].in_reply_to, self.annots[4])
self.assertEqual(self.annots[4].replies, [self.annots[1]])
self.assertEqual(self.annots[1].replies, [])
self.assertEqual(self.annots[4].in_reply_to, None)
self.assertEqual(self.annots[0].subtype, AnnotationType.StrikeOut)
self.assertEqual(self.annots[0].gettext(), 'Adobe Acrobat Reader')
self.assertEqual(self.annots[3].subtype, AnnotationType.Caret)
self.assertEqual(self.annots[3].contents, 'Google Chrome')
self.assertEqual(self.annots[0].in_reply_to, self.annots[3])
self.assertEqual(self.annots[3].replies, [self.annots[0]])
self.assertEqual(self.annots[0].replies, [])
self.assertEqual(self.annots[3].in_reply_to, None)


class PrinterTestBase(unittest.TestCase):
Expand Down

0 comments on commit 03190b1

Please sign in to comment.