From c7e36123d4311828a2358166b770d07c30c83792 Mon Sep 17 00:00:00 2001 From: Andrew Baumann Date: Sun, 29 Dec 2024 21:45:40 +0100 Subject: [PATCH] Caret annotations: initial support (#102) * extract Caret annotations in PDF * handle IRT (in reply to) property, and expose as inter-Annotation links * capture the optional NM name property, and export it in JSON (this is really unrelated) * when rendering the specific case of a Caret annotation with a single StrikeOut annotation as a "reply" (which is how Acrobat seems to render replace+insert edits), render this as a "suggested replacement" Based on the work of Suyash Mahar in https://github.com/0xabu/pdfannots/pull/96 --- pdfannots/__init__.py | 22 ++++++++--- pdfannots/printer/json.py | 5 ++- pdfannots/printer/markdown.py | 34 ++++++++++++----- pdfannots/types.py | 70 ++++++++++++++++++++++++---------- tests.py | 15 ++++++++ tests/caret.pdf | Bin 0 -> 10200 bytes 6 files changed, 108 insertions(+), 38 deletions(-) create mode 100644 tests/caret.pdf diff --git a/pdfannots/__init__.py b/pdfannots/__init__.py index c693a9b..575b1b6 100644 --- a/pdfannots/__init__.py +++ b/pdfannots/__init__.py @@ -49,8 +49,7 @@ def _mkannotation( """ Given a PDF annotation, capture relevant fields and construct an Annotation object. - Refer to Section 8.4 of the PDF spec: - https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/pdf_reference_archives/PDFReference.pdf + Refer to Section 8.4 of the PDF reference (version 1.7). """ subtype = pa.get('Subtype') @@ -85,13 +84,17 @@ def _mkannotation( rect = pdftypes.resolve1(pa.get('Rect')) # QuadPoints are defined only for "markup" annotations (Highlight, Underline, StrikeOut, - # Squiggly), where they specify the quadrilaterals (boxes) covered by the annotation. + # Squiggly, Caret), where they specify the quadrilaterals (boxes) covered by the annotation. 
quadpoints = pdftypes.resolve1(pa.get('QuadPoints')) author = pdftypes.resolve1(pa.get('T')) if author is not None: author = pdfminer.utils.decode_text(author) + name = pdftypes.resolve1(pa.get('NM')) + if name is not None: + name = pdfminer.utils.decode_text(name) + created = None dobj = pa.get('CreationDate') # some pdf apps set modification date, but not creation date @@ -103,8 +106,9 @@ def _mkannotation( createds = pdfminer.utils.decode_text(createds) created = decode_datetime(createds) - return Annotation(page, annot_type, quadpoints, rect, - contents, author=author, created=created, color=rgb) + return Annotation(page, annot_type, quadpoints=quadpoints, rect=rect, name=name, + contents=contents, author=author, created=created, color=rgb, + in_reply_to_ref=pa.get('IRT')) def _get_outlines(doc: PDFDocument) -> typ.Iterator[Outline]: @@ -383,6 +387,10 @@ def emit_progress(msg: str) -> None: o.resolve(page) page.outlines.append(o) + # Dict from object ID (in the ObjRef) to Annotation object + # This is used while post-processing to resolve inter-annotation references + annots_by_objid: typ.Dict[int, Annotation] = {} + # Construct Annotation objects, and append them to the page. 
for pa in pdftypes.resolve1(pdfpage.annots) if pdfpage.annots else []: if isinstance(pa, pdftypes.PDFObjRef): @@ -391,6 +399,8 @@ def emit_progress(msg: str) -> None: annot = _mkannotation(annot_dict, page) if annot is not None: page.annots.append(annot) + assert pa.objid not in annots_by_objid + annots_by_objid[pa.objid] = annot else: logger.warning("Unknown annotation: %s", pa) @@ -410,7 +420,7 @@ def emit_progress(msg: str) -> None: # Give the annotations a chance to update their internals for a in page.annots: - a.postprocess() + a.postprocess(annots_by_objid) emit_progress("\n") diff --git a/pdfannots/printer/json.py b/pdfannots/printer/json.py index 7506ce6..e3306b0 100644 --- a/pdfannots/printer/json.py +++ b/pdfannots/printer/json.py @@ -14,6 +14,7 @@ def annot_to_dict( assert annot.pos result = { + "name": annot.name, "type": annot.subtype.name, "page": annot.pos.page.pageno + 1, "page_label": annot.pos.page.label, @@ -23,7 +24,9 @@ def annot_to_dict( "contents": annot.contents, "author": annot.author, "created": annot.created.strftime('%Y-%m-%dT%H:%M:%S') if annot.created else None, - "color": ('#' + annot.color.ashex()) if annot.color else None + "color": ('#' + annot.color.ashex()) if annot.color else None, + "in_reply_to": (annot.in_reply_to.name if annot.in_reply_to and annot.in_reply_to.name + else None), } # Remove keys with None values in nested dictionary and return diff --git a/pdfannots/printer/markdown.py b/pdfannots/printer/markdown.py index 27c8889..312d0fc 100644 --- a/pdfannots/printer/markdown.py +++ b/pdfannots/printer/markdown.py @@ -216,11 +216,17 @@ def format_annot( document: Document, extra: typ.Optional[str] = None ) -> str: + # Limited support for Caret annotations with a single "reply" of type StrikeOut + contents = annot.contents + if (annot.subtype == AnnotationType.Caret and annot.replies + and annot.replies[0].subtype == AnnotationType.StrikeOut): + annot = annot.replies[0] + if annot.contents: + logger.warning("Ignored 
StrikeOut comment: %s", annot.contents) # capture item text and contents (i.e. the comment), and split the latter into paragraphs text = annot.gettext(self.remove_hyphens) or '' - comment = ([l for l in annot.contents.splitlines() if l] - if annot.contents else []) + comment = [l for l in contents.splitlines() if l] if contents else [] if annot.has_context(): assert annot.subtype == AnnotationType.StrikeOut @@ -270,13 +276,13 @@ def emit_body( self, document: Document ) -> typ.Iterator[str]: - for a in document.iter_annots(): + for a in document.iter_annots(include_replies=False): yield self.format_annot(a, document, a.subtype.name) class GroupedMarkdownPrinter(MarkdownPrinter): - ANNOT_NITS = frozenset({ - AnnotationType.Squiggly, AnnotationType.StrikeOut, AnnotationType.Underline}) + ANNOT_NITS = frozenset({AnnotationType.Caret, AnnotationType.Squiggly, + AnnotationType.StrikeOut, AnnotationType.Underline}) ALL_SECTIONS = ["highlights", "comments", "nits"] def __init__( @@ -316,12 +322,12 @@ def fmt_header(name: str, level: int = 2) -> str: return prefix + header + " " + name + "\n" # Partition annotations into nits, comments, and highlights. 
- nits = [] - comments = [] - highlights = [] # When grouping by color, this holds only the undefined annotations + nits: typ.List[Annotation] = [] + comments: typ.List[Annotation] = [] + highlights: typ.List[Annotation] = [] # When grouping by color holds only undefined annots highlights_by_color: typ.DefaultDict[RGB, typ.List[Annotation]] = defaultdict(list) - for a in document.iter_annots(): + for a in document.iter_annots(include_replies=False): if a.subtype in self.ANNOT_NITS: nits.append(a) elif a.contents: @@ -355,5 +361,13 @@ def fmt_header(name: str, level: int = 2) -> str: if nits and secname == 'nits': yield fmt_header("Nits") for a in nits: - extra = "suggested deletion" if a.subtype == AnnotationType.StrikeOut else None + extra = None + if a.subtype == AnnotationType.Caret: + if a.replies and a.replies[0].subtype == AnnotationType.StrikeOut: + extra = "suggested replacement" + else: + extra = "suggested insertion" + elif a.subtype == AnnotationType.StrikeOut: + extra = "suggested deletion" + yield self.format_annot(a, document, extra) diff --git a/pdfannots/types.py b/pdfannots/types.py index 0a67671..1bbccd8 100644 --- a/pdfannots/types.py +++ b/pdfannots/types.py @@ -33,6 +33,9 @@ def __init__(self, x0: float, y0: float, x1: float, y1: float): self.y0 = y0 self.y1 = y1 + def __repr__(self) -> str: + return '' % (self.x0, self.y0, self.x1, self.y1) + @staticmethod def from_item(item: LTComponent) -> Box: """Construct a Box from the bounding box of a given PDF component.""" @@ -261,6 +264,8 @@ class AnnotationType(enum.Enum): StrikeOut = enum.auto() Underline = enum.auto() + Caret = enum.auto() + # A single rectangle, that is abused by some Apple tools to render custom # highlights. We do not attempt to capture the affected text. Square = enum.auto() @@ -274,35 +279,43 @@ class Annotation(ObjectWithPos): A PDF annotation, and its extracted text. Attributes: - subtype PDF annotation type - contents Contents of the annotation in the PDF (e.g. 
comment/description) - text Text in the order captured (use gettext() for a cleaner form) author Author of the annotation - created Timestamp the annotation was created color RGB color of the annotation + contents Contents of the annotation in the PDF (e.g. comment/description) + created Timestamp the annotation was created + in_reply_to Reference to another annotation on the page that this is "in reply to" last_charseq Sequence number of the most recent character in text + name If present, uniquely identifies this annotation among others on the page + replies Annotations replying to this one (reverse of in_reply_to) + subtype PDF annotation type + text Text in the order captured (use gettext() for a cleaner form) - Attributes updated only for StrikeOut annotations: + Attributes updated for StrikeOut and Caret annotations: pre_context Text captured just prior to the beginning of 'text' post_context Text captured just after the end of 'text' """ - contents: typ.Optional[str] boxes: typ.List[Box] - text: typ.List[str] + contents: typ.Optional[str] + in_reply_to: typ.Optional[Annotation] pre_context: typ.Optional[str] post_context: typ.Optional[str] + replies: typ.List[Annotation] + text: typ.List[str] def __init__( self, page: Page, subtype: AnnotationType, - quadpoints: typ.Optional[typ.Sequence[float]] = None, - rect: typ.Optional[BoxCoords] = None, - contents: typ.Optional[str] = None, + *, author: typ.Optional[str] = None, created: typ.Optional[datetime.datetime] = None, - color: typ.Optional[RGB] = None): + color: typ.Optional[RGB] = None, + contents: typ.Optional[str] = None, + in_reply_to_ref: typ.Optional[PDFObjRef] = None, + name: typ.Optional[str] = None, + quadpoints: typ.Optional[typ.Sequence[float]] = None, + rect: typ.Optional[BoxCoords] = None): # Construct boxes from quadpoints boxes = [] @@ -324,16 +337,22 @@ def __init__( super().__init__(pos) # Initialise the attributes - self.subtype = subtype - self.contents = contents if contents else None 
self.author = author - self.created = created - self.text = [] - self.color = color - self.pre_context = None - self.post_context = None self.boxes = boxes + self.color = color + self.contents = contents if contents else None + self.created = created + self.name = name self.last_charseq = 0 + self.post_context = None + self.pre_context = None + self.replies = [] + self.subtype = subtype + self.text = [] + + # The in_reply_to reference will be resolved in postprocess() + self._in_reply_to_ref = in_reply_to_ref + self.in_reply_to = None def __repr__(self) -> str: return ('' % @@ -394,8 +413,15 @@ def get_context(self, remove_hyphens: bool = False) -> typ.Tuple[str, str]: return (merge_lines(self.pre_context or '', remove_hyphens, strip_space=False), merge_lines(self.post_context or '', remove_hyphens, strip_space=False)) - def postprocess(self) -> None: + def postprocess(self, annots_by_objid: typ.Dict[int, Annotation]) -> None: """Update internal state once all text and context has been captured.""" + # Resolve the in_reply_to object reference to its annotation + if self._in_reply_to_ref is not None: + assert self.in_reply_to is None # This should be called once only + self.in_reply_to = annots_by_objid.get(self._in_reply_to_ref.objid) + if self.in_reply_to is not None: + self.in_reply_to.replies.append(self) + # The Skim PDF reader (https://skim-app.sourceforge.io/) creates annotations whose # default initial contents are a copy of the selected text. 
Unless the user goes to # the trouble of editing each annotation, this goes badly for us because we have @@ -466,10 +492,12 @@ class Document: def __init__(self) -> None: self.pages = [] - def iter_annots(self) -> typ.Iterator[Annotation]: + def iter_annots(self, *, include_replies: bool = True) -> typ.Iterator[Annotation]: """Iterate over all the annotations in the document.""" for p in self.pages: - yield from p.annots + for a in p.annots: + if include_replies or not a.in_reply_to: + yield a def nearest_outline( self, diff --git a/tests.py b/tests.py index ec88103..2ac981f 100755 --- a/tests.py +++ b/tests.py @@ -265,6 +265,21 @@ def test(self) -> None: self.assertEqual(self.annots[0].gettext(), None) +class CaretAnnotations(ExtractionTestBase): + filename = 'caret.pdf' + + def test(self) -> None: + self.assertEqual(len(self.annots), 5) + self.assertEqual(self.annots[1].subtype, AnnotationType.StrikeOut) + self.assertEqual(self.annots[1].gettext(), 'Adobe Acrobat Reader') + self.assertEqual(self.annots[4].subtype, AnnotationType.Caret) + self.assertEqual(self.annots[4].contents, 'Google Chrome') + self.assertEqual(self.annots[1].in_reply_to, self.annots[4]) + self.assertEqual(self.annots[4].replies, [self.annots[1]]) + self.assertEqual(self.annots[1].replies, []) + self.assertEqual(self.annots[4].in_reply_to, None) + + class PrinterTestBase(unittest.TestCase): filename = 'hotos17.pdf' diff --git a/tests/caret.pdf b/tests/caret.pdf new file mode 100644 index 0000000000000000000000000000000000000000..89c57b72d144ec4892f277af8e170348e6456001 GIT binary patch literal 10200 zcmeHNc|26>|F>nCOhieytBz5!&2nZR*-a7#*&@Vjn9QP?vA3Z@bwg#Tl+r?}i>}12 zkV2HLRX0URi>0({)9;L3saN0E>;Cck<9bEsJm>jr@8^9!&vQP{v%r~Zg9K4T_=1NW zjo0BY5CkxQfUyd0VuE(y@Y!@BCyLDiK(qq@Vz3wt+6e%0=*0j5iy@&Mp>{WbLZ%SW zwtzQ)Cy@Y(2=@g__XCkgE$5Qrg{2A{h^&;WQM+AT7KjrMe52cVaGg8&u)Ap}HX zA)C&F!-DQJ7%WwPe};dZg4#BuMeM` z<9xNEsj;HI;*r`vf6?~}?X54IPE#(>(~*tq^H8i8Yu&-5W~nZ-F_VN5xlTEFk}w95 
z9HyHkQz)RVmgGVWHg*Y*nCT-m?PkOgAxL{pgp}-0Q^ucckGWZ z+L<26ra_ECyNlS40zP}NK?G0+1qKJo8dAv>1dfq(P}_(Pe7-;|0-CG_rW12G*1`9J(Cxkio{Kn`2*MUOfRy_3#D0n?Q&_C`aQ0wOlrHIyNi5ME+p znEFnPRcxksj5W52)WpK+8!Qy%M8^-vJZ>*tVt~0##22q^yBriAM$etUfE}+O>}RF8*=imbVT3%Deu%3v$lMC`%=jsN9>3GUG@+hz}BjB2&m@93B|E z0Rm?G^?+_Q6%`v;$EkSDM?G-<{>GH|JE^jQV5+2jq$9y}{Ag zWBoq*`p4dq+)reH4`%pF*<76KFd1?Z@{MQK2n+Be6o!JM03Zn>7LEd;^&Ne%q#r{w zNT-Q_ERGN_ag}f!B9IFP`eV7&6$qenMQpUSCB!})1^U4c(Py+Tnn@ph1QE>zoFHdK zL(@XJVonG*@*6ob=w~8L(Vjz;gcQ)0+c*to6_6x(^#=R?0)-*3fj|)qa$@kmQ7u#f zY%*jey8l;Xmk0%+A&`fzXg499FA9-l3}z%0%J3UY`JWJ<@S2W`e<*hH{}lV6%nu@! zXc`|W2xTWMh5}_LJPL!y0U!m3!eMZO*$Iy!{oM|XodzvTB-(_qe>FRC zuN1;7Fp6>&`}Nn!$UVJ4KX`cc9=wy02H)u!AJ`zTK=l?Qk zEw0-6ir@*V5;W47-ip5|I6Xin=b1F?T}lod7QDGEKZbLTuI_2?lI_4+^l$fhMIG# zoL$Vu*y;}E8X~#Ek2|oaM{E+GwMd-M$hUl1YFmdle10HG6t>Pocdz{6gLir5NsU)3 zV~S^=Tr+1NowD~}uKTBHyX&t%Jn^hOscpvJ$cINaTqjN8O}(XY4oUU4^>d(~ zz5xI16IXg`uco7|ErOi7`Ixm{hoT$ZfkAsTL&Y%I$3r8q`cB+mIyM==mZisMA2I-% z!#A!8PiRc3NI%o@wydvHGs!3QOThY)aOS6{W;werE|0$G)e0{!Gfvpe0{3HaK;fz-Yml$8#dY=AYjB zd!~kJRd}hK@7eDW7QM#AO!J=k?x>hQuU5LnNA%o`%hvNPSFz2EtM5}W^gd8p8@yHE zz@8evqU0`3ug)s=a8tcfwr5O{S!0oR0eE@f(VP>ViwfnF16#JSid=_oQ=#1Iw1kb#RNbg|YR^jub$#vqpK93K z*PU^P-!*>LocyNj#k|J4H%c+txlU12pLi_MG&eEXsA6K2x5%M`T%9!a%++XK&Y_yp z$XnAN9Sz)P)N{>WHSPvEjvl4VeEc{lMD*c$K(nPwDz0Pu2FeDfy!_=v%Ci{FQ**oK zw@ZSN9fh=%WLW6I@p8=r4E)Y@<_=jg?_ z`sW(YbM^9$)SB0?|KN?QYb@Y*zs;hDZTqt73-erTKFR*8YTJIANnp`C?SsdXY3v;K zKf>*XGrMvZlSJI^x+#_5E;|q<_3GbkE`c(+CMy#vX<1aOwnQP{!q_D;8dNGa^uL8E0>fy z-dkMy)qB>CwsxztGy0D!M0!MB$EuNv=O?Q)WyELpJ+@Pw|BsJSuTieiyUar!bqZRC z&v$=4u~qy2>SQv*<-T2GDHwauylP}jk`C2R8^IW}5uez31Bl{9@?-&V`1d7IleG#SDc z9FDN-a<+N6KK%5V4|i~Ksa#^2O@V3kCso#?RMj&|!PCfa%9AG{wjCfCy*~BZNuczN z74l2>$pv4ZEC~mN7F=haZMo|kOrz<4)Lb=x_IYkK^MRj;cRKR&X;Z;*>})U6O97$B zx_G_V(qL!X%FK^m^+JxW(Z;~3aiqtugdF+=PqaROjy^5+Ff6~Y6|9jjhF+!Rmc)|#$!!3*Lc8m z8(Lk1|7@7S+m)*>Sg++uuIsYD2vGZW^X|%CZhINYf4*u%E_mx1=L1=3zR5{hM}#6e zIeFQ`d_w+tqP@RGanak;YfF&nM=R2&MeI?AWkstc{1Ns+eOJB5XT@@t)RR75gijR0 
z>9_UKJGm>X`=6Z9oDq7f_KBDKS#4E#_T&3&22MU-_BsB}yOyRv<<{Whre4_q$9t9K zDz)OIVwlmPj5FK5&Wn{grawKZ1{-I9`_Sfe#pKIzFTE%0xf%N?W~`Z( zS>BtjJv9wTJXdq>U8MKfublpEGmiK*>(At^^1FFvp8m~`x8JWi5_TzU;ffmCm9V*O zjehT&Uhz~fellx!;9qV$v^K%q?qgVAjelo!0rn%Upm@)FnX{XEk#OA^8p;UO`MUg$^&AaRkEH zG-;bCPQKuH5u-|dF|ft-p9KE-)jji<9N5slv4V6-o8|(`Rzm0}ln3(S5$;wgnaxoq zuX{x3wEKY;?W9=6yyG(UZAONF{F9-)rIm4%Hf7zyv{{EX1#Bb2nL25=Rpc5EHMQ?8 zSfiv=SANQ;Ma_QdI)YP)K~|n{8tTdoLhvE2Ri|2yPQ4Lnd9YV?v27_~z1P4aPov+{ zb{99VlI^xL%J2(}lw)@K9Ejev?o)P4?<2)>D~i$l8_!6g9dhA$hRzFC+?iIt?)F=C z`Ml0Q7p|Z4=fi=ugvzU$GU=(8ucBIYZp=})K&UT6l-{X0gNU=479G^J(mlhz-(Oj0 zQI*HSZX$f)=HgcKp*L>~^C zVRgqaht^i`^mxB?Qp>XKm9EDha)5}(H3)M$m9#f<|MK6v6pk8hvus(uCU@Y9S;pVG zPUo}xM3J(oIXW>`FQS~Q^KRTs8`uywJr#j!K3Zm(WxQ2CHrU5W&|W&9$|TIZ9p1O! z=WqqezUm(Ax1=Qo8dnHzwJWw9DCAtu^85Rg#jQQs8B#Ym%1Gxdztf?anyZowe&gpX zt$tjT*40msyk0cj_&0r;wm zYcIU_ubK9Kfk|!d%AyC_+XEwJDyUp_QItPnt7Vydx$U!oMxLSG&hXD2@@}s>W@ff{ zzMNS?>N0%8JxRAa(HgSjaprBp%NSat=)7-iC_I0AukS{u&rNP|In{Dz%ABXpottxC zQZX$vSROCWOife>$c%3ha!)xQv28lSC@kCDbG_r{6YiFxLXDUgYp;pSwd7N|{Gju5 z<6bD4s&_PM@Q^bb8k~)urxnz1mG(YXybpPon6JjoIV5-;%d_tn>ifZbU_K4)!tAi@ zJBb0L>pi-(+1?SmgwZK47MEq!z7M=W4Xw4owW#-W3RPm~%9&nmx_IfFZC`2V{^h&Z z)GS(YM@rAgPBYNR&MVW!f3EDSw%1sBwG6qVvmMmgR3?=f@EeYkP}5{jStwj6B<(!= zmw$EE4uUeCRTgxBq6n}WbBYu_Bl*(M+^Ln!UGnRfvat)qD z7%ho;(j103JW{$GOG75+5g~MDFk1{T*nu3rDdNT1as!3*I*g-{@Z$K{KRBTNx=sItyBhOQ;eXaq3WMI3C3u#`Li zJZa8=wLr**N?#z7iNSzSl@diH5m=Z2k`aK#U~y;+5eS!%^Wllt37W2BGQ=8iPe+u}G)~QWVJ-(<6|4k0&c_NC=n1q)U)MLq<%{V~@tbko0Fl+Xm3C$KY6ujLguWxUyGIY#l+F%Q0KR=Cc7S zVJMHmrAL~eC0IkPW11yh2U{m98Fb@&ixNM?95I*ujRuKYORjig>zEf3${2I${6JGg z1d_!LphMLW5Q!lpu@pB9#h8FMCXh_f-}D)SWK3EhbQ1`;W;718{~!_sh=Jt-Aq#qaFyo|$ z`+Oq~Z8UHKB8QD*@kk62i6yv!WQlPQx+NK7hLw!#`K^xOU5;PUGQRIPJrJXep+Xod zl*tyFNw!fKbTL5Xh{PN&mn{UKngs!XhzKm_K-+TRA`=M(#{|YH8sxY!b$EQ1Qq4j` zIV@u~gUAkmw$PwKCF78I7M_A+u*gg#gG|PdF?1H0MIa1fMWcFlmzV($EeKexz!Frwko{ntXHS zlpDsmps+c-0bOsn40GdsgMRQU)2@wrg6ASVgAt^jk|ZGp^wdW!qogpoEdix zD5-jU$)fi{=a&9= 
zVX6<>)|IJA)6W;*wahZAN{Ncyr<@->%dGnOrdZwQdk?tb^I(OS*zU~UeSWR$*0XYM zzyjCl`FJ62;PCG*y=*jye_fYu|H6(`RVBBY(c?v)hFn^x_0m)iuOHlHtQLZM)8`)( z+@(|gS7G{Lt(ob8br=l`eZ$PXvQ&QjthcL(d-I`_LRHhuk&~b??$pQsi!)>29TS7- z@ZI4saCqz2N+Zu~R@96+aR%e_4M}_&Cv!$Tp VFe!0Uv?jr2+(9r9B#s^h`#+Zs-q`>E literal 0 HcmV?d00001