Reading PDF Annotations
PDF 1.7 defines 25 different annotation types:
Text
Link
FreeText
Line, Square, Circle, Polygon, PolyLine, Highlight, Underline, Squiggly, StrikeOut
Stamp, Caret, Ink
Popup
FileAttachment
Sound, Movie
Widget, Screen
PrinterMark
TrapNet
Watermark
3D
In general, annotations can be read like this:
from PyPDF2 import PdfReader

reader = PdfReader("commented.pdf")

# Walk every page and report the subtype and bounding rectangle of each
# annotation found on it.
for page in reader.pages:
    # Pages without annotations have no "/Annots" entry at all.
    if "/Annots" not in page:
        continue
    for annot in page["/Annots"]:
        # Resolve the entry to the annotation dictionary it refers to.
        obj = annot.get_object()
        subtype = obj["/Subtype"]
        location = obj["/Rect"]
        annotation = {"subtype": subtype, "location": location}
        print(annotation)
The following sections show how to read the most common annotation types.
Text
from PyPDF2 import PdfReader

reader = PdfReader("example.pdf")

# Print the contents of every "/Text" annotation in the document.
for page in reader.pages:
    # Pages without annotations have no "/Annots" entry at all.
    if "/Annots" not in page:
        continue
    for annot in page["/Annots"]:
        # Resolve the entry once and reuse the resulting dictionary.
        note = annot.get_object()
        if note["/Subtype"] == "/Text":
            print(note["/Contents"])
Highlights
from PyPDF2 import PdfReader

reader = PdfReader("commented.pdf")

# Read the quadrilateral coordinates of every "/Highlight" annotation.
for page in reader.pages:
    if "/Annots" in page:
        for annot in page["/Annots"]:
            obj = annot.get_object()
            if obj["/Subtype"] == "/Highlight":
                coords = obj["/QuadPoints"]
                # "/QuadPoints" holds 8 numbers per quadrilateral, and a
                # highlight spanning several lines has several of them, so
                # unpack the array in groups of 8 instead of all at once.
                for start in range(0, len(coords) - 7, 8):
                    x1, y1, x2, y2, x3, y3, x4, y4 = coords[start : start + 8]
Attachments
from PyPDF2 import PdfReader

reader = PdfReader("example.pdf")

# Maps each attachment's file name to the bytes of the embedded file.
attachments = {}
for page in reader.pages:
    if "/Annots" in page:
        for annot in page["/Annots"]:
            # Resolve the entry to the annotation dictionary it refers to.
            annotobj = annot.get_object()
            if annotobj["/Subtype"] == "/FileAttachment":
                # "/FS" is the file specification: "/F" is the file name and
                # "/EF" -> "/F" is the embedded file stream.
                fileobj = annotobj["/FS"]
                attachments[fileobj["/F"]] = fileobj["/EF"]["/F"].get_data()