Reading PDF Annotations
PDF 1.7 defines 25 different annotation types:
Text
Link
FreeText
Line, Square, Circle, Polygon, PolyLine, Highlight, Underline, Squiggly, StrikeOut
Stamp, Caret, Ink
Popup
FileAttachment
Sound, Movie
Widget, Screen
PrinterMark
TrapNet
Watermark
3D
Reading the most common ones is described here.
Text
from PyPDF2 import PdfReader

reader = PdfReader("example.pdf")

# Print the "/Contents" entry of every annotation whose subtype is "/Text".
for page in reader.pages:
    # Skip pages that carry no "/Annots" entry.
    if "/Annots" not in page:
        continue
    for annot_ref in page["/Annots"]:
        # Look the annotation object up once and reuse it below.
        annot_obj = annot_ref.get_object()
        if annot_obj["/Subtype"] != "/Text":
            continue
        print(annot_obj["/Contents"])
Highlights
from PyPDF2 import PdfReader

reader = PdfReader("commented.pdf")

# Read the corner coordinates of every highlight annotation.
for page in reader.pages:
    if "/Annots" in page:
        for annot in page["/Annots"]:
            # Look the annotation object up once and reuse it below.
            annot_obj = annot.get_object()
            if annot_obj["/Subtype"] == "/Highlight":
                coords = annot_obj["/QuadPoints"]
                # "/QuadPoints" holds 8 numbers per quadrilateral, and one
                # highlight may consist of several quadrilaterals (e.g. when
                # it spans more than one line), so walk it in groups of 8
                # instead of unpacking the whole array at once.
                for start in range(0, len(coords), 8):
                    x1, y1, x2, y2, x3, y3, x4, y4 = coords[start:start + 8]
Attachments
from PyPDF2 import PdfReader

reader = PdfReader("example.pdf")

# Collect the data of every file-attachment annotation, keyed by the "/F"
# entry of its file specification.
attachments = {}
for page in reader.pages:
    if "/Annots" in page:
        for annot in page["/Annots"]:
            # Look the annotation object up once; the subtype check and the
            # "/FS" lookup below must both use this same object.
            annot_obj = annot.get_object()
            if annot_obj["/Subtype"] == "/FileAttachment":
                fileobj = annot_obj["/FS"]
                attachments[fileobj["/F"]] = fileobj["/EF"]["/F"].get_data()