# Source code for PyPDF2.xmp

import datetime
import decimal
import re
from typing import (
    Any,
    Callable,
    Dict,
    Iterator,
    List,
    Optional,
    TypeVar,
    Union,
)
from xml.dom.minidom import Document
from xml.dom.minidom import Element as XmlElement
from xml.dom.minidom import parseString

from ._utils import StreamType, deprecate_with_replacement
from .generic import ContentStream, PdfObject

# XML namespace URIs used by this module when looking up XMP elements and
# attributes via the namespace-aware DOM methods.
RDF_NAMESPACE = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
DC_NAMESPACE = "http://purl.org/dc/elements/1.1/"
XMP_NAMESPACE = "http://ns.adobe.com/xap/1.0/"
PDF_NAMESPACE = "http://ns.adobe.com/pdf/1.3/"
XMPMM_NAMESPACE = "http://ns.adobe.com/xap/1.0/mm/"

# What is the PDFX namespace, you might ask?  I might ask that too.  It's
# a completely undocumented namespace used to place "custom metadata"
# properties, which are arbitrary metadata properties with no semantic or
# documented meaning.  Elements in the namespace are key/value-style storage,
# where the element name is the key and the content is the value.  The keys
# are transformed into valid XML identifiers by substituting an invalid
# identifier character with \u2182 followed by the four-digit unicode hex ID
# of the original character.  A key like "my car" is therefore
# "my\u21820020car".
#
# \u2182, in case you're wondering, is the unicode character
# \N{ROMAN NUMERAL TEN THOUSAND}, a straightforward and obvious choice for
# escaping characters.
#
# Intentional use of the pdfx namespace is strongly discouraged.  A
# custom data schema and sensible XML elements could be used instead, as is
# suggested by Adobe's own documentation on XMP (under "Extensibility of
# Schemas").
#
# Information presented here on the /pdfx/ schema is a result of limited
# reverse engineering, and does not constitute a full specification.
PDFX_NAMESPACE = "http://ns.adobe.com/pdfx/1.3/"

# Pattern for the date profile YYYY[-MM[-DD[Thh:mm[:ss[.fraction]]TZD]]].
# Only the year is mandatory; each finer-grained component is optional, but
# once a time part is present the time zone designator (TZD) is required.
# The pattern is applied with ``match``, so it is anchored at the start of
# the string only.
#
# The decimal point before the fractional seconds is escaped so that only a
# literal "." is accepted (an unescaped "." would match any character and
# let garbage such as "07x5" through as a seconds value).
iso8601 = re.compile(
    r"""
        (?P<year>[0-9]{4})
        (-
            (?P<month>[0-9]{2})
            (-
                (?P<day>[0-9]+)
                (T
                    (?P<hour>[0-9]{2}):
                    (?P<minute>[0-9]{2})
                    (:(?P<second>[0-9]{2}(\.[0-9]+)?))?
                    (?P<tzd>Z|[-+][0-9]{2}:[0-9]{2})
                )?
            )?
        )?
        """,
    re.VERBOSE,
)


K = TypeVar("K")


def _identity(value: K) -> K:
    """Return ``value`` untouched; the default no-op converter for getters."""
    return value


def _converter_date(value: str) -> datetime.datetime:
    """
    Convert an ISO 8601 style date string into a naive UTC ``datetime``.

    Components missing from ``value`` default to the first month, first day
    and midnight.  If a time zone designator other than ``Z`` is present, the
    offset is removed so that the returned value is expressed in UTC.

    :param value: date string that must match the ``iso8601`` pattern.
    :return: a ``datetime.datetime`` without ``tzinfo``, in UTC.
    :raises ValueError: if ``value`` does not match the ``iso8601`` pattern
        (``datetime.datetime`` itself also raises ``ValueError`` for
        out-of-range components).
    """
    matches = iso8601.match(value)
    if matches is None:
        raise ValueError(f"Invalid date format: {value}")
    year = int(matches.group("year"))
    month = int(matches.group("month") or "1")
    day = int(matches.group("day") or "1")
    hour = int(matches.group("hour") or "0")
    minute = int(matches.group("minute") or "0")

    # Seconds may carry a fraction; Decimal avoids binary float rounding when
    # splitting it into whole seconds and microseconds.
    second = decimal.Decimal(matches.group("second") or "0")
    whole_seconds = second.to_integral(decimal.ROUND_FLOOR)
    microseconds = int((second - whole_seconds) * 1000000)

    dt = datetime.datetime(
        year, month, day, hour, minute, int(whole_seconds), microseconds
    )

    tzd = matches.group("tzd") or "Z"
    if tzd != "Z":
        # Take the sign from the leading character instead of from the parsed
        # hour value: int("+00") is 0, so a "+00:30" offset would otherwise
        # lose its sign and be applied in the wrong direction.
        sign = -1 if tzd[0] == "-" else 1
        tzd_hours, tzd_minutes = (int(x) for x in tzd[1:].split(":"))
        offset = datetime.timedelta(hours=tzd_hours, minutes=tzd_minutes)
        # Local time = UTC + offset, hence UTC = local time - offset.
        dt = dt - sign * offset
    return dt


def _getter_bag(
    namespace: str, name: str
) -> Callable[["XmpInformation"], Optional[List[str]]]:
    """
    Build a getter for a property stored as an unordered ``rdf:Bag``.

    :param namespace: namespace URI of the property.
    :param name: local name of the property.
    :return: a function suitable for ``property()`` that returns the text of
        every ``rdf:li`` item found in the property's bags (an empty list if
        there is none).  The result is cached in ``self.cache``.
    """

    def get(self: "XmpInformation") -> Optional[List[str]]:
        # Test for key presence rather than truthiness so that an empty
        # result is served from the cache instead of being recomputed.
        ns_cache = self.cache.get(namespace, {})
        if name in ns_cache:
            return ns_cache[name]
        retval = []
        for element in self.get_element("", namespace, name):
            for bag in element.getElementsByTagNameNS(RDF_NAMESPACE, "Bag"):
                for item in bag.getElementsByTagNameNS(RDF_NAMESPACE, "li"):
                    retval.append(self._get_text(item))
        self.cache.setdefault(namespace, {})[name] = retval
        return retval

    return get


def _getter_seq(
    namespace: str, name: str, converter: Callable[[Any], Any] = _identity
) -> Callable[["XmpInformation"], Optional[List[Any]]]:
    """
    Build a getter for a property stored as an ordered ``rdf:Seq``.

    :param namespace: namespace URI of the property.
    :param name: local name of the property.
    :param converter: callable applied to the text of every item.
    :return: a function suitable for ``property()`` that returns the
        converted ``rdf:li`` items; if the property node contains no
        ``rdf:Seq``, its own text is converted and used as the single item.
        The result is cached in ``self.cache``.
    """

    def get(self: "XmpInformation") -> Optional[List[Any]]:
        # Test for key presence rather than truthiness so that an empty
        # result is served from the cache instead of being recomputed.
        ns_cache = self.cache.get(namespace, {})
        if name in ns_cache:
            return ns_cache[name]
        retval = []
        for element in self.get_element("", namespace, name):
            seqs = element.getElementsByTagNameNS(RDF_NAMESPACE, "Seq")
            if seqs:
                for seq in seqs:
                    for item in seq.getElementsByTagNameNS(RDF_NAMESPACE, "li"):
                        retval.append(converter(self._get_text(item)))
            else:
                # No container: treat the node's own text as the only value.
                retval.append(converter(self._get_text(element)))
        self.cache.setdefault(namespace, {})[name] = retval
        return retval

    return get


def _getter_langalt(
    namespace: str, name: str
) -> Callable[["XmpInformation"], Optional[Dict[Any, Any]]]:
    """
    Build a getter for a language-alternative (``rdf:Alt``) property.

    :param namespace: namespace URI of the property.
    :param name: local name of the property.
    :return: a function suitable for ``property()`` that returns a dictionary
        mapping each item's ``xml:lang`` attribute to its text; if the
        property node contains no ``rdf:Alt``, its own text is stored under
        the key ``"x-default"``.  The result is cached in ``self.cache``.
    """

    def get(self: "XmpInformation") -> Optional[Dict[Any, Any]]:
        # Test for key presence rather than truthiness so that an empty
        # result is served from the cache instead of being recomputed.
        ns_cache = self.cache.get(namespace, {})
        if name in ns_cache:
            return ns_cache[name]
        retval = {}
        for element in self.get_element("", namespace, name):
            alts = element.getElementsByTagNameNS(RDF_NAMESPACE, "Alt")
            if alts:
                for alt in alts:
                    for item in alt.getElementsByTagNameNS(RDF_NAMESPACE, "li"):
                        retval[item.getAttribute("xml:lang")] = self._get_text(item)
            else:
                # No container: expose the plain text as the default language.
                retval["x-default"] = self._get_text(element)
        self.cache.setdefault(namespace, {})[name] = retval
        return retval

    return get


def _getter_single(
    namespace: str, name: str, converter: Callable[[str], Any] = _identity
) -> Callable[["XmpInformation"], Optional[Any]]:
    """
    Build a getter for a property holding a single simple value.

    :param namespace: namespace URI of the property.
    :param name: local name of the property.
    :param converter: callable applied to the raw string value.
    :return: a function suitable for ``property()`` that returns the converted
        value of the first matching node (attribute value or element text),
        or ``None`` if there is no such node.  The result is cached in
        ``self.cache``.
    """

    def get(self: "XmpInformation") -> Optional[Any]:
        # Test for key presence rather than truthiness so that ``None`` and
        # other falsy values are served from the cache as well.
        ns_cache = self.cache.get(namespace, {})
        if name in ns_cache:
            return ns_cache[name]
        value = None
        # Only the first node yielded for this property is considered.
        for element in self.get_element("", namespace, name):
            if element.nodeType == element.ATTRIBUTE_NODE:
                value = element.nodeValue
            else:
                value = self._get_text(element)
            break
        if value is not None:
            value = converter(value)
        self.cache.setdefault(namespace, {})[name] = value
        return value

    return get


class XmpInformation(PdfObject):
    """
    An object that represents Adobe XMP metadata.
    Usually accessed by :py:attr:`xmp_metadata()<PyPDF2.PdfReader.xmp_metadata>`
    """

    def __init__(self, stream: ContentStream) -> None:
        """
        Parse the XML held by ``stream`` and locate its ``rdf:RDF`` element.

        :param stream: stream object whose data is the XMP packet.
        """
        self.stream = stream
        doc_root: Document = parseString(self.stream.get_data())
        self.rdf_root: XmlElement = doc_root.getElementsByTagNameNS(
            RDF_NAMESPACE, "RDF"
        )[0]
        # Per-namespace cache of already converted property values.
        self.cache: Dict[Any, Any] = {}

    @property
    def rdfRoot(self) -> XmlElement:  # pragma: no cover
        deprecate_with_replacement("rdfRoot", "rdf_root", "4.0.0")
        return self.rdf_root

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes]
    ) -> None:
        """Write the metadata by delegating to the underlying stream object."""
        self.stream.write_to_stream(stream, encryption_key)

    def writeToStream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes]
    ) -> None:  # pragma: no cover
        """
        .. deprecated:: 1.28.0

            Use :meth:`write_to_stream` instead.
        """
        deprecate_with_replacement("writeToStream", "write_to_stream")
        self.write_to_stream(stream, encryption_key)

    def get_element(self, about_uri: str, namespace: str, name: str) -> Iterator[Any]:
        """
        Yield the nodes that store the property ``name`` of ``namespace``.

        For every ``rdf:Description`` whose ``rdf:about`` equals
        ``about_uri``, the matching attribute node (if any) is yielded first,
        followed by all matching descendant elements.
        """
        for desc in self.rdf_root.getElementsByTagNameNS(RDF_NAMESPACE, "Description"):
            if desc.getAttributeNS(RDF_NAMESPACE, "about") == about_uri:
                attr = desc.getAttributeNodeNS(namespace, name)
                if attr is not None:
                    yield attr
                yield from desc.getElementsByTagNameNS(namespace, name)

    def getElement(
        self, aboutUri: str, namespace: str, name: str
    ) -> Iterator[Any]:  # pragma: no cover
        """
        .. deprecated:: 1.28.0

            Use :meth:`get_element` instead.
        """
        deprecate_with_replacement("getElement", "get_element")
        return self.get_element(aboutUri, namespace, name)

    def get_nodes_in_namespace(self, about_uri: str, namespace: str) -> Iterator[Any]:
        """
        Yield every attribute and direct child node belonging to ``namespace``.

        Only ``rdf:Description`` elements whose ``rdf:about`` equals
        ``about_uri`` are inspected; attributes are yielded before children.
        """
        for desc in self.rdf_root.getElementsByTagNameNS(RDF_NAMESPACE, "Description"):
            if desc.getAttributeNS(RDF_NAMESPACE, "about") == about_uri:
                for i in range(desc.attributes.length):
                    attr = desc.attributes.item(i)
                    if attr.namespaceURI == namespace:
                        yield attr
                for child in desc.childNodes:
                    if child.namespaceURI == namespace:
                        yield child

    def getNodesInNamespace(
        self, aboutUri: str, namespace: str
    ) -> Iterator[Any]:  # pragma: no cover
        """
        .. deprecated:: 1.28.0

            Use :meth:`get_nodes_in_namespace` instead.
        """
        deprecate_with_replacement("getNodesInNamespace", "get_nodes_in_namespace")
        return self.get_nodes_in_namespace(aboutUri, namespace)

    def _get_text(self, element: XmlElement) -> str:
        """Return the concatenated data of the direct text children of ``element``."""
        return "".join(
            child.data
            for child in element.childNodes
            if child.nodeType == child.TEXT_NODE
        )

    dc_contributor = property(_getter_bag(DC_NAMESPACE, "contributor"))
    """
    Contributors to the resource (other than the authors). An unsorted
    array of names.
    """

    dc_coverage = property(_getter_single(DC_NAMESPACE, "coverage"))
    """
    Text describing the extent or scope of the resource.
    """

    dc_creator = property(_getter_seq(DC_NAMESPACE, "creator"))
    """
    A sorted array of names of the authors of the resource, listed in order
    of precedence.
    """

    dc_date = property(_getter_seq(DC_NAMESPACE, "date", _converter_date))
    """
    A sorted array of dates (datetime.datetime instances) of significance to
    the resource.  The dates and times are in UTC.
    """

    dc_description = property(_getter_langalt(DC_NAMESPACE, "description"))
    """
    A language-keyed dictionary of textual descriptions of the content of the
    resource.
    """

    dc_format = property(_getter_single(DC_NAMESPACE, "format"))
    """
    The mime-type of the resource.
    """

    dc_identifier = property(_getter_single(DC_NAMESPACE, "identifier"))
    """
    Unique identifier of the resource.
    """

    dc_language = property(_getter_bag(DC_NAMESPACE, "language"))
    """
    An unordered array specifying the languages used in the resource.
    """

    dc_publisher = property(_getter_bag(DC_NAMESPACE, "publisher"))
    """
    An unordered array of publisher names.
    """

    dc_relation = property(_getter_bag(DC_NAMESPACE, "relation"))
    """
    An unordered array of text descriptions of relationships to other
    documents.
    """

    dc_rights = property(_getter_langalt(DC_NAMESPACE, "rights"))
    """
    A language-keyed dictionary of textual descriptions of the rights the
    user has to this resource.
    """

    dc_source = property(_getter_single(DC_NAMESPACE, "source"))
    """
    Unique identifier of the work from which this resource was derived.
    """

    dc_subject = property(_getter_bag(DC_NAMESPACE, "subject"))
    """
    An unordered array of descriptive phrases or keywords that specify the
    topic of the content of the resource.
    """

    dc_title = property(_getter_langalt(DC_NAMESPACE, "title"))
    """
    A language-keyed dictionary of the title of the resource.
    """

    dc_type = property(_getter_bag(DC_NAMESPACE, "type"))
    """
    An unordered array of textual descriptions of the document type.
    """

    pdf_keywords = property(_getter_single(PDF_NAMESPACE, "Keywords"))
    """
    An unformatted text string representing document keywords.
    """

    pdf_pdfversion = property(_getter_single(PDF_NAMESPACE, "PDFVersion"))
    """
    The PDF file version, for example 1.0, 1.3.
    """

    pdf_producer = property(_getter_single(PDF_NAMESPACE, "Producer"))
    """
    The name of the tool that created the PDF document.
    """

    xmp_create_date = property(
        _getter_single(XMP_NAMESPACE, "CreateDate", _converter_date)
    )
    """
    The date and time the resource was originally created.  The date and
    time are returned as a UTC datetime.datetime object.
    """

    # NOTE(review): the snake_case properties in this class are created with
    # a getter only, so the assignments performed by the deprecated camelCase
    # setters below raise AttributeError. Left unchanged to keep the public
    # interface as it is.

    @property
    def xmp_createDate(self) -> datetime.datetime:  # pragma: no cover
        deprecate_with_replacement("xmp_createDate", "xmp_create_date", "4.0.0")
        return self.xmp_create_date

    @xmp_createDate.setter
    def xmp_createDate(self, value: datetime.datetime) -> None:  # pragma: no cover
        deprecate_with_replacement("xmp_createDate", "xmp_create_date", "4.0.0")
        self.xmp_create_date = value

    xmp_modify_date = property(
        _getter_single(XMP_NAMESPACE, "ModifyDate", _converter_date)
    )
    """
    The date and time the resource was last modified.  The date and time
    are returned as a UTC datetime.datetime object.
    """

    @property
    def xmp_modifyDate(self) -> datetime.datetime:  # pragma: no cover
        deprecate_with_replacement("xmp_modifyDate", "xmp_modify_date", "4.0.0")
        return self.xmp_modify_date

    @xmp_modifyDate.setter
    def xmp_modifyDate(self, value: datetime.datetime) -> None:  # pragma: no cover
        deprecate_with_replacement("xmp_modifyDate", "xmp_modify_date", "4.0.0")
        self.xmp_modify_date = value

    xmp_metadata_date = property(
        _getter_single(XMP_NAMESPACE, "MetadataDate", _converter_date)
    )
    """
    The date and time that any metadata for this resource was last changed.
    The date and time are returned as a UTC datetime.datetime object.
    """

    @property
    def xmp_metadataDate(self) -> datetime.datetime:  # pragma: no cover
        deprecate_with_replacement("xmp_metadataDate", "xmp_metadata_date", "4.0.0")
        return self.xmp_metadata_date

    @xmp_metadataDate.setter
    def xmp_metadataDate(self, value: datetime.datetime) -> None:  # pragma: no cover
        deprecate_with_replacement("xmp_metadataDate", "xmp_metadata_date", "4.0.0")
        self.xmp_metadata_date = value

    xmp_creator_tool = property(_getter_single(XMP_NAMESPACE, "CreatorTool"))
    """
    The name of the first known tool used to create the resource.
    """

    @property
    def xmp_creatorTool(self) -> str:  # pragma: no cover
        deprecate_with_replacement("xmp_creatorTool", "xmp_creator_tool")
        return self.xmp_creator_tool

    @xmp_creatorTool.setter
    def xmp_creatorTool(self, value: str) -> None:  # pragma: no cover
        deprecate_with_replacement("xmp_creatorTool", "xmp_creator_tool")
        self.xmp_creator_tool = value

    xmpmm_document_id = property(_getter_single(XMPMM_NAMESPACE, "DocumentID"))
    """
    The common identifier for all versions and renditions of this resource.
    """

    @property
    def xmpmm_documentId(self) -> str:  # pragma: no cover
        deprecate_with_replacement("xmpmm_documentId", "xmpmm_document_id")
        return self.xmpmm_document_id

    @xmpmm_documentId.setter
    def xmpmm_documentId(self, value: str) -> None:  # pragma: no cover
        deprecate_with_replacement("xmpmm_documentId", "xmpmm_document_id")
        self.xmpmm_document_id = value

    xmpmm_instance_id = property(_getter_single(XMPMM_NAMESPACE, "InstanceID"))
    """
    An identifier for a specific incarnation of a document, updated each
    time a file is saved.
    """

    @property
    def xmpmm_instanceId(self) -> str:  # pragma: no cover
        deprecate_with_replacement("xmpmm_instanceId", "xmpmm_instance_id")
        return self.xmpmm_instance_id

    @xmpmm_instanceId.setter
    def xmpmm_instanceId(self, value: str) -> None:  # pragma: no cover
        deprecate_with_replacement("xmpmm_instanceId", "xmpmm_instance_id")
        self.xmpmm_instance_id = value

    @property
    def custom_properties(self) -> Dict[Any, Any]:
        """
        Retrieves custom metadata properties defined in the undocumented pdfx
        metadata schema.

        :return: a dictionary of key/value items for custom metadata properties.
        :rtype: dict
        """
        if not hasattr(self, "_custom_properties"):
            # Build into a local first so that a failure part-way through
            # does not leave a partially filled dictionary cached on self.
            properties: Dict[Any, Any] = {}
            for node in self.get_nodes_in_namespace("", PDFX_NAMESPACE):
                # see documentation about PDFX_NAMESPACE earlier in file
                # Decode every "\u2182" + four hex digits escape in a single
                # pass: already decoded characters are never re-scanned, and
                # a malformed escape is kept verbatim instead of raising.
                key = re.sub(
                    "\u2182([0-9A-Fa-f]{4})",
                    lambda match: chr(int(match.group(1), base=16)),
                    node.localName,
                )
                if node.nodeType == node.ATTRIBUTE_NODE:
                    value = node.nodeValue
                else:
                    value = self._get_text(node)
                properties[key] = value
            self._custom_properties = properties
        return self._custom_properties