Source code for PyPDF2._merger

# Copyright (c) 2006, Mathieu Fenniak
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
# * The name of the author may not be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

import warnings
from io import BytesIO, FileIO, IOBase
from pathlib import Path
from types import TracebackType
from typing import (
    Any,
    Dict,
    Iterable,
    List,
    Optional,
    Tuple,
    Type,
    Union,
    cast,
)

from ._encryption import Encryption
from ._page import PageObject
from ._reader import PdfReader
from ._utils import (
    StrByteType,
    deprecation_bookmark,
    deprecation_with_replacement,
    str_,
)
from ._writer import PdfWriter
from .constants import GoToActionArguments
from .constants import PagesAttributes as PA
from .constants import TypArguments, TypFitArguments
from .generic import (
    PAGE_FIT,
    ArrayObject,
    Destination,
    DictionaryObject,
    Fit,
    FloatObject,
    IndirectObject,
    NameObject,
    NullObject,
    NumberObject,
    OutlineItem,
    TextStringObject,
    TreeObject,
)
from .pagerange import PageRange, PageRangeSpec
from .types import FitType, LayoutType, OutlineType, PagemodeType, ZoomArgType

# Message of the RuntimeError raised by PdfMerger methods that need the
# underlying PdfWriter after PdfMerger.close() has set ``self.output`` to None.
ERR_CLOSED_WRITER = "close() was called and thus the writer cannot be used anymore"


class _MergedPage:
    """
    Bookkeeping record for a single source page queued for merging.

    Instances are created by ``PdfMerger.merge`` and consumed by
    ``PdfMerger.write``.
    """

    def __init__(self, pagedata: PageObject, src: PdfReader, id: int) -> None:
        # Identifier handed out by the merger; outline items and named
        # destinations refer to the page through this number.
        self.id = id
        # The page object taken from the source reader, and that reader.
        self.pagedata, self.src = pagedata, src
        # Filled in by PdfMerger.write() once the page has been added to the
        # output writer; until then there is no output reference.
        self.out_pagedata = None

class PdfMerger:
    """
    Initialize a ``PdfMerger`` object.

    ``PdfMerger`` merges multiple PDFs into a single PDF.
    It can concatenate, slice, insert, or any combination of the above.

    See the functions :meth:`merge()<merge>` (or :meth:`append()<append>`)
    and :meth:`write()<write>` for usage information.

    :param bool strict: Determines whether user should be warned of all
            problems and also causes some correctable problems to be fatal.
            Defaults to ``False``.
    :param fileobj: Output file. Can be a filename or any kind of
            file-like object.
    """

    @deprecation_bookmark(bookmarks="outline")
    def __init__(
        self, strict: bool = False, fileobj: Union[Path, StrByteType] = ""
    ) -> None:
        # (stream, reader) pairs; the streams are closed again in close().
        self.inputs: List[Tuple[Any, PdfReader]] = []
        # _MergedPage records in output order.
        self.pages: List[Any] = []
        # Set to None by close(); most methods refuse to work afterwards.
        self.output: Optional[PdfWriter] = PdfWriter()
        self.outline: OutlineType = []
        self.named_dests: List[Any] = []
        # Next identifier to hand out to a _MergedPage.
        self.id_count = 0
        # Only used by __exit__ to decide whether (and where) to write.
        self.fileobj = fileobj
        self.strict = strict

    def __enter__(self) -> "PdfMerger":
        # There is nothing to do.
        return self

    def __exit__(
        self,
        exc_type: Optional[Type[BaseException]],
        exc: Optional[BaseException],
        traceback: Optional[TracebackType],
    ) -> None:
        """Write to the fileobj and close the merger."""
        # close() must run even when write() fails, otherwise every input
        # stream opened by merge() would stay open.
        try:
            if self.fileobj:
                self.write(self.fileobj)
        finally:
            self.close()

    @deprecation_bookmark(bookmark="outline_item", import_bookmarks="import_outline")
    def merge(
        self,
        page_number: Optional[int] = None,
        fileobj: Union[Path, StrByteType, PdfReader] = None,
        outline_item: Optional[str] = None,
        pages: Optional[PageRangeSpec] = None,
        import_outline: bool = True,
        position: Optional[int] = None,  # deprecated
    ) -> None:
        """
        Merge the pages from the given file into the output file at the
        specified page number.

        :param int page_number: The *page number* to insert this file. File will
            be inserted after the given number.

        :param fileobj: A File Object or an object that supports the standard
            read and seek methods similar to a File Object. Could also be a
            string representing a path to a PDF file.

        :param str outline_item: Optionally, you may specify an outline item
            (previously referred to as a 'bookmark') to be applied at the
            beginning of the included file by supplying the text of the outline item.

        :param pages: can be a :class:`PageRange<PyPDF2.pagerange.PageRange>`
            or a ``(start, stop[, step])`` tuple
            to merge only the specified range of pages from the source
            document into the output document.
            Can also be a list of pages to merge.

        :param bool import_outline: You may prevent the source document's
            outline (collection of outline items, previously referred to as
            'bookmarks') from being imported by specifying this as ``False``.

        :raises ValueError: if both ``page_number`` and the deprecated
            ``position`` are given, or if ``page_number`` or ``fileobj`` is
            missing.
        :raises TypeError: if ``pages`` is not ``None``, a ``PageRange``,
            a list or a tuple.
        """
        if position is not None:  # deprecated
            if page_number is None:
                page_number = position
                old_term = "position"
                new_term = "page_number"
                warnings.warn(
                    (
                        f"{old_term} is deprecated as an argument and will be "
                        f"removed in PyPDF2=4.0.0. Use {new_term} instead"
                    ),
                    DeprecationWarning,
                )
            else:
                raise ValueError(
                    "The argument position of merge is deprecated. Use page_number only."
                )

        if page_number is None:  # deprecated
            # The parameter is only marked as Optional as long as
            # position is not fully deprecated
            raise ValueError("page_number may not be None")
        if fileobj is None:  # deprecated
            # The argument is only Optional due to the deprecated position
            # argument
            raise ValueError("fileobj may not be None")

        stream, encryption_obj = self._create_stream(fileobj)

        # Create a new PdfReader instance using the stream
        # (either file or BytesIO or StringIO) created above
        try:
            reader = PdfReader(stream, strict=self.strict)  # type: ignore[arg-type]
        except Exception:
            # The stream is not registered in self.inputs yet, so close()
            # would never reach it; release it here before propagating.
            stream.close()
            raise
        self.inputs.append((stream, reader))
        if encryption_obj is not None:
            reader._encryption = encryption_obj

        # Find the range of pages to merge.
        if pages is None:
            pages = (0, len(reader.pages))
        elif isinstance(pages, PageRange):
            pages = pages.indices(len(reader.pages))
        elif isinstance(pages, list):
            pass
        elif not isinstance(pages, tuple):
            raise TypeError('"pages" must be a tuple of (start, stop[, step])')

        outline = []
        if import_outline:
            outline = reader.outline
            outline = self._trim_outline(reader, outline, pages)

        if outline_item:
            outline_item_typ = OutlineItem(
                TextStringObject(outline_item),
                NumberObject(self.id_count),
                Fit.fit(),
            )
            self.outline += [outline_item_typ, outline]  # type: ignore
        else:
            self.outline += outline

        dests = reader.named_destinations
        trimmed_dests = self._trim_dests(reader, dests, pages)
        self.named_dests += trimmed_dests

        # Gather all the pages that are going to be merged.
        # A list holds explicit page indices; only a tuple is a
        # (start, stop[, step]) range specification. Treating a list as
        # range arguments would select the wrong pages (or raise).
        page_indices = pages if isinstance(pages, list) else range(*pages)
        srcpages = []
        for index in page_indices:
            page_id = self.id_count
            self.id_count += 1
            srcpages.append(_MergedPage(reader.pages[index], reader, page_id))

        self._associate_dests_to_pages(srcpages)
        self._associate_outline_items_to_pages(srcpages)

        # Slice to insert the pages at the specified page_number
        self.pages[page_number:page_number] = srcpages

    def _create_stream(
        self, fileobj: Union[Path, StrByteType, PdfReader]
    ) -> Tuple[IOBase, Optional[Encryption]]:
        """
        Turn ``fileobj`` into a stream that the merger owns.

        :param fileobj: a path (``str`` or ``Path``), a ``PdfReader``, or any
            object that has both ``seek`` and ``read``.
        :return: ``(stream, encryption_obj)``; ``encryption_obj`` is the
            ``_encryption`` attribute of a passed ``PdfReader`` if it is
            truthy, otherwise ``None``.
        :raises NotImplementedError: if ``fileobj`` is none of the above.
        """
        # If the fileobj parameter is a string, assume it is a path
        # and create a file object at that location. If it is a file,
        # copy the file's contents into a BytesIO stream object; if
        # it is a PdfReader, copy that reader's stream into a
        # BytesIO stream.
        encryption_obj = None
        stream: IOBase
        if isinstance(fileobj, (str, Path)):
            stream = FileIO(fileobj, "rb")
        elif isinstance(fileobj, PdfReader):
            if fileobj._encryption:
                encryption_obj = fileobj._encryption
            orig_tell = fileobj.stream.tell()
            fileobj.stream.seek(0)
            stream = BytesIO(fileobj.stream.read())

            # reset the stream to its original location
            fileobj.stream.seek(orig_tell)
        elif hasattr(fileobj, "seek") and hasattr(fileobj, "read"):
            fileobj.seek(0)
            filecontent = fileobj.read()
            stream = BytesIO(filecontent)
        else:
            raise NotImplementedError(
                "PdfMerger.merge requires an object that PdfReader can parse. "
                "Typically, that is a Path or a string representing a Path, "
                "a file object, or an object implementing .seek and .read. "
                "Passing a PdfReader directly works as well."
            )
        return stream, encryption_obj

    @deprecation_bookmark(bookmark="outline_item", import_bookmarks="import_outline")
    def append(
        self,
        fileobj: Union[StrByteType, PdfReader, Path],
        outline_item: Optional[str] = None,
        pages: Union[
            None, PageRange, Tuple[int, int], Tuple[int, int, int], List[int]
        ] = None,
        import_outline: bool = True,
    ) -> None:
        """
        Identical to the :meth:`merge()<merge>` method, but assumes you want to
        concatenate all pages onto the end of the file instead of specifying a
        position.

        :param fileobj: A File Object or an object that supports the standard
            read and seek methods similar to a File Object. Could also be a
            string representing a path to a PDF file.

        :param str outline_item: Optionally, you may specify an outline item
            (previously referred to as a 'bookmark') to be applied at the
            beginning of the included file by supplying the text of the outline item.

        :param pages: can be a :class:`PageRange<PyPDF2.pagerange.PageRange>`
            or a ``(start, stop[, step])`` tuple
            to merge only the specified range of pages from the source
            document into the output document.
            Can also be a list of pages to append.

        :param bool import_outline: You may prevent the source document's
            outline (collection of outline items, previously referred to as
            'bookmarks') from being imported by specifying this as ``False``.
        """
        self.merge(len(self.pages), fileobj, outline_item, pages, import_outline)

    def write(self, fileobj: Union[Path, StrByteType]) -> None:
        """
        Write all data that has been merged to the given output file.

        :param fileobj: Output file. Can be a filename or any kind of
            file-like object.
        :raises RuntimeError: if :meth:`close` has already been called.
        """
        if self.output is None:
            raise RuntimeError(ERR_CLOSED_WRITER)

        # Add pages to the PdfWriter and remember, for every merged page, the
        # reference of the page object that was just appended to the output.
        for page in self.pages:
            self.output.add_page(page.pagedata)
            pages_obj = cast(Dict[str, Any], self.output._pages.get_object())
            page.out_pagedata = self.output.get_reference(
                pages_obj[PA.KIDS][-1].get_object()
            )

        # Once all pages are added, create outline items to point at those pages
        self._write_dests()
        self._write_outline()

        # Write the output to the file
        my_file, ret_fileobj = self.output.write(fileobj)

        # Close the returned file object only when the writer reports
        # (through the first element) that it should be closed here.
        if my_file:
            ret_fileobj.close()

    def close(self) -> None:
        """Shut all file descriptors (input and output) and clear all memory usage."""
        self.pages = []
        for fo, _reader in self.inputs:
            fo.close()

        self.inputs = []
        self.output = None

    def add_metadata(self, infos: Dict[str, Any]) -> None:
        """
        Add custom metadata to the output.

        :param dict infos: a Python dictionary where each key is a field
            and each value is your new metadata.
            Example: ``{u'/Title': u'My title'}``
        :raises RuntimeError: if :meth:`close` has already been called.
        """
        if self.output is None:
            raise RuntimeError(ERR_CLOSED_WRITER)
        self.output.add_metadata(infos)

    def addMetadata(self, infos: Dict[str, Any]) -> None:  # pragma: no cover
        """
        .. deprecated:: 1.28.0

            Use :meth:`add_metadata` instead.
        """
        deprecation_with_replacement("addMetadata", "add_metadata")
        self.add_metadata(infos)

    def setPageLayout(self, layout: LayoutType) -> None:  # pragma: no cover
        """
        .. deprecated:: 1.28.0

            Use :meth:`set_page_layout` instead.
        """
        deprecation_with_replacement("setPageLayout", "set_page_layout")
        self.set_page_layout(layout)

    def set_page_layout(self, layout: LayoutType) -> None:
        """
        Set the page layout.

        :param str layout: The page layout to be used

        .. list-table:: Valid ``layout`` arguments
           :widths: 50 200

           * - /NoLayout
             - Layout explicitly not specified
           * - /SinglePage
             - Show one page at a time
           * - /OneColumn
             - Show one column at a time
           * - /TwoColumnLeft
             - Show pages in two columns, odd-numbered pages on the left
           * - /TwoColumnRight
             - Show pages in two columns, odd-numbered pages on the right
           * - /TwoPageLeft
             - Show two pages at a time, odd-numbered pages on the left
           * - /TwoPageRight
             - Show two pages at a time, odd-numbered pages on the right

        :raises RuntimeError: if :meth:`close` has already been called.
        """
        if self.output is None:
            raise RuntimeError(ERR_CLOSED_WRITER)
        self.output._set_page_layout(layout)

    def setPageMode(self, mode: PagemodeType) -> None:  # pragma: no cover
        """
        .. deprecated:: 1.28.0

            Use :meth:`set_page_mode` instead.
        """
        deprecation_with_replacement("setPageMode", "set_page_mode", "3.0.0")
        self.set_page_mode(mode)

    def set_page_mode(self, mode: PagemodeType) -> None:
        """
        Set the page mode.

        :param str mode: The page mode to use.

        .. list-table:: Valid ``mode`` arguments
           :widths: 50 200

           * - /UseNone
             - Do not show outline or thumbnails panels
           * - /UseOutlines
             - Show outline (aka bookmarks) panel
           * - /UseThumbs
             - Show page thumbnails panel
           * - /FullScreen
             - Fullscreen view
           * - /UseOC
             - Show Optional Content Group (OCG) panel
           * - /UseAttachments
             - Show attachments panel

        :raises RuntimeError: if :meth:`close` has already been called.
        """
        if self.output is None:
            raise RuntimeError(ERR_CLOSED_WRITER)
        self.output.set_page_mode(mode)

    def _trim_dests(
        self,
        pdf: PdfReader,
        dests: Dict[str, Dict[str, Any]],
        pages: Union[Tuple[int, int], Tuple[int, int, int], List[int]],
    ) -> List[Dict[str, Any]]:
        """
        Remove named destinations that are not a part of the specified page set.

        :param pdf: reader whose pages the destinations refer to.
        :param dests: named destinations of ``pdf``, keyed by name.
        :param pages: explicit list of page indices, or range arguments.
        :return: the destinations whose page is in ``pages``; their "/Page"
            entry is replaced by the resolved page object.
        """
        new_dests = []
        lst = pages if isinstance(pages, list) else list(range(*pages))
        for key, obj in dests.items():
            for j in lst:
                if pdf.pages[j].get_object() == obj["/Page"].get_object():
                    obj[NameObject("/Page")] = obj["/Page"].get_object()
                    assert str_(key) == str_(obj["/Title"])
                    new_dests.append(obj)
                    break
        return new_dests

    def _trim_outline(
        self,
        pdf: PdfReader,
        outline: OutlineType,
        pages: Union[Tuple[int, int], Tuple[int, int, int], List[int]],
    ) -> OutlineType:
        """
        Remove outline item entries that are not a part of the specified page set.

        :param pdf: reader whose pages the outline items refer to.
        :param outline: (possibly nested) outline; a nested list holds the
            children of the item preceding it.
        :param pages: explicit list of page indices, or range arguments.
        :return: the trimmed outline with the same nesting convention.
        """
        new_outline = []
        prev_header_added = True
        lst = pages if isinstance(pages, list) else list(range(*pages))
        for i, outline_item in enumerate(outline):
            if isinstance(outline_item, list):
                sub = self._trim_outline(pdf, outline_item, lst)  # type: ignore
                if sub:
                    # Keep the parent of surviving children even when the
                    # parent itself points outside the selected pages.
                    if not prev_header_added:
                        new_outline.append(outline[i - 1])
                    new_outline.append(sub)  # type: ignore
            else:
                prev_header_added = False
                for j in lst:
                    if outline_item["/Page"] is None:
                        continue
                    if pdf.pages[j].get_object() == outline_item["/Page"].get_object():
                        outline_item[NameObject("/Page")] = outline_item[
                            "/Page"
                        ].get_object()
                        new_outline.append(outline_item)
                        prev_header_added = True
                        break
        return new_outline

    def _write_dests(self) -> None:
        """
        Hand the collected named destinations over to the output writer.

        For each destination, the "/Page" entry is replaced by the output
        reference of the merged page with the matching id, if there is one.

        :raises RuntimeError: if :meth:`close` has already been called.
        """
        if self.output is None:
            raise RuntimeError(ERR_CLOSED_WRITER)
        for named_dest in self.named_dests:
            pageno = None
            if "/Page" in named_dest:
                for pageno, page in enumerate(self.pages):  # noqa: B007
                    if page.id == named_dest["/Page"]:
                        named_dest[NameObject("/Page")] = page.out_pagedata
                        break

            if pageno is not None:
                self.output.add_named_destination_object(named_dest)

    @deprecation_bookmark(bookmarks="outline")
    def _write_outline(
        self,
        outline: Optional[Iterable[OutlineItem]] = None,
        parent: Optional[TreeObject] = None,
    ) -> None:
        """
        Recursively add the collected outline to the output writer.

        :param outline: outline level to write; defaults to ``self.outline``.
        :param parent: value passed as parent when adding items of this level.
        :raises RuntimeError: if :meth:`close` has already been called.
        """
        if self.output is None:
            raise RuntimeError(ERR_CLOSED_WRITER)
        if outline is None:
            outline = self.outline  # type: ignore
        assert outline is not None, "hint for mypy"  # TODO: is that true?

        last_added = None
        for outline_item in outline:
            if isinstance(outline_item, list):
                # A nested list holds the children of the item added last.
                self._write_outline(outline_item, last_added)
                continue

            page_no = None
            if "/Page" in outline_item:
                for page_no, page in enumerate(self.pages):  # noqa: B007
                    if page.id == outline_item["/Page"]:
                        self._write_outline_item_on_page(outline_item, page)
                        break
            if page_no is not None:
                del outline_item["/Page"], outline_item["/Type"]
                last_added = self.output.add_outline_item_dict(outline_item, parent)

    @deprecation_bookmark(bookmark="outline_item")
    def _write_outline_item_on_page(
        self, outline_item: Union[OutlineItem, Destination], page: _MergedPage
    ) -> None:
        """
        Replace the fit arguments of ``outline_item`` by a "/GoTo" action.

        The action's destination array consists of ``page.id``, the item's
        "/Type" and the numeric arguments that belong to that fit type.
        Missing or null arguments are written as 0.

        :param outline_item: the item to modify in place.
        :param page: the merged page the item points at.
        """
        oi_type = cast(str, outline_item["/Type"])
        args = [NumberObject(page.id), NameObject(oi_type)]
        fit2arg_keys: Dict[str, Tuple[str, ...]] = {
            TypFitArguments.FIT_H: (TypArguments.TOP,),
            TypFitArguments.FIT_BH: (TypArguments.TOP,),
            TypFitArguments.FIT_V: (TypArguments.LEFT,),
            TypFitArguments.FIT_BV: (TypArguments.LEFT,),
            TypFitArguments.XYZ: (TypArguments.LEFT, TypArguments.TOP, "/Zoom"),
            TypFitArguments.FIT_R: (
                TypArguments.LEFT,
                TypArguments.BOTTOM,
                TypArguments.RIGHT,
                TypArguments.TOP,
            ),
        }
        for arg_key in fit2arg_keys.get(oi_type, tuple()):
            if arg_key in outline_item and not isinstance(
                outline_item[arg_key], NullObject
            ):
                args.append(FloatObject(outline_item[arg_key]))
            else:
                args.append(FloatObject(0))
            # A missing argument was just replaced by 0 above; deleting it
            # unconditionally would raise KeyError for exactly that case.
            if arg_key in outline_item:
                del outline_item[arg_key]

        outline_item[NameObject("/A")] = DictionaryObject(
            {
                NameObject(GoToActionArguments.S): NameObject("/GoTo"),
                NameObject(GoToActionArguments.D): ArrayObject(args),
            }
        )

    def _associate_dests_to_pages(self, pages: List[_MergedPage]) -> None:
        """
        Replace the page object of each named destination by a merged-page id.

        Destinations whose "/Page" already is a ``NumberObject`` are skipped.

        :param pages: the merged pages that were just gathered.
        :raises ValueError: if a destination's page is not among ``pages``.
        """
        for named_dest in self.named_dests:
            pageno = None
            dest_page = named_dest["/Page"]

            if isinstance(dest_page, NumberObject):
                continue

            for page in pages:
                if dest_page.get_object() == page.pagedata.get_object():
                    pageno = page.id

            if pageno is None:
                raise ValueError(
                    f"Unresolved named destination '{named_dest['/Title']}'"
                )
            named_dest[NameObject("/Page")] = NumberObject(pageno)

    @deprecation_bookmark(bookmarks="outline")
    def _associate_outline_items_to_pages(
        self, pages: List[_MergedPage], outline: Optional[Iterable[OutlineItem]] = None
    ) -> None:
        """
        Replace the page object of each outline item by a merged-page id.

        Items whose "/Page" already is a ``NumberObject`` are skipped; items
        whose page is not among ``pages`` are left unchanged.

        :param pages: the merged pages that were just gathered.
        :param outline: outline level to process; defaults to ``self.outline``.
        """
        if outline is None:
            outline = self.outline  # type: ignore # TODO: self.bookmarks can be None!
        assert outline is not None, "hint for mypy"
        for outline_item in outline:
            if isinstance(outline_item, list):
                self._associate_outline_items_to_pages(pages, outline_item)
                continue

            pageno = None
            outline_item_page = outline_item["/Page"]

            if isinstance(outline_item_page, NumberObject):
                continue

            for p in pages:
                if outline_item_page.get_object() == p.pagedata.get_object():
                    pageno = p.id

            if pageno is not None:
                outline_item[NameObject("/Page")] = NumberObject(pageno)

    @deprecation_bookmark(bookmark="outline_item")
    def find_outline_item(
        self,
        outline_item: Dict[str, Any],
        root: Optional[OutlineType] = None,
    ) -> Optional[List[int]]:
        """
        Search the outline for ``outline_item``.

        An entry matches if it equals ``outline_item`` or if its "/Title"
        equals ``outline_item``.

        :param outline_item: the item (or title) to look for.
        :param root: outline level to search; defaults to ``self.outline``.
        :return: the list of indices leading to the first match, one index
            per nesting level, or ``None`` if nothing matches.
        """
        if root is None:
            root = self.outline

        for i, oi_enum in enumerate(root):
            if isinstance(oi_enum, list):
                # oi_enum is still an inner node
                # (OutlineType, if recursive types were supported by mypy)
                res = self.find_outline_item(outline_item, oi_enum)  # type: ignore
                if res:
                    return [i] + res
            elif (
                oi_enum == outline_item
                or cast(Dict[Any, Any], oi_enum["/Title"]) == outline_item
            ):
                # we found a leaf node
                return [i]

        return None

    @deprecation_bookmark(bookmark="outline_item")
    def find_bookmark(
        self,
        outline_item: Dict[str, Any],
        root: Optional[OutlineType] = None,
    ) -> Optional[List[int]]:  # pragma: no cover
        """
        .. deprecated:: 2.9.0

            Use :meth:`find_outline_item` instead.
        """
        return self.find_outline_item(outline_item, root)

    def add_outline_item(
        self,
        title: str,
        page_number: Optional[int] = None,
        parent: Union[None, TreeObject, IndirectObject] = None,
        color: Optional[Tuple[float, float, float]] = None,
        bold: bool = False,
        italic: bool = False,
        fit: Fit = PAGE_FIT,
        pagenum: Optional[int] = None,  # deprecated
    ) -> IndirectObject:
        """
        Add an outline item (commonly referred to as a "Bookmark") to this PDF file.

        :param str title: Title to use for this outline item.
        :param int page_number: Page number this outline item will point to.
        :param parent: A reference to a parent outline item to create nested
            outline items.
        :param tuple color: Color of the outline item's font as a red, green, blue tuple
            from 0.0 to 1.0
        :param bool bold: Outline item font is bold
        :param bool italic: Outline item font is italic
        :param Fit fit: The fit of the destination page.
        :raises ValueError: if both ``page_number`` and the deprecated
            ``pagenum`` are given, or if neither is given.
        :raises RuntimeError: if :meth:`close` has already been called.
        """
        if page_number is not None and pagenum is not None:
            raise ValueError(
                "The argument pagenum of add_outline_item is deprecated. Use page_number only."
            )
        if pagenum is not None:
            old_term = "pagenum"
            new_term = "page_number"
            warnings.warn(
                (
                    f"{old_term} is deprecated as an argument and will be "
                    f"removed in PyPDF2==4.0.0. Use {new_term} instead"
                ),
                DeprecationWarning,
            )
            page_number = pagenum
        if page_number is None:
            raise ValueError("page_number may not be None")
        writer = self.output
        if writer is None:
            raise RuntimeError(ERR_CLOSED_WRITER)
        return writer.add_outline_item(
            title,
            page_number,
            parent,
            None,
            color,
            bold,
            italic,
            fit,
        )

    def addBookmark(
        self,
        title: str,
        pagenum: int,  # deprecated, but the whole method is deprecated
        parent: Union[None, TreeObject, IndirectObject] = None,
        color: Optional[Tuple[float, float, float]] = None,
        bold: bool = False,
        italic: bool = False,
        fit: FitType = "/Fit",
        *args: ZoomArgType,
    ) -> IndirectObject:  # pragma: no cover
        """
        .. deprecated:: 1.28.0

            Use :meth:`add_outline_item` instead.
        """
        deprecation_with_replacement("addBookmark", "add_outline_item", "3.0.0")
        return self.add_outline_item(
            title,
            pagenum,
            parent,
            color,
            bold,
            italic,
            Fit(fit_type=fit, fit_args=args),
        )

    def add_bookmark(
        self,
        title: str,
        pagenum: int,  # deprecated, but the whole method is deprecated already
        parent: Union[None, TreeObject, IndirectObject] = None,
        color: Optional[Tuple[float, float, float]] = None,
        bold: bool = False,
        italic: bool = False,
        fit: FitType = "/Fit",
        *args: ZoomArgType,
    ) -> IndirectObject:  # pragma: no cover
        """
        .. deprecated:: 2.9.0

            Use :meth:`add_outline_item` instead.
        """
        # Report the name of the method that was actually called.
        deprecation_with_replacement("add_bookmark", "add_outline_item", "3.0.0")
        return self.add_outline_item(
            title,
            pagenum,
            parent,
            color,
            bold,
            italic,
            Fit(fit_type=fit, fit_args=args),
        )

    def addNamedDestination(self, title: str, pagenum: int) -> None:  # pragma: no cover
        """
        .. deprecated:: 1.28.0

            Use :meth:`add_named_destination` instead.
        """
        deprecation_with_replacement(
            "addNamedDestination", "add_named_destination", "3.0.0"
        )
        return self.add_named_destination(title, pagenum)

    def add_named_destination(
        self,
        title: str,
        page_number: Optional[int] = None,
        pagenum: Optional[int] = None,
    ) -> None:
        """
        Add a destination to the output.

        The destination is only recorded here; it is handed to the output
        writer when :meth:`write` is called.

        :param str title: Title to use
        :param int page_number: Page number this destination points at.
        :raises ValueError: if both ``page_number`` and the deprecated
            ``pagenum`` are given, or if neither is given.
        """
        if page_number is not None and pagenum is not None:
            raise ValueError(
                "The argument pagenum of add_named_destination is deprecated. Use page_number only."
            )
        if pagenum is not None:
            old_term = "pagenum"
            new_term = "page_number"
            warnings.warn(
                (
                    f"{old_term} is deprecated as an argument and will be "
                    f"removed in PyPDF2==4.0.0. Use {new_term} instead"
                ),
                DeprecationWarning,
            )
            page_number = pagenum
        if page_number is None:
            raise ValueError("page_number may not be None")
        dest = Destination(
            TextStringObject(title),
            NumberObject(page_number),
            Fit.fit_horizontally(top=826),
        )
        self.named_dests.append(dest)
class PdfFileMerger(PdfMerger):  # pragma: no cover
    """
    Legacy name of :class:`PdfMerger`.

    Construction first reports the deprecation through
    ``deprecation_with_replacement`` and then forwards all arguments to
    ``PdfMerger``, with ``strict`` defaulting to ``True`` instead of ``False``.
    """

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        deprecation_with_replacement("PdfFileMerger", "PdfMerger", "3.0.0")

        # ``strict`` is the first positional parameter of PdfMerger, so it
        # counts as supplied when any positional argument is present.
        strict_supplied = bool(args) or "strict" in kwargs
        if not strict_supplied:
            # maintain the default of the legacy class
            kwargs["strict"] = True
        super().__init__(*args, **kwargs)