|
|
# vim: sw=4:expandtab:foldmethod=marker # # Copyright (c) 2006, Mathieu Fenniak # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # # * Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright notice, # this list of conditions and the following disclaimer in the documentation # and/or other materials provided with the distribution. # * The name of the author may not be used to endorse or promote products # derived from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE.
from .generic import * from .utils import isString, str_ from .pdf import PdfFileReader, PdfFileWriter from .pagerange import PageRange from sys import version_info if version_info < ( 3, 0 ): from cStringIO import StringIO StreamIO = StringIO else: from io import BytesIO from io import FileIO as file StreamIO = BytesIO
class _MergedPage(object):
    """
    Internal bookkeeping record used by PdfFileMerger: one instance is
    created for every source page that has been queued for merging.
    """

    def __init__(self, pagedata, src, id):
        # Identifier handed out by the merger when the page was queued.
        # (The parameter name shadows the builtin but is part of the signature.)
        self.id = id
        # Page object as obtained from the source reader.
        self.pagedata = pagedata
        # Reader the page was taken from.
        self.src = src
        # Filled in later, once the page has been added to the output writer.
        self.out_pagedata = None
class PdfFileMerger(object):
    """
    Initializes a PdfFileMerger object. PdfFileMerger merges multiple PDFs
    into a single PDF. It can concatenate, slice, insert, or any combination
    of the above.

    See the functions :meth:`merge()<merge>` (or :meth:`append()<append>`)
    and :meth:`write()<write>` for usage information.

    :param bool strict: Determines whether user should be warned of all
            problems and also causes some correctable problems to be fatal.
            Defaults to ``True``.
    """

    def __init__(self, strict=True):
        # (stream, PdfFileReader, opened_here) triples; close() closes the
        # streams whose ``opened_here`` flag is set.
        self.inputs = []
        # _MergedPage records, in output order.
        self.pages = []
        self.output = PdfFileWriter()
        # Nested bookmark list: a list entry holds the children of the
        # bookmark that precedes it.
        self.bookmarks = []
        self.named_dests = []
        # Next identifier to hand to a _MergedPage.
        self.id_count = 0
        self.strict = strict

    def merge(self, position, fileobj, bookmark=None, pages=None, import_bookmarks=True):
        """
        Merges the pages from the given file into the output file at the
        specified page number.

        :param int position: The *page number* to insert this file. File will
            be inserted after the given number.

        :param fileobj: A File Object or an object that supports the standard
            read and seek methods similar to a File Object. Could also be a
            string representing a path to a PDF file.

        :param str bookmark: Optionally, you may specify a bookmark to be
            applied at the beginning of the included file by supplying the
            text of the bookmark.

        :param pages: can be a :ref:`Page Range <page-range>` or a
            ``(start, stop[, step])`` tuple to merge only the specified range
            of pages from the source document into the output document.

        :param bool import_bookmarks: You may prevent the source document's
            bookmarks from being imported by specifying this as ``False``.

        :raises TypeError: if ``pages`` is neither ``None``, a PageRange nor
            a tuple.
        :raises ValueError: if an imported bookmark or named destination
            cannot be matched to one of the merged pages.
        """
        # Recorded in self.inputs: True means the stream was created in this
        # method and therefore has to be closed by close().
        my_file = False

        # If the fileobj parameter is a string, assume it is a path and open
        # a file object at that location. If it is a file, copy its contents
        # into an in-memory stream; if it is a PdfFileReader, copy that
        # reader's stream into an in-memory stream. Any other object is used
        # as it is.
        decryption_key = None
        if isString(fileobj):
            fileobj = file(fileobj, 'rb')
            my_file = True
        elif isinstance(fileobj, file):
            fileobj.seek(0)
            fileobj = StreamIO(fileobj.read())
            my_file = True
        elif isinstance(fileobj, PdfFileReader):
            source_reader = fileobj
            # Read the key from the reader itself, *before* fileobj is
            # rebound to the copied stream (which never has this attribute).
            if hasattr(source_reader, '_decryption_key'):
                decryption_key = source_reader._decryption_key
            orig_tell = source_reader.stream.tell()
            source_reader.stream.seek(0)
            fileobj = StreamIO(source_reader.stream.read())
            # reset the caller's stream to its original location
            source_reader.stream.seek(orig_tell)
            my_file = True

        try:
            # Create a new PdfFileReader instance using the stream selected
            # or created above.
            pdfr = PdfFileReader(fileobj, strict=self.strict)
            if decryption_key is not None:
                pdfr._decryption_key = decryption_key

            # Find the range of pages to merge.
            if pages is None:
                pages = (0, pdfr.getNumPages())
            elif isinstance(pages, PageRange):
                pages = pages.indices(pdfr.getNumPages())
            elif not isinstance(pages, tuple):
                raise TypeError('"pages" must be a tuple of (start, stop[, step])')
        except Exception:
            # Nothing has been registered on the merger yet, so a stream that
            # was opened here would otherwise never be closed.
            if my_file:
                fileobj.close()
            raise

        srcpages = []
        if bookmark:
            # The bookmark targets the id that the first merged page receives.
            bookmark = Bookmark(TextStringObject(bookmark),
                                NumberObject(self.id_count),
                                NameObject('/Fit'))

        outline = []
        if import_bookmarks:
            outline = pdfr.getOutlines()
            outline = self._trim_outline(pdfr, outline, pages)

        if bookmark:
            # The imported outline becomes the children of the new bookmark.
            self.bookmarks += [bookmark, outline]
        else:
            self.bookmarks += outline

        dests = pdfr.namedDestinations
        dests = self._trim_dests(pdfr, dests, pages)
        self.named_dests += dests

        # Gather all the pages that are going to be merged
        for i in range(*pages):
            pg = pdfr.getPage(i)
            page_id = self.id_count
            self.id_count += 1
            srcpages.append(_MergedPage(pg, pdfr, page_id))

        self._associate_dests_to_pages(srcpages)
        self._associate_bookmarks_to_pages(srcpages)

        # Slice to insert the pages at the specified position
        self.pages[position:position] = srcpages

        # Keep track of our input files so we can close them later
        self.inputs.append((fileobj, pdfr, my_file))

    def append(self, fileobj, bookmark=None, pages=None, import_bookmarks=True):
        """
        Identical to the :meth:`merge()<merge>` method, but assumes you want
        to concatenate all pages onto the end of the file instead of
        specifying a position.

        :param fileobj: A File Object or an object that supports the standard
            read and seek methods similar to a File Object. Could also be a
            string representing a path to a PDF file.

        :param str bookmark: Optionally, you may specify a bookmark to be
            applied at the beginning of the included file by supplying the
            text of the bookmark.

        :param pages: can be a :ref:`Page Range <page-range>` or a
            ``(start, stop[, step])`` tuple to merge only the specified range
            of pages from the source document into the output document.

        :param bool import_bookmarks: You may prevent the source document's
            bookmarks from being imported by specifying this as ``False``.
        """
        self.merge(len(self.pages), fileobj, bookmark, pages, import_bookmarks)

    def write(self, fileobj):
        """
        Writes all data that has been merged to the given output file.

        :param fileobj: Output file. Can be a filename or any kind of
            file-like object.
        """
        my_file = False
        if isString(fileobj):
            fileobj = file(fileobj, 'wb')
            my_file = True

        try:
            # Add pages to the PdfFileWriter
            for page in self.pages:
                self.output.addPage(page.pagedata)
                # The page that was just added is the last kid of the
                # writer's page tree; remember a reference to it so that
                # destinations can point at it.
                newest = self.output._pages.getObject()["/Kids"][-1].getObject()
                page.out_pagedata = self.output.getReference(newest)

            # Once all pages are added, create bookmarks to point at those pages
            self._write_dests()
            self._write_bookmarks()

            # Write the output to the file
            self.output.write(fileobj)
        finally:
            # Close a file opened here even when writing failed.
            if my_file:
                fileobj.close()

    def close(self):
        """
        Shuts all file descriptors (input and output) and clears all memory
        usage.
        """
        self.pages = []
        for fo, pdfr, mine in self.inputs:
            # Only streams created by merge() are ours to close.
            if mine:
                fo.close()

        self.inputs = []
        self.output = None

    def addMetadata(self, infos):
        """
        Add custom metadata to the output.

        :param dict infos: a Python dictionary where each key is a field
            and each value is your new metadata.
            Example: ``{u'/Title': u'My title'}``
        """
        self.output.addMetadata(infos)

    def setPageLayout(self, layout):
        """
        Set the page layout

        :param str layout: The page layout to be used

        Valid layouts are:
             /NoLayout        Layout explicitly not specified
             /SinglePage      Show one page at a time
             /OneColumn       Show one column at a time
             /TwoColumnLeft   Show pages in two columns, odd-numbered pages on the left
             /TwoColumnRight  Show pages in two columns, odd-numbered pages on the right
             /TwoPageLeft     Show two pages at a time, odd-numbered pages on the left
             /TwoPageRight    Show two pages at a time, odd-numbered pages on the right
        """
        self.output.setPageLayout(layout)

    def setPageMode(self, mode):
        """
        Set the page mode.

        :param str mode: The page mode to use.

        Valid modes are:
            /UseNone         Do not show outlines or thumbnails panels
            /UseOutlines     Show outlines (aka bookmarks) panel
            /UseThumbs       Show page thumbnails panel
            /FullScreen      Fullscreen view
            /UseOC           Show Optional Content Group (OCG) panel
            /UseAttachments  Show attachments panel
        """
        self.output.setPageMode(mode)

    def _trim_dests(self, pdf, dests, pages):
        """
        Removes any named destinations that are not a part of the specified
        page set.

        :param pdf: reader the destinations were taken from.
        :param dests: mapping of destination name to destination object.
        :param tuple pages: ``range()`` arguments selecting the merged pages.
        :return: list of the destinations that point at a selected page; the
            ``/Page`` entry of each kept destination is replaced by the
            resolved page object.
        """
        new_dests = []
        for k, o in list(dests.items()):
            for j in range(*pages):
                if pdf.getPage(j).getObject() == o['/Page'].getObject():
                    o[NameObject('/Page')] = o['/Page'].getObject()
                    # Sanity check kept from the original implementation.
                    assert str_(k) == str_(o['/Title'])
                    new_dests.append(o)
                    break
        return new_dests

    def _trim_outline(self, pdf, outline, pages):
        """
        Removes any outline/bookmark entries that are not a part of the
        specified page set.

        :param pdf: reader the outline was taken from.
        :param list outline: nested outline; a list entry holds the children
            of the entry before it.
        :param tuple pages: ``range()`` arguments selecting the merged pages.
        :return: the trimmed (still nested) outline list.
        """
        new_outline = []
        prev_header_added = True
        for i, o in enumerate(outline):
            if isinstance(o, list):
                sub = self._trim_outline(pdf, o, pages)
                if sub:
                    # Surviving children need their header: re-add it when it
                    # was dropped because its own page is outside the range.
                    if not prev_header_added:
                        new_outline.append(outline[i - 1])
                    new_outline.append(sub)
            else:
                prev_header_added = False
                for j in range(*pages):
                    if pdf.getPage(j).getObject() == o['/Page'].getObject():
                        o[NameObject('/Page')] = o['/Page'].getObject()
                        new_outline.append(o)
                        prev_header_added = True
                        break
        return new_outline

    def _write_dests(self):
        """
        Adds every named destination whose page is part of the output to the
        writer, pointing it at the reference of the written page.
        """
        for dest in self.named_dests:
            if '/Page' not in dest:
                continue
            for p in self.pages:
                if p.id == dest['/Page']:
                    dest[NameObject('/Page')] = p.out_pagedata
                    self.output.addNamedDestinationObject(dest)
                    break

    def _bookmark_dest_args(self, b, page):
        """
        Builds the ``/D`` array contents for bookmark ``b`` pointing at
        ``page`` and strips the consumed coordinate entries from ``b``.

        Missing or null coordinates are written as ``0``.
        """
        fit = b['/Type']
        args = [NumberObject(page.id), NameObject(fit)]

        # Coordinate entries expected after the fit name, in output order.
        # '/Fit' and '/FitB' take none.
        if fit in ('/FitH', '/FitBH'):
            coords = ('/Top',)
        elif fit in ('/FitV', '/FitBV'):
            coords = ('/Left',)
        elif fit == '/XYZ':
            coords = ('/Left', '/Top', '/Zoom')
        elif fit == '/FitR':
            coords = ('/Left', '/Bottom', '/Right', '/Top')
        else:
            coords = ()

        for key in coords:
            if key in b and not isinstance(b[key], NullObject):
                args.append(FloatObject(b[key]))
            else:
                args.append(FloatObject(0))

        for key in coords:
            # An absent coordinate was already defaulted above, so its
            # absence must not fail here.
            if key in b:
                del b[key]
        return args

    def _write_bookmarks(self, bookmarks=None, parent=None):
        """
        Writes the (nested) bookmark list to the output writer.

        :param list bookmarks: bookmark list to write; defaults to
            ``self.bookmarks``.
        :param parent: writer-side parent under which the bookmarks are
            added (``None`` for the top level).
        """
        if bookmarks is None:
            bookmarks = self.bookmarks

        last_added = None
        for b in bookmarks:
            if isinstance(b, list):
                # A list holds the children of the bookmark written last.
                self._write_bookmarks(b, last_added)
                continue

            if '/Page' not in b:
                continue

            for p in self.pages:
                if p.id == b['/Page']:
                    args = self._bookmark_dest_args(b, p)
                    b[NameObject('/A')] = DictionaryObject({
                        NameObject('/S'): NameObject('/GoTo'),
                        NameObject('/D'): ArrayObject(args),
                    })
                    del b['/Page'], b['/Type']
                    last_added = self.output.addBookmarkDict(b, parent)
                    break

    def _associate_dests_to_pages(self, pages):
        """
        Replaces the ``/Page`` entry of every not yet associated named
        destination with the id of the matching page from ``pages``.

        :param list pages: _MergedPage records of the pages being merged.
        :raises ValueError: if a destination matches none of ``pages``.
        """
        for nd in self.named_dests:
            target = nd['/Page']

            # A number means the destination was associated by an earlier merge.
            if isinstance(target, NumberObject):
                continue

            pageno = None
            for p in pages:
                if target.getObject() == p.pagedata.getObject():
                    pageno = p.id

            if pageno is None:
                raise ValueError("Unresolved named destination '%s'" % (nd['/Title'],))
            nd[NameObject('/Page')] = NumberObject(pageno)

    def _associate_bookmarks_to_pages(self, pages, bookmarks=None):
        """
        Replaces the ``/Page`` entry of every not yet associated bookmark with
        the id of the matching page from ``pages``, recursing into sub-lists.

        :param list pages: _MergedPage records of the pages being merged.
        :param list bookmarks: bookmark list to process; defaults to
            ``self.bookmarks``.
        :raises ValueError: if a bookmark matches none of ``pages``.
        """
        if bookmarks is None:
            bookmarks = self.bookmarks

        for b in bookmarks:
            if isinstance(b, list):
                self._associate_bookmarks_to_pages(pages, b)
                continue

            target = b['/Page']

            # A number means the bookmark was associated by an earlier merge.
            if isinstance(target, NumberObject):
                continue

            pageno = None
            for p in pages:
                if target.getObject() == p.pagedata.getObject():
                    pageno = p.id

            if pageno is None:
                raise ValueError("Unresolved bookmark '%s'" % (b['/Title'],))
            b[NameObject('/Page')] = NumberObject(pageno)

    def findBookmark(self, bookmark, root=None):
        """
        Locates a bookmark in the nested bookmark list.

        :param bookmark: the bookmark object itself or its title.
        :param list root: list to search; defaults to ``self.bookmarks``.
        :return: list of indices leading to the bookmark (one index per
            nesting level), or ``None`` if it was not found.
        """
        if root is None:
            root = self.bookmarks

        for i, b in enumerate(root):
            if isinstance(b, list):
                res = self.findBookmark(bookmark, b)
                if res:
                    return [i] + res
            elif b == bookmark or b['/Title'] == bookmark:
                return [i]

        return None

    def addBookmark(self, title, pagenum, parent=None):
        """
        Add a bookmark to this PDF file.

        :param str title: Title to use for this bookmark.
        :param int pagenum: Page number this bookmark will point to.
        :param parent: A reference to a parent bookmark to create nested
            bookmarks. May be the parent bookmark, its title, or an index
            path as returned by :meth:`findBookmark()<findBookmark>`.
        :return: the newly created bookmark.
        :raises ValueError: if ``parent`` is given but cannot be found.
        """
        iloc = None
        if parent is not None:
            if isinstance(parent, list):
                iloc = parent
            else:
                iloc = self.findBookmark(parent)
                if iloc is None:
                    # Fail before anything is created or modified.
                    raise ValueError("Parent bookmark not found: %r" % (parent,))

        # 826 is the hard-coded argument passed along with '/FitH'.
        dest = Bookmark(TextStringObject(title), NumberObject(pagenum),
                        NameObject('/FitH'), NumberObject(826))

        if iloc is None:
            self.bookmarks.append(dest)
        else:
            # Walk down to the list that contains the parent bookmark.
            bmparent = self.bookmarks
            for i in iloc[:-1]:
                bmparent = bmparent[i]
            # Children live in the list directly after their parent; create
            # that list if the parent has no children yet.
            npos = iloc[-1] + 1
            if npos < len(bmparent) and isinstance(bmparent[npos], list):
                bmparent[npos].append(dest)
            else:
                bmparent.insert(npos, [dest])
        return dest

    def addNamedDestination(self, title, pagenum):
        """
        Add a destination to the output.

        :param str title: Title to use
        :param int pagenum: Page number this destination points at.
        """
        # 826 is the hard-coded argument passed along with '/FitH'.
        dest = Destination(TextStringObject(title), NumberObject(pagenum),
                           NameObject('/FitH'), NumberObject(826))
        self.named_dests.append(dest)
class OutlinesObject(list):
    """
    List of outline (bookmark) entries kept alongside an outline tree.

    :param pdf: object the entries are registered with; ``add()`` uses its
        ``getObject``, ``_pages`` and ``_addObject`` members.
    :param tree: outline tree whose children are managed through this list.
    :param parent: optional parent, stored as given.
    """

    def __init__(self, pdf, tree, parent=None):
        list.__init__(self)
        self.tree = tree
        self.pdf = pdf
        self.parent = parent

    def remove(self, index):
        """
        Removes the entry at ``index`` from this list and from the tree.

        Note that, unlike ``list.remove``, the argument is an index.
        """
        obj = self[index]
        del self[index]
        self.tree.removeChild(obj)

    def add(self, title, pagenum):
        """
        Creates a bookmark titled ``title`` that jumps to page ``pagenum``
        and adds it to the tree and to this list.
        """
        pageRef = self.pdf.getObject(self.pdf._pages)['/Kids'][pagenum]
        action = DictionaryObject()
        action.update({
            # 826 is the hard-coded argument passed along with '/FitH'.
            NameObject('/D'): ArrayObject([pageRef, NameObject('/FitH'), NumberObject(826)]),
            NameObject('/S'): NameObject('/GoTo'),
        })
        actionRef = self.pdf._addObject(action)
        bookmark = TreeObject()

        bookmark.update({
            NameObject('/A'): actionRef,
            NameObject('/Title'): createStringObject(title),
        })

        self.pdf._addObject(bookmark)

        self.tree.addChild(bookmark)
        # Keep the list in step with the tree so that remove(index) and
        # removeAll() can find the entry again.
        self.append(bookmark)

    def removeAll(self):
        """
        Removes every child from the tree and empties this list.
        """
        # Snapshot the children first: the tree is modified while we go.
        for child in list(self.tree.children()):
            self.tree.removeChild(child)
        # Clear the list in one step instead of popping once per child,
        # which failed whenever the list held fewer entries than the tree.
        del self[:]
|