I updated the PDF Booklet project and removed Python 2 dependencies so that it will run under Ubuntu 22.04.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

553 lines
21 KiB

2 years ago
  1. # vim: sw=4:expandtab:foldmethod=marker
  2. #
  3. # Copyright (c) 2006, Mathieu Fenniak
  4. # All rights reserved.
  5. #
  6. # Redistribution and use in source and binary forms, with or without
  7. # modification, are permitted provided that the following conditions are
  8. # met:
  9. #
  10. # * Redistributions of source code must retain the above copyright notice,
  11. # this list of conditions and the following disclaimer.
  12. # * Redistributions in binary form must reproduce the above copyright notice,
  13. # this list of conditions and the following disclaimer in the documentation
  14. # and/or other materials provided with the distribution.
  15. # * The name of the author may not be used to endorse or promote products
  16. # derived from this software without specific prior written permission.
  17. #
  18. # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  19. # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  20. # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  21. # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  22. # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  23. # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  24. # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  25. # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  26. # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  27. # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  28. # POSSIBILITY OF SUCH DAMAGE.
  29. from .generic import *
  30. from .utils import isString, str_
  31. from .pdf import PdfFileReader, PdfFileWriter
  32. from .pagerange import PageRange
  33. from sys import version_info
  34. if version_info < ( 3, 0 ):
  35. from cStringIO import StringIO
  36. StreamIO = StringIO
  37. else:
  38. from io import BytesIO
  39. from io import FileIO as file
  40. StreamIO = BytesIO
  41. class _MergedPage(object):
  42. """
  43. _MergedPage is used internally by PdfFileMerger to collect necessary
  44. information on each page that is being merged.
  45. """
  46. def __init__(self, pagedata, src, id):
  47. self.src = src
  48. self.pagedata = pagedata
  49. self.out_pagedata = None
  50. self.id = id
  51. class PdfFileMerger(object):
  52. """
  53. Initializes a PdfFileMerger object. PdfFileMerger merges multiple PDFs
  54. into a single PDF. It can concatenate, slice, insert, or any combination
  55. of the above.
  56. See the functions :meth:`merge()<merge>` (or :meth:`append()<append>`)
  57. and :meth:`write()<write>` for usage information.
  58. :param bool strict: Determines whether user should be warned of all
  59. problems and also causes some correctable problems to be fatal.
  60. Defaults to ``True``.
  61. """
  62. def __init__(self, strict=True):
  63. self.inputs = []
  64. self.pages = []
  65. self.output = PdfFileWriter()
  66. self.bookmarks = []
  67. self.named_dests = []
  68. self.id_count = 0
  69. self.strict = strict
  70. def merge(self, position, fileobj, bookmark=None, pages=None, import_bookmarks=True):
  71. """
  72. Merges the pages from the given file into the output file at the
  73. specified page number.
  74. :param int position: The *page number* to insert this file. File will
  75. be inserted after the given number.
  76. :param fileobj: A File Object or an object that supports the standard read
  77. and seek methods similar to a File Object. Could also be a
  78. string representing a path to a PDF file.
  79. :param str bookmark: Optionally, you may specify a bookmark to be applied at
  80. the beginning of the included file by supplying the text of the bookmark.
  81. :param pages: can be a :ref:`Page Range <page-range>` or a ``(start, stop[, step])`` tuple
  82. to merge only the specified range of pages from the source
  83. document into the output document.
  84. :param bool import_bookmarks: You may prevent the source document's bookmarks
  85. from being imported by specifying this as ``False``.
  86. """
  87. # This parameter is passed to self.inputs.append and means
  88. # that the stream used was created in this method.
  89. my_file = False
  90. # If the fileobj parameter is a string, assume it is a path
  91. # and create a file object at that location. If it is a file,
  92. # copy the file's contents into a BytesIO (or StreamIO) stream object; if
  93. # it is a PdfFileReader, copy that reader's stream into a
  94. # BytesIO (or StreamIO) stream.
  95. # If fileobj is none of the above types, it is not modified
  96. decryption_key = None
  97. if isString(fileobj):
  98. fileobj = file(fileobj, 'rb')
  99. my_file = True
  100. elif isinstance(fileobj, file):
  101. fileobj.seek(0)
  102. filecontent = fileobj.read()
  103. fileobj = StreamIO(filecontent)
  104. my_file = True
  105. elif isinstance(fileobj, PdfFileReader):
  106. orig_tell = fileobj.stream.tell()
  107. fileobj.stream.seek(0)
  108. filecontent = StreamIO(fileobj.stream.read())
  109. fileobj.stream.seek(orig_tell) # reset the stream to its original location
  110. fileobj = filecontent
  111. if hasattr(fileobj, '_decryption_key'):
  112. decryption_key = fileobj._decryption_key
  113. my_file = True
  114. # Create a new PdfFileReader instance using the stream
  115. # (either file or BytesIO or StringIO) created above
  116. pdfr = PdfFileReader(fileobj, strict=self.strict)
  117. if decryption_key is not None:
  118. pdfr._decryption_key = decryption_key
  119. # Find the range of pages to merge.
  120. if pages == None:
  121. pages = (0, pdfr.getNumPages())
  122. elif isinstance(pages, PageRange):
  123. pages = pages.indices(pdfr.getNumPages())
  124. elif not isinstance(pages, tuple):
  125. raise TypeError('"pages" must be a tuple of (start, stop[, step])')
  126. srcpages = []
  127. if bookmark:
  128. bookmark = Bookmark(TextStringObject(bookmark), NumberObject(self.id_count), NameObject('/Fit'))
  129. outline = []
  130. if import_bookmarks:
  131. outline = pdfr.getOutlines()
  132. outline = self._trim_outline(pdfr, outline, pages)
  133. if bookmark:
  134. self.bookmarks += [bookmark, outline]
  135. else:
  136. self.bookmarks += outline
  137. dests = pdfr.namedDestinations
  138. dests = self._trim_dests(pdfr, dests, pages)
  139. self.named_dests += dests
  140. # Gather all the pages that are going to be merged
  141. for i in range(*pages):
  142. pg = pdfr.getPage(i)
  143. id = self.id_count
  144. self.id_count += 1
  145. mp = _MergedPage(pg, pdfr, id)
  146. srcpages.append(mp)
  147. self._associate_dests_to_pages(srcpages)
  148. self._associate_bookmarks_to_pages(srcpages)
  149. # Slice to insert the pages at the specified position
  150. self.pages[position:position] = srcpages
  151. # Keep track of our input files so we can close them later
  152. self.inputs.append((fileobj, pdfr, my_file))
  153. def append(self, fileobj, bookmark=None, pages=None, import_bookmarks=True):
  154. """
  155. Identical to the :meth:`merge()<merge>` method, but assumes you want to concatenate
  156. all pages onto the end of the file instead of specifying a position.
  157. :param fileobj: A File Object or an object that supports the standard read
  158. and seek methods similar to a File Object. Could also be a
  159. string representing a path to a PDF file.
  160. :param str bookmark: Optionally, you may specify a bookmark to be applied at
  161. the beginning of the included file by supplying the text of the bookmark.
  162. :param pages: can be a :ref:`Page Range <page-range>` or a ``(start, stop[, step])`` tuple
  163. to merge only the specified range of pages from the source
  164. document into the output document.
  165. :param bool import_bookmarks: You may prevent the source document's bookmarks
  166. from being imported by specifying this as ``False``.
  167. """
  168. self.merge(len(self.pages), fileobj, bookmark, pages, import_bookmarks)
  169. def write(self, fileobj):
  170. """
  171. Writes all data that has been merged to the given output file.
  172. :param fileobj: Output file. Can be a filename or any kind of
  173. file-like object.
  174. """
  175. my_file = False
  176. if isString(fileobj):
  177. fileobj = file(fileobj, 'wb')
  178. my_file = True
  179. # Add pages to the PdfFileWriter
  180. # The commented out line below was replaced with the two lines below it to allow PdfFileMerger to work with PyPdf 1.13
  181. for page in self.pages:
  182. self.output.addPage(page.pagedata)
  183. page.out_pagedata = self.output.getReference(self.output._pages.getObject()["/Kids"][-1].getObject())
  184. #idnum = self.output._objects.index(self.output._pages.getObject()["/Kids"][-1].getObject()) + 1
  185. #page.out_pagedata = IndirectObject(idnum, 0, self.output)
  186. # Once all pages are added, create bookmarks to point at those pages
  187. self._write_dests()
  188. self._write_bookmarks()
  189. # Write the output to the file
  190. self.output.write(fileobj)
  191. if my_file:
  192. fileobj.close()
  193. def close(self):
  194. """
  195. Shuts all file descriptors (input and output) and clears all memory
  196. usage.
  197. """
  198. self.pages = []
  199. for fo, pdfr, mine in self.inputs:
  200. if mine:
  201. fo.close()
  202. self.inputs = []
  203. self.output = None
  204. def addMetadata(self, infos):
  205. """
  206. Add custom metadata to the output.
  207. :param dict infos: a Python dictionary where each key is a field
  208. and each value is your new metadata.
  209. Example: ``{u'/Title': u'My title'}``
  210. """
  211. self.output.addMetadata(infos)
  212. def setPageLayout(self, layout):
  213. """
  214. Set the page layout
  215. :param str layout: The page layout to be used
  216. Valid layouts are:
  217. /NoLayout Layout explicitly not specified
  218. /SinglePage Show one page at a time
  219. /OneColumn Show one column at a time
  220. /TwoColumnLeft Show pages in two columns, odd-numbered pages on the left
  221. /TwoColumnRight Show pages in two columns, odd-numbered pages on the right
  222. /TwoPageLeft Show two pages at a time, odd-numbered pages on the left
  223. /TwoPageRight Show two pages at a time, odd-numbered pages on the right
  224. """
  225. self.output.setPageLayout(layout)
  226. def setPageMode(self, mode):
  227. """
  228. Set the page mode.
  229. :param str mode: The page mode to use.
  230. Valid modes are:
  231. /UseNone Do not show outlines or thumbnails panels
  232. /UseOutlines Show outlines (aka bookmarks) panel
  233. /UseThumbs Show page thumbnails panel
  234. /FullScreen Fullscreen view
  235. /UseOC Show Optional Content Group (OCG) panel
  236. /UseAttachments Show attachments panel
  237. """
  238. self.output.setPageMode(mode)
  239. def _trim_dests(self, pdf, dests, pages):
  240. """
  241. Removes any named destinations that are not a part of the specified
  242. page set.
  243. """
  244. new_dests = []
  245. prev_header_added = True
  246. for k, o in list(dests.items()):
  247. for j in range(*pages):
  248. if pdf.getPage(j).getObject() == o['/Page'].getObject():
  249. o[NameObject('/Page')] = o['/Page'].getObject()
  250. assert str_(k) == str_(o['/Title'])
  251. new_dests.append(o)
  252. break
  253. return new_dests
  254. def _trim_outline(self, pdf, outline, pages):
  255. """
  256. Removes any outline/bookmark entries that are not a part of the
  257. specified page set.
  258. """
  259. new_outline = []
  260. prev_header_added = True
  261. for i, o in enumerate(outline):
  262. if isinstance(o, list):
  263. sub = self._trim_outline(pdf, o, pages)
  264. if sub:
  265. if not prev_header_added:
  266. new_outline.append(outline[i-1])
  267. new_outline.append(sub)
  268. else:
  269. prev_header_added = False
  270. for j in range(*pages):
  271. if pdf.getPage(j).getObject() == o['/Page'].getObject():
  272. o[NameObject('/Page')] = o['/Page'].getObject()
  273. new_outline.append(o)
  274. prev_header_added = True
  275. break
  276. return new_outline
  277. def _write_dests(self):
  278. dests = self.named_dests
  279. for v in dests:
  280. pageno = None
  281. pdf = None
  282. if '/Page' in v:
  283. for i, p in enumerate(self.pages):
  284. if p.id == v['/Page']:
  285. v[NameObject('/Page')] = p.out_pagedata
  286. pageno = i
  287. pdf = p.src
  288. break
  289. if pageno != None:
  290. self.output.addNamedDestinationObject(v)
  291. def _write_bookmarks(self, bookmarks=None, parent=None):
  292. if bookmarks == None:
  293. bookmarks = self.bookmarks
  294. last_added = None
  295. for b in bookmarks:
  296. if isinstance(b, list):
  297. self._write_bookmarks(b, last_added)
  298. continue
  299. pageno = None
  300. pdf = None
  301. if '/Page' in b:
  302. for i, p in enumerate(self.pages):
  303. if p.id == b['/Page']:
  304. #b[NameObject('/Page')] = p.out_pagedata
  305. args = [NumberObject(p.id), NameObject(b['/Type'])]
  306. #nothing more to add
  307. #if b['/Type'] == '/Fit' or b['/Type'] == '/FitB'
  308. if b['/Type'] == '/FitH' or b['/Type'] == '/FitBH':
  309. if '/Top' in b and not isinstance(b['/Top'], NullObject):
  310. args.append(FloatObject(b['/Top']))
  311. else:
  312. args.append(FloatObject(0))
  313. del b['/Top']
  314. elif b['/Type'] == '/FitV' or b['/Type'] == '/FitBV':
  315. if '/Left' in b and not isinstance(b['/Left'], NullObject):
  316. args.append(FloatObject(b['/Left']))
  317. else:
  318. args.append(FloatObject(0))
  319. del b['/Left']
  320. elif b['/Type'] == '/XYZ':
  321. if '/Left' in b and not isinstance(b['/Left'], NullObject):
  322. args.append(FloatObject(b['/Left']))
  323. else:
  324. args.append(FloatObject(0))
  325. if '/Top' in b and not isinstance(b['/Top'], NullObject):
  326. args.append(FloatObject(b['/Top']))
  327. else:
  328. args.append(FloatObject(0))
  329. if '/Zoom' in b and not isinstance(b['/Zoom'], NullObject):
  330. args.append(FloatObject(b['/Zoom']))
  331. else:
  332. args.append(FloatObject(0))
  333. del b['/Top'], b['/Zoom'], b['/Left']
  334. elif b['/Type'] == '/FitR':
  335. if '/Left' in b and not isinstance(b['/Left'], NullObject):
  336. args.append(FloatObject(b['/Left']))
  337. else:
  338. args.append(FloatObject(0))
  339. if '/Bottom' in b and not isinstance(b['/Bottom'], NullObject):
  340. args.append(FloatObject(b['/Bottom']))
  341. else:
  342. args.append(FloatObject(0))
  343. if '/Right' in b and not isinstance(b['/Right'], NullObject):
  344. args.append(FloatObject(b['/Right']))
  345. else:
  346. args.append(FloatObject(0))
  347. if '/Top' in b and not isinstance(b['/Top'], NullObject):
  348. args.append(FloatObject(b['/Top']))
  349. else:
  350. args.append(FloatObject(0))
  351. del b['/Left'], b['/Right'], b['/Bottom'], b['/Top']
  352. b[NameObject('/A')] = DictionaryObject({NameObject('/S'): NameObject('/GoTo'), NameObject('/D'): ArrayObject(args)})
  353. pageno = i
  354. pdf = p.src
  355. break
  356. if pageno != None:
  357. del b['/Page'], b['/Type']
  358. last_added = self.output.addBookmarkDict(b, parent)
  359. def _associate_dests_to_pages(self, pages):
  360. for nd in self.named_dests:
  361. pageno = None
  362. np = nd['/Page']
  363. if isinstance(np, NumberObject):
  364. continue
  365. for p in pages:
  366. if np.getObject() == p.pagedata.getObject():
  367. pageno = p.id
  368. if pageno != None:
  369. nd[NameObject('/Page')] = NumberObject(pageno)
  370. else:
  371. raise ValueError("Unresolved named destination '%s'" % (nd['/Title'],))
  372. def _associate_bookmarks_to_pages(self, pages, bookmarks=None):
  373. if bookmarks == None:
  374. bookmarks = self.bookmarks
  375. for b in bookmarks:
  376. if isinstance(b, list):
  377. self._associate_bookmarks_to_pages(pages, b)
  378. continue
  379. pageno = None
  380. bp = b['/Page']
  381. if isinstance(bp, NumberObject):
  382. continue
  383. for p in pages:
  384. if bp.getObject() == p.pagedata.getObject():
  385. pageno = p.id
  386. if pageno != None:
  387. b[NameObject('/Page')] = NumberObject(pageno)
  388. else:
  389. raise ValueError("Unresolved bookmark '%s'" % (b['/Title'],))
  390. def findBookmark(self, bookmark, root=None):
  391. if root == None:
  392. root = self.bookmarks
  393. for i, b in enumerate(root):
  394. if isinstance(b, list):
  395. res = self.findBookmark(bookmark, b)
  396. if res:
  397. return [i] + res
  398. elif b == bookmark or b['/Title'] == bookmark:
  399. return [i]
  400. return None
  401. def addBookmark(self, title, pagenum, parent=None):
  402. """
  403. Add a bookmark to this PDF file.
  404. :param str title: Title to use for this bookmark.
  405. :param int pagenum: Page number this bookmark will point to.
  406. :param parent: A reference to a parent bookmark to create nested
  407. bookmarks.
  408. """
  409. if parent == None:
  410. iloc = [len(self.bookmarks)-1]
  411. elif isinstance(parent, list):
  412. iloc = parent
  413. else:
  414. iloc = self.findBookmark(parent)
  415. dest = Bookmark(TextStringObject(title), NumberObject(pagenum), NameObject('/FitH'), NumberObject(826))
  416. if parent == None:
  417. self.bookmarks.append(dest)
  418. else:
  419. bmparent = self.bookmarks
  420. for i in iloc[:-1]:
  421. bmparent = bmparent[i]
  422. npos = iloc[-1]+1
  423. if npos < len(bmparent) and isinstance(bmparent[npos], list):
  424. bmparent[npos].append(dest)
  425. else:
  426. bmparent.insert(npos, [dest])
  427. return dest
  428. def addNamedDestination(self, title, pagenum):
  429. """
  430. Add a destination to the output.
  431. :param str title: Title to use
  432. :param int pagenum: Page number this destination points at.
  433. """
  434. dest = Destination(TextStringObject(title), NumberObject(pagenum), NameObject('/FitH'), NumberObject(826))
  435. self.named_dests.append(dest)
  436. class OutlinesObject(list):
  437. def __init__(self, pdf, tree, parent=None):
  438. list.__init__(self)
  439. self.tree = tree
  440. self.pdf = pdf
  441. self.parent = parent
  442. def remove(self, index):
  443. obj = self[index]
  444. del self[index]
  445. self.tree.removeChild(obj)
  446. def add(self, title, pagenum):
  447. pageRef = self.pdf.getObject(self.pdf._pages)['/Kids'][pagenum]
  448. action = DictionaryObject()
  449. action.update({
  450. NameObject('/D') : ArrayObject([pageRef, NameObject('/FitH'), NumberObject(826)]),
  451. NameObject('/S') : NameObject('/GoTo')
  452. })
  453. actionRef = self.pdf._addObject(action)
  454. bookmark = TreeObject()
  455. bookmark.update({
  456. NameObject('/A'): actionRef,
  457. NameObject('/Title'): createStringObject(title),
  458. })
  459. self.pdf._addObject(bookmark)
  460. self.tree.addChild(bookmark)
  461. def removeAll(self):
  462. for child in [x for x in self.tree.children()]:
  463. self.tree.removeChild(child)
  464. self.pop()