I updated the PDF Booklet project and removed Python 2 dependencies so that it will run under Ubuntu 22.04.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1222 lines
44 KiB

2 years ago
  1. # vim: sw=4:expandtab:foldmethod=marker
  2. #
  3. # Copyright (c) 2006, Mathieu Fenniak
  4. # All rights reserved.
  5. #
  6. # Redistribution and use in source and binary forms, with or without
  7. # modification, are permitted provided that the following conditions are
  8. # met:
  9. #
  10. # * Redistributions of source code must retain the above copyright notice,
  11. # this list of conditions and the following disclaimer.
  12. # * Redistributions in binary form must reproduce the above copyright notice,
  13. # this list of conditions and the following disclaimer in the documentation
  14. # and/or other materials provided with the distribution.
  15. # * The name of the author may not be used to endorse or promote products
  16. # derived from this software without specific prior written permission.
  17. #
  18. # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  19. # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  20. # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  21. # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  22. # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  23. # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  24. # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  25. # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  26. # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  27. # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  28. # POSSIBILITY OF SUCH DAMAGE.
  29. """
  30. Implementation of generic PDF objects (dictionary, number, string, and so on)
  31. """
  32. __author__ = "Mathieu Fenniak"
  33. __author_email__ = "biziqe@mathieu.fenniak.net"
  34. import re
  35. from .utils import readNonWhitespace, RC4_encrypt, skipOverComment
  36. from .utils import b_, u_, chr_, ord_
  37. from .utils import PdfStreamError
  38. import warnings
  39. from . import filters
  40. from . import utils
  41. import decimal
  42. import codecs
  43. import sys
  44. #import debugging
# Leading bytes that identify a non-numeric object.  readObject() dispatches
# on the index of the first byte of a token within this sequence.
ObjectPrefix = b_('/<[tf(n%')
# Sign bytes; readObject() parses a token that starts with one as a number.
NumberSigns = b_('+-')
# Matches "<digits> <digits> R" followed by a non-letter byte; readObject()
# uses it to tell an indirect reference apart from a plain number.
IndirectPattern = re.compile(b_(r"(\d+)\s+(\d+)\s+R[^a-zA-Z]"))
  48. def readObject(stream, pdf):
  49. tok = stream.read(1)
  50. stream.seek(-1, 1) # reset to start
  51. idx = ObjectPrefix.find(tok)
  52. if idx == 0:
  53. # name object
  54. return NameObject.readFromStream(stream, pdf)
  55. elif idx == 1:
  56. # hexadecimal string OR dictionary
  57. peek = stream.read(2)
  58. stream.seek(-2, 1) # reset to start
  59. if peek == b_('<<'):
  60. return DictionaryObject.readFromStream(stream, pdf)
  61. else:
  62. return readHexStringFromStream(stream)
  63. elif idx == 2:
  64. # array object
  65. return ArrayObject.readFromStream(stream, pdf)
  66. elif idx == 3 or idx == 4:
  67. # boolean object
  68. return BooleanObject.readFromStream(stream)
  69. elif idx == 5:
  70. # string object
  71. return readStringFromStream(stream)
  72. elif idx == 6:
  73. # null object
  74. return NullObject.readFromStream(stream)
  75. elif idx == 7:
  76. # comment
  77. while tok not in (b_('\r'), b_('\n')):
  78. tok = stream.read(1)
  79. tok = readNonWhitespace(stream)
  80. stream.seek(-1, 1)
  81. return readObject(stream, pdf)
  82. else:
  83. # number object OR indirect reference
  84. if tok in NumberSigns:
  85. # number
  86. return NumberObject.readFromStream(stream)
  87. peek = stream.read(20)
  88. stream.seek(-len(peek), 1) # reset to start
  89. if IndirectPattern.match(peek) != None:
  90. return IndirectObject.readFromStream(stream, pdf)
  91. else:
  92. return NumberObject.readFromStream(stream)
  93. class PdfObject(object):
  94. def getObject(self):
  95. """Resolves indirect references."""
  96. return self
  97. class NullObject(PdfObject):
  98. def writeToStream(self, stream, encryption_key):
  99. stream.write(b_("null"))
  100. def readFromStream(stream):
  101. nulltxt = stream.read(4)
  102. if nulltxt != b_("null"):
  103. raise utils.PdfReadError("Could not read Null object")
  104. return NullObject()
  105. readFromStream = staticmethod(readFromStream)
  106. class BooleanObject(PdfObject):
  107. def __init__(self, value):
  108. self.value = value
  109. def writeToStream(self, stream, encryption_key):
  110. if self.value:
  111. stream.write(b_("true"))
  112. else:
  113. stream.write(b_("false"))
  114. def readFromStream(stream):
  115. word = stream.read(4)
  116. if word == b_("true"):
  117. return BooleanObject(True)
  118. elif word == b_("fals"):
  119. stream.read(1)
  120. return BooleanObject(False)
  121. else:
  122. raise utils.PdfReadError('Could not read Boolean object')
  123. readFromStream = staticmethod(readFromStream)
  124. class ArrayObject(list, PdfObject):
  125. def writeToStream(self, stream, encryption_key):
  126. stream.write(b_("["))
  127. for data in self:
  128. stream.write(b_(" "))
  129. data.writeToStream(stream, encryption_key)
  130. stream.write(b_(" ]"))
  131. def readFromStream(stream, pdf):
  132. arr = ArrayObject()
  133. tmp = stream.read(1)
  134. if tmp != b_("["):
  135. raise utils.PdfReadError("Could not read array")
  136. while True:
  137. # skip leading whitespace
  138. tok = stream.read(1)
  139. while tok.isspace():
  140. tok = stream.read(1)
  141. stream.seek(-1, 1)
  142. # check for array ending
  143. peekahead = stream.read(1)
  144. if peekahead == b_("]"):
  145. break
  146. stream.seek(-1, 1)
  147. # read and append obj
  148. arr.append(readObject(stream, pdf))
  149. return arr
  150. readFromStream = staticmethod(readFromStream)
  151. class IndirectObject(PdfObject):
  152. def __init__(self, idnum, generation, pdf):
  153. self.idnum = idnum
  154. self.generation = generation
  155. self.pdf = pdf
  156. def getObject(self):
  157. return self.pdf.getObject(self).getObject()
  158. def __repr__(self):
  159. return "IndirectObject(%r, %r)" % (self.idnum, self.generation)
  160. def __eq__(self, other):
  161. return (
  162. other != None and
  163. isinstance(other, IndirectObject) and
  164. self.idnum == other.idnum and
  165. self.generation == other.generation and
  166. self.pdf is other.pdf
  167. )
  168. def __ne__(self, other):
  169. return not self.__eq__(other)
  170. def writeToStream(self, stream, encryption_key):
  171. stream.write(b_("%s %s R" % (self.idnum, self.generation)))
  172. def readFromStream(stream, pdf):
  173. idnum = b_("")
  174. while True:
  175. tok = stream.read(1)
  176. if not tok:
  177. # stream has truncated prematurely
  178. raise PdfStreamError("Stream has ended unexpectedly")
  179. if tok.isspace():
  180. break
  181. idnum += tok
  182. generation = b_("")
  183. while True:
  184. tok = stream.read(1)
  185. if not tok:
  186. # stream has truncated prematurely
  187. raise PdfStreamError("Stream has ended unexpectedly")
  188. if tok.isspace():
  189. if not generation:
  190. continue
  191. break
  192. generation += tok
  193. r = readNonWhitespace(stream)
  194. if r != b_("R"):
  195. raise utils.PdfReadError("Error reading indirect object reference at byte %s" % utils.hexStr(stream.tell()))
  196. return IndirectObject(int(idnum), int(generation), pdf)
  197. readFromStream = staticmethod(readFromStream)
  198. class FloatObject(decimal.Decimal, PdfObject):
  199. def __new__(cls, value="0", context=None):
  200. try:
  201. return decimal.Decimal.__new__(cls, utils.str_(value), context)
  202. except:
  203. return decimal.Decimal.__new__(cls, str(value))
  204. def __repr__(self):
  205. if self == self.to_integral():
  206. return str(self.quantize(decimal.Decimal(1)))
  207. else:
  208. # XXX: this adds useless extraneous zeros.
  209. return "%.5f" % self
  210. def as_numeric(self):
  211. return float(b_(repr(self)))
  212. def writeToStream(self, stream, encryption_key):
  213. stream.write(b_(repr(self)))
  214. class NumberObject(int, PdfObject):
  215. NumberPattern = re.compile(b_('[^+-.0-9]'))
  216. ByteDot = b_(".")
  217. def __new__(cls, value):
  218. val = int(value)
  219. try:
  220. return int.__new__(cls, val)
  221. except OverflowError:
  222. return int.__new__(cls, 0)
  223. def as_numeric(self):
  224. return int(b_(repr(self)))
  225. def writeToStream(self, stream, encryption_key):
  226. stream.write(b_(repr(self)))
  227. def readFromStream(stream):
  228. num = utils.readUntilRegex(stream, NumberObject.NumberPattern)
  229. if num.find(NumberObject.ByteDot) != -1:
  230. return FloatObject(num)
  231. else:
  232. return NumberObject(num)
  233. readFromStream = staticmethod(readFromStream)
  234. ##
  235. # Given a string (either a "str" or "unicode"), create a ByteStringObject or a
  236. # TextStringObject to represent the string.
  237. def createStringObject(string):
  238. if isinstance(string, utils.string_type):
  239. return TextStringObject(string)
  240. elif isinstance(string, utils.bytes_type):
  241. try:
  242. if string.startswith(codecs.BOM_UTF16_BE):
  243. retval = TextStringObject(string.decode("utf-16"))
  244. retval.autodetect_utf16 = True
  245. return retval
  246. else:
  247. # This is probably a big performance hit here, but we need to
  248. # convert string objects into the text/unicode-aware version if
  249. # possible... and the only way to check if that's possible is
  250. # to try. Some strings are strings, some are just byte arrays.
  251. retval = TextStringObject(decode_pdfdocencoding(string))
  252. retval.autodetect_pdfdocencoding = True
  253. return retval
  254. except UnicodeDecodeError:
  255. return ByteStringObject(string)
  256. else:
  257. raise TypeError("createStringObject should have str or unicode arg")
  258. def readHexStringFromStream(stream):
  259. stream.read(1)
  260. txt = ""
  261. x = b_("")
  262. while True:
  263. tok = readNonWhitespace(stream)
  264. if not tok:
  265. # stream has truncated prematurely
  266. raise PdfStreamError("Stream has ended unexpectedly")
  267. if tok == b_(">"):
  268. break
  269. x += tok
  270. if len(x) == 2:
  271. txt += chr(int(x, base=16))
  272. x = b_("")
  273. if len(x) == 1:
  274. x += b_("0")
  275. if len(x) == 2:
  276. txt += chr(int(x, base=16))
  277. return createStringObject(b_(txt))
  278. def readStringFromStream(stream):
  279. tok = stream.read(1)
  280. parens = 1
  281. txt = b_("")
  282. while True:
  283. tok = stream.read(1)
  284. if not tok:
  285. # stream has truncated prematurely
  286. raise PdfStreamError("Stream has ended unexpectedly")
  287. if tok == b_("("):
  288. parens += 1
  289. elif tok == b_(")"):
  290. parens -= 1
  291. if parens == 0:
  292. break
  293. elif tok == b_("\\"):
  294. tok = stream.read(1)
  295. if tok == b_("n"):
  296. tok = b_("\n")
  297. elif tok == b_("r"):
  298. tok = b_("\r")
  299. elif tok == b_("t"):
  300. tok = b_("\t")
  301. elif tok == b_("b"):
  302. tok = b_("\b")
  303. elif tok == b_("f"):
  304. tok = b_("\f")
  305. elif tok == b_("c"):
  306. tok = b_("\c")
  307. elif tok == b_("("):
  308. tok = b_("(")
  309. elif tok == b_(")"):
  310. tok = b_(")")
  311. elif tok == b_("/"):
  312. tok = b_("/")
  313. elif tok == b_("\\"):
  314. tok = b_("\\")
  315. elif tok in (b_(" "), b_("/"), b_("%"), b_("<"), b_(">"), b_("["),
  316. b_("]"), b_("#"), b_("_"), b_("&"), b_('$')):
  317. # odd/unnessecary escape sequences we have encountered
  318. tok = b_(tok)
  319. elif tok.isdigit():
  320. # "The number ddd may consist of one, two, or three
  321. # octal digits; high-order overflow shall be ignored.
  322. # Three octal digits shall be used, with leading zeros
  323. # as needed, if the next character of the string is also
  324. # a digit." (PDF reference 7.3.4.2, p 16)
  325. for i in range(2):
  326. ntok = stream.read(1)
  327. if ntok.isdigit():
  328. tok += ntok
  329. else:
  330. break
  331. tok = b_(chr(int(tok, base=8)))
  332. elif tok in b_("\n\r"):
  333. # This case is hit when a backslash followed by a line
  334. # break occurs. If it's a multi-char EOL, consume the
  335. # second character:
  336. tok = stream.read(1)
  337. if not tok in b_("\n\r"):
  338. stream.seek(-1, 1)
  339. # Then don't add anything to the actual string, since this
  340. # line break was escaped:
  341. tok = b_('')
  342. else:
  343. raise utils.PdfReadError(r"Unexpected escaped string: %s" % tok)
  344. txt += tok
  345. return createStringObject(txt)
  346. ##
  347. # Represents a string object where the text encoding could not be determined.
  348. # This occurs quite often, as the PDF spec doesn't provide an alternate way to
  349. # represent strings -- for example, the encryption data stored in files (like
  350. # /O) is clearly not text, but is still stored in a "String" object.
  351. class ByteStringObject(utils.bytes_type, PdfObject):
  352. ##
  353. # For compatibility with TextStringObject.original_bytes. This method
  354. # returns self.
  355. original_bytes = property(lambda self: self)
  356. def writeToStream(self, stream, encryption_key):
  357. bytearr = self
  358. if encryption_key:
  359. bytearr = RC4_encrypt(encryption_key, bytearr)
  360. stream.write(b_("<"))
  361. stream.write(utils.hexencode(bytearr))
  362. stream.write(b_(">"))
  363. ##
  364. # Represents a string object that has been decoded into a real unicode string.
  365. # If read from a PDF document, this string appeared to match the
  366. # PDFDocEncoding, or contained a UTF-16BE BOM mark to cause UTF-16 decoding to
  367. # occur.
  368. class TextStringObject(utils.string_type, PdfObject):
  369. autodetect_pdfdocencoding = False
  370. autodetect_utf16 = False
  371. ##
  372. # It is occasionally possible that a text string object gets created where
  373. # a byte string object was expected due to the autodetection mechanism --
  374. # if that occurs, this "original_bytes" property can be used to
  375. # back-calculate what the original encoded bytes were.
  376. original_bytes = property(lambda self: self.get_original_bytes())
  377. def get_original_bytes(self):
  378. # We're a text string object, but the library is trying to get our raw
  379. # bytes. This can happen if we auto-detected this string as text, but
  380. # we were wrong. It's pretty common. Return the original bytes that
  381. # would have been used to create this object, based upon the autodetect
  382. # method.
  383. if self.autodetect_utf16:
  384. return codecs.BOM_UTF16_BE + self.encode("utf-16be")
  385. elif self.autodetect_pdfdocencoding:
  386. return encode_pdfdocencoding(self)
  387. else:
  388. raise Exception("no information about original bytes")
  389. def writeToStream(self, stream, encryption_key):
  390. # Try to write the string out as a PDFDocEncoding encoded string. It's
  391. # nicer to look at in the PDF file. Sadly, we take a performance hit
  392. # here for trying...
  393. try:
  394. bytearr = encode_pdfdocencoding(self)
  395. except UnicodeEncodeError:
  396. bytearr = codecs.BOM_UTF16_BE + self.encode("utf-16be")
  397. if encryption_key:
  398. bytearr = RC4_encrypt(encryption_key, bytearr)
  399. obj = ByteStringObject(bytearr)
  400. obj.writeToStream(stream, None)
  401. else:
  402. stream.write(b_("("))
  403. for c in bytearr:
  404. if not chr_(c).isalnum() and c != b_(' '):
  405. stream.write(b_("\\%03o" % ord_(c)))
  406. else:
  407. stream.write(b_(chr_(c)))
  408. stream.write(b_(")"))
  409. class NameObject(str, PdfObject):
  410. delimiterPattern = re.compile(b_(r"\s+|[\(\)<>\[\]{}/%]"))
  411. surfix = b_("/")
  412. def writeToStream(self, stream, encryption_key):
  413. stream.write(b_(self))
  414. def readFromStream(stream, pdf):
  415. debug = False
  416. if debug: print((stream.tell()))
  417. name = stream.read(1)
  418. if name != NameObject.surfix:
  419. raise utils.PdfReadError("name read error")
  420. name += utils.readUntilRegex(stream, NameObject.delimiterPattern,
  421. ignore_eof=True)
  422. if debug: print(name)
  423. try:
  424. return NameObject(name.decode('utf-8'))
  425. except (UnicodeEncodeError, UnicodeDecodeError) as e:
  426. # Name objects should represent irregular characters
  427. # with a '#' followed by the symbol's hex number
  428. if not pdf.strict:
  429. warnings.warn("Illegal character in Name Object", utils.PdfReadWarning)
  430. return NameObject(name)
  431. else:
  432. raise utils.PdfReadError("Illegal character in Name Object")
  433. readFromStream = staticmethod(readFromStream)
  434. class DictionaryObject(dict, PdfObject):
  435. def raw_get(self, key):
  436. return dict.__getitem__(self, key)
  437. def __setitem__(self, key, value):
  438. if not isinstance(key, PdfObject):
  439. raise ValueError("key must be PdfObject")
  440. if not isinstance(value, PdfObject):
  441. raise ValueError("value must be PdfObject")
  442. return dict.__setitem__(self, key, value)
  443. def setdefault(self, key, value=None):
  444. if not isinstance(key, PdfObject):
  445. raise ValueError("key must be PdfObject")
  446. if not isinstance(value, PdfObject):
  447. raise ValueError("value must be PdfObject")
  448. return dict.setdefault(self, key, value)
  449. def __getitem__(self, key):
  450. return dict.__getitem__(self, key).getObject()
  451. ##
  452. # Retrieves XMP (Extensible Metadata Platform) data relevant to the
  453. # this object, if available.
  454. # <p>
  455. # Stability: Added in v1.12, will exist for all future v1.x releases.
  456. # @return Returns a {@link #xmp.XmpInformation XmlInformation} instance
  457. # that can be used to access XMP metadata from the document. Can also
  458. # return None if no metadata was found on the document root.
  459. def getXmpMetadata(self):
  460. metadata = self.get("/Metadata", None)
  461. if metadata == None:
  462. return None
  463. metadata = metadata.getObject()
  464. from . import xmp
  465. if not isinstance(metadata, xmp.XmpInformation):
  466. metadata = xmp.XmpInformation(metadata)
  467. self[NameObject("/Metadata")] = metadata
  468. return metadata
  469. ##
  470. # Read-only property that accesses the {@link
  471. # #DictionaryObject.getXmpData getXmpData} function.
  472. # <p>
  473. # Stability: Added in v1.12, will exist for all future v1.x releases.
  474. xmpMetadata = property(lambda self: self.getXmpMetadata(), None, None)
  475. def writeToStream(self, stream, encryption_key):
  476. stream.write(b_("<<\n"))
  477. for key, value in list(self.items()):
  478. key.writeToStream(stream, encryption_key)
  479. stream.write(b_(" "))
  480. value.writeToStream(stream, encryption_key)
  481. stream.write(b_("\n"))
  482. stream.write(b_(">>"))
  483. def readFromStream(stream, pdf):
  484. debug = False
  485. tmp = stream.read(2)
  486. if tmp != b_("<<"):
  487. raise utils.PdfReadError("Dictionary read error at byte %s: stream must begin with '<<'" % utils.hexStr(stream.tell()))
  488. data = {}
  489. while True:
  490. tok = readNonWhitespace(stream)
  491. if tok == b_('\x00'):
  492. continue
  493. elif tok == b_('%'):
  494. stream.seek(-1, 1)
  495. skipOverComment(stream)
  496. continue
  497. if not tok:
  498. # stream has truncated prematurely
  499. raise PdfStreamError("Stream has ended unexpectedly")
  500. if debug: print(("Tok:", tok))
  501. if tok == b_(">"):
  502. stream.read(1)
  503. break
  504. stream.seek(-1, 1)
  505. key = readObject(stream, pdf)
  506. tok = readNonWhitespace(stream)
  507. stream.seek(-1, 1)
  508. value = readObject(stream, pdf)
  509. if not data.get(key):
  510. data[key] = value
  511. elif pdf.strict:
  512. # multiple definitions of key not permitted
  513. raise utils.PdfReadError("Multiple definitions in dictionary at byte %s for key %s" \
  514. % (utils.hexStr(stream.tell()), key))
  515. else:
  516. warnings.warn("Multiple definitions in dictionary at byte %s for key %s" \
  517. % (utils.hexStr(stream.tell()), key), utils.PdfReadWarning)
  518. pos = stream.tell()
  519. s = readNonWhitespace(stream)
  520. if s == b_('s') and stream.read(5) == b_('tream'):
  521. eol = stream.read(1)
  522. # odd PDF file output has spaces after 'stream' keyword but before EOL.
  523. # patch provided by Danial Sandler
  524. while eol == b_(' '):
  525. eol = stream.read(1)
  526. assert eol in (b_("\n"), b_("\r"))
  527. if eol == b_("\r"):
  528. # read \n after
  529. if stream.read(1) != b_('\n'):
  530. stream.seek(-1, 1)
  531. # this is a stream object, not a dictionary
  532. assert "/Length" in data
  533. length = data["/Length"]
  534. if debug: print(data)
  535. if isinstance(length, IndirectObject):
  536. t = stream.tell()
  537. length = pdf.getObject(length)
  538. stream.seek(t, 0)
  539. data["__streamdata__"] = stream.read(length)
  540. if debug: print("here")
  541. #if debug: print(binascii.hexlify(data["__streamdata__"]))
  542. e = readNonWhitespace(stream)
  543. ndstream = stream.read(8)
  544. if (e + ndstream) != b_("endstream"):
  545. # (sigh) - the odd PDF file has a length that is too long, so
  546. # we need to read backwards to find the "endstream" ending.
  547. # ReportLab (unknown version) generates files with this bug,
  548. # and Python users into PDF files tend to be our audience.
  549. # we need to do this to correct the streamdata and chop off
  550. # an extra character.
  551. pos = stream.tell()
  552. stream.seek(-10, 1)
  553. end = stream.read(9)
  554. if end == b_("endstream"):
  555. # we found it by looking back one character further.
  556. data["__streamdata__"] = data["__streamdata__"][:-1]
  557. else:
  558. if debug: print(("E", e, ndstream, debugging.toHex(end)))
  559. stream.seek(pos, 0)
  560. raise utils.PdfReadError("Unable to find 'endstream' marker after stream at byte %s." % utils.hexStr(stream.tell()))
  561. else:
  562. stream.seek(pos, 0)
  563. if "__streamdata__" in data:
  564. return StreamObject.initializeFromDictionary(data)
  565. else:
  566. retval = DictionaryObject()
  567. retval.update(data)
  568. return retval
  569. readFromStream = staticmethod(readFromStream)
  570. class TreeObject(DictionaryObject):
  571. def __init__(self):
  572. DictionaryObject.__init__(self)
  573. def hasChildren(self):
  574. return '/First' in self
  575. def __iter__(self):
  576. return self.children()
  577. def children(self):
  578. if not self.hasChildren():
  579. raise StopIteration
  580. child = self['/First']
  581. while True:
  582. yield child
  583. if child == self['/Last']:
  584. raise StopIteration
  585. child = child['/Next']
  586. def addChild(self, child, pdf):
  587. childObj = child.getObject()
  588. child = pdf.getReference(childObj)
  589. assert isinstance(child, IndirectObject)
  590. if '/First' not in self:
  591. self[NameObject('/First')] = child
  592. self[NameObject('/Count')] = NumberObject(0)
  593. prev = None
  594. else:
  595. prev = self['/Last']
  596. self[NameObject('/Last')] = child
  597. self[NameObject('/Count')] = NumberObject(self[NameObject('/Count')] + 1)
  598. if prev:
  599. prevRef = pdf.getReference(prev)
  600. assert isinstance(prevRef, IndirectObject)
  601. childObj[NameObject('/Prev')] = prevRef
  602. prev[NameObject('/Next')] = child
  603. parentRef = pdf.getReference(self)
  604. assert isinstance(parentRef, IndirectObject)
  605. childObj[NameObject('/Parent')] = parentRef
  606. def removeChild(self, child):
  607. childObj = child.getObject()
  608. if NameObject('/Parent') not in childObj:
  609. raise ValueError("Removed child does not appear to be a tree item")
  610. elif childObj[NameObject('/Parent')] != self:
  611. raise ValueError("Removed child is not a member of this tree")
  612. found = False
  613. prevRef = None
  614. prev = None
  615. curRef = self[NameObject('/First')]
  616. cur = curRef.getObject()
  617. lastRef = self[NameObject('/Last')]
  618. last = lastRef.getObject()
  619. while cur != None:
  620. if cur == childObj:
  621. if prev == None:
  622. if NameObject('/Next') in cur:
  623. # Removing first tree node
  624. nextRef = cur[NameObject('/Next')]
  625. next = nextRef.getObject()
  626. del next[NameObject('/Prev')]
  627. self[NameObject('/First')] = nextRef
  628. self[NameObject('/Count')] = self[NameObject('/Count')] - 1
  629. else:
  630. # Removing only tree node
  631. assert self[NameObject('/Count')] == 1
  632. del self[NameObject('/Count')]
  633. del self[NameObject('/First')]
  634. if NameObject('/Last') in self:
  635. del self[NameObject('/Last')]
  636. else:
  637. if NameObject('/Next') in cur:
  638. # Removing middle tree node
  639. nextRef = cur[NameObject('/Next')]
  640. next = nextRef.getObject()
  641. next[NameObject('/Prev')] = prevRef
  642. prev[NameObject('/Next')] = nextRef
  643. self[NameObject('/Count')] = self[NameObject('/Count')] - 1
  644. else:
  645. # Removing last tree node
  646. assert cur == last
  647. del prev[NameObject('/Next')]
  648. self[NameObject('/Last')] = prevRef
  649. self[NameObject('/Count')] = self[NameObject('/Count')] - 1
  650. found = True
  651. break
  652. prevRef = curRef
  653. prev = cur
  654. if NameObject('/Next') in cur:
  655. curRef = cur[NameObject('/Next')]
  656. cur = curRef.getObject()
  657. else:
  658. curRef = None
  659. cur = None
  660. if not found:
  661. raise ValueError("Removal couldn't find item in tree")
  662. del childObj[NameObject('/Parent')]
  663. if NameObject('/Next') in childObj:
  664. del childObj[NameObject('/Next')]
  665. if NameObject('/Prev') in childObj:
  666. del childObj[NameObject('/Prev')]
  667. def emptyTree(self):
  668. for child in self:
  669. childObj = child.getObject()
  670. del childObj[NameObject('/Parent')]
  671. if NameObject('/Next') in childObj:
  672. del childObj[NameObject('/Next')]
  673. if NameObject('/Prev') in childObj:
  674. del childObj[NameObject('/Prev')]
  675. if NameObject('/Count') in self:
  676. del self[NameObject('/Count')]
  677. if NameObject('/First') in self:
  678. del self[NameObject('/First')]
  679. if NameObject('/Last') in self:
  680. del self[NameObject('/Last')]
  681. class StreamObject(DictionaryObject):
  682. def __init__(self):
  683. self._data = None
  684. self.decodedSelf = None
  685. def writeToStream(self, stream, encryption_key):
  686. self[NameObject("/Length")] = NumberObject(len(self._data))
  687. DictionaryObject.writeToStream(self, stream, encryption_key)
  688. del self["/Length"]
  689. stream.write(b_("\nstream\n"))
  690. data = self._data
  691. if encryption_key:
  692. data = RC4_encrypt(encryption_key, data)
  693. stream.write(data)
  694. stream.write(b_("\nendstream"))
  695. def initializeFromDictionary(data):
  696. if "/Filter" in data:
  697. retval = EncodedStreamObject()
  698. else:
  699. retval = DecodedStreamObject()
  700. retval._data = data["__streamdata__"]
  701. del data["__streamdata__"]
  702. del data["/Length"]
  703. retval.update(data)
  704. return retval
  705. initializeFromDictionary = staticmethod(initializeFromDictionary)
  706. def flateEncode(self):
  707. if "/Filter" in self:
  708. f = self["/Filter"]
  709. if isinstance(f, ArrayObject):
  710. f.insert(0, NameObject("/FlateDecode"))
  711. else:
  712. newf = ArrayObject()
  713. newf.append(NameObject("/FlateDecode"))
  714. newf.append(f)
  715. f = newf
  716. else:
  717. f = NameObject("/FlateDecode")
  718. retval = EncodedStreamObject()
  719. retval[NameObject("/Filter")] = f
  720. retval._data = filters.FlateDecode.encode(self._data)
  721. return retval
  722. class DecodedStreamObject(StreamObject):
  723. def getData(self):
  724. return self._data
  725. def setData(self, data):
  726. self._data = data
  727. class EncodedStreamObject(StreamObject):
  728. def __init__(self):
  729. self.decodedSelf = None
  730. def getData(self):
  731. if self.decodedSelf:
  732. # cached version of decoded object
  733. return self.decodedSelf.getData()
  734. else:
  735. # create decoded object
  736. decoded = DecodedStreamObject()
  737. decoded._data = filters.decodeStreamData(self)
  738. for key, value in list(self.items()):
  739. if not key in ("/Length", "/Filter", "/DecodeParms"):
  740. decoded[key] = value
  741. self.decodedSelf = decoded
  742. return decoded._data
  743. def setData(self, data):
  744. raise utils.PdfReadError("Creating EncodedStreamObject is not currently supported")
  745. class RectangleObject(ArrayObject):
  746. """
  747. This class is used to represent *page boxes* in PyPDF2. These boxes include:
  748. * :attr:`artBox <PyPDF2.pdf.PageObject.artBox>`
  749. * :attr:`bleedBox <PyPDF2.pdf.PageObject.bleedBox>`
  750. * :attr:`cropBox <PyPDF2.pdf.PageObject.cropBox>`
  751. * :attr:`mediaBox <PyPDF2.pdf.PageObject.mediaBox>`
  752. * :attr:`trimBox <PyPDF2.pdf.PageObject.trimBox>`
  753. """
  754. def __init__(self, arr):
  755. # must have four points
  756. assert len(arr) == 4
  757. # automatically convert arr[x] into NumberObject(arr[x]) if necessary
  758. ArrayObject.__init__(self, [self.ensureIsNumber(x) for x in arr])
  759. def ensureIsNumber(self, value):
  760. if not isinstance(value, (NumberObject, FloatObject)):
  761. value = FloatObject(value)
  762. return value
  763. def __repr__(self):
  764. return "RectangleObject(%s)" % repr(list(self))
  765. def getLowerLeft_x(self):
  766. return self[0]
  767. def getLowerLeft_y(self):
  768. return self[1]
  769. def getUpperRight_x(self):
  770. return self[2]
  771. def getUpperRight_y(self):
  772. return self[3]
  773. def getUpperLeft_x(self):
  774. return self.getLowerLeft_x()
  775. def getUpperLeft_y(self):
  776. return self.getUpperRight_y()
  777. def getLowerRight_x(self):
  778. return self.getUpperRight_x()
  779. def getLowerRight_y(self):
  780. return self.getLowerLeft_y()
  781. def getLowerLeft(self):
  782. return self.getLowerLeft_x(), self.getLowerLeft_y()
  783. def getLowerRight(self):
  784. return self.getLowerRight_x(), self.getLowerRight_y()
  785. def getUpperLeft(self):
  786. return self.getUpperLeft_x(), self.getUpperLeft_y()
  787. def getUpperRight(self):
  788. return self.getUpperRight_x(), self.getUpperRight_y()
  789. def setLowerLeft(self, value):
  790. self[0], self[1] = [self.ensureIsNumber(x) for x in value]
  791. def setLowerRight(self, value):
  792. self[2], self[1] = [self.ensureIsNumber(x) for x in value]
  793. def setUpperLeft(self, value):
  794. self[0], self[3] = [self.ensureIsNumber(x) for x in value]
  795. def setUpperRight(self, value):
  796. self[2], self[3] = [self.ensureIsNumber(x) for x in value]
  797. def getWidth(self):
  798. return self.getUpperRight_x() - self.getLowerLeft_x()
  799. def getHeight(self):
  800. return self.getUpperRight_y() - self.getLowerLeft_y()
  # Corner accessors exposed as read/write properties. No deleter is
  # installed, and no explicit doc argument is passed.
  lowerLeft = property(fget=getLowerLeft, fset=setLowerLeft)
  """
  The lower left coordinate of this box in (x,y) form; can be read and
  assigned.
  """
  lowerRight = property(fget=getLowerRight, fset=setLowerRight)
  """
  The lower right coordinate of this box in (x,y) form; can be read and
  assigned.
  """
  upperLeft = property(fget=getUpperLeft, fset=setUpperLeft)
  """
  The upper left coordinate of this box in (x,y) form; can be read and
  assigned.
  """
  upperRight = property(fget=getUpperRight, fset=setUpperRight)
  """
  The upper right coordinate of this box in (x,y) form; can be read and
  assigned.
  """
  class Field(TreeObject):
      """
      A class representing a field dictionary. This class is accessed through
      :meth:`getFields()<PyPDF2.PdfFileReader.getFields>`
      """

      def __init__(self, data):
          """Copy the recognised field entries from ``data`` into this object.

          :param data: mapping to read the entries from; keys it does not
              contain are simply left out of the new field.
          """
          DictionaryObject.__init__(self)
          for attr in ("/FT", "/Parent", "/Kids", "/T", "/TU", "/TM", "/Ff",
                       "/V", "/DV", "/AA"):
              try:
                  entry = data[attr]
              except KeyError:
                  # Entry not present in the source dictionary: skip it.
                  continue
              self[NameObject(attr)] = entry

      @property
      def fieldType(self):
          """Read-only property accessing the type of this field."""
          return self.get("/FT")

      @property
      def parent(self):
          """Read-only property accessing the parent of this field."""
          return self.get("/Parent")

      @property
      def kids(self):
          """Read-only property accessing the kids of this field."""
          return self.get("/Kids")

      @property
      def name(self):
          """Read-only property accessing the name of this field."""
          return self.get("/T")

      @property
      def altName(self):
          """Read-only property accessing the alternate name of this field."""
          return self.get("/TU")

      @property
      def mappingName(self):
          """
          Read-only property accessing the mapping name of this field. This
          name is used by PyPDF2 as a key in the dictionary returned by
          :meth:`getFields()<PyPDF2.PdfFileReader.getFields>`
          """
          return self.get("/TM")

      @property
      def flags(self):
          """
          Read-only property accessing the field flags, specifying various
          characteristics of the field (see Table 8.70 of the PDF 1.7 reference).
          """
          return self.get("/Ff")

      @property
      def value(self):
          """
          Read-only property accessing the value of this field. Format
          varies based on field type.
          """
          return self.get("/V")

      @property
      def defaultValue(self):
          """Read-only property accessing the default value of this field."""
          return self.get("/DV")

      @property
      def additionalActions(self):
          """
          Read-only property accessing the additional actions dictionary.
          This dictionary defines the field's behavior in response to trigger events.
          See Section 8.5.2 of the PDF 1.7 reference.
          """
          return self.get("/AA")
  class Destination(TreeObject):
      """
      A class representing a destination within a PDF file.
      See section 8.2.1 of the PDF 1.6 reference.

      :param str title: Title of this destination.
      :param int page: Page number of this destination.
      :param str typ: How the destination is displayed.
      :param args: Additional arguments may be necessary depending on the type.
      :raises PdfReadError: If destination type is invalid.

      Valid ``typ`` arguments (see PDF spec for details):
           /Fit       No additional arguments
           /XYZ       [left] [top] [zoomFactor]
           /FitH      [top]
           /FitV      [left]
           /FitR      [left] [bottom] [right] [top]
           /FitB      No additional arguments
           /FitBH     [top]
           /FitBV     [left]
      """

      def __init__(self, title, page, typ, *args):
          DictionaryObject.__init__(self)
          self[NameObject("/Title")] = title
          self[NameObject("/Page")] = page
          self[NameObject("/Type")] = typ

          # from table 8.2 of the PDF 1.7 reference.
          # ``args`` is unpacked before anything is stored, so a wrong
          # argument count raises ValueError without adding partial entries.
          if typ == "/XYZ":
              left, top, zoom = args
              self[NameObject("/Left")] = left
              self[NameObject("/Top")] = top
              self[NameObject("/Zoom")] = zoom
          elif typ == "/FitR":
              left, bottom, right, top = args
              self[NameObject("/Left")] = left
              self[NameObject("/Bottom")] = bottom
              self[NameObject("/Right")] = right
              self[NameObject("/Top")] = top
          elif typ in ["/FitH", "/FitBH"]:
              (top,) = args
              self[NameObject("/Top")] = top
          elif typ in ["/FitV", "/FitBV"]:
              (left,) = args
              self[NameObject("/Left")] = left
          elif typ in ["/Fit", "/FitB"]:
              # These types take no additional arguments.
              pass
          else:
              raise utils.PdfReadError("Unknown Destination Type: %r" % typ)

      def getDestArray(self):
          """Return the destination as an ArrayObject.

          The array holds the raw ``/Page`` entry and ``/Type``, followed by
          whichever of ``/Left``, ``/Bottom``, ``/Right``, ``/Top`` and
          ``/Zoom`` are present, in that order.
          """
          head = [self.raw_get('/Page'), self['/Type']]
          placement = [
              self[name]
              for name in ['/Left', '/Bottom', '/Right', '/Top', '/Zoom']
              if name in self
          ]
          return ArrayObject(head + placement)

      def writeToStream(self, stream, encryption_key):
          """Write this destination to ``stream`` as a dictionary with a
          ``/D`` entry (the destination array) and a ``/S /GoTo`` entry."""
          stream.write(b_("<<\n"))

          NameObject('/D').writeToStream(stream, encryption_key)
          stream.write(b_(" "))
          self.getDestArray().writeToStream(stream, encryption_key)

          # No separator is written between the array and the /S key.
          NameObject("/S").writeToStream(stream, encryption_key)
          stream.write(b_(" "))
          NameObject("/GoTo").writeToStream(stream, encryption_key)

          stream.write(b_("\n"))
          stream.write(b_(">>"))

      @property
      def title(self):
          """
          Read-only property accessing the destination title.

          :rtype: str
          """
          return self.get("/Title")

      @property
      def page(self):
          """
          Read-only property accessing the destination page number.

          :rtype: int
          """
          return self.get("/Page")

      @property
      def typ(self):
          """
          Read-only property accessing the destination type.

          :rtype: str
          """
          return self.get("/Type")

      @property
      def zoom(self):
          """
          Read-only property accessing the zoom factor.

          :rtype: int, or ``None`` if not available.
          """
          return self.get("/Zoom", None)

      @property
      def left(self):
          """
          Read-only property accessing the left horizontal coordinate.

          :rtype: int, or ``None`` if not available.
          """
          return self.get("/Left", None)

      @property
      def right(self):
          """
          Read-only property accessing the right horizontal coordinate.

          :rtype: int, or ``None`` if not available.
          """
          return self.get("/Right", None)

      @property
      def top(self):
          """
          Read-only property accessing the top vertical coordinate.

          :rtype: int, or ``None`` if not available.
          """
          return self.get("/Top", None)

      @property
      def bottom(self):
          """
          Read-only property accessing the bottom vertical coordinate.

          :rtype: int, or ``None`` if not available.
          """
          return self.get("/Bottom", None)
  class Bookmark(Destination):
      """A Destination that serialises itself with ``/Title``, the
      ``/Parent``, ``/First``, ``/Last``, ``/Next``, ``/Prev`` entries it
      holds, and a ``/Dest`` array."""

      def writeToStream(self, stream, encryption_key):
          """Write this bookmark to ``stream`` as a dictionary.

          Only the listed entries that are present on this object are
          emitted (one per line, using their raw stored values), followed by
          a ``/Dest`` entry built from :meth:`getDestArray`.
          """
          stream.write(b_("<<\n"))
          for name in ['/Title', '/Parent', '/First', '/Last', '/Next', '/Prev']:
              if name not in self:
                  continue
              key = NameObject(name)
              key.writeToStream(stream, encryption_key)
              stream.write(b_(" "))
              self.raw_get(key).writeToStream(stream, encryption_key)
              stream.write(b_("\n"))

          NameObject('/Dest').writeToStream(stream, encryption_key)
          stream.write(b_(" "))
          self.getDestArray().writeToStream(stream, encryption_key)
          stream.write(b_("\n"))
          stream.write(b_(">>"))
  def encode_pdfdocencoding(unicode_string):
      """
      Encode a text string using the PDFDocEncoding reverse table.

      :param unicode_string: text to encode; each character is looked up in
          ``_pdfDocEncoding_rev``.
      :return: the byte string made of one encoded piece per input character
          (an empty byte string for empty input).
      :raises UnicodeEncodeError: if a character does not exist in the
          translation table.
      """
      # Collect the per-character pieces and join once at the end; growing a
      # byte string with ``+=`` re-copies the whole buffer on every character.
      pieces = []
      for c in unicode_string:
          try:
              code = _pdfDocEncoding_rev[c]
          except KeyError:
              raise UnicodeEncodeError("pdfdocencoding", c, -1, -1,
                      "does not exist in translation table")
          pieces.append(b_(chr(code)))
      return b_('').join(pieces)
  def decode_pdfdocencoding(byte_array):
      """
      Decode a byte sequence using the ``_pdfDocEncoding`` table.

      :param byte_array: bytes to decode; each item is converted to a table
          index with ``ord_``.
      :return: the decoded text (an empty text string for empty input).
      :raises UnicodeDecodeError: if a byte maps to ``u'\\u0000'``, the
          table's marker for a code without a translation.
      """
      # Collect the characters and join once at the end instead of growing
      # the result with ``+=`` inside the loop.
      chars = []
      for b in byte_array:
          c = _pdfDocEncoding[ord_(b)]
          if c == u_('\u0000'):
              raise UnicodeDecodeError("pdfdocencoding", utils.barray(b), -1, -1,
                      "does not exist in translation table")
          chars.append(c)
      return u_('').join(chars)
  # Byte value -> character table used by decode_pdfdocencoding(); the tuple
  # index is the byte value. An entry of u_('\u0000') marks a code with no
  # translation: decoding such a byte raises UnicodeDecodeError, and the
  # reverse-table build below skips it.
  # NOTE(review): codes 0x09, 0x0A and 0x0D (tab, LF, CR) are treated as
  # unmapped here; the PDF specification's PDFDocEncoding table appears to
  # define them -- confirm against the spec before changing.
  _pdfDocEncoding = (
    # 0x00 - 0x17: no translation
    u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'),
    u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'),
    u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'),
    # 0x18 - 0x1f
    u_('\u02d8'), u_('\u02c7'), u_('\u02c6'), u_('\u02d9'), u_('\u02dd'), u_('\u02db'), u_('\u02da'), u_('\u02dc'),
    # 0x20 - 0x7f: identical to the code point, except 0x7f (no translation)
    u_('\u0020'), u_('\u0021'), u_('\u0022'), u_('\u0023'), u_('\u0024'), u_('\u0025'), u_('\u0026'), u_('\u0027'),
    u_('\u0028'), u_('\u0029'), u_('\u002a'), u_('\u002b'), u_('\u002c'), u_('\u002d'), u_('\u002e'), u_('\u002f'),
    u_('\u0030'), u_('\u0031'), u_('\u0032'), u_('\u0033'), u_('\u0034'), u_('\u0035'), u_('\u0036'), u_('\u0037'),
    u_('\u0038'), u_('\u0039'), u_('\u003a'), u_('\u003b'), u_('\u003c'), u_('\u003d'), u_('\u003e'), u_('\u003f'),
    u_('\u0040'), u_('\u0041'), u_('\u0042'), u_('\u0043'), u_('\u0044'), u_('\u0045'), u_('\u0046'), u_('\u0047'),
    u_('\u0048'), u_('\u0049'), u_('\u004a'), u_('\u004b'), u_('\u004c'), u_('\u004d'), u_('\u004e'), u_('\u004f'),
    u_('\u0050'), u_('\u0051'), u_('\u0052'), u_('\u0053'), u_('\u0054'), u_('\u0055'), u_('\u0056'), u_('\u0057'),
    u_('\u0058'), u_('\u0059'), u_('\u005a'), u_('\u005b'), u_('\u005c'), u_('\u005d'), u_('\u005e'), u_('\u005f'),
    u_('\u0060'), u_('\u0061'), u_('\u0062'), u_('\u0063'), u_('\u0064'), u_('\u0065'), u_('\u0066'), u_('\u0067'),
    u_('\u0068'), u_('\u0069'), u_('\u006a'), u_('\u006b'), u_('\u006c'), u_('\u006d'), u_('\u006e'), u_('\u006f'),
    u_('\u0070'), u_('\u0071'), u_('\u0072'), u_('\u0073'), u_('\u0074'), u_('\u0075'), u_('\u0076'), u_('\u0077'),
    u_('\u0078'), u_('\u0079'), u_('\u007a'), u_('\u007b'), u_('\u007c'), u_('\u007d'), u_('\u007e'), u_('\u0000'),
    # 0x80 - 0x9f: 0x9f has no translation
    u_('\u2022'), u_('\u2020'), u_('\u2021'), u_('\u2026'), u_('\u2014'), u_('\u2013'), u_('\u0192'), u_('\u2044'),
    u_('\u2039'), u_('\u203a'), u_('\u2212'), u_('\u2030'), u_('\u201e'), u_('\u201c'), u_('\u201d'), u_('\u2018'),
    u_('\u2019'), u_('\u201a'), u_('\u2122'), u_('\ufb01'), u_('\ufb02'), u_('\u0141'), u_('\u0152'), u_('\u0160'),
    u_('\u0178'), u_('\u017d'), u_('\u0131'), u_('\u0142'), u_('\u0153'), u_('\u0161'), u_('\u017e'), u_('\u0000'),
    # 0xa0 - 0xff: identical to the code point, except 0xa0 (U+20AC) and
    # 0xad (no translation)
    u_('\u20ac'), u_('\u00a1'), u_('\u00a2'), u_('\u00a3'), u_('\u00a4'), u_('\u00a5'), u_('\u00a6'), u_('\u00a7'),
    u_('\u00a8'), u_('\u00a9'), u_('\u00aa'), u_('\u00ab'), u_('\u00ac'), u_('\u0000'), u_('\u00ae'), u_('\u00af'),
    u_('\u00b0'), u_('\u00b1'), u_('\u00b2'), u_('\u00b3'), u_('\u00b4'), u_('\u00b5'), u_('\u00b6'), u_('\u00b7'),
    u_('\u00b8'), u_('\u00b9'), u_('\u00ba'), u_('\u00bb'), u_('\u00bc'), u_('\u00bd'), u_('\u00be'), u_('\u00bf'),
    u_('\u00c0'), u_('\u00c1'), u_('\u00c2'), u_('\u00c3'), u_('\u00c4'), u_('\u00c5'), u_('\u00c6'), u_('\u00c7'),
    u_('\u00c8'), u_('\u00c9'), u_('\u00ca'), u_('\u00cb'), u_('\u00cc'), u_('\u00cd'), u_('\u00ce'), u_('\u00cf'),
    u_('\u00d0'), u_('\u00d1'), u_('\u00d2'), u_('\u00d3'), u_('\u00d4'), u_('\u00d5'), u_('\u00d6'), u_('\u00d7'),
    u_('\u00d8'), u_('\u00d9'), u_('\u00da'), u_('\u00db'), u_('\u00dc'), u_('\u00dd'), u_('\u00de'), u_('\u00df'),
    u_('\u00e0'), u_('\u00e1'), u_('\u00e2'), u_('\u00e3'), u_('\u00e4'), u_('\u00e5'), u_('\u00e6'), u_('\u00e7'),
    u_('\u00e8'), u_('\u00e9'), u_('\u00ea'), u_('\u00eb'), u_('\u00ec'), u_('\u00ed'), u_('\u00ee'), u_('\u00ef'),
    u_('\u00f0'), u_('\u00f1'), u_('\u00f2'), u_('\u00f3'), u_('\u00f4'), u_('\u00f5'), u_('\u00f6'), u_('\u00f7'),
    u_('\u00f8'), u_('\u00f9'), u_('\u00fa'), u_('\u00fb'), u_('\u00fc'), u_('\u00fd'), u_('\u00fe'), u_('\u00ff')
  )
  # One entry per possible byte value.
  assert len(_pdfDocEncoding) == 256
  # Reverse lookup (character -> byte value) used by encode_pdfdocencoding().
  # Entries marked u_("\u0000") have no translation and are left out; the
  # assert checks that no character is listed twice in the forward table.
  _pdfDocEncoding_rev = {}
  for i, char in enumerate(_pdfDocEncoding):
      if char != u_("\u0000"):
          assert char not in _pdfDocEncoding_rev
          _pdfDocEncoding_rev[char] = i