I updated the PDF Booklet project and removed Python 2 dependencies so that it will run under Ubuntu 22.04.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

358 lines
13 KiB

2 years ago
  1. import re
  2. import datetime
  3. import decimal
  4. from .generic import PdfObject
  5. from xml.dom import getDOMImplementation
  6. from xml.dom.minidom import parseString
  7. from .utils import u_
  8. RDF_NAMESPACE = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
  9. DC_NAMESPACE = "http://purl.org/dc/elements/1.1/"
  10. XMP_NAMESPACE = "http://ns.adobe.com/xap/1.0/"
  11. PDF_NAMESPACE = "http://ns.adobe.com/pdf/1.3/"
  12. XMPMM_NAMESPACE = "http://ns.adobe.com/xap/1.0/mm/"
  13. # What is the PDFX namespace, you might ask? I might ask that too. It's
  14. # a completely undocumented namespace used to place "custom metadata"
  15. # properties, which are arbitrary metadata properties with no semantic or
  16. # documented meaning. Elements in the namespace are key/value-style storage,
  17. # where the element name is the key and the content is the value. The keys
  18. # are transformed into valid XML identifiers by substituting an invalid
  19. # identifier character with \u2182 followed by the unicode hex ID of the
  20. # original character. A key like "my car" is therefore "my\u21820020car".
  21. #
  22. # \u2182, in case you're wondering, is the unicode character
  23. # \u{ROMAN NUMERAL TEN THOUSAND}, a straightforward and obvious choice for
  24. # escaping characters.
  25. #
  26. # Intentional users of the pdfx namespace should be shot on sight. A
  27. # custom data schema and sensical XML elements could be used instead, as is
  28. # suggested by Adobe's own documentation on XMP (under "Extensibility of
  29. # Schemas").
  30. #
  31. # Information presented here on the /pdfx/ schema is a result of limited
  32. # reverse engineering, and does not constitute a full specification.
  33. PDFX_NAMESPACE = "http://ns.adobe.com/pdfx/1.3/"
  34. iso8601 = re.compile("""
  35. (?P<year>[0-9]{4})
  36. (-
  37. (?P<month>[0-9]{2})
  38. (-
  39. (?P<day>[0-9]+)
  40. (T
  41. (?P<hour>[0-9]{2}):
  42. (?P<minute>[0-9]{2})
  43. (:(?P<second>[0-9]{2}(.[0-9]+)?))?
  44. (?P<tzd>Z|[-+][0-9]{2}:[0-9]{2})
  45. )?
  46. )?
  47. )?
  48. """, re.VERBOSE)
  49. class XmpInformation(PdfObject):
  50. """
  51. An object that represents Adobe XMP metadata.
  52. Usually accessed by :meth:`getXmpMetadata()<PyPDF2.PdfFileReader.getXmpMetadata>`
  53. """
  54. def __init__(self, stream):
  55. self.stream = stream
  56. docRoot = parseString(self.stream.getData())
  57. self.rdfRoot = docRoot.getElementsByTagNameNS(RDF_NAMESPACE, "RDF")[0]
  58. self.cache = {}
  59. def writeToStream(self, stream, encryption_key):
  60. self.stream.writeToStream(stream, encryption_key)
  61. def getElement(self, aboutUri, namespace, name):
  62. for desc in self.rdfRoot.getElementsByTagNameNS(RDF_NAMESPACE, "Description"):
  63. if desc.getAttributeNS(RDF_NAMESPACE, "about") == aboutUri:
  64. attr = desc.getAttributeNodeNS(namespace, name)
  65. if attr != None:
  66. yield attr
  67. for element in desc.getElementsByTagNameNS(namespace, name):
  68. yield element
  69. def getNodesInNamespace(self, aboutUri, namespace):
  70. for desc in self.rdfRoot.getElementsByTagNameNS(RDF_NAMESPACE, "Description"):
  71. if desc.getAttributeNS(RDF_NAMESPACE, "about") == aboutUri:
  72. for i in range(desc.attributes.length):
  73. attr = desc.attributes.item(i)
  74. if attr.namespaceURI == namespace:
  75. yield attr
  76. for child in desc.childNodes:
  77. if child.namespaceURI == namespace:
  78. yield child
  79. def _getText(self, element):
  80. text = ""
  81. for child in element.childNodes:
  82. if child.nodeType == child.TEXT_NODE:
  83. text += child.data
  84. return text
  85. def _converter_string(value):
  86. return value
  87. def _converter_date(value):
  88. m = iso8601.match(value)
  89. year = int(m.group("year"))
  90. month = int(m.group("month") or "1")
  91. day = int(m.group("day") or "1")
  92. hour = int(m.group("hour") or "0")
  93. minute = int(m.group("minute") or "0")
  94. second = decimal.Decimal(m.group("second") or "0")
  95. seconds = second.to_integral(decimal.ROUND_FLOOR)
  96. milliseconds = (second - seconds) * 1000000
  97. tzd = m.group("tzd") or "Z"
  98. dt = datetime.datetime(year, month, day, hour, minute, seconds, milliseconds)
  99. if tzd != "Z":
  100. tzd_hours, tzd_minutes = [int(x) for x in tzd.split(":")]
  101. tzd_hours *= -1
  102. if tzd_hours < 0:
  103. tzd_minutes *= -1
  104. dt = dt + datetime.timedelta(hours=tzd_hours, minutes=tzd_minutes)
  105. return dt
  106. _test_converter_date = staticmethod(_converter_date)
  107. def _getter_bag(namespace, name, converter):
  108. def get(self):
  109. cached = self.cache.get(namespace, {}).get(name)
  110. if cached:
  111. return cached
  112. retval = []
  113. for element in self.getElement("", namespace, name):
  114. bags = element.getElementsByTagNameNS(RDF_NAMESPACE, "Bag")
  115. if len(bags):
  116. for bag in bags:
  117. for item in bag.getElementsByTagNameNS(RDF_NAMESPACE, "li"):
  118. value = self._getText(item)
  119. value = converter(value)
  120. retval.append(value)
  121. ns_cache = self.cache.setdefault(namespace, {})
  122. ns_cache[name] = retval
  123. return retval
  124. return get
  125. def _getter_seq(namespace, name, converter):
  126. def get(self):
  127. cached = self.cache.get(namespace, {}).get(name)
  128. if cached:
  129. return cached
  130. retval = []
  131. for element in self.getElement("", namespace, name):
  132. seqs = element.getElementsByTagNameNS(RDF_NAMESPACE, "Seq")
  133. if len(seqs):
  134. for seq in seqs:
  135. for item in seq.getElementsByTagNameNS(RDF_NAMESPACE, "li"):
  136. value = self._getText(item)
  137. value = converter(value)
  138. retval.append(value)
  139. else:
  140. value = converter(self._getText(element))
  141. retval.append(value)
  142. ns_cache = self.cache.setdefault(namespace, {})
  143. ns_cache[name] = retval
  144. return retval
  145. return get
  146. def _getter_langalt(namespace, name, converter):
  147. def get(self):
  148. cached = self.cache.get(namespace, {}).get(name)
  149. if cached:
  150. return cached
  151. retval = {}
  152. for element in self.getElement("", namespace, name):
  153. alts = element.getElementsByTagNameNS(RDF_NAMESPACE, "Alt")
  154. if len(alts):
  155. for alt in alts:
  156. for item in alt.getElementsByTagNameNS(RDF_NAMESPACE, "li"):
  157. value = self._getText(item)
  158. value = converter(value)
  159. retval[item.getAttribute("xml:lang")] = value
  160. else:
  161. retval["x-default"] = converter(self._getText(element))
  162. ns_cache = self.cache.setdefault(namespace, {})
  163. ns_cache[name] = retval
  164. return retval
  165. return get
  166. def _getter_single(namespace, name, converter):
  167. def get(self):
  168. cached = self.cache.get(namespace, {}).get(name)
  169. if cached:
  170. return cached
  171. value = None
  172. for element in self.getElement("", namespace, name):
  173. if element.nodeType == element.ATTRIBUTE_NODE:
  174. value = element.nodeValue
  175. else:
  176. value = self._getText(element)
  177. break
  178. if value != None:
  179. value = converter(value)
  180. ns_cache = self.cache.setdefault(namespace, {})
  181. ns_cache[name] = value
  182. return value
  183. return get
  184. dc_contributor = property(_getter_bag(DC_NAMESPACE, "contributor", _converter_string))
  185. """
  186. Contributors to the resource (other than the authors). An unsorted
  187. array of names.
  188. """
  189. dc_coverage = property(_getter_single(DC_NAMESPACE, "coverage", _converter_string))
  190. """
  191. Text describing the extent or scope of the resource.
  192. """
  193. dc_creator = property(_getter_seq(DC_NAMESPACE, "creator", _converter_string))
  194. """
  195. A sorted array of names of the authors of the resource, listed in order
  196. of precedence.
  197. """
  198. dc_date = property(_getter_seq(DC_NAMESPACE, "date", _converter_date))
  199. """
  200. A sorted array of dates (datetime.datetime instances) of signifigance to
  201. the resource. The dates and times are in UTC.
  202. """
  203. dc_description = property(_getter_langalt(DC_NAMESPACE, "description", _converter_string))
  204. """
  205. A language-keyed dictionary of textual descriptions of the content of the
  206. resource.
  207. """
  208. dc_format = property(_getter_single(DC_NAMESPACE, "format", _converter_string))
  209. """
  210. The mime-type of the resource.
  211. """
  212. dc_identifier = property(_getter_single(DC_NAMESPACE, "identifier", _converter_string))
  213. """
  214. Unique identifier of the resource.
  215. """
  216. dc_language = property(_getter_bag(DC_NAMESPACE, "language", _converter_string))
  217. """
  218. An unordered array specifying the languages used in the resource.
  219. """
  220. dc_publisher = property(_getter_bag(DC_NAMESPACE, "publisher", _converter_string))
  221. """
  222. An unordered array of publisher names.
  223. """
  224. dc_relation = property(_getter_bag(DC_NAMESPACE, "relation", _converter_string))
  225. """
  226. An unordered array of text descriptions of relationships to other
  227. documents.
  228. """
  229. dc_rights = property(_getter_langalt(DC_NAMESPACE, "rights", _converter_string))
  230. """
  231. A language-keyed dictionary of textual descriptions of the rights the
  232. user has to this resource.
  233. """
  234. dc_source = property(_getter_single(DC_NAMESPACE, "source", _converter_string))
  235. """
  236. Unique identifier of the work from which this resource was derived.
  237. """
  238. dc_subject = property(_getter_bag(DC_NAMESPACE, "subject", _converter_string))
  239. """
  240. An unordered array of descriptive phrases or keywrods that specify the
  241. topic of the content of the resource.
  242. """
  243. dc_title = property(_getter_langalt(DC_NAMESPACE, "title", _converter_string))
  244. """
  245. A language-keyed dictionary of the title of the resource.
  246. """
  247. dc_type = property(_getter_bag(DC_NAMESPACE, "type", _converter_string))
  248. """
  249. An unordered array of textual descriptions of the document type.
  250. """
  251. pdf_keywords = property(_getter_single(PDF_NAMESPACE, "Keywords", _converter_string))
  252. """
  253. An unformatted text string representing document keywords.
  254. """
  255. pdf_pdfversion = property(_getter_single(PDF_NAMESPACE, "PDFVersion", _converter_string))
  256. """
  257. The PDF file version, for example 1.0, 1.3.
  258. """
  259. pdf_producer = property(_getter_single(PDF_NAMESPACE, "Producer", _converter_string))
  260. """
  261. The name of the tool that created the PDF document.
  262. """
  263. xmp_createDate = property(_getter_single(XMP_NAMESPACE, "CreateDate", _converter_date))
  264. """
  265. The date and time the resource was originally created. The date and
  266. time are returned as a UTC datetime.datetime object.
  267. """
  268. xmp_modifyDate = property(_getter_single(XMP_NAMESPACE, "ModifyDate", _converter_date))
  269. """
  270. The date and time the resource was last modified. The date and time
  271. are returned as a UTC datetime.datetime object.
  272. """
  273. xmp_metadataDate = property(_getter_single(XMP_NAMESPACE, "MetadataDate", _converter_date))
  274. """
  275. The date and time that any metadata for this resource was last
  276. changed. The date and time are returned as a UTC datetime.datetime
  277. object.
  278. """
  279. xmp_creatorTool = property(_getter_single(XMP_NAMESPACE, "CreatorTool", _converter_string))
  280. """
  281. The name of the first known tool used to create the resource.
  282. """
  283. xmpmm_documentId = property(_getter_single(XMPMM_NAMESPACE, "DocumentID", _converter_string))
  284. """
  285. The common identifier for all versions and renditions of this resource.
  286. """
  287. xmpmm_instanceId = property(_getter_single(XMPMM_NAMESPACE, "InstanceID", _converter_string))
  288. """
  289. An identifier for a specific incarnation of a document, updated each
  290. time a file is saved.
  291. """
  292. def custom_properties(self):
  293. if not hasattr(self, "_custom_properties"):
  294. self._custom_properties = {}
  295. for node in self.getNodesInNamespace("", PDFX_NAMESPACE):
  296. key = node.localName
  297. while True:
  298. # see documentation about PDFX_NAMESPACE earlier in file
  299. idx = key.find(u_("\u2182"))
  300. if idx == -1:
  301. break
  302. key = key[:idx] + chr(int(key[idx+1:idx+5], base=16)) + key[idx+5:]
  303. if node.nodeType == node.ATTRIBUTE_NODE:
  304. value = node.nodeValue
  305. else:
  306. value = self._getText(node)
  307. self._custom_properties[key] = value
  308. return self._custom_properties
  309. custom_properties = property(custom_properties)
  310. """
  311. Retrieves custom metadata properties defined in the undocumented pdfx
  312. metadata schema.
  313. :return: a dictionary of key/value items for custom metadata properties.
  314. :rtype: dict
  315. """