Spaces:
Runtime error
Runtime error
# Copyright (c) 2006, Mathieu Fenniak | |
# Copyright (c) 2007, Ashish Kulkarni <[email protected]> | |
# | |
# All rights reserved. | |
# | |
# Redistribution and use in source and binary forms, with or without | |
# modification, are permitted provided that the following conditions are | |
# met: | |
# | |
# * Redistributions of source code must retain the above copyright notice, | |
# this list of conditions and the following disclaimer. | |
# * Redistributions in binary form must reproduce the above copyright notice, | |
# this list of conditions and the following disclaimer in the documentation | |
# and/or other materials provided with the distribution. | |
# * The name of the author may not be used to endorse or promote products | |
# derived from this software without specific prior written permission. | |
# | |
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | |
# POSSIBILITY OF SUCH DAMAGE. | |
import math | |
import uuid | |
import warnings | |
from decimal import Decimal | |
from typing import ( | |
Any, | |
Callable, | |
Dict, | |
Iterable, | |
Iterator, | |
List, | |
Optional, | |
Set, | |
Tuple, | |
Union, | |
cast, | |
) | |
from ._cmap import build_char_map, unknown_char_map | |
from ._protocols import PdfReaderProtocol | |
from ._utils import ( | |
CompressedTransformationMatrix, | |
File, | |
TransformationMatrixType, | |
deprecation_no_replacement, | |
deprecation_with_replacement, | |
logger_warning, | |
matrix_multiply, | |
) | |
from .constants import AnnotationDictionaryAttributes as ADA | |
from .constants import ImageAttributes as IA | |
from .constants import PageAttributes as PG | |
from .constants import Ressources as RES | |
from .errors import PageSizeNotDefinedError | |
from .filters import _xobj_to_image | |
from .generic import ( | |
ArrayObject, | |
ContentStream, | |
DictionaryObject, | |
EncodedStreamObject, | |
FloatObject, | |
IndirectObject, | |
NameObject, | |
NullObject, | |
NumberObject, | |
RectangleObject, | |
encode_pdfdocencoding, | |
) | |
CUSTOM_RTL_MIN: int = -1 | |
CUSTOM_RTL_MAX: int = -1 | |
CUSTOM_RTL_SPECIAL_CHARS: List[int] = [] | |
def set_custom_rtl( | |
_min: Union[str, int, None] = None, | |
_max: Union[str, int, None] = None, | |
specials: Union[str, List[int], None] = None, | |
) -> Tuple[int, int, List[int]]: | |
""" | |
Change the Right-To-Left and special characters custom parameters. | |
Args: | |
_min: The new minimum value for the range of custom characters that | |
will be written right to left. | |
If set to `None`, the value will not be changed. | |
If set to an integer or string, it will be converted to its ASCII code. | |
The default value is -1, which sets no additional range to be converted. | |
_max: The new maximum value for the range of custom characters that will be written right to left. | |
If set to `None`, the value will not be changed. | |
If set to an integer or string, it will be converted to its ASCII code. | |
The default value is -1, which sets no additional range to be converted. | |
specials: The new list of special characters to be inserted in the current insertion order. | |
If set to `None`, the current value will not be changed. | |
If set to a string, it will be converted to a list of ASCII codes. | |
The default value is an empty list. | |
Returns: | |
A tuple containing the new values for `CUSTOM_RTL_MIN`, `CUSTOM_RTL_MAX`, and `CUSTOM_RTL_SPECIAL_CHARS`. | |
""" | |
global CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS | |
if isinstance(_min, int): | |
CUSTOM_RTL_MIN = _min | |
elif isinstance(_min, str): | |
CUSTOM_RTL_MIN = ord(_min) | |
if isinstance(_max, int): | |
CUSTOM_RTL_MAX = _max | |
elif isinstance(_max, str): | |
CUSTOM_RTL_MAX = ord(_max) | |
if isinstance(specials, str): | |
CUSTOM_RTL_SPECIAL_CHARS = [ord(x) for x in specials] | |
elif isinstance(specials, list): | |
CUSTOM_RTL_SPECIAL_CHARS = specials | |
return CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS | |
def _get_rectangle(self: Any, name: str, defaults: Iterable[str]) -> RectangleObject: | |
retval: Union[None, RectangleObject, IndirectObject] = self.get(name) | |
if isinstance(retval, RectangleObject): | |
return retval | |
if retval is None: | |
for d in defaults: | |
retval = self.get(d) | |
if retval is not None: | |
break | |
if isinstance(retval, IndirectObject): | |
retval = self.pdf.get_object(retval) | |
retval = RectangleObject(retval) # type: ignore | |
_set_rectangle(self, name, retval) | |
return retval | |
def getRectangle( | |
self: Any, name: str, defaults: Iterable[str] | |
) -> RectangleObject: # pragma: no cover | |
deprecation_no_replacement("getRectangle", "3.0.0") | |
return _get_rectangle(self, name, defaults) | |
def _set_rectangle(self: Any, name: str, value: Union[RectangleObject, float]) -> None: | |
name = NameObject(name) | |
self[name] = value | |
def setRectangle( | |
self: Any, name: str, value: Union[RectangleObject, float] | |
) -> None: # pragma: no cover | |
deprecation_no_replacement("setRectangle", "3.0.0") | |
_set_rectangle(self, name, value) | |
def _delete_rectangle(self: Any, name: str) -> None: | |
del self[name] | |
def deleteRectangle(self: Any, name: str) -> None: # pragma: no cover | |
deprecation_no_replacement("deleteRectangle", "3.0.0") | |
del self[name] | |
def _create_rectangle_accessor(name: str, fallback: Iterable[str]) -> property: | |
return property( | |
lambda self: _get_rectangle(self, name, fallback), | |
lambda self, value: _set_rectangle(self, name, value), | |
lambda self: _delete_rectangle(self, name), | |
) | |
def createRectangleAccessor( | |
name: str, fallback: Iterable[str] | |
) -> property: # pragma: no cover | |
deprecation_no_replacement("createRectangleAccessor", "3.0.0") | |
return _create_rectangle_accessor(name, fallback) | |
class Transformation: | |
""" | |
Represent a 2D transformation. | |
The transformation between two coordinate systems is represented by a 3-by-3 | |
transformation matrix matrix with the following form:: | |
a b 0 | |
c d 0 | |
e f 1 | |
Because a transformation matrix has only six elements that can be changed, | |
it is usually specified in PDF as the six-element array [ a b c d e f ]. | |
Coordinate transformations are expressed as matrix multiplications:: | |
a b 0 | |
[ x′ y′ 1 ] = [ x y 1 ] × c d 0 | |
e f 1 | |
Example | |
------- | |
>>> from PyPDF2 import Transformation | |
>>> op = Transformation().scale(sx=2, sy=3).translate(tx=10, ty=20) | |
>>> page.add_transformation(op) | |
""" | |
# 9.5.4 Coordinate Systems for 3D | |
# 4.2.2 Common Transformations | |
def __init__(self, ctm: CompressedTransformationMatrix = (1, 0, 0, 1, 0, 0)): | |
self.ctm = ctm | |
def matrix(self) -> TransformationMatrixType: | |
""" | |
Return the transformation matrix as a tuple of tuples in the form: | |
((a, b, 0), (c, d, 0), (e, f, 1)) | |
""" | |
return ( | |
(self.ctm[0], self.ctm[1], 0), | |
(self.ctm[2], self.ctm[3], 0), | |
(self.ctm[4], self.ctm[5], 1), | |
) | |
def compress(matrix: TransformationMatrixType) -> CompressedTransformationMatrix: | |
""" | |
Compresses the transformation matrix into a tuple of (a, b, c, d, e, f). | |
Args: | |
matrix: The transformation matrix as a tuple of tuples. | |
Returns: | |
A tuple representing the transformation matrix as (a, b, c, d, e, f) | |
""" | |
return ( | |
matrix[0][0], | |
matrix[0][1], | |
matrix[1][0], | |
matrix[1][1], | |
matrix[2][0], | |
matrix[2][1], | |
) | |
def translate(self, tx: float = 0, ty: float = 0) -> "Transformation": | |
""" | |
Translate the contents of a page. | |
Args: | |
tx: The translation along the x-axis. | |
ty: The translation along the y-axis. | |
Returns: | |
A new `Transformation` instance | |
""" | |
m = self.ctm | |
return Transformation(ctm=(m[0], m[1], m[2], m[3], m[4] + tx, m[5] + ty)) | |
def scale( | |
self, sx: Optional[float] = None, sy: Optional[float] = None | |
) -> "Transformation": | |
""" | |
Scale the contents of a page towards the origin of the coordinate system. | |
Typically, that is the lower-left corner of the page. That can be | |
changed by translating the contents / the page boxes. | |
Args: | |
sx: The scale factor along the x-axis. | |
sy: The scale factor along the y-axis. | |
Returns: | |
A new Transformation instance with the scaled matrix. | |
""" | |
if sx is None and sy is None: | |
raise ValueError("Either sx or sy must be specified") | |
if sx is None: | |
sx = sy | |
if sy is None: | |
sy = sx | |
assert sx is not None | |
assert sy is not None | |
op: TransformationMatrixType = ((sx, 0, 0), (0, sy, 0), (0, 0, 1)) | |
ctm = Transformation.compress(matrix_multiply(self.matrix, op)) | |
return Transformation(ctm) | |
def rotate(self, rotation: float) -> "Transformation": | |
""" | |
Rotate the contents of a page. | |
Args: | |
rotation: The angle of rotation in degrees. | |
Returns: | |
A new `Transformation` instance with the rotated matrix. | |
""" | |
rotation = math.radians(rotation) | |
op: TransformationMatrixType = ( | |
(math.cos(rotation), math.sin(rotation), 0), | |
(-math.sin(rotation), math.cos(rotation), 0), | |
(0, 0, 1), | |
) | |
ctm = Transformation.compress(matrix_multiply(self.matrix, op)) | |
return Transformation(ctm) | |
def __repr__(self) -> str: | |
return f"Transformation(ctm={self.ctm})" | |
def apply_on( | |
self, pt: Union[Tuple[Decimal, Decimal], Tuple[float, float], List[float]] | |
) -> Union[Tuple[float, float], List[float]]: | |
""" | |
Apply the transformation matrix on the given point. | |
Args: | |
pt: A tuple or list representing the point in the form (x, y) | |
Returns: | |
A tuple or list representing the transformed point in the form (x', y') | |
""" | |
pt1 = ( | |
float(pt[0]) * self.ctm[0] + float(pt[1]) * self.ctm[2] + self.ctm[4], | |
float(pt[0]) * self.ctm[1] + float(pt[1]) * self.ctm[3] + self.ctm[5], | |
) | |
return list(pt1) if isinstance(pt, list) else pt1 | |
class PageObject(DictionaryObject): | |
""" | |
PageObject represents a single page within a PDF file. | |
Typically this object will be created by accessing the | |
:meth:`get_page()<PyPDF2.PdfReader.get_page>` method of the | |
:class:`PdfReader<PyPDF2.PdfReader>` class, but it is | |
also possible to create an empty page with the | |
:meth:`create_blank_page()<PyPDF2._page.PageObject.create_blank_page>` static method. | |
Args: | |
pdf: PDF file the page belongs to. | |
indirect_reference: Stores the original indirect reference to | |
this object in its source PDF | |
""" | |
original_page: "PageObject" # very local use in writer when appending | |
def __init__( | |
self, | |
pdf: Optional[PdfReaderProtocol] = None, | |
indirect_reference: Optional[IndirectObject] = None, | |
indirect_ref: Optional[IndirectObject] = None, # deprecated | |
) -> None: | |
DictionaryObject.__init__(self) | |
self.pdf: Optional[PdfReaderProtocol] = pdf | |
if indirect_ref is not None: # deprecated | |
warnings.warn( | |
( | |
"indirect_ref is deprecated and will be removed in " | |
"PyPDF2 4.0.0. Use indirect_reference instead of indirect_ref." | |
), | |
DeprecationWarning, | |
) | |
if indirect_reference is not None: | |
raise ValueError("Use indirect_reference instead of indirect_ref.") | |
indirect_reference = indirect_ref | |
self.indirect_reference = indirect_reference | |
def indirect_ref(self) -> Optional[IndirectObject]: # deprecated | |
warnings.warn( | |
( | |
"indirect_ref is deprecated and will be removed in PyPDF2 4.0.0" | |
"Use indirect_reference instead of indirect_ref." | |
), | |
DeprecationWarning, | |
) | |
return self.indirect_reference | |
def indirect_ref(self, value: Optional[IndirectObject]) -> None: # deprecated | |
self.indirect_reference = value | |
def hash_value_data(self) -> bytes: | |
data = super().hash_value_data() | |
data += b"%d" % id(self) | |
return data | |
def user_unit(self) -> float: | |
""" | |
A read-only positive number giving the size of user space units. | |
It is in multiples of 1/72 inch. Hence a value of 1 means a user space | |
unit is 1/72 inch, and a value of 3 means that a user space unit is | |
3/72 inch. | |
""" | |
return self.get(PG.USER_UNIT, 1) | |
def create_blank_page( | |
pdf: Optional[Any] = None, # PdfReader | |
width: Union[float, Decimal, None] = None, | |
height: Union[float, Decimal, None] = None, | |
) -> "PageObject": | |
""" | |
Return a new blank page. | |
If ``width`` or ``height`` is ``None``, try to get the page size | |
from the last page of *pdf*. | |
Args: | |
pdf: PDF file the page belongs to | |
width: The width of the new page expressed in default user | |
space units. | |
height: The height of the new page expressed in default user | |
space units. | |
Returns: | |
The new blank page | |
Raises: | |
PageSizeNotDefinedError: if ``pdf`` is ``None`` or contains | |
no page | |
""" | |
page = PageObject(pdf) | |
# Creates a new page (cf PDF Reference 7.7.3.3) | |
page.__setitem__(NameObject(PG.TYPE), NameObject("/Page")) | |
page.__setitem__(NameObject(PG.PARENT), NullObject()) | |
page.__setitem__(NameObject(PG.RESOURCES), DictionaryObject()) | |
if width is None or height is None: | |
if pdf is not None and len(pdf.pages) > 0: | |
lastpage = pdf.pages[len(pdf.pages) - 1] | |
width = lastpage.mediabox.width | |
height = lastpage.mediabox.height | |
else: | |
raise PageSizeNotDefinedError | |
page.__setitem__( | |
NameObject(PG.MEDIABOX), RectangleObject((0, 0, width, height)) # type: ignore | |
) | |
return page | |
def createBlankPage( | |
pdf: Optional[Any] = None, # PdfReader | |
width: Union[float, Decimal, None] = None, | |
height: Union[float, Decimal, None] = None, | |
) -> "PageObject": # pragma: no cover | |
""" | |
.. deprecated:: 1.28.0 | |
Use :meth:`create_blank_page` instead. | |
""" | |
deprecation_with_replacement("createBlankPage", "create_blank_page", "3.0.0") | |
return PageObject.create_blank_page(pdf, width, height) | |
def images(self) -> List[File]: | |
""" | |
Get a list of all images of the page. | |
This requires pillow. You can install it via 'pip install PyPDF2[image]'. | |
For the moment, this does NOT include inline images. They will be added | |
in future. | |
""" | |
images_extracted: List[File] = [] | |
if RES.XOBJECT not in self[PG.RESOURCES]: # type: ignore | |
return images_extracted | |
x_object = self[PG.RESOURCES][RES.XOBJECT].get_object() # type: ignore | |
for obj in x_object: | |
if x_object[obj][IA.SUBTYPE] == "/Image": | |
extension, byte_stream = _xobj_to_image(x_object[obj]) | |
if extension is not None: | |
filename = f"{obj[1:]}{extension}" | |
images_extracted.append(File(name=filename, data=byte_stream)) | |
return images_extracted | |
def rotation(self) -> int: | |
""" | |
The VISUAL rotation of the page. | |
This number has to be a multiple of 90 degrees: 0,90,180,270 | |
This property does not affect "/Contents" | |
""" | |
return int(self.get(PG.ROTATE, 0)) | |
def rotation(self, r: Union[int, float]) -> None: | |
self[NameObject(PG.ROTATE)] = NumberObject((((int(r) + 45) // 90) * 90) % 360) | |
def transfer_rotation_to_content(self) -> None: | |
""" | |
Apply the rotation of the page to the content and the media/crop/... boxes. | |
It's recommended to apply this function before page merging. | |
""" | |
r = -self.rotation # rotation to apply is in the otherway | |
self.rotation = 0 | |
mb = RectangleObject(self.mediabox) | |
trsf = ( | |
Transformation() | |
.translate( | |
-float(mb.left + mb.width / 2), -float(mb.bottom + mb.height / 2) | |
) | |
.rotate(r) | |
) | |
pt1 = trsf.apply_on(mb.lower_left) | |
pt2 = trsf.apply_on(mb.upper_right) | |
trsf = trsf.translate(-min(pt1[0], pt2[0]), -min(pt1[1], pt2[1])) | |
self.add_transformation(trsf, False) | |
for b in ["/MediaBox", "/CropBox", "/BleedBox", "/TrimBox", "/ArtBox"]: | |
if b in self: | |
rr = RectangleObject(self[b]) # type: ignore | |
pt1 = trsf.apply_on(rr.lower_left) | |
pt2 = trsf.apply_on(rr.upper_right) | |
self[NameObject(b)] = RectangleObject( | |
( | |
min(pt1[0], pt2[0]), | |
min(pt1[1], pt2[1]), | |
max(pt1[0], pt2[0]), | |
max(pt1[1], pt2[1]), | |
) | |
) | |
def rotate(self, angle: int) -> "PageObject": | |
""" | |
Rotate a page clockwise by increments of 90 degrees. | |
Args: | |
angle: Angle to rotate the page. Must be an increment of 90 deg. | |
""" | |
if angle % 90 != 0: | |
raise ValueError("Rotation angle must be a multiple of 90") | |
rotate_obj = self.get(PG.ROTATE, 0) | |
current_angle = ( | |
rotate_obj if isinstance(rotate_obj, int) else rotate_obj.get_object() | |
) | |
self[NameObject(PG.ROTATE)] = NumberObject(current_angle + angle) | |
return self | |
def rotate_clockwise(self, angle: int) -> "PageObject": # pragma: no cover | |
deprecation_with_replacement("rotate_clockwise", "rotate", "3.0.0") | |
return self.rotate(angle) | |
def rotateClockwise(self, angle: int) -> "PageObject": # pragma: no cover | |
""" | |
.. deprecated:: 1.28.0 | |
Use :meth:`rotate_clockwise` instead. | |
""" | |
deprecation_with_replacement("rotateClockwise", "rotate", "3.0.0") | |
return self.rotate(angle) | |
def rotateCounterClockwise(self, angle: int) -> "PageObject": # pragma: no cover | |
""" | |
.. deprecated:: 1.28.0 | |
Use :meth:`rotate_clockwise` with a negative argument instead. | |
""" | |
deprecation_with_replacement("rotateCounterClockwise", "rotate", "3.0.0") | |
return self.rotate(-angle) | |
def _merge_resources( | |
res1: DictionaryObject, res2: DictionaryObject, resource: Any | |
) -> Tuple[Dict[str, Any], Dict[str, Any]]: | |
new_res = DictionaryObject() | |
new_res.update(res1.get(resource, DictionaryObject()).get_object()) | |
page2res = cast( | |
DictionaryObject, res2.get(resource, DictionaryObject()).get_object() | |
) | |
rename_res = {} | |
for key in list(page2res.keys()): | |
if key in new_res and new_res.raw_get(key) != page2res.raw_get(key): | |
newname = NameObject(key + str(uuid.uuid4())) | |
rename_res[key] = newname | |
new_res[newname] = page2res[key] | |
elif key not in new_res: | |
new_res[key] = page2res.raw_get(key) | |
return new_res, rename_res | |
def _content_stream_rename( | |
stream: ContentStream, rename: Dict[Any, Any], pdf: Any # PdfReader | |
) -> ContentStream: | |
if not rename: | |
return stream | |
stream = ContentStream(stream, pdf) | |
for operands, _operator in stream.operations: | |
if isinstance(operands, list): | |
for i in range(len(operands)): | |
op = operands[i] | |
if isinstance(op, NameObject): | |
operands[i] = rename.get(op, op) | |
elif isinstance(operands, dict): | |
for i in operands: | |
op = operands[i] | |
if isinstance(op, NameObject): | |
operands[i] = rename.get(op, op) | |
else: | |
raise KeyError(f"type of operands is {type(operands)}") | |
return stream | |
def _push_pop_gs(contents: Any, pdf: Any) -> ContentStream: # PdfReader | |
# adds a graphics state "push" and "pop" to the beginning and end | |
# of a content stream. This isolates it from changes such as | |
# transformation matricies. | |
stream = ContentStream(contents, pdf) | |
stream.operations.insert(0, ([], "q")) | |
stream.operations.append(([], "Q")) | |
return stream | |
def _add_transformation_matrix( | |
contents: Any, pdf: Any, ctm: CompressedTransformationMatrix | |
) -> ContentStream: # PdfReader | |
# adds transformation matrix at the beginning of the given | |
# contents stream. | |
a, b, c, d, e, f = ctm | |
contents = ContentStream(contents, pdf) | |
contents.operations.insert( | |
0, | |
[ | |
[ | |
FloatObject(a), | |
FloatObject(b), | |
FloatObject(c), | |
FloatObject(d), | |
FloatObject(e), | |
FloatObject(f), | |
], | |
" cm", | |
], | |
) | |
return contents | |
def get_contents(self) -> Optional[ContentStream]: | |
""" | |
Access the page contents. | |
:return: the ``/Contents`` object, or ``None`` if it doesn't exist. | |
``/Contents`` is optional, as described in PDF Reference 7.7.3.3 | |
""" | |
if PG.CONTENTS in self: | |
return self[PG.CONTENTS].get_object() # type: ignore | |
else: | |
return None | |
def getContents(self) -> Optional[ContentStream]: # pragma: no cover | |
""" | |
.. deprecated:: 1.28.0 | |
Use :meth:`get_contents` instead. | |
""" | |
deprecation_with_replacement("getContents", "get_contents", "3.0.0") | |
return self.get_contents() | |
def merge_page(self, page2: "PageObject", expand: bool = False) -> None: | |
""" | |
Merge the content streams of two pages into one. | |
Resource references | |
(i.e. fonts) are maintained from both pages. The mediabox/cropbox/etc | |
of this page are not altered. The parameter page's content stream will | |
be added to the end of this page's content stream, meaning that it will | |
be drawn after, or "on top" of this page. | |
Args: | |
page2: The page to be merged into this one. Should be | |
an instance of :class:`PageObject<PageObject>`. | |
expand: If true, the current page dimensions will be | |
expanded to accommodate the dimensions of the page to be merged. | |
""" | |
self._merge_page(page2, expand=expand) | |
def mergePage(self, page2: "PageObject") -> None: # pragma: no cover | |
""" | |
.. deprecated:: 1.28.0 | |
Use :meth:`merge_page` instead. | |
""" | |
deprecation_with_replacement("mergePage", "merge_page", "3.0.0") | |
return self.merge_page(page2) | |
def _merge_page( | |
self, | |
page2: "PageObject", | |
page2transformation: Optional[Callable[[Any], ContentStream]] = None, | |
ctm: Optional[CompressedTransformationMatrix] = None, | |
expand: bool = False, | |
) -> None: | |
# First we work on merging the resource dictionaries. This allows us | |
# to find out what symbols in the content streams we might need to | |
# rename. | |
new_resources = DictionaryObject() | |
rename = {} | |
try: | |
original_resources = cast(DictionaryObject, self[PG.RESOURCES].get_object()) | |
except KeyError: | |
original_resources = DictionaryObject() | |
try: | |
page2resources = cast(DictionaryObject, page2[PG.RESOURCES].get_object()) | |
except KeyError: | |
page2resources = DictionaryObject() | |
new_annots = ArrayObject() | |
for page in (self, page2): | |
if PG.ANNOTS in page: | |
annots = page[PG.ANNOTS] | |
if isinstance(annots, ArrayObject): | |
for ref in annots: | |
new_annots.append(ref) | |
for res in ( | |
RES.EXT_G_STATE, | |
RES.FONT, | |
RES.XOBJECT, | |
RES.COLOR_SPACE, | |
RES.PATTERN, | |
RES.SHADING, | |
RES.PROPERTIES, | |
): | |
new, newrename = PageObject._merge_resources( | |
original_resources, page2resources, res | |
) | |
if new: | |
new_resources[NameObject(res)] = new | |
rename.update(newrename) | |
# Combine /ProcSet sets. | |
new_resources[NameObject(RES.PROC_SET)] = ArrayObject( | |
frozenset( | |
original_resources.get(RES.PROC_SET, ArrayObject()).get_object() | |
).union( | |
frozenset(page2resources.get(RES.PROC_SET, ArrayObject()).get_object()) | |
) | |
) | |
new_content_array = ArrayObject() | |
original_content = self.get_contents() | |
if original_content is not None: | |
new_content_array.append( | |
PageObject._push_pop_gs(original_content, self.pdf) | |
) | |
page2content = page2.get_contents() | |
if page2content is not None: | |
page2content = ContentStream(page2content, self.pdf) | |
rect = page2.trimbox | |
page2content.operations.insert( | |
0, | |
( | |
map( | |
FloatObject, | |
[ | |
rect.left, | |
rect.bottom, | |
rect.width, | |
rect.height, | |
], | |
), | |
"re", | |
), | |
) | |
page2content.operations.insert(1, ([], "W")) | |
page2content.operations.insert(2, ([], "n")) | |
if page2transformation is not None: | |
page2content = page2transformation(page2content) | |
page2content = PageObject._content_stream_rename( | |
page2content, rename, self.pdf | |
) | |
page2content = PageObject._push_pop_gs(page2content, self.pdf) | |
new_content_array.append(page2content) | |
# if expanding the page to fit a new page, calculate the new media box size | |
if expand: | |
self._expand_mediabox(page2, ctm) | |
self[NameObject(PG.CONTENTS)] = ContentStream(new_content_array, self.pdf) | |
self[NameObject(PG.RESOURCES)] = new_resources | |
self[NameObject(PG.ANNOTS)] = new_annots | |
def _expand_mediabox( | |
self, page2: "PageObject", ctm: Optional[CompressedTransformationMatrix] | |
) -> None: | |
corners1 = ( | |
self.mediabox.left.as_numeric(), | |
self.mediabox.bottom.as_numeric(), | |
self.mediabox.right.as_numeric(), | |
self.mediabox.top.as_numeric(), | |
) | |
corners2 = ( | |
page2.mediabox.left.as_numeric(), | |
page2.mediabox.bottom.as_numeric(), | |
page2.mediabox.left.as_numeric(), | |
page2.mediabox.top.as_numeric(), | |
page2.mediabox.right.as_numeric(), | |
page2.mediabox.top.as_numeric(), | |
page2.mediabox.right.as_numeric(), | |
page2.mediabox.bottom.as_numeric(), | |
) | |
if ctm is not None: | |
ctm = tuple(float(x) for x in ctm) # type: ignore[assignment] | |
new_x = tuple( | |
ctm[0] * corners2[i] + ctm[2] * corners2[i + 1] + ctm[4] | |
for i in range(0, 8, 2) | |
) | |
new_y = tuple( | |
ctm[1] * corners2[i] + ctm[3] * corners2[i + 1] + ctm[5] | |
for i in range(0, 8, 2) | |
) | |
else: | |
new_x = corners2[0:8:2] | |
new_y = corners2[1:8:2] | |
lowerleft = (min(new_x), min(new_y)) | |
upperright = (max(new_x), max(new_y)) | |
lowerleft = (min(corners1[0], lowerleft[0]), min(corners1[1], lowerleft[1])) | |
upperright = ( | |
max(corners1[2], upperright[0]), | |
max(corners1[3], upperright[1]), | |
) | |
self.mediabox.lower_left = lowerleft | |
self.mediabox.upper_right = upperright | |
def mergeTransformedPage( | |
self, | |
page2: "PageObject", | |
ctm: Union[CompressedTransformationMatrix, Transformation], | |
expand: bool = False, | |
) -> None: # pragma: no cover | |
""" | |
mergeTransformedPage is similar to merge_page, but a transformation | |
matrix is applied to the merged stream. | |
:param PageObject page2: The page to be merged into this one. Should be | |
an instance of :class:`PageObject<PageObject>`. | |
:param tuple ctm: a 6-element tuple containing the operands of the | |
transformation matrix | |
:param bool expand: Whether the page should be expanded to fit the dimensions | |
of the page to be merged. | |
.. deprecated:: 1.28.0 | |
Use :meth:`add_transformation` and :meth:`merge_page` instead. | |
""" | |
deprecation_with_replacement( | |
"page.mergeTransformedPage(page2, ctm)", | |
"page2.add_transformation(ctm); page.merge_page(page2)", | |
"3.0.0", | |
) | |
if isinstance(ctm, Transformation): | |
ctm = ctm.ctm | |
ctm = cast(CompressedTransformationMatrix, ctm) | |
self._merge_page( | |
page2, | |
lambda page2Content: PageObject._add_transformation_matrix( | |
page2Content, page2.pdf, ctm # type: ignore[arg-type] | |
), | |
ctm, | |
expand, | |
) | |
def mergeScaledPage( | |
self, page2: "PageObject", scale: float, expand: bool = False | |
) -> None: # pragma: no cover | |
""" | |
mergeScaledPage is similar to merge_page, but the stream to be merged | |
is scaled by applying a transformation matrix. | |
:param PageObject page2: The page to be merged into this one. Should be | |
an instance of :class:`PageObject<PageObject>`. | |
:param float scale: The scaling factor | |
:param bool expand: Whether the page should be expanded to fit the | |
dimensions of the page to be merged. | |
.. deprecated:: 1.28.0 | |
Use :meth:`add_transformation` and :meth:`merge_page` instead. | |
""" | |
deprecation_with_replacement( | |
"page.mergeScaledPage(page2, scale, expand)", | |
"page2.add_transformation(Transformation().scale(scale)); page.merge_page(page2, expand)", | |
"3.0.0", | |
) | |
op = Transformation().scale(scale, scale) | |
self.mergeTransformedPage(page2, op, expand) | |
def mergeRotatedPage( | |
self, page2: "PageObject", rotation: float, expand: bool = False | |
) -> None: # pragma: no cover | |
""" | |
mergeRotatedPage is similar to merge_page, but the stream to be merged | |
is rotated by applying a transformation matrix. | |
:param PageObject page2: the page to be merged into this one. Should be | |
an instance of :class:`PageObject<PageObject>`. | |
:param float rotation: The angle of the rotation, in degrees | |
:param bool expand: Whether the page should be expanded to fit the | |
dimensions of the page to be merged. | |
.. deprecated:: 1.28.0 | |
Use :meth:`add_transformation` and :meth:`merge_page` instead. | |
""" | |
deprecation_with_replacement( | |
"page.mergeRotatedPage(page2, rotation, expand)", | |
"page2.add_transformation(Transformation().rotate(rotation)); page.merge_page(page2, expand)", | |
"3.0.0", | |
) | |
op = Transformation().rotate(rotation) | |
self.mergeTransformedPage(page2, op, expand) | |
def mergeTranslatedPage( | |
self, page2: "PageObject", tx: float, ty: float, expand: bool = False | |
) -> None: # pragma: no cover | |
""" | |
mergeTranslatedPage is similar to merge_page, but the stream to be | |
merged is translated by applying a transformation matrix. | |
:param PageObject page2: the page to be merged into this one. Should be | |
an instance of :class:`PageObject<PageObject>`. | |
:param float tx: The translation on X axis | |
:param float ty: The translation on Y axis | |
:param bool expand: Whether the page should be expanded to fit the | |
dimensions of the page to be merged. | |
.. deprecated:: 1.28.0 | |
Use :meth:`add_transformation` and :meth:`merge_page` instead. | |
""" | |
deprecation_with_replacement( | |
"page.mergeTranslatedPage(page2, tx, ty, expand)", | |
"page2.add_transformation(Transformation().translate(tx, ty)); page.merge_page(page2, expand)", | |
"3.0.0", | |
) | |
op = Transformation().translate(tx, ty) | |
self.mergeTransformedPage(page2, op, expand) | |
def mergeRotatedTranslatedPage( | |
self, | |
page2: "PageObject", | |
rotation: float, | |
tx: float, | |
ty: float, | |
expand: bool = False, | |
) -> None: # pragma: no cover | |
""" | |
mergeRotatedTranslatedPage is similar to merge_page, but the stream to | |
be merged is rotated and translated by applying a transformation matrix. | |
:param PageObject page2: the page to be merged into this one. Should be | |
an instance of :class:`PageObject<PageObject>`. | |
:param float tx: The translation on X axis | |
:param float ty: The translation on Y axis | |
:param float rotation: The angle of the rotation, in degrees | |
:param bool expand: Whether the page should be expanded to fit the | |
dimensions of the page to be merged. | |
.. deprecated:: 1.28.0 | |
Use :meth:`add_transformation` and :meth:`merge_page` instead. | |
""" | |
deprecation_with_replacement( | |
"page.mergeRotatedTranslatedPage(page2, rotation, tx, ty, expand)", | |
"page2.add_transformation(Transformation().rotate(rotation).translate(tx, ty)); page.merge_page(page2, expand)", | |
"3.0.0", | |
) | |
op = Transformation().translate(-tx, -ty).rotate(rotation).translate(tx, ty) | |
return self.mergeTransformedPage(page2, op, expand) | |
def mergeRotatedScaledPage( | |
self, page2: "PageObject", rotation: float, scale: float, expand: bool = False | |
) -> None: # pragma: no cover | |
""" | |
mergeRotatedScaledPage is similar to merge_page, but the stream to be | |
merged is rotated and scaled by applying a transformation matrix. | |
:param PageObject page2: the page to be merged into this one. Should be | |
an instance of :class:`PageObject<PageObject>`. | |
:param float rotation: The angle of the rotation, in degrees | |
:param float scale: The scaling factor | |
:param bool expand: Whether the page should be expanded to fit the | |
dimensions of the page to be merged. | |
.. deprecated:: 1.28.0 | |
Use :meth:`add_transformation` and :meth:`merge_page` instead. | |
""" | |
deprecation_with_replacement( | |
"page.mergeRotatedScaledPage(page2, rotation, scale, expand)", | |
"page2.add_transformation(Transformation().rotate(rotation).scale(scale)); page.merge_page(page2, expand)", | |
"3.0.0", | |
) | |
op = Transformation().rotate(rotation).scale(scale, scale) | |
self.mergeTransformedPage(page2, op, expand) | |
def mergeScaledTranslatedPage( | |
self, | |
page2: "PageObject", | |
scale: float, | |
tx: float, | |
ty: float, | |
expand: bool = False, | |
) -> None: # pragma: no cover | |
""" | |
mergeScaledTranslatedPage is similar to merge_page, but the stream to be | |
merged is translated and scaled by applying a transformation matrix. | |
:param PageObject page2: the page to be merged into this one. Should be | |
an instance of :class:`PageObject<PageObject>`. | |
:param float scale: The scaling factor | |
:param float tx: The translation on X axis | |
:param float ty: The translation on Y axis | |
:param bool expand: Whether the page should be expanded to fit the | |
dimensions of the page to be merged. | |
.. deprecated:: 1.28.0 | |
Use :meth:`add_transformation` and :meth:`merge_page` instead. | |
""" | |
deprecation_with_replacement( | |
"page.mergeScaledTranslatedPage(page2, scale, tx, ty, expand)", | |
"page2.add_transformation(Transformation().scale(scale).translate(tx, ty)); page.merge_page(page2, expand)", | |
"3.0.0", | |
) | |
op = Transformation().scale(scale, scale).translate(tx, ty) | |
return self.mergeTransformedPage(page2, op, expand) | |
def mergeRotatedScaledTranslatedPage( | |
self, | |
page2: "PageObject", | |
rotation: float, | |
scale: float, | |
tx: float, | |
ty: float, | |
expand: bool = False, | |
) -> None: # pragma: no cover | |
""" | |
mergeRotatedScaledTranslatedPage is similar to merge_page, but the | |
stream to be merged is translated, rotated and scaled by applying a | |
transformation matrix. | |
:param PageObject page2: the page to be merged into this one. Should be | |
an instance of :class:`PageObject<PageObject>`. | |
:param float tx: The translation on X axis | |
:param float ty: The translation on Y axis | |
:param float rotation: The angle of the rotation, in degrees | |
:param float scale: The scaling factor | |
:param bool expand: Whether the page should be expanded to fit the | |
dimensions of the page to be merged. | |
.. deprecated:: 1.28.0 | |
Use :meth:`add_transformation` and :meth:`merge_page` instead. | |
""" | |
deprecation_with_replacement( | |
"page.mergeRotatedScaledTranslatedPage(page2, rotation, tx, ty, expand)", | |
"page2.add_transformation(Transformation().rotate(rotation).scale(scale)); page.merge_page(page2, expand)", | |
"3.0.0", | |
) | |
op = Transformation().rotate(rotation).scale(scale, scale).translate(tx, ty) | |
self.mergeTransformedPage(page2, op, expand) | |
def add_transformation( | |
self, | |
ctm: Union[Transformation, CompressedTransformationMatrix], | |
expand: bool = False, | |
) -> None: | |
""" | |
Apply a transformation matrix to the page. | |
Args: | |
ctm: A 6-element tuple containing the operands of the | |
transformation matrix. Alternatively, a | |
:py:class:`Transformation<PyPDF2.Transformation>` | |
object can be passed. | |
See :doc:`/user/cropping-and-transforming`. | |
""" | |
if isinstance(ctm, Transformation): | |
ctm = ctm.ctm | |
content = self.get_contents() | |
if content is not None: | |
content = PageObject._add_transformation_matrix(content, self.pdf, ctm) | |
content = PageObject._push_pop_gs(content, self.pdf) | |
self[NameObject(PG.CONTENTS)] = content | |
# if expanding the page to fit a new page, calculate the new media box size | |
if expand: | |
corners = [ | |
self.mediabox.left.as_numeric(), | |
self.mediabox.bottom.as_numeric(), | |
self.mediabox.left.as_numeric(), | |
self.mediabox.top.as_numeric(), | |
self.mediabox.right.as_numeric(), | |
self.mediabox.top.as_numeric(), | |
self.mediabox.right.as_numeric(), | |
self.mediabox.bottom.as_numeric(), | |
] | |
ctm = tuple(float(x) for x in ctm) # type: ignore[assignment] | |
new_x = [ | |
ctm[0] * corners[i] + ctm[2] * corners[i + 1] + ctm[4] | |
for i in range(0, 8, 2) | |
] | |
new_y = [ | |
ctm[1] * corners[i] + ctm[3] * corners[i + 1] + ctm[5] | |
for i in range(0, 8, 2) | |
] | |
lowerleft = (min(new_x), min(new_y)) | |
upperright = (max(new_x), max(new_y)) | |
lowerleft = (min(corners[0], lowerleft[0]), min(corners[1], lowerleft[1])) | |
upperright = ( | |
max(corners[2], upperright[0]), | |
max(corners[3], upperright[1]), | |
) | |
self.mediabox.lower_left = lowerleft | |
self.mediabox.upper_right = upperright | |
def addTransformation( | |
self, ctm: CompressedTransformationMatrix | |
) -> None: # pragma: no cover | |
""" | |
.. deprecated:: 1.28.0 | |
Use :meth:`add_transformation` instead. | |
""" | |
deprecation_with_replacement("addTransformation", "add_transformation", "3.0.0") | |
self.add_transformation(ctm) | |
def scale(self, sx: float, sy: float) -> None: | |
""" | |
Scale a page by the given factors by applying a transformation | |
matrix to its content and updating the page size. | |
This updates the mediabox, the cropbox, and the contents | |
of the page. | |
Args: | |
sx: The scaling factor on horizontal axis. | |
sy: The scaling factor on vertical axis. | |
""" | |
self.add_transformation((sx, 0, 0, sy, 0, 0)) | |
self.cropbox = self.cropbox.scale(sx, sy) | |
self.artbox = self.artbox.scale(sx, sy) | |
self.bleedbox = self.bleedbox.scale(sx, sy) | |
self.trimbox = self.trimbox.scale(sx, sy) | |
self.mediabox = self.mediabox.scale(sx, sy) | |
if PG.ANNOTS in self: | |
annotations = self[PG.ANNOTS] | |
if isinstance(annotations, ArrayObject): | |
for annotation in annotations: | |
annotation_obj = annotation.get_object() | |
if ADA.Rect in annotation_obj: | |
rectangle = annotation_obj[ADA.Rect] | |
if isinstance(rectangle, ArrayObject): | |
rectangle[0] = FloatObject(float(rectangle[0]) * sx) | |
rectangle[1] = FloatObject(float(rectangle[1]) * sy) | |
rectangle[2] = FloatObject(float(rectangle[2]) * sx) | |
rectangle[3] = FloatObject(float(rectangle[3]) * sy) | |
if PG.VP in self: | |
viewport = self[PG.VP] | |
if isinstance(viewport, ArrayObject): | |
bbox = viewport[0]["/BBox"] | |
else: | |
bbox = viewport["/BBox"] # type: ignore | |
scaled_bbox = RectangleObject( | |
( | |
float(bbox[0]) * sx, | |
float(bbox[1]) * sy, | |
float(bbox[2]) * sx, | |
float(bbox[3]) * sy, | |
) | |
) | |
if isinstance(viewport, ArrayObject): | |
self[NameObject(PG.VP)][NumberObject(0)][ # type: ignore | |
NameObject("/BBox") | |
] = scaled_bbox | |
else: | |
self[NameObject(PG.VP)][NameObject("/BBox")] = scaled_bbox # type: ignore | |
def scale_by(self, factor: float) -> None: | |
""" | |
Scale a page by the given factor by applying a transformation | |
matrix to its content and updating the page size. | |
Args: | |
factor: The scaling factor (for both X and Y axis). | |
""" | |
self.scale(factor, factor) | |
def scaleBy(self, factor: float) -> None: # pragma: no cover | |
""" | |
.. deprecated:: 1.28.0 | |
Use :meth:`scale_by` instead. | |
""" | |
deprecation_with_replacement("scaleBy", "scale_by", "3.0.0") | |
self.scale(factor, factor) | |
def scale_to(self, width: float, height: float) -> None: | |
""" | |
Scale a page to the specified dimensions by applying a | |
transformation matrix to its content and updating the page size. | |
Args: | |
width: The new width. | |
height: The new height. | |
""" | |
sx = width / float(self.mediabox.width) | |
sy = height / float(self.mediabox.height) | |
self.scale(sx, sy) | |
def scaleTo(self, width: float, height: float) -> None: # pragma: no cover | |
""" | |
.. deprecated:: 1.28.0 | |
Use :meth:`scale_to` instead. | |
""" | |
deprecation_with_replacement("scaleTo", "scale_to", "3.0.0") | |
self.scale_to(width, height) | |
def compress_content_streams(self) -> None: | |
""" | |
Compress the size of this page by joining all content streams and | |
applying a FlateDecode filter. | |
However, it is possible that this function will perform no action if | |
content stream compression becomes "automatic". | |
""" | |
content = self.get_contents() | |
if content is not None: | |
if not isinstance(content, ContentStream): | |
content = ContentStream(content, self.pdf) | |
self[NameObject(PG.CONTENTS)] = content.flate_encode() | |
def compressContentStreams(self) -> None: # pragma: no cover | |
""" | |
.. deprecated:: 1.28.0 | |
Use :meth:`compress_content_streams` instead. | |
""" | |
deprecation_with_replacement( | |
"compressContentStreams", "compress_content_streams", "3.0.0" | |
) | |
self.compress_content_streams() | |
def _debug_for_extract(self) -> str: # pragma: no cover | |
out = "" | |
for ope, op in ContentStream( | |
self["/Contents"].get_object(), self.pdf, "bytes" | |
).operations: | |
if op == b"TJ": | |
s = [x for x in ope[0] if isinstance(x, str)] | |
else: | |
s = [] | |
out += op.decode("utf-8") + " " + "".join(s) + ope.__repr__() + "\n" | |
out += "\n=============================\n" | |
try: | |
for fo in self[PG.RESOURCES]["/Font"]: # type:ignore | |
out += fo + "\n" | |
out += self[PG.RESOURCES]["/Font"][fo].__repr__() + "\n" # type:ignore | |
try: | |
enc_repr = self[PG.RESOURCES]["/Font"][fo][ # type:ignore | |
"/Encoding" | |
].__repr__() | |
out += enc_repr + "\n" | |
except Exception: | |
pass | |
try: | |
out += ( | |
self[PG.RESOURCES]["/Font"][fo][ # type:ignore | |
"/ToUnicode" | |
] | |
.get_data() | |
.decode() | |
+ "\n" | |
) | |
except Exception: | |
pass | |
except KeyError: | |
out += "No Font\n" | |
return out | |
def _extract_text( | |
self, | |
obj: Any, | |
pdf: Any, | |
orientations: Tuple[int, ...] = (0, 90, 180, 270), | |
space_width: float = 200.0, | |
content_key: Optional[str] = PG.CONTENTS, | |
visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None, | |
visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None, | |
visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None, | |
) -> str: | |
""" | |
See extract_text for most arguments. | |
Args: | |
content_key: indicate the default key where to extract data | |
None = the object; this allow to reuse the function on XObject | |
default = "/Content" | |
""" | |
text: str = "" | |
output: str = "" | |
rtl_dir: bool = False # right-to-left | |
cmaps: Dict[ | |
str, | |
Tuple[ | |
str, float, Union[str, Dict[int, str]], Dict[str, str], DictionaryObject | |
], | |
] = {} | |
try: | |
objr = obj | |
while NameObject(PG.RESOURCES) not in objr: | |
# /Resources can be inherited sometimes so we look to parents | |
objr = objr["/Parent"].get_object() | |
# if no parents we will have no /Resources will be available => an exception wil be raised | |
resources_dict = cast(DictionaryObject, objr[PG.RESOURCES]) | |
except Exception: | |
return "" # no resources means no text is possible (no font) we consider the file as not damaged, no need to check for TJ or Tj | |
if "/Font" in resources_dict: | |
for f in cast(DictionaryObject, resources_dict["/Font"]): | |
cmaps[f] = build_char_map(f, space_width, obj) | |
cmap: Tuple[ | |
Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject] | |
] = ( | |
"charmap", | |
{}, | |
"NotInitialized", | |
None, | |
) # (encoding,CMAP,font resource name,dictionary-object of font) | |
try: | |
content = ( | |
obj[content_key].get_object() if isinstance(content_key, str) else obj | |
) | |
if not isinstance(content, ContentStream): | |
content = ContentStream(content, pdf, "bytes") | |
except KeyError: # it means no content can be extracted(certainly empty page) | |
return "" | |
# Note: we check all strings are TextStringObjects. ByteStringObjects | |
# are strings where the byte->string encoding was unknown, so adding | |
# them to the text here would be gibberish. | |
cm_matrix: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] | |
cm_stack = [] | |
tm_matrix: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] | |
tm_prev: List[float] = [ | |
1.0, | |
0.0, | |
0.0, | |
1.0, | |
0.0, | |
0.0, | |
] # will store cm_matrix * tm_matrix | |
char_scale = 1.0 | |
space_scale = 1.0 | |
_space_width: float = 500.0 # will be set correctly at first Tf | |
TL = 0.0 | |
font_size = 12.0 # init just in case of | |
def mult(m: List[float], n: List[float]) -> List[float]: | |
return [ | |
m[0] * n[0] + m[1] * n[2], | |
m[0] * n[1] + m[1] * n[3], | |
m[2] * n[0] + m[3] * n[2], | |
m[2] * n[1] + m[3] * n[3], | |
m[4] * n[0] + m[5] * n[2] + n[4], | |
m[4] * n[1] + m[5] * n[3] + n[5], | |
] | |
def orient(m: List[float]) -> int: | |
if m[3] > 1e-6: | |
return 0 | |
elif m[3] < -1e-6: | |
return 180 | |
elif m[1] > 0: | |
return 90 | |
else: | |
return 270 | |
def current_spacewidth() -> float: | |
# return space_scale * _space_width * char_scale | |
return _space_width / 1000.0 | |
def process_operation(operator: bytes, operands: List) -> None: | |
nonlocal cm_matrix, cm_stack, tm_matrix, tm_prev, output, text, char_scale, space_scale, _space_width, TL, font_size, cmap, orientations, rtl_dir, visitor_text | |
global CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS | |
check_crlf_space: bool = False | |
# Table 5.4 page 405 | |
if operator == b"BT": | |
tm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] | |
# tm_prev = tm_matrix | |
output += text | |
if visitor_text is not None: | |
visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size) | |
# based | |
# if output != "" and output[-1]!="\n": | |
# output += "\n" | |
text = "" | |
return None | |
elif operator == b"ET": | |
output += text | |
if visitor_text is not None: | |
visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size) | |
text = "" | |
# table 4.7 "Graphics state operators", page 219 | |
# cm_matrix calculation is a reserved for the moment | |
elif operator == b"q": | |
cm_stack.append( | |
( | |
cm_matrix, | |
cmap, | |
font_size, | |
char_scale, | |
space_scale, | |
_space_width, | |
TL, | |
) | |
) | |
elif operator == b"Q": | |
try: | |
( | |
cm_matrix, | |
cmap, | |
font_size, | |
char_scale, | |
space_scale, | |
_space_width, | |
TL, | |
) = cm_stack.pop() | |
except Exception: | |
cm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] | |
# rtl_dir = False | |
elif operator == b"cm": | |
output += text | |
if visitor_text is not None: | |
visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size) | |
text = "" | |
cm_matrix = mult( | |
[ | |
float(operands[0]), | |
float(operands[1]), | |
float(operands[2]), | |
float(operands[3]), | |
float(operands[4]), | |
float(operands[5]), | |
], | |
cm_matrix, | |
) | |
# rtl_dir = False | |
# Table 5.2 page 398 | |
elif operator == b"Tz": | |
char_scale = float(operands[0]) / 100.0 | |
elif operator == b"Tw": | |
space_scale = 1.0 + float(operands[0]) | |
elif operator == b"TL": | |
TL = float(operands[0]) | |
elif operator == b"Tf": | |
if text != "": | |
output += text # .translate(cmap) | |
if visitor_text is not None: | |
visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size) | |
text = "" | |
# rtl_dir = False | |
try: | |
# charMapTuple: font_type, float(sp_width / 2), encoding, map_dict, font-dictionary | |
charMapTuple = cmaps[operands[0]] | |
_space_width = charMapTuple[1] | |
# current cmap: encoding, map_dict, font resource name (internal name, not the real font-name), | |
# font-dictionary. The font-dictionary describes the font. | |
cmap = ( | |
charMapTuple[2], | |
charMapTuple[3], | |
operands[0], | |
charMapTuple[4], | |
) | |
except KeyError: # font not found | |
_space_width = unknown_char_map[1] | |
cmap = ( | |
unknown_char_map[2], | |
unknown_char_map[3], | |
"???" + operands[0], | |
None, | |
) | |
try: | |
font_size = float(operands[1]) | |
except Exception: | |
pass # keep previous size | |
# Table 5.5 page 406 | |
elif operator == b"Td": | |
check_crlf_space = True | |
# A special case is a translating only tm: | |
# tm[0..5] = 1 0 0 1 e f, | |
# i.e. tm[4] += tx, tm[5] += ty. | |
tx = float(operands[0]) | |
ty = float(operands[1]) | |
tm_matrix[4] += tx * tm_matrix[0] + ty * tm_matrix[2] | |
tm_matrix[5] += tx * tm_matrix[1] + ty * tm_matrix[3] | |
elif operator == b"Tm": | |
check_crlf_space = True | |
tm_matrix = [ | |
float(operands[0]), | |
float(operands[1]), | |
float(operands[2]), | |
float(operands[3]), | |
float(operands[4]), | |
float(operands[5]), | |
] | |
elif operator == b"T*": | |
check_crlf_space = True | |
tm_matrix[5] -= TL | |
elif operator == b"Tj": | |
check_crlf_space = True | |
m = mult(tm_matrix, cm_matrix) | |
orientation = orient(m) | |
if orientation in orientations: | |
if isinstance(operands[0], str): | |
text += operands[0] | |
else: | |
t: str = "" | |
tt: bytes = ( | |
encode_pdfdocencoding(operands[0]) | |
if isinstance(operands[0], str) | |
else operands[0] | |
) | |
if isinstance(cmap[0], str): | |
try: | |
t = tt.decode( | |
cmap[0], "surrogatepass" | |
) # apply str encoding | |
except Exception: # the data does not match the expectation, we use the alternative ; text extraction may not be good | |
t = tt.decode( | |
"utf-16-be" if cmap[0] == "charmap" else "charmap", | |
"surrogatepass", | |
) # apply str encoding | |
else: # apply dict encoding | |
t = "".join( | |
[ | |
cmap[0][x] if x in cmap[0] else bytes((x,)).decode() | |
for x in tt | |
] | |
) | |
# "\u0590 - \u08FF \uFB50 - \uFDFF" | |
for x in "".join( | |
[cmap[1][x] if x in cmap[1] else x for x in t] | |
): | |
xx = ord(x) | |
# fmt: off | |
if ( # cases where the current inserting order is kept (punctuation,...) | |
(xx <= 0x2F) # punctuations but... | |
or (0x3A <= xx and xx <= 0x40) # numbers (x30-39) | |
or (0x2000 <= xx and xx <= 0x206F) # upper punctuations.. | |
or (0x20A0 <= xx and xx <= 0x21FF) # but (numbers) indices/exponents | |
or xx in CUSTOM_RTL_SPECIAL_CHARS # customized.... | |
): | |
text = x + text if rtl_dir else text + x | |
elif ( # right-to-left characters set | |
(0x0590 <= xx and xx <= 0x08FF) | |
or (0xFB1D <= xx and xx <= 0xFDFF) | |
or (0xFE70 <= xx and xx <= 0xFEFF) | |
or (CUSTOM_RTL_MIN <= xx and xx <= CUSTOM_RTL_MAX) | |
): | |
# print("<",xx,x) | |
if not rtl_dir: | |
rtl_dir = True | |
# print("RTL",text,"*") | |
output += text | |
if visitor_text is not None: | |
visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size) | |
text = "" | |
text = x + text | |
else: # left-to-right | |
# print(">",xx,x,end="") | |
if rtl_dir: | |
rtl_dir = False | |
# print("LTR",text,"*") | |
output += text | |
if visitor_text is not None: | |
visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size) | |
text = "" | |
text = text + x | |
# fmt: on | |
else: | |
return None | |
if check_crlf_space: | |
m = mult(tm_matrix, cm_matrix) | |
orientation = orient(m) | |
delta_x = m[4] - tm_prev[4] | |
delta_y = m[5] - tm_prev[5] | |
k = math.sqrt(abs(m[0] * m[3]) + abs(m[1] * m[2])) | |
f = font_size * k | |
tm_prev = m | |
if orientation not in orientations: | |
return None | |
try: | |
if orientation == 0: | |
if delta_y < -0.8 * f: | |
if (output + text)[-1] != "\n": | |
output += text + "\n" | |
if visitor_text is not None: | |
visitor_text( | |
text + "\n", | |
cm_matrix, | |
tm_matrix, | |
cmap[3], | |
font_size, | |
) | |
text = "" | |
elif ( | |
abs(delta_y) < f * 0.3 | |
and abs(delta_x) > current_spacewidth() * f * 15 | |
): | |
if (output + text)[-1] != " ": | |
text += " " | |
elif orientation == 180: | |
if delta_y > 0.8 * f: | |
if (output + text)[-1] != "\n": | |
output += text + "\n" | |
if visitor_text is not None: | |
visitor_text( | |
text + "\n", | |
cm_matrix, | |
tm_matrix, | |
cmap[3], | |
font_size, | |
) | |
text = "" | |
elif ( | |
abs(delta_y) < f * 0.3 | |
and abs(delta_x) > current_spacewidth() * f * 15 | |
): | |
if (output + text)[-1] != " ": | |
text += " " | |
elif orientation == 90: | |
if delta_x > 0.8 * f: | |
if (output + text)[-1] != "\n": | |
output += text + "\n" | |
if visitor_text is not None: | |
visitor_text( | |
text + "\n", | |
cm_matrix, | |
tm_matrix, | |
cmap[3], | |
font_size, | |
) | |
text = "" | |
elif ( | |
abs(delta_x) < f * 0.3 | |
and abs(delta_y) > current_spacewidth() * f * 15 | |
): | |
if (output + text)[-1] != " ": | |
text += " " | |
elif orientation == 270: | |
if delta_x < -0.8 * f: | |
if (output + text)[-1] != "\n": | |
output += text + "\n" | |
if visitor_text is not None: | |
visitor_text( | |
text + "\n", | |
cm_matrix, | |
tm_matrix, | |
cmap[3], | |
font_size, | |
) | |
text = "" | |
elif ( | |
abs(delta_x) < f * 0.3 | |
and abs(delta_y) > current_spacewidth() * f * 15 | |
): | |
if (output + text)[-1] != " ": | |
text += " " | |
except Exception: | |
pass | |
for operands, operator in content.operations: | |
if visitor_operand_before is not None: | |
visitor_operand_before(operator, operands, cm_matrix, tm_matrix) | |
# multiple operators are defined in here #### | |
if operator == b"'": | |
process_operation(b"T*", []) | |
process_operation(b"Tj", operands) | |
elif operator == b'"': | |
process_operation(b"Tw", [operands[0]]) | |
process_operation(b"Tc", [operands[1]]) | |
process_operation(b"T*", []) | |
process_operation(b"Tj", operands[2:]) | |
elif operator == b"TD": | |
process_operation(b"TL", [-operands[1]]) | |
process_operation(b"Td", operands) | |
elif operator == b"TJ": | |
for op in operands[0]: | |
if isinstance(op, (str, bytes)): | |
process_operation(b"Tj", [op]) | |
if isinstance(op, (int, float, NumberObject, FloatObject)): | |
if ( | |
(abs(float(op)) >= _space_width) | |
and (len(text) > 0) | |
and (text[-1] != " ") | |
): | |
process_operation(b"Tj", [" "]) | |
elif operator == b"Do": | |
output += text | |
if visitor_text is not None: | |
visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size) | |
try: | |
if output[-1] != "\n": | |
output += "\n" | |
if visitor_text is not None: | |
visitor_text("\n", cm_matrix, tm_matrix, cmap[3], font_size) | |
except IndexError: | |
pass | |
try: | |
xobj = resources_dict["/XObject"] | |
if xobj[operands[0]]["/Subtype"] != "/Image": # type: ignore | |
# output += text | |
text = self.extract_xform_text( | |
xobj[operands[0]], # type: ignore | |
orientations, | |
space_width, | |
visitor_operand_before, | |
visitor_operand_after, | |
visitor_text, | |
) | |
output += text | |
if visitor_text is not None: | |
visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size) | |
except Exception: | |
logger_warning( | |
f" impossible to decode XFormObject {operands[0]}", | |
__name__, | |
) | |
finally: | |
text = "" | |
else: | |
process_operation(operator, operands) | |
if visitor_operand_after is not None: | |
visitor_operand_after(operator, operands, cm_matrix, tm_matrix) | |
output += text # just in case of | |
if text != "" and visitor_text is not None: | |
visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size) | |
return output | |
def extract_text( | |
self, | |
*args: Any, | |
Tj_sep: str = None, | |
TJ_sep: str = None, | |
orientations: Union[int, Tuple[int, ...]] = (0, 90, 180, 270), | |
space_width: float = 200.0, | |
visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None, | |
visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None, | |
visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None, | |
) -> str: | |
""" | |
Locate all text drawing commands, in the order they are provided in the | |
content stream, and extract the text. | |
This works well for some PDF files, but poorly for others, depending on | |
the generator used. This will be refined in the future. | |
Do not rely on the order of text coming out of this function, as it | |
will change if this function is made more sophisticated. | |
Arabic, Hebrew,... are extracted in the good order. | |
If required an custom RTL range of characters can be defined; see function set_custom_rtl | |
Additionally you can provide visitor-methods to get informed on all operands and all text-objects. | |
For example in some PDF files this can be useful to parse tables. | |
Args: | |
Tj_sep: Deprecated. Kept for compatibility until PyPDF2 4.0.0 | |
TJ_sep: Deprecated. Kept for compatibility until PyPDF2 4.0.0 | |
orientations: list of orientations text_extraction will look for | |
default = (0, 90, 180, 270) | |
note: currently only 0(Up),90(turned Left), 180(upside Down), | |
270 (turned Right) | |
space_width: force default space width | |
if not extracted from font (default: 200) | |
visitor_operand_before: function to be called before processing an operand. | |
It has four arguments: operand, operand-arguments, | |
current transformation matrix and text matrix. | |
visitor_operand_after: function to be called after processing an operand. | |
It has four arguments: operand, operand-arguments, | |
current transformation matrix and text matrix. | |
visitor_text: function to be called when extracting some text at some position. | |
It has five arguments: text, current transformation matrix, | |
text matrix, font-dictionary and font-size. | |
The font-dictionary may be None in case of unknown fonts. | |
If not None it may e.g. contain key "/BaseFont" with value "/Arial,Bold". | |
Returns: | |
The extracted text | |
""" | |
if len(args) >= 1: | |
if isinstance(args[0], str): | |
Tj_sep = args[0] | |
if len(args) >= 2: | |
if isinstance(args[1], str): | |
TJ_sep = args[1] | |
else: | |
raise TypeError(f"Invalid positional parameter {args[1]}") | |
if len(args) >= 3: | |
if isinstance(args[2], (tuple, int)): | |
orientations = args[2] | |
else: | |
raise TypeError(f"Invalid positional parameter {args[2]}") | |
if len(args) >= 4: | |
if isinstance(args[3], (float, int)): | |
space_width = args[3] | |
else: | |
raise TypeError(f"Invalid positional parameter {args[3]}") | |
elif isinstance(args[0], (tuple, int)): | |
orientations = args[0] | |
if len(args) >= 2: | |
if isinstance(args[1], (float, int)): | |
space_width = args[1] | |
else: | |
raise TypeError(f"Invalid positional parameter {args[1]}") | |
else: | |
raise TypeError(f"Invalid positional parameter {args[0]}") | |
if Tj_sep is not None or TJ_sep is not None: | |
warnings.warn( | |
"parameters Tj_Sep, TJ_sep depreciated, and will be removed in PyPDF2 4.0.0.", | |
DeprecationWarning, | |
) | |
if isinstance(orientations, int): | |
orientations = (orientations,) | |
return self._extract_text( | |
self, | |
self.pdf, | |
orientations, | |
space_width, | |
PG.CONTENTS, | |
visitor_operand_before, | |
visitor_operand_after, | |
visitor_text, | |
) | |
def extract_xform_text( | |
self, | |
xform: EncodedStreamObject, | |
orientations: Tuple[int, ...] = (0, 90, 270, 360), | |
space_width: float = 200.0, | |
visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None, | |
visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None, | |
visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None, | |
) -> str: | |
""" | |
Extract text from an XObject. | |
Args: | |
space_width: force default space width (if not extracted from font (default 200) | |
Returns: | |
The extracted text | |
""" | |
return self._extract_text( | |
xform, | |
self.pdf, | |
orientations, | |
space_width, | |
None, | |
visitor_operand_before, | |
visitor_operand_after, | |
visitor_text, | |
) | |
def extractText( | |
self, Tj_sep: str = "", TJ_sep: str = "" | |
) -> str: # pragma: no cover | |
""" | |
.. deprecated:: 1.28.0 | |
Use :meth:`extract_text` instead. | |
""" | |
deprecation_with_replacement("extractText", "extract_text", "3.0.0") | |
return self.extract_text() | |
def _get_fonts(self) -> Tuple[Set[str], Set[str]]: | |
""" | |
Get the names of embedded fonts and unembedded fonts. | |
:return: (Set of embedded fonts, set of unembedded fonts) | |
""" | |
obj = self.get_object() | |
assert isinstance(obj, DictionaryObject) | |
fonts, embedded = _get_fonts_walk(cast(DictionaryObject, obj[PG.RESOURCES])) | |
unembedded = fonts - embedded | |
return embedded, unembedded | |
mediabox = _create_rectangle_accessor(PG.MEDIABOX, ()) | |
""" | |
A :class:`RectangleObject<PyPDF2.generic.RectangleObject>`, expressed in default user space units, | |
defining the boundaries of the physical medium on which the page is | |
intended to be displayed or printed. | |
""" | |
def mediaBox(self) -> RectangleObject: # pragma: no cover | |
""" | |
.. deprecated:: 1.28.0 | |
Use :py:attr:`mediabox` instead. | |
""" | |
deprecation_with_replacement("mediaBox", "mediabox", "3.0.0") | |
return self.mediabox | |
def mediaBox(self, value: RectangleObject) -> None: # pragma: no cover | |
""" | |
.. deprecated:: 1.28.0 | |
Use :py:attr:`mediabox` instead. | |
""" | |
deprecation_with_replacement("mediaBox", "mediabox", "3.0.0") | |
self.mediabox = value | |
cropbox = _create_rectangle_accessor("/CropBox", (PG.MEDIABOX,)) | |
""" | |
A :class:`RectangleObject<PyPDF2.generic.RectangleObject>`, expressed in default user space units, | |
defining the visible region of default user space. When the page is | |
displayed or printed, its contents are to be clipped (cropped) to this | |
rectangle and then imposed on the output medium in some | |
implementation-defined manner. Default value: same as :attr:`mediabox<mediabox>`. | |
""" | |
def cropBox(self) -> RectangleObject: # pragma: no cover | |
""" | |
.. deprecated:: 1.28.0 | |
Use :py:attr:`cropbox` instead. | |
""" | |
deprecation_with_replacement("cropBox", "cropbox", "3.0.0") | |
return self.cropbox | |
def cropBox(self, value: RectangleObject) -> None: # pragma: no cover | |
deprecation_with_replacement("cropBox", "cropbox", "3.0.0") | |
self.cropbox = value | |
bleedbox = _create_rectangle_accessor("/BleedBox", ("/CropBox", PG.MEDIABOX)) | |
""" | |
A :class:`RectangleObject<PyPDF2.generic.RectangleObject>`, expressed in default user space units, | |
defining the region to which the contents of the page should be clipped | |
when output in a production environment. | |
""" | |
def bleedBox(self) -> RectangleObject: # pragma: no cover | |
""" | |
.. deprecated:: 1.28.0 | |
Use :py:attr:`bleedbox` instead. | |
""" | |
deprecation_with_replacement("bleedBox", "bleedbox", "3.0.0") | |
return self.bleedbox | |
def bleedBox(self, value: RectangleObject) -> None: # pragma: no cover | |
deprecation_with_replacement("bleedBox", "bleedbox", "3.0.0") | |
self.bleedbox = value | |
trimbox = _create_rectangle_accessor("/TrimBox", ("/CropBox", PG.MEDIABOX)) | |
""" | |
A :class:`RectangleObject<PyPDF2.generic.RectangleObject>`, expressed in default user space units, | |
defining the intended dimensions of the finished page after trimming. | |
""" | |
def trimBox(self) -> RectangleObject: # pragma: no cover | |
""" | |
.. deprecated:: 1.28.0 | |
Use :py:attr:`trimbox` instead. | |
""" | |
deprecation_with_replacement("trimBox", "trimbox", "3.0.0") | |
return self.trimbox | |
def trimBox(self, value: RectangleObject) -> None: # pragma: no cover | |
deprecation_with_replacement("trimBox", "trimbox", "3.0.0") | |
self.trimbox = value | |
artbox = _create_rectangle_accessor("/ArtBox", ("/CropBox", PG.MEDIABOX)) | |
""" | |
A :class:`RectangleObject<PyPDF2.generic.RectangleObject>`, expressed in default user space units, | |
defining the extent of the page's meaningful content as intended by the | |
page's creator. | |
""" | |
def artBox(self) -> RectangleObject: # pragma: no cover | |
""" | |
.. deprecated:: 1.28.0 | |
Use :py:attr:`artbox` instead. | |
""" | |
deprecation_with_replacement("artBox", "artbox", "3.0.0") | |
return self.artbox | |
def artBox(self, value: RectangleObject) -> None: # pragma: no cover | |
deprecation_with_replacement("artBox", "artbox", "3.0.0") | |
self.artbox = value | |
def annotations(self) -> Optional[ArrayObject]: | |
if "/Annots" not in self: | |
return None | |
else: | |
return cast(ArrayObject, self["/Annots"]) | |
def annotations(self, value: Optional[ArrayObject]) -> None: | |
""" | |
Set the annotations array of the page. | |
Typically you don't want to set this value, but append to it. | |
If you append to it, don't forget to add the object first to the writer | |
and only add the indirect object. | |
""" | |
if value is None: | |
del self[NameObject("/Annots")] | |
else: | |
self[NameObject("/Annots")] = value | |
class _VirtualList: | |
def __init__( | |
self, | |
length_function: Callable[[], int], | |
get_function: Callable[[int], PageObject], | |
) -> None: | |
self.length_function = length_function | |
self.get_function = get_function | |
self.current = -1 | |
def __len__(self) -> int: | |
return self.length_function() | |
def __getitem__(self, index: int) -> PageObject: | |
if isinstance(index, slice): | |
indices = range(*index.indices(len(self))) | |
cls = type(self) | |
return cls(indices.__len__, lambda idx: self[indices[idx]]) # type: ignore | |
if not isinstance(index, int): | |
raise TypeError("sequence indices must be integers") | |
len_self = len(self) | |
if index < 0: | |
# support negative indexes | |
index = len_self + index | |
if index < 0 or index >= len_self: | |
raise IndexError("sequence index out of range") | |
return self.get_function(index) | |
def __iter__(self) -> Iterator[PageObject]: | |
for i in range(len(self)): | |
yield self[i] | |
def _get_fonts_walk( | |
obj: DictionaryObject, | |
fnt: Optional[Set[str]] = None, | |
emb: Optional[Set[str]] = None, | |
) -> Tuple[Set[str], Set[str]]: | |
""" | |
If there is a key called 'BaseFont', that is a font that is used in the document. | |
If there is a key called 'FontName' and another key in the same dictionary object | |
that is called 'FontFilex' (where x is null, 2, or 3), then that fontname is | |
embedded. | |
We create and add to two sets, fnt = fonts used and emb = fonts embedded. | |
""" | |
if fnt is None: | |
fnt = set() | |
if emb is None: | |
emb = set() | |
if not hasattr(obj, "keys"): | |
return set(), set() | |
fontkeys = ("/FontFile", "/FontFile2", "/FontFile3") | |
if "/BaseFont" in obj: | |
fnt.add(cast(str, obj["/BaseFont"])) | |
if "/FontName" in obj: | |
if [x for x in fontkeys if x in obj]: # test to see if there is FontFile | |
emb.add(cast(str, obj["/FontName"])) | |
for key in obj.keys(): | |
_get_fonts_walk(cast(DictionaryObject, obj[key]), fnt, emb) | |
return fnt, emb # return the sets for each page | |