File size: 6,272 Bytes
35b22df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
import codecs
from typing import Dict, List, Tuple, Union

from .._codecs import _pdfdoc_encoding
from .._utils import StreamType, b_, logger_warning, read_non_whitespace
from ..errors import STREAM_TRUNCATED_PREMATURELY, PdfStreamError
from ._base import ByteStringObject, TextStringObject


def hex_to_rgb(value: str) -> Tuple[float, float, float]:
    return tuple(int(value.lstrip("#")[i : i + 2], 16) / 255.0 for i in (0, 2, 4))  # type: ignore


def read_hex_string_from_stream(
    stream: StreamType,
    forced_encoding: Union[None, str, List[str], Dict[int, str]] = None,
) -> Union["TextStringObject", "ByteStringObject"]:
    stream.read(1)
    txt = ""
    x = b""
    while True:
        tok = read_non_whitespace(stream)
        if not tok:
            raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
        if tok == b">":
            break
        x += tok
        if len(x) == 2:
            txt += chr(int(x, base=16))
            x = b""
    if len(x) == 1:
        x += b"0"
    if len(x) == 2:
        txt += chr(int(x, base=16))
    return create_string_object(b_(txt), forced_encoding)


def read_string_from_stream(
    stream: StreamType,
    forced_encoding: Union[None, str, List[str], Dict[int, str]] = None,
) -> Union["TextStringObject", "ByteStringObject"]:
    tok = stream.read(1)
    parens = 1
    txt = []
    while True:
        tok = stream.read(1)
        if not tok:
            raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
        if tok == b"(":
            parens += 1
        elif tok == b")":
            parens -= 1
            if parens == 0:
                break
        elif tok == b"\\":
            tok = stream.read(1)
            escape_dict = {
                b"n": b"\n",
                b"r": b"\r",
                b"t": b"\t",
                b"b": b"\b",
                b"f": b"\f",
                b"c": rb"\c",
                b"(": b"(",
                b")": b")",
                b"/": b"/",
                b"\\": b"\\",
                b" ": b" ",
                b"%": b"%",
                b"<": b"<",
                b">": b">",
                b"[": b"[",
                b"]": b"]",
                b"#": b"#",
                b"_": b"_",
                b"&": b"&",
                b"$": b"$",
            }
            try:
                tok = escape_dict[tok]
            except KeyError:
                if b"0" <= tok and tok <= b"7":
                    # "The number ddd may consist of one, two, or three
                    # octal digits; high-order overflow shall be ignored.
                    # Three octal digits shall be used, with leading zeros
                    # as needed, if the next character of the string is also
                    # a digit." (PDF reference 7.3.4.2, p 16)
                    for _ in range(2):
                        ntok = stream.read(1)
                        if b"0" <= ntok and ntok <= b"7":
                            tok += ntok
                        else:
                            stream.seek(-1, 1)  # ntok has to be analysed
                            break
                    tok = b_(chr(int(tok, base=8)))
                elif tok in b"\n\r":
                    # This case is  hit when a backslash followed by a line
                    # break occurs.  If it's a multi-char EOL, consume the
                    # second character:
                    tok = stream.read(1)
                    if tok not in b"\n\r":
                        stream.seek(-1, 1)
                    # Then don't add anything to the actual string, since this
                    # line break was escaped:
                    tok = b""
                else:
                    msg = rf"Unexpected escaped string: {tok.decode('utf8')}"
                    logger_warning(msg, __name__)
        txt.append(tok)
    return create_string_object(b"".join(txt), forced_encoding)


def create_string_object(
    string: Union[str, bytes],
    forced_encoding: Union[None, str, List[str], Dict[int, str]] = None,
) -> Union[TextStringObject, ByteStringObject]:
    """
    Create a ByteStringObject or a TextStringObject from a string to represent the string.

    :param Union[str, bytes] string: A string

    :raises TypeError: If string is not of type str or bytes.
    """
    if isinstance(string, str):
        return TextStringObject(string)
    elif isinstance(string, bytes):
        if isinstance(forced_encoding, (list, dict)):
            out = ""
            for x in string:
                try:
                    out += forced_encoding[x]
                except Exception:
                    out += bytes((x,)).decode("charmap")
            return TextStringObject(out)
        elif isinstance(forced_encoding, str):
            if forced_encoding == "bytes":
                return ByteStringObject(string)
            return TextStringObject(string.decode(forced_encoding))
        else:
            try:
                if string.startswith(codecs.BOM_UTF16_BE):
                    retval = TextStringObject(string.decode("utf-16"))
                    retval.autodetect_utf16 = True
                    return retval
                else:
                    # This is probably a big performance hit here, but we need to
                    # convert string objects into the text/unicode-aware version if
                    # possible... and the only way to check if that's possible is
                    # to try.  Some strings are strings, some are just byte arrays.
                    retval = TextStringObject(decode_pdfdocencoding(string))
                    retval.autodetect_pdfdocencoding = True
                    return retval
            except UnicodeDecodeError:
                return ByteStringObject(string)
    else:
        raise TypeError("create_string_object should have str or unicode arg")


def decode_pdfdocencoding(byte_array: bytes) -> str:
    retval = ""
    for b in byte_array:
        c = _pdfdoc_encoding[b]
        if c == "\u0000":
            raise UnicodeDecodeError(
                "pdfdocencoding",
                bytearray(b),
                -1,
                -1,
                "does not exist in translation table",
            )
        retval += c
    return retval