rosa0003 committed on
Commit
f867c73
·
verified ·
1 Parent(s): 066bc00

Upload 5 files

Browse files
Files changed (5) hide show
  1. .gitignore +101 -0
  2. LICENSE +21 -0
  3. setup.py +37 -0
  4. toc2mindmap/convert.py +92 -0
  5. toc2mindmap/dumppdf.py +280 -0
.gitignore ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ env/
12
+ build/
13
+ develop-eggs/
14
+ dist/
15
+ downloads/
16
+ eggs/
17
+ .eggs/
18
+ lib/
19
+ lib64/
20
+ parts/
21
+ sdist/
22
+ var/
23
+ wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+
28
+ # PyInstaller
29
+ # Usually these files are written by a python script from a template
30
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
31
+ *.manifest
32
+ *.spec
33
+
34
+ # Installer logs
35
+ pip-log.txt
36
+ pip-delete-this-directory.txt
37
+
38
+ # Unit test / coverage reports
39
+ htmlcov/
40
+ .tox/
41
+ .coverage
42
+ .coverage.*
43
+ .cache
44
+ nosetests.xml
45
+ coverage.xml
46
+ *.cover
47
+ .hypothesis/
48
+
49
+ # Translations
50
+ *.mo
51
+ *.pot
52
+
53
+ # Django stuff:
54
+ *.log
55
+ local_settings.py
56
+
57
+ # Flask stuff:
58
+ instance/
59
+ .webassets-cache
60
+
61
+ # Scrapy stuff:
62
+ .scrapy
63
+
64
+ # Sphinx documentation
65
+ docs/_build/
66
+
67
+ # PyBuilder
68
+ target/
69
+
70
+ # Jupyter Notebook
71
+ .ipynb_checkpoints
72
+
73
+ # pyenv
74
+ .python-version
75
+
76
+ # celery beat schedule file
77
+ celerybeat-schedule
78
+
79
+ # SageMath parsed files
80
+ *.sage.py
81
+
82
+ # dotenv
83
+ .env
84
+
85
+ # virtualenv
86
+ .venv
87
+ venv/
88
+ ENV/
89
+
90
+ # Spyder project settings
91
+ .spyderproject
92
+ .spyproject
93
+
94
+ # Rope project settings
95
+ .ropeproject
96
+
97
+ # mkdocs documentation
98
+ /site
99
+
100
+ # mypy
101
+ .mypy_cache/
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2018 Lucas
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
setup.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ '''
3
+ PdfToc2MindMap setup
4
+
5
+ Warnings:
6
+ to make pip respect the links, you have to use
7
+ `--process-dependency-links` switch. So e.g.:
8
+ `pip install --process-dependency-links <repo_path_or_url>`
9
+ '''
10
+
11
+ import setuptools
12
+
13
+ # see http://setuptools.readthedocs.io/en/latest/setuptools.html
14
+ # and https://packaging.python.org/tutorials/distributing-packages/
15
# Package metadata for PdfToc2MindMap. Keyword arguments are grouped by
# purpose; their order has no effect on the resulting distribution.
setuptools.setup(
    # --- identity ---------------------------------------------------------
    name='PdfToc2MindMap',
    version='0.1.0a1',
    description='PdfToc2MindMap: create mindmaps from table of contents in a PDF file',
    long_description='PdfToc2MindMap: create mindmaps from table of contents in a PDF file',
    url='https://github.com/mananatee/PdfToc2MindMap',
    license='MIT',
    keywords=('pdf', 'mindmap', 'visualization'),

    # --- authorship -------------------------------------------------------
    author='Lucas Koelman',
    author_email='[email protected]',

    # --- contents ---------------------------------------------------------
    # Packages are discovered automatically; anything matching 'tests*' is
    # left out of the distribution.
    packages=setuptools.find_packages(exclude=('tests*',)),
    package_data={},
    entry_points={},

    # --- dependencies -----------------------------------------------------
    # NOTE(review): the XMind SDK is only reachable through dependency_links
    # and is not named in install_requires -- confirm whether users have to
    # install it separately.
    install_requires=['pdfminer.six'],
    dependency_links=[
        'git+https://github.com/andrii-z4i/xmind-sdk-python.git@master-0',
    ],

    # --- classification ---------------------------------------------------
    classifiers=[
        'Development Status :: 3 - Alpha',
        'Environment :: Console',
        'License :: OSI Approved :: MIT License',
        'Topic :: Utilities',
        'Programming Language :: Python :: 3',
    ],
)
toc2mindmap/convert.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #-*- coding: utf-8 -*-
2
+ """
3
+ Module to convert table of contents of PDF file to mindmap format.
4
+
5
+ @author Lucas Koelman
6
+ @date 20/02/2018
7
+
8
+ @see https://github.com/pdfminer/pdfminer.six/blob/master/tools/dumppdf.py
9
+ @see https://github.com/xmindltd/xmind-sdk-python/blob/master/example.py
10
+ """
11
+
12
+ # standard library
13
+ import sys, os, re
14
+ import xml.etree.ElementTree as etree
15
+ try:
16
+ import cStringIO as io # Python 2
17
+ except (ImportError, ModuleNotFoundError):
18
+ import io # Python 3
19
+ import dumppdf
20
+
21
+ import xmind
22
+ from xmind.core.topic import TopicElement
23
+
24
+
25
def toc_to_xmind(outfp, pdf_filename):
    """
    Convert table of contents of given PDF file to XMind document.

    The outline is first dumped to XML with ``dumppdf.dumpoutline``. Every
    XML element carrying a ``level`` attribute then becomes one topic, nested
    below the closest preceding entry that has a smaller level. The sheet is
    titled after the PDF's file name and the root topic is called "Contents".

    @param outfp         path of the XMind workbook to load/create and save
    @param pdf_filename  path of the PDF file whose outline is converted
    """
    out_str = io.StringIO()
    try:
        dumppdf.dumpoutline(out_str, pdf_filename, [], set())
        toc_xml = out_str.getvalue()
    finally:
        # Release the buffer even when dumping the outline fails.
        out_str.close()

    # dumpoutline writes nothing at all for a PDF without an outline, and an
    # empty string is not parseable XML: treat that case as "no entries" so
    # the workbook is still produced with just its root topic.
    outline_nodes = []
    if toc_xml.strip():
        outline_nodes = etree.fromstring(toc_xml).iter()  # depth-first

    # load an existing file or create a new workbook if nothing is found
    xwb = xmind.load(outfp)

    # Set up the sheet and the root of the topic tree
    sheet = xwb.getPrimarySheet()
    sheet.setTitle(os.path.split(pdf_filename)[-1])
    root_topic = sheet.getRootTopic()
    root_topic.setTitle("Contents")

    # Stack of (level, topic) pairs describing the path from the root to the
    # most recently added topic. Remembering each entry's own level keeps the
    # nesting correct even when the outline skips levels (e.g. 1 -> 3 -> 2)
    # or does not start at level 1; the root sits at level 0 and never pops.
    topic_stack = [(0, root_topic)]
    for node in outline_nodes:
        if 'level' not in node.attrib:
            continue  # irrelevant node
        node_level = int(node.attrib['level'])

        # Create topic for this node. Titles may arrive wrapped like b'...'
        # (a one-letter prefix plus single quotes); unwrap those.
        topic = TopicElement(ownerWorkbook=xwb)
        title = re.sub(r"^[a-zA-Z]'(.*)'$", r'\1', node.attrib['title'])
        topic.setTitle(title)

        # Climb back up until the top of the stack is a valid parent, i.e.
        # an entry with a strictly smaller level than this node.
        while len(topic_stack) > 1 and topic_stack[-1][0] >= node_level:
            topic_stack.pop()
        topic_stack[-1][1].addSubTopic(topic)
        topic_stack.append((node_level, topic))

    xmind.save(xwb)
68
+
69
+
70
def main(argv):
    """
    Run conversion tool from command line.

    @param argv  full argument vector, program name first
    @return      100 after printing the usage message when the options are
                 invalid, the PDF file is missing or '-o' is missing;
                 None after a successful conversion
    """
    import getopt

    def usage():
        print('usage: %s -o outfile.xmind pdf_file.pdf' % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'o:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    # '-o' is mandatory: show the usage text instead of dying with a KeyError
    # when it was not given (or was given an empty value).
    outfp = dict(opts).get('-o')
    if not outfp:
        return usage()

    pdf_filename = args[0]
    toc_to_xmind(outfp, pdf_filename)


if __name__ == '__main__':
    sys.exit(main(sys.argv))
toc2mindmap/dumppdf.py ADDED
@@ -0,0 +1,280 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ This is the dumppdf script copied from
3
+ https://github.com/pdfminer/pdfminer.six/tree/master/tools.
4
+
5
+ The script was copied here since it is not part of an exposed package.
6
+ """
7
+
8
+ #
9
+ # dumppdf.py - dump pdf contents in XML format.
10
+ #
11
+ # usage: dumppdf.py [options] [files ...]
12
+ # options:
13
+ # -i objid : object id
14
+ #
15
+ import sys, os.path, re, logging
16
+ from pdfminer.psparser import PSKeyword, PSLiteral, LIT
17
+ from pdfminer.pdfparser import PDFParser
18
+ from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
19
+ from pdfminer.pdftypes import PDFObjectNotFound, PDFValueError
20
+ from pdfminer.pdftypes import PDFStream, PDFObjRef, resolve1, stream_value
21
+ from pdfminer.pdfpage import PDFPage
22
+ from pdfminer.utils import isnumber
23
+
24
+
25
# Characters that are never written literally: control characters, the XML
# markup characters & < >, parentheses, both quote characters, the backslash
# and everything from DEL (0x7F) up to 0xFF.
ESC_PAT = re.compile(r'[\000-\037&<>()"\042\047\134\177-\377]')


def e(s):
    """
    Escape *s* for embedding in the XML output.

    Every character matched by ESC_PAT is replaced by its decimal numeric
    character reference (``&#NN;``). On Python 3 a byte string is decoded as
    latin-1 first, so each byte maps to the code point of the same value.
    """
    text = s
    # `bytes is not str` holds on Python 3 only; on Python 2 the value is
    # passed to the pattern unchanged.
    if bytes is not str and isinstance(text, bytes):
        text = text.decode('latin-1')

    def _char_ref(match):
        return '&#%d;' % ord(match.group(0))

    return ESC_PAT.sub(_char_ref, text)
30
+
31
+ import six # Python 2+3 compatibility
32
+
33
+
34
+ # dumpxml
35
def dumpxml(out, obj, codec=None):
    """
    Write an XML rendering of the PDF object *obj* to *out*.

    Handles None, dicts, lists, strings/byte strings, PDFStream, PDFObjRef,
    PSKeyword, PSLiteral and numbers; anything else raises TypeError(obj).

    *codec* only affects PDFStream objects passed in directly (it is not
    forwarded to nested values):
      'raw'    -> write the stream's raw data as-is
      'binary' -> write the stream's decoded data as-is
      'text'   -> XML with <props> plus an escaped <data> element
      other    -> XML with <props> only
    """
    if obj is None:
        out.write('<null />')

    elif isinstance(obj, dict):
        out.write('<dict size="%d">\n' % len(obj))
        for (key, value) in six.iteritems(obj):
            out.write('<key>%s</key>\n' % key)
            out.write('<value>')
            dumpxml(out, value)
            out.write('</value>\n')
        out.write('</dict>')

    elif isinstance(obj, list):
        out.write('<list size="%d">\n' % len(obj))
        for item in obj:
            dumpxml(out, item)
            out.write('\n')
        out.write('</list>')

    elif isinstance(obj, (six.string_types, six.binary_type)):
        out.write('<string size="%d">%s</string>' % (len(obj), e(obj)))

    elif isinstance(obj, PDFStream):
        if codec == 'raw':
            out.write(obj.get_rawdata())
        elif codec == 'binary':
            out.write(obj.get_data())
        else:
            out.write('<stream>\n<props>\n')
            dumpxml(out, obj.attrs)
            out.write('\n</props>\n')
            if codec == 'text':
                payload = obj.get_data()
                out.write('<data size="%d">%s</data>\n'
                          % (len(payload), e(payload)))
            out.write('</stream>')

    elif isinstance(obj, PDFObjRef):
        out.write('<ref id="%d" />' % obj.objid)

    elif isinstance(obj, PSKeyword):
        out.write('<keyword>%s</keyword>' % obj.name)

    elif isinstance(obj, PSLiteral):
        out.write('<literal>%s</literal>' % obj.name)

    elif isnumber(obj):
        out.write('<number>%s</number>' % obj)

    else:
        raise TypeError(obj)
94
+
95
+ # dumptrailers
96
def dumptrailers(out, doc):
    """
    Write the trailer of every entry in ``doc.xrefs`` to *out*.

    Each trailer is rendered with dumpxml and wrapped in its own
    <trailer> element, followed by a blank line.
    """
    for section in doc.xrefs:
        out.write('<trailer>\n')
        dumpxml(out, section.trailer)
        out.write('\n</trailer>\n\n')
102
+
103
+ # dumpallobjs
104
def dumpallobjs(out, doc, codec=None):
    """
    Dump every object of *doc*, followed by its trailers, inside <pdf>.

    Object ids are collected from each entry in ``doc.xrefs``; an id that
    appears in more than one entry is dumped only once. Objects that
    resolve to None are skipped. An object that cannot be found is reported
    on stderr and does not abort the dump.

    @param out    writable stream receiving the XML
    @param doc    document providing ``xrefs`` and ``getobj``
    @param codec  passed on to dumpxml for each object
    """
    visited = set()
    out.write('<pdf>')
    for xref in doc.xrefs:
        for objid in xref.get_objids():
            if objid in visited:
                continue
            visited.add(objid)
            try:
                obj = doc.getobj(objid)
                if obj is None:
                    continue
                out.write('<object id="%d">\n' % objid)
                dumpxml(out, obj, codec=codec)
                out.write('\n</object>\n\n')
            except PDFObjectNotFound as err:
                # Written with sys.stderr.write so it works on Python 2 and 3
                # alike (the `print >>stream` statement fails on Python 3).
                # The exception is bound to `err` so that it does not shadow
                # the module-level escape helper `e`.
                sys.stderr.write('not found: %r\n' % (err,))
    dumptrailers(out, doc)
    out.write('</pdf>')
    return
122
+
123
+ # dumpoutline
124
def dumpoutline(outfp, fname, objids, pagenos, password='',
                dumpall=False, codec=None, extractdir=None):
    """
    Write the outline (table of contents) of a PDF file to *outfp* as XML.

    Output is an <outlines> element holding one <outline level=".."
    title=".."> element per entry, each optionally containing a <dest> dump
    and a <pageno> (1-based position of the target page in the document).
    Nothing is written when the document has no outline.

    @param outfp     writable text stream receiving the XML
    @param fname     path of the PDF file
    @param password  password used to open the document
    @param objids, pagenos, dumpall, codec, extractdir
                     unused here; accepted so that this function has the
                     same signature as dumppdf and extractembedded
    """
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        try:
            doc = PDFDocument(parser, password)
            # Map page object id -> 1-based page number
            pages = dict((page.pageid, pageno) for (pageno, page)
                         in enumerate(PDFPage.create_pages(doc), 1))

            def resolve_dest(dest):
                # Named destinations come as (byte) strings or literals and
                # are looked up in the document; the result may in turn be a
                # dict with the real destination under 'D', or a reference.
                if isinstance(dest, (str, bytes)):
                    dest = resolve1(doc.get_dest(dest))
                elif isinstance(dest, PSLiteral):
                    dest = resolve1(doc.get_dest(dest.name))
                if isinstance(dest, dict):
                    dest = dest['D']
                if isinstance(dest, PDFObjRef):
                    dest = dest.resolve()
                return dest

            try:
                outlines = doc.get_outlines()
                outfp.write('<outlines>\n')
                for (level, title, dest, a, se) in outlines:
                    pageno = None
                    if dest:
                        dest = resolve_dest(dest)
                        pageno = pages[dest[0].objid]
                    elif a:
                        action = a
                        if isinstance(action, dict):
                            subtype = action.get('S')
                            if subtype and repr(subtype) == '/\'GoTo\'' and action.get('D'):
                                dest = resolve_dest(action['D'])
                                pageno = pages[dest[0].objid]
                    s = e(title)
                    if six.PY2:
                        # Python 2 text streams expect encoded byte strings.
                        # On Python 3 the title must stay a str: formatting
                        # bytes with %s would emit its b'...' repr instead.
                        s = s.encode('utf-8', 'xmlcharrefreplace')
                    outfp.write('<outline level="%r" title="%s">\n' % (level, s))
                    if dest is not None:
                        outfp.write('<dest>')
                        dumpxml(outfp, dest)
                        outfp.write('</dest>\n')
                    if pageno is not None:
                        outfp.write('<pageno>%r</pageno>\n' % pageno)
                    outfp.write('</outline>\n')
                outfp.write('</outlines>\n')
            except PDFNoOutlines:
                # A document without an outline is not an error.
                pass
        finally:
            # Close the parser on failures too; `with` closes the file.
            parser.close()
    return
171
+
172
+ # extractembedded
173
LITERAL_FILESPEC = LIT('Filespec')
LITERAL_EMBEDDEDFILE = LIT('EmbeddedFile')


def extractembedded(outfp, fname, objids, pagenos, password='',
                    dumpall=False, codec=None, extractdir=None):
    """
    Extract every embedded file of a PDF into the directory *extractdir*.

    All objects listed in the document's xrefs are scanned; each dict whose
    'Type' is the Filespec literal is extracted. Existing files are never
    overwritten.

    @param fname       path of the PDF file
    @param password    password used to open the document
    @param extractdir  directory the embedded files are written to
    @param outfp, objids, pagenos, dumpall, codec
                       unused here; accepted so that this function has the
                       same signature as dumppdf and dumpoutline
    @raise PDFValueError  if a file specification does not reference an
                          EmbeddedFile stream
    @raise IOError        if the target file already exists
    """
    def extract1(obj):
        # 'UF' is optional: fall back to 'F' instead of raising KeyError.
        filename = os.path.basename(obj.get('UF') or obj['F'])
        if bytes is not str and isinstance(filename, bytes):
            # Python 3: a bytes name cannot be joined to a str directory.
            # NOTE(review): UTF-8 is assumed here; PDF strings may use other
            # encodings -- confirm against real files.
            filename = filename.decode('utf-8', 'replace')
        fileref = obj['EF']['F']
        fileobj = doc.getobj(fileref.objid)
        if not isinstance(fileobj, PDFStream):
            raise PDFValueError(
                'unable to process PDF: reference for %r is not a PDFStream' %
                (filename))
        if fileobj.get('Type') is not LITERAL_EMBEDDEDFILE:
            raise PDFValueError(
                'unable to process PDF: reference for %r is not an EmbeddedFile' %
                (filename))
        path = os.path.join(extractdir, filename)
        if os.path.exists(path):
            raise IOError('file exists: %r' % path)
        # sys.stderr.write / open() work on Python 2 and 3 alike, unlike the
        # `print >>stream` statement and the `file()` builtin.
        sys.stderr.write('extracting: %r\n' % (path,))
        with open(path, 'wb') as out:
            out.write(fileobj.get_data())
        return

    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser, password)
        for xref in doc.xrefs:
            for objid in xref.get_objids():
                obj = doc.getobj(objid)
                if isinstance(obj, dict) and obj.get('Type') is LITERAL_FILESPEC:
                    extract1(obj)
    return
208
+
209
+ # dumppdf
210
def dumppdf(outfp, fname, objids, pagenos, password='',
            dumpall=False, codec=None, extractdir=None):
    """
    Dump selected parts of a PDF file to *outfp*.

    @param outfp       writable stream receiving the dump
    @param fname       path of the PDF file
    @param objids      object ids to dump
    @param pagenos     zero-based page indices to dump; with a codec the
                       page's content streams are dumped, otherwise the
                       page attributes
    @param password    password used to open the document
    @param dumpall     dump every object of the document
    @param codec       stream rendering mode passed on to dumpxml
    @param extractdir  unused here; accepted so that this function has the
                       same signature as dumpoutline and extractembedded

    When neither objids, pagenos nor dumpall select anything, only the
    trailers are dumped. A final newline is written unless codec is 'raw'
    or 'binary'.
    """
    # `with` makes sure the input file is closed even if dumping fails.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser, password)
        if objids:
            for objid in objids:
                obj = doc.getobj(objid)
                dumpxml(outfp, obj, codec=codec)
        if pagenos:
            for (pageno, page) in enumerate(PDFPage.create_pages(doc)):
                if pageno in pagenos:
                    if codec:
                        for obj in page.contents:
                            obj = stream_value(obj)
                            dumpxml(outfp, obj, codec=codec)
                    else:
                        dumpxml(outfp, page.attrs)
        if dumpall:
            dumpallobjs(outfp, doc, codec=codec)
        if (not objids) and (not pagenos) and (not dumpall):
            dumptrailers(outfp, doc)
    if codec not in ('raw', 'binary'):
        outfp.write('\n')
    return
236
+
237
+
238
+ # main
239
def main(argv):
    """
    Command-line entry point of the dump tool.

    Options: -d debug logging, -o output file, -i object ids (comma
    separated), -p page numbers (comma separated, 1-based on the command
    line), -P password, -a dump all objects, -r/-b/-t raw/binary/text stream
    codec, -T dump the outline, -E directory: extract embedded files.

    @param argv  full argument vector, program name first
    @return      100 after printing the usage message for invalid options
                 or when no input file is given; None otherwise
    """
    import getopt

    def usage():
        print ('usage: %s [-d] [-a] [-p pageid] [-P password] [-r|-b|-t] [-T] [-E directory] [-i objid] file ...' % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dap:P:rbtTE:i:o:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    objids = []
    pagenos = set()
    codec = None
    password = ''
    dumpall = False
    proc = dumppdf
    outfp = sys.stdout
    extractdir = None
    for (k, v) in opts:
        if k == '-d':
            logging.getLogger().setLevel(logging.DEBUG)
        elif k == '-o':
            outfp = open(v, 'w')
        elif k == '-i':
            objids.extend(int(x) for x in v.split(','))
        elif k == '-p':
            # Convert to the zero-based indices used by dumppdf
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-P':
            password = v
        elif k == '-a':
            dumpall = True
        elif k == '-r':
            codec = 'raw'
        elif k == '-b':
            codec = 'binary'
        elif k == '-t':
            codec = 'text'
        elif k == '-T':
            proc = dumpoutline
        elif k == '-E':
            extractdir = v
            proc = extractembedded

    try:
        if six.PY2 and sys.stdin.encoding:
            password = password.decode(sys.stdin.encoding)

        for fname in args:
            proc(outfp, fname, objids, pagenos, password=password,
                 dumpall=dumpall, codec=codec, extractdir=extractdir)
    finally:
        # Close a file opened through -o even when a dump fails, but leave
        # sys.stdout open: it belongs to the caller, not to this function.
        if outfp is not sys.stdout:
            outfp.close()


if __name__ == '__main__':
    sys.exit(main(sys.argv))