lamhieu committed
Commit fbf0154 · 0 Parent(s)

chore: initialize the project
.editorconfig ADDED
@@ -0,0 +1,14 @@
+ # top-most EditorConfig file
+ root = true
+
+ # Unix-style newlines with a newline ending every file
+ [*]
+ end_of_line = lf
+ insert_final_newline = true
+
+ # Matches all files
+ # Set default charset and indentation
+ [*]
+ charset = utf-8
+ indent_style = space
+ indent_size = 2
.github/workflows/githubhfsync.yaml ADDED
@@ -0,0 +1,26 @@
+ name: Sync Repository to HuggingFace Space
+
+ on:
+   push:
+     branches: [main]
+   workflow_dispatch: # Enable manual trigger
+
+ jobs:
+   sync-to-huggingface:
+     name: Sync code to HuggingFace Space
+     runs-on: ubuntu-latest
+     steps:
+       - name: Checkout repository
+         uses: actions/checkout@v3
+         with:
+           fetch-depth: 0 # Fetch all history for all branches and tags
+           lfs: true # Enable Git LFS support
+
+       - name: Push to HuggingFace Space
+         env:
+           HF_TOKEN: ${{ secrets.HF_TOKEN }}
+         run: |
+           if ! git push https://lamhieu:$HF_TOKEN@huggingface.co/spaces/lamhieu/docsifer main; then
+             echo "Failed to sync with HuggingFace Space"
+             exit 1
+           fi
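The workflow assumes a repository secret named `HF_TOKEN` that holds a Hugging Face access token with write access to the Space. One way to set it, sketched with the GitHub CLI (the token value below is a placeholder):

```bash
# Store the Hugging Face token as a GitHub Actions repository secret
gh secret set HF_TOKEN --body "hf_xxxxxxxxxxxxxxxx"
```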
.gitignore ADDED
@@ -0,0 +1,54 @@
+ # Python bytecode files
+ # Ignore Python bytecode files
+ *.pyc
+
+ # Distribution packages
+ # Ignore distribution packages
+ /dist/*
+
+ # Test and coverage reports
+ # Ignore coverage and test result files
+ .coverage
+ .pytest_cache
+ .mypy_cache
+
+ # Log and temporary files
+ # Ignore log files and temporary files
+ *.log
+ *.tmp
+ tmp
+
+ # System files
+ # Ignore OS generated files
+ .DS_Store
+
+ # IDE and editor specific files
+ # Ignore project-specific files from various IDEs and editors
+ .idea/*
+ .vscode/*
+ .python-version
+
+ # Generated documentation
+ # Ignore generated documentation files
+ /docs/site/*
+
+ # Virtual environments
+ # Ignore virtual environment directories
+ .venv
+
+ # Configuration files
+ # Ignore configuration files
+ .poetry.toml
+ .env.local
+ .env.development
+ .env.test
+ .env.production
+ .env
+
+ # Temporary files and directories for operations
+ # Ignore Ops temporary files and directories
+ .aider*
+
+ # Credentials and secrets
+ # Ignore credentials and secrets files
+ .credentials
.pylintrc ADDED
@@ -0,0 +1,579 @@
+ [MAIN]
+
+ # Specify a configuration file.
+ #rcfile=
+
+ # Python code to execute, usually for sys.path manipulation such as
+ # pygtk.require().
+ #init-hook=
+
+ # Files or directories to be skipped. They should be base names, not
+ # paths.
+ ignore=CVS
+
+ # Add files or directories matching the regex patterns to the ignore-list. The
+ # regex matches against paths and can be in Posix or Windows format.
+ ignore-paths=
+
+ # Files or directories matching the regex patterns are skipped. The regex
+ # matches against base names, not paths.
+ ignore-patterns=^\.#
+
+ # Pickle collected data for later comparisons.
+ persistent=yes
+
+ # List of plugins (as comma separated values of python module names) to load,
+ # usually to register additional checkers.
+ load-plugins=
+     pylint.extensions.check_elif,
+     pylint.extensions.bad_builtin,
+     pylint.extensions.docparams,
+     pylint.extensions.for_any_all,
+     pylint.extensions.set_membership,
+     pylint.extensions.code_style,
+     pylint.extensions.overlapping_exceptions,
+     pylint.extensions.typing,
+     pylint.extensions.redefined_variable_type,
+     pylint.extensions.comparison_placement,
+     pylint.extensions.mccabe,
+
+ # Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the
+ # number of processors available to use.
+ jobs=0
+
+ # When enabled, pylint would attempt to guess common misconfiguration and emit
+ # user-friendly hints instead of false-positive error messages.
+ suggestion-mode=yes
+
+ # Allow loading of arbitrary C extensions. Extensions are imported into the
+ # active Python interpreter and may run arbitrary code.
+ unsafe-load-any-extension=no
+
+ # A comma-separated list of package or module names from where C extensions may
+ # be loaded. Extensions are loaded into the active Python interpreter and may
+ # run arbitrary code
+ extension-pkg-allow-list=
+
+ # Minimum supported python version
+ py-version = 3.7.2
+
+ # Control the amount of potential inferred values when inferring a single
+ # object. This can help the performance when dealing with large functions or
+ # complex, nested conditions.
+ limit-inference-results=100
+
+ # Specify a score threshold to be exceeded before program exits with error.
+ fail-under=10.0
+
+ # Return non-zero exit code if any of these messages/categories are detected,
+ # even if score is above --fail-under value. Syntax same as enable. Messages
+ # specified are enabled, while categories only check already-enabled messages.
+ fail-on=
+
+
+ [MESSAGES CONTROL]
+
+ # Only show warnings with the listed confidence levels. Leave empty to show
+ # all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED
+ # confidence=
+
+ # Enable the message, report, category or checker with the given id(s). You can
+ # either give multiple identifiers separated by comma (,) or put this option
+ # multiple times (only on the command line, not in the configuration file where
+ # it should appear only once). See also the "--disable" option for examples.
+ enable=
+     use-symbolic-message-instead,
+     useless-suppression,
+
+ # Disable the message, report, category or checker with the given id(s). You
+ # can either give multiple identifiers separated by comma (,) or put this
+ # option multiple times (only on the command line, not in the configuration
+ # file where it should appear only once). You can also use "--disable=all" to
+ # disable everything first and then re-enable specific checks. For example, if
+ # you want to run only the similarities checker, you can use "--disable=all
+ # --enable=similarities". If you want to run only the classes checker, but have
+ # no Warning level messages displayed, use "--disable=all --enable=classes
+ # --disable=W"
+
+ disable=
+     attribute-defined-outside-init,
+     invalid-name,
+     missing-docstring,
+     protected-access,
+     too-few-public-methods,
+     # handled by black
+     format,
+     # We anticipate #3512 where it will become optional
+     fixme,
+     cyclic-import,
+     import-error,
+     #
+     unnecessary-pass,
+     unrecognized-option,
+     cell-var-from-loop,
+     no-member,
+     wrong-import-order,
+     raise-missing-from,
+     consider-using-f-string
+
+
+ [REPORTS]
+
+ # Set the output format. Available formats are text, parseable, colorized, msvs
+ # (visual studio) and html. You can also give a reporter class, eg
+ # mypackage.mymodule.MyReporterClass.
+ output-format=text
+
+ # Tells whether to display a full report or only the messages
+ reports=no
+
+ # Python expression which should return a note less than 10 (10 is the highest
+ # note). You have access to the variables 'fatal', 'error', 'warning', 'refactor', 'convention'
+ # and 'info', which contain the number of messages in each category, as
+ # well as 'statement', which is the total number of statements analyzed. This
+ # score is used by the global evaluation report (RP0004).
+ evaluation=max(0, 0 if fatal else 10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10))
+
+ # Template used to display messages. This is a python new-style format string
+ # used to format the message information. See doc for all details
+ #msg-template=
+
+ # Activate the evaluation score.
+ score=yes
+
+
+ [LOGGING]
+
+ # Logging modules to check that the string format arguments are in logging
+ # function parameter format
+ logging-modules=logging
+
+ # The type of string formatting that logging methods do. `old` means using %
+ # formatting, `new` is for `{}` formatting.
+ logging-format-style=old
+
+
+ [MISCELLANEOUS]
+
+ # List of note tags to take in consideration, separated by a comma.
+ notes=FIXME,XXX,TODO
+
+ # Regular expression of note tags to take in consideration.
+ #notes-rgx=
+
+
+ [SIMILARITIES]
+
+ # Minimum number of lines for a similarity.
+ min-similarity-lines=6
+
+ # Ignore comments when computing similarities.
+ ignore-comments=yes
+
+ # Ignore docstrings when computing similarities.
+ ignore-docstrings=yes
+
+ # Ignore imports when computing similarities.
+ ignore-imports=yes
+
+ # Signatures are removed from the similarity computation
+ ignore-signatures=yes
+
+
+ [VARIABLES]
+
+ # Tells whether we should check for unused import in __init__ files.
+ init-import=no
+
+ # A regular expression matching the name of dummy variables (i.e. expectedly
+ # not used).
+ dummy-variables-rgx=_$|dummy
+
+ # List of additional names supposed to be defined in builtins. Remember that
+ # you should avoid defining new builtins when possible.
+ additional-builtins=
+
+ # List of strings which can identify a callback function by name. A callback
+ # name must start or end with one of those strings.
+ callbacks=cb_,_cb
+
+ # Tells whether unused global variables should be treated as a violation.
+ allow-global-unused-variables=yes
+
+ # List of names allowed to shadow builtins
+ allowed-redefined-builtins=
+
+ # Argument names that match this expression will be ignored. Default to name
+ # with leading underscore.
+ ignored-argument-names=_.*
+
+ # List of qualified module names which can have objects that can redefine
+ # builtins.
+ redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io
+
+
+ [FORMAT]
+
+ # Maximum number of characters on a single line.
+ max-line-length=120
+
+ # Regexp for a line that is allowed to be longer than the limit.
+ ignore-long-lines=^\s*(# )?<?https?://\S+>?$
+
+ # Allow the body of an if to be on the same line as the test if there is no
+ # else.
+ single-line-if-stmt=no
+
+ # Allow the body of a class to be on the same line as the declaration if body
+ # contains single statement.
+ single-line-class-stmt=no
+
+ # Maximum number of lines in a module
+ max-module-lines=1000
+
+ # String used as indentation unit. This is usually "    " (4 spaces) or "\t" (1
+ # tab).
+ indent-string='    '
+
+ # Number of spaces of indent required inside a hanging or continued line.
+ indent-after-paren=4
+
+ # Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
+ expected-line-ending-format=
+
+
+ [BASIC]
+
+ # Good variable names which should always be accepted, separated by a comma
+ good-names=i,j,k,ex,Run,_
+
+ # Good variable names regexes, separated by a comma. If names match any regex,
+ # they will always be accepted
+ good-names-rgxs=
+
+ # Bad variable names which should always be refused, separated by a comma
+ bad-names=foo,bar,baz,toto,tutu,tata
+
+ # Bad variable names regexes, separated by a comma. If names match any regex,
+ # they will always be refused
+ bad-names-rgxs=
+
+ # Colon-delimited sets of names that determine each other's naming style when
+ # the name regexes allow several styles.
+ name-group=
+
+ # Include a hint for the correct naming format with invalid-name
+ include-naming-hint=no
+
+ # Naming style matching correct function names.
+ function-naming-style=snake_case
+
+ # Regular expression matching correct function names
+ function-rgx=[a-z_][a-z0-9_]{2,30}$
+
+ # Naming style matching correct variable names.
+ variable-naming-style=snake_case
+
+ # Regular expression matching correct variable names
+ variable-rgx=[a-z_][a-z0-9_]{2,30}$
+
+ # Naming style matching correct constant names.
+ const-naming-style=UPPER_CASE
+
+ # Regular expression matching correct constant names
+ const-rgx=(([A-Z_][A-Z0-9_]*)|(__.*__))$
+
+ # Naming style matching correct attribute names.
+ attr-naming-style=snake_case
+
+ # Regular expression matching correct attribute names
+ attr-rgx=[a-z_][a-z0-9_]{2,}$
+
+ # Naming style matching correct argument names.
+ argument-naming-style=snake_case
+
+ # Regular expression matching correct argument names
+ argument-rgx=[a-z_][a-z0-9_]{2,30}$
+
+ # Naming style matching correct class attribute names.
+ class-attribute-naming-style=any
+
+ # Regular expression matching correct class attribute names
+ class-attribute-rgx=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$
+
+ # Naming style matching correct class constant names.
+ class-const-naming-style=UPPER_CASE
+
+ # Regular expression matching correct class constant names. Overrides class-
+ # const-naming-style.
+ #class-const-rgx=
+
+ # Naming style matching correct inline iteration names.
+ inlinevar-naming-style=any
+
+ # Regular expression matching correct inline iteration names
+ inlinevar-rgx=[A-Za-z_][A-Za-z0-9_]*$
+
+ # Naming style matching correct class names.
+ class-naming-style=PascalCase
+
+ # Regular expression matching correct class names
+ class-rgx=[A-Z_][a-zA-Z0-9]+$
+
+
+ # Naming style matching correct module names.
+ module-naming-style=snake_case
+
+ # Regular expression matching correct module names
+ module-rgx=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$
+
+
+ # Naming style matching correct method names.
+ method-naming-style=snake_case
+
+ # Regular expression matching correct method names
+ method-rgx=[a-z_][a-z0-9_]{2,}$
+
+ # Regular expression which can overwrite the naming style set by typevar-naming-style.
+ #typevar-rgx=
+
+ # Regular expression which should only match function or class names that do
+ # not require a docstring. Use ^(?!__init__$)_ to also check __init__.
+ no-docstring-rgx=__.*__
+
+ # Minimum line length for functions/classes that require docstrings, shorter
+ # ones are exempt.
+ docstring-min-length=-1
+
+ # List of decorators that define properties, such as abc.abstractproperty.
+ property-classes=abc.abstractproperty
+
+
+ [TYPECHECK]
+
+ # Regex pattern to define which classes are considered mixins if ignore-mixin-
+ # members is set to 'yes'
+ mixin-class-rgx=.*MixIn
+
+ # List of module names for which member attributes should not be checked
+ # (useful for modules/projects where namespaces are manipulated during runtime
+ # and thus existing member attributes cannot be deduced by static analysis). It
+ # supports qualified module names, as well as Unix pattern matching.
+ ignored-modules=
+
+ # List of class names for which member attributes should not be checked (useful
+ # for classes with dynamically set attributes). This supports the use of
+ # qualified names.
+ ignored-classes=SQLObject, optparse.Values, thread._local, _thread._local
+
+ # List of members which are set dynamically and missed by pylint inference
+ # system, and so shouldn't trigger E1101 when accessed. Python regular
+ # expressions are accepted.
+ generated-members=REQUEST,acl_users,aq_parent,argparse.Namespace
+
+ # List of decorators that create context managers from functions, such as
+ # contextlib.contextmanager.
+ contextmanager-decorators=contextlib.contextmanager
+
+ # Tells whether to warn about missing members when the owner of the attribute
+ # is inferred to be None.
+ ignore-none=yes
+
+ # This flag controls whether pylint should warn about no-member and similar
+ # checks whenever an opaque object is returned when inferring. The inference
+ # can return multiple potential results while evaluating a Python object, but
+ # some branches might not be evaluated, which results in partial inference. In
+ # that case, it might be useful to still emit no-member and other checks for
+ # the rest of the inferred objects.
+ ignore-on-opaque-inference=yes
+
+ # Show a hint with possible names when a member name was not found. The aspect
+ # of finding the hint is based on edit distance.
+ missing-member-hint=yes
+
+ # The minimum edit distance a name should have in order to be considered a
+ # similar match for a missing member name.
+ missing-member-hint-distance=1
+
+ # The total number of similar names that should be taken in consideration when
+ # showing a hint for a missing member.
+ missing-member-max-choices=1
+
+ [SPELLING]
+
+ # Spelling dictionary name. Available dictionaries: none. To make it work,
+ # install the python-enchant package.
+ spelling-dict=
+
+ # List of comma separated words that should not be checked.
+ spelling-ignore-words=
+
+ # List of comma separated words that should be considered directives if they
+ # appear at the beginning of a comment and should not be checked.
+ spelling-ignore-comment-directives=fmt: on,fmt: off,noqa:,noqa,nosec,isort:skip,mypy:,pragma:,# noinspection
+
+ # A path to a file that contains the private dictionary; one word per line.
+ spelling-private-dict-file=.pyenchant_pylint_custom_dict.txt
+
+ # Tells whether to store unknown words to the indicated private dictionary in
+ # --spelling-private-dict-file option instead of raising a message.
+ spelling-store-unknown-words=no
+
+ # Limits count of emitted suggestions for spelling mistakes.
+ max-spelling-suggestions=2
+
+
+ [DESIGN]
+
+ # Maximum number of arguments for function / method
+ max-args=10
+
+ # Maximum number of locals for function / method body
+ max-locals=25
+
+ # Maximum number of return / yield for function / method body
+ max-returns=11
+
+ # Maximum number of branches for function / method body
+ max-branches=27
+
+ # Maximum number of statements in function / method body
+ max-statements=100
+
+ # Maximum number of parents for a class (see R0901).
+ max-parents=7
+
+ # List of qualified class names to ignore when counting class parents (see R0901).
+ ignored-parents=
+
+ # Maximum number of attributes for a class (see R0902).
+ max-attributes=11
+
+ # Minimum number of public methods for a class (see R0903).
+ min-public-methods=2
+
+ # Maximum number of public methods for a class (see R0904).
+ max-public-methods=25
+
+ # Maximum number of boolean expressions in an if statement (see R0916).
+ max-bool-expr=5
+
+ # List of regular expressions of class ancestor names to
+ # ignore when counting public methods (see R0903).
+ exclude-too-few-public-methods=
+
+ max-complexity=10
+
+ [CLASSES]
+
+ # List of method names used to declare (i.e. assign) instance attributes.
+ defining-attr-methods=__init__,__new__,setUp,__post_init__
+
+ # List of valid names for the first argument in a class method.
+ valid-classmethod-first-arg=cls
+
+ # List of valid names for the first argument in a metaclass class method.
+ valid-metaclass-classmethod-first-arg=mcs
+
+ # List of member names, which should be excluded from the protected access
+ # warning.
+ exclude-protected=_asdict,_fields,_replace,_source,_make
+
+ # Warn about protected attribute access inside special methods
+ check-protected-access-in-special-methods=no
+
+ [IMPORTS]
+
+ # List of modules that can be imported at any level, not just the top level
+ # one.
+ allow-any-import-level=
+
+ # Allow wildcard imports from modules that define __all__.
+ allow-wildcard-with-all=no
+
+ # Analyse import fallback blocks. This can be used to support both Python 2 and
+ # 3 compatible code, which means that the block might have code that exists
+ # only in one or another interpreter, leading to false positives when analysed.
+ analyse-fallback-blocks=no
+
+ # Deprecated modules which should not be used, separated by a comma
+ deprecated-modules=regsub,TERMIOS,Bastion,rexec
+
+ # Create a graph of every (i.e. internal and external) dependencies in the
+ # given file (report RP0402 must not be disabled)
+ import-graph=
+
+ # Create a graph of external dependencies in the given file (report RP0402 must
+ # not be disabled)
+ ext-import-graph=
+
+ # Create a graph of internal dependencies in the given file (report RP0402 must
+ # not be disabled)
+ int-import-graph=
+
+ # Force import order to recognize a module as part of the standard
+ # compatibility libraries.
+ known-standard-library=
+
+ # Force import order to recognize a module as part of a third party library.
+ known-third-party=enchant
+
+ # Couples of modules and preferred modules, separated by a comma.
+ preferred-modules=
+
+
+ [EXCEPTIONS]
+
+ # Exceptions that will emit a warning when being caught. Defaults to
+ # "Exception"
+ overgeneral-exceptions=Exception
+
+
+ [TYPING]
+
+ # Set to ``no`` if the app / library does **NOT** need to support runtime
+ # introspection of type annotations. If you use type annotations
+ # **exclusively** for type checking of an application, you're probably fine.
+ # For libraries, evaluate whether some users want to access the type hints at
+ # runtime first, e.g., through ``typing.get_type_hints``. Applies to Python
+ # versions 3.7 - 3.9
+ runtime-typing = no
+
+
+ [DEPRECATED_BUILTINS]
+
+ # List of builtins function names that should not be used, separated by a comma
+ bad-functions=map,input
+
+
+ [REFACTORING]
+
+ # Maximum number of nested blocks for function / method body
+ max-nested-blocks=5
+
+ # Complete name of functions that never return. When checking for
+ # inconsistent-return-statements if a never returning function is called then
+ # it will be considered as an explicit return statement and no message will be
+ # printed.
+ never-returning-functions=sys.exit,argparse.parse_error
+
+
+ [STRING]
+
+ # This flag controls whether inconsistent-quotes generates a warning when the
+ # character used as a quote delimiter is used inconsistently within a module.
+ check-quote-consistency=no
+
+ # This flag controls whether the implicit-str-concat should generate a warning
+ # on implicit string concatenation in sequences defined over several lines.
+ check-str-concat-over-line-jumps=no
+
+
+ [CODE_STYLE]
+
+ # Max line length for which to still emit suggestions. Used to prevent optional
+ # suggestions which would get split by a code formatter (e.g., black). Will
+ # default to the setting for ``max-line-length``.
+ #max-line-length-suggestions=
+
+ # W0107: unnecessary-pass (disabled above)
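With this configuration in place, pylint picks up `.pylintrc` from the project root automatically; a typical invocation over the package, assuming pylint is installed in the environment (the `pylint.extensions.*` plugins listed above ship with pylint itself):

```bash
# Lint the docsifer package using the repository's .pylintrc
pylint docsifer
```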
Dockerfile ADDED
@@ -0,0 +1,35 @@
+ # Use Python 3.10.9 as the base image for a consistent runtime environment
+ FROM python:3.10.9
+
+ # Add metadata labels
+ LABEL maintainer="[email protected]"
+ LABEL description="Docsifer: Efficient Data Conversion to Markdown using FastAPI and Hugging Face Transformers."
+ LABEL version="1.0"
+
+ # Set up a non-root user for security
+ RUN useradd -m -u 1000 user
+ USER user
+ ENV HOME=/home/user \
+     PATH=/home/user/.local/bin:$PATH
+
+ # Set working directory for all subsequent commands
+ WORKDIR $HOME/app
+
+ # Copy requirements first so the dependency layer is cached by Docker
+ COPY --chown=user requirements.txt .
+
+ # Install Python dependencies
+ # --no-cache-dir reduces image size
+ # --upgrade ensures latest compatible versions
+ RUN pip install --no-cache-dir --upgrade -r requirements.txt
+
+ # Copy the remaining application files
+ COPY --chown=user . .
+
+ # Expose service port
+ EXPOSE 7860
+
+ # Launch FastAPI application using uvicorn server
+ # --host 0.0.0.0: Listen on all network interfaces
+ # --port 7860: Run on port 7860
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2025 Hieu Lam
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
README.md ADDED
@@ -0,0 +1,128 @@
+ ---
+ title: Docsifer
+ emoji: 👻 / 📚
+ colorFrom: green
+ colorTo: indigo
+ sdk: docker
+ app_file: app.py
+ pinned: false
+ ---
+
+ # 📄 Docsifer: Efficient Data Conversion to Markdown
+
+ **Docsifer** is a powerful FastAPI + Gradio service for converting various data formats (PDF, PowerPoint, Word, Excel, Images, Audio, HTML, etc.) to Markdown. It leverages the [MarkItDown](https://github.com/microsoft/markitdown) library and can optionally use LLMs (via OpenAI) for richer extraction (OCR, speech-to-text, etc.).
+
+ ## ✨ Key Features
+
+ - **Comprehensive Format Support**:
+   - **PDF**: Extracts text and structure effectively.
+   - **PowerPoint**: Converts slides into Markdown-friendly content.
+   - **Word**: Processes `.docx` files with precision.
+   - **Excel**: Extracts tabular data as Markdown tables.
+   - **Images**: Reads **EXIF metadata** and applies **OCR** for text extraction.
+   - **Audio**: Retrieves **EXIF metadata** and performs **speech transcription**.
+   - **HTML**: Transforms web pages into Markdown.
+   - **Text-Based Formats**: Handles CSV, JSON, XML with ease.
+   - **ZIP Files**: Iterates over contents for batch processing.
+ - **LLM Integration**: Leverages OpenAI's GPT-4 for enhanced extraction quality and contextual understanding.
+ - **Efficient and Fast**: Optimized for speed while maintaining high accuracy.
+ - **Easy Deployment**: Dockerized for hassle-free setup and scalability.
+ - **Interactive Playground**: Test conversion processes interactively using a **Gradio-powered interface**.
+ - **Usage Analytics**: Tracks token usage and access statistics via Upstash Redis.
+
+ ## 🚀 Use Cases
+
+ - **Knowledge Indexing**: Convert various document formats into Markdown for indexing and search.
+ - **Text Analysis**: Prepare data for semantic analysis and NLP tasks.
+ - **Content Transformation**: Simplify content preparation for blogs, documentation, or databases.
+ - **Metadata Extraction**: Extract meaningful metadata from images and audio for categorization and tagging.
+
+ ## 🛠️ Getting Started
+
+ ### 1. Clone the Repository
+
+ ```bash
+ git clone https://github.com/lh0x00/docsifer.git
+ cd docsifer
+ ```
+
+ ### 2. Build and Run with Docker
+
+ Make sure Docker is installed and running on your machine.
+
+ ```bash
+ docker build -t docsifer .
+ docker run -p 7860:7860 docsifer
+ ```
+
+ The API will now be accessible at `http://localhost:7860`.
+
+ ### 3. Environment Variables
+
+ Set the following environment variables as needed:
+
+ - `OPENAI_API_KEY`: Your OpenAI API key (optional, for LLM-enhanced features).
+ - `OPENAI_BASE_URL`: Custom base endpoint for OpenAI-compatible services (optional).
+ - `REDIS_URL`: Upstash Redis URL (default: `redis://localhost:6379/0`).
+ - `REDIS_TOKEN`: Upstash Redis token (default: `***`).
+
+ You can set these in a `.env` file or pass them directly when running the Docker container:
+
+ ```bash
+ docker run -p 7860:7860 \
+   -e OPENAI_API_KEY=sk-xxxxx \
+   -e OPENAI_BASE_URL=https://api.openai.com/v1 \
+   -e REDIS_URL=redis://your-upstash-url \
+   -e REDIS_TOKEN=your-upstash-token \
+   docsifer
+ ```
+
+ ## 📖 API Overview
+
+ ### Endpoints
+
+ - **`/v1/convert`**: Convert a file to Markdown. Supports both file uploads and file path inputs. Accepts optional OpenAI parameters to enable LLM-based enhancements.
+ - **`/v1/stats`**: Retrieve usage statistics, including access counts and token usage.
+
+ ### Interactive Docs
+
+ - Visit the [Swagger UI](http://localhost:7860/docs) for detailed, interactive documentation.
+ - Explore additional resources with [ReDoc](http://localhost:7860/redoc).
+
+ ## 🔬 Playground
+
+ ### Interactive Conversion
+
+ - Test file conversion directly in the browser using the **Gradio interface**.
+ - Simply visit `http://localhost:7860` after starting the server to access the playground.
+
+ ### Features
+
+ - **File Upload**: Upload a file directly or provide a local file path.
+ - **OpenAI Integration**: Optionally provide OpenAI API details to enhance conversion with LLM capabilities.
+ - **Conversion Result**: View the resulting Markdown output instantly.
+ - **Usage Statistics**: Monitor access and token usage through the Gradio interface.
+
+ ## 🌐 Resources
+
+ - **Documentation**: [Explore full documentation](https://lamhieu-docsifer.hf.space/docs)
+ - **Hugging Face Space**: [Try the live demo](https://huggingface.co/spaces/lh0x00/docsifer)
+ - **GitHub Repository**: [View source code](https://github.com/lh0x00/docsifer)
+
+ ## 💡 Why Docsifer?
+
+ 1. **Versatile and Comprehensive**: Handles a wide range of formats, making it a one-stop solution for content conversion.
+ 2. **AI-Powered**: Uses OpenAI's GPT-4 to enhance extraction accuracy and adapt to complex data structures.
+ 3. **User-Friendly**: Offers intuitive APIs and a built-in interactive interface for experimentation.
+ 4. **Scalable and Efficient**: Optimized for performance with Docker support and asynchronous processing.
+ 5. **Transparent Analytics**: Tracks usage metrics to help monitor and manage service consumption.
+
+ ## 👥 Contributors
+
+ - **lamhieu / lh0x00** – Creator and Maintainer ([GitHub](https://github.com/lh0x00))
+
+ Contributions are welcome! Check out the [contribution guidelines](https://github.com/lh0x00/docsifer/blob/main/CONTRIBUTING.md).
+
+ ## 📜 License
+
+ This project is licensed under the **MIT License**. See the [LICENSE](https://github.com/lh0x00/docsifer/blob/main/LICENSE) file for details.
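For quick reference, a minimal pair of requests against the endpoints listed in the README's API Overview, assuming a local instance on port 7860 (the file path is a placeholder; the optional `openai` form field is omitted here):

```bash
# Convert a document to Markdown (multipart/form-data)
curl -X POST "http://localhost:7860/v1/convert" \
  -F "file=@/path/to/document.pdf" \
  -F "settings={\"cleanup\":true}"

# Fetch usage statistics
curl "http://localhost:7860/v1/stats"
```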
app.py ADDED
@@ -0,0 +1 @@
+ from docsifer import app
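`app.py` simply re-exports the FastAPI instance defined in `docsifer/__init__.py`. For a local run outside Docker, a sketch that mirrors the Dockerfile's CMD (assuming the dependencies from `requirements.txt` install cleanly in your environment):

```bash
# Install dependencies, then serve the FastAPI app on all interfaces
pip install -r requirements.txt
uvicorn app:app --host 0.0.0.0 --port 7860
```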
docsifer/__init__.py ADDED
@@ -0,0 +1,384 @@
+ # filename: __init__.py
+
+ import json
+ import logging
+ import tempfile
+ from typing import Tuple, Optional
+
+ import gradio as gr
+ import pandas as pd
+ import requests
+ from fastapi import FastAPI
+ from fastapi.middleware.cors import CORSMiddleware
+ from gradio.routes import mount_gradio_app
+ from pathlib import Path
+
+ # If you want to generate unique filenames, e.g. scuid:
+ from scuid import scuid
+
+
+ # Keep only /v1 requests in the access log
+ class LogFilter(logging.Filter):
+     def filter(self, record):
+         # Only keep log records that contain "/v1" in the request path
+         if record.args and len(record.args) >= 3:
+             if "/v1" in str(record.args[2]):
+                 return True
+         return False
+
+
+ logger = logging.getLogger("uvicorn.access")
+ logger.addFilter(LogFilter())
+
+ # Application metadata
+ __version__ = "1.0.0"
+ __author__ = "lamhieu"
+ __description__ = "Docsifer: Efficient Data Conversion to Markdown."
+ __metadata__ = {
+     "project": "Docsifer",
+     "version": __version__,
+     "description": (
+         "Effortlessly convert various files to Markdown, including PDF, PowerPoint, Word, Excel, "
+         "images, audio, HTML, JSON, CSV, XML, ZIP, and more."
+     ),
+     "docs": "https://lamhieu-docsifer.hf.space/docs",
+     "github": "https://github.com/lh0x00/docsifer",
+     "spaces": "https://huggingface.co/spaces/lh0x00/docsifer",
+ }
+
+ # Docsifer API endpoints (replace with your HF Space or another URL if needed)
+ DOCSIFER_API_URL = "http://localhost:7860/v1/convert"
+ DOCSIFER_STATS_URL = "http://localhost:7860/v1/stats"
+
+ # Markdown description for the main interface
+ APP_DESCRIPTION = f"""
+ # 📝 **Docsifer: Convert Your Documents to Markdown**
+
+ Welcome to **Docsifer**, a specialized service that converts your files (PDF, PPT, Word, Excel, images, audio, HTML, JSON, CSV, XML, ZIP, etc.) into **Markdown** using **MarkItDown** at the core. Optionally, you can leverage **LLMs** (OpenAI) for advanced text extraction.
+
+ ### Features & Privacy
+
+ - **Open Source**: The entire Docsifer codebase is publicly available for review and contribution.
+ - **Efficient & Flexible**: Supports multiple file formats, ensuring quick and accurate Markdown conversion.
+ - **Privacy-Focused**: We never store user data; all processing is ephemeral. We only collect minimal anonymous usage stats for service improvement.
+ - **Production-Ready**: Easy Docker deployment, interactive Gradio playground, and comprehensive REST API documentation.
+ - **Community & Collaboration**: Contribute on [GitHub]({__metadata__["github"]}) or try it out on [Hugging Face Spaces]({__metadata__["spaces"]}).
+
+ ### 🔗 Resources
+ - [Documentation]({__metadata__["docs"]}) | [GitHub]({__metadata__["github"]}) | [Live Demo]({__metadata__["spaces"]})
+ """
+
+ # Initialize FastAPI application
+ app = FastAPI(
+     title="Docsifer Service API",
+     description=__description__,
+     version=__version__,
+     docs_url="/docs",
+     redoc_url="/redoc",
+ )
+
+ # Configure CORS
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=["*"],  # Adjust if needed for specific domains
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
+ # Import and include the existing router (which has /v1/convert, /v1/stats, etc.)
+ from .router import router
+
+ app.include_router(router, prefix="/v1")
+
+
+ def call_convert_api(
+     file_obj: bytes,
+     filename: str,
+     cleanup: bool = True,
+     openai_base_url: Optional[str] = None,
+     openai_api_key: Optional[str] = None,
+     openai_model: Optional[str] = None,
+ ) -> Tuple[str, str]:
+     """
+     Calls the /v1/convert endpoint, returning (markdown_content, md_file_path).
+     If there's an error, the first return value is an error message (str),
+     the second is an empty string.
+
+     The updated /v1/convert expects:
+       - file (UploadFile)
+       - openai (object, e.g. {"api_key": "...", "base_url": "..."})
+       - settings (object, e.g. {"cleanup": true})
+     """
+
+     if file_obj is None:
+         return ("❌ No file was uploaded.", "")
+
+     # Build the "openai" object
+     openai_dict = {}
+     if openai_api_key and openai_api_key.strip():
+         openai_dict["api_key"] = openai_api_key
+     if openai_base_url and openai_base_url.strip():
+         openai_dict["base_url"] = openai_base_url
+     if openai_model and openai_model.strip():
+         openai_dict["model"] = openai_model
+
+     # Build the "settings" object
+     settings_dict = {"cleanup": cleanup}
+
+     data = {
+         # These must match the `Form(...)` fields named "openai" and "settings"
+         "openai": json.dumps(openai_dict),
+         "settings": json.dumps(settings_dict),
+     }
+
+     if len(openai_dict) < 3:  # omit the OpenAI config unless all three fields are set
+         data.pop("openai")
+
+     # Prepare files for multipart/form-data
+     files = {"file": (filename, file_obj)}
+
+     try:
+         response = requests.post(DOCSIFER_API_URL, files=files, data=data, timeout=30)
+     except requests.exceptions.RequestException as e:
+         return (f"❌ Network Error: {str(e)}", "")
+
+     if response.status_code != 200:
+         return (f"❌ API Error {response.status_code}: {response.text}", "")
+
+     try:
+         converted = response.json()
+         # Expecting { "filename": "...", "markdown": "..." }
+         markdown_content = converted["markdown"]
+     except Exception as e:
+         return (f"❌ Error parsing JSON: {str(e)}", "")
+
+     # Write the returned Markdown to a temporary .md file so Gradio can serve it
+     with tempfile.NamedTemporaryFile(
+         mode="w+", suffix=".md", dir="/tmp", delete=False
+     ) as tmp_file:
+         tmp_file.write(markdown_content)
+         tmp_md_path = tmp_file.name
+
+     return (markdown_content, tmp_md_path)
+
+
+ def call_stats_api_df() -> Tuple[pd.DataFrame, pd.DataFrame]:
+     """
+     Calls the /v1/stats endpoint to retrieve analytics data.
+     Returns two DataFrames: (access_df, tokens_df).
+     """
+     try:
+         response = requests.get(DOCSIFER_STATS_URL, timeout=10)
+     except requests.exceptions.RequestException as e:
+         raise ValueError(f"Failed to fetch stats: {str(e)}")
+
+     if response.status_code != 200:
+         raise ValueError(f"Failed to fetch stats: {response.text}")
+
+     data = response.json()
+     # Expected structure:
+     # {
+     #   "access": { <period>: {"docsifer": count, ...}, ... },
+     #   "tokens": { <period>: {"docsifer": count, ...}, ... }
+     # }
+     access_data = data.get("access", {})
+     tokens_data = data.get("tokens", {})
+
+     def build_stats_df(bucket: dict) -> pd.DataFrame:
+         # We want columns for periods: total, daily, weekly, monthly, yearly
+         # Each row => "docsifer" (just 1 row if everything is aggregated)
+         all_models = set()
+         for period_key in ["total", "daily", "weekly", "monthly", "yearly"]:
+             period_dict = bucket.get(period_key, {})
+             all_models.update(period_dict.keys())  # typically just "docsifer"
+
+         result_dict = {
+             "Model": [],
+             "Total": [],
+             "Daily": [],
+             "Weekly": [],
+             "Monthly": [],
+             "Yearly": [],
+         }
+
+         for model in sorted(all_models):
+             result_dict["Model"].append(model)
+             result_dict["Total"].append(bucket.get("total", {}).get(model, 0))
+             result_dict["Daily"].append(bucket.get("daily", {}).get(model, 0))
+             result_dict["Weekly"].append(bucket.get("weekly", {}).get(model, 0))
+             result_dict["Monthly"].append(bucket.get("monthly", {}).get(model, 0))
+             result_dict["Yearly"].append(bucket.get("yearly", {}).get(model, 0))
+
+         return pd.DataFrame(result_dict)
+
+     access_df = build_stats_df(access_data)
+     tokens_df = build_stats_df(tokens_data)
+     return access_df, tokens_df
+
+
+ def create_main_interface():
+     """
+     Creates a Gradio Blocks interface:
+       - A 'Conversion Playground' tab for uploading a file and converting to Markdown
+       - An 'Analytics Stats' section to display usage statistics
+       - cURL examples for reference
+     """
+     with gr.Blocks(title="Docsifer: Convert to Markdown", theme="default") as demo:
+         gr.Markdown(APP_DESCRIPTION)
+
+         with gr.Tab("Conversion Playground"):
+             gr.Markdown("### Convert your files to Markdown with Docsifer.")
+
+             with gr.Row():
+                 with gr.Column():
+                     file_input = gr.File(
+                         label="Upload File",
+                         file_types=[
+                             ".pdf",
+                             ".docx",
+                             ".pptx",
+                             ".xlsx",
+                             ".html",
+                             ".htm",
+                             ".jpg",
+                             ".jpeg",
+                             ".png",
+                             ".mp3",
+                             ".wav",
+                             ".zip",
+                         ],
+                         type="binary",
+                     )
+
+                     with gr.Accordion("OpenAI Configuration (Optional)", open=False):
+                         gr.Markdown(
+                             "Provide these if you'd like **LLM-assisted** extraction. "
+                             "Supports both OpenAI and OpenAI-compatible APIs. "
+                             "If left blank, basic conversion (no LLM) will be used."
+                         )
+                         openai_base_url = gr.Textbox(
+                             label="Base URL",
+                             placeholder="https://api.openai.com/v1",
+                             value="https://api.openai.com/v1",
+                         )
+                         openai_api_key = gr.Textbox(
+                             label="API Key",
+                             placeholder="sk-...",
+                             type="password",
+                         )
+                         openai_model = gr.Textbox(
+                             label="Model ID",
+                             placeholder="e.g. gpt-4o-mini",
+                             value="gpt-4o-mini",
+                         )
+
+                     with gr.Accordion("Conversion Settings", open=True):
+                         gr.Markdown(
+                             "Enable to remove <style> tags or hidden elements from `.html` files before conversion."
+                         )
+                         cleanup_toggle = gr.Checkbox(
+                             label="Enable Cleanup",
+                             value=True,
+                         )
+
+                     convert_btn = gr.Button("Convert")
+
+                 with gr.Column():
+                     output_md = gr.Textbox(
+                         label="Conversion Result (Markdown)",
+                         lines=20,
+                         interactive=False,
+                     )
+                     # Set visible=True so the user always sees a small download button
+                     download_file = gr.File(
+                         label="Download",
+                         interactive=False,
+                         visible=True,
+                     )
+
+             gr.Markdown(
+                 """
+                 ### cURL Examples
+
+                 **Convert via File Upload (multipart/form-data)**:
+                 ```bash
+                 curl -X POST \\
+                   "https://lamhieu-docsifer.hf.space/v1/convert" \\
+                   -F "file=@/path/to/local/document.pdf" \\
+                   -F "openai={{\\"api_key\\":\\"sk-xxxxx\\",\\"model\\":\\"gpt-4o-mini\\",\\"base_url\\":\\"https://api.openai.com/v1\\"}}" \\
+                   -F "settings={{\\"cleanup\\":true}}"
+                 ```
+                 """
+             )
+
+             def on_convert(file_bytes, base_url, api_key, model_id, cleanup):
+                 """
+                 Callback for the 'Convert' button.
+                 We generate a unique name if the user uploads a file.
+                 """
+                 if not file_bytes:
+                     return "❌ Please upload a file first.", None
+
+                 unique_name = f"{scuid()}.tmp"
+                 markdown, temp_md_path = call_convert_api(
+                     file_obj=file_bytes,
+                     filename=unique_name,
+                     openai_base_url=base_url,
+                     openai_api_key=api_key,
+                     openai_model=model_id,
+                     cleanup=cleanup,
+                 )
+                 return markdown, temp_md_path
+
+             convert_btn.click(
+                 fn=on_convert,
+                 inputs=[
+                     file_input,
+                     openai_base_url,
+                     openai_api_key,
+                     openai_model,
+                     cleanup_toggle,
+                 ],
+                 outputs=[output_md, download_file],
+             )
+
+         with gr.Tab("Analytics Stats"):
+             gr.Markdown(
+                 "View Docsifer usage statistics (access count, token usage, etc.)"
+             )
+             stats_btn = gr.Button("Get Stats")
+             access_df = gr.DataFrame(
+                 label="Access Stats",
+                 headers=["Model", "Total", "Daily", "Weekly", "Monthly", "Yearly"],
+                 interactive=False,
+             )
+             tokens_df = gr.DataFrame(
+                 label="Token Stats",
+                 headers=["Model", "Total", "Daily", "Weekly", "Monthly", "Yearly"],
+                 interactive=False,
+             )
+
+             stats_btn.click(
+                 fn=call_stats_api_df,
+                 inputs=[],
+                 outputs=[access_df, tokens_df],
+             )
+
+     return demo
+
+
+ # Build the Gradio interface and mount it at the root path
+ main_interface = create_main_interface()
+ mount_gradio_app(app, main_interface, path="/")
+
+
+ # Startup / Shutdown events
+ @app.on_event("startup")
+ async def startup_event():
+     logger.info("Docsifer Service is starting up...")
+
+
+ @app.on_event("shutdown")
+ async def shutdown_event():
+     logger.info("Docsifer Service is shutting down.")
docsifer/analytics.py ADDED
@@ -0,0 +1,290 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # filename: analytics.py
2
+
3
+ import logging
4
+ import asyncio
5
+ from upstash_redis import Redis as UpstashRedis
6
+ from datetime import datetime
7
+ from collections import defaultdict
8
+ from typing import Dict
9
+ from functools import partial
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
+ class Analytics:
15
+ def __init__(
16
+ self, url: str, token: str, sync_interval: int = 60, max_retries: int = 5
17
+ ):
18
+ """
19
+ Initializes the Analytics class with an Upstash Redis client (HTTP-based),
20
+ wrapped in async methods by using run_in_executor.
21
+
22
+ We maintain two dictionaries:
23
+ - current_totals: absolute counters (loaded from Redis at startup).
24
+ - new_increments: only the new usage since last sync.
25
+
26
+ Both structures only track a single label "docsifer" for access/tokens.
27
+ """
28
+ self.url = url
29
+ self.token = token
30
+ self.sync_interval = sync_interval
31
+ self.max_retries = max_retries
32
+
33
+ # Create the synchronous Upstash Redis client over HTTP
34
+ self.redis_client = self._create_redis_client()
35
+
36
+ # current_totals: absolute counters from Redis
37
+ self.current_totals = {
38
+ "access": defaultdict(lambda: defaultdict(int)),
39
+ "tokens": defaultdict(lambda: defaultdict(int)),
40
+ }
41
+ # new_increments: only new usage since the last successful sync
42
+ self.new_increments = {
43
+ "access": defaultdict(lambda: defaultdict(int)),
44
+ "tokens": defaultdict(lambda: defaultdict(int)),
45
+ }
46
+
47
+ # Async lock to protect shared data
48
+ self.lock = asyncio.Lock()
49
+
50
+ # Start initial sync from Redis, then a periodic sync task
51
+ asyncio.create_task(self._initialize())
52
+
53
+ logger.info("Initialized Analytics with Upstash Redis: %s", url)
54
+
55
+ def _create_redis_client(self) -> UpstashRedis:
56
+ """Creates and returns a new Upstash Redis (synchronous) client."""
57
+ return UpstashRedis(url=self.url, token=self.token)
58
+
59
+ async def _initialize(self):
60
+ """
61
+ Fetch existing data from Redis into current_totals,
62
+ then start the periodic sync task.
63
+ """
64
+ try:
65
+ await self._sync_from_redis()
66
+ logger.info("Initial sync from Upstash Redis completed successfully.")
67
+ except Exception as e:
68
+ logger.error("Error during initial Redis sync: %s", e)
69
+
70
+ asyncio.create_task(self._start_sync_task())
71
+
72
+ def _get_period_keys(self):
73
+ """
74
+ Returns day, week, month, year keys based on current UTC date.
75
+ Also consider "total" if you want an all-time key.
76
+
77
+ Example: ("2025-01-14", "2025-W02", "2025-01", "2025", "total")
78
+ """
79
+ now = datetime.utcnow()
80
+ day_key = now.strftime("%Y-%m-%d")
81
+ week_key = f"{now.year}-W{now.strftime('%U')}"
82
+ month_key = now.strftime("%Y-%m")
83
+ year_key = now.strftime("%Y")
84
+ # For convenience, also track everything in "total".
85
+ return day_key, week_key, month_key, year_key, "total"
86
+
87
+ async def access(self, tokens: int):
88
+ """
89
+ Records an access and token usage for the "docsifer" label.
90
+ This function updates both current_totals and new_increments.
91
+ """
92
+ day_key, week_key, month_key, year_key, total_key = self._get_period_keys()
93
+
94
+ async with self.lock:
95
+ # For each time period, increment "docsifer" usage
96
+ for period in [day_key, week_key, month_key, year_key, total_key]:
97
+ # Increase new usage
98
+ self.new_increments["access"][period]["docsifer"] += 1
99
+ self.new_increments["tokens"][period]["docsifer"] += tokens
100
+
101
+ # Also update the absolute totals for immediate stats
102
+ self.current_totals["access"][period]["docsifer"] += 1
103
+ self.current_totals["tokens"][period]["docsifer"] += tokens
104
+
105
+ async def stats(self) -> Dict[str, Dict[str, Dict[str, int]]]:
106
+ """
107
+ Returns a snapshot of current stats (absolute totals).
108
+ We use current_totals, which is always up to date.
109
+ """
110
+ async with self.lock:
111
+ return {
112
+ "access": {
113
+ period: dict(models)
114
+ for period, models in self.current_totals["access"].items()
115
+ },
116
+ "tokens": {
117
+ period: dict(models)
118
+ for period, models in self.current_totals["tokens"].items()
119
+ },
120
+ }
121
+
122
+ async def _sync_from_redis(self):
123
+ """
124
+ Pull existing data from Redis into current_totals and reset new_increments.
125
+ We read "analytics:access:*" and "analytics:tokens:*" keys via SCAN.
126
+ """
127
+ loop = asyncio.get_running_loop()
128
+
129
+ async with self.lock:
130
+ # Reset both structures
131
+ self.current_totals = {
132
+ "access": defaultdict(lambda: defaultdict(int)),
133
+ "tokens": defaultdict(lambda: defaultdict(int)),
134
+ }
135
+ self.new_increments = {
136
+ "access": defaultdict(lambda: defaultdict(int)),
137
+ "tokens": defaultdict(lambda: defaultdict(int)),
138
+ }
139
+
140
+ # ---------------------------
141
+ # Load "access" data
142
+ # ---------------------------
143
+ cursor = 0
144
+ while True:
145
+ scan_result = await loop.run_in_executor(
146
+ None,
147
+ partial(
148
+ self.redis_client.scan,
149
+ cursor=cursor,
150
+ match="analytics:access:*",
151
+ count=1000,
152
+ ),
153
+ )
154
+ cursor, keys = scan_result[0], scan_result[1]
155
+
156
+ for key in keys:
157
+ # key => "analytics:access:<period>"
158
+ period = key.replace("analytics:access:", "")
159
+ data = await loop.run_in_executor(
160
+ None,
161
+ partial(self.redis_client.hgetall, key),
162
+ )
163
+ for name_key, count_str in data.items():
164
+ self.current_totals["access"][period][name_key] = int(count_str)
165
+
166
+ if cursor == 0:
167
+ break
168
+
169
+ # ---------------------------
170
+ # Load "tokens" data
171
+ # ---------------------------
172
+ cursor = 0
173
+ while True:
174
+ scan_result = await loop.run_in_executor(
175
+ None,
176
+ partial(
177
+ self.redis_client.scan,
178
+ cursor=cursor,
179
+ match="analytics:tokens:*",
180
+ count=1000,
181
+ ),
182
+ )
183
+ cursor, keys = scan_result[0], scan_result[1]
184
+
185
+ for key in keys:
186
+ # key => "analytics:tokens:<period>"
187
+ period = key.replace("analytics:tokens:", "")
188
+ data = await loop.run_in_executor(
189
+ None,
190
+ partial(self.redis_client.hgetall, key),
191
+ )
192
+ for name_key, count_str in data.items():
193
+ self.current_totals["tokens"][period][name_key] = int(count_str)
194
+
195
+ if cursor == 0:
196
+ break
197
+
198
+ async def _sync_to_redis(self):
199
+ """
200
+ Push the new_increments to Redis with HINCRBY,
201
+ then reset new_increments to zero if successful.
202
+ """
203
+ loop = asyncio.get_running_loop()
204
+
205
+ async with self.lock:
206
+ try:
207
+ # Sync "access" increments
208
+ for period, models in self.new_increments["access"].items():
209
+ redis_key = f"analytics:access:{period}"
210
+ for name_key, count_val in models.items():
211
+ if count_val != 0:
212
+ await loop.run_in_executor(
213
+ None,
214
+ partial(
215
+ self.redis_client.hincrby,
216
+ redis_key,
217
+ name_key,
218
+ count_val,
219
+ ),
220
+ )
221
+
222
+ # Sync "tokens" increments
223
+ for period, models in self.new_increments["tokens"].items():
224
+ redis_key = f"analytics:tokens:{period}"
225
+ for name_key, count_val in models.items():
226
+ if count_val != 0:
227
+ await loop.run_in_executor(
228
+ None,
229
+ partial(
230
+ self.redis_client.hincrby,
231
+ redis_key,
232
+ name_key,
233
+ count_val,
234
+ ),
235
+ )
236
+
237
+ logger.info("Analytics data synced to Upstash Redis.")
238
+
239
+ # Reset new_increments only
240
+ self.new_increments = {
241
+ "access": defaultdict(lambda: defaultdict(int)),
242
+ "tokens": defaultdict(lambda: defaultdict(int)),
243
+ }
244
+
245
+ except Exception as e:
246
+ logger.error("Error syncing to Redis: %s", e)
247
+ raise e
248
+
249
+ async def _start_sync_task(self):
250
+ """Periodically sync local increments to Redis."""
251
+ while True:
252
+ await asyncio.sleep(self.sync_interval)
253
+ try:
254
+ await self._sync_to_redis()
255
+ except Exception as e:
256
+ logger.error("Error during scheduled sync: %s", e)
257
+ await self._handle_redis_reconnection()
258
+
259
+ async def _handle_redis_reconnection(self):
260
+ """
261
+ Attempts to reconnect to Redis if connection fails (HTTP-based, stateless).
262
+ """
263
+ loop = asyncio.get_running_loop()
264
+ retry_count = 0
265
+ delay = 1
266
+
267
+ while retry_count < self.max_retries:
268
+ try:
269
+ logger.info(
270
+ "Attempting Redis reconnection (attempt %d)...", retry_count + 1
271
+ )
272
+ await loop.run_in_executor(None, self.redis_client.close)
273
+ self.redis_client = self._create_redis_client()
274
+ logger.info("Reconnected to Redis successfully.")
275
+ return
276
+ except Exception as e:
277
+ logger.error("Reconnection attempt %d failed: %s", retry_count + 1, e)
278
+ retry_count += 1
279
+ await asyncio.sleep(delay)
280
+ delay *= 2
281
+
282
+ logger.critical("Max reconnection attempts reached. Redis is unavailable.")
283
+
284
+ async def close(self):
285
+ """
286
+ Close the Upstash Redis client (though it's stateless over HTTP).
287
+ """
288
+ loop = asyncio.get_running_loop()
289
+ await loop.run_in_executor(None, self.redis_client.close)
290
+ logger.info("Redis client closed.")
docsifer/router.py ADDED
@@ -0,0 +1,92 @@
1
+ # filename: router.py
2
+
3
+ import logging
4
+ import json
5
+ import tempfile
6
+ import os
7
+ from pathlib import Path
8
+
9
+ from fastapi import APIRouter, HTTPException, UploadFile, File, Form, BackgroundTasks
10
+ from pydantic import BaseModel
11
+
12
+ from .service import DocsiferService
13
+ from .analytics import Analytics
14
+
15
+ logger = logging.getLogger(__name__)
16
+ router = APIRouter(tags=["v1"], responses={404: {"description": "Not found"}})
17
+
18
+ # Initialize analytics (single aggregator = "docsifer")
19
+ analytics = Analytics(
20
+ url=os.environ.get("REDIS_URL", "redis://localhost:6379/0"),
21
+ token=os.environ.get("REDIS_TOKEN", "***"),
22
+ sync_interval=30 * 60, # 30 minutes
23
+ )
24
+
25
+ # Initialize the Docsifer service (token counting with gpt-4o)
26
+ docsifer_service = DocsiferService(model_name="gpt-4o")
27
+
28
+
29
+ class ConvertResponse(BaseModel):
30
+ filename: str
31
+ markdown: str
32
+
33
+
34
+ @router.post("/convert", response_model=ConvertResponse)
35
+ async def convert_document(
36
+ background_tasks: BackgroundTasks,
37
+ file: UploadFile = File(..., description="File to convert (1 file per request)"),
38
+ openai: str = Form("{}", description="OpenAI config as a JSON object"),
39
+ settings: str = Form("{}", description="Settings as a JSON object"),
40
+ ):
41
+ """
42
+ Convert a single uploaded file to Markdown, optionally using OpenAI for advanced text extraction.
43
+ - `openai` is a JSON string with keys: {"api_key": "...", "base_url": "..."}
44
+ - `settings` is a JSON string with keys: {"cleanup": bool}
45
+ - We do not store or track model_id in analytics; everything is aggregated as "docsifer".
46
+ """
47
+ try:
48
+ try:
49
+ openai_config = json.loads(openai) if openai else {}
50
+ except json.JSONDecodeError:
51
+ raise ValueError("Invalid JSON in 'openai' parameter.")
52
+
53
+ try:
54
+ settings_config = json.loads(settings) if settings else {}
55
+ except json.JSONDecodeError:
56
+ raise ValueError("Invalid JSON in 'settings' parameter.")
57
+
58
+ cleanup = settings_config.get("cleanup", True)
59
+
60
+ with tempfile.TemporaryDirectory() as tmpdir:
61
+ # Use only the basename so a crafted filename cannot escape the temp dir
+ temp_path = Path(tmpdir) / Path(file.filename).name
62
+ contents = await file.read()
63
+ temp_path.write_bytes(contents)
64
+
65
+ result, token_count = await docsifer_service.convert_file(
66
+ file_path=str(temp_path), openai_config=openai_config, cleanup=cleanup
67
+ )
68
+
69
+ # Track usage in analytics (single aggregator => "docsifer")
70
+ background_tasks.add_task(analytics.access, token_count)
71
+
72
+ return ConvertResponse(**result)
73
+
74
+ except ValueError as ve:
+ # Malformed client JSON should surface as a 400, not a 500
+ raise HTTPException(status_code=400, detail=str(ve))
+ except Exception as e:
75
+ msg = f"Failed to convert document. Error: {str(e)}"
76
+ logger.error(msg)
77
+ raise HTTPException(status_code=500, detail=msg)
78
+
79
+
80
+ @router.get("/stats")
81
+ async def get_stats():
82
+ """
83
+ Return usage statistics (access, tokens) from the Analytics system.
84
+ All data is stored under "docsifer".
85
+ """
86
+ try:
87
+ data = await analytics.stats()
88
+ return data
89
+ except Exception as e:
90
+ msg = f"Failed to fetch analytics stats: {str(e)}"
91
+ logger.error(msg)
92
+ raise HTTPException(status_code=500, detail=msg)
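
For reference, a client-side sketch of the /convert endpoint. The host, port, and route prefix are deployment details not shown in this commit, so treat them as assumptions:

import json
import requests

URL = "http://localhost:7860/v1/convert"  # assumed mount point

with open("report.pdf", "rb") as f:
    resp = requests.post(
        URL,
        files={"file": ("report.pdf", f)},
        data={
            "openai": json.dumps({"api_key": "sk-...", "model": "gpt-4o-mini"}),
            "settings": json.dumps({"cleanup": True}),
        },
        timeout=120,
    )
resp.raise_for_status()
payload = resp.json()  # {"filename": ..., "markdown": ...}
print(payload["filename"], len(payload["markdown"]))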
docsifer/service.py ADDED
@@ -0,0 +1,133 @@
1
+ # filename: service.py
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ import tempfile
7
+ from pathlib import Path
8
+ from typing import Optional, Dict, Tuple, Any
9
+
10
+ import tiktoken
11
+ from pyquery import PyQuery as pq
12
+ from markitdown import MarkItDown
13
+ from openai import OpenAI
14
+
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
+ class DocsiferService:
20
+ """
21
+ A service that converts local files to Markdown using MarkItDown,
22
+ optionally with an OpenAI LLM for advanced extraction.
23
+ Token counting uses "gpt-4o" as a heuristic via tiktoken.
24
+ """
25
+
26
+ def __init__(self, model_name: str = "gpt-4o"):
27
+ """
28
+ Initialize the DocsiferService with a basic MarkItDown instance
29
+ and a tiktoken encoder for counting tokens using "gpt-4o".
30
+ """
31
+ self._basic_markitdown = MarkItDown() # MarkItDown without LLM
32
+ # Use "gpt-4o" for token counting
33
+ try:
34
+ self._encoder = tiktoken.encoding_for_model(model_name)
35
+ except Exception as e:
36
+ logger.warning(
37
+ "Error loading tiktoken model '%s': %s. Falling back to 'gpt-3.5-turbo-0301'.",
38
+ model_name,
39
+ e,
40
+ )
41
+ self._encoder = tiktoken.encoding_for_model("gpt-3.5-turbo-0301")
42
+
43
+ logger.info("DocsiferService initialized with token model '%s'.", model_name)
44
+
45
+ def _init_markitdown_with_llm(self, openai_config: Dict[str, Any]) -> MarkItDown:
46
+ """
47
+ If openai_config has an 'api_key', configure openai and return
48
+ a MarkItDown instance with that OpenAI client.
49
+ """
50
+ api_key = openai_config.get("api_key", "")
51
+ if not api_key:
52
+ logger.info("No OpenAI API key provided. Using basic MarkItDown.")
53
+ return self._basic_markitdown
54
+
55
+ model = openai_config.get("model", "gpt-4o-mini")
56
+ base_url = openai_config.get("base_url", "https://api.openai.com/v1")
57
+ client = OpenAI(api_key=api_key, base_url=base_url)
58
+
59
+ logger.info("Initialized OpenAI with base_url=%s", base_url)
60
+ return MarkItDown(llm_client=client, llm_model=model)
61
+
62
+ def _maybe_cleanup_html(self, html_file: Path) -> None:
63
+ """
64
+ If the file is HTML, remove <style> tags, optionally hidden elements, etc.
65
+ """
66
+ try:
67
+ content = html_file.read_text(encoding="utf-8", errors="ignore")
68
+ d = pq(content)
69
+ # Remove <style> tags and hidden elements
+ d("style").remove()
70
+ d('*[style*="display:none"]').remove()
71
+ cleaned_html = str(d).strip()
73
+ html_file.write_text(cleaned_html, encoding="utf-8")
74
+ except Exception as e:
75
+ logger.error("HTML cleanup failed for %s: %s", html_file, e)
76
+
77
+ def _count_tokens(self, text: str) -> int:
78
+ """
79
+ Count tokens using the configured tiktoken encoder.
80
+ Fallback to whitespace-based counting if an error occurs.
81
+ """
82
+ try:
83
+ return len(self._encoder.encode(text))
84
+ except Exception as e:
85
+ logger.warning(
86
+ "Token counting failed, fallback to whitespace. Error: %s", e
87
+ )
88
+ return len(text.split())
89
+
90
+ async def convert_file(
91
+ self, file_path: str, openai_config: Optional[dict] = None, cleanup: bool = True
92
+ ) -> Tuple[Dict[str, str], int]:
93
+ """
94
+ Converts a file at `file_path` to Markdown.
95
+ - If `cleanup` is True and file is .html/.htm, does HTML cleanup.
96
+ - If `openai_config` has a valid API key, use LLM-based MarkItDown.
97
+ Returns ({"filename": filename, "markdown": md_string}, token_count).
98
+ """
99
+ src = Path(file_path)
100
+ if not src.exists():
101
+ raise FileNotFoundError(f"File not found: {file_path}")
102
+
103
+ logger.info("Converting file: %s (cleanup=%s)", file_path, cleanup)
104
+
105
+ # Use a temp directory so MarkItDown sees the real file extension
106
+ with tempfile.TemporaryDirectory() as tmpdir:
107
+ tmp_path = Path(tmpdir) / src.name
108
+ tmp_path.write_bytes(src.read_bytes())
109
+
110
+ # If it's HTML and cleanup is requested
111
+ if cleanup and tmp_path.suffix.lower() in (".html", ".htm"):
112
+ self._maybe_cleanup_html(tmp_path)
113
+
114
+ # Decide whether to use LLM or basic
115
+ if openai_config and openai_config.get("api_key"):
116
+ md_converter = self._init_markitdown_with_llm(openai_config)
117
+ else:
118
+ md_converter = self._basic_markitdown
119
+
120
+ try:
121
+ result_obj = md_converter.convert(str(tmp_path))
122
+ except Exception as e:
123
+ logger.error("MarkItDown conversion failed: %s", e)
124
+ raise RuntimeError(f"Conversion failed for '{file_path}': {e}")
125
+
126
+ # Count tokens
127
+ token_count = self._count_tokens(result_obj.text_content)
128
+
129
+ result_dict = {
130
+ "filename": src.name,
131
+ "markdown": result_obj.text_content,
132
+ }
133
+ return result_dict, token_count
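
The service can also be exercised directly, without the HTTP layer. A minimal sketch using the signature above (the input file is illustrative; omit openai_config to use the basic, non-LLM converter):

import asyncio
from docsifer.service import DocsiferService

async def main():
    svc = DocsiferService(model_name="gpt-4o")
    result, tokens = await svc.convert_file("notes.html", cleanup=True)
    print(f"{result['filename']}: {tokens} tokens")
    print(result["markdown"][:200])

asyncio.run(main())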
pyproject.toml ADDED
@@ -0,0 +1,17 @@
1
+ [tool.poetry]
2
+ name = "docsifer"
3
+ version = "1.0.0"
4
+ description = "Docsifer is a powerful tool for converting various data formats into Markdown for applications such as indexing, text analysis, and more. It supports PDF, PowerPoint, Word, Excel, Images, Audio, HTML, and other text-based formats, and leverages Large Language Models (LLMs) to enhance performance."
5
+ authors = ["Hieu Lam <[email protected]>"]
6
+ readme = "README.md"
7
+ homepage = "https://github.com/lh0x00/docsifer"
8
+ repository = "https://github.com/lh0x00/docsifer"
9
+ license = "MIT"
10
+
11
+ [tool.poetry.dependencies]
12
+ python = "^3.10"
13
+
14
+
15
+ [build-system]
16
+ requires = ["poetry-core"]
17
+ build-backend = "poetry.core.masonry.api"
requirements.txt ADDED
@@ -0,0 +1,15 @@
1
+ gradio
2
+ fastapi
3
+ uvicorn
4
+ requests
5
+ pydantic
6
+ cachetools
7
+ upstash_redis==1.2.0
8
+ markitdown
9
+ openai
10
+ pyquery
11
+ tiktoken
12
+ scuid
13
+ python-magic
14
+ plotly
15
+ matplotlib