openfree committed on
Commit
72c2546
·
verified ·
1 Parent(s): a0b69d1

Create app-backup.py

Browse files
Files changed (1) hide show
  1. app-backup.py +413 -0
app-backup.py ADDED
@@ -0,0 +1,413 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import json
3
+ import os
4
+ import time
5
+ import zipfile
6
+ from pathlib import Path
7
+ import re
8
+ import uuid
9
+ import pymupdf
10
+
11
# --- Runtime bootstrap: executed once, at import time -----------------------
# Replace the packaged magic-pdf with the MinerU dev branch, then fetch and
# run MinerU's model download script. Exit codes of the shell commands are
# not checked (same as a plain sequence of os.system calls).
for setup_command in (
    'pip uninstall -y magic-pdf',
    'pip install git+https://github.com/opendatalab/MinerU.git@dev',
    'wget https://github.com/opendatalab/MinerU/raw/dev/scripts/download_models_hf.py -O download_models_hf.py',
    'python download_models_hf.py',
):
    os.system(setup_command)

# Load the magic-pdf configuration so it can be patched in place.
# NOTE(review): presumably this file is created by download_models_hf.py — confirm.
with open('/home/user/magic-pdf.json', 'r') as file:
    data = json.load(file)

# Force CUDA, and switch on LLM-aided title handling only when an API key is
# supplied through the `apikey` environment variable.
data['device-mode'] = "cuda"
if os.getenv('apikey'):
    title_aided = data['llm-aided-config']['title_aided']
    title_aided['api_key'] = os.getenv('apikey')
    title_aided['enable'] = True

# Persist the patched configuration.
with open('/home/user/magic-pdf.json', 'w') as file:
    json.dump(data, file, indent=4)

# Copy the bundled `paddleocr` directory into the user's home directory.
os.system('cp -r paddleocr /home/user/.paddleocr')
29
+ from gradio_pdf import PDF
30
+
31
+ import gradio as gr
32
+ from loguru import logger
33
+
34
+ from magic_pdf.data.data_reader_writer import FileBasedDataReader
35
+ from magic_pdf.libs.hash_utils import compute_sha256
36
+ from magic_pdf.tools.common import do_parse, prepare_env
37
+
38
def create_css():
    """Return the custom CSS stylesheet passed to ``gr.Blocks(css=...)``.

    The stylesheet is a constant string; the section comments inside it are
    part of the CSS text itself and are therefore left untouched.
    """
    return """
    /* 전체 스타일 */
    .gradio-container {
        background: linear-gradient(135deg, #EFF6FF 0%, #F5F3FF 100%);
        max-width: 1200px !important;
        margin: 0 auto !important;
        padding: 2rem !important;
    }
    /* 제목 스타일 */
    .title-area {
        text-align: center;
        margin-bottom: 2rem;
        padding: 1rem;
        background: white;
        border-radius: 1rem;
        box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1);
    }
    .title-area h1 {
        background: linear-gradient(90deg, #2563EB 0%, #7C3AED 100%);
        -webkit-background-clip: text;
        -webkit-text-fill-color: transparent;
        font-size: 2.5rem;
        font-weight: bold;
        margin-bottom: 0.5rem;
    }
    .title-area p {
        color: #6B7280;
        font-size: 1.1rem;
    }
    /* 컴포넌트 스타일링 */
    .gr-box, .gr-panel {
        border: 2px solid #E0E7FF !important;
        border-radius: 12px !important;
        box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1) !important;
        background: white !important;
    }
    /* 파일 업로드 영역 */
    .file-upload {
        border: 2px dashed #93C5FD !important;
        border-radius: 8px !important;
        padding: 2rem !important;
        background: #F0F9FF !important;
        transition: all 0.3s ease;
    }
    .file-upload:hover {
        background: #E0F2FE !important;
        border-color: #60A5FA !important;
    }
    /* 버튼 스타일링 */
    .gr-button.primary-button {
        background: linear-gradient(90deg, #2563EB 0%, #7C3AED 100%) !important;
        color: white !important;
        border: none !important;
        border-radius: 8px !important;
        padding: 0.75rem 1.5rem !important;
        font-weight: bold !important;
        transition: opacity 0.2s !important;
    }
    .gr-button.primary-button:hover {
        opacity: 0.9 !important;
    }
    .gr-button.secondary-button {
        background: white !important;
        color: #4B5563 !important;
        border: 1px solid #D1D5DB !important;
        border-radius: 8px !important;
        padding: 0.75rem 1.5rem !important;
    }
    .gr-button.secondary-button:hover {
        background: #F9FAFB !important;
    }
    /* 슬라이더 스타일링 */
    .gr-slider {
        background: #E0E7FF !important;
    }
    .gr-slider .gr-slider-handle {
        background: #4F46E5 !important;
    }
    /* 체크박스 스타일링 */
    .gr-checkbox {
        border-color: #6366F1 !important;
    }
    .gr-checkbox:checked {
        background-color: #4F46E5 !important;
    }
    /* 탭 스타일링 */
    .gr-tabs {
        border-bottom: 2px solid #E0E7FF !important;
    }
    .gr-tab-button {
        color: #6B7280 !important;
        padding: 0.75rem 1rem !important;
        font-weight: 500 !important;
    }
    .gr-tab-button.selected {
        color: #4F46E5 !important;
        border-bottom: 2px solid #4F46E5 !important;
    }
    /* 마크다운 출력 영역 */
    .markdown-output {
        background: white !important;
        border-radius: 8px !important;
        padding: 1rem !important;
        box-shadow: inset 0 2px 4px rgba(0, 0, 0, 0.05) !important;
    }
    """
145
+
146
def read_fn(path):
    """Read the file at ``path`` through a ``FileBasedDataReader``.

    The reader is rooted at the file's parent directory and asked for the
    file's base name. Returns whatever ``FileBasedDataReader.read`` returns
    (presumably the raw file bytes — confirm against the magic_pdf API).
    """
    parent_dir, base_name = os.path.split(path)
    reader = FileBasedDataReader(parent_dir)
    return reader.read(base_name)
149
+
150
def parse_pdf(doc_path, output_dir, end_page_id, is_ocr, layout_mode, formula_enable, table_enable, language):
    """Run the magic_pdf pipeline on one document and report where output went.

    Args:
        doc_path: Path of the PDF to parse.
        output_dir: Directory for pipeline output; created if missing.
        end_page_id: Forwarded to ``do_parse`` as ``end_page_id``.
        is_ocr: When truthy the parse method is ``"ocr"``, otherwise ``"auto"``.
        layout_mode: Forwarded to ``do_parse`` as ``layout_model``.
        formula_enable: Forwarded to ``do_parse`` as ``formula_enable``.
        table_enable: Forwarded to ``do_parse`` as ``table_enable``.
        language: Forwarded to ``do_parse`` as ``lang``.

    Returns:
        ``(local_md_dir, file_name)``: the markdown directory reported by
        ``prepare_env`` and the timestamp-suffixed name used for this run.

    Raises:
        Exception: any failure is logged with its traceback and re-raised.
            Previously the function fell through and returned ``None``, which
            made the caller's tuple-unpacking fail with an unrelated
            ``TypeError`` that hid the real cause.
    """
    os.makedirs(output_dir, exist_ok=True)

    try:
        # The timestamp suffix keeps repeated runs on the same document from
        # sharing an output name.
        file_name = f"{str(Path(doc_path).stem)}_{time.time()}"
        pdf_data = read_fn(doc_path)
        parse_method = "ocr" if is_ocr else "auto"
        # Only the markdown directory is needed here; the image directory is
        # not used by this function.
        _, local_md_dir = prepare_env(output_dir, file_name, parse_method)
        # NOTE(review): the positional `[]` and `False` arguments are passed
        # through unchanged — confirm their meaning against do_parse's signature.
        do_parse(
            output_dir,
            file_name,
            pdf_data,
            [],
            parse_method,
            False,
            end_page_id=end_page_id,
            layout_model=layout_mode,
            formula_enable=formula_enable,
            table_enable=table_enable,
            lang=language,
            f_dump_orig_pdf=False,
        )
        return local_md_dir, file_name
    except Exception as e:
        logger.exception(e)
        raise
178
+
179
def compress_directory_to_zip(directory_path, output_zip_path):
    """Pack every file under ``directory_path`` into a deflate-compressed zip.

    Archive member names are stored relative to ``directory_path``.

    Returns:
        ``0`` on success, ``-1`` if anything raised (the exception is logged).
    """
    try:
        with zipfile.ZipFile(output_zip_path, 'w', zipfile.ZIP_DEFLATED) as archive:
            for current_dir, _subdirs, filenames in os.walk(directory_path):
                for filename in filenames:
                    source = os.path.join(current_dir, filename)
                    archive.write(source, os.path.relpath(source, directory_path))
    except Exception as e:
        logger.exception(e)
        return -1
    return 0
191
+
192
def image_to_base64(image_path):
    """Return the contents of the file at ``image_path`` as a base64 ``str``."""
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')
195
+
196
def replace_image_with_base64(markdown_text, image_dir_path):
    """Inline local images referenced by markdown as base64 ``data:`` URIs.

    Every ``![alt](target)`` link whose target can be read as a file under
    ``image_dir_path`` is rewritten to ``![target](data:<mime>;base64,...)``
    (the alt text becomes the original target, as before).

    Links whose target cannot be read — a missing file, a remote URL, a
    directory — are left exactly as they were instead of aborting the whole
    conversion with an exception.

    The MIME type is guessed from the file extension and falls back to
    ``image/jpeg`` when the guess is missing or is not an image type.

    Args:
        markdown_text: Markdown source to rewrite.
        image_dir_path: Directory that relative image targets are resolved
            against.

    Returns:
        The rewritten markdown text.
    """
    import mimetypes  # local import: keeps the file's top-level imports untouched

    pattern = r'\!\[(?:[^\]]*)\]\(([^)]+)\)'

    def replace(match):
        relative_path = match.group(1)
        full_path = os.path.join(image_dir_path, relative_path)
        try:
            with open(full_path, "rb") as image_file:
                base64_image = base64.b64encode(image_file.read()).decode('utf-8')
        except OSError:
            # Unreadable target: keep the original link text untouched.
            return match.group(0)
        mime_type, _ = mimetypes.guess_type(full_path)
        if not mime_type or not mime_type.startswith("image/"):
            mime_type = "image/jpeg"
        return f"![{relative_path}](data:{mime_type};base64,{base64_image})"

    return re.sub(pattern, replace, markdown_text)
204
+
205
def to_markdown(file_path, end_pages, is_ocr, layout_mode, formula_enable, table_enable, language):
    """Convert an uploaded PDF/image to markdown and collect the outputs.

    Args:
        file_path: Path of the uploaded file (PDF or image).
        end_pages: Maximum number of pages to convert; capped at 20.
        is_ocr, layout_mode, formula_enable, table_enable, language:
            Forwarded unchanged to ``parse_pdf``.

    Returns:
        A 4-tuple ``(md_content, txt_content, archive_zip_path, new_pdf_path)``:
        markdown with images inlined as base64, the raw markdown text, the
        path of the zip archive of the output directory (``None`` when
        compression failed), and the expected path of the ``*_layout.pdf``
        file inside the output directory.

    Raises:
        gr.Error: if no file was supplied, or if parsing produced no result.
    """
    # Without this guard a missing upload surfaces later as an unrelated error.
    if not file_path:
        raise gr.Error("PDF 또는 이미지 파일을 먼저 업로드하세요")

    file_path = to_pdf(file_path)
    end_pages = min(end_pages, 20)

    # parse_pdf takes the last page as a zero-based id, hence the `- 1`.
    parsed = parse_pdf(file_path, './output', end_pages - 1, is_ocr,
                       layout_mode, formula_enable, table_enable, language)
    if parsed is None:
        # Defensive: tolerate a parse_pdf that logs its failure and returns None.
        raise gr.Error("문서 변환에 실패했습니다. 로그를 확인하세요")
    local_md_dir, file_name = parsed

    archive_zip_path = os.path.join("./output", compute_sha256(local_md_dir) + ".zip")
    zip_archive_success = compress_directory_to_zip(local_md_dir, archive_zip_path)
    if zip_archive_success == 0:
        logger.info("압축 성공")
    else:
        logger.error("압축 실패")
        # Do not hand a missing or partial archive to the download component.
        archive_zip_path = None

    md_path = os.path.join(local_md_dir, file_name + ".md")
    with open(md_path, 'r', encoding='utf-8') as f:
        txt_content = f.read()
    md_content = replace_image_with_base64(txt_content, local_md_dir)
    new_pdf_path = os.path.join(local_md_dir, file_name + "_layout.pdf")
    return md_content, txt_content, archive_zip_path, new_pdf_path
223
+
224
def to_pdf(file_path):
    """Return a path to a PDF version of ``file_path``.

    If pymupdf reports the document as a PDF, the input path is returned
    unchanged. Otherwise the document is converted to PDF bytes and written
    to a new ``<uuid4>.pdf`` file next to the input; that new path is returned.
    """
    with pymupdf.open(file_path) as document:
        if document.is_pdf:
            return file_path

        # Not a PDF: convert it and store the result beside the source file
        # under a collision-free name.
        pdf_bytes = document.convert_to_pdf()
        unique_filename = f"{uuid.uuid4()}.pdf"
        converted_path = os.path.join(os.path.dirname(file_path), unique_filename)
        with open(converted_path, 'wb') as converted_file:
            converted_file.write(pdf_bytes)
        return converted_path
235
+
236
# Delimiter pairs handed to gr.Markdown(latex_delimiters=...):
# `$$...$$` is display math, `$...$` is inline math.
latex_delimiters = [
    {"left": "$$", "right": "$$", "display": True},
    {"left": "$", "right": "$", "display": False},
]
238
+
239
def init_model():
    """Eagerly request both model variants from magic_pdf's ``ModelSingleton``.

    Calls ``get_model(False, False)`` and then ``get_model(True, False)``;
    the returned objects are not kept.
    NOTE(review): judging by the log messages, the first flag selects the OCR
    variant and the call is a warm-up — confirm against ModelSingleton.

    Returns:
        ``0`` on success, ``-1`` if model loading raised (the exception is
        logged with its traceback).
    """
    from magic_pdf.model.doc_analyze_by_custom_model import ModelSingleton
    try:
        model_manager = ModelSingleton()
        model_manager.get_model(False, False)
        logger.info("txt_model init final")
        model_manager.get_model(True, False)
        logger.info("ocr_model init final")
        return 0
    except Exception as e:
        logger.exception(e)
        return -1
251
+
252
# Load the models once at import time and log the status code
# (0 = success, -1 = failure, as returned by init_model).
model_init = init_model()
logger.info(f"model_init: {model_init}")
254
+
255
# Language codes offered in the UI's language dropdown, grouped by script.
latin_lang = [
    'af', 'az', 'bs', 'cs', 'cy', 'da', 'de', 'es', 'et', 'fr', 'ga', 'hr',
    'hu', 'id', 'is', 'it', 'ku', 'la', 'lt', 'lv', 'mi', 'ms', 'mt', 'nl',
    'no', 'oc', 'pi', 'pl', 'pt', 'ro', 'rs_latin', 'sk', 'sl', 'sq', 'sv',
    'sw', 'tl', 'tr', 'uz', 'vi', 'french', 'german',
]
arabic_lang = ['ar', 'fa', 'ug', 'ur']
cyrillic_lang = [
    'ru', 'rs_cyrillic', 'be', 'bg', 'uk', 'mn', 'abq', 'ady', 'kbd', 'ava',
    'dar', 'inh', 'che', 'lbe', 'lez', 'tab',
]
devanagari_lang = [
    'hi', 'mr', 'ne', 'bh', 'mai', 'ang', 'bho', 'mah', 'sck', 'new', 'gom',
    'sa', 'bgc',
]
other_lang = ['ch', 'en', 'korean', 'japan', 'chinese_cht', 'ta', 'te', 'ka']

# Dropdown choices: the empty string and 'auto' first, then every group in
# the order other, latin, arabic, cyrillic, devanagari.
all_lang = [
    '',
    'auto',
    *other_lang,
    *latin_lang,
    *arabic_lang,
    *cyrillic_lang,
    *devanagari_lang,
]
274
+
275
if __name__ == "__main__":
    # Build the Gradio UI: inputs and PDF preview in the left column, results
    # in the right column, then wire the event handlers and launch.
    with gr.Blocks(title="OCR FLEX", css=create_css()) as demo:
        # Title area
        with gr.Row(elem_classes="title-area"):
            gr.HTML("""
                <h1>OCR FLEX</h1>
                <p>PDF와 이미지에서 텍스트를 빠르고 정확하게 추출하세요</p>
            """)

        with gr.Row():
            # Left panel: upload, conversion options, actions, preview
            with gr.Column(variant='panel', scale=5):
                file = gr.File(
                    label="PDF 또는 이미지 파일을 업로드하세요",
                    file_types=[".pdf", ".png", ".jpeg", ".jpg"],
                    elem_classes="file-upload"
                )

                max_pages = gr.Slider(
                    1, 20, 10,
                    step=1,
                    label='최대 변환 페이지 수',
                    elem_classes="custom-slider"
                )

                with gr.Row():
                    layout_mode = gr.Dropdown(
                        ["layoutlmv3", "doclayout_yolo"],
                        label="레이아웃 모델",
                        value="doclayout_yolo",
                        elem_classes="custom-dropdown"
                    )
                    language = gr.Dropdown(
                        all_lang,
                        label="언어",
                        value='auto',
                        elem_classes="custom-dropdown"
                    )

                with gr.Row():
                    formula_enable = gr.Checkbox(
                        label="수식 인식 활성화",
                        value=True,
                        elem_classes="custom-checkbox"
                    )
                    is_ocr = gr.Checkbox(
                        label="OCR 강제 활성화",
                        value=False,
                        elem_classes="custom-checkbox"
                    )
                    table_enable = gr.Checkbox(
                        label="표 인식 활성화(테스트)",
                        value=True,
                        elem_classes="custom-checkbox"
                    )

                with gr.Row():
                    change_bu = gr.Button(
                        "변환",
                        elem_classes="primary-button"
                    )
                    clear_bu = gr.ClearButton(
                        value="초기화",
                        elem_classes="secondary-button"
                    )

                pdf_show = PDF(
                    label='PDF 미리보기',
                    interactive=False,
                    visible=True,
                    height=800,
                    elem_classes="pdf-preview"
                )

                # Example PDFs shipped next to this script. The section is
                # skipped when the directory is absent, so a missing
                # `examples/` folder no longer aborts startup with
                # FileNotFoundError.
                example_root = os.path.join(os.path.dirname(__file__), "examples")
                if os.path.isdir(example_root):
                    with gr.Accordion("예제:", open=False):
                        gr.Examples(
                            examples=[
                                os.path.join(example_root, example_name)
                                for example_name in os.listdir(example_root)
                                if example_name.endswith("pdf")
                            ],
                            inputs=file
                        )

            # Right panel: downloadable result plus markdown views
            with gr.Column(variant='panel', scale=5):
                output_file = gr.File(
                    label="변환 결과",
                    interactive=False,
                    elem_classes="output-file"
                )

                with gr.Tabs() as tabs:
                    with gr.Tab("마크다운 렌더링"):
                        md = gr.Markdown(
                            label="마크다운 렌더링",
                            height=1100,
                            show_copy_button=True,
                            latex_delimiters=latex_delimiters,
                            line_breaks=True,
                            elem_classes="markdown-output"
                        )

                    with gr.Tab("마크다운 텍스트"):
                        md_text = gr.TextArea(
                            lines=45,
                            show_copy_button=True,
                            elem_classes="markdown-text"
                        )

        # Event handlers
        # Whenever the uploaded file changes, show its PDF form in the preview.
        file.change(
            fn=to_pdf,
            inputs=file,
            outputs=pdf_show
        )

        # Convert button: run the pipeline and fill every output component.
        change_bu.click(
            fn=to_markdown,
            inputs=[
                file,
                max_pages,
                is_ocr,
                layout_mode,
                formula_enable,
                table_enable,
                language
            ],
            outputs=[
                md,
                md_text,
                output_file,
                pdf_show
            ],
            api_name=False
        )

        # Components that the clear button resets.
        clear_bu.add([file, md, pdf_show, md_text, output_file, is_ocr])

    # Launch the app
    demo.launch(ssr_mode=True)