openfree committed on
Commit
99d5f89
·
verified ·
1 Parent(s): b2059b3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +33 -77
app.py CHANGED
@@ -8,10 +8,8 @@ import re
8
  import uuid
9
  import pymupdf
10
 
11
- # os.system('pip install -U magic-pdf==0.10.5')
12
  os.system('pip uninstall -y magic-pdf')
13
  os.system('pip install git+https://github.com/opendatalab/MinerU.git@dev')
14
- # os.system('pip install git+https://github.com/myhloli/Magic-PDF.git@dev')
15
 
16
  os.system('wget https://github.com/opendatalab/MinerU/raw/dev/scripts/download_models_hf.py -O download_models_hf.py')
17
  os.system('python download_models_hf.py')
@@ -37,12 +35,10 @@ from magic_pdf.data.data_reader_writer import FileBasedDataReader
37
  from magic_pdf.libs.hash_utils import compute_sha256
38
  from magic_pdf.tools.common import do_parse, prepare_env
39
 
40
-
41
  def read_fn(path):
42
  disk_rw = FileBasedDataReader(os.path.dirname(path))
43
  return disk_rw.read(os.path.basename(path))
44
 
45
-
46
  def parse_pdf(doc_path, output_dir, end_page_id, is_ocr, layout_mode, formula_enable, table_enable, language):
47
  os.makedirs(output_dir, exist_ok=True)
48
 
@@ -72,78 +68,53 @@ def parse_pdf(doc_path, output_dir, end_page_id, is_ocr, layout_mode, formula_en
72
  except Exception as e:
73
  logger.exception(e)
74
 
75
-
76
  def compress_directory_to_zip(directory_path, output_zip_path):
77
- """
78
- 压缩指定目录到一个 ZIP 文件。
79
-
80
- :param directory_path: 要压缩的目录路径
81
- :param output_zip_path: 输出的 ZIP 文件路径
82
- """
83
  try:
84
  with zipfile.ZipFile(output_zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
85
-
86
- # 遍历目录中的所有文件和子目录
87
  for root, dirs, files in os.walk(directory_path):
88
  for file in files:
89
- # 构建完整的文件路径
90
  file_path = os.path.join(root, file)
91
- # 计算相对路径
92
  arcname = os.path.relpath(file_path, directory_path)
93
- # 添加文件到 ZIP 文件
94
  zipf.write(file_path, arcname)
95
  return 0
96
  except Exception as e:
97
  logger.exception(e)
98
  return -1
99
 
100
-
101
  def image_to_base64(image_path):
102
  with open(image_path, "rb") as image_file:
103
  return base64.b64encode(image_file.read()).decode('utf-8')
104
 
105
-
106
  def replace_image_with_base64(markdown_text, image_dir_path):
107
- # 匹配Markdown中的图片标签
108
  pattern = r'\!\[(?:[^\]]*)\]\(([^)]+)\)'
109
-
110
- # 替换图片链接
111
  def replace(match):
112
  relative_path = match.group(1)
113
  full_path = os.path.join(image_dir_path, relative_path)
114
  base64_image = image_to_base64(full_path)
115
  return f"![{relative_path}](data:image/jpeg;base64,{base64_image})"
116
-
117
- # 应用替换
118
  return re.sub(pattern, replace, markdown_text)
119
 
120
-
121
  def to_markdown(file_path, end_pages, is_ocr, layout_mode, formula_enable, table_enable, language):
122
  file_path = to_pdf(file_path)
123
  if end_pages > 20:
124
  end_pages = 20
125
- # 获取识别的md文件以及压缩包文件路径
126
  local_md_dir, file_name = parse_pdf(file_path, './output', end_pages - 1, is_ocr,
127
- layout_mode, formula_enable, table_enable, language)
128
  archive_zip_path = os.path.join("./output", compute_sha256(local_md_dir) + ".zip")
129
  zip_archive_success = compress_directory_to_zip(local_md_dir, archive_zip_path)
130
  if zip_archive_success == 0:
131
- logger.info("压缩成功")
132
  else:
133
- logger.error("压缩失败")
134
  md_path = os.path.join(local_md_dir, file_name + ".md")
135
  with open(md_path, 'r', encoding='utf-8') as f:
136
  txt_content = f.read()
137
  md_content = replace_image_with_base64(txt_content, local_md_dir)
138
- # 返回转换后的PDF路径
139
  new_pdf_path = os.path.join(local_md_dir, file_name + "_layout.pdf")
140
-
141
  return md_content, txt_content, archive_zip_path, new_pdf_path
142
 
143
-
144
  latex_delimiters = [{"left": "$$", "right": "$$", "display": True},
145
- {"left": '$', "right": '$', "display": False}]
146
-
147
 
148
  def init_model():
149
  from magic_pdf.model.doc_analyze_by_custom_model import ModelSingleton
@@ -158,93 +129,78 @@ def init_model():
158
  logger.exception(e)
159
  return -1
160
 
161
-
162
  model_init = init_model()
163
  logger.info(f"model_init: {model_init}")
164
 
165
-
166
- with open("header.html", "r") as file:
167
- header = file.read()
168
-
169
-
170
  latin_lang = [
171
- 'af', 'az', 'bs', 'cs', 'cy', 'da', 'de', 'es', 'et', 'fr', 'ga', 'hr',
172
- 'hu', 'id', 'is', 'it', 'ku', 'la', 'lt', 'lv', 'mi', 'ms', 'mt', 'nl',
173
- 'no', 'oc', 'pi', 'pl', 'pt', 'ro', 'rs_latin', 'sk', 'sl', 'sq', 'sv',
174
- 'sw', 'tl', 'tr', 'uz', 'vi', 'french', 'german'
175
  ]
176
  arabic_lang = ['ar', 'fa', 'ug', 'ur']
177
  cyrillic_lang = [
178
- 'ru', 'rs_cyrillic', 'be', 'bg', 'uk', 'mn', 'abq', 'ady', 'kbd', 'ava',
179
- 'dar', 'inh', 'che', 'lbe', 'lez', 'tab'
180
  ]
181
  devanagari_lang = [
182
- 'hi', 'mr', 'ne', 'bh', 'mai', 'ang', 'bho', 'mah', 'sck', 'new', 'gom',
183
- 'sa', 'bgc'
184
  ]
185
  other_lang = ['ch', 'en', 'korean', 'japan', 'chinese_cht', 'ta', 'te', 'ka']
186
 
187
  all_lang = ['', 'auto']
188
  all_lang.extend([*other_lang, *latin_lang, *arabic_lang, *cyrillic_lang, *devanagari_lang])
189
 
190
-
191
  def to_pdf(file_path):
192
  with pymupdf.open(file_path) as f:
193
  if f.is_pdf:
194
  return file_path
195
  else:
196
  pdf_bytes = f.convert_to_pdf()
197
- # 将pdfbytes 写入到uuid.pdf中
198
- # 生成唯一的文件名
199
  unique_filename = f"{uuid.uuid4()}.pdf"
200
-
201
- # 构建完整的文件路径
202
  tmp_file_path = os.path.join(os.path.dirname(file_path), unique_filename)
203
-
204
- # 将字节数据写入文件
205
  with open(tmp_file_path, 'wb') as tmp_pdf_file:
206
  tmp_pdf_file.write(pdf_bytes)
207
-
208
  return tmp_file_path
209
 
210
-
211
  if __name__ == "__main__":
212
- with gr.Blocks() as demo:
213
- gr.HTML(header)
214
  with gr.Row():
215
  with gr.Column(variant='panel', scale=5):
216
- file = gr.File(label="Please upload a PDF or image", file_types=[".pdf", ".png", ".jpeg", ".jpg"])
217
- max_pages = gr.Slider(1, 20, 10, step=1, label='Max convert pages')
218
  with gr.Row():
219
- layout_mode = gr.Dropdown(["layoutlmv3", "doclayout_yolo"], label="Layout model", value="doclayout_yolo")
220
- language = gr.Dropdown(all_lang, label="Language", value='auto')
221
  with gr.Row():
222
- formula_enable = gr.Checkbox(label="Enable formula recognition", value=True)
223
- is_ocr = gr.Checkbox(label="Force enable OCR", value=False)
224
- table_enable = gr.Checkbox(label="Enable table recognition(test)", value=True)
225
  with gr.Row():
226
- change_bu = gr.Button("Convert")
227
- clear_bu = gr.ClearButton(value="Clear")
228
- pdf_show = PDF(label='PDF preview', interactive=False, visible=True, height=800)
229
- with gr.Accordion("Examples:"):
230
  example_root = os.path.join(os.path.dirname(__file__), "examples")
231
  gr.Examples(
232
  examples=[os.path.join(example_root, _) for _ in os.listdir(example_root) if
233
- _.endswith("pdf")],
234
  inputs=file
235
  )
236
 
237
  with gr.Column(variant='panel', scale=5):
238
- output_file = gr.File(label="convert result", interactive=False)
239
  with gr.Tabs():
240
- with gr.Tab("Markdown rendering"):
241
- md = gr.Markdown(label="Markdown rendering", height=1100, show_copy_button=True,
242
- latex_delimiters=latex_delimiters, line_breaks=True)
243
- with gr.Tab("Markdown text"):
244
  md_text = gr.TextArea(lines=45, show_copy_button=True)
 
245
  file.change(fn=to_pdf, inputs=file, outputs=pdf_show)
246
  change_bu.click(fn=to_markdown, inputs=[file, max_pages, is_ocr, layout_mode, formula_enable, table_enable, language],
247
- outputs=[md, md_text, output_file, pdf_show], api_name=False)
248
  clear_bu.add([file, md, pdf_show, md_text, output_file, is_ocr])
249
 
250
- demo.launch(ssr_mode=True)
 
8
  import uuid
9
  import pymupdf
10
 
 
11
  os.system('pip uninstall -y magic-pdf')
12
  os.system('pip install git+https://github.com/opendatalab/MinerU.git@dev')
 
13
 
14
  os.system('wget https://github.com/opendatalab/MinerU/raw/dev/scripts/download_models_hf.py -O download_models_hf.py')
15
  os.system('python download_models_hf.py')
 
35
  from magic_pdf.libs.hash_utils import compute_sha256
36
  from magic_pdf.tools.common import do_parse, prepare_env
37
 
 
38
  def read_fn(path):
39
  disk_rw = FileBasedDataReader(os.path.dirname(path))
40
  return disk_rw.read(os.path.basename(path))
41
 
 
42
  def parse_pdf(doc_path, output_dir, end_page_id, is_ocr, layout_mode, formula_enable, table_enable, language):
43
  os.makedirs(output_dir, exist_ok=True)
44
 
 
68
  except Exception as e:
69
  logger.exception(e)
70
 
 
71
  def compress_directory_to_zip(directory_path, output_zip_path):
 
 
 
 
 
 
72
  try:
73
  with zipfile.ZipFile(output_zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
 
 
74
  for root, dirs, files in os.walk(directory_path):
75
  for file in files:
 
76
  file_path = os.path.join(root, file)
 
77
  arcname = os.path.relpath(file_path, directory_path)
 
78
  zipf.write(file_path, arcname)
79
  return 0
80
  except Exception as e:
81
  logger.exception(e)
82
  return -1
83
 
 
84
  def image_to_base64(image_path):
85
  with open(image_path, "rb") as image_file:
86
  return base64.b64encode(image_file.read()).decode('utf-8')
87
 
 
88
  def replace_image_with_base64(markdown_text, image_dir_path):
 
89
  pattern = r'\!\[(?:[^\]]*)\]\(([^)]+)\)'
 
 
90
  def replace(match):
91
  relative_path = match.group(1)
92
  full_path = os.path.join(image_dir_path, relative_path)
93
  base64_image = image_to_base64(full_path)
94
  return f"![{relative_path}](data:image/jpeg;base64,{base64_image})"
 
 
95
  return re.sub(pattern, replace, markdown_text)
96
 
 
97
  def to_markdown(file_path, end_pages, is_ocr, layout_mode, formula_enable, table_enable, language):
98
  file_path = to_pdf(file_path)
99
  if end_pages > 20:
100
  end_pages = 20
 
101
  local_md_dir, file_name = parse_pdf(file_path, './output', end_pages - 1, is_ocr,
102
+ layout_mode, formula_enable, table_enable, language)
103
  archive_zip_path = os.path.join("./output", compute_sha256(local_md_dir) + ".zip")
104
  zip_archive_success = compress_directory_to_zip(local_md_dir, archive_zip_path)
105
  if zip_archive_success == 0:
106
+ logger.info("압축 성공")
107
  else:
108
+ logger.error("압축 실패")
109
  md_path = os.path.join(local_md_dir, file_name + ".md")
110
  with open(md_path, 'r', encoding='utf-8') as f:
111
  txt_content = f.read()
112
  md_content = replace_image_with_base64(txt_content, local_md_dir)
 
113
  new_pdf_path = os.path.join(local_md_dir, file_name + "_layout.pdf")
 
114
  return md_content, txt_content, archive_zip_path, new_pdf_path
115
 
 
116
  latex_delimiters = [{"left": "$$", "right": "$$", "display": True},
117
+ {"left": '$', "right": '$', "display": False}]
 
118
 
119
  def init_model():
120
  from magic_pdf.model.doc_analyze_by_custom_model import ModelSingleton
 
129
  logger.exception(e)
130
  return -1
131
 
 
132
  model_init = init_model()
133
  logger.info(f"model_init: {model_init}")
134
 
 
 
 
 
 
135
  latin_lang = [
136
+ 'af', 'az', 'bs', 'cs', 'cy', 'da', 'de', 'es', 'et', 'fr', 'ga', 'hr',
137
+ 'hu', 'id', 'is', 'it', 'ku', 'la', 'lt', 'lv', 'mi', 'ms', 'mt', 'nl',
138
+ 'no', 'oc', 'pi', 'pl', 'pt', 'ro', 'rs_latin', 'sk', 'sl', 'sq', 'sv',
139
+ 'sw', 'tl', 'tr', 'uz', 'vi', 'french', 'german'
140
  ]
141
  arabic_lang = ['ar', 'fa', 'ug', 'ur']
142
  cyrillic_lang = [
143
+ 'ru', 'rs_cyrillic', 'be', 'bg', 'uk', 'mn', 'abq', 'ady', 'kbd', 'ava',
144
+ 'dar', 'inh', 'che', 'lbe', 'lez', 'tab'
145
  ]
146
  devanagari_lang = [
147
+ 'hi', 'mr', 'ne', 'bh', 'mai', 'ang', 'bho', 'mah', 'sck', 'new', 'gom',
148
+ 'sa', 'bgc'
149
  ]
150
  other_lang = ['ch', 'en', 'korean', 'japan', 'chinese_cht', 'ta', 'te', 'ka']
151
 
152
  all_lang = ['', 'auto']
153
  all_lang.extend([*other_lang, *latin_lang, *arabic_lang, *cyrillic_lang, *devanagari_lang])
154
 
 
155
  def to_pdf(file_path):
156
  with pymupdf.open(file_path) as f:
157
  if f.is_pdf:
158
  return file_path
159
  else:
160
  pdf_bytes = f.convert_to_pdf()
 
 
161
  unique_filename = f"{uuid.uuid4()}.pdf"
 
 
162
  tmp_file_path = os.path.join(os.path.dirname(file_path), unique_filename)
 
 
163
  with open(tmp_file_path, 'wb') as tmp_pdf_file:
164
  tmp_pdf_file.write(pdf_bytes)
 
165
  return tmp_file_path
166
 
 
167
  if __name__ == "__main__":
168
+ with gr.Blocks(title="OCR FLEX") as demo:
 
169
  with gr.Row():
170
  with gr.Column(variant='panel', scale=5):
171
+ file = gr.File(label="PDF 또는 이미지 파일을 업로드하세요", file_types=[".pdf", ".png", ".jpeg", ".jpg"])
172
+ max_pages = gr.Slider(1, 20, 10, step=1, label='최대 변환 페이지 수')
173
  with gr.Row():
174
+ layout_mode = gr.Dropdown(["layoutlmv3", "doclayout_yolo"], label="레이아웃 모델", value="doclayout_yolo")
175
+ language = gr.Dropdown(all_lang, label="언어", value='auto')
176
  with gr.Row():
177
+ formula_enable = gr.Checkbox(label="수식 인식 활성화", value=True)
178
+ is_ocr = gr.Checkbox(label="OCR 강제 활성화", value=False)
179
+ table_enable = gr.Checkbox(label=" 인식 활성화(테스트)", value=True)
180
  with gr.Row():
181
+ change_bu = gr.Button("변환")
182
+ clear_bu = gr.ClearButton(value="초기화")
183
+ pdf_show = PDF(label='PDF 미리보기', interactive=False, visible=True, height=800)
184
+ with gr.Accordion("예제:"):
185
  example_root = os.path.join(os.path.dirname(__file__), "examples")
186
  gr.Examples(
187
  examples=[os.path.join(example_root, _) for _ in os.listdir(example_root) if
188
+ _.endswith("pdf")],
189
  inputs=file
190
  )
191
 
192
  with gr.Column(variant='panel', scale=5):
193
+ output_file = gr.File(label="변환 결과", interactive=False)
194
  with gr.Tabs():
195
+ with gr.Tab("마크다운 렌더링"):
196
+ md = gr.Markdown(label="마크다운 렌더링", height=1100, show_copy_button=True,
197
+ latex_delimiters=latex_delimiters, line_breaks=True)
198
+ with gr.Tab("마크다운 텍스트"):
199
  md_text = gr.TextArea(lines=45, show_copy_button=True)
200
+
201
  file.change(fn=to_pdf, inputs=file, outputs=pdf_show)
202
  change_bu.click(fn=to_markdown, inputs=[file, max_pages, is_ocr, layout_mode, formula_enable, table_enable, language],
203
+ outputs=[md, md_text, output_file, pdf_show], api_name=False)
204
  clear_bu.add([file, md, pdf_show, md_text, output_file, is_ocr])
205
 
206
+ demo.launch(ssr_mode=True)