Spaces:
Paused
Paused
exchange math delimiters
Browse files
+ markdown specific postprocessing
app.py
CHANGED
|
@@ -3,6 +3,7 @@ import subprocess
|
|
| 3 |
import uuid
|
| 4 |
import os
|
| 5 |
import requests
|
|
|
|
| 6 |
|
| 7 |
|
| 8 |
def get_pdf(pdf_link):
|
|
@@ -31,7 +32,8 @@ def nougat_ocr(file_name):
|
|
| 31 |
#'--out', unique_filename,
|
| 32 |
'--out', 'output',
|
| 33 |
'pdf', f'{file_name}',
|
| 34 |
-
'--checkpoint', 'nougat'
|
|
|
|
| 35 |
]
|
| 36 |
|
| 37 |
# Run the command and capture its output
|
|
@@ -64,6 +66,8 @@ def predict(pdf_file, pdf_link):
|
|
| 64 |
file_name = file_name.split('/')[-1][:-4]
|
| 65 |
with open(f'output/{file_name}.mmd', 'r') as file:
|
| 66 |
content = file.read()
|
|
|
|
|
|
|
| 67 |
return content
|
| 68 |
|
| 69 |
|
|
@@ -76,7 +80,8 @@ def nougat_ocr1(file_name):
|
|
| 76 |
'nougat',
|
| 77 |
'--out', 'output',
|
| 78 |
'pdf', f'{file_name}',
|
| 79 |
-
'--checkpoint', 'nougat'
|
|
|
|
| 80 |
]
|
| 81 |
|
| 82 |
# Run the command and get .mmd file in an output folder
|
|
|
|
| 3 |
import uuid
|
| 4 |
import os
|
| 5 |
import requests
|
| 6 |
+
import re
|
| 7 |
|
| 8 |
|
| 9 |
def get_pdf(pdf_link):
|
|
|
|
| 32 |
#'--out', unique_filename,
|
| 33 |
'--out', 'output',
|
| 34 |
'pdf', f'{file_name}',
|
| 35 |
+
'--checkpoint', 'nougat',
|
| 36 |
+
'--markdown'
|
| 37 |
]
|
| 38 |
|
| 39 |
# Run the command and capture its output
|
|
|
|
| 66 |
file_name = file_name.split('/')[-1][:-4]
|
| 67 |
with open(f'output/{file_name}.mmd', 'r') as file:
|
| 68 |
content = file.read()
|
| 69 |
+
# switch math delimiters
|
| 70 |
+
content = content.replace(r'\(', '$').replace(r'\)', '$').replace(r'\[', '$$').replace(r'\]', '$$')
|
| 71 |
return content
|
| 72 |
|
| 73 |
|
|
|
|
| 80 |
'nougat',
|
| 81 |
'--out', 'output',
|
| 82 |
'pdf', f'{file_name}',
|
| 83 |
+
'--checkpoint', 'nougat',
|
| 84 |
+
'--markdown'
|
| 85 |
]
|
| 86 |
|
| 87 |
# Run the command and get .mmd file in an output folder
|