Spaces:
Runtime error
Runtime error
Commit
·
26072cc
1
Parent(s):
5a9a58b
Update app.py
Browse files
app.py
CHANGED
@@ -1,7 +1,6 @@
|
|
1 |
import gradio as gr
|
2 |
import tempfile
|
3 |
import re
|
4 |
-
from PyPDF2 import PdfReader, PdfFileReader
|
5 |
import os
|
6 |
import spacy
|
7 |
import pytesseract
|
@@ -12,6 +11,9 @@ from pdf2image.exceptions import (
|
|
12 |
PDFPageCountError,
|
13 |
PDFSyntaxError
|
14 |
)
|
|
|
|
|
|
|
15 |
|
16 |
def clean_text(text):
|
17 |
nlp = spacy.load("en_core_web_sm", disable=["tagger", "parser", "ner", "textcat"])
|
@@ -25,30 +27,29 @@ def image_to_latex(image):
|
|
25 |
result = subprocess.run(["pix2tex", image_path], capture_output=True, text=True)
|
26 |
return result.stdout
|
27 |
|
28 |
-
|
29 |
def pdf_to_text(file):
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
|
|
|
|
44 |
base_name = os.path.splitext(os.path.basename(file.name))[0]
|
45 |
output_file_name = base_name + ".txt"
|
46 |
with open(output_file_name, 'w') as f:
|
47 |
f.write(full_text)
|
48 |
return output_file_name, page_number
|
49 |
|
50 |
-
|
51 |
-
|
52 |
iface = gr.Interface(fn=pdf_to_text,
|
53 |
inputs=gr.inputs.File(label="Your PDF"),
|
54 |
outputs=gr.outputs.File(label="Download TXT"),
|
|
|
1 |
import gradio as gr
|
2 |
import tempfile
|
3 |
import re
|
|
|
4 |
import os
|
5 |
import spacy
|
6 |
import pytesseract
|
|
|
11 |
PDFPageCountError,
|
12 |
PDFSyntaxError
|
13 |
)
|
14 |
+
import fitz # PyMuPDF
|
15 |
+
from PIL import Image
|
16 |
+
import io
|
17 |
|
18 |
def clean_text(text):
|
19 |
nlp = spacy.load("en_core_web_sm", disable=["tagger", "parser", "ner", "textcat"])
|
|
|
27 |
result = subprocess.run(["pix2tex", image_path], capture_output=True, text=True)
|
28 |
return result.stdout
|
29 |
|
|
|
30 |
def pdf_to_text(file):
    """Extract the text of a PDF into a ``.txt`` file in the working directory.

    For every page the embedded text is read, the output of
    ``image_to_latex`` for each embedded image is appended, and the result
    is passed through ``clean_text``.  Pages whose cleaned text has more
    than five whitespace-separated words are written to the output file,
    each preceded by a ``## Metadata: Page Number N`` header line.

    Args:
        file: Uploaded-file object; only its ``name`` attribute (the path
            of the PDF on disk) is used.

    Returns:
        A ``(output_file_name, page_number)`` tuple.  ``output_file_name``
        is the PDF's base name with a ``.txt`` extension; ``page_number``
        is the 1-based number of the last page that was kept, or ``0``
        when no page had enough text.
    """
    # Pre-initialised so a PDF without any qualifying page no longer raises
    # UnboundLocalError at the return statement.
    page_number = 0
    sections = []

    doc = fitz.open(file.name)
    try:
        for index, page in enumerate(doc):
            # Prefer the snake_case PyMuPDF API and fall back to the old
            # camelCase aliases, which newer releases no longer provide.
            get_text = getattr(page, "get_text", None) or page.getText
            get_images = getattr(page, "get_images", None) or page.getImageList

            page_text = get_text()
            for image_info in get_images():
                xref = image_info[0]  # first entry of an image record is its xref
                base_image = doc.extract_image(xref)
                picture = Image.open(io.BytesIO(base_image["image"]))
                page_text += image_to_latex(picture)

            page_text = clean_text(page_text)
            if len(page_text.split()) > 5:
                page_number = index + 1
                header = "## Metadata: Page Number " + str(page_number) + "\n"
                sections.append(header + page_text + "\n\n")
    finally:
        # Release the document handle even if extraction fails part-way.
        doc.close()

    full_text = "".join(sections)
    base_name = os.path.splitext(os.path.basename(file.name))[0]
    output_file_name = base_name + ".txt"
    # Explicit encoding: extracted text may contain non-ASCII characters
    # that the platform default codec cannot represent.
    with open(output_file_name, "w", encoding="utf-8") as f:
        f.write(full_text)
    return output_file_name, page_number
|
52 |
|
|
|
|
|
53 |
iface = gr.Interface(fn=pdf_to_text,
|
54 |
inputs=gr.inputs.File(label="Your PDF"),
|
55 |
outputs=gr.outputs.File(label="Download TXT"),
|