Spaces:
Running
Running
format
Browse files- app.py +6 -1
- pdf2text.py +7 -8
app.py
CHANGED
|
@@ -95,7 +95,12 @@ if __name__ == "__main__":
|
|
| 95 |
logging.info(f"Using GPU status: {use_GPU}")
|
| 96 |
logging.info("Loading OCR model")
|
| 97 |
with contextlib.redirect_stdout(None):
|
| 98 |
-
ocr_model = ocr_predictor(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 99 |
|
| 100 |
# define pdf bytes as None
|
| 101 |
pdf_obj = _here / "example_file.pdf"
|
|
|
|
| 95 |
logging.info(f"Using GPU status: {use_GPU}")
|
| 96 |
logging.info("Loading OCR model")
|
| 97 |
with contextlib.redirect_stdout(None):
|
| 98 |
+
ocr_model = ocr_predictor(
|
| 99 |
+
"db_resnet50",
|
| 100 |
+
"crnn_mobilenet_v3_large",
|
| 101 |
+
pretrained=True,
|
| 102 |
+
assume_straight_pages=True,
|
| 103 |
+
)
|
| 104 |
|
| 105 |
# define pdf bytes as None
|
| 106 |
pdf_obj = _here / "example_file.pdf"
|
pdf2text.py
CHANGED
|
@@ -32,6 +32,8 @@ from tqdm.auto import tqdm
|
|
| 32 |
|
| 33 |
from doctr.io import DocumentFile
|
| 34 |
from doctr.models import ocr_predictor
|
|
|
|
|
|
|
| 35 |
def fast_scandir(dirname):
|
| 36 |
# return all subfolders in a given filepath
|
| 37 |
|
|
@@ -421,7 +423,6 @@ def download_URL(url: str, file=None, dlpath=None, verbose=False):
|
|
| 421 |
"""
|
| 422 |
|
| 423 |
|
| 424 |
-
|
| 425 |
# need to run only once to load model into memory
|
| 426 |
|
| 427 |
custom_replace_list = {
|
|
@@ -554,6 +555,7 @@ def postprocess(text: str) -> str:
|
|
| 554 |
|
| 555 |
return eval_and_replace(proc)
|
| 556 |
|
|
|
|
| 557 |
def result2text(result) -> str:
|
| 558 |
"""Convert OCR result to text"""
|
| 559 |
|
|
@@ -568,11 +570,10 @@ def result2text(result) -> str:
|
|
| 568 |
text += word.value + " "
|
| 569 |
full_doc.append(text)
|
| 570 |
|
| 571 |
-
|
| 572 |
-
|
| 573 |
full_text = "\n".join(full_doc)
|
| 574 |
return full_text
|
| 575 |
|
|
|
|
| 576 |
import warnings
|
| 577 |
from datetime import date
|
| 578 |
from os.path import join
|
|
@@ -593,7 +594,9 @@ def convert_PDF_to_Text(
|
|
| 593 |
doc = DocumentFile.from_pdf(PDF_file)
|
| 594 |
|
| 595 |
if len(doc) > max_pages:
|
| 596 |
-
logging.warning(
|
|
|
|
|
|
|
| 597 |
doc = doc[:max_pages]
|
| 598 |
|
| 599 |
# Analyze
|
|
@@ -603,14 +606,10 @@ def convert_PDF_to_Text(
|
|
| 603 |
proc_text = format_ocr_out(raw_text)
|
| 604 |
output_text = postprocess(proc_text)
|
| 605 |
|
| 606 |
-
|
| 607 |
fn_rt = time.perf_counter() - st
|
| 608 |
|
| 609 |
-
|
| 610 |
-
|
| 611 |
logging.info("OCR complete")
|
| 612 |
|
| 613 |
-
|
| 614 |
results_dict = {
|
| 615 |
"num_pages": len(doc),
|
| 616 |
"runtime": round(fn_rt, 2),
|
|
|
|
| 32 |
|
| 33 |
from doctr.io import DocumentFile
|
| 34 |
from doctr.models import ocr_predictor
|
| 35 |
+
|
| 36 |
+
|
| 37 |
def fast_scandir(dirname):
|
| 38 |
# return all subfolders in a given filepath
|
| 39 |
|
|
|
|
| 423 |
"""
|
| 424 |
|
| 425 |
|
|
|
|
| 426 |
# need to run only once to load model into memory
|
| 427 |
|
| 428 |
custom_replace_list = {
|
|
|
|
| 555 |
|
| 556 |
return eval_and_replace(proc)
|
| 557 |
|
| 558 |
+
|
| 559 |
def result2text(result) -> str:
|
| 560 |
"""Convert OCR result to text"""
|
| 561 |
|
|
|
|
| 570 |
text += word.value + " "
|
| 571 |
full_doc.append(text)
|
| 572 |
|
|
|
|
|
|
|
| 573 |
full_text = "\n".join(full_doc)
|
| 574 |
return full_text
|
| 575 |
|
| 576 |
+
|
| 577 |
import warnings
|
| 578 |
from datetime import date
|
| 579 |
from os.path import join
|
|
|
|
| 594 |
doc = DocumentFile.from_pdf(PDF_file)
|
| 595 |
|
| 596 |
if len(doc) > max_pages:
|
| 597 |
+
logging.warning(
|
| 598 |
+
f"PDF has {len(doc)} pages, which is more than {max_pages}.. truncating"
|
| 599 |
+
)
|
| 600 |
doc = doc[:max_pages]
|
| 601 |
|
| 602 |
# Analyze
|
|
|
|
| 606 |
proc_text = format_ocr_out(raw_text)
|
| 607 |
output_text = postprocess(proc_text)
|
| 608 |
|
|
|
|
| 609 |
fn_rt = time.perf_counter() - st
|
| 610 |
|
|
|
|
|
|
|
| 611 |
logging.info("OCR complete")
|
| 612 |
|
|
|
|
| 613 |
results_dict = {
|
| 614 |
"num_pages": len(doc),
|
| 615 |
"runtime": round(fn_rt, 2),
|