Spaces:
Runtime error
Runtime error
Reorganized the project into a module called document_processing, executed through a single entry file, pdf_to_embeddings.py
824bd32
from docling.document_converter import DocumentConverter, PdfFormatOption | |
from docling.datamodel.base_models import InputFormat | |
from docling.datamodel.pipeline_options import PdfPipelineOptions | |
from document_processing.utilities import get_file_name_without_ext | |
def document_converter():
    """Build a docling ``DocumentConverter`` configured for PDF input.

    The PDF pipeline options are created with OCR switched off and
    table-structure detection switched on, then registered under
    ``InputFormat.PDF``.

    Returns:
        A newly constructed ``DocumentConverter`` using those PDF options.
    """
    pdf_options = PdfPipelineOptions()
    pdf_options.do_ocr = False
    pdf_options.do_table_structure = True

    # Wrap the pipeline options in the per-format option object first so the
    # converter construction below stays on a single line.
    pdf_format = PdfFormatOption(pipeline_options=pdf_options)
    return DocumentConverter(format_options={InputFormat.PDF: pdf_format})
def process_pdf(pdf_path):
    """Convert a PDF using a freshly built converter.

    Args:
        pdf_path: The PDF to convert; forwarded unchanged to the
            converter's ``convert`` method.

    Returns:
        Whatever ``convert`` returns for ``pdf_path``.
    """
    # A new converter is built on every call, exactly as before.
    return document_converter().convert(pdf_path)
def process_pdf_file(pdf_path):
    """Convert a single PDF file and return the conversion result.

    This is a thin wrapper around ``process_pdf``. The file's base name
    used to be computed here via ``get_file_name_without_ext`` but was never
    used, so that dead assignment has been removed.

    Args:
        pdf_path: The PDF to convert; forwarded unchanged to ``process_pdf``.

    Returns:
        The value returned by ``process_pdf(pdf_path)``.
    """
    return process_pdf(pdf_path)