drakosfire's picture
Reorganized the project into a module called document_processing, executed through a single entry file, pdf_to_embeddings.py
824bd32
raw
history blame contribute delete
865 Bytes
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from document_processing.utilities import get_file_name_without_ext
def document_converter():
    """Build a docling ``DocumentConverter`` configured for PDF input.

    The PDF pipeline options are created with OCR switched off and
    table-structure processing switched on.

    Returns:
        A ``DocumentConverter`` whose ``format_options`` maps
        ``InputFormat.PDF`` to a ``PdfFormatOption`` carrying those
        pipeline options.
    """
    pdf_options = PdfPipelineOptions()
    pdf_options.do_ocr = False
    pdf_options.do_table_structure = True

    pdf_format = PdfFormatOption(pipeline_options=pdf_options)
    return DocumentConverter(format_options={InputFormat.PDF: pdf_format})
def process_pdf(pdf_path):
    """Convert the PDF at *pdf_path* using a freshly built converter.

    Args:
        pdf_path: Source handed unchanged to the converter's ``convert``
            method.

    Returns:
        Whatever ``convert`` returns for that input.
    """
    # A new converter is created on every call via document_converter().
    return document_converter().convert(pdf_path)
def process_pdf_file(pdf_path):
    """Run ``process_pdf`` on *pdf_path* and return its result.

    Args:
        pdf_path: Path of the PDF; passed both to
            ``get_file_name_without_ext`` and to ``process_pdf``.

    Returns:
        The value returned by ``process_pdf(pdf_path)``.
    """
    # NOTE(review): the helper's result (presumably the file name without its
    # extension) is never used below. The call is kept in case the helper
    # validates the path or raises -- confirm before removing.
    _unused_name = get_file_name_without_ext(pdf_path)
    return process_pdf(pdf_path)