"""Gradio app exposing document-extraction and PDF-conversion helpers as tabs."""

import gradio as gr
import warnings
from typing import List
from pdfitdown.pdfconversion import convert_to_pdf, convert_markdown_to_pdf
from base_utils import (
    convert_pdf_to_image,
    extract_text_from_pdf,
    convert_doc_to_text,
    extract_text_from_docx,
    extract_text_from_ppt,
    extract_text_from_pptx,
    sanitize_list_of_lists,
    parse_url,
)

# --- Single-purpose interfaces: one helper from base_utils per tab ----------

pdf_to_img = gr.Interface(
    convert_pdf_to_image, gr.File(), gr.Gallery(), api_name="pdf_to_img"
)

pdf_to_text = gr.Interface(
    extract_text_from_pdf,
    gr.File(),
    gr.Textbox(placeholder="Extracted text will appear here"),
    api_name="pdf_to_text",
)

doc_to_text = gr.Interface(
    convert_doc_to_text, gr.File(), gr.Textbox(), api_name="doc_to_text"
)

docx_to_text = gr.Interface(
    extract_text_from_docx, gr.File(), gr.Textbox(), api_name="docx_to_text"
)

ppt_to_text = gr.Interface(
    extract_text_from_ppt,
    gr.File(),
    gr.Textbox(),
    api_name="ppt_to_text",
)

pptx_to_text = gr.Interface(
    extract_text_from_pptx,
    gr.File(),
    gr.Textbox(),
    api_name="pptx_to_text",
)

# Example input for the "Extract Json" tab. Assembled from adjacent literals
# purely for readability; the resulting text (including the embedded line
# break inside the "Multiple Choice" question) is what the UI pre-fills.
_QA_PAIRS_EXAMPLE = (
    '[ ["What year was the Carthaginian Empire founded?", "Around 814 BCE"], '
    '["Where was the center of the Carthaginian Empire located?", '
    '"Carthage, near present-day Tunis, Tunisia"], '
    '["Which powerful ancient republic did Carthage have conflicts with?", '
    '"The Roman Republic"], '
    '["Fill in the blank: Hannibal famously crossed the ________ with war elephants.", '
    '"Alps"], '
    '["What were the series of conflicts between Carthage and Rome called?", '
    '"The Punic Wars"], '
    '["Multiple Choice: What was a significant military advantage of Carthage? \n'
    'A) Strong infantry, B) Powerful navy, C) Fortified cities", "B) Powerful navy"], '
    '["In what year was Carthage captured and destroyed by Rome?", "146 BCE"], '
    '["What did Carthage excel in that allowed it to amass wealth?", "Maritime trade"] ]'
)

str_to_json = gr.Interface(
    sanitize_list_of_lists,
    gr.Text(),
    gr.JSON(),
    api_name="str_to_json",
    examples=[_QA_PAIRS_EXAMPLE],
)

url_parser = gr.Interface(
    parse_url,
    inputs=["text"],
    outputs=["text"],
    api_name="url_to_text",
)


class FileNotConvertedWarning(Warning):
    """The file was not in one of the specified formats for conversion to PDF"""


# Source extension -> converter. Every converter is called as
# converter(input_path, output_path, title). ".pdf" is absent on purpose:
# PDFs are passed through untouched by to_pdf.
_PDF_CONVERTERS = {
    ".docx": convert_to_pdf,
    ".html": convert_to_pdf,
    ".pptx": convert_to_pdf,
    ".csv": convert_to_pdf,
    ".xml": convert_to_pdf,
    ".md": convert_markdown_to_pdf,
}


def to_pdf(files: List[str]) -> List[str]:
    """Convert each supported file to PDF and return the resulting paths.

    Files already ending in ".pdf" are returned unchanged. Files whose
    extension has no registered converter are skipped and reported through a
    ``FileNotConvertedWarning``; they do not appear in the result.

    Args:
        files: Paths of the files to convert.

    Returns:
        One entry per accepted input, in input order: the original path for
        PDFs, otherwise whatever the converter returned for that file.
    """
    pdfs = []
    for f in files:
        if f.endswith(".pdf"):
            pdfs.append(f)
            continue

        # Extension matching stays a case-sensitive suffix test.
        ext = next((e for e in _PDF_CONVERTERS if f.endswith(e)), None)
        if ext is None:
            warnings.warn(
                f"File {f} was not converted to PDF because its file format is not included in those that can be converted",
                FileNotConvertedWarning,
            )
            continue

        # Strip only the trailing extension. str.replace would also rewrite
        # the same text inside directory names, and splitting on the first
        # "." would truncate any path that contains another dot.
        stem = f[: -len(ext)]
        newfile = stem + ".pdf"
        pdfs.append(_PDF_CONVERTERS[ext](f, newfile, stem))
    return pdfs


def convert(file: str) -> List[str]:
    """Gradio handler: convert one uploaded file and return the PDF path list.

    Args:
        file: Path of the uploaded file.

    Returns:
        The list produced by ``to_pdf`` for that single file (empty when the
        format is unsupported).
    """
    return to_pdf([file])


pdf_converter = gr.Interface(
    fn=convert,
    inputs=gr.File(label="Upload your file"),
    outputs=gr.File(label="Converted PDF"),
    title="File to PDF Converter",
    # The line break is written as an escape: a plain quoted string cannot
    # span physical lines.
    description="Upload a file in .docx, .pdf, .html, .pptx, .csv, .xml, or .md format, and get it \nconverted to PDF.",
    api_name="convert_to_pdf",
)

# Tab order and tab titles are positional pairs: keep both lists aligned.
demo = gr.TabbedInterface(
    [
        pdf_to_img,
        pdf_to_text,
        doc_to_text,
        docx_to_text,
        ppt_to_text,
        pptx_to_text,
        url_parser,
        str_to_json,
        pdf_converter,
    ],
    [
        "PDF to Image",
        "Extract PDF Text",
        "Extract DOC Text",
        "Extract DOCX Text",
        "Extract PPT Text",
        "Extract PPTX Text",
        "Extract text from URL",
        "Extract Json",
        "Convert to PDF",
    ],
)

# Bind on all interfaces. The address must be exactly "0.0.0.0"; a trailing
# dot is not a valid IPv4 literal.
demo.launch(server_name="0.0.0.0", server_port=7860, debug=True)