# Page header captured together with this file: "Spaces: Runtime error".
import os
import shutil
import zipfile

import gradio as gr
from pdf2image import convert_from_path, pdfinfo_from_path
def zip_folder(folder_path, output_path):
    """Write every file found under ``folder_path`` into a new zip archive.

    The archive is created (or overwritten) at ``output_path`` using deflate
    compression. Each entry is stored under its path relative to
    ``folder_path``; only files are written, directories are not added as
    separate entries.
    """
    with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as archive:
        for current_dir, _subdirs, names in os.walk(folder_path):
            for name in names:
                source = os.path.join(current_dir, name)
                entry_name = os.path.relpath(source, folder_path)
                archive.write(source, entry_name)
# Working folders used by the app: page images are saved into DIRECTORY and
# the zip archive of those images is written into DIRECTORY_OUTPUT.
DIRECTORY = "image_reference"
DIRECTORY_OUTPUT = "output"
DIRECTORIES = [DIRECTORY, DIRECTORY_OUTPUT]

# Create the working directories at import time. exist_ok=True makes the call
# a no-op for directories that already exist, so no separate existence check
# (and no gap between the check and the creation) is needed.
for directory in DIRECTORIES:
    os.makedirs(directory, exist_ok=True)
# File-name suffixes (compared case-insensitively) that count as images.
ALLOWED_EXTENSIONS = ['.jpg', '.jpeg', '.png', '.gif']


def get_image_files(directory):
    """Return the paths of the entries in ``directory`` that have an image suffix.

    An entry qualifies when its lower-cased name ends with one of
    ALLOWED_EXTENSIONS. Each returned path is ``directory`` joined with the
    entry name, in the order ``os.listdir`` reports the entries.
    """
    suffixes = tuple(ALLOWED_EXTENSIONS)
    return [
        os.path.join(directory, entry)
        for entry in os.listdir(directory)
        if entry.lower().endswith(suffixes)
    ]
def clear_directory(directory):
    """Remove everything inside ``directory`` while keeping the directory itself.

    Files and symbolic links are unlinked; sub-directories are removed
    together with their contents. Deletion is best-effort: a failure on one
    entry is printed and the remaining entries are still processed.

    Args:
        directory: Path of the directory to empty.

    Raises:
        OSError: If ``directory`` itself cannot be listed (for example, when
            it does not exist).
    """
    for filename in os.listdir(directory):
        file_path = os.path.join(directory, filename)
        try:
            # Links are tested before directories so that a link pointing at a
            # directory is unlinked rather than followed.
            if os.path.isfile(file_path) or os.path.islink(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                # os.rmdir only removes empty directories; rmtree also removes
                # directories that still have content.
                shutil.rmtree(file_path)
        except OSError as e:
            print(f"Failed to delete {file_path}. Reason: {e}")
def _page_batches(total_pages, batch_size):
    """Yield 1-based, inclusive (first_page, last_page) ranges covering all pages."""
    for first_page in range(1, total_pages + 1, batch_size):
        yield first_page, min(first_page + batch_size - 1, total_pages)


def extract_photos_from_pdf(file_pdf):
    """Render every page of a PDF to a PNG and bundle the images into a zip.

    Both working directories are emptied first. Each page is then saved as
    ``<page number>.png`` in DIRECTORY, and the whole folder is zipped to
    ``all_photos.zip`` in DIRECTORY_OUTPUT.

    Args:
        file_pdf: Object whose ``name`` attribute is the path of the PDF
            (this function is wired to the ``file_pdf`` upload component).

    Returns:
        A 2-tuple of component updates: a gallery update listing the page
        images in page order, and a file update pointing at the zip. If
        anything fails, the gallery is emptied and labelled "Error" and the
        file component is hidden.
    """
    clear_directory(DIRECTORY)
    clear_directory(DIRECTORY_OUTPUT)
    try:
        pdf_path = file_pdf.name
        info = pdfinfo_from_path(pdf_path, userpw=None, poppler_path=None)
        total_pages = info["Pages"]  # Total number of pages in the PDF
        batch_size = 100  # Number of pages rendered per convert_from_path call

        # pdf2image page numbers are 1-based and both bounds are inclusive, so
        # consecutive batches must be 1-100, 101-200, ... Starting batches at
        # 0, 100, 200, ... would render each boundary page twice and save all
        # later pages under the wrong number.
        for first_page, last_page in _page_batches(total_pages, batch_size):
            images = convert_from_path(pdf_path, first_page=first_page, last_page=last_page)
            for page_number, image in enumerate(images, start=first_page):
                image.save(f'{DIRECTORY}/{page_number}.png', 'PNG')

        images_pdf_list = get_image_files(DIRECTORY)
        image_names = [(path, os.path.basename(path)) for path in images_pdf_list]
        # File names are "<page number>.png"; sort numerically so that page 10
        # comes after page 9 rather than after page 1.
        sorted_names = sorted(image_names, key=lambda x: int(x[1].split('.')[0]))

        zip_path = f'{DIRECTORY_OUTPUT}/all_photos.zip'
        zip_folder(DIRECTORY, zip_path)

        page_count = len(images_pdf_list)
        return (
            gr.Gallery.update(
                value=sorted_names,
                label=f"Detected {page_count} Page{'' if page_count == 1 else 's'}",
                show_label=True,
                visible=True,
            ),
            gr.File.update(value=zip_path, visible=True),
        )
    except Exception as e:
        # Handler boundary: report the cause instead of discarding it, then
        # show the error state in the UI.
        print(f"Failed to extract photos from PDF. Reason: {e}")
        return (
            gr.Gallery.update(value=[], label="Error", show_label=True, visible=True),
            gr.File.update(visible=False),
        )
# NOTE(review): the nesting below was reconstructed from a copy of this file
# whose indentation had been lost -- confirm the layout against the real app.
# NOTE(review): `.style()` on components looks like Gradio 3 API -- confirm
# the pinned Gradio version still provides it.
with gr.Blocks() as demo:
    with gr.Tabs() as tabs:
        with gr.TabItem("PDF", id=0):
            with gr.Row():
                with gr.Column():
                    # Hidden, empty text component; nothing in this file updates it.
                    proegres = gr.Text(
                        show_label=False,
                        value="",
                        visible=False,
                    )
                    file_pdf = gr.File(
                        file_types=['.pdf'],
                        label="Upload PDF *",
                    )
                    btn = gr.Button("Extract Photos from PDF")
                    with gr.Tabs(visible=True) as tabs_under:
                        with gr.TabItem("Photos", id=0):
                            with gr.Column():
                                # Gallery of rendered pages, filled in by the handler.
                                list_image = gr.Gallery(
                                    value=[],
                                    label="0 Page",
                                    visible=True,
                                    show_label=True,
                                    elem_id="gallery",
                                ).style(
                                    columns=[3],
                                    object_fit="cover",
                                    height="auto",
                                )
                                # Download slot for the zip; hidden until a result exists.
                                file_download = gr.File(
                                    file_types=['.zip'],
                                    label="Download File",
                                    visible=False,
                                )
    # NOTE(review): the example row has two values but only one input
    # component is listed -- confirm the trailing None is intentional.
    examples = gr.Examples(
        [["./1706.03762.pdf", None]],
        fn=extract_photos_from_pdf,
        inputs=[file_pdf],
        outputs=[list_image, file_download],
        cache_examples=False,
    )
    # Clicking the button runs the extraction on the uploaded file.
    btn.click(
        fn=extract_photos_from_pdf,
        inputs=[file_pdf],
        outputs=[list_image, file_download],
    )

# Enable request queueing, then start the app.
demo.queue().launch()