ShadowDominator's picture
Upload 4 files
febd231
raw
history blame
4.14 kB
import os
import gradio as gr
from pdf2image import convert_from_path,pdfinfo_from_path
import zipfile
def zip_folder(folder_path, output_path):
with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
for root, dirs, files in os.walk(folder_path):
for file in files:
file_path = os.path.join(root, file)
zipf.write(file_path, os.path.relpath(file_path, folder_path))
DIRECTORY = "image_reference"
DIRECTORY_OUTPUT = "output"
DIRECTORIES = [DIRECTORY, DIRECTORY_OUTPUT]
# Check and create directories
for directory in DIRECTORIES:
if not os.path.exists(directory):
os.makedirs(directory)
else:
pass
ALLOWED_EXTENSIONS = ['.jpg', '.jpeg', '.png', '.gif']
def get_image_files(directory):
image_files = []
for filename in os.listdir(directory):
if filename.lower().endswith(tuple(ALLOWED_EXTENSIONS)):
filepath = os.path.join(directory, filename)
image_files.append(filepath)
return image_files
def clear_directory(directory):
for filename in os.listdir(directory):
file_path = os.path.join(directory, filename)
try:
if os.path.isfile(file_path) or os.path.islink(file_path):
os.unlink(file_path)
elif os.path.isdir(file_path):
os.rmdir(file_path)
except Exception as e:
print(f"Failed to delete {file_path}. Reason: {e}")
def extract_photos_from_pdf(file_pdf):
clear_directory(DIRECTORY)
clear_directory(DIRECTORY_OUTPUT)
try:
pdf_path = file_pdf.name
info = pdfinfo_from_path(pdf_path, userpw=None, poppler_path=None)
total_pages = info["Pages"] # Total number of pages in the PDF book
batch_size = 100 # Number of pages to process in each batch
for start_page in range(0, total_pages, batch_size):
end_page = min(start_page + batch_size, total_pages)
images = convert_from_path(pdf_path, first_page=start_page, last_page=end_page)
for idx, image in enumerate(images, start=start_page):
image.save(f'{DIRECTORY}/{idx+1}.png', 'PNG')
images_pdf_list = get_image_files(DIRECTORY)
image_names = [(path, os.path.basename(path)) for path in images_pdf_list]
sorted_names = sorted(image_names, key=lambda x: int(x[1].split('.')[0]))
zip_folder(DIRECTORY, f'{DIRECTORY_OUTPUT}/all_photos.zip')
return (
gr.Gallery.update(value=sorted_names, label=f"Detected {len(images_pdf_list)} Page{'' if len(images_pdf_list) == 1 else 's'}", show_label=True, visible=True),
gr.File.update(value=f'{DIRECTORY_OUTPUT}/all_photos.zip',visible=True)
)
except:
return (
gr.Gallery.update(value=[], label="Error", show_label=True, visible=True),
gr.File.update(visible=False)
)
with gr.Blocks() as demo:
with gr.Tabs() as tabs:
with gr.TabItem("PDF",id=0):
with gr.Row():
with gr.Column():
proegres = gr.Text(show_label=False,value="",visible=False)
file_pdf = gr.File(file_types=['.pdf'], label="Upload PDF *")
btn = gr.Button("Extract Photos from PDF")
with gr.Tabs(visible=True) as tabs_under:
with gr.TabItem("Photos",id=0):
with gr.Column():
list_image = gr.Gallery(value=[], label=f"0 Page",visible=True, show_label=True, elem_id="gallery").style(columns=[3], object_fit="cover", height="auto")
file_download = gr.File(file_types=['.zip'], label="Download File",visible=False)
examples = gr.Examples([["./1706.03762.pdf", None]], fn=extract_photos_from_pdf,inputs=[file_pdf],outputs=[list_image,file_download], cache_examples=False)
btn.click(fn=extract_photos_from_pdf,inputs=[file_pdf],outputs=[list_image,file_download])
demo.queue().launch()