Spaces:

ShadowDominator
/

extract-photos-from-pdf

Runtime error

App Files Files Community

extract-photos-from-pdf / app.py

ShadowDominator

Upload 4 files

febd231 almost 2 years ago

raw

history blame

4.14 kB

	import os
	import gradio as gr
	from pdf2image import convert_from_path,pdfinfo_from_path
	import zipfile

	def zip_folder(folder_path, output_path):
	with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
	for root, dirs, files in os.walk(folder_path):
	for file in files:
	file_path = os.path.join(root, file)
	zipf.write(file_path, os.path.relpath(file_path, folder_path))



	DIRECTORY = "image_reference"
	DIRECTORY_OUTPUT = "output"
	DIRECTORIES = [DIRECTORY, DIRECTORY_OUTPUT]

	# Check and create directories
	for directory in DIRECTORIES:
	if not os.path.exists(directory):
	os.makedirs(directory)
	else:
	pass


	ALLOWED_EXTENSIONS = ['.jpg', '.jpeg', '.png', '.gif']
	def get_image_files(directory):
	image_files = []
	for filename in os.listdir(directory):
	if filename.lower().endswith(tuple(ALLOWED_EXTENSIONS)):
	filepath = os.path.join(directory, filename)
	image_files.append(filepath)
	return image_files


	def clear_directory(directory):
	for filename in os.listdir(directory):
	file_path = os.path.join(directory, filename)
	try:
	if os.path.isfile(file_path) or os.path.islink(file_path):
	os.unlink(file_path)
	elif os.path.isdir(file_path):
	os.rmdir(file_path)
	except Exception as e:
	print(f"Failed to delete {file_path}. Reason: {e}")



	def extract_photos_from_pdf(file_pdf):
	clear_directory(DIRECTORY)
	clear_directory(DIRECTORY_OUTPUT)
	try:
	pdf_path = file_pdf.name
	info = pdfinfo_from_path(pdf_path, userpw=None, poppler_path=None)

	total_pages = info["Pages"] # Total number of pages in the PDF book
	batch_size = 100 # Number of pages to process in each batch

	for start_page in range(0, total_pages, batch_size):
	end_page = min(start_page + batch_size, total_pages)
	images = convert_from_path(pdf_path, first_page=start_page, last_page=end_page)
	for idx, image in enumerate(images, start=start_page):
	image.save(f'{DIRECTORY}/{idx+1}.png', 'PNG')

	images_pdf_list = get_image_files(DIRECTORY)
	image_names = [(path, os.path.basename(path)) for path in images_pdf_list]
	sorted_names = sorted(image_names, key=lambda x: int(x[1].split('.')[0]))
	zip_folder(DIRECTORY, f'{DIRECTORY_OUTPUT}/all_photos.zip')
	return (
	gr.Gallery.update(value=sorted_names, label=f"Detected {len(images_pdf_list)} Page{'' if len(images_pdf_list) == 1 else 's'}", show_label=True, visible=True),
	gr.File.update(value=f'{DIRECTORY_OUTPUT}/all_photos.zip',visible=True)
	)
	except:
	return (
	gr.Gallery.update(value=[], label="Error", show_label=True, visible=True),
	gr.File.update(visible=False)
	)

	with gr.Blocks() as demo:
	with gr.Tabs() as tabs:

	with gr.TabItem("PDF",id=0):

	with gr.Row():
	with gr.Column():
	proegres = gr.Text(show_label=False,value="",visible=False)
	file_pdf = gr.File(file_types=['.pdf'], label="Upload PDF *")
	btn = gr.Button("Extract Photos from PDF")


	with gr.Tabs(visible=True) as tabs_under:

	with gr.TabItem("Photos",id=0):

	with gr.Column():

	list_image = gr.Gallery(value=[], label=f"0 Page",visible=True, show_label=True, elem_id="gallery").style(columns=[3], object_fit="cover", height="auto")
	file_download = gr.File(file_types=['.zip'], label="Download File",visible=False)


	examples = gr.Examples([["./1706.03762.pdf", None]], fn=extract_photos_from_pdf,inputs=[file_pdf],outputs=[list_image,file_download], cache_examples=False)
	btn.click(fn=extract_photos_from_pdf,inputs=[file_pdf],outputs=[list_image,file_download])




	demo.queue().launch()