"""Gradio demo that asks Janus-Pro-7B questions about an image or a PDF.

For PDFs, the text of every page is prepended to the question and the first
embedded image is sent to the model as the visual input.
"""

import io
import os
import tempfile

import fitz  # PyMuPDF
import gradio as gr
import requests  # noqa: F401  (no longer used by predict; kept so the import set is unchanged)
from gradio_client import Client, handle_file
from PIL import Image

# Image modes that Pillow can write to PNG without conversion.
_PNG_MODES = {"1", "L", "LA", "I", "I;16", "P", "RGB", "RGBA"}


def _read_pdf(pdf_path):
    """Return ``(text, images)`` extracted from the PDF at *pdf_path*.

    Unlike :func:`extract_from_pdf`, this helper lets exceptions propagate so
    callers can tell a failed extraction apart from a PDF without images.
    """
    text_parts = []
    images = []
    doc = fitz.open(pdf_path)
    try:
        for page in doc:
            text_parts.append(page.get_text())
            for img in page.get_images(full=True):
                xref = img[0]  # first tuple item is the image's xref number
                image_bytes = doc.extract_image(xref)["image"]
                images.append(Image.open(io.BytesIO(image_bytes)))
    finally:
        # Always release the document handle, even when a page fails.
        doc.close()
    return "".join(text_parts), images


def extract_from_pdf(pdf_path):
    """Extract all text and embedded images from a PDF.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A ``(text, images)`` tuple: the concatenated text of every page and a
        list of PIL images in page order. On failure, returns
        ``("Erro ao processar PDF: <reason>", [])`` instead of raising.
    """
    try:
        return _read_pdf(pdf_path)
    except Exception as e:
        return f"Erro ao processar PDF: {str(e)}", []


def _save_temp_png(image):
    """Write *image* to a new temporary PNG file and return its path.

    The caller owns the file and is responsible for deleting it.
    """
    if image.mode not in _PNG_MODES:
        # e.g. CMYK JPEGs embedded in PDFs cannot be written as PNG directly.
        image = image.convert("RGB")
    fd, path = tempfile.mkstemp(suffix=".png")
    # Close the descriptor before saving by name; an open handle blocks the
    # write on Windows.
    os.close(fd)
    try:
        image.save(path, format="PNG")
    except Exception:
        os.remove(path)
        raise
    return path


def predict(file, question, seed, top_p, temperature):
    """Send an image (or a PDF's first image plus its text) to Janus-Pro-7B.

    Args:
        file: Local path, http(s) URL, or an uploaded-file object exposing
            ``.name``. ``None`` yields an explanatory message.
        question: Question to ask about the file.
        seed: Sampling seed forwarded to the model.
        top_p: Nucleus-sampling parameter forwarded to the model.
        temperature: Sampling temperature forwarded to the model.

    Returns:
        The model's answer, or a human-readable error message. This function
        never raises, so the UI always has text to display.
    """
    tmp_path = None
    try:
        if file is None:
            return "Nenhum arquivo enviado."

        # Accept both a plain path/URL string and a file wrapper with `.name`.
        path = file if isinstance(file, str) else getattr(file, "name", str(file))

        if path.lower().endswith(".pdf"):
            try:
                extracted_text, extracted_images = _read_pdf(path)
            except Exception as e:
                # Report the real failure rather than "no image found".
                return f"Erro ao processar PDF: {str(e)}"

            if not extracted_images:
                return "Nenhuma imagem encontrada no PDF."

            # Only the first embedded image is sent to the model.
            tmp_path = _save_temp_png(extracted_images[0])
            image_arg = handle_file(tmp_path)

            if extracted_text:
                question = f"Texto extraído do PDF:\n{extracted_text}\n\nPergunta: {question}"
        else:
            # handle_file accepts local paths and http(s) URLs alike, so no
            # manual download is needed.
            image_arg = handle_file(path)

        client = Client("deepseek-ai/Janus-Pro-7B")
        return client.predict(
            image=image_arg,
            question=question,
            seed=seed,
            top_p=top_p,
            temperature=temperature,
            api_name="/multimodal_understanding",
        )
    except Exception as e:
        return f"Erro durante a predição: {str(e)}"
    finally:
        # The temporary PNG is only needed until the request has been sent.
        if tmp_path is not None:
            try:
                os.remove(tmp_path)
            except OSError:
                pass  # best-effort cleanup; a leftover temp file is harmless


# Interface components
file_input = gr.File(label="Upload PDF or Image", file_types=[".pdf", ".png", ".jpg", ".jpeg"])
question_input = gr.Textbox(label="Question", placeholder="Ask something about the file...")
seed_slider = gr.Slider(0, 100, value=42, label="Seed")
top_p_slider = gr.Slider(0, 1, value=0.95, label="Top-p")
temp_slider = gr.Slider(0, 1, value=0.1, label="Temperature")

# Build the interface
demo = gr.Interface(
    fn=predict,
    inputs=[
        file_input,
        question_input,
        seed_slider,
        top_p_slider,
        temp_slider,
    ],
    outputs=gr.Textbox(label="Answer"),
    title="Janus-Pro-7B Multimodal Demo",
    description="Ask questions about PDFs or images using the Janus-Pro-7B model",
    examples=[
        ["https://raw.githubusercontent.com/gradio-app/gradio/main/test/test_files/bus.png", "What's in this image?", 42, 0.95, 0.1]
    ],
)

if __name__ == "__main__":
    demo.launch()