"""Gradio app that embeds the sentences of an uploaded PDF together with a
query and plots their 2-D UMAP projection with Plotly."""

import logging
import tempfile

import fitz  # PyMuPDF for reading PDFs
import gradio as gr
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import umap
from bokeh.models import HoverTool, ColumnDataSource
from bokeh.plotting import figure, output_file, save
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Initialize the model globally
model = SentenceTransformer('all-MiniLM-L6-v2')
logging.info("Model loaded successfully.")


def process_pdf(pdf_path):
    """Return the text of every page of the PDF at *pdf_path*.

    The per-page texts are joined with a single space.
    """
    logging.info(f"Processing PDF: {pdf_path}")
    # The context manager closes the document even if text extraction raises.
    with fitz.open(pdf_path) as doc:
        texts = [page.get_text() for page in doc]
    logging.info("PDF processed successfully.")
    return " ".join(texts)


def create_embeddings(text):
    """Split *text* into sentences and encode them with the global model.

    Args:
        text: The raw text to split and embed.

    Returns:
        A tuple ``(embeddings, sentences)``: the result of ``model.encode``
        for the sentence list, and the list of sentence strings itself.

    Raises:
        ValueError: If *text* contains no non-blank sentence.
    """
    logging.info("Creating embeddings.")
    # A simple split; consider a more robust sentence splitter.
    # Blank fragments (e.g. from empty pages) are dropped so they are not
    # embedded and plotted as if they were real sentences.
    sentences = [part.strip() for part in text.split(". ") if part.strip()]
    if not sentences:
        raise ValueError("No text could be extracted to create embeddings.")
    embeddings = model.encode(sentences)
    logging.info("Embeddings created successfully.")
    return embeddings, sentences


def generate_plotly_figure(query, pdf_file, top_k=5):
    """Build a Plotly scatter of the UMAP projection of PDF sentences and a query.

    Args:
        query: The query string; plotted as a separate red point.
        pdf_file: Either a path string or an object whose ``name`` attribute
            is the path of the PDF to read.
        top_k: How many of the sentences most similar to the query (by cosine
            similarity) are coloured green; the rest are blue.

    Returns:
        A ``plotly.graph_objects.Figure`` with one trace for the sentences
        and one for the query.

    Raises:
        ValueError: Propagated from ``create_embeddings`` when the PDF yields
            no sentences.
    """
    logging.info("Generating plot with Plotly.")

    # Generate embeddings for the query
    query_embedding = model.encode([query])[0]

    # Process the PDF and create embeddings.
    # Depending on the Gradio version/configuration, gr.File hands over a
    # plain path string or a file-like wrapper exposing the path as ``.name``.
    pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
    text = process_pdf(pdf_path)
    embeddings, sentences = create_embeddings(text)
    logging.info("Data prepared for UMAP.")

    # The query is stacked last so it is always the final row of the projection.
    all_embeddings = np.vstack([embeddings, query_embedding])

    # UMAP transformation (fixed random_state for reproducible layouts).
    # NOTE(review): very short documents leave UMAP with fewer points than
    # n_neighbors — confirm the behaviour is acceptable for such inputs.
    umap_transform = umap.UMAP(n_neighbors=15, min_dist=0.0, n_components=2, random_state=42)
    umap_embeddings = umap_transform.fit_transform(all_embeddings)
    logging.info("UMAP transformation completed.")

    # Find the sentences closest to the query: highest cosine similarity first.
    similarities = cosine_similarity([query_embedding], embeddings)[0]
    # Reverse-then-slice (rather than slicing with -top_k) so top_k == 0
    # selects nothing instead of everything.
    top_k = max(int(top_k), 0)
    closest_indices = set(np.argsort(similarities)[::-1][:top_k].tolist())

    # Target points in green, all other sentences in blue
    colors = ['green' if i in closest_indices else 'blue' for i in range(len(sentences))]

    # Add the scatter plot for the sentences (every row except the last)
    fig = go.Figure(
        data=go.Scatter(
            x=umap_embeddings[:-1, 0],
            y=umap_embeddings[:-1, 1],
            mode='markers',
            marker=dict(color=colors),
            text=sentences,
            name='Sentences',
        )
    )

    # Add the scatter plot for the query point (the last row), in red
    fig.add_trace(
        go.Scatter(
            x=[umap_embeddings[-1, 0]],
            y=[umap_embeddings[-1, 1]],
            mode='markers',
            marker=dict(color='red'),
            text=[query],
            name='Query',
        )
    )

    fig.update_layout(title="UMAP Projection of Sentences", xaxis_title="UMAP 1", yaxis_title="UMAP 2")

    logging.info("Plotly figure created successfully.")
    return fig


def gradio_interface(pdf_file, query):
    """Gradio callback: return the Plotly figure for *pdf_file* and *query*.

    Raises:
        gr.Error: If no PDF was uploaded, or if the PDF yields no text to embed.
    """
    logging.info("Gradio interface called.")
    if pdf_file is None:
        # Without this guard the missing upload surfaces as an AttributeError.
        raise gr.Error("Please upload a PDF file.")
    try:
        fig = generate_plotly_figure(query, pdf_file)
    except ValueError as err:
        # Show the reason in the UI instead of a generic failure.
        raise gr.Error(str(err)) from err
    logging.info("Returning Plotly figure.")
    return fig


iface = gr.Interface(
    fn=gradio_interface,
    inputs=[gr.File(label="Upload PDF"), gr.Textbox(label="Query")],
    outputs=gr.Plot(),  # Updated to use gr.Plot() for Plotly figures
    title="PDF Content Visualizer",
    description="Upload a PDF and enter a query to visualize the content."
)

if __name__ == "__main__":
    iface.launch()