import gradio as gr
import fitz  # PyMuPDF for reading PDFs
import numpy as np
from bokeh.plotting import figure, output_file, save
from bokeh.models import HoverTool, ColumnDataSource
import umap
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sentence_transformers import SentenceTransformer
import tempfile
import logging

# Configure the root logger once at import time: INFO level, timestamped lines.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Load the sentence-embedding model a single time at module import so every
# request handled by this process reuses the same instance.
model = SentenceTransformer('all-MiniLM-L6-v2')
logging.info("Model loaded successfully.")

def process_pdf(pdf_path):
    """Extract the text of every page of a PDF and join it into one string.

    Args:
        pdf_path: Path of the PDF file; passed straight to ``fitz.open``.

    Returns:
        str: The text of all pages, in page order, joined by single spaces.
    """
    logging.info(f"Processing PDF: {pdf_path}")
    # Use the document as a context manager so the underlying file handle is
    # released even if text extraction raises on some page.
    with fitz.open(pdf_path) as doc:
        texts = [page.get_text() for page in doc]
    # Status goes through logging (not print) to match the rest of the module.
    logging.info("PDF processed successfully.")
    return " ".join(texts)

def create_embeddings(text):
    """Split *text* into sentences and embed each one with the global model.

    Args:
        text: Raw text, e.g. the output of ``process_pdf``.

    Returns:
        tuple: ``(embeddings, sentences)`` where ``sentences`` is the list of
        non-empty sentence strings and ``embeddings`` is whatever
        ``model.encode(sentences)`` returns for them (one entry per sentence,
        in the same order).

    Raises:
        ValueError: If *text* contains no non-whitespace content, so there is
            nothing to embed.
    """
    logging.info("Creating embeddings.")
    # Collapse every whitespace run (including the line breaks PDF extraction
    # leaves inside and between sentences) to one space, so that a sentence
    # ending at a line break is still followed by ". " and gets split.
    normalized = " ".join(text.split())
    # Still a simple split; drop empty fragments so blank "sentences" are not
    # embedded and plotted.
    sentences = [piece for piece in normalized.split(". ") if piece]
    if not sentences:
        raise ValueError("No text could be extracted to embed.")
    embeddings = model.encode(sentences)
    logging.info("Embeddings created successfully.")
    return embeddings, sentences

import plotly.express as px
import plotly.graph_objects as go

def generate_plotly_figure(query, pdf_file, top_k=5):
    """Plot a 2-D UMAP projection of a PDF's sentences together with a query.

    Sentences and the query are embedded with the global model, projected
    jointly with UMAP, and drawn as a scatter plot. The ``top_k`` sentences
    with the highest cosine similarity to the query are drawn in green, the
    remaining sentences in blue, and the query itself in red.

    Args:
        query: Query text to embed and locate among the sentences.
        pdf_file: Either a filesystem path (``str`` / ``os.PathLike``) or an
            object exposing the path as its ``name`` attribute.
        top_k: Number of most-similar sentences to highlight. Defaults to 5;
            values below zero are treated as zero.

    Returns:
        plotly.graph_objects.Figure: Figure with a "Sentences" trace and a
        "Query" trace.

    Raises:
        ValueError: If ``pdf_file`` is ``None``.
    """
    import os  # local import: only needed here to recognise path-like inputs

    logging.info("Generating plot with Plotly.")
    if pdf_file is None:
        raise ValueError("No PDF file was provided.")

    # Accept a plain path as well as an uploaded-file object with ``.name``.
    # Path-likes are checked first because ``pathlib.Path.name`` is only the
    # final path component, not the full path.
    if isinstance(pdf_file, (str, os.PathLike)):
        pdf_path = os.fspath(pdf_file)
    else:
        pdf_path = pdf_file.name

    # Generate embeddings for the query
    query_embedding = model.encode([query])[0]

    # Process the PDF and create embeddings
    text = process_pdf(pdf_path)
    embeddings, sentences = create_embeddings(text)

    logging.info("Data prepared for UMAP.")
    # Project sentences and query together so they share one 2-D space; the
    # query is the last row.
    all_embeddings = np.vstack([embeddings, query_embedding])
    umap_transform = umap.UMAP(n_neighbors=15, min_dist=0.0, n_components=2, random_state=42)
    umap_embeddings = umap_transform.fit_transform(all_embeddings)
    logging.info("UMAP transformation completed.")

    # Cosine *similarity* (higher means closer): take the top_k largest.
    # Reversing before slicing keeps top_k == 0 meaning "highlight nothing".
    similarities = cosine_similarity([query_embedding], embeddings)[0]
    ranked = np.argsort(similarities)[::-1]
    closest_indices = set(ranked[:max(top_k, 0)].tolist())

    # Highlighted sentences in green, all others in blue.
    colors = ['green' if idx in closest_indices else 'blue'
              for idx, _ in enumerate(sentences)]

    sentence_points = umap_embeddings[:-1]
    query_point = umap_embeddings[-1]

    # Add the scatter plot for sentences
    fig = go.Figure(data=go.Scatter(x=sentence_points[:, 0], y=sentence_points[:, 1], mode='markers',
                                    marker=dict(color=colors), text=sentences,
                                    name='Sentences'))

    # Add the scatter plot for the query point
    fig.add_trace(go.Scatter(x=[query_point[0]], y=[query_point[1]], mode='markers',
                             marker=dict(color='red'), text=[query], name='Query'))

    fig.update_layout(title="UMAP Projection of Sentences", xaxis_title="UMAP 1", yaxis_title="UMAP 2")

    logging.info("Plotly figure created successfully.")
    return fig
def gradio_interface(pdf_file, query):
    """Gradio callback: build the UMAP figure for an uploaded PDF and a query.

    Args:
        pdf_file: Value of the "Upload PDF" file input; ``None`` when the user
            submitted without uploading anything.
        query: Text entered in the "Query" textbox.

    Returns:
        The Plotly figure produced by ``generate_plotly_figure``.

    Raises:
        gr.Error: If no PDF was uploaded, so the UI shows a readable message
            instead of an opaque failure.
    """
    logging.info("Gradio interface called.")
    if pdf_file is None:
        raise gr.Error("Please upload a PDF file before submitting.")
    fig = generate_plotly_figure(query, pdf_file)
    logging.info("Returning Plotly figure.")
    return fig
# Wire the UI: a PDF upload and a query textbox feed gradio_interface, and the
# Plotly figure it returns is rendered by a gr.Plot output component.
iface = gr.Interface(
    title="PDF Content Visualizer",
    description="Upload a PDF and enter a query to visualize the content.",
    fn=gradio_interface,
    inputs=[
        gr.File(label="Upload PDF"),
        gr.Textbox(label="Query"),
    ],
    outputs=gr.Plot(),
)

# Start the web app only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()