# Spaces: Kathirsci / Report_summarizer (Sleeping)
# File size: 5,933 Bytes

import logging
import os
import tempfile
import time
from typing import List

import streamlit as st
from langchain.chains.summarize import load_summarize_chain
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.llms import HuggingFacePipeline
from langchain_community.vectorstores import FAISS
from transformers import pipeline

# Logging: configure the root logger at INFO and expose a module-level logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Model identifiers.
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
DEFAULT_MODEL = "llava-v1.6-mistral-7b-hf"

# Passed as ``ttl=`` to ``st.cache_resource`` on the model loaders (adjust as needed).
MODEL_CACHE_EXPIRATION = 60 * 60

@st.cache_resource(ttl=MODEL_CACHE_EXPIRATION)
def _load_embeddings_cached():
    """Build the embedding model; raise on failure so nothing is cached."""
    return HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)


def load_embeddings():
    """Load and cache the embedding model.

    The model object is cached by ``st.cache_resource`` for
    ``MODEL_CACHE_EXPIRATION``. Error handling lives outside the cached
    function on purpose: a cached function that returns ``None`` on failure
    would have that ``None`` stored and served until the TTL expires, so one
    transient failure would disable the app for the whole cache lifetime.

    Returns:
        The ``HuggingFaceEmbeddings`` instance, or ``None`` if loading failed
        (the error is logged and shown in the UI).
    """
    try:
        return _load_embeddings_cached()
    except Exception as e:
        logger.exception("Failed to load embeddings: %s", e)
        st.error("Failed to load the embedding model. Please try again later.")
        return None

@st.cache_resource(ttl=MODEL_CACHE_EXPIRATION)
def _load_llm_cached(model_name):
    """Build the text-generation pipeline; raise on failure so nothing is cached."""
    # NOTE(review): max_length usually counts prompt tokens as well as
    # generated ones; confirm 1024 is enough for the stuffed prompt.
    pipe = pipeline("text-generation", model=model_name, max_length=1024)
    return HuggingFacePipeline(pipeline=pipe)


def load_llm(model_name):
    """Load and cache the language model.

    The pipeline is cached per ``model_name`` by ``st.cache_resource`` for
    ``MODEL_CACHE_EXPIRATION``. Error handling lives outside the cached
    function on purpose: returning ``None`` from inside it would cache the
    failure for that model name until the TTL expires.

    Args:
        model_name: Model identifier handed to ``transformers.pipeline``.

    Returns:
        A ``HuggingFacePipeline`` wrapping the pipeline, or ``None`` if
        loading failed (the error is logged and shown in the UI).
    """
    try:
        return _load_llm_cached(model_name)
    except Exception as e:
        logger.exception("Failed to load LLM: %s", e)
        st.error(f"Failed to load the model {model_name}. Please try again.")
        return None

def process_pdf(file) -> List[Document]:
    """Load an uploaded PDF and split it into chunks.

    The upload is written to a temporary ``.pdf`` file because
    ``PyPDFLoader`` is given a file path. The temporary file is removed
    before returning, whether or not processing succeeded.

    Args:
        file: Uploaded file object; its bytes are read with ``getvalue()``.

    Returns:
        The chunked documents (4000-character chunks, 200 overlap), or an
        empty list if the PDF yields no text or processing fails. Both cases
        show a message in the UI.
    """
    temp_file_path = None
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
            # Record the path first so cleanup still runs if the write fails.
            temp_file_path = temp_file.name
            temp_file.write(file.getvalue())

        pages = PyPDFLoader(file_path=temp_file_path).load()

        # Treat a PDF whose pages carry no text (e.g. scanned images) as empty.
        if not any(page.page_content.strip() for page in pages):
            st.warning("No text extracted from the PDF. Please ensure it's a valid PDF file.")
            return []

        text_splitter = RecursiveCharacterTextSplitter(chunk_size=4000, chunk_overlap=200)
        return text_splitter.split_documents(pages)
    except Exception as e:
        logger.exception("Error processing PDF: %s", e)
        st.error("Failed to process the PDF. Please make sure it's a valid PDF file.")
        return []
    finally:
        # delete=False means nothing else removes the file; do it here.
        if temp_file_path is not None:
            try:
                os.unlink(temp_file_path)
            except OSError:
                logger.warning("Could not remove temporary file %s", temp_file_path)

def create_vector_store(documents: List[Document], embeddings):
    """Build a FAISS index over the document chunks.

    Returns the FAISS store, or ``None`` (after logging and showing an
    error in the UI) if building the index fails.
    """
    try:
        store = FAISS.from_documents(documents, embeddings)
    except Exception as exc:
        logger.error(f"Error creating vector store: {exc}")
        st.error("Failed to create the vector store. Please try again.")
        store = None
    return store

def summarize_report(documents: List[Document], llm) -> str:
    """Summarize the report using the loaded model.

    All chunks are combined into a single prompt ("stuff" chain) and sent to
    the model in one call.

    Args:
        documents: Document chunks to summarize.
        llm: LangChain LLM used to generate the summary.

    Returns:
        The generated summary, or an empty string on failure (the error is
        logged and shown in the UI).
    """
    try:
        # The only placeholder is {context}; it receives the combined document
        # text. An extra placeholder would become a second required chain
        # input, which chain.run(documents) cannot supply.
        # NOTE(review): the context sits after the [/INST]</s> markers;
        # confirm this placement suits the chat format of the chosen model.
        prompt_template = """
        <s>[INST] You are an advanced AI assistant with expertise in summarizing technical documents. Your goal is to create a clear, concise, and well-organized summary using Markdown formatting. Focus on extracting and presenting the essential points of the document effectively.
        *Instructions:*
        - Analyze the provided context and input carefully.
        - Identify and highlight the key points, main arguments, and important details.
        - Format the summary using Markdown for clarity:
          - Use # for main headers and ## for subheaders.
          - Use **text** for important terms or concepts.
          - Provide a brief introduction, followed by the main points, and a concluding summary if applicable.
        - Ensure the summary is easy to read and understand, avoiding unnecessary jargon.
        *Example Summary Format:*
        # Overview
        *Document Title:* Technical Analysis Report
        *Summary:*
        The report provides an in-depth analysis of the recent technical advancements in AI. It covers key areas such as ...
        # Key Findings
        - *Finding 1:* Description of finding 1.
        - *Finding 2:* Description of finding 2.
        # Conclusion
        The analysis highlights the significant advancements and future directions for AI technology.
        *Your Response:* [/INST]</s>
        Context: {context}
        """

        prompt = PromptTemplate.from_template(prompt_template)
        # The stuff chain injects the documents under "text" by default; the
        # prompt uses {context}, so name that variable explicitly. Without
        # this, chain construction fails validation.
        chain = load_summarize_chain(
            llm,
            chain_type="stuff",
            prompt=prompt,
            document_variable_name="context",
        )
        return chain.run(documents)

    except Exception as e:
        logger.exception("Error summarizing report: %s", e)
        st.error("Failed to summarize the report. Please try again.")
        return ""

def main():
    """Render the Streamlit UI and drive the upload -> index -> summarize flow.

    Each step returns early when its prerequisite is missing: the models,
    an uploaded file, extracted documents, a vector store, or the button
    click.
    """
    st.title("Report Summarizer")

    # Sidebar controls: model choice and PDF upload.
    model_option = st.sidebar.selectbox(
        "Llm Model",
        options=["ChocoWu/nextgpt_7b_tiva_v0", "google-t5/t5-11b"],
    )
    uploaded_file = st.sidebar.file_uploader("Upload your Report", type="pdf")

    llm = load_llm(model_option)
    embeddings = load_embeddings()

    # Both models are required; the loaders report their own errors.
    if not (llm and embeddings):
        return

    if not uploaded_file:
        return

    with st.spinner("Processing PDF..."):
        documents = process_pdf(uploaded_file)
    if not documents:
        return

    with st.spinner("Creating vector store..."):
        db = create_vector_store(documents, embeddings)

    # The button is only rendered once the vector store exists.
    if not (db and st.button("Summarize")):
        return

    with st.spinner(f"Generating structured summary using {model_option}..."):
        summary = summarize_report(documents, llm)

        if not summary:
            st.warning("Failed to generate summary. Please try again.")
        else:
            st.subheader("Structured Summary:")
            st.markdown(summary)


if __name__ == "__main__":
    main()