"""Streamlit app that summarizes an uploaded PDF report with a Hugging Face LLM."""

import logging
import os
import tempfile
from typing import List

import streamlit as st
import torch
from langchain.chains.summarize import load_summarize_chain
from langchain.prompts import PromptTemplate
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import HuggingFacePipeline
from langchain_community.vectorstores import FAISS
from transformers import pipeline

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Constants
EMBEDDING_MODEL = 'sentence-transformers/all-MiniLM-L6-v2'
DEFAULT_MODEL = "microsoft/phi-2"

# Prompt used by summarize_report. It exposes two variables: {context} receives
# the stuffed document text and {input} receives the request sentence below.
SUMMARY_PROMPT_TEMPLATE = """
<s>[INST] You are an advanced AI assistant with expertise in summarizing technical documents. Your goal is to create a clear, concise, and well-organized summary using Markdown formatting. Focus on extracting and presenting the essential points of the document effectively.

*Instructions:*
- Analyze the provided context and input carefully.
- Identify and highlight the key points, main arguments, and important details.
- Format the summary using Markdown for clarity:
    - Use # for main headers and ## for subheaders.
    - Use **text** for important terms or concepts.
- Provide a brief introduction, followed by the main points, and a concluding summary if applicable.
- Ensure the summary is easy to read and understand, avoiding unnecessary jargon.

*Example Summary Format:*

# Overview
*Document Title:* Technical Analysis Report
*Summary:* The report provides an in-depth analysis of the recent technical advancements in AI. It covers key areas such as ...

# Key Findings
- *Finding 1:* Description of finding 1.
- *Finding 2:* Description of finding 2.

# Conclusion
The analysis highlights the significant advancements and future directions for AI technology.

*Your Response:* [/INST]</s>
{input}
Context: {context}
"""

# Value substituted for the {input} prompt variable.
SUMMARY_REQUEST = "Summarize the following report."

# Check for GPU
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")


@st.cache_resource
def load_embeddings():
    """Load and cache the embedding model.

    Returns:
        A ``HuggingFaceEmbeddings`` instance for ``EMBEDDING_MODEL``, or
        ``None`` if loading failed (the error is logged and shown in the UI).
    """
    try:
        return HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
    except Exception as e:
        logger.exception("Failed to load embeddings: %s", e)
        st.error("Failed to load the embedding model. Please try again later.")
        return None


@st.cache_resource
def load_llm(model_name):
    """Load and cache the language model.

    Args:
        model_name: Hugging Face model identifier passed to ``pipeline``.

    Returns:
        A ``HuggingFacePipeline`` wrapping a text-generation pipeline, or
        ``None`` if loading failed (the error is logged and shown in the UI).
    """
    try:
        # NOTE(review): max_length also counts prompt tokens in transformers
        # generation, so a long stuffed prompt may leave little or no room for
        # output -- consider max_new_tokens; confirm against the target model.
        # NOTE(review): the task is fixed to "text-generation"; confirm every
        # model offered in main() supports that task.
        pipe = pipeline(
            "text-generation",
            model=model_name,
            device=device,
            max_length=1024,
        )
        return HuggingFacePipeline(pipeline=pipe)
    except Exception as e:
        logger.exception("Failed to load LLM: %s", e)
        st.error(
            f"Failed to load the model {model_name}. Please try another model "
            "or check your internet connection."
        )
        return None


def process_pdf(file) -> List[Document]:
    """Process the uploaded PDF file.

    The upload is written to a temporary ``.pdf`` file so ``PyPDFLoader`` can
    read it from a path; that file is always removed before returning.

    Args:
        file: Uploaded file object exposing ``getvalue()`` (raw PDF bytes).

    Returns:
        The PDF pages split into chunks, or an empty list when no pages were
        extracted or processing failed.
    """
    temp_file_path = None
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
            temp_file.write(file.getvalue())
            temp_file_path = temp_file.name

        # Load only after the handle is closed so the bytes are flushed to disk.
        pages = PyPDFLoader(file_path=temp_file_path).load()

        # Check for empty documents
        if not pages:
            st.warning(
                "No text extracted from the PDF. "
                "Please ensure it's a valid PDF file."
            )
            return []

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=4000, chunk_overlap=200
        )
        return text_splitter.split_documents(pages)
    except Exception as e:
        logger.exception("Error processing PDF: %s", e)
        st.error("Failed to process the PDF. Please make sure it's a valid PDF file.")
        return []
    finally:
        # delete=False means nothing else cleans this file up.
        if temp_file_path is not None:
            try:
                os.unlink(temp_file_path)
            except OSError as e:
                logger.warning(
                    "Could not remove temporary file %s: %s", temp_file_path, e
                )


def create_vector_store(documents: List[Document], embeddings):
    """Create the vector store.

    Args:
        documents: Document chunks to index.
        embeddings: Embedding model used to vectorize the chunks.

    Returns:
        A FAISS vector store, or ``None`` if creation failed (the error is
        logged and shown in the UI).
    """
    try:
        return FAISS.from_documents(documents, embeddings)
    except Exception as e:
        logger.exception("Error creating vector store: %s", e)
        st.error("Failed to create the vector store. Please try again.")
        return None


def summarize_report(documents: List[Document], llm) -> str:
    """Summarize the report using the loaded model.

    Args:
        documents: Document chunks to summarize; all of them are stuffed into
            a single prompt.
        llm: LangChain-compatible language model.

    Returns:
        The generated summary, or an empty string if summarization failed
        (the error is logged and shown in the UI).
    """
    try:
        prompt = PromptTemplate.from_template(SUMMARY_PROMPT_TEMPLATE)
        # The stuff chain injects the documents under "text" by default; the
        # prompt names that slot {context}, so the variable name must be given.
        chain = load_summarize_chain(
            llm,
            chain_type="stuff",
            prompt=prompt,
            document_variable_name="context",
        )
        # The prompt's extra {input} variable makes this a two-input chain, so
        # the inputs have to be passed by keyword rather than positionally.
        return chain.run(input_documents=documents, input=SUMMARY_REQUEST)
    except Exception as e:
        logger.exception("Error summarizing report: %s", e)
        st.error("Failed to summarize the report. Please try again.")
        return ""


def main():
    """Render the Streamlit UI and run the upload -> index -> summarize flow."""
    st.title("Report Summarizer")

    model_option = st.sidebar.selectbox(
        "Llm Model",
        options=["ChocoWu/nextgpt_7b_tiva_v0", "google-t5/t5-11b"],
    )
    uploaded_file = st.sidebar.file_uploader("Upload your Report", type="pdf")

    llm = load_llm(model_option)
    embeddings = load_embeddings()

    # The loaders already reported any failure in the UI.
    if not llm or not embeddings:
        return

    if not uploaded_file:
        return

    with st.spinner("Processing PDF..."):
        documents = process_pdf(uploaded_file)

    if not documents:
        return

    with st.spinner("Creating vector store..."):
        db = create_vector_store(documents, embeddings)

    # The button is only rendered once the vector store exists.
    if db and st.button("Summarize"):
        with st.spinner(f"Generating structured summary using {model_option}..."):
            summary = summarize_report(documents, llm)

        if summary:
            st.subheader("Structured Summary:")
            st.markdown(summary)
        else:
            st.warning("Failed to generate summary. Please try again.")


if __name__ == "__main__":
    main()