Spaces:
Sleeping
Sleeping
File size: 5,296 Bytes
b47040f ec8e5d1 07fb84b fae072e 07fb84b fae072e b47040f ffb4b75 b47040f a8c600f ec8e5d1 b47040f 07fb84b ffb4b75 d061dc5 ec8e5d1 b47040f ec8e5d1 b47040f ec8e5d1 b47040f ffb4b75 a8c600f 81cff83 b47040f ec8e5d1 b47040f 3f068be b47040f ffb4b75 b47040f ec8e5d1 b47040f ec8e5d1 b47040f ec8e5d1 b47040f ec8e5d1 ffb4b75 d061dc5 b47040f ffb4b75 b47040f ec8e5d1 3f068be b47040f ec8e5d1 b47040f 3f068be ffb4b75 ec8e5d1 b47040f ec8e5d1 ffb4b75 b47040f ec8e5d1 ffb4b75 ec8e5d1 b47040f ec8e5d1 ffb4b75 ec8e5d1 b47040f ffb4b75 b47040f ec8e5d1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 |
import streamlit as st
import tempfile
import logging
from typing import List, Optional
import torch
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.llms import HuggingFacePipeline
from langchain.chains.summarize import load_summarize_chain
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
# Logging: configure the root logger at INFO and log from this module under
# its own name.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Model identifiers and tuning constants.
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
DEFAULT_MODEL = "distilgpt2"
MAX_LENGTH_FRACTION = 0.2  # max_length is set to 20% of the input length

# Use the GPU when PyTorch reports one, otherwise fall back to the CPU, and
# show the choice in the sidebar.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
st.sidebar.write(f"Using device: {device}")
@st.cache_resource
def _build_embeddings(model_name: str) -> HuggingFaceEmbeddings:
    """Construct the embedding model once and share it across reruns.

    Cached with ``st.cache_resource`` because the result is a model object
    that should be shared rather than serialized and copied. Exceptions
    propagate so that a failed load is not stored in the cache.
    """
    return HuggingFaceEmbeddings(model_name=model_name)


def load_embeddings(model_name: str) -> Optional[HuggingFaceEmbeddings]:
    """Load the embedding model.

    Args:
        model_name: Name of the embedding model to load.

    Returns:
        The embedding model, or ``None`` if loading failed (the error is
        logged). Failures are not cached, so a later call retries the load.
    """
    try:
        return _build_embeddings(model_name)
    except Exception as e:
        logger.error(f"Failed to load embeddings: {e}")
        return None
@st.cache_resource
def _build_llm(model_name: str, max_length: int) -> HuggingFacePipeline:
    """Construct the text-generation pipeline once per (model, length) pair.

    Cached with ``st.cache_resource`` because the result wraps a loaded model
    that should be shared rather than serialized and copied. Exceptions
    propagate so that a failed load is not stored in the cache.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        device=device,
        # The budget applies to generated tokens only. Passing it as
        # ``max_length`` would also count the prompt, which for a
        # summarization prompt is longer than the requested summary.
        max_new_tokens=max(1, max_length),
    )
    return HuggingFacePipeline(pipeline=pipe)


def load_llm(model_name: str, max_length: int) -> Optional[HuggingFacePipeline]:
    """Load the language model.

    Args:
        model_name: Name of the causal language model to load.
        max_length: Maximum number of tokens to generate; values below 1 are
            raised to 1.

    Returns:
        The LangChain pipeline wrapper, or ``None`` if loading failed (the
        error is logged). Failures are not cached, so a later call retries.
    """
    try:
        return _build_llm(model_name, max_length)
    except Exception as e:
        logger.error(f"Failed to load LLM: {e}")
        return None
def process_pdf(file) -> Optional[List[Document]]:
    """Process the uploaded PDF file.

    The upload is written to a temporary ``.pdf`` file, loaded page by page,
    split into overlapping chunks, and the temporary file is then removed.

    Args:
        file: Uploaded file object; its full contents are read with
            ``getvalue()``.

    Returns:
        The list of document chunks, or ``None`` if processing failed (the
        error is logged).
    """
    import os  # local import: only needed for temporary-file cleanup

    temp_file_path = None
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
            # Record the path first so cleanup still runs if the write fails.
            temp_file_path = temp_file.name
            temp_file.write(file.getvalue())
        # The handle is closed here, so the data is flushed before loading.
        pages = PyPDFLoader(file_path=temp_file_path).load()
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100)
        return text_splitter.split_documents(pages)
    except Exception as e:
        logger.error(f"Error processing PDF: {e}")
        return None
    finally:
        # delete=False means the file is never removed automatically.
        if temp_file_path is not None:
            try:
                os.remove(temp_file_path)
            except OSError as cleanup_error:
                logger.warning(f"Could not remove temporary file {temp_file_path}: {cleanup_error}")
def create_vector_store(documents: List[Document], embeddings: HuggingFaceEmbeddings) -> Optional[FAISS]:
    """Build a FAISS vector store from the document chunks.

    Args:
        documents: Document chunks to index.
        embeddings: Embedding model passed to ``FAISS.from_documents``.

    Returns:
        The FAISS store, or ``None`` if construction raised (the error is
        logged).
    """
    try:
        store = FAISS.from_documents(documents, embeddings)
    except Exception as e:
        logger.error(f"Error creating vector store: {e}")
        return None
    return store
def summarize_report(
    documents: List[Document],
    llm: HuggingFacePipeline,
    max_length: int,
    summary_style: str,
) -> Optional[str]:
    """Summarize the report using the loaded model.

    Args:
        documents: Document chunks; all are placed into a single prompt by a
            "stuff" summarize chain.
        llm: Language model used by the chain.
        max_length: Kept for interface compatibility. It is not forwarded to
            the chain, because ``Chain.run`` rejects a call that mixes a
            positional argument with keyword arguments.
        summary_style: Phrase inserted into the prompt to describe the
            desired style (e.g. "formal").

    Returns:
        The generated summary, or ``None`` if summarization failed (the error
        is logged).
    """
    try:
        # Only the first segment is an f-string: ``{text}`` must reach
        # PromptTemplate as a literal placeholder.
        prompt_template = (
            f"\nSummarize the following text in a {summary_style} manner. "
            "Focus on the main points and key details:\n"
            "{text}\n"
            "Summary:\n"
        )
        prompt = PromptTemplate(template=prompt_template, input_variables=["text"])
        chain = load_summarize_chain(llm, chain_type="stuff", prompt=prompt)
        # Pass the documents alone; adding a keyword argument here makes
        # ``run`` raise before any generation happens.
        return chain.run(documents)
    except Exception as e:
        logger.error(f"Error summarizing report: {e}")
        return None
def main():
    """Run the Streamlit report-summarizer page.

    Reads the model name, summary style and PDF upload from the sidebar,
    loads the language and embedding models, splits the PDF into chunks,
    builds a vector store, and generates a summary when the "Summarize"
    button is pressed. Each failure is reported on the page and ends the run.
    """
    st.title("Report Summarizer")
    model_option = st.sidebar.text_input("Enter model name", value=DEFAULT_MODEL)
    summary_style = st.sidebar.selectbox(
        "Summary style",
        options=["clear and concise", "formal", "informal", "bullet points"],
    )
    uploaded_file = st.sidebar.file_uploader("Upload your Report", type="pdf")

    # Load once with a default length so a bad model name is reported
    # before any PDF work is done.
    llm = load_llm(model_option, 1024)
    if llm is None:
        st.error(f"Failed to load the model {model_option}. Please try another model.")
        return

    embeddings = load_embeddings(EMBEDDING_MODEL)
    if embeddings is None:
        st.error("Failed to load embeddings. Please try again later.")
        return

    if not uploaded_file:
        return

    with st.spinner("Processing PDF..."):
        documents = process_pdf(uploaded_file)
    if not documents:
        st.error("Failed to process the PDF or no text could be extracted.")
        return

    with st.spinner("Creating vector store..."):
        db = create_vector_store(documents, embeddings)
    if db is None:
        st.error("Failed to create the vector store.")
        return

    if not st.button("Summarize"):
        return

    # Size the generation budget from the input word count, never below 1
    # so that very short documents still get a valid length.
    input_length = sum(len(doc.page_content.split()) for doc in documents)
    max_length = max(1, int(input_length * MAX_LENGTH_FRACTION))

    # Reload the model with the calculated max_length; the reload can fail
    # independently of the first load, so check it again.
    llm = load_llm(model_option, max_length)
    if llm is None:
        st.error(f"Failed to load the model {model_option}. Please try another model.")
        return

    with st.spinner(f"Generating summary using {model_option}..."):
        summary = summarize_report(documents, llm, max_length, summary_style)

    if summary:
        st.subheader("Summary:")
        st.write(summary)
    else:
        st.warning("Failed to generate summary. Please try again.")


# Script entry point: build the page when the file is executed directly.
if __name__ == "__main__":
    main()
|