Kathirsci committed on
Commit
ffb4b75
·
verified ·
1 Parent(s): 431a644

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +23 -39
app.py CHANGED
@@ -11,7 +11,7 @@ from langchain.chains.summarize import load_summarize_chain
11
  from langchain.schema import Document
12
  from langchain.text_splitter import RecursiveCharacterTextSplitter
13
  from langchain.prompts import PromptTemplate
14
- from transformers import pipeline
15
 
16
  # Set up logging
17
  logging.basicConfig(level=logging.INFO)
@@ -19,11 +19,11 @@ logger = logging.getLogger(__name__)
19
 
20
  # Constants
21
  EMBEDDING_MODEL = 'sentence-transformers/all-MiniLM-L6-v2'
22
- DEFAULT_MODEL = "microsoft/phi-2"
23
 
24
  # Check for GPU
25
  device = "cuda" if torch.cuda.is_available() else "cpu"
26
- print(f"Using device: {device}")
27
 
28
  @st.cache_resource
29
  def load_embeddings():
@@ -39,7 +39,9 @@ def load_embeddings():
39
  def load_llm(model_name):
40
  """Load and cache the language model."""
41
  try:
42
- pipe = pipeline("text-generation", model=model_name, device=device, max_length=1024)
 
 
43
  return HuggingFacePipeline(pipeline=pipe)
44
  except Exception as e:
45
  logger.error(f"Failed to load LLM: {e}")
@@ -55,13 +57,7 @@ def process_pdf(file) -> List[Document]:
55
 
56
  loader = PyPDFLoader(file_path=temp_file_path)
57
  pages = loader.load()
58
-
59
- # Check for empty documents
60
- if not pages:
61
- st.warning("No text extracted from the PDF. Please ensure it's a valid PDF file.")
62
- return []
63
-
64
- text_splitter = RecursiveCharacterTextSplitter(chunk_size=4000, chunk_overlap=200)
65
  documents = text_splitter.split_documents(pages)
66
  return documents
67
  except Exception as e:
@@ -82,30 +78,14 @@ def summarize_report(documents: List[Document], llm) -> str:
82
  """Summarize the report using the loaded model."""
83
  try:
84
  prompt_template = """
85
- <s>[INST] You are an advanced AI assistant with expertise in summarizing technical documents. Your goal is to create a clear, concise, and well-organized summary using Markdown formatting. Focus on extracting and presenting the essential points of the document effectively.
86
- *Instructions:*
87
- - Analyze the provided context and input carefully.
88
- - Identify and highlight the key points, main arguments, and important details.
89
- - Format the summary using Markdown for clarity:
90
- - Use # for main headers and ## for subheaders.
91
- - Use **text** for important terms or concepts.
92
- - Provide a brief introduction, followed by the main points, and a concluding summary if applicable.
93
- - Ensure the summary is easy to read and understand, avoiding unnecessary jargon.
94
- *Example Summary Format:*
95
- # Overview
96
- *Document Title:* Technical Analysis Report
97
- *Summary:*
98
- The report provides an in-depth analysis of the recent technical advancements in AI. It covers key areas such as ...
99
- # Key Findings
100
- - *Finding 1:* Description of finding 1.
101
- - *Finding 2:* Description of finding 2.
102
- # Conclusion
103
- The analysis highlights the significant advancements and future directions for AI technology.
104
- *Your Response:* [/INST]</s> {input}
105
- Context: {context}
106
  """
107
 
108
- prompt = PromptTemplate.from_template(prompt_template)
109
  chain = load_summarize_chain(llm, chain_type="stuff", prompt=prompt)
110
  summary = chain.run(documents)
111
  return summary
@@ -118,14 +98,18 @@ def summarize_report(documents: List[Document], llm) -> str:
118
  def main():
119
  st.title("Report Summarizer")
120
 
121
- model_option = st.sidebar.selectbox("Llm Model", options=["ChocoWu/nextgpt_7b_tiva_v0", "google-t5/t5-11b"])
122
 
123
  uploaded_file = st.sidebar.file_uploader("Upload your Report", type="pdf")
124
 
125
  llm = load_llm(model_option)
126
- embeddings = load_embeddings()
 
 
127
 
128
- if not llm or not embeddings:
 
 
129
  return
130
 
131
  if uploaded_file:
@@ -137,12 +121,12 @@ def main():
137
  db = create_vector_store(documents, embeddings)
138
 
139
  if db and st.button("Summarize"):
140
- with st.spinner(f"Generating structured summary using {model_option}..."):
141
  summary = summarize_report(documents, llm)
142
 
143
  if summary:
144
- st.subheader("Structured Summary:")
145
- st.markdown(summary)
146
  else:
147
  st.warning("Failed to generate summary. Please try again.")
148
 
 
11
  from langchain.schema import Document
12
  from langchain.text_splitter import RecursiveCharacterTextSplitter
13
  from langchain.prompts import PromptTemplate
14
+ from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
15
 
16
  # Set up logging
17
  logging.basicConfig(level=logging.INFO)
 
19
 
20
  # Constants
21
  EMBEDDING_MODEL = 'sentence-transformers/all-MiniLM-L6-v2'
22
+ DEFAULT_MODEL = "distilgpt2" # A smaller model that's more likely to work in Spaces
23
 
24
  # Check for GPU
25
  device = "cuda" if torch.cuda.is_available() else "cpu"
26
+ st.sidebar.write(f"Using device: {device}")
27
 
28
  @st.cache_resource
29
  def load_embeddings():
 
39
  def load_llm(model_name):
40
  """Load and cache the language model."""
41
  try:
42
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
43
+ model = AutoModelForCausalLM.from_pretrained(model_name)
44
+ pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device=device, max_length=512)
45
  return HuggingFacePipeline(pipeline=pipe)
46
  except Exception as e:
47
  logger.error(f"Failed to load LLM: {e}")
 
57
 
58
  loader = PyPDFLoader(file_path=temp_file_path)
59
  pages = loader.load()
60
+ text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100)
 
 
 
 
 
 
61
  documents = text_splitter.split_documents(pages)
62
  return documents
63
  except Exception as e:
 
78
  """Summarize the report using the loaded model."""
79
  try:
80
  prompt_template = """
81
+ Summarize the following text in a clear and concise manner:
82
+
83
+ {text}
84
+
85
+ Summary:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
  """
87
 
88
+ prompt = PromptTemplate(template=prompt_template, input_variables=["text"])
89
  chain = load_summarize_chain(llm, chain_type="stuff", prompt=prompt)
90
  summary = chain.run(documents)
91
  return summary
 
98
  def main():
99
  st.title("Report Summarizer")
100
 
101
+ model_option = st.sidebar.text_input("Enter model name", value=DEFAULT_MODEL)
102
 
103
  uploaded_file = st.sidebar.file_uploader("Upload your Report", type="pdf")
104
 
105
  llm = load_llm(model_option)
106
+ if not llm:
107
+ st.error(f"Failed to load the model {model_option}. Please try another model.")
108
+ return
109
 
110
+ embeddings = load_embeddings()
111
+ if not embeddings:
112
+ st.error("Failed to load embeddings. Please try again later.")
113
  return
114
 
115
  if uploaded_file:
 
121
  db = create_vector_store(documents, embeddings)
122
 
123
  if db and st.button("Summarize"):
124
+ with st.spinner(f"Generating summary using {model_option}..."):
125
  summary = summarize_report(documents, llm)
126
 
127
  if summary:
128
+ st.subheader("Summary:")
129
+ st.write(summary)
130
  else:
131
  st.warning("Failed to generate summary. Please try again.")
132