pendar02 committed on
Commit
3ffe379
·
verified ·
1 Parent(s): cf44c2f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +258 -185
app.py CHANGED
@@ -1,12 +1,19 @@
1
  import streamlit as st
2
  import pandas as pd
3
  import torch
4
- import re
5
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
6
  from peft import PeftModel
7
  from text_processing import TextProcessor
8
  import gc
9
  from pathlib import Path
 
 
 
 
 
 
 
10
 
11
  # Configure page
12
  st.set_page_config(
@@ -26,6 +33,25 @@ if 'processing_started' not in st.session_state:
26
  st.session_state.processing_started = False
27
  if 'focused_summary_generated' not in st.session_state:
28
  st.session_state.focused_summary_generated = False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
 
30
  def load_model(model_type):
31
  """Load appropriate model based on type with proper memory management"""
@@ -72,6 +98,26 @@ def load_model(model_type):
72
  st.error(f"Error loading model: {str(e)}")
73
  raise
74
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
  def cleanup_model(model, tokenizer):
76
  """Properly cleanup model resources"""
77
  try:
@@ -82,9 +128,7 @@ def cleanup_model(model, tokenizer):
82
  except Exception:
83
  pass
84
 
85
-
86
  @st.cache_data
87
-
88
  def process_excel(uploaded_file):
89
  """Process uploaded Excel file"""
90
  try:
@@ -119,7 +163,6 @@ def process_excel(uploaded_file):
119
  st.error("Please check if your file is in the correct Excel format (.xlsx or .xls)")
120
  return None
121
 
122
-
123
  def validate_excel_structure(df):
124
  """Validate the structure and content of the Excel file"""
125
  validation_messages = []
@@ -150,147 +193,142 @@ def validate_excel_structure(df):
150
 
151
  return len(validation_messages) == 0, validation_messages
152
 
153
-
154
-
155
def preprocess_text(text):
    """Put each sentence of ``text`` on its own line before summarization.

    Args:
        text: Raw abstract text. Non-string or blank values are returned
            unchanged so callers can pass missing cells straight through.

    Returns:
        The text with one sentence per line, empty lines removed and each
        line stripped of surrounding whitespace.
    """
    if not isinstance(text, str) or not text.strip():
        return text

    # Break after any sentence terminator followed by horizontal whitespace.
    # The earlier implementation only recognised '. ', so sentences ending in
    # '?' or '!' (or separated by tabs) were left glued together.
    broken = re.sub(r'(?<=[.!?])[ \t]+', '\n', text)

    # Existing newlines are also boundaries; drop empty fragments.
    sentences = [line.strip() for line in broken.split('\n')]
    return '\n'.join(line for line in sentences if line)
170
-
171
def post_process_summary(summary):
    """Clean up a generated summary and improve its coherence.

    Args:
        summary: Decoded model output. Falsy values are returned unchanged.

    Returns:
        The summary as ``'. '``-joined sentences, each starting with an
        upper-case letter, with known redundant phrases removed, runs of
        whitespace collapsed and a trailing period ensured.
    """
    if not summary:
        return summary

    # Split on sentence-ending periods only. A period followed by a digit is
    # a decimal point ("45.5", "p<.05") and must not be treated as a boundary.
    fragments = re.split(r'\.(?!\d)', summary)

    processed_sentences = []
    for fragment in fragments:
        # Collapse any run of whitespace (including double spaces) to one space.
        sentence = ' '.join(fragment.split())
        if not sentence:
            continue

        # Remove redundant words/phrases the model is known to emit.
        sentence = sentence.replace(" and and ", " and ")
        sentence = sentence.replace("appointment and appointment", "appointment")

        # Fix a known recurring grammatical slip.
        sentence = sentence.replace("Cancers distress", "Cancer distress")

        # Upper-case only the first character; str.capitalize() would also
        # lower-case everything else and mangle acronyms and proper nouns.
        processed_sentences.append(sentence[0].upper() + sentence[1:])

    cleaned_summary = '. '.join(processed_sentences)
    if cleaned_summary and not cleaned_summary.endswith('.'):
        cleaned_summary += '.'

    return cleaned_summary
204
-
205
def improve_summary_generation(text, model, tokenizer):
    """Generate a structured summary of one abstract, retrying once if it fails validation.

    Args:
        text: The abstract to summarize.
        model: Seq2seq model exposing ``device`` and ``generate``.
        tokenizer: Tokenizer callable that also exposes ``decode``.

    Returns:
        The post-processed summary, or a fixed placeholder string when
        ``text`` is not a non-blank string. If the first summary fails
        ``validate_summary`` the second attempt is returned as-is (it is not
        validated again).
    """
    if not isinstance(text, str) or not text.strip():
        return "No abstract available to summarize."

    # Prefix the abstract with an explicit structure prompt.
    formatted_text = (
        "Summarize this medical research paper following this structure exactly:\n"
        "1. Background and objectives\n"
        "2. Methods\n"
        "3. Key findings with specific numbers/percentages\n"
        "4. Main conclusions\n"
        "Original text: " + preprocess_text(text)
    )

    inputs = tokenizer(formatted_text, return_tensors="pt", max_length=1024, truncation=True)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    def _summarize(**generation_kwargs):
        # One generate -> decode -> post-process pass; shared by both attempts
        # so the two parameter sets are the only thing that differs.
        with torch.no_grad():
            summary_ids = model.generate(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                **generation_kwargs,
            )
        decoded = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        return post_process_summary(decoded)

    processed_summary = _summarize(
        max_length=200,
        min_length=50,
        num_beams=5,
        length_penalty=1.5,
        no_repeat_ngram_size=3,
        temperature=0.7,
        repetition_penalty=1.5,
    )

    # If validation fails, try one more time with different parameters.
    if not validate_summary(processed_summary, text):
        processed_summary = _summarize(
            max_length=200,
            min_length=50,
            num_beams=4,
            length_penalty=2.0,
            no_repeat_ngram_size=4,
            temperature=0.8,
            repetition_penalty=2.0,
        )

    return processed_summary
265
-
266
def validate_summary(summary, original_text):
    """Heuristically check a generated summary against its source text.

    Args:
        summary: The generated summary string.
        original_text: The text the summary was generated from.

    Returns:
        ``False`` when the summary mentions more than one distinct
        "<number> year(s)" value, repeats sentences more than once, has fewer
        than 20 words, or is longer than 80% of the original's word count;
        ``True`` otherwise.
    """
    # Age inconsistencies: only *different* values conflict. Repeating the
    # same figure twice is not an inconsistency.
    age_mentions = re.findall(r'(\d+\.?\d*)\s*years?', summary.lower())
    if len(set(age_mentions)) > 1:
        return False

    # Repetitive sentences: count non-empty fragments only. split('.') always
    # yields a trailing '' for text ending in a period, which previously
    # inflated the duplicate count by one.
    sentences = [s.strip().lower() for s in summary.split('.') if s.strip()]
    if len(sentences) - len(set(sentences)) > 1:  # more than one duplicate
        return False

    # Length sanity check relative to the original.
    summary_words = len(summary.split())
    original_words = len(original_text.split())
    return 20 <= summary_words <= original_words * 0.8
286
-
287
- def generate_focused_summary(question, abstracts, model, tokenizer):
288
- """Generate focused summary based on question"""
289
- # Preprocess each abstract
290
- formatted_abstracts = [preprocess_text(abstract) for abstract in abstracts]
291
- combined_input = f"Question: {question} Abstracts: " + " [SEP] ".join(formatted_abstracts)
292
 
293
- inputs = tokenizer(combined_input, return_tensors="pt", max_length=1024, truncation=True)
294
  inputs = {k: v.to(model.device) for k, v in inputs.items()}
295
 
296
  with torch.no_grad():
@@ -298,15 +336,33 @@ def generate_focused_summary(question, abstracts, model, tokenizer):
298
  **{
299
  "input_ids": inputs["input_ids"],
300
  "attention_mask": inputs["attention_mask"],
301
- "max_length": 200,
302
- "min_length": 50,
303
  "num_beams": 4,
304
  "length_penalty": 2.0,
305
- "early_stopping": True
 
 
306
  }
307
  )
 
 
308
 
309
- return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
310
 
311
  def create_filter_controls(df, sort_column):
312
  """Create appropriate filter controls based on the selected column"""
@@ -367,6 +423,7 @@ def create_filter_controls(df, sort_column):
367
 
368
  return filtered_df
369
 
 
370
  def main():
371
  st.title("πŸ”¬ Biomedical Papers Analysis")
372
 
@@ -429,26 +486,26 @@ def main():
429
  # Individual Summaries Section
430
  st.header("πŸ“ Individual Paper Summaries")
431
 
 
432
  # Generate summaries if not already done
433
  if st.session_state.summaries is None:
434
  try:
435
  with st.spinner("Generating individual paper summaries..."):
436
- model, tokenizer = load_model("summarize")
437
- summaries = []
438
- progress_bar = st.progress(0)
 
439
 
440
- for idx, abstract in enumerate(df['Abstract']):
441
- summary = improve_summary_generation(abstract, model, tokenizer)
442
- summaries.append(summary)
443
- progress_bar.progress((idx + 1) / len(df))
444
-
445
- st.session_state.summaries = summaries
446
- cleanup_model(model, tokenizer)
447
- progress_bar.empty()
448
 
449
  except Exception as e:
450
  st.error(f"Error generating summaries: {str(e)}")
451
- st.session_state.processing_started = False
452
 
453
  # Display summaries with improved sorting and filtering
454
  if st.session_state.summaries is not None:
@@ -543,7 +600,7 @@ def main():
543
  </div>
544
  </div>
545
  """, unsafe_allow_html=True)
546
-
547
  with paper_info_cols[1]: # SUMMARY column
548
  st.markdown('<div class="paper-section"><div class="section-header">SUMMARY</div>', unsafe_allow_html=True)
549
  st.markdown(f"""
@@ -554,54 +611,68 @@ def main():
554
 
555
  # Add spacing between papers
556
  st.markdown("<div style='margin-bottom: 20px;'></div>", unsafe_allow_html=True)
557
-
558
- # Question-focused Summary Section (only if question provided)
559
- if question.strip():
560
- st.header("❓ Question-focused Summary")
561
-
562
- if not st.session_state.get('focused_summary_generated', False):
563
- try:
564
- with st.spinner("Analyzing relevant papers..."):
565
- # Initialize text processor if needed
566
- if st.session_state.text_processor is None:
567
- st.session_state.text_processor = TextProcessor()
568
-
569
- # Find relevant abstracts
570
- results = st.session_state.text_processor.find_most_relevant_abstracts(
571
- question,
572
- df['Abstract'].tolist(),
573
- top_k=5
574
- )
575
-
576
- # Load question-focused model
577
- model, tokenizer = load_model("question_focused")
578
-
579
- # Generate focused summary
580
- relevant_abstracts = df['Abstract'].iloc[results['top_indices']].tolist()
581
- focused_summary = generate_focused_summary(
582
- question,
583
- relevant_abstracts,
584
- model,
585
- tokenizer
586
- )
587
-
588
- # Store results
589
- st.session_state.focused_summary = focused_summary
590
- st.session_state.relevant_papers = df.iloc[results['top_indices']]
591
- st.session_state.relevance_scores = results['scores']
592
- st.session_state.focused_summary_generated = True
593
-
594
- # Cleanup second model
595
- cleanup_model(model, tokenizer)
596
 
597
- except Exception as e:
598
- st.error(f"Error generating focused summary: {str(e)}")
 
599
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
600
  # Display focused summary results
601
  if st.session_state.get('focused_summary_generated', False):
602
  st.subheader("Summary")
603
  st.write(st.session_state.focused_summary)
604
-
605
  st.subheader("Most Relevant Papers")
606
  relevant_papers = st.session_state.relevant_papers[
607
  ['Article Title', 'Authors', 'Publication Year', 'DOI']
@@ -609,6 +680,8 @@ def main():
609
  relevant_papers['Relevance Score'] = st.session_state.relevance_scores
610
  relevant_papers['Publication Year'] = relevant_papers['Publication Year'].astype(int)
611
  st.dataframe(relevant_papers, hide_index=True)
 
 
612
 
613
  if __name__ == "__main__":
614
  main()
 
1
  import streamlit as st
2
  import pandas as pd
3
  import torch
4
+ import re
5
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
6
  from peft import PeftModel
7
  from text_processing import TextProcessor
8
  import gc
9
  from pathlib import Path
10
+ import concurrent.futures
11
+ import time
12
+ import nltk
13
+ from nltk.tokenize import sent_tokenize
14
+ from concurrent.futures import ThreadPoolExecutor # Add this import
15
+
16
+ nltk.download('punkt')
17
 
18
  # Configure page
19
  st.set_page_config(
 
33
  st.session_state.processing_started = False
34
  if 'focused_summary_generated' not in st.session_state:
35
  st.session_state.focused_summary_generated = False
36
+ if 'current_model' not in st.session_state:
37
+ st.session_state.current_model = None
38
+ if 'current_tokenizer' not in st.session_state:
39
+ st.session_state.current_tokenizer = None
40
+ if 'model_type' not in st.session_state:
41
+ st.session_state.model_type = None
42
+
43
+
44
+ # TextProcessor class definition
45
# NOTE(review): the file also imports TextProcessor unconditionally near the
# top; if text_processing is missing that earlier import raises first and this
# fallback is never reached. Confirm which of the two imports should stay.
try:
    from text_processing import TextProcessor
except ImportError:
    class TextProcessor:
        """Minimal stand-in used when the ``text_processing`` module is unavailable."""

        def find_most_relevant_abstracts(self, question, abstracts, top_k=5):
            """Return the first ``top_k`` abstracts with a uniform score.

            No relevance ranking is performed: ``question`` is ignored and
            the leading abstracts are reported, each with score 1.0.

            Returns:
                dict with ``'top_indices'`` (list of int) and ``'scores'``
                (list of float) of equal length.
            """
            count = min(top_k, len(abstracts))
            return {
                'top_indices': list(range(count)),
                'scores': [1.0] * count,
            }
54
+
55
 
56
  def load_model(model_type):
57
  """Load appropriate model based on type with proper memory management"""
 
98
  st.error(f"Error loading model: {str(e)}")
99
  raise
100
 
101
def get_model(model_type):
    """Return the cached ``(model, tokenizer)`` for ``model_type``, loading it if needed.

    The pair is cached in ``st.session_state`` (``current_model``,
    ``current_tokenizer``, ``model_type``). When a different type is
    requested the cached pair is passed to ``cleanup_model`` before the new
    one is loaded.

    Args:
        model_type: Identifier forwarded to ``load_model``.

    Returns:
        ``(model, tokenizer)`` on success, or ``(None, None)`` after reporting
        the error through ``st.error`` and clearing ``processing_started``.
    """
    try:
        state = st.session_state
        if state.current_model is None or state.model_type != model_type:
            # Clean up existing model
            if state.current_model is not None:
                cleanup_model(state.current_model, state.current_tokenizer)
                # Forget the cleaned-up pair immediately. Otherwise a failure
                # in load_model below would leave it cached, and a later
                # request for the old type would hand back a model that has
                # already been through cleanup.
                state.current_model = None
                state.current_tokenizer = None
                state.model_type = None

            # Load new model
            model, tokenizer = load_model(model_type)
            state.current_model = model
            state.current_tokenizer = tokenizer
            state.model_type = model_type
        return state.current_model, state.current_tokenizer
    except Exception as e:
        st.error(f"Error loading model: {str(e)}")
        st.session_state.processing_started = False
        return None, None
120
+
121
  def cleanup_model(model, tokenizer):
122
  """Properly cleanup model resources"""
123
  try:
 
128
  except Exception:
129
  pass
130
 
 
131
  @st.cache_data
 
132
  def process_excel(uploaded_file):
133
  """Process uploaded Excel file"""
134
  try:
 
163
  st.error("Please check if your file is in the correct Excel format (.xlsx or .xls)")
164
  return None
165
 
 
166
  def validate_excel_structure(df):
167
  """Validate the structure and content of the Excel file"""
168
  validation_messages = []
 
193
 
194
  return len(validation_messages) == 0, validation_messages
195
 
 
 
196
def _respace_separator(text, separator):
    """Normalise spacing around ``separator`` to "none before, one after".

    A separator sitting directly between two digits ("1,200", "1:1", "12:30")
    is part of a number and is left untouched.
    """
    escaped = re.escape(separator)
    pattern = rf'((?<=\d){escaped}(?=\d))|\s*{escaped}\s*'
    return re.sub(
        pattern,
        lambda m: separator if m.group(1) else separator + ' ',
        text,
    )


def preprocess_text(text):
    """Normalise an abstract's whitespace, punctuation and section headers.

    Args:
        text: Raw abstract text. Non-string or blank values are returned
            unchanged.

    Returns:
        A single-line string with whitespace collapsed, all-caps words
        title-cased, punctuation spacing normalised, section headers mapped
        to a fixed vocabulary, typographic quotes replaced by ASCII quotes,
        and the first letter of each sentence upper-cased.
    """
    if not isinstance(text, str) or not text.strip():
        return text

    # Initial cleanup: every run of whitespace (newlines included) becomes a
    # single space, so the text is one line from here on.
    text = re.sub(r'\s+', ' ', text.strip())

    # Standardize case for all-caps words (e.g., PRIME -> Prime).
    # NOTE(review): this also rewrites real acronyms (DNA -> Dna); confirm
    # that is acceptable for the downstream model.
    text = re.sub(r'\b([A-Z]{2,})\b', lambda m: m.group(1).title(), text)

    # Fix spacing around punctuation and parentheses, without splitting
    # thousands separators, ratios or times.
    text = _respace_separator(text, ':')
    text = _respace_separator(text, ',')
    text = re.sub(r'\(\s*([ivx\d]+)\s*\)', r'(\1)', text)

    # Convert a leading numbered-list marker ("1. ") to "(1) ". After the
    # whitespace collapse above only the start of the text can match; the
    # (?!\d) guard keeps a leading decimal such as "0.5" intact.
    text = re.sub(r'^(\d+)\.(?!\d)\s*', r'(\1) ', text)

    # Normalize section headers (using comprehensive patterns)
    section_patterns = {
        r'\b(?:Introduction|Background|Objectives|Purpose|Context)\s*:': 'Background and Objectives: ',
        r'\b(?:Methods|Materials and Methods|Approach|Study Design|Experimental Design)\s*:': 'Methods: ',
        r'\b(?:Results|Findings|Observations|Key Findings)\s*:': 'Results: ',
        r'\b(?:Discussion|Analysis|Implications|Interpretation)\s*:': 'Discussion: ',
        r'\b(?:Conclusion|Conclusions|Summary|Final Remarks)\s*:': 'Conclusions: '
    }

    # Remove nested headers such as "2. Methods:" before standardising.
    nested_header_pattern = r'\d+\.\s*(?:Background|Objectives|Methods|Results|Discussion|Conclusions)\s*:'
    text = re.sub(nested_header_pattern, '', text)

    # Standardize section headers
    for pattern, replacement in section_patterns.items():
        text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)

    # Split merged section headers
    text = re.sub(r'(?i)Results\s+and\s+Conclusions:', 'Results: ', text)

    # Replace typographic quotes with ASCII quotes. Escapes are used so the
    # character classes cannot be corrupted by a file-encoding round trip.
    text = re.sub('[\u201c\u201d]', '"', text)
    text = re.sub('[\u2018\u2019]', "'", text)

    # Remove spaces around hyphens.
    # NOTE(review): this also joins spaced dashes ("a - b" -> "a-b").
    text = re.sub(r'\s*-\s*', '-', text)

    # Split into sentences and upper-case only the first character of each;
    # str.capitalize() would lower-case the rest, undoing the header and
    # proper-noun casing established above.
    sentences = re.split(r'(?<=\w[.!?])\s+', text)
    formatted_sentences = []
    for sentence in sentences:
        sentence = sentence.strip()
        if sentence:
            formatted_sentences.append(sentence[0].upper() + sentence[1:])

    # Header replacement and removal can leave doubled spaces behind.
    return re.sub(r' {2,}', ' ', ' '.join(formatted_sentences))
245
+
246
+
247
+
248
def post_process_summary(summary):
    """Tidy the formatting of a generated summary.

    Args:
        summary: Decoded model output. Falsy values are returned unchanged.

    Returns:
        A single-line summary with section headers stripped, list and
        punctuation spacing normalised, lines of three words or fewer
        dropped, and terminal punctuation ensured. Returns ``''`` when
        nothing substantive remains.
    """
    if not summary:
        return summary

    # Step 1: Strip section headers. Every occurrence of these header words
    # followed by a colon is removed, along with an immediately following
    # period, so empty sections ("Methods: .") disappear entirely.
    summary = re.sub(r'\b(?:Background|Objectives|Methods|Results|Conclusions)\s*:\s*\.?\s*', '', summary)

    # Step 2: Fix spacing issues in lists and parentheses
    summary = re.sub(r'\(\s*([ivx\d]+)\s*\)', r'(\1)', summary)  # no space inside (1)
    summary = re.sub(r'\s*,\s*(\([ivx\d]+\))', r', \1', summary)  # ", (2)" before list items

    # Step 3: Ensure proper punctuation and spacing
    summary = re.sub(r'(?<=[.!?])\s*([A-Z])', r' \1', summary)  # one space after sentence end
    # Normalise colons to ": " but leave digit:digit (ratios, times) intact.
    summary = re.sub(
        r'((?<=\d):(?=\d))|\s*:\s*',
        lambda m: ':' if m.group(1) else ': ',
        summary,
    )

    # Step 4: Remove sections (lines) with too little content
    sections = [s.strip() for s in summary.split('\n') if len(s.split()) > 3]
    summary = ' '.join(sections)

    # Step 5: Remove multiple periods
    summary = re.sub(r'\.\.+', '.', summary)

    # Step 6: Ensure the summary ends with terminal punctuation
    summary = summary.strip()
    if not summary:
        # Everything was filtered out; do not fabricate a lone ".".
        return ''
    if not summary.endswith(('.', '!', '?')):
        summary += '.'

    return summary
277
+
278
+
279
def generate_focused_summary(question, abstracts, model, tokenizer):
    """Generate a structured summary based on the given question and abstracts.

    Args:
        question: The question (or focus instruction) placed in the prompt.
        abstracts: Iterable of abstract texts. Entries that are not strings
            or are blank are skipped.
        model: Seq2seq model exposing ``device`` and ``generate``.
        tokenizer: Tokenizer callable that also exposes ``decode``.

    Returns:
        The decoded summary after ``post_process_summary``.

    Raises:
        ValueError: If no usable abstract remains after filtering.
    """
    # Preprocess and clean abstracts. The isinstance check guards against
    # non-string cells (e.g. NaN/None from a spreadsheet), on which .strip()
    # would raise AttributeError.
    formatted_abstracts = [
        preprocess_text(abstract)
        for abstract in abstracts
        if isinstance(abstract, str) and abstract.strip()
    ]

    if not formatted_abstracts:
        raise ValueError("Abstracts list is empty or improperly formatted.")

    # Join abstracts with separator
    abstracts_content = " [SEP] ".join(formatted_abstracts)

    # Create the prompt
    prompt = f"""
Generate a structured summary based on the given abstracts and the question. Follow these rules STRICTLY:
**QUESTION:** {question}
**SECTION FORMATTING RULES:**
1. Each section MUST start with the section name followed by ": " (e.g., "Background: ").
2. Each section MUST end with a period.
3. Write complete, grammatically correct sentences.
4. Do not use bullet points, lists, or combined section headers.
5. Maintain the exact order of sections: Background, Objectives, Methods, Results, Conclusions.
6. Avoid redundancies, incomplete thoughts, and cutting sentences mid-way.
7. Use transition words (e.g., "Additionally," "Furthermore," "Moreover") to connect ideas naturally.
**REQUIRED SECTIONS AND CONTENT:**
1. **Background**:
- Provide the context and motivation for the study.
- Do not mention objectives, methods, or results in this section.
2. **Objectives**:
- Clearly state the aim(s) of the study.
- Avoid referencing any methods or findings.
3. **Methods**:
- Describe the approach, tools, and procedures used.
- Do not include any findings or results in this section.
4. **Results**:
- Summarize the key findings, including relevant statistics and outcomes.
- Mention implications only if explicitly stated in the abstracts.
5. **Conclusions**:
- Highlight the overall interpretation of findings.
- Emphasize the significance and implications of the study.
**CRITICAL FORMAT RULES:**
1. Each section title must be followed by a colon and a space.
2. All sentences must be grammatically complete and coherent.
3. Avoid bullet points, lists, and repeated sections.
4. End each section with a period.
**INPUT ABSTRACTS:** {abstracts_content}
"""

    # Tokenize the prompt.
    # NOTE(review): the abstracts come after the long instruction block, so
    # truncation at 1024 tokens removes abstract text first.
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        max_length=1024,
        truncation=True,
    )
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    with torch.no_grad():
        # Deterministic beam search. `temperature` is deliberately not passed:
        # it only applies when sampling and do_sample is False here.
        summary_ids = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=280,
            min_length=100,
            num_beams=4,
            length_penalty=2.0,
            no_repeat_ngram_size=2,
            do_sample=False,
        )

    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    return post_process_summary(summary)
352
+
353
+
354
def process_papers_in_batches(df, model, tokenizer, batch_size=2):
    """Summarize every abstract in ``df['Abstract']`` using a small thread pool.

    Args:
        df: Table whose ``'Abstract'`` column supports ``.tolist()``.
        model: Model forwarded to ``generate_focused_summary``.
        tokenizer: Tokenizer forwarded to ``generate_focused_summary``.
        batch_size: Accepted for backward compatibility; currently unused
            (each abstract is submitted as its own task).

    Returns:
        A list of summaries in the same order as the abstracts. Rows whose
        abstract is missing or blank get a fixed placeholder string instead
        of aborting the whole run.
    """
    abstracts = df['Abstract'].tolist()

    def _summarize(abstract):
        # A missing/blank cell would make generate_focused_summary raise and
        # discard every other paper's summary; report a placeholder instead.
        if not isinstance(abstract, str) or not abstract.strip():
            return "No abstract available to summarize."
        return generate_focused_summary(
            "Focus on key findings and methods.", [abstract], model, tokenizer
        )

    with ThreadPoolExecutor(max_workers=4) as executor:  # Parallel processing
        # Executor.map yields results in submission order and re-raises any
        # worker exception when its result is consumed.
        return list(executor.map(_summarize, abstracts))
365
+
366
 
367
  def create_filter_controls(df, sort_column):
368
  """Create appropriate filter controls based on the selected column"""
 
423
 
424
  return filtered_df
425
 
426
+
427
  def main():
428
  st.title("πŸ”¬ Biomedical Papers Analysis")
429
 
 
486
  # Individual Summaries Section
487
  st.header("πŸ“ Individual Paper Summaries")
488
 
489
+
490
  # Generate summaries if not already done
491
  if st.session_state.summaries is None:
492
  try:
493
  with st.spinner("Generating individual paper summaries..."):
494
+ model, tokenizer = get_model("summarize")
495
+ if model is None or tokenizer is None:
496
+ reset_processing_state()
497
+ return
498
 
499
+ start_time = time.time()
500
+ st.session_state.summaries = process_papers_in_batches(
501
+ df, model, tokenizer, batch_size=2
502
+ )
503
+ end_time = time.time()
504
+ st.write(f"Processing time: {end_time - start_time:.2f} seconds")
 
 
505
 
506
  except Exception as e:
507
  st.error(f"Error generating summaries: {str(e)}")
508
+ reset_processing_state()
509
 
510
  # Display summaries with improved sorting and filtering
511
  if st.session_state.summaries is not None:
 
600
  </div>
601
  </div>
602
  """, unsafe_allow_html=True)
603
+
604
  with paper_info_cols[1]: # SUMMARY column
605
  st.markdown('<div class="paper-section"><div class="section-header">SUMMARY</div>', unsafe_allow_html=True)
606
  st.markdown(f"""
 
611
 
612
  # Add spacing between papers
613
  st.markdown("<div style='margin-bottom: 20px;'></div>", unsafe_allow_html=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
614
 
615
+ # Question-focused Summary Section (only if question provided)
616
+ if question.strip():
617
+ st.header("❓ Question-focused Summary")
618
 
619
+ if not st.session_state.get('focused_summary_generated', False):
620
+ try:
621
+ with st.spinner("Analyzing relevant papers..."):
622
+ # Initialize text processor if needed
623
+ if st.session_state.text_processor is None:
624
+ st.session_state.text_processor = TextProcessor()
625
+
626
+ # Validate question
627
+ if not question.strip():
628
+ st.warning("Please enter a question first")
629
+ return
630
+
631
+ # Find relevant abstracts
632
+ results = st.session_state.text_processor.find_most_relevant_abstracts(
633
+ question,
634
+ df['Abstract'].tolist(),
635
+ top_k=5
636
+ )
637
+
638
+ if not results['top_indices']:
639
+ st.warning("No relevant papers found for your question")
640
+ return
641
+
642
+ # Load question-focused model
643
+ model, tokenizer = get_model("question_focused")
644
+ if model is None or tokenizer is None:
645
+ return
646
+
647
+ # Generate focused summary
648
+ try:
649
+ relevant_abstracts = df['Abstract'].iloc[results['top_indices']].tolist()
650
+ focused_summary = generate_focused_summary(
651
+ question,
652
+ relevant_abstracts,
653
+ model,
654
+ tokenizer
655
+ )
656
+
657
+ # Store results
658
+ st.session_state.focused_summary = focused_summary
659
+ st.session_state.relevant_papers = df.iloc[results['top_indices']]
660
+ st.session_state.relevance_scores = results['scores']
661
+ st.session_state.focused_summary_generated = True
662
+
663
+ finally:
664
+ # Cleanup second model
665
+ cleanup_model(model, tokenizer)
666
+
667
+ except Exception as e:
668
+ st.error(f"Error generating focused summary: {str(e)}")
669
+ reset_processing_state()
670
+
671
  # Display focused summary results
672
  if st.session_state.get('focused_summary_generated', False):
673
  st.subheader("Summary")
674
  st.write(st.session_state.focused_summary)
675
+
676
  st.subheader("Most Relevant Papers")
677
  relevant_papers = st.session_state.relevant_papers[
678
  ['Article Title', 'Authors', 'Publication Year', 'DOI']
 
680
  relevant_papers['Relevance Score'] = st.session_state.relevance_scores
681
  relevant_papers['Publication Year'] = relevant_papers['Publication Year'].astype(int)
682
  st.dataframe(relevant_papers, hide_index=True)
683
+
684
+
685
 
686
  if __name__ == "__main__":
687
  main()