pendar02 committed on
Commit
dee9a31
·
verified ·
1 Parent(s): 3ffe379

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +220 -152
app.py CHANGED
@@ -23,6 +23,10 @@ st.set_page_config(
23
  )
24
 
25
  # Initialize session state
 
 
 
 
26
  if 'processed_data' not in st.session_state:
27
  st.session_state.processed_data = None
28
  if 'summaries' not in st.session_state:
@@ -39,6 +43,8 @@ if 'current_tokenizer' not in st.session_state:
39
  st.session_state.current_tokenizer = None
40
  if 'model_type' not in st.session_state:
41
  st.session_state.model_type = None
 
 
42
 
43
 
44
  # TextProcessor class definition
@@ -193,142 +199,156 @@ def validate_excel_structure(df):
193
 
194
  return len(validation_messages) == 0, validation_messages
195
 
 
 
196
  def preprocess_text(text):
197
- """Enhanced text preprocessing with improved header and list handling"""
198
  if not isinstance(text, str) or not text.strip():
199
  return text
200
 
201
- # Initial cleanup
202
- text = re.sub(r'\s+', ' ', text.strip())
203
 
204
- # Standardize case for specific terms (e.g., PRIME -> Prime)
205
- text = re.sub(r'\b([A-Z]{2,})\b', lambda m: m.group(1).title(), text)
 
206
 
207
- # Fix spacing around punctuation and parentheses
208
- text = re.sub(r'\s*:\s*', ': ', text)
209
- text = re.sub(r'\s*,\s*', ', ', text)
210
- text = re.sub(r'\(\s*([ivx\d]+)\s*\)', r'(\1)', text)
211
 
212
- # Convert numbered lists to consistent format
213
- text = re.sub(r'(?m)^\s*(\d+)\.\s*', r'(\1) ', text)
214
 
215
- # Normalize section headers (using comprehensive patterns)
 
 
 
 
216
  section_patterns = {
217
- r'\b(?:Introduction|Background|Objectives|Purpose|Context)\s*:': 'Background and Objectives: ',
218
- r'\b(?:Methods|Materials and Methods|Approach|Study Design|Experimental Design)\s*:': 'Methods: ',
219
- r'\b(?:Results|Findings|Observations|Key Findings)\s*:': 'Results: ',
220
- r'\b(?:Discussion|Analysis|Implications|Interpretation)\s*:': 'Discussion: ',
221
- r'\b(?:Conclusion|Conclusions|Summary|Final Remarks)\s*:': 'Conclusions: '
 
222
  }
223
 
224
- # Remove nested headers
225
- nested_header_pattern = r'\d+\.\s*(?:Background|Objectives|Methods|Results|Discussion|Conclusions)\s*:'
226
- text = re.sub(nested_header_pattern, '', text)
227
-
228
- # Standardize section headers
229
  for pattern, replacement in section_patterns.items():
230
  text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)
231
 
232
- # Split merged section headers
233
- text = re.sub(r'(?i)Results\s+and\s+Conclusions:', 'Results: ', text)
 
 
234
 
235
- # Handle special characters and normalize spacing
236
- text = re.sub(r'[“”]', '"', text) # Correctly handle double quotes
237
- text = re.sub(r"[‘’]", "'", text) # Correctly handle single quotes
238
- text = re.sub(r'\s*-\s*', '-', text)
239
 
240
- # Tokenize and capitalize sentences
241
- sentences = re.split(r'(?<=\w[.!?])\s+|\n(?=\d+\.|\(\w+\)|-)', text)
242
- formatted_sentences = [s.strip().capitalize() for s in sentences if s.strip()]
 
 
 
 
 
 
243
 
244
- return ' '.join(formatted_sentences)
245
-
246
-
247
-
248
- def post_process_summary(summary):
249
- """Enhanced summary post-processing with improved formatting."""
250
- if not summary:
251
- return summary
252
-
253
- # Step 1: Remove empty or redundant headers
254
- summary = re.sub(r'\b(?:Background|Objectives|Methods|Results|Conclusions)\s*:\s*\.?\s*', '', summary)
255
-
256
- # Step 2: Fix spacing issues in lists and parentheses
257
- summary = re.sub(r'\(\s*([ivx\d]+)\s*\)', r'(\1)', summary) # Fix space inside parentheses
258
- summary = re.sub(r'\s*,\s*(\([ivx\d]+\))', r', \1', summary) # Fix spacing before list items
259
-
260
- # Step 3: Ensure proper punctuation and spacing
261
- summary = re.sub(r'(?<=[.!?])\s*([A-Z])', r' \1', summary) # Add space after punctuation
262
- summary = re.sub(r'\s*:\s*', ': ', summary) # Fix spacing around colons
263
-
264
- # Step 4: Remove sections with too little content
265
- sections = [s.strip() for s in summary.split('\n') if len(s.split()) > 3]
266
- summary = ' '.join(sections)
267
-
268
- # Step 5: Remove multiple periods
269
- summary = re.sub(r'\.\.+', '.', summary)
270
-
271
- # Step 6: Ensure summary ends with a single period
272
- summary = summary.strip()
273
- if not summary.endswith('.'):
274
- summary += '.'
275
-
276
- return summary
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
277
 
278
 
279
  def generate_focused_summary(question, abstracts, model, tokenizer):
280
- """Generate a structured summary based on the given question and abstracts."""
281
- # Preprocess and clean abstracts
282
  formatted_abstracts = [preprocess_text(abstract) for abstract in abstracts if abstract.strip()]
283
-
284
- if not formatted_abstracts:
285
- raise ValueError("Abstracts list is empty or improperly formatted.")
286
-
287
- # Join abstracts with separator
288
  abstracts_content = " [SEP] ".join(formatted_abstracts)
289
-
290
- # Create the prompt
291
  prompt = f"""
292
- Generate a structured summary based on the given abstracts and the question. Follow these rules STRICTLY:
293
- **QUESTION:** {question}
294
- **SECTION FORMATTING RULES:**
295
- 1. Each section MUST start with the section name followed by ": " (e.g., "Background: ").
296
- 2. Each section MUST end with a period.
297
- 3. Write complete, grammatically correct sentences.
298
- 4. Do not use bullet points, lists, or combined section headers.
299
- 5. Maintain the exact order of sections: Background, Objectives, Methods, Results, Conclusions.
300
- 6. Avoid redundancies, incomplete thoughts, and cutting sentences mid-way.
301
- 7. Use transition words (e.g., "Additionally," "Furthermore," "Moreover") to connect ideas naturally.
302
- **REQUIRED SECTIONS AND CONTENT:**
303
- 1. **Background**:
304
- - Provide the context and motivation for the study.
305
- - Do not mention objectives, methods, or results in this section.
306
- 2. **Objectives**:
307
- - Clearly state the aim(s) of the study.
308
- - Avoid referencing any methods or findings.
309
- 3. **Methods**:
310
- - Describe the approach, tools, and procedures used.
311
- - Do not include any findings or results in this section.
312
- 4. **Results**:
313
- - Summarize the key findings, including relevant statistics and outcomes.
314
- - Mention implications only if explicitly stated in the abstracts.
315
- 5. **Conclusions**:
316
- - Highlight the overall interpretation of findings.
317
- - Emphasize the significance and implications of the study.
318
- **CRITICAL FORMAT RULES:**
319
- 1. Each section title must be followed by a colon and a space.
320
- 2. All sentences must be grammatically complete and coherent.
321
- 3. Avoid bullet points, lists, and repeated sections.
322
- 4. End each section with a period.
323
- **INPUT ABSTRACTS:** {abstracts_content}
324
- """
325
 
326
- # Tokenize input (use the correct variable `prompt` here)
327
- inputs = tokenizer(prompt,
328
- return_tensors="pt",
329
- max_length=1024,
330
- truncation=True)
 
 
 
 
331
 
 
332
  inputs = {k: v.to(model.device) for k, v in inputs.items()}
333
 
334
  with torch.no_grad():
@@ -336,20 +356,80 @@ def generate_focused_summary(question, abstracts, model, tokenizer):
336
  **{
337
  "input_ids": inputs["input_ids"],
338
  "attention_mask": inputs["attention_mask"],
339
- "max_length": 280,
340
- "min_length": 100,
341
  "num_beams": 4,
342
  "length_penalty": 2.0,
343
- "no_repeat_ngram_size": 2,
344
  "temperature": 0.7,
345
  "do_sample": False
346
  }
347
  )
348
-
349
- summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
350
 
 
351
  return post_process_summary(summary)
352
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
353
 
354
  def process_papers_in_batches(df, model, tokenizer, batch_size=2):
355
  """Process papers in batches for better efficiency"""
@@ -619,54 +699,42 @@ def main():
619
  if not st.session_state.get('focused_summary_generated', False):
620
  try:
621
  with st.spinner("Analyzing relevant papers..."):
622
- # Initialize text processor if needed
623
  if st.session_state.text_processor is None:
624
  st.session_state.text_processor = TextProcessor()
625
-
626
- # Validate question
627
- if not question.strip():
628
- st.warning("Please enter a question first")
629
- return
630
-
631
- # Find relevant abstracts
632
  results = st.session_state.text_processor.find_most_relevant_abstracts(
633
  question,
634
  df['Abstract'].tolist(),
635
  top_k=5
636
  )
637
-
638
  if not results['top_indices']:
639
- st.warning("No relevant papers found for your question")
640
- return
641
-
642
- # Load question-focused model
643
- model, tokenizer = get_model("question_focused")
644
- if model is None or tokenizer is None:
645
  return
646
-
647
- # Generate focused summary
648
- try:
649
- relevant_abstracts = df['Abstract'].iloc[results['top_indices']].tolist()
650
- focused_summary = generate_focused_summary(
651
- question,
652
- relevant_abstracts,
653
- model,
654
- tokenizer
655
- )
656
-
657
- # Store results
658
- st.session_state.focused_summary = focused_summary
659
- st.session_state.relevant_papers = df.iloc[results['top_indices']]
660
- st.session_state.relevance_scores = results['scores']
661
- st.session_state.focused_summary_generated = True
662
-
663
- finally:
664
- # Cleanup second model
665
- cleanup_model(model, tokenizer)
666
-
667
  except Exception as e:
668
  st.error(f"Error generating focused summary: {str(e)}")
669
  reset_processing_state()
 
 
 
670
 
671
  # Display focused summary results
672
  if st.session_state.get('focused_summary_generated', False):
 
23
  )
24
 
25
  # Initialize session state
26
+ if 'relevant_papers' not in st.session_state:
27
+ st.session_state.relevant_papers = None
28
+ if 'relevance_scores' not in st.session_state:
29
+ st.session_state.relevance_scores = None
30
  if 'processed_data' not in st.session_state:
31
  st.session_state.processed_data = None
32
  if 'summaries' not in st.session_state:
 
43
  st.session_state.current_tokenizer = None
44
  if 'model_type' not in st.session_state:
45
  st.session_state.model_type = None
46
+ if 'focused_summary' not in st.session_state:
47
+ st.session_state.focused_summary = None
48
 
49
 
50
  # TextProcessor class definition
 
199
 
200
  return len(validation_messages) == 0, validation_messages
201
 
202
+
203
+
204
# Lower-case roman numerals recognised as list markers, mapped to arabic digits.
_ROMAN_TO_ARABIC = {
    'i': '1', 'ii': '2', 'iii': '3', 'iv': '4', 'v': '5',
    'vi': '6', 'vii': '7', 'viii': '8', 'ix': '9', 'x': '10',
}

# Ordered (pattern, canonical header) pairs. Every pattern REQUIRES a colon so
# that ordinary prose ("the results showed", "methodology") is never rewritten;
# combined headers are listed first so they are consumed as a single unit.
_SECTION_HEADER_PATTERNS = (
    (r'\bresults?\s+and\s+conclusions?\s*:\s*', 'Results: '),
    (r'\b(?:background\s+and\s+objectives?|introduction|purpose|background'
     r'|objectives?|context)\s*:\s*', 'Background: '),
    (r'\b(?:(?:materials?|patients|subjects)\s+and\s+methods?|methods?'
     r'|approach|study\s+design)\s*:\s*', 'Methods: '),
    (r'\b(?:results?|findings?|observations?)\s*:\s*', 'Results: '),
    (r'\b(?:conclusions?|summary|final\s+remarks?)\s*:\s*', 'Conclusions: '),
)


def _arabic_list_marker(match):
    """Return the arabic form of a roman list marker such as ``iv)``."""
    numeral = match.group(1)
    # Unknown numerals (e.g. "xi") are returned untouched.
    return f"{_ROMAN_TO_ARABIC.get(numeral.lower(), numeral)})"


def preprocess_text(text):
    """Clean an abstract and standardise its section headers.

    Args:
        text: Raw abstract text. Non-string or blank values are returned
            unchanged so callers can pass spreadsheet cells through as-is.

    Returns:
        The cleaned text as a single line: whitespace collapsed, roman list
        markers converted to digits, section headers mapped onto
        ``Background`` / ``Methods`` / ``Results`` / ``Conclusions``, spacing
        around punctuation normalised and terminal punctuation guaranteed.
    """
    if not isinstance(text, str) or not text.strip():
        return text

    # Collapse every run of whitespace (including newlines) to one space.
    text = ' '.join(text.split())

    # "(iv)" / "iv)" -> "(4)" / "4)"; the word boundary keeps words that merely
    # end in i/v/x (e.g. "(max)") intact.
    text = re.sub(r'\b([ivx]+)\)', _arabic_list_marker, text, flags=re.IGNORECASE)

    # Map header synonyms onto the four canonical section names.
    for pattern, header in _SECTION_HEADER_PATTERNS:
        text = re.sub(pattern, header, text, flags=re.IGNORECASE)

    # Close a section that runs straight into the next header without any
    # terminal punctuation ("... is common Methods: ..." -> "... is common. Methods: ...").
    text = re.sub(
        r'(?<=[^\s.!?:;,])\s+(?=(?:Background|Methods|Results|Conclusions): )',
        '. ',
        text,
    )

    # Literal "\n" / "\r" escape sequences left over from serialised text.
    text = re.sub(r'\\n|\\r', ' ', text)
    # Bullet and check-mark glyphs carry no content.
    text = re.sub(r'[•*■□→✓]', '', text)
    # Normalise colon spacing, but leave ratios and times ("1:2", "10:30") alone.
    text = re.sub(r'\s*:(?!\d)\s*', ': ', text)
    text = re.sub(r'\s*\(\s*', ' (', text)
    text = re.sub(r'\s*\)\s*', ') ', text)
    # The rule above leaves "(x) ." - pull following punctuation back in.
    text = re.sub(r'\)\s+(?=[.,;:!?])', ')', text)
    text = re.sub(r'\s+', ' ', text)

    # Compact statistical notation: "p < 0.05" -> "p<0.05", "50 %" -> "50%".
    text = re.sub(
        r'\bp\s*[<=>]\s*0\.\d+',
        lambda m: m.group().replace(' ', ''),
        text,
        flags=re.IGNORECASE,
    )
    text = re.sub(r'(?<=\d)\s*%', '%', text)

    # Restore the space after abbreviations glued to the next word.
    text = re.sub(r'\bvs\.(?=\w)', 'vs. ', text)
    text = re.sub(r'\bet\s+al\.(?=\w)', 'et al. ', text)

    # Collapse repeated terminal punctuation ("..", "!!").
    text = re.sub(r'([.!?])\1+', r'\1', text)
    # Separate sentences that were glued together; requiring a lower-case
    # letter after the capital keeps acronyms such as "U.S.A." intact.
    text = re.sub(r'(?<=[.!?])(?=[A-Z][a-z])', ' ', text)

    text = text.strip()
    # Only append a period when the text does not already end a sentence.
    if text and not text.endswith(('.', '!', '?')):
        text += '.'

    return text
277
+
278
+ # """Enhanced text preprocessing with better section handling and prompt removal."""
279
+ # if not isinstance(text, str) or not text.strip():
280
+ # return text
281
+
282
+ # # Remove prompt leakage
283
+ # prompt_patterns = [
284
+ # r'Generate a structured summary addressing this question:.*?(?=\w+:)',
285
+ # r'Focus on key findings and methods\.',
286
+ # r'is a structured summary addressing this question:'
287
+ # ]
288
+ # for pattern in prompt_patterns:
289
+ # text = re.sub(pattern, '', text, flags=re.IGNORECASE)
290
+
291
+ # # Clean section headers more aggressively
292
+ # section_patterns = {
293
+ # r'\b(?:introduction|purpose|background|objectives?|context)\s*:?\s*': 'Background: ',
294
+ # r'\b(?:materials?\s+and\s+methods?|methods?|approach|study\s+design)\s*:?\s*': 'Methods: ',
295
+ # r'\b(?:results?|findings?|observations?)\s*:?\s*': 'Results: ',
296
+ # r'\b(?:conclusions?|summary|final\s+remarks?)\s*:?\s*': 'Conclusions: '
297
+ # }
298
+
299
+ # # Apply section normalization
300
+ # for pattern, replacement in section_patterns.items():
301
+ # text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)
302
+
303
+ # # Remove combined section headers
304
+ # combined_headers = [
305
+ # r'\bmethods?\s+and\s+conclusions?\b',
306
+ # r'\bresults?\s+and\s+conclusions?\b',
307
+ # r'\bmaterials?\s+and\s+methods?\b'
308
+ # ]
309
+ # for pattern in combined_headers:
310
+ # text = re.sub(pattern, 'Methods:', text, flags=re.IGNORECASE)
311
+
312
+ # # Clean up sentences
313
+ # sentences = text.split('.')
314
+ # cleaned_sentences = []
315
+ # for sentence in sentences:
316
+ # # Remove redundant section references
317
+ # sentence = re.sub(r'\b(?:first|second|third|fourth|fifth)\s+sections?\b', '', sentence, flags=re.IGNORECASE)
318
+ # # Remove comparative phrases about section details
319
+ # sentence = re.sub(r'\b(?:more|less)\s+detailed\s+than.*', '', sentence, flags=re.IGNORECASE)
320
+ # if sentence.strip():
321
+ # cleaned_sentences.append(sentence.strip())
322
+
323
+ # # Rejoin and format
324
+ # text = '. '.join(cleaned_sentences)
325
+ # text = re.sub(r'\s+', ' ', text) # Remove extra spaces
326
+ # text = re.sub(r'\s*:\s*', ': ', text) # Fix spacing around colons
327
+
328
+ # return text.strip()
329
 
330
 
331
  def generate_focused_summary(question, abstracts, model, tokenizer):
 
 
332
  formatted_abstracts = [preprocess_text(abstract) for abstract in abstracts if abstract.strip()]
 
 
 
 
 
333
  abstracts_content = " [SEP] ".join(formatted_abstracts)
 
 
334
  prompt = f"""
335
+ Provide a factual summary structured as:
336
+ - Background: Context and origin only if present
337
+ - Methods: Key procedures and approaches
338
+ - Results: Specific findings with numbers
339
+ - Conclusions: Main implications
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
340
 
341
+ Requirements:
342
+ - Present sections sequentially
343
+ - Merge related points within sections
344
+ - Complete all sentences
345
+ - Avoid repeating section headers
346
+ - Use original terminology
347
+
348
+ Content: {abstracts_content}
349
+ """
350
 
351
+ inputs = tokenizer(prompt, return_tensors="pt", max_length=1024, truncation=True)
352
  inputs = {k: v.to(model.device) for k, v in inputs.items()}
353
 
354
  with torch.no_grad():
 
356
  **{
357
  "input_ids": inputs["input_ids"],
358
  "attention_mask": inputs["attention_mask"],
359
+ "max_length": 512,
360
+ "min_length": 200,
361
  "num_beams": 4,
362
  "length_penalty": 2.0,
363
+ "no_repeat_ngram_size": 3,
364
  "temperature": 0.7,
365
  "do_sample": False
366
  }
367
  )
 
 
368
 
369
+ summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
370
  return post_process_summary(summary)
371
 
372
# Canonical section names, in the order they must appear in the output.
_SUMMARY_SECTIONS = ('Background', 'Methods', 'Results', 'Conclusions')

# A section header is one of the canonical names (singular or plural)
# immediately followed by a colon; bare mentions in prose do not match.
_SUMMARY_HEADER_RE = re.compile(
    r'\b(background|methods?|results?|conclusions?)\s*:', re.IGNORECASE
)

_SUMMARY_CANONICAL = {
    'background': 'Background',
    'method': 'Methods',
    'result': 'Results',
    'conclusion': 'Conclusions',
}


def _tidy_section_text(content):
    """Normalise whitespace, periods and capitalisation of one text fragment."""
    content = ' '.join(content.split())
    if not content:
        return ''
    # Collapse runs of periods left by generation artefacts.
    content = re.sub(r'\.{2,}', '.', content)
    # Only close fragments that look like a real sentence (three or more
    # words); very short fragments are left as generated.
    if not content.endswith(('.', '!', '?')) and len(content.split()) >= 3:
        content += '.'
    return content[0].upper() + content[1:]


def post_process_summary(summary):
    """Re-assemble a generated summary into ordered, well-formed sections.

    Args:
        summary: Decoded model output. Falsy values are returned unchanged.

    Returns:
        ``"Background: ... Methods: ... Results: ... Conclusions: ..."`` with
        only the sections that have content, in that fixed order. Text before
        the first header is dropped. Content of a repeated header is merged
        into one section. When the text contains no recognised header at all,
        the tidied text is returned instead of an empty string.
    """
    if not summary:
        return summary

    # Fold combined headers into a single "Results:" header.
    summary = re.sub(r'\bresults?\s*:\s*and\s*conclusions?\s*:', 'Results:',
                     summary, flags=re.IGNORECASE)
    summary = re.sub(r'\bresults?\s*and\s*conclusions?\s*:', 'Results:',
                     summary, flags=re.IGNORECASE)

    headers = list(_SUMMARY_HEADER_RE.finditer(summary))
    if not headers:
        # Unstructured output: keep the content rather than returning nothing.
        return _tidy_section_text(summary)

    # Each section body runs from the end of its header to the start of the
    # next header. Slicing the original string (instead of splitting on ".")
    # keeps decimals such as "0.05" and sentence punctuation intact.
    stops = [header.start() for header in headers[1:]] + [len(summary)]
    collected = {}
    for header, stop in zip(headers, stops):
        key = header.group(1).lower()
        if key.endswith('s'):
            key = key[:-1]
        body = _tidy_section_text(summary[header.end():stop])
        if body:
            collected.setdefault(_SUMMARY_CANONICAL[key], []).append(body)

    return ' '.join(
        f"{section}: {' '.join(collected[section])}"
        for section in _SUMMARY_SECTIONS
        if section in collected
    )
431
+
432
+
433
 
434
  def process_papers_in_batches(df, model, tokenizer, batch_size=2):
435
  """Process papers in batches for better efficiency"""
 
699
  if not st.session_state.get('focused_summary_generated', False):
700
  try:
701
  with st.spinner("Analyzing relevant papers..."):
 
702
  if st.session_state.text_processor is None:
703
  st.session_state.text_processor = TextProcessor()
704
+
705
+ model, tokenizer = get_model("question_focused")
706
+ if model is None or tokenizer is None:
707
+ raise Exception("Failed to load question-focused model")
708
+
 
 
709
  results = st.session_state.text_processor.find_most_relevant_abstracts(
710
  question,
711
  df['Abstract'].tolist(),
712
  top_k=5
713
  )
714
+
715
  if not results['top_indices']:
716
+ st.warning("No papers found relevant to your question")
 
 
 
 
 
717
  return
718
+
719
+ # Store relevant papers and scores
720
+ st.session_state.relevant_papers = df.iloc[results['top_indices']]
721
+ st.session_state.relevance_scores = results['scores']
722
+
723
+ relevant_abstracts = df['Abstract'].iloc[results['top_indices']].tolist()
724
+ st.session_state.focused_summary = generate_focused_summary(
725
+ question,
726
+ relevant_abstracts,
727
+ model,
728
+ tokenizer
729
+ )
730
+ st.session_state.focused_summary_generated = True
731
+
 
 
 
 
 
 
 
732
  except Exception as e:
733
  st.error(f"Error generating focused summary: {str(e)}")
734
  reset_processing_state()
735
+
736
+ finally:
737
+ cleanup_model(model, tokenizer)
738
 
739
  # Display focused summary results
740
  if st.session_state.get('focused_summary_generated', False):