pendar02 committed
Commit e1116a3 · verified · 1 Parent(s): b25dfc8

Update app.py

Files changed (1):
  1. app.py +274 -102
app.py CHANGED
@@ -88,28 +88,179 @@ def process_excel(uploaded_file):
     try:
         df = pd.read_excel(uploaded_file)
         required_columns = ['Abstract', 'Article Title', 'Authors',
-                            'Source Title', 'Publication Year', 'DOI', 'Times Cited, All Databases']
-
+                            'Source Title', 'Publication Year', 'DOI', 'Times Cited, All Databases']
+
         # Check required columns
         missing_columns = [col for col in required_columns if col not in df.columns]
         if missing_columns:
             st.error(f"Missing required columns: {', '.join(missing_columns)}")
             return None
-
+
         return df[required_columns]
     except Exception as e:
         st.error(f"Error processing file: {str(e)}")
         return None
 
-# Define preprocess_text, post_process_summary, improve_summary_generation,
-# validate_summary, generate_focused_summary as is in the original code
-
-# Updated create_filter_controls to include the new "Time Cited" column
+def preprocess_text(text):
+    """Preprocess text to add appropriate formatting before summarization"""
+    if not isinstance(text, str) or not text.strip():
+        return text
+
+    # Split text into sentences (basic implementation)
+    sentences = [s.strip() for s in text.replace('. ', '.\n').split('\n')]
+
+    # Remove empty sentences
+    sentences = [s for s in sentences if s]
+
+    # Join with proper line breaks
+    formatted_text = '\n'.join(sentences)
+
+    return formatted_text
+
+def post_process_summary(summary):
+    """Clean up and improve summary coherence"""
+    if not summary:
+        return summary
+
+    # Split into sentences
+    sentences = [s.strip() for s in summary.split('.')]
+    sentences = [s for s in sentences if s]  # Remove empty sentences
+
+    # Fix common issues
+    processed_sentences = []
+    for i, sentence in enumerate(sentences):
+        # Remove redundant words/phrases
+        sentence = sentence.replace(" and and ", " and ")
+        sentence = sentence.replace("appointment and appointment", "appointment")
+
+        # Fix common grammatical issues
+        sentence = sentence.replace("Cancers distress", "Cancer distress")
+        sentence = sentence.replace("  ", " ")  # Remove double spaces
+
+        # Capitalize first letter of each sentence
+        sentence = sentence.capitalize()
+
+        # Add to processed sentences if not empty
+        if sentence.strip():
+            processed_sentences.append(sentence)
+
+    # Join sentences with proper spacing and punctuation
+    cleaned_summary = '. '.join(processed_sentences)
+    if cleaned_summary and not cleaned_summary.endswith('.'):
+        cleaned_summary += '.'
+
+    return cleaned_summary
+
+def improve_summary_generation(text, model, tokenizer):
+    """Generate improved summary with better prompt and validation"""
+    if not isinstance(text, str) or not text.strip():
+        return "No abstract available to summarize."
+
+    # Add a more specific prompt
+    formatted_text = (
+        "Summarize this medical research paper following this structure exactly:\n"
+        "1. Background and objectives\n"
+        "2. Methods\n"
+        "3. Key findings with specific numbers/percentages\n"
+        "4. Main conclusions\n"
+        "Original text: " + preprocess_text(text)
+    )
+
+    # Adjust generation parameters
+    inputs = tokenizer(formatted_text, return_tensors="pt", max_length=1024, truncation=True)
+    inputs = {k: v.to(model.device) for k, v in inputs.items()}
+
+    with torch.no_grad():
+        summary_ids = model.generate(
+            **{
+                "input_ids": inputs["input_ids"],
+                "attention_mask": inputs["attention_mask"],
+                "max_length": 200,
+                "min_length": 50,
+                "num_beams": 5,
+                "length_penalty": 1.5,
+                "no_repeat_ngram_size": 3,
+                "temperature": 0.7,
+                "repetition_penalty": 1.5
+            }
+        )
+
+    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
+
+    # Post-process the summary
+    processed_summary = post_process_summary(summary)
+
+    # Validate the summary
+    if not validate_summary(processed_summary, text):
+        # If validation fails, try one more time with different parameters
+        with torch.no_grad():
+            summary_ids = model.generate(
+                **{
+                    "input_ids": inputs["input_ids"],
+                    "attention_mask": inputs["attention_mask"],
+                    "max_length": 200,
+                    "min_length": 50,
+                    "num_beams": 4,
+                    "length_penalty": 2.0,
+                    "no_repeat_ngram_size": 4,
+                    "temperature": 0.8,
+                    "repetition_penalty": 2.0
+                }
+            )
+        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
+        processed_summary = post_process_summary(summary)
+
+    return processed_summary
+
+def validate_summary(summary, original_text):
+    """Validate summary content against original text"""
+    # Check for age inconsistencies
+    age_mentions = re.findall(r'(\d+\.?\d*)\s*years?', summary.lower())
+    if len(age_mentions) > 1:  # Multiple age mentions
+        return False
+
+    # Check for repetitive sentences
+    sentences = summary.split('.')
+    unique_sentences = set(s.strip().lower() for s in sentences if s.strip())
+    if len(sentences) - len(unique_sentences) > 1:  # More than one duplicate
+        return False
+
+    # Check summary isn't too long or too short compared to original
+    summary_words = len(summary.split())
+    original_words = len(original_text.split())
+    if summary_words < 20 or summary_words > original_words * 0.8:
+        return False
+
+    return True
+
+def generate_focused_summary(question, abstracts, model, tokenizer):
+    """Generate focused summary based on question"""
+    # Preprocess each abstract
+    formatted_abstracts = [preprocess_text(abstract) for abstract in abstracts]
+    combined_input = f"Question: {question} Abstracts: " + " [SEP] ".join(formatted_abstracts)
+
+    inputs = tokenizer(combined_input, return_tensors="pt", max_length=1024, truncation=True)
+    inputs = {k: v.to(model.device) for k, v in inputs.items()}
+
+    with torch.no_grad():
+        summary_ids = model.generate(
+            **{
+                "input_ids": inputs["input_ids"],
+                "attention_mask": inputs["attention_mask"],
+                "max_length": 200,
+                "min_length": 50,
+                "num_beams": 4,
+                "length_penalty": 2.0,
+                "early_stopping": True
+            }
+        )
+
+    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
 
 def create_filter_controls(df, sort_column):
     """Create appropriate filter controls based on the selected column"""
     filtered_df = df.copy()
-
+
     if sort_column == 'Publication Year':
         # Year range slider
         year_min = int(df['Publication Year'].min())
@@ -117,19 +268,19 @@ def create_filter_controls(df, sort_column):
         col1, col2 = st.columns(2)
         with col1:
             start_year = st.number_input('From Year',
-                                          min_value=year_min,
-                                          max_value=year_max,
-                                          value=year_min)
+                                          min_value=year_min,
+                                          max_value=year_max,
+                                          value=year_min)
         with col2:
             end_year = st.number_input('To Year',
-                                        min_value=year_min,
-                                        max_value=year_max,
-                                        value=year_max)
+                                        min_value=year_min,
+                                        max_value=year_max,
+                                        value=year_max)
         filtered_df = filtered_df[
             (filtered_df['Publication Year'] >= start_year) &
             (filtered_df['Publication Year'] <= end_year)
         ]
-
+
     elif sort_column == 'Authors':
         # Multi-select for authors
        unique_authors = sorted(set(
@@ -147,45 +298,58 @@ def create_filter_controls(df, sort_column):
                lambda x: any(author in str(x) for author in selected_authors)
            )
        ]
-
+
    elif sort_column == 'Source Title':
        # Multi-select for source titles
-        unique_sources = sorted(set(df['Source Title'].dropna()))
+        unique_sources = sorted(df['Source Title'].unique())
        selected_sources = st.multiselect(
            'Select Sources',
            unique_sources
        )
        if selected_sources:
            filtered_df = filtered_df[filtered_df['Source Title'].isin(selected_sources)]
-
-    elif sort_column == 'Times Cited':
-        # Sorting by citation count
-        col1, col2 = st.columns(2)
-        with col1:
-            order = st.radio('Sort by:', ['Most to Least Cited', 'Least to Most Cited'])
-        ascending = order == 'Least to Most Cited'
-        filtered_df = filtered_df.sort_values(by='Times Cited, All Databases', ascending=ascending)
-
+
    elif sort_column == 'Article Title':
        # Only alphabetical sorting, no filtering
        pass
 
+
+    elif sort_column == 'Times Cited':
+        # Cited count range slider
+        cited_min = int(df['Times Cited'].min())
+        cited_max = int(df['Times Cited'].max())
+        col1, col2 = st.columns(2)
+        with col1:
+            start_cited = st.number_input('From Cited Count',
+                                           min_value=cited_min,
+                                           max_value=cited_max,
+                                           value=cited_min)
+        with col2:
+            end_cited = st.number_input('To Cited Count',
+                                         min_value=cited_min,
+                                         max_value=cited_max,
+                                         value=cited_max)
+        filtered_df = filtered_df[
+            (filtered_df['Times Cited'] >= start_cited) &
+            (filtered_df['Times Cited'] <= end_cited)
+        ]
+
    return filtered_df
 
 def main():
    st.title("🔬 Biomedical Papers Analysis")
-
+
    # File upload section
    uploaded_file = st.file_uploader(
        "Upload Excel file containing papers",
        type=['xlsx', 'xls'],
-        help="File must contain: Abstract, Article Title, Authors, Source Title, Publication Year, DOI, Times Cited, All Databases"
+        help="File must contain: Abstract, Article Title, Authors, Source Title, Publication Year, DOI"
    )
-
+
    # Question input - moved up but hidden initially
    question_container = st.empty()
    question = ""
-
+
    if uploaded_file is not None:
        # Process Excel file
        if st.session_state.processed_data is None:
@@ -193,28 +357,28 @@ def main():
            df = process_excel(uploaded_file)
            if df is not None:
                st.session_state.processed_data = df.dropna(subset=["Abstract"])
-
+
        if st.session_state.processed_data is not None:
            df = st.session_state.processed_data
            st.write(f"📊 Loaded {len(df)} papers with abstracts")
-
+
            # Get question before processing
            with question_container:
                question = st.text_input(
                    "Enter your research question (optional):",
                    help="If provided, a question-focused summary will be generated after individual summaries"
                )
-
+
            # Single button for both processes
            if not st.session_state.get('processing_started', False):
                if st.button("Start Analysis"):
                    st.session_state.processing_started = True
-
+
            # Show processing status and results
            if st.session_state.get('processing_started', False):
                # Individual Summaries Section
-                st.header("🗒 Individual Paper Summaries")
-
+                st.header("📝 Individual Paper Summaries")
+
                # Generate summaries if not already done
                if st.session_state.summaries is None:
                    try:
@@ -222,56 +386,65 @@ def main():
                        model, tokenizer = load_model("summarize")
                        summaries = []
                        progress_bar = st.progress(0)
-
+
                        for idx, abstract in enumerate(df['Abstract']):
                            summary = improve_summary_generation(abstract, model, tokenizer)
                            summaries.append(summary)
                            progress_bar.progress((idx + 1) / len(df))
-
+
                        st.session_state.summaries = summaries
                        cleanup_model(model, tokenizer)
                        progress_bar.empty()
-
+
                    except Exception as e:
                        st.error(f"Error generating summaries: {str(e)}")
                        st.session_state.processing_started = False
-
+
                # Display summaries with improved sorting and filtering
                if st.session_state.summaries is not None:
-                    col1, col2 = st.columns(2)
-                    with col1:
-                        sort_options = ['Article Title', 'Authors', 'Publication Year', 'Source Title', 'Times Cited']
-                        sort_column = st.selectbox("Sort/Filter by:", sort_options)
-                    with col2:
-                        # Only show A-Z/Z-A option for Article Title
-                        if sort_column == 'Article Title':
-                            ascending = st.radio(
-                                "Sort order",
-                                ["A to Z", "Z to A"],
-                                horizontal=True
-                            ) == "A to Z"
-                        else:
-                            ascending = True  # Default for other columns
-
-                    # Create display dataframe
-                    display_df = df.copy()
-                    display_df['Summary'] = st.session_state.summaries
-                    display_df['Publication Year'] = display_df['Publication Year'].astype(int)
-
+                    col1, col2 = st.columns(2)
+                    with col1:
+                        sort_options = ['Article Title', 'Authors', 'Publication Year', 'Source Title', 'Times Cited']
+                        sort_column = st.selectbox("Sort/Filter by:", sort_options)
+                    with col2:
+                        # Only show A-Z/Z-A option for Article Title
+                        if sort_column == 'Article Title':
+                            ascending = st.radio(
+                                "Sort order",
+                                ["A to Z", "Z to A"],
+                                horizontal=True
+                            ) == "A to Z"
+                        elif sort_column == 'Times Cited':
+                            ascending = st.radio(
+                                "Sort order",
+                                ["Most cited", "Least cited"],
+                                horizontal=True
+                            ) == "Least cited"
+                        else:
+                            ascending = True  # Default for other columns
+
+                    # Create display dataframe
+                    display_df = df.copy()
+                    display_df['Summary'] = st.session_state.summaries
+                    display_df['Publication Year'] = display_df['Publication Year'].astype(int)
+                    display_df.rename(columns={'Times Cited, All Databases': 'Times Cited'}, inplace=True)
+                    display_df['Times Cited'] = display_df['Times Cited'].fillna(0).astype(int)
+
                    # Apply filters
                    filtered_df = create_filter_controls(display_df, sort_column)
-
+
                    if sort_column == 'Article Title':
                        # Sort alphabetically
                        sorted_df = filtered_df.sort_values(by=sort_column, ascending=ascending)
                    else:
-                        # Keep original order for other columns after filtering
+                        # Keep original order for other columns after filtering
+                        # Keep original order for other columns after filtering
                        sorted_df = filtered_df
-
+
                    # Show number of filtered results
                    if len(sorted_df) != len(display_df):
                        st.write(f"Showing {len(sorted_df)} of {len(display_df)} papers")
-
+
                    # Apply custom styling
                    st.markdown("""
                        <style>
@@ -302,58 +475,58 @@ def main():
                        }
                        </style>
                    """, unsafe_allow_html=True)
-
+
                    # Display papers using the filtered and sorted dataframe
                    for _, row in sorted_df.iterrows():
-                        paper_info_cols = st.columns([1, 1])
-
-                        with paper_info_cols[0]:  # PAPER column
-                            st.markdown('<div class="paper-section"><div class="section-header">PAPER</div>', unsafe_allow_html=True)
-                            st.markdown(f"""
-                                <div class="paper-info">
-                                    <div class="paper-title">{row['Article Title']}</div>
-                                    <div class="paper-meta">
-                                        <strong>Authors:</strong> {row['Authors']}<br>
-                                        <strong>Source:</strong> {row['Source Title']}<br>
-                                        <strong>Publication Year:</strong> {row['Publication Year']}<br>
-                                        <strong>Time Cited:</strong> {row['Times Cited, All Databases']}<br>
-                                        <strong>DOI:</strong> {row['DOI'] if pd.notna(row['DOI']) else 'None'}
+                        paper_info_cols = st.columns([1, 1])
+
+                        with paper_info_cols[0]:  # PAPER column
+                            st.markdown('<div class="paper-section"><div class="section-header">PAPER</div>', unsafe_allow_html=True)
+                            st.markdown(f"""
+                                <div class="paper-info">
+                                    <div class="paper-title">{row['Article Title']}</div>
+                                    <div class="paper-meta">
+                                        <strong>Authors:</strong> {row['Authors']}<br>
+                                        <strong>Source:</strong> {row['Source Title']}<br>
+                                        <strong>Publication Year:</strong> {row['Publication Year']}<br>
+                                        <strong>Times Cited:</strong> {row['Times Cited']}<br>
+                                        <strong>DOI:</strong> {row['DOI'] if pd.notna(row['DOI']) else 'None'}
+                                    </div>
                                </div>
-                                </div>
-                            """, unsafe_allow_html=True)
-
-                        with paper_info_cols[1]:  # SUMMARY column
-                            st.markdown('<div class="paper-section"><div class="section-header">SUMMARY</div>', unsafe_allow_html=True)
-                            st.markdown(f"""
-                                <div class="paper-info">
-                                    {row['Summary']}
-                                </div>
-                            """, unsafe_allow_html=True)
-
-                        # Add spacing between papers
-                        st.markdown("<div style='margin-bottom: 20px;'></div>", unsafe_allow_html=True)
+                            """, unsafe_allow_html=True)
 
+                        with paper_info_cols[1]:  # SUMMARY column
+                            st.markdown('<div class="paper-section"><div class="section-header">SUMMARY</div>', unsafe_allow_html=True)
+                            st.markdown(f"""
+                                <div class="paper-info">
+                                    {row['Summary']}
+                                </div>
+                            """, unsafe_allow_html=True)
+
+                        # Add spacing between papers
+                        st.markdown("<div style='margin-bottom: 20px;'></div>", unsafe_allow_html=True)
+
            # Question-focused Summary Section (only if question provided)
            if question.strip():
                st.header("❓ Question-focused Summary")
-
+
                if not st.session_state.get('focused_summary_generated', False):
                    try:
                        with st.spinner("Analyzing relevant papers..."):
                            # Initialize text processor if needed
                            if st.session_state.text_processor is None:
                                st.session_state.text_processor = TextProcessor()
-
+
                            # Find relevant abstracts
                            results = st.session_state.text_processor.find_most_relevant_abstracts(
                                question,
                                df['Abstract'].tolist(),
                                top_k=5
                            )
-
+
                            # Load question-focused model
                            model, tokenizer = load_model("question_focused")
-
+
                            # Generate focused summary
                            relevant_abstracts = df['Abstract'].iloc[results['top_indices']].tolist()
                            focused_summary = generate_focused_summary(
@@ -362,24 +535,24 @@ def main():
                                model,
                                tokenizer
                            )
-
+
                            # Store results
                            st.session_state.focused_summary = focused_summary
                            st.session_state.relevant_papers = df.iloc[results['top_indices']]
                            st.session_state.relevance_scores = results['scores']
                            st.session_state.focused_summary_generated = True
-
+
                            # Cleanup second model
                            cleanup_model(model, tokenizer)
-
+
                    except Exception as e:
                        st.error(f"Error generating focused summary: {str(e)}")
-
+
                # Display focused summary results
                if st.session_state.get('focused_summary_generated', False):
                    st.subheader("Summary")
                    st.write(st.session_state.focused_summary)
-
+
                    st.subheader("Most Relevant Papers")
                    relevant_papers = st.session_state.relevant_papers[
                        ['Article Title', 'Authors', 'Publication Year', 'DOI']
@@ -388,6 +561,5 @@ def main():
                    relevant_papers['Publication Year'] = relevant_papers['Publication Year'].astype(int)
                    st.dataframe(relevant_papers, hide_index=True)
 
-
 if __name__ == "__main__":
-    main()
+    main()