pendar02 committed on
Commit
60b1427
·
verified ·
1 Parent(s): 74a9b22

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +250 -0
app.py ADDED
@@ -0,0 +1,250 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import torch
4
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
5
+ from peft import PeftModel
6
+ from text_processing import TextProcessor
7
+ import gc
8
+ import time
9
+ from pathlib import Path
10
+
11
# Configure page
st.set_page_config(
    page_title="Biomedical Papers Analysis",
    # Microscope emoji. The previous value was a mis-decoded multi-character
    # byte sequence rather than a single emoji.
    page_icon="🔬",
    layout="wide",
)

# Initialize session state: every slot starts as None so the rest of the app
# can use an `is None` check to tell whether the value has been produced yet.
for _state_key in ("processed_data", "summaries", "text_processor"):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = None
25
+
26
def load_model(model_type):
    """Load the base model, its PEFT adapter and the matching tokenizer.

    Args:
        model_type: ``"summarize"`` for the ``facebook/bart-large-cnn`` base
            with the ``pendar02/results`` adapter, or ``"question_focused"``
            for the ``GanjinZero/biobart-base`` base with the
            ``pendar02/biobart-finetune`` adapter.

    Returns:
        A ``(model, tokenizer)`` tuple. ``model`` is the base seq2seq model
        wrapped by ``PeftModel``; ``tokenizer`` is loaded from the base
        checkpoint.

    Raises:
        ValueError: If ``model_type`` is not one of the supported values.
            Previously any unrecognised value silently loaded the
            question-focused model, which hid typos at the call site.
    """
    # model type -> (base checkpoint, adapter repository)
    checkpoints = {
        "summarize": ("facebook/bart-large-cnn", "pendar02/results"),
        "question_focused": ("GanjinZero/biobart-base", "pendar02/biobart-finetune"),
    }

    # Validate before touching any model code so a bad argument fails fast
    # instead of after a long download.
    try:
        base_name, adapter_name = checkpoints[model_type]
    except (KeyError, TypeError):
        supported = ", ".join(sorted(checkpoints))
        raise ValueError(
            f"Unknown model_type {model_type!r}; expected one of: {supported}"
        ) from None

    base_model = AutoModelForSeq2SeqLM.from_pretrained(base_name)
    model = PeftModel.from_pretrained(base_model, adapter_name)
    tokenizer = AutoTokenizer.from_pretrained(base_name)

    return model, tokenizer
38
+
39
@st.cache_data
def process_excel(uploaded_file):
    """Read the uploaded workbook and keep only the columns the app uses.

    Problems are reported to the user through ``st.error`` rather than
    raised: the function returns ``None`` when the file cannot be read or
    when any required column is absent, and otherwise returns the
    DataFrame restricted to the required columns.
    """
    wanted = ['Abstract', 'Article Title', 'Authors',
              'Source Title', 'Publication Year', 'DOI']
    try:
        frame = pd.read_excel(uploaded_file)

        # Collect every required column the sheet does not provide so the
        # user sees the full list in a single message.
        absent = []
        for name in wanted:
            if name not in frame.columns:
                absent.append(name)

        if not absent:
            return frame[wanted]

        st.error(f"Missing required columns: {', '.join(absent)}")
        return None
    except Exception as err:
        st.error(f"Error processing file: {str(err)}")
        return None
57
+
58
def generate_summary(text, model, tokenizer):
    """Generate a summary for a single abstract.

    Args:
        text: Abstract text. A value that is not a string (for example the
            NaN that an empty spreadsheet cell becomes) or that contains
            only whitespace is treated as "nothing to summarize".
        model: Object exposing ``generate(...)``; called with beam search
            (4 beams) and an output length between 50 and 150 tokens.
        tokenizer: Callable tokenizer that also exposes ``decode``; the
            input is truncated to 1024 tokens.

    Returns:
        The decoded summary with special tokens removed, or ``""`` when
        there is no usable text. Returning an empty string keeps the
        caller's list of summaries aligned with its rows instead of
        aborting the whole batch on one bad cell.
    """
    if not isinstance(text, str) or not text.strip():
        return ""

    inputs = tokenizer(text, return_tensors="pt", max_length=1024, truncation=True)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        summary_ids = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=150,
            min_length=50,
            num_beams=4,
            length_penalty=2.0,
            early_stopping=True,
        )

    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
76
+
77
def generate_focused_summary(question, abstracts, model, tokenizer):
    """Generate one summary of several abstracts, focused on a question.

    Args:
        question: The research question placed at the start of the prompt.
        abstracts: Iterable of abstract texts. Entries that are not strings
            or that contain only whitespace are skipped, because joining
            them would raise ``TypeError`` or add empty segments.
        model: Object exposing ``generate(...)``; called with beam search
            (4 beams) and an output length between 50 and 200 tokens.
        tokenizer: Callable tokenizer that also exposes ``decode``; the
            combined prompt is truncated to 1024 tokens.

    Returns:
        The decoded summary with special tokens removed, or ``""`` when no
        usable abstract was supplied (the model is not called in that case).
    """
    usable = [a for a in abstracts if isinstance(a, str) and a.strip()]
    if not usable:
        return ""

    combined_input = f"Question: {question} Abstracts: " + " [SEP] ".join(usable)

    inputs = tokenizer(combined_input, return_tensors="pt", max_length=1024, truncation=True)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        summary_ids = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=200,
            min_length=50,
            num_beams=4,
            length_penalty=2.0,
            early_stopping=True,
        )

    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
97
+
98
def _release_model_memory():
    """Collect garbage, then ask torch to release its cached CUDA memory."""
    # Collect first so tensors owned by now-unreachable objects are handed
    # back to the allocator before its cache is emptied.
    gc.collect()
    torch.cuda.empty_cache()


def _render_individual_summaries(df):
    """Render the per-paper section: the generate button and the results table."""
    st.header("📝 Individual Paper Summaries")

    if st.session_state.summaries is None and st.button("Generate Individual Summaries"):
        model = tokenizer = None
        try:
            with st.spinner("Generating summaries..."):
                # Load summarization model
                model, tokenizer = load_model("summarize")

                # Process abstracts
                progress_bar = st.progress(0)
                summaries = []

                for i, abstract in enumerate(df['Abstract']):
                    summaries.append(generate_summary(abstract, model, tokenizer))
                    progress_bar.progress((i + 1) / len(df))

                st.session_state.summaries = summaries
        except Exception as e:
            st.error(f"Error generating summaries: {str(e)}")
        finally:
            # Drop the model references even when generation failed, so a
            # failed run does not keep the weights alive.
            del model, tokenizer
            _release_model_memory()

    if st.session_state.summaries is None:
        return

    # Display summaries with sorting options
    col1, col2 = st.columns(2)
    with col1:
        sort_column = st.selectbox("Sort by:", df.columns)
    with col2:
        ascending = st.checkbox("Ascending order", True)

    # Work on a copy so the cached frame in session state never gains the
    # extra 'Summary' column.
    display_df = df.copy()
    display_df['Summary'] = st.session_state.summaries
    sorted_df = display_df.sort_values(by=sort_column, ascending=ascending)

    # Show interactive table
    st.dataframe(
        sorted_df,
        column_config={
            "Abstract": st.column_config.TextColumn(
                "Abstract",
                width="medium",
                help="Original abstract text"
            ),
            "Summary": st.column_config.TextColumn(
                "Summary",
                width="medium",
                help="Generated summary"
            )
        },
        hide_index=True
    )


def _render_focused_summary(df):
    """Render the question box and, on request, the question-focused results."""
    st.header("❓ Question-focused Summary")
    question = st.text_input("Enter your research question:")

    # The button is only created once a question has been typed.
    if not (question and st.button("Generate Focused Summary")):
        return

    model = tokenizer = None
    try:
        with st.spinner("Analyzing relevant papers..."):
            # Find relevant abstracts
            results = st.session_state.text_processor.find_most_relevant_abstracts(
                question,
                df['Abstract'].tolist(),
                top_k=5
            )
            processed_question = results['processed_question']
            top_indices = results['top_indices']

            # Show spell-check suggestion if needed
            if processed_question['original'] != processed_question['corrected']:
                st.info(f"Did you mean: {processed_question['corrected']}?")

            # Load question-focused model
            model, tokenizer = load_model("question_focused")

            # Get relevant abstracts and generate summary
            relevant_abstracts = df['Abstract'].iloc[top_indices].tolist()
            focused_summary = generate_focused_summary(
                processed_question['corrected'],
                relevant_abstracts,
                model,
                tokenizer
            )

            # Display results
            st.subheader("Summary")
            st.write(focused_summary)

            # Show relevant papers. The explicit copy makes the new column
            # land on an independent frame instead of a slice of `df`.
            st.subheader("Most Relevant Papers")
            relevant_papers = df.iloc[top_indices][
                ['Article Title', 'Authors', 'Publication Year', 'DOI']
            ].copy()
            relevant_papers['Relevance Score'] = results['scores']
            st.dataframe(relevant_papers, hide_index=True)

            # Show identified medical terms
            st.subheader("Identified Medical Terms")
            st.write(", ".join(processed_question['medical_entities']))
    except Exception as e:
        st.error(f"Error generating focused summary: {str(e)}")
    finally:
        # Drop the model references even when an earlier step failed.
        del model, tokenizer
        _release_model_memory()


def main():
    """Render the app: sidebar, file upload, summaries and focused summary.

    State that must survive Streamlit reruns lives in ``st.session_state``:
    the text processor, the processed DataFrame, the generated summaries
    and a signature of the uploaded file. When the signature changes, the
    processed data and summaries are cleared so results from a previous
    upload are never shown for a new file.
    """
    st.title("🔬 Biomedical Papers Analysis")

    # Sidebar
    st.sidebar.header("About")
    st.sidebar.info(
        "This app analyzes biomedical research papers. Upload an Excel file "
        "containing paper details and abstracts to:"
        "\n- Generate individual summaries"
        "\n- Get question-focused insights"
    )

    # Initialize text processor if not already done
    if st.session_state.text_processor is None:
        with st.spinner("Loading NLP models..."):
            st.session_state.text_processor = TextProcessor()

    # File upload section
    uploaded_file = st.file_uploader(
        "Upload Excel file containing papers",
        type=['xlsx', 'xls'],
        help="File must contain: Abstract, Article Title, Authors, Source Title, Publication Year, DOI"
    )

    if uploaded_file is None:
        return

    # Name plus size is a cheap identity for the upload. A different file
    # with the same name and byte size is not detected.
    signature = (uploaded_file.name, uploaded_file.size)
    if st.session_state.get("uploaded_file_signature") != signature:
        st.session_state.uploaded_file_signature = signature
        st.session_state.processed_data = None
        st.session_state.summaries = None

    # Process Excel file
    if st.session_state.processed_data is None:
        with st.spinner("Processing file..."):
            df = process_excel(uploaded_file)
            if df is not None:
                st.session_state.processed_data = df

    if st.session_state.processed_data is None:
        return

    df = st.session_state.processed_data
    st.write(f"📊 Loaded {len(df)} papers")

    _render_individual_summaries(df)
    _render_focused_summary(df)


if __name__ == "__main__":
    main()