pendar02 committed on
Commit
ee10f7f
·
verified ·
1 Parent(s): 737cac5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +51 -46
app.py CHANGED
@@ -82,76 +82,81 @@ def cleanup_model(model, tokenizer):
82
  except Exception:
83
  pass
84
 
85
- def validate_excel_structure(df):
86
- """Validate the structure and content of the Excel file"""
87
- validation_messages = []
88
-
89
- # Check for minimum content
90
- if len(df) == 0:
91
- validation_messages.append("File contains no data")
92
- return False, validation_messages
93
-
94
- # Check abstract length
95
- if df['Abstract'].str.len().min() < 50:
96
- validation_messages.append("Some abstracts are too short (less than 50 characters)")
97
-
98
- # Check publication year format
99
- try:
100
- df['Publication Year'] = df['Publication Year'].astype(int)
101
- if df['Publication Year'].min() < 1900 or df['Publication Year'].max() > 2025:
102
- validation_messages.append("Invalid publication years detected")
103
- except:
104
- validation_messages.append("Invalid format in Publication Year column")
105
-
106
- # Check if DOIs are in valid format (basic check)
107
- try:
108
- # Convert DOI column to string and handle NaN values
109
- doi_series = df['DOI'].fillna('')
110
- if not doi_series.astype(str).str.contains(r'10\.\d{4,}/.+', na=True).all():
111
- validation_messages.append("Some DOIs are in invalid format or missing")
112
- except Exception as e:
113
- validation_messages.append(f"Error validating DOI format: {str(e)}")
114
-
115
- return len(validation_messages) == 0, validation_messages
116
-
117
 
118
  @st.cache_data
 
119
  def process_excel(uploaded_file):
120
  """Process uploaded Excel file"""
121
  try:
122
  df = pd.read_excel(uploaded_file)
123
  required_columns = ['Abstract', 'Article Title', 'Authors',
124
- 'Source Title', 'Publication Year', 'DOI', 'Times Cited, All Databases']
 
125
 
126
- # Check required columns
127
  missing_columns = [col for col in required_columns if col not in df.columns]
128
  if missing_columns:
129
- st.error(f"Missing required columns: {', '.join(missing_columns)}")
 
130
  return None
131
-
132
- # Check number of papers
133
  if len(df) > 5:
134
  st.error("❌ Your file contains more than 5 papers. Please upload a file with maximum 5 papers.")
135
  return None
136
-
137
- # Validate structure and content
138
  is_valid, messages = validate_excel_structure(df)
139
  if not is_valid:
140
  for msg in messages:
141
  st.error(f"❌ {msg}")
142
  return None
 
 
 
 
 
 
 
143
 
144
 
145
- # Check for empty required fields
146
- for col in required_columns:
147
- if df[col].isna().any():
148
- st.warning(f"⚠️ Some entries in '{col}' column are empty. This might affect the analysis.")
 
 
 
 
 
 
 
 
 
 
149
 
 
 
 
 
 
 
 
 
 
150
 
151
- return df[required_columns]
 
 
 
 
 
152
  except Exception as e:
153
- st.error(f"Error processing file: {str(e)}")
154
- return None
 
 
 
155
 
156
  def preprocess_text(text):
157
  """Preprocess text to add appropriate formatting before summarization"""
 
82
  except Exception:
83
  pass
84
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
 
86
  @st.cache_data
87
+
88
  def process_excel(uploaded_file):
89
  """Process uploaded Excel file"""
90
  try:
91
  df = pd.read_excel(uploaded_file)
92
  required_columns = ['Abstract', 'Article Title', 'Authors',
93
+ 'Source Title', 'Publication Year', 'DOI',
94
+ 'Times Cited, All Databases']
95
 
96
+ # Check required columns first
97
  missing_columns = [col for col in required_columns if col not in df.columns]
98
  if missing_columns:
99
+ st.error("❌ Missing required columns: " + ", ".join(missing_columns))
100
+ st.error("Please ensure your Excel file contains all required columns.")
101
  return None
102
+
103
+ # Only proceed with validation if all required columns exist
104
  if len(df) > 5:
105
  st.error("❌ Your file contains more than 5 papers. Please upload a file with maximum 5 papers.")
106
  return None
107
+
108
+ # Now safe to validate structure as we know columns exist
109
  is_valid, messages = validate_excel_structure(df)
110
  if not is_valid:
111
  for msg in messages:
112
  st.error(f"❌ {msg}")
113
  return None
114
+
115
+ return df[required_columns]
116
+
117
+ except Exception as e:
118
+ st.error(f"❌ Error reading file: {str(e)}")
119
+ st.error("Please check if your file is in the correct Excel format (.xlsx or .xls)")
120
+ return None
121
 
122
 
123
+ def validate_excel_structure(df):
124
+ """Validate the structure and content of the Excel file"""
125
+ validation_messages = []
126
+
127
+ # Check for minimum content
128
+ if len(df) == 0:
129
+ validation_messages.append("File contains no data")
130
+ return False, validation_messages
131
+
132
+ try:
133
+ # Check abstract length
134
+ min_length = df['Abstract'].fillna('').astype(str).str.len().min()
135
+ if min_length < 50:
136
+ validation_messages.append("Some abstracts are too short (less than 50 characters)")
137
 
138
+ # Check publication year format
139
+ df['Publication Year'] = pd.to_numeric(df['Publication Year'], errors='coerce')
140
+ if df['Publication Year'].isna().any():
141
+ validation_messages.append("Some publication years are invalid")
142
+ else:
143
+ years = df['Publication Year'].dropna()
144
+ if len(years) > 0: # Only check if we have valid years
145
+ if years.min() < 1900 or years.max() > 2025:
146
+ validation_messages.append("Publication years must be between 1900 and 2025")
147
 
148
+ # Check DOIs (allow empty DOIs)
149
+ doi_pattern = r'10\.\d{4,}/.+'
150
+ valid_dois = df['DOI'].fillna('').astype(str).str.contains(doi_pattern, na=True, regex=True)
151
+ if not valid_dois.all() and len(valid_dois) > 0:
152
+ validation_messages.append("Some DOIs are in invalid format")
153
+
154
  except Exception as e:
155
+ validation_messages.append(f"Error validating data: {str(e)}")
156
+
157
+ return len(validation_messages) == 0, validation_messages
158
+
159
+
160
 
161
  def preprocess_text(text):
162
  """Preprocess text to add appropriate formatting before summarization"""