Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -82,76 +82,81 @@ def cleanup_model(model, tokenizer):
|
|
82 |
except Exception:
|
83 |
pass
|
84 |
|
85 |
-
def validate_excel_structure(df):
    """Validate the content of the uploaded papers DataFrame.

    The caller is expected to have already confirmed that the 'Abstract',
    'Publication Year' and 'DOI' columns exist.

    Args:
        df: DataFrame read from the uploaded Excel file. Its
            'Publication Year' column is converted to ``int`` in place when
            the conversion succeeds.

    Returns:
        A ``(is_valid, messages)`` tuple: ``is_valid`` is True when no
        problem was found, and ``messages`` lists every problem detected.
    """
    validation_messages = []

    # Nothing else can be checked on an empty sheet.
    if len(df) == 0:
        validation_messages.append("File contains no data")
        return False, validation_messages

    # Missing abstracts are treated as empty strings so they are reported as
    # too short; the string cast keeps the .str accessor from raising on a
    # non-text column.
    abstract_lengths = df['Abstract'].fillna('').astype(str).str.len()
    if abstract_lengths.min() < 50:
        validation_messages.append("Some abstracts are too short (less than 50 characters)")

    # Publication years must be integers within the accepted range.
    try:
        df['Publication Year'] = df['Publication Year'].astype(int)
        if df['Publication Year'].min() < 1900 or df['Publication Year'].max() > 2025:
            validation_messages.append("Invalid publication years detected")
    except Exception:
        # Any conversion failure means the column is not usable as years.
        # Unlike a bare ``except``, this lets KeyboardInterrupt/SystemExit
        # propagate.
        validation_messages.append("Invalid format in Publication Year column")

    # Basic DOI shape check; a missing DOI becomes '' and fails the pattern,
    # which is what the "or missing" part of the message reports.
    try:
        doi_series = df['DOI'].fillna('').astype(str)
        if not doi_series.str.contains(r'10\.\d{4,}/.+').all():
            validation_messages.append("Some DOIs are in invalid format or missing")
    except Exception as e:
        validation_messages.append(f"Error validating DOI format: {str(e)}")

    return len(validation_messages) == 0, validation_messages
|
116 |
-
|
117 |
|
118 |
@st.cache_data
|
|
|
119 |
def process_excel(uploaded_file):
|
120 |
"""Process uploaded Excel file"""
|
121 |
try:
|
122 |
df = pd.read_excel(uploaded_file)
|
123 |
required_columns = ['Abstract', 'Article Title', 'Authors',
|
124 |
-
'Source Title', 'Publication Year', 'DOI',
|
|
|
125 |
|
126 |
-
# Check required columns
|
127 |
missing_columns = [col for col in required_columns if col not in df.columns]
|
128 |
if missing_columns:
|
129 |
-
st.error(
|
|
|
130 |
return None
|
131 |
-
|
132 |
-
#
|
133 |
if len(df) > 5:
|
134 |
st.error("β Your file contains more than 5 papers. Please upload a file with maximum 5 papers.")
|
135 |
return None
|
136 |
-
|
137 |
-
#
|
138 |
is_valid, messages = validate_excel_structure(df)
|
139 |
if not is_valid:
|
140 |
for msg in messages:
|
141 |
st.error(f"β {msg}")
|
142 |
return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
143 |
|
144 |
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
149 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
150 |
|
151 |
-
|
|
|
|
|
|
|
|
|
|
|
152 |
except Exception as e:
|
153 |
-
|
154 |
-
|
|
|
|
|
|
|
155 |
|
156 |
def preprocess_text(text):
|
157 |
"""Preprocess text to add appropriate formatting before summarization"""
|
|
|
82 |
except Exception:
|
83 |
pass
|
84 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
85 |
|
86 |
@st.cache_data
def process_excel(uploaded_file):
    """Load an uploaded Excel file and return only its required columns.

    Every failure is reported through ``st.error`` and yields ``None``:
    a missing required column, more than 5 rows, a failed
    ``validate_excel_structure`` check, or any exception raised on the way.
    On success the DataFrame restricted to the required columns is returned.
    """
    required_columns = [
        'Abstract', 'Article Title', 'Authors',
        'Source Title', 'Publication Year', 'DOI',
        'Times Cited, All Databases',
    ]

    try:
        df = pd.read_excel(uploaded_file)

        # Column presence is checked first because the later validation
        # indexes these columns directly.
        absent = [name for name in required_columns if name not in df.columns]
        if absent:
            st.error("β Missing required columns: " + ", ".join(absent))
            st.error("Please ensure your Excel file contains all required columns.")
            return None

        # Enforce the 5-paper upload limit before validating content.
        if len(df) > 5:
            st.error("β Your file contains more than 5 papers. Please upload a file with maximum 5 papers.")
            return None

        # Content validation: report every problem found, then give up.
        is_valid, messages = validate_excel_structure(df)
        if is_valid:
            return df[required_columns]

        for problem in messages:
            st.error(f"β {problem}")
        return None

    except Exception as e:
        st.error(f"β Error reading file: {e!s}")
        st.error("Please check if your file is in the correct Excel format (.xlsx or .xls)")
        return None
|
121 |
|
122 |
|
123 |
+
def validate_excel_structure(df, min_year=1900, max_year=2025):
    """Validate the content of the uploaded papers DataFrame.

    The caller is expected to have already confirmed that the 'Abstract',
    'Publication Year' and 'DOI' columns exist.

    Args:
        df: DataFrame read from the uploaded Excel file. Its
            'Publication Year' column is replaced in place by its numeric
            conversion (unparseable values become NaN).
        min_year: Earliest accepted publication year (inclusive).
        max_year: Latest accepted publication year (inclusive).

    Returns:
        A ``(is_valid, messages)`` tuple: ``is_valid`` is True when no
        problem was found, and ``messages`` lists every problem detected.
    """
    validation_messages = []

    # Nothing else can be checked on an empty sheet.
    if len(df) == 0:
        validation_messages.append("File contains no data")
        return False, validation_messages

    try:
        # Missing abstracts count as empty strings, so they are reported as
        # too short rather than being skipped.
        abstract_lengths = df['Abstract'].fillna('').astype(str).str.len()
        if abstract_lengths.min() < 50:
            validation_messages.append("Some abstracts are too short (less than 50 characters)")

        # Coerce years to numbers; anything unparseable turns into NaN and is
        # reported. The range check only runs when every year parsed.
        df['Publication Year'] = pd.to_numeric(df['Publication Year'], errors='coerce')
        years = df['Publication Year']
        if years.isna().any():
            validation_messages.append("Some publication years are invalid")
        elif years.min() < min_year or years.max() > max_year:
            validation_messages.append(
                f"Publication years must be between {min_year} and {max_year}"
            )

        # Empty DOIs are allowed: only cells that actually contain text are
        # required to look like a DOI.
        dois = df['DOI'].fillna('').astype(str).str.strip()
        has_doi = dois != ''
        well_formed = dois.str.contains(r'10\.\d{4,}/.+', regex=True)
        if (has_doi & ~well_formed).any():
            validation_messages.append("Some DOIs are in invalid format")

    except Exception as e:
        # Validation must never crash the upload flow; surface the problem as
        # a validation message instead.
        validation_messages.append(f"Error validating data: {str(e)}")

    return len(validation_messages) == 0, validation_messages
|
158 |
+
|
159 |
+
|
160 |
|
161 |
def preprocess_text(text):
|
162 |
"""Preprocess text to add appropriate formatting before summarization"""
|