Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -82,76 +82,81 @@ def cleanup_model(model, tokenizer):
|
|
| 82 |
except Exception:
|
| 83 |
pass
|
| 84 |
|
| 85 |
-
def validate_excel_structure(df):
    """Validate the content of an uploaded papers DataFrame.

    Checks are run in this order:

    1. The frame has at least one row (otherwise returns immediately).
    2. Every abstract is at least 50 characters long.
    3. ``Publication Year`` casts to ``int`` and lies within 1900-2025.
    4. Every ``DOI`` contains a ``10.<4+ digits>/<suffix>`` pattern; missing
       DOIs are reported as invalid.

    Args:
        df: DataFrame with ``Abstract``, ``Publication Year`` and ``DOI``
            columns. ``Publication Year`` is replaced in place by its
            ``int`` cast when that cast succeeds.

    Returns:
        A ``(is_valid, messages)`` tuple. ``is_valid`` is ``True`` only when
        ``messages`` is empty; ``messages`` lists one string per failed check.

    Raises:
        KeyError: If the ``Abstract`` column is absent (that lookup is not
            guarded, unlike the year and DOI checks).
    """
    validation_messages = []

    # Check for minimum content
    if len(df) == 0:
        validation_messages.append("File contains no data")
        return False, validation_messages

    # Check abstract length. Missing values are treated as empty text and
    # non-string cells are stringified, so a blank abstract is reported as
    # too short instead of being skipped by min() or crashing the .str
    # accessor.
    abstract_lengths = df['Abstract'].fillna('').astype(str).str.len()
    if abstract_lengths.min() < 50:
        validation_messages.append("Some abstracts are too short (less than 50 characters)")

    # Check publication year format
    try:
        df['Publication Year'] = df['Publication Year'].astype(int)
        if df['Publication Year'].min() < 1900 or df['Publication Year'].max() > 2025:
            validation_messages.append("Invalid publication years detected")
    except Exception:
        # Any failure to obtain integer years (bad literal, NaN, missing
        # column) is reported to the user; Exception rather than a bare
        # except so KeyboardInterrupt/SystemExit still propagate.
        validation_messages.append("Invalid format in Publication Year column")

    # Check if DOIs are in valid format (basic check)
    try:
        # Missing DOIs become '' and therefore fail the pattern, which is
        # what the "or missing" part of the message refers to.
        doi_text = df['DOI'].fillna('').astype(str)
        if not doi_text.str.contains(r'10\.\d{4,}/.+', na=True).all():
            validation_messages.append("Some DOIs are in invalid format or missing")
    except Exception as e:
        validation_messages.append(f"Error validating DOI format: {str(e)}")

    return len(validation_messages) == 0, validation_messages
|
| 116 |
-
|
| 117 |
|
| 118 |
@st.cache_data
|
|
|
|
| 119 |
def process_excel(uploaded_file):
|
| 120 |
"""Process uploaded Excel file"""
|
| 121 |
try:
|
| 122 |
df = pd.read_excel(uploaded_file)
|
| 123 |
required_columns = ['Abstract', 'Article Title', 'Authors',
|
| 124 |
-
'Source Title', 'Publication Year', 'DOI',
|
|
|
|
| 125 |
|
| 126 |
-
# Check required columns
|
| 127 |
missing_columns = [col for col in required_columns if col not in df.columns]
|
| 128 |
if missing_columns:
|
| 129 |
-
st.error(
|
|
|
|
| 130 |
return None
|
| 131 |
-
|
| 132 |
-
#
|
| 133 |
if len(df) > 5:
|
| 134 |
st.error("β Your file contains more than 5 papers. Please upload a file with maximum 5 papers.")
|
| 135 |
return None
|
| 136 |
-
|
| 137 |
-
#
|
| 138 |
is_valid, messages = validate_excel_structure(df)
|
| 139 |
if not is_valid:
|
| 140 |
for msg in messages:
|
| 141 |
st.error(f"β {msg}")
|
| 142 |
return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 143 |
|
| 144 |
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 149 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 150 |
|
| 151 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 152 |
except Exception as e:
|
| 153 |
-
|
| 154 |
-
|
|
|
|
|
|
|
|
|
|
| 155 |
|
| 156 |
def preprocess_text(text):
|
| 157 |
"""Preprocess text to add appropriate formatting before summarization"""
|
|
|
|
| 82 |
except Exception:
|
| 83 |
pass
|
| 84 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 85 |
|
| 86 |
@st.cache_data
def process_excel(uploaded_file):
    """Read an uploaded Excel file and return its required paper columns.

    The workbook is loaded with ``pd.read_excel`` and then gated by three
    checks, each of which reports through ``st.error`` and yields ``None``
    on failure: all required columns are present, the sheet holds at most
    five rows, and ``validate_excel_structure`` accepts the data.

    Args:
        uploaded_file: Object handed straight to ``pd.read_excel``.

    Returns:
        The DataFrame restricted to the required columns, or ``None`` when
        any check fails or any exception is raised along the way.
    """
    required_columns = ['Abstract', 'Article Title', 'Authors',
                        'Source Title', 'Publication Year', 'DOI',
                        'Times Cited, All Databases']

    try:
        papers = pd.read_excel(uploaded_file)

        # Column presence is checked first so later steps can index freely.
        absent = [name for name in required_columns if name not in papers.columns]
        if absent:
            # NOTE(review): the leading "β" in these messages looks like a
            # mis-encoded icon -- confirm the intended glyph.
            st.error("β Missing required columns: " + ", ".join(absent))
            st.error("Please ensure your Excel file contains all required columns.")
            return None

        # Uploads are capped at five papers.
        if len(papers) > 5:
            st.error("β Your file contains more than 5 papers. Please upload a file with maximum 5 papers.")
            return None

        # Content validation runs only once the columns are known to exist.
        passed, problems = validate_excel_structure(papers)
        if not passed:
            for problem in problems:
                st.error(f"β {problem}")
            return None

        return papers[required_columns]

    except Exception as e:
        # Every failure inside the pipeline is surfaced as a read error.
        st.error(f"β Error reading file: {str(e)}")
        st.error("Please check if your file is in the correct Excel format (.xlsx or .xls)")
        return None
|
| 121 |
|
| 122 |
|
| 123 |
+
def validate_excel_structure(df, *, min_abstract_length=50, min_year=1900, max_year=2025):
    """Validate the structure and content of the Excel file.

    Checks are run in this order:

    1. The frame has at least one row (otherwise returns immediately).
    2. Every abstract has at least ``min_abstract_length`` characters;
       missing abstracts count as empty text.
    3. Every ``Publication Year`` is numeric and lies within
       ``min_year``-``max_year`` inclusive.
    4. Every non-blank ``DOI`` contains a ``10.<4+ digits>/<suffix>``
       pattern. Blank or missing DOIs are accepted.

    Args:
        df: DataFrame with ``Abstract``, ``Publication Year`` and ``DOI``
            columns. ``Publication Year`` is replaced in place by its
            numeric coercion (unparseable entries become NaN).
        min_abstract_length: Minimum accepted abstract length in characters.
        min_year: Earliest accepted publication year.
        max_year: Latest accepted publication year.

    Returns:
        A ``(is_valid, messages)`` tuple. ``is_valid`` is ``True`` only when
        ``messages`` is empty; ``messages`` lists one string per failed
        check. Any exception raised while checking is reported as a message
        rather than propagated.
    """
    validation_messages = []

    # Check for minimum content
    if len(df) == 0:
        validation_messages.append("File contains no data")
        return False, validation_messages

    try:
        # Check abstract length
        abstract_lengths = df['Abstract'].fillna('').astype(str).str.len()
        if abstract_lengths.min() < min_abstract_length:
            validation_messages.append(
                f"Some abstracts are too short (less than {min_abstract_length} characters)"
            )

        # Check publication year format. Coercion turns anything that is not
        # a number into NaN, which is then reported as an invalid year.
        df['Publication Year'] = pd.to_numeric(df['Publication Year'], errors='coerce')
        years = df['Publication Year']
        if years.isna().any():
            validation_messages.append("Some publication years are invalid")
        elif years.min() < min_year or years.max() > max_year:
            validation_messages.append(
                f"Publication years must be between {min_year} and {max_year}"
            )

        # Check DOIs (allow empty DOIs). Blank entries are exempted
        # explicitly: once NaN has been filled with '' the pattern test
        # alone would reject them.
        doi_pattern = r'10\.\d{4,}/.+'
        dois = df['DOI'].fillna('').astype(str).str.strip()
        is_blank = dois == ''
        is_well_formed = dois.str.contains(doi_pattern, regex=True)
        if not (is_blank | is_well_formed).all():
            validation_messages.append("Some DOIs are in invalid format")

    except Exception as e:
        # Deliberately broad: a missing column or unexpected dtype becomes a
        # user-facing validation message instead of a crash.
        validation_messages.append(f"Error validating data: {str(e)}")

    return len(validation_messages) == 0, validation_messages
|
| 158 |
+
|
| 159 |
+
|
| 160 |
|
| 161 |
def preprocess_text(text):
|
| 162 |
"""Preprocess text to add appropriate formatting before summarization"""
|