Spaces:

pendar02
/

biomedical

Sleeping

App Files Files Community

pendar02 committed on Jan 13

Commit

ee10f7f

verified ·

1 Parent(s): 737cac5

Update app.py

Browse files

Files changed (1) hide show

app.py +51 -46

app.py CHANGED Viewed

@@ -82,76 +82,81 @@ def cleanup_model(model, tokenizer):
     except Exception:
         pass
-def validate_excel_structure(df):
-    """Validate the structure and content of the Excel file"""
-    validation_messages = []
-    # Check for minimum content
-    if len(df) == 0:
-        validation_messages.append("File contains no data")
-        return False, validation_messages
-    # Check abstract length
-    if df['Abstract'].str.len().min() < 50:
-        validation_messages.append("Some abstracts are too short (less than 50 characters)")
-    # Check publication year format
-    try:
-        df['Publication Year'] = df['Publication Year'].astype(int)
-        if df['Publication Year'].min() < 1900 or df['Publication Year'].max() > 2025:
-            validation_messages.append("Invalid publication years detected")
-    except:
-        validation_messages.append("Invalid format in Publication Year column")
-    # Check if DOIs are in valid format (basic check)
-    try:
-        # Convert DOI column to string and handle NaN values
-        doi_series = df['DOI'].fillna('')
-        if not doi_series.astype(str).str.contains(r'10\.\d{4,}/.+', na=True).all():
-            validation_messages.append("Some DOIs are in invalid format or missing")
-    except Exception as e:
-        validation_messages.append(f"Error validating DOI format: {str(e)}")
-    return len(validation_messages) == 0, validation_messages
 @st.cache_data
 def process_excel(uploaded_file):
     """Process uploaded Excel file"""
     try:
         df = pd.read_excel(uploaded_file)
         required_columns = ['Abstract', 'Article Title', 'Authors',
-                          'Source Title', 'Publication Year', 'DOI', 'Times Cited, All Databases']
-        # Check required columns
         missing_columns = [col for col in required_columns if col not in df.columns]
         if missing_columns:
-            st.error(f"Missing required columns: {', '.join(missing_columns)}")
             return None
-        # Check number of papers
         if len(df) > 5:
             st.error("❌ Your file contains more than 5 papers. Please upload a file with maximum 5 papers.")
             return None
-        # Validate structure and content
         is_valid, messages = validate_excel_structure(df)
         if not is_valid:
             for msg in messages:
                 st.error(f"❌ {msg}")
             return None
-        # Check for empty required fields
-        for col in required_columns:
-            if df[col].isna().any():
-                st.warning(f"⚠️ Some entries in '{col}' column are empty. This might affect the analysis.")
-        return df[required_columns]
     except Exception as e:
-        st.error(f"Error processing file: {str(e)}")
-        return None
 def preprocess_text(text):
     """Preprocess text to add appropriate formatting before summarization"""

     except Exception:
         pass
 @st.cache_data
 def process_excel(uploaded_file):
     """Process uploaded Excel file"""
     try:
         df = pd.read_excel(uploaded_file)
         required_columns = ['Abstract', 'Article Title', 'Authors',
+                          'Source Title', 'Publication Year', 'DOI',
+                          'Times Cited, All Databases']
+        # Check required columns first
         missing_columns = [col for col in required_columns if col not in df.columns]
         if missing_columns:
+            st.error("❌ Missing required columns: " + ", ".join(missing_columns))
+            st.error("Please ensure your Excel file contains all required columns.")
             return None
+        # Only proceed with validation if all required columns exist
         if len(df) > 5:
             st.error("❌ Your file contains more than 5 papers. Please upload a file with maximum 5 papers.")
             return None
+        # Now safe to validate structure as we know columns exist
         is_valid, messages = validate_excel_structure(df)
         if not is_valid:
             for msg in messages:
                 st.error(f"❌ {msg}")
             return None
+        return df[required_columns]
+    except Exception as e:
+        st.error(f"❌ Error reading file: {str(e)}")
+        st.error("Please check if your file is in the correct Excel format (.xlsx or .xls)")
+        return None
+def validate_excel_structure(df):
+    """Validate the structure and content of the Excel file"""
+    validation_messages = []
+    # Check for minimum content
+    if len(df) == 0:
+        validation_messages.append("File contains no data")
+        return False, validation_messages
+    try:
+        # Check abstract length
+        min_length = df['Abstract'].fillna('').astype(str).str.len().min()
+        if min_length < 50:
+            validation_messages.append("Some abstracts are too short (less than 50 characters)")
+        # Check publication year format
+        df['Publication Year'] = pd.to_numeric(df['Publication Year'], errors='coerce')
+        if df['Publication Year'].isna().any():
+            validation_messages.append("Some publication years are invalid")
+        else:
+            years = df['Publication Year'].dropna()
+            if len(years) > 0:  # Only check if we have valid years
+                if years.min() < 1900 or years.max() > 2025:
+                    validation_messages.append("Publication years must be between 1900 and 2025")
+        # Check DOIs (allow empty DOIs)
+        doi_pattern = r'10\.\d{4,}/.+'
+        valid_dois = df['DOI'].fillna('').astype(str).str.contains(doi_pattern, na=True, regex=True)
+        if not valid_dois.all() and len(valid_dois) > 0:
+            validation_messages.append("Some DOIs are in invalid format")
     except Exception as e:
+        validation_messages.append(f"Error validating data: {str(e)}")
+    return len(validation_messages) == 0, validation_messages
 def preprocess_text(text):
     """Preprocess text to add appropriate formatting before summarization"""