Commit
Β·
c29bcf3
1
Parent(s):
63bd8f9
Update model/train.py
Browse files — Fixed an error where the trained model was not saved after training
- When GridSearchCV tries to pickle the pipeline during hyperparameter tuning, it fails because lambda functions are not serializable
- Fixed issue in pipeline saving code that tries to save the pipeline before the model is set
- model/train.py +54 -42
model/train.py
CHANGED
@@ -25,10 +25,9 @@ import hashlib
|
|
25 |
from datetime import datetime
|
26 |
from typing import Dict, Tuple, Optional, Any
|
27 |
import warnings
|
|
|
28 |
warnings.filterwarnings('ignore')
|
29 |
|
30 |
-
# Scikit-learn imports
|
31 |
-
|
32 |
# Configure logging
|
33 |
logging.basicConfig(
|
34 |
level=logging.INFO,
|
@@ -41,6 +40,41 @@ logging.basicConfig(
|
|
41 |
logger = logging.getLogger(__name__)
|
42 |
|
43 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
44 |
class RobustModelTrainer:
|
45 |
"""Production-ready model trainer with comprehensive evaluation and validation"""
|
46 |
|
@@ -169,37 +203,12 @@ class RobustModelTrainer:
|
|
169 |
logger.error(error_msg)
|
170 |
return False, None, error_msg
|
171 |
|
172 |
-
def preprocess_text(self, text):
|
173 |
-
"""Advanced text preprocessing"""
|
174 |
-
import re
|
175 |
-
|
176 |
-
# Convert to string
|
177 |
-
text = str(text)
|
178 |
-
|
179 |
-
# Remove URLs
|
180 |
-
text = re.sub(r'http\S+|www\S+|https\S+', '', text)
|
181 |
-
|
182 |
-
# Remove email addresses
|
183 |
-
text = re.sub(r'\S+@\S+', '', text)
|
184 |
-
|
185 |
-
# Remove excessive punctuation
|
186 |
-
text = re.sub(r'[!]{2,}', '!', text)
|
187 |
-
text = re.sub(r'[?]{2,}', '?', text)
|
188 |
-
text = re.sub(r'[.]{3,}', '...', text)
|
189 |
-
|
190 |
-
# Remove non-alphabetic characters except spaces and basic punctuation
|
191 |
-
text = re.sub(r'[^a-zA-Z\s.!?]', '', text)
|
192 |
-
|
193 |
-
# Remove excessive whitespace
|
194 |
-
text = re.sub(r'\s+', ' ', text)
|
195 |
-
|
196 |
-
return text.strip().lower()
|
197 |
-
|
198 |
def create_preprocessing_pipeline(self) -> Pipeline:
|
199 |
-
"""Create advanced preprocessing pipeline"""
|
200 |
-
|
|
|
201 |
text_preprocessor = FunctionTransformer(
|
202 |
-
func=
|
203 |
validate=False
|
204 |
)
|
205 |
|
@@ -228,13 +237,6 @@ class RobustModelTrainer:
|
|
228 |
('model', None) # Will be set during training
|
229 |
])
|
230 |
|
231 |
-
# After creating the pipeline
|
232 |
-
joblib.dump(pipeline, "/tmp/pipeline.pkl") # Save complete pipeline
|
233 |
-
# Individual model
|
234 |
-
joblib.dump(pipeline.named_steps['model'], "/tmp/model.pkl")
|
235 |
-
# Individual vectorizer
|
236 |
-
joblib.dump(pipeline.named_steps['vectorize'], "/tmp/vectorizer.pkl")
|
237 |
-
|
238 |
return pipeline
|
239 |
|
240 |
def comprehensive_evaluation(self, model, X_test, y_test, X_train=None, y_train=None) -> Dict:
|
@@ -441,10 +443,20 @@ class RobustModelTrainer:
|
|
441 |
|
442 |
# Save the full pipeline
|
443 |
joblib.dump(model, self.pipeline_path)
|
|
|
444 |
|
445 |
# Save individual components for backward compatibility
|
446 |
-
|
447 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
448 |
|
449 |
# Generate data hash
|
450 |
data_hash = hashlib.md5(str(datetime.now()).encode()).hexdigest()
|
@@ -479,7 +491,7 @@ class RobustModelTrainer:
|
|
479 |
with open(self.metadata_path, 'w') as f:
|
480 |
json.dump(metadata, f, indent=2)
|
481 |
|
482 |
-
logger.info(f"Model artifacts saved successfully")
|
483 |
logger.info(f"Model path: {self.model_path}")
|
484 |
logger.info(f"Vectorizer path: {self.vectorizer_path}")
|
485 |
logger.info(f"Pipeline path: {self.pipeline_path}")
|
@@ -592,4 +604,4 @@ def main():
|
|
592 |
|
593 |
|
594 |
if __name__ == "__main__":
|
595 |
-
main()
|
|
|
25 |
from datetime import datetime
|
26 |
from typing import Dict, Tuple, Optional, Any
|
27 |
import warnings
|
28 |
+
import re
|
29 |
warnings.filterwarnings('ignore')
|
30 |
|
|
|
|
|
31 |
# Configure logging
|
32 |
logging.basicConfig(
|
33 |
level=logging.INFO,
|
|
|
40 |
logger = logging.getLogger(__name__)
|
41 |
|
42 |
|
43 |
+
def preprocess_text_function(texts):
    """Clean an iterable of raw texts for vectorization.

    Defined at module level (rather than as a lambda or bound method) so
    that a Pipeline containing it can be pickled by joblib/GridSearchCV —
    lambdas are not serializable, which is the bug this function fixes.

    Parameters
    ----------
    texts : iterable
        Iterable of str-convertible items (non-strings are coerced via
        ``str``, so numeric or NaN entries do not crash the regexes).

    Returns
    -------
    list of str
        One cleaned, lowercased string per input item.
    """
    def clean_single_text(text):
        # Coerce to string first so any item type is safe to regex over.
        text = str(text)

        # Strip URLs (http/https/www prefixes up to the next whitespace).
        text = re.sub(r'http\S+|www\S+|https\S+', '', text)

        # Strip email addresses.
        text = re.sub(r'\S+@\S+', '', text)

        # Collapse repeated punctuation: '!!!'->'!', '???'->'?', '....'->'...'.
        text = re.sub(r'[!]{2,}', '!', text)
        text = re.sub(r'[?]{2,}', '?', text)
        text = re.sub(r'[.]{3,}', '...', text)

        # Drop everything except letters, whitespace and basic punctuation.
        text = re.sub(r'[^a-zA-Z\s.!?]', '', text)

        # Normalize runs of whitespace to a single space.
        text = re.sub(r'\s+', ' ', text)

        return text.strip().lower()

    # List comprehension replaces the manual append loop (same behavior,
    # idiomatic and marginally faster).
    return [clean_single_text(text) for text in texts]
|
76 |
+
|
77 |
+
|
78 |
class RobustModelTrainer:
|
79 |
"""Production-ready model trainer with comprehensive evaluation and validation"""
|
80 |
|
|
|
203 |
logger.error(error_msg)
|
204 |
return False, None, error_msg
|
205 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
206 |
def create_preprocessing_pipeline(self) -> Pipeline:
|
207 |
+
"""Create advanced preprocessing pipeline - pickle-safe"""
|
208 |
+
|
209 |
+
# Use the standalone function instead of lambda
|
210 |
text_preprocessor = FunctionTransformer(
|
211 |
+
func=preprocess_text_function, # β
Pickle-safe function reference
|
212 |
validate=False
|
213 |
)
|
214 |
|
|
|
237 |
('model', None) # Will be set during training
|
238 |
])
|
239 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
240 |
return pipeline
|
241 |
|
242 |
def comprehensive_evaluation(self, model, X_test, y_test, X_train=None, y_train=None) -> Dict:
|
|
|
443 |
|
444 |
# Save the full pipeline
|
445 |
joblib.dump(model, self.pipeline_path)
|
446 |
+
logger.info(f"β
Saved pipeline to {self.pipeline_path}")
|
447 |
|
448 |
# Save individual components for backward compatibility
|
449 |
+
if hasattr(model, 'named_steps') and 'model' in model.named_steps:
|
450 |
+
joblib.dump(model.named_steps['model'], self.model_path)
|
451 |
+
logger.info(f"β
Saved model to {self.model_path}")
|
452 |
+
else:
|
453 |
+
logger.warning("β Could not extract model component")
|
454 |
+
|
455 |
+
if hasattr(model, 'named_steps') and 'vectorize' in model.named_steps:
|
456 |
+
joblib.dump(model.named_steps['vectorize'], self.vectorizer_path)
|
457 |
+
logger.info(f"β
Saved vectorizer to {self.vectorizer_path}")
|
458 |
+
else:
|
459 |
+
logger.warning("β Could not extract vectorizer component")
|
460 |
|
461 |
# Generate data hash
|
462 |
data_hash = hashlib.md5(str(datetime.now()).encode()).hexdigest()
|
|
|
491 |
with open(self.metadata_path, 'w') as f:
|
492 |
json.dump(metadata, f, indent=2)
|
493 |
|
494 |
+
logger.info(f"β
Model artifacts saved successfully")
|
495 |
logger.info(f"Model path: {self.model_path}")
|
496 |
logger.info(f"Vectorizer path: {self.vectorizer_path}")
|
497 |
logger.info(f"Pipeline path: {self.pipeline_path}")
|
|
|
604 |
|
605 |
|
606 |
if __name__ == "__main__":
|
607 |
+
main()
|