Commit · 9702556
1 Parent(s): b9a8a05
Update initialize_system.py

initialize_system.py  CHANGED  (+91 -17)
@@ -184,13 +184,43 @@ def run_initial_training():
     log_step("Starting initial model training...")

     try:
-        #
+        # Get all the paths
         model_path = path_manager.get_model_file_path()
         vectorizer_path = path_manager.get_vectorizer_path()
         pipeline_path = path_manager.get_pipeline_path()

+        log_step(f"Model path: {model_path}")
+        log_step(f"Vectorizer path: {vectorizer_path}")
+        log_step(f"Pipeline path: {pipeline_path}")
+
+        # Check if model already exists
         if pipeline_path.exists() or (model_path.exists() and vectorizer_path.exists()):
-            log_step("✅ Model files already exist")
+            log_step("✅ Model files already exist, checking if pipeline needs to be created...")
+
+            # If individual components exist but pipeline doesn't, create pipeline
+            if model_path.exists() and vectorizer_path.exists() and not pipeline_path.exists():
+                log_step("Creating pipeline from existing components...")
+                try:
+                    import joblib
+                    from sklearn.pipeline import Pipeline
+
+                    # Load existing components
+                    model = joblib.load(model_path)
+                    vectorizer = joblib.load(vectorizer_path)
+
+                    # Create pipeline
+                    pipeline = Pipeline([
+                        ('vectorizer', vectorizer),
+                        ('model', model)
+                    ])
+
+                    # Save pipeline
+                    joblib.dump(pipeline, pipeline_path)
+                    log_step(f"✅ Created pipeline from existing components: {pipeline_path}")
+
+                except Exception as e:
+                    log_step(f"⚠️ Failed to create pipeline from existing components: {e}")
+
             return True

         # Import required libraries

@@ -253,14 +283,34 @@ def run_initial_training():
         accuracy = accuracy_score(y_test, y_pred)
         f1 = f1_score(y_test, y_pred, average='weighted')

-        #
+        # Ensure model directory exists
+        model_path.parent.mkdir(parents=True, exist_ok=True)
+
+        # Save complete pipeline FIRST (this is the priority)
+        log_step(f"Saving pipeline to: {pipeline_path}")
         joblib.dump(pipeline, pipeline_path)
-
+
+        # Verify pipeline was saved
+        if pipeline_path.exists():
+            log_step(f"✅ Pipeline saved successfully to {pipeline_path}")
+
+            # Test loading the pipeline
+            try:
+                test_pipeline = joblib.load(pipeline_path)
+                test_pred = test_pipeline.predict(["This is a test"])
+                log_step(f"✅ Pipeline verification successful: {test_pred}")
+            except Exception as e:
+                log_step(f"⚠️ Pipeline verification failed: {e}")
+        else:
+            log_step(f"❌ Pipeline was not saved to {pipeline_path}")

         # Save individual components for backward compatibility
-
-
-
+        try:
+            joblib.dump(pipeline.named_steps['model'], model_path)
+            joblib.dump(pipeline.named_steps['vectorizer'], vectorizer_path)
+            log_step(f"✅ Saved individual components")
+        except Exception as e:
+            log_step(f"⚠️ Failed to save individual components: {e}")

         # Save metadata
         metadata = {

@@ -274,7 +324,9 @@ def run_initial_training():
             "training_method": "initialization",
             "environment": path_manager.environment,
             "data_path": str(dataset_path),
-            "class_distribution": class_counts.to_dict()
+            "class_distribution": class_counts.to_dict(),
+            "pipeline_created": pipeline_path.exists(),
+            "individual_components_created": model_path.exists() and vectorizer_path.exists()
         }

         metadata_path = path_manager.get_metadata_path()

@@ -284,9 +336,9 @@ def run_initial_training():
         log_step(f"✅ Training completed successfully")
         log_step(f" Accuracy: {accuracy:.4f}")
         log_step(f" F1 Score: {f1:.4f}")
+        log_step(f" Pipeline saved: {pipeline_path.exists()}")
         log_step(f" Model saved to: {model_path}")
         log_step(f" Vectorizer saved to: {vectorizer_path}")
-        log_step(f" Pipeline saved to: {pipeline_path}")

         return True

@@ -352,6 +404,7 @@ def verify_system():
         (path_manager.get_combined_dataset_path(), "Combined dataset"),
         (path_manager.get_model_file_path(), "Model file"),
         (path_manager.get_vectorizer_path(), "Vectorizer file"),
+        (path_manager.get_pipeline_path(), "Pipeline file"),
         (path_manager.get_metadata_path(), "Metadata file"),
         (path_manager.get_activity_log_path(), "Activity log")
     ]

@@ -362,24 +415,31 @@ def verify_system():
             log_step(f"✅ {description}: {file_path}")
         else:
             log_step(f"❌ Missing {description}: {file_path}")
-
+            if description == "Pipeline file":
+                # Pipeline is critical, mark as not all good
+                all_good = False

-    # Test model loading
+    # Test model loading - prioritize pipeline
     try:
         import joblib
         pipeline_path = path_manager.get_pipeline_path()
         if pipeline_path.exists():
             pipeline = joblib.load(pipeline_path)
             test_pred = pipeline.predict(["This is a test text"])
-            log_step(f"✅
+            log_step(f"✅ Pipeline test prediction successful: {test_pred}")
         else:
+            log_step("⚠️ Pipeline not available, testing individual components...")
             model_path = path_manager.get_model_file_path()
             vectorizer_path = path_manager.get_vectorizer_path()
-
-
-
-
-
+            if model_path.exists() and vectorizer_path.exists():
+                model = joblib.load(model_path)
+                vectorizer = joblib.load(vectorizer_path)
+                test_text_vec = vectorizer.transform(["This is a test text"])
+                test_pred = model.predict(test_text_vec)
+                log_step(f"✅ Individual components test prediction successful: {test_pred}")
+            else:
+                log_step("❌ No working model components found")
+                all_good = False
     except Exception as e:
         log_step(f"❌ Model test failed: {e}")
         all_good = False

@@ -441,6 +501,20 @@ def main():
     log_step(f" Available datasets: {sum(env_info['available_datasets'].values())}")
     log_step(f" Available models: {sum(env_info['available_models'].values())}")

+    # Final pipeline check
+    pipeline_path = path_manager.get_pipeline_path()
+    log_step(f"\n🎯 Final Pipeline Check:")
+    log_step(f" Pipeline path: {pipeline_path}")
+    log_step(f" Pipeline exists: {pipeline_path.exists()}")
+    if pipeline_path.exists():
+        try:
+            import joblib
+            pipeline = joblib.load(pipeline_path)
+            log_step(f" Pipeline loadable: ✅")
+            log_step(f" Pipeline steps: {list(pipeline.named_steps.keys())}")
+        except Exception as e:
+            log_step(f" Pipeline load error: {e}")
+
     log_step("\n🎯 System ready for use!")

     return len(failed_steps) == 0
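The net effect of the commit is a save-then-verify workflow: the fitted pipeline is persisted first, its individual steps are saved separately for backward compatibility, and every load path ends in a smoke-test prediction. The sketch below shows that pattern in isolation; it is not the script itself — the paths, the toy training data, and the TfidfVectorizer/LogisticRegression stand-ins are illustrative assumptions, whereas initialize_system.py resolves its paths through path_manager and trains on its own dataset.

from pathlib import Path

import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# Illustrative locations; the real script gets these from path_manager.
model_dir = Path("models")
model_dir.mkdir(parents=True, exist_ok=True)
pipeline_path = model_dir / "pipeline.joblib"
model_path = model_dir / "model.joblib"
vectorizer_path = model_dir / "vectorizer.joblib"

# Stand-in training step: a tiny text-classification pipeline.
pipeline = Pipeline([
    ("vectorizer", TfidfVectorizer()),
    ("model", LogisticRegression(max_iter=1000)),
])
texts = ["great product", "terrible product", "really great", "really terrible"]
labels = [1, 0, 1, 0]
pipeline.fit(texts, labels)

# Save the pipeline first, then the individual steps for backward compatibility.
joblib.dump(pipeline, pipeline_path)
joblib.dump(pipeline.named_steps["model"], model_path)
joblib.dump(pipeline.named_steps["vectorizer"], vectorizer_path)

# Verify by reloading and running a smoke-test prediction, as the commit does.
reloaded = joblib.load(pipeline_path)
print("pipeline prediction:", reloaded.predict(["This is a test text"]))

# Fallback the commit adds: rebuild the pipeline from the separate components
# when only the model and vectorizer files exist.
rebuilt = Pipeline([
    ("vectorizer", joblib.load(vectorizer_path)),
    ("model", joblib.load(model_path)),
])
print("rebuilt prediction:", rebuilt.predict(["This is a test text"]))

Treating the single pipeline file as the primary artifact while keeping the split model/vectorizer files as a compatibility fallback is what lets older loading code keep working without making the pipeline optional.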