Ahmedik95316 committed on
Commit e847844 Β· 1 Parent(s): 9666aeb

Update initialize_system.py

Files changed (1)
  1. initialize_system.py +297 -374
initialize_system.py CHANGED
@@ -6,88 +6,109 @@ import json
 from pathlib import Path
 from datetime import datetime
 
 
 def log_step(message):
     """Log initialization steps"""
     print(f"[{datetime.now().strftime('%H:%M:%S')}] {message}")
 
 
-def check_model_exists():
-    """Check if trained model already exists"""
-    model_files = [
-        Path("/tmp/pipeline.pkl"),
-        Path("/tmp/model.pkl"),
-        Path("/tmp/vectorizer.pkl"),
-        Path("/tmp/metadata.json")
-    ]
-
-    existing_files = [f for f in model_files if f.exists()]
-
-    if len(existing_files) >= 2:  # At least pipeline + metadata OR model + vectorizer
-        log_step(f"βœ… Found {len(existing_files)} existing model files")
-        return True, existing_files
-    else:
-        log_step(f"❌ Missing model files - only found {len(existing_files)}")
-        return False, existing_files
-
-
-def check_training_data_exists():
-    """Check if training data is available"""
-    data_files = [
-        Path("/tmp/data/combined_dataset.csv"),
-        Path("/app/data/combined_dataset.csv"),
-        Path("/tmp/data/kaggle/Fake.csv"),
-        Path("/tmp/data/kaggle/True.csv")
-    ]
-
-    existing_data = [f for f in data_files if f.exists()]
-
-    if existing_data:
-        log_step(f"βœ… Found training data: {[str(f) for f in existing_data]}")
-        return True, existing_data
-    else:
-        log_step("❌ No training data found")
-        return False, []
-
-
 def create_directories():
     """Create necessary directories"""
     log_step("Creating directory structure...")
 
     directories = [
-        "/tmp/data",
-        "/tmp/data/kaggle",
-        "/tmp/model",
-        "/tmp/logs",
-        "/tmp/results",
-        "/tmp/backups"
     ]
 
     for dir_path in directories:
-        Path(dir_path).mkdir(parents=True, exist_ok=True)
-        log_step(f"βœ… Created {dir_path}")
 
 
-def copy_original_datasets():
-    """Copy original datasets from /app to /tmp"""
-    log_step("Copying original datasets...")
 
-    source_files = [
-        ("/app/data/kaggle/Fake.csv", "/tmp/data/kaggle/Fake.csv"),
-        ("/app/data/kaggle/True.csv", "/tmp/data/kaggle/True.csv"),
-        ("/app/data/combined_dataset.csv", "/tmp/data/combined_dataset.csv"),
-        ("/app/data/liar/train.tsv", "/tmp/data/liar/train.tsv"),
-        ("/app/data/liar/test.tsv", "/tmp/data/liar/test.tsv"),
-        ("/app/data/liar/valid.tsv", "/tmp/data/liar/valid.tsv")
     ]
 
     copied_count = 0
-    for source, dest in source_files:
-        if Path(source).exists():
-            Path(dest).parent.mkdir(parents=True, exist_ok=True)
-            shutil.copy(source, dest)
-            log_step(f"βœ… Copied {source} to {dest}")
-            copied_count += 1
         else:
             log_step(f"⚠️ Source file not found: {source}")
 
@@ -95,268 +116,184 @@ def copy_original_datasets():
 
 
 def create_minimal_dataset():
-    """Create a minimal dataset if original doesn't exist"""
     log_step("Creating minimal dataset...")
 
-    combined_path = Path("/tmp/data/combined_dataset.csv")
 
     if combined_path.exists():
-        log_step("βœ… Combined dataset already exists")
         return True
 
-    # Create minimal training data with more samples for better training
-    minimal_data = pd.DataFrame({
-        'text': [
-            # Real news samples
-            'Scientists at Stanford University have developed a new method for detecting cancer cells using artificial intelligence',
-            'The Federal Reserve announced today a decision to maintain current interest rates amid economic uncertainty',
-            'Local authorities report significant improvements in air quality following new environmental regulations',
-            'Research published in Nature journal shows promising results for renewable energy storage technology',
-            'The United Nations climate summit concluded with new commitments from world leaders on carbon reduction',
-            'Economic indicators suggest steady growth in the manufacturing sector according to latest government data',
-            'Healthcare workers receive additional training on new medical procedures approved by regulatory agencies',
-            'Transportation department announces infrastructure improvements to major highways across the region',
-            'Educational institutions implement new digital learning platforms to enhance student engagement',
-            'Agricultural studies reveal improved crop yields through sustainable farming practices',
-            'Technology companies invest heavily in cybersecurity measures to protect user data and privacy',
-            'Municipal government approves budget for public transportation expansion project in urban areas',
-            'Medical researchers make breakthrough in understanding genetic factors contributing to heart disease',
-            'International trade agreements show positive impact on local businesses and job creation',
-            'Environmental protection agency releases report on water quality improvements in major rivers',
 
-            # Fake news samples
-            'SHOCKING: Government secretly controls weather using hidden technology, whistleblower reveals truth',
-            'EXPOSED: Celebrities caught in massive conspiracy to manipulate public opinion through social media',
-            'URGENT: New study proves that drinking water causes immediate memory loss in 99% of population',
-            'BREAKING: Scientists discover that smartphones are actually mind control devices from aliens',
-            'EXCLUSIVE: Secret documents reveal that all elections have been predetermined by shadow organization',
-            'ALERT: Doctors confirm that eating vegetables makes people 500% more likely to develop rare diseases',
-            'LEAKED: Underground network of billionaires planning to replace all humans with artificial intelligence',
-            'CONSPIRACY: Major corporations hiding cure for aging to maintain population control and profits',
-            'REVEALED: Government admits that gravity is fake and Earth is actually moving upward constantly',
-            'WARNING: New technology allows complete thought reading through WiFi signals in your home',
-            'BOMBSHELL: Ancient aliens return to Earth disguised as tech executives to harvest human energy',
-            'UNCOVERED: All news media controlled by single person living in secret underground bunker',
-            'PROOF: Time travel already exists but only available to wealthy elite who control world events',
-            'SCANDAL: Pharmaceutical companies intentionally create diseases to sell more expensive treatments',
-            'EXPOSED: Education system designed to suppress human creativity and independent thinking abilities'
-        ],
-        'label': [
-            # Real news labels (0)
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            # Fake news labels (1)
-            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
-        ]
-    })
-
-    minimal_data.to_csv(combined_path, index=False)
-    log_step(f"βœ… Created enhanced minimal dataset with {len(minimal_data)} samples")
-    log_step(f" - Real news samples: {sum(minimal_data['label'] == 0)}")
-    log_step(f" - Fake news samples: {sum(minimal_data['label'] == 1)}")
-    return True
 
 
 def run_initial_training():
-    """Run comprehensive model training for first-time setup"""
-    log_step("πŸš€ Starting comprehensive model training for first-time setup...")
 
     try:
-        # Import training modules
         from sklearn.feature_extraction.text import TfidfVectorizer
         from sklearn.linear_model import LogisticRegression
-        from sklearn.ensemble import RandomForestClassifier
-        from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
         from sklearn.pipeline import Pipeline
-        from sklearn.feature_selection import SelectKBest, chi2
-        from sklearn.preprocessing import FunctionTransformer
-        from sklearn.metrics import accuracy_score, f1_score, classification_report
         import joblib
-        import re
-
-        # Text preprocessing function (same as in train.py)
-        def preprocess_text_function(texts):
-            def clean_single_text(text):
-                text = str(text)
-                text = re.sub(r'http\S+|www\S+|https\S+', '', text)
-                text = re.sub(r'\S+@\S+', '', text)
-                text = re.sub(r'[!]{2,}', '!', text)
-                text = re.sub(r'[?]{2,}', '?', text)
-                text = re.sub(r'[.]{3,}', '...', text)
-                text = re.sub(r'[^a-zA-Z\s.!?]', '', text)
-                text = re.sub(r'\s+', ' ', text)
-                return text.strip().lower()
-
-            processed = []
-            for text in texts:
-                processed.append(clean_single_text(text))
-            return processed
 
         # Load dataset
-        dataset_path = Path("/tmp/data/combined_dataset.csv")
         if not dataset_path.exists():
             log_step("❌ No dataset available for training")
             return False
 
         df = pd.read_csv(dataset_path)
-        log_step(f"πŸ“Š Loaded dataset with {len(df)} samples")
 
-        # Data validation and cleaning
-        df = df.dropna(subset=['text', 'label'])
-        df = df[df['text'].astype(str).str.len() > 10]
-
-        log_step(f"πŸ“Š After cleaning: {len(df)} samples")
-        log_step(f"πŸ“Š Class distribution: {df['label'].value_counts().to_dict()}")
 
         # Prepare data
         X = df['text'].values
         y = df['label'].values
 
         # Train-test split
         X_train, X_test, y_train, y_test = train_test_split(
-            X, y, test_size=0.2, random_state=42, stratify=y
-        )
-
-        log_step(f"πŸ“Š Data split: {len(X_train)} train, {len(X_test)} test")
-
-        # Create comprehensive pipeline
-        text_preprocessor = FunctionTransformer(
-            func=preprocess_text_function,
-            validate=False
         )
 
-        vectorizer = TfidfVectorizer(
-            max_features=5000,
-            min_df=1,
-            max_df=0.95,
-            ngram_range=(1, 2),
-            stop_words='english',
-            sublinear_tf=True,
-            norm='l2'
-        )
-
-        feature_selector = SelectKBest(
-            score_func=chi2,
-            k=2000
-        )
-
-        # Create pipeline with Logistic Regression
         pipeline = Pipeline([
-            ('vectorize', TfidfVectorizer(
-                max_features=10000,
-                min_df=2,
-                max_df=0.95,
-                ngram_range=(1, 2),
                 stop_words='english',
-                lowercase=True,
-                strip_accents='ascii'
             )),
             ('model', LogisticRegression(
-                max_iter=1000,
-                class_weight='balanced',
-                random_state=42
             ))
         ])
-
-        # Fit and save
         pipeline.fit(X_train, y_train)
-
         # Evaluate
         y_pred = pipeline.predict(X_test)
         accuracy = accuracy_score(y_test, y_pred)
-
-        # Save artifacts
-        joblib.dump(pipeline, "/tmp/pipeline.pkl")
-        joblib.dump(pipeline.named_steps['model'], "/tmp/model.pkl")
-        joblib.dump(pipeline.named_steps['vectorize'], "/tmp/vectorizer.pkl")
-
-        log_step("πŸ”§ Training model with optimized pipeline...")
-
-        # Hyperparameter tuning for datasets with sufficient samples
-        if len(X_train) >= 20:
-            log_step("βš™οΈ Performing hyperparameter tuning...")
-            param_grid = {
-                'model__C': [0.1, 1, 10],
-                'model__penalty': ['l2']
-            }
-
-            cv_folds = max(2, min(3, len(X_train) // 10))
-            grid_search = GridSearchCV(
-                pipeline,
-                param_grid,
-                cv=StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42),
-                scoring='f1_weighted',
-                n_jobs=1
-            )
-
-            grid_search.fit(X_train, y_train)
-            best_pipeline = grid_search.best_estimator_
-
-            log_step(f"βœ… Best parameters: {grid_search.best_params_}")
-            log_step(f"βœ… Best CV score: {grid_search.best_score_:.4f}")
-        else:
-            log_step("βš™οΈ Using simple training for small dataset...")
-            pipeline.fit(X_train, y_train)
-            best_pipeline = pipeline
-
-        # Evaluate model
-        y_pred = best_pipeline.predict(X_test)
-        accuracy = accuracy_score(y_test, y_pred)
         f1 = f1_score(y_test, y_pred, average='weighted')
 
-        log_step(f"πŸ“ˆ Model Performance:")
-        log_step(f" - Accuracy: {accuracy:.4f}")
-        log_step(f" - F1 Score: {f1:.4f}")
-
-        # Save model artifacts
-        log_step("πŸ’Ύ Saving model artifacts...")
 
-        # Save the complete pipeline
-        joblib.dump(best_pipeline, "/tmp/pipeline.pkl")
-        log_step("βœ… Saved complete pipeline")
 
-        # Save individual components for compatibility
-        joblib.dump(best_pipeline.named_steps['model'], "/tmp/model.pkl")
-        joblib.dump(best_pipeline.named_steps['vectorize'], "/tmp/vectorizer.pkl")
-        log_step("βœ… Saved individual model components")
-
-        # Generate comprehensive metadata
         metadata = {
-            "model_version": f"v1.0_init_{datetime.now().strftime('%Y%m%d_%H%M%S')}",
-            "model_type": "logistic_regression",
-            "training_method": "initial_setup",
-            "dataset_size": len(df),
-            "train_size": len(X_train),
-            "test_size": len(X_test),
            "test_accuracy": float(accuracy),
            "test_f1": float(f1),
-            "hyperparameter_tuning": len(X_train) >= 20,
-            "cv_folds": cv_folds if len(X_train) >= 20 else "not_used",
-            "class_distribution": df['label'].value_counts().to_dict(),
-            "training_config": {
-                "max_features": 5000,
-                "ngram_range": [1, 2],
-                "feature_selection_k": 2000,
-                "test_size": 0.2
-            },
            "timestamp": datetime.now().isoformat(),
-            "initialization_notes": "Model trained during system initialization",
-            "ready_for_production": True
         }
 
-        # Save metadata
-        with open("/tmp/metadata.json", 'w') as f:
             json.dump(metadata, f, indent=2)
 
-        log_step("βœ… Saved comprehensive metadata")
-        log_step(f"πŸŽ‰ Initial model training completed successfully!")
-        log_step(f"πŸ“Š Final Performance - Accuracy: {accuracy:.4f}, F1: {f1:.4f}")
-
        return True
 
    except Exception as e:
        log_step(f"❌ Training failed: {str(e)}")
        import traceback
-        log_step(f"πŸ” Error details: {traceback.format_exc()}")
        return False
 
 
@@ -368,25 +305,37 @@ def create_initial_logs():
         # Activity log
         activity_log = [{
             "timestamp": datetime.now().strftime("%Y-%m-%d %I:%M %p"),
-            "event": "System initialized successfully with trained model",
-            "level": "INFO"
         }]
 
-        with open("/tmp/activity_log.json", 'w') as f:
             json.dump(activity_log, f, indent=2)
 
         # Create empty monitoring logs
-        log_dirs = ["/tmp/logs"]
-        for log_dir in log_dirs:
-            Path(log_dir).mkdir(parents=True, exist_ok=True)
-
-        with open("/tmp/logs/monitoring_log.json", 'w') as f:
             json.dump([], f)
 
-        with open("/tmp/logs/scheduler_execution.json", 'w') as f:
-            json.dump([], f)
 
-        log_step("βœ… Initial log files created")
         return True
 
     except Exception as e:
@@ -394,100 +343,76 @@ def create_initial_logs():
         return False
 
 
-def validate_installation():
-    """Validate that the system is properly set up"""
-    log_step("πŸ” Validating system installation...")
-
-    validation_checks = []
-
-    # Check model files
-    model_exists, model_files = check_model_exists()
-    validation_checks.append(("Model Files", model_exists, f"Found: {[str(f.name) for f in model_files]}"))
-
-    # Check data files
-    data_exists, data_files = check_training_data_exists()
-    validation_checks.append(("Training Data", data_exists, f"Found: {len(data_files)} files"))
-
-    # Check directories
-    required_dirs = ["/tmp/data", "/tmp/model", "/tmp/logs"]
-    dirs_exist = all(Path(d).exists() for d in required_dirs)
-    validation_checks.append(("Directories", dirs_exist, f"Required dirs: {required_dirs}"))
-
-    # Check logs
-    log_exists = Path("/tmp/activity_log.json").exists()
-    validation_checks.append(("Log Files", log_exists, "Activity log created"))
-
     # Test model loading
-    model_loadable = False
     try:
         import joblib
-        pipeline = joblib.load("/tmp/pipeline.pkl")
-        test_prediction = pipeline.predict(["This is a test news article"])
-        model_loadable = True
-        validation_checks.append(("Model Loading", True, f"Test prediction: {test_prediction[0]}"))
     except Exception as e:
-        validation_checks.append(("Model Loading", False, f"Error: {str(e)}"))
-
-    # Print validation results
-    log_step("πŸ“‹ Validation Results:")
-    all_passed = True
-    for check_name, passed, details in validation_checks:
-        status = "βœ… PASS" if passed else "❌ FAIL"
-        log_step(f" {status} {check_name}: {details}")
-        if not passed:
-            all_passed = False
-
-    return all_passed, validation_checks
 
 
 def main():
-    """Main initialization function with smart training logic"""
-    log_step("πŸš€ Starting intelligent system initialization...")
-
-    # Check if model already exists
-    model_exists, existing_model_files = check_model_exists()
-
-    if model_exists:
-        log_step("🎯 EXISTING INSTALLATION DETECTED")
-        log_step("πŸ“„ Found existing model files - skipping training")
-
-        # Load existing metadata to show info
-        try:
-            with open("/tmp/metadata.json", 'r') as f:
-                metadata = json.load(f)
-
-            log_step(f"πŸ“Š Existing Model Info:")
-            log_step(f" - Version: {metadata.get('model_version', 'Unknown')}")
-            log_step(f" - Accuracy: {metadata.get('test_accuracy', 'Unknown')}")
-            log_step(f" - F1 Score: {metadata.get('test_f1', 'Unknown')}")
-            log_step(f" - Created: {metadata.get('timestamp', 'Unknown')}")
-
-        except Exception as e:
-            log_step(f"⚠️ Could not read existing metadata: {e}")
-
-    else:
-        log_step("πŸ†• FIRST-TIME INSTALLATION DETECTED")
-        log_step("πŸ”§ No existing model found - will train new model")
 
-    # Run initialization steps
     steps = [
         ("Directory Creation", create_directories),
-        ("Dataset Copy", copy_original_datasets),
-        ("Dataset Preparation", create_minimal_dataset),
-        ("Log Creation", create_initial_logs)
     ]
 
-    # Add training step only if model doesn't exist
-    if not model_exists:
-        steps.insert(-1, ("πŸ€– Model Training", run_initial_training))
-
     failed_steps = []
 
     for step_name, step_function in steps:
         try:
-            log_step(f"▢️ Starting: {step_name}")
             if step_function():
                 log_step(f"βœ… {step_name} completed")
             else:
                 log_step(f"❌ {step_name} failed")
                 failed_steps.append(step_name)
@@ -495,35 +420,33 @@ def main():
             log_step(f"❌ {step_name} failed: {str(e)}")
             failed_steps.append(step_name)
 
-    # Final validation
-    log_step("πŸ” Running final system validation...")
-    validation_passed, validation_results = validate_installation()
-
     # Summary
-    log_step("=" * 60)
     if failed_steps:
-        log_step(f"⚠️ Initialization completed with {len(failed_steps)} issues")
-        log_step(f"❌ Failed steps: {', '.join(failed_steps)}")
     else:
         log_step("πŸŽ‰ System initialization completed successfully!")
 
-    if validation_passed:
-        log_step("βœ… All validation checks passed!")
-        log_step("πŸš€ System is ready for use!")
-
-        if not model_exists:
-            log_step("πŸ€– NEW MODEL TRAINED AND READY")
-            log_step("πŸ“Š You can now start making predictions!")
-        else:
-            log_step("πŸ”„ EXISTING MODEL VALIDATED AND READY")
-            log_step("πŸ“Š System restored from previous installation!")
-
-    else:
-        log_step("❌ Some validation checks failed")
-        log_step("πŸ”§ Manual intervention may be required")
 
-    log_step("=" * 60)
 
 
 if __name__ == "__main__":
-    main()
 
 from pathlib import Path
 from datetime import datetime
 
+# Import the new path manager
+try:
+    from path_config import path_manager
+except ImportError:
+    # Add current directory to path
+    sys.path.append(os.path.dirname(os.path.abspath(__file__)))
+    from path_config import path_manager
+
 
 def log_step(message):
     """Log initialization steps"""
     print(f"[{datetime.now().strftime('%H:%M:%S')}] {message}")
 
 
 def create_directories():
     """Create necessary directories"""
     log_step("Creating directory structure...")
 
+    # Directories are already created by path_manager initialization
     directories = [
+        path_manager.get_data_path(),
+        path_manager.get_model_path(),
+        path_manager.get_logs_path(),
+        path_manager.get_cache_path(),
+        path_manager.get_temp_path()
     ]
 
     for dir_path in directories:
+        if dir_path.exists():
+            log_step(f"βœ… Directory exists: {dir_path}")
+        else:
+            try:
+                dir_path.mkdir(parents=True, exist_ok=True)
+                log_step(f"βœ… Created directory: {dir_path}")
+            except Exception as e:
+                log_step(f"⚠️ Failed to create {dir_path}: {e}")
+                return False
+
+    # Create kaggle subdirectory
+    kaggle_dir = path_manager.get_data_path('kaggle')
+    kaggle_dir.mkdir(parents=True, exist_ok=True)
+    log_step(f"βœ… Created kaggle directory: {kaggle_dir}")
 
+    return True
 
 
+def check_existing_datasets():
+    """Check for existing datasets in the project structure"""
+    log_step("Checking for existing datasets...")
+
+    # Check for datasets in the current project structure
+    base_dir = path_manager.base_paths['base']
+
+    # Possible source locations
+    source_locations = [
+        base_dir / "data" / "kaggle" / "Fake.csv",
+        base_dir / "data" / "kaggle" / "True.csv",
+        base_dir / "data" / "combined_dataset.csv"
     ]
+
+    found_files = []
+    for source_file in source_locations:
+        if source_file.exists():
+            found_files.append(source_file)
+            log_step(f"βœ… Found existing dataset: {source_file}")
+
+    return found_files
+
 
+def copy_existing_datasets():
+    """Copy existing datasets if they're not in the target location"""
+    log_step("Copying existing datasets to target locations...")
+
+    base_dir = path_manager.base_paths['base']
+    target_data_dir = path_manager.get_data_path()
+
+    # Define source-target pairs
+    copy_operations = [
+        (base_dir / "data" / "kaggle" / "Fake.csv", target_data_dir / "kaggle" / "Fake.csv"),
+        (base_dir / "data" / "kaggle" / "True.csv", target_data_dir / "kaggle" / "True.csv"),
+        (base_dir / "data" / "combined_dataset.csv", target_data_dir / "combined_dataset.csv")
+    ]
+
     copied_count = 0
+    for source, target in copy_operations:
+        # Skip if source and target are the same (already in correct location)
+        if source == target:
+            if source.exists():
+                log_step(f"βœ… Dataset already in correct location: {target}")
+                copied_count += 1
+            continue
+
+        if source.exists():
+            try:
+                # Ensure target directory exists
+                target.parent.mkdir(parents=True, exist_ok=True)
+
+                # Copy file
+                shutil.copy2(source, target)
+                log_step(f"βœ… Copied {source} β†’ {target}")
+                copied_count += 1
+            except Exception as e:
+                log_step(f"⚠️ Failed to copy {source}: {e}")
         else:
             log_step(f"⚠️ Source file not found: {source}")
 
 
 def create_minimal_dataset():
+    """Create a minimal dataset if no existing dataset is found"""
     log_step("Creating minimal dataset...")
 
+    combined_path = path_manager.get_combined_dataset_path()
 
     if combined_path.exists():
+        log_step(f"βœ… Combined dataset already exists: {combined_path}")
         return True
 
+    try:
+        # Create minimal training data with diverse examples
+        minimal_data = pd.DataFrame({
+            'text': [
+                # Real news examples
+                'Scientists at MIT have developed a new renewable energy technology that could revolutionize solar power generation.',
+                'The Federal Reserve announced interest rate decisions following their latest economic review meeting.',
+                'Local authorities report significant improvements in air quality following new environmental regulations.',
+                'Research published in Nature journal reveals new insights about climate change adaptation strategies.',
+                'Economic indicators show steady growth in the manufacturing sector across multiple regions.',
+                'Healthcare officials recommend updated vaccination schedules based on latest medical research findings.',
+                'Transportation department announces infrastructure improvements for major highway systems nationwide.',
+                'Educational institutions implement new digital learning platforms to enhance student engagement.',
+                'Agricultural experts develop drought-resistant crop varieties to improve food security globally.',
+                'Technology companies invest heavily in cybersecurity measures to protect user data privacy.',
+
+                # Fake news examples
+                'SHOCKING: Government officials secretly planning to control population through mind control technology.',
+                'EXCLUSIVE: Celebrities caught in massive alien communication scandal that mainstream media won\'t report.',
+                'BREAKING: Scientists discover time travel but government hiding the truth from public knowledge.',
+                'EXPOSED: Pharmaceutical companies deliberately spreading diseases to increase their massive profits.',
+                'URGENT: Social media platforms using secret algorithms to brainwash users into political compliance.',
+                'LEAKED: Banking system about to collapse completely, insiders reveal financial catastrophe coming soon.',
+                'CONFIRMED: Weather modification technology being used to create artificial natural disasters worldwide.',
+                'REVEALED: Food companies adding dangerous chemicals that cause instant health problems and addiction.',
+                'CONSPIRACY: Educational system designed to suppress critical thinking and create obedient citizens.',
+                'TRUTH: Technology giants working with foreign powers to undermine national sovereignty completely.'
+            ],
+            'label': [
+                # Real news labels (0)
+                0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                # Fake news labels (1)
+                1, 1, 1, 1, 1, 1, 1, 1, 1, 1
+            ]
+        })
+
+        # Save the dataset
+        minimal_data.to_csv(combined_path, index=False)
+        log_step(f"βœ… Created minimal dataset with {len(minimal_data)} samples at {combined_path}")
+
+        # Verify the file was created correctly
+        if combined_path.exists():
+            df_check = pd.read_csv(combined_path)
+            log_step(f"βœ… Verified dataset: {len(df_check)} rows loaded successfully")
+            return True
+        else:
+            log_step("❌ Failed to verify created dataset")
+            return False
 
+    except Exception as e:
+        log_step(f"❌ Failed to create minimal dataset: {str(e)}")
+        return False
 
 
 def run_initial_training():
+    """Run basic model training"""
+    log_step("Starting initial model training...")
 
     try:
+        # Check if model already exists
+        model_path = path_manager.get_model_file_path()
+        vectorizer_path = path_manager.get_vectorizer_path()
+        pipeline_path = path_manager.get_pipeline_path()
+
+        if pipeline_path.exists() or (model_path.exists() and vectorizer_path.exists()):
+            log_step("βœ… Model files already exist")
+            return True
+
+        # Import required libraries
         from sklearn.feature_extraction.text import TfidfVectorizer
         from sklearn.linear_model import LogisticRegression
+        from sklearn.model_selection import train_test_split
+        from sklearn.metrics import accuracy_score, f1_score
         from sklearn.pipeline import Pipeline
         import joblib
 
         # Load dataset
+        dataset_path = path_manager.get_combined_dataset_path()
         if not dataset_path.exists():
             log_step("❌ No dataset available for training")
             return False
 
         df = pd.read_csv(dataset_path)
+        log_step(f"Loaded dataset with {len(df)} samples")
 
+        # Validate dataset
+        if len(df) < 10:
+            log_step("❌ Dataset too small for training")
+            return False
 
         # Prepare data
         X = df['text'].values
         y = df['label'].values
 
+        # Check class distribution
+        class_counts = pd.Series(y).value_counts()
+        log_step(f"Class distribution: {class_counts.to_dict()}")
+
         # Train-test split
         X_train, X_test, y_train, y_test = train_test_split(
+            X, y, test_size=0.2, random_state=42, stratify=y if len(class_counts) > 1 else None
         )
 
+        # Create pipeline with preprocessing
         pipeline = Pipeline([
+            ('vectorizer', TfidfVectorizer(
+                max_features=5000,
                 stop_words='english',
+                ngram_range=(1, 2),
+                min_df=1,
+                max_df=0.95
             )),
             ('model', LogisticRegression(
+                max_iter=1000,
+                random_state=42,
+                class_weight='balanced'
             ))
         ])
+
+        # Train model
+        log_step("Training model...")
         pipeline.fit(X_train, y_train)
+
         # Evaluate
         y_pred = pipeline.predict(X_test)
         accuracy = accuracy_score(y_test, y_pred)
         f1 = f1_score(y_test, y_pred, average='weighted')
 
+        # Save complete pipeline
+        joblib.dump(pipeline, pipeline_path)
+        log_step(f"βœ… Saved pipeline to {pipeline_path}")
 
+        # Save individual components for backward compatibility
+        joblib.dump(pipeline.named_steps['model'], model_path)
+        joblib.dump(pipeline.named_steps['vectorizer'], vectorizer_path)
+        log_step(f"βœ… Saved individual components")
 
+        # Save metadata
         metadata = {
+            "model_version": "v1.0_init",
+            "model_type": "logistic_regression_pipeline",
             "test_accuracy": float(accuracy),
             "test_f1": float(f1),
+            "train_size": len(X_train),
+            "test_size": len(X_test),
             "timestamp": datetime.now().isoformat(),
+            "training_method": "initialization",
+            "environment": path_manager.environment,
+            "data_path": str(dataset_path),
+            "class_distribution": class_counts.to_dict()
         }
 
+        metadata_path = path_manager.get_metadata_path()
+        with open(metadata_path, 'w') as f:
             json.dump(metadata, f, indent=2)
 
+        log_step(f"βœ… Training completed successfully")
+        log_step(f" Accuracy: {accuracy:.4f}")
+        log_step(f" F1 Score: {f1:.4f}")
+        log_step(f" Model saved to: {model_path}")
+        log_step(f" Vectorizer saved to: {vectorizer_path}")
+        log_step(f" Pipeline saved to: {pipeline_path}")
+
         return True
 
     except Exception as e:
         log_step(f"❌ Training failed: {str(e)}")
         import traceback
+        log_step(f"❌ Traceback: {traceback.format_exc()}")
         return False
 
 
         # Activity log
         activity_log = [{
             "timestamp": datetime.now().strftime("%Y-%m-%d %I:%M %p"),
+            "event": "System initialized successfully",
+            "level": "INFO",
+            "environment": path_manager.environment
         }]
 
+        activity_log_path = path_manager.get_activity_log_path()
+        with open(activity_log_path, 'w') as f:
             json.dump(activity_log, f, indent=2)
+        log_step(f"βœ… Created activity log: {activity_log_path}")
 
         # Create empty monitoring logs
+        monitoring_log_path = path_manager.get_logs_path("monitoring_log.json")
+        with open(monitoring_log_path, 'w') as f:
             json.dump([], f)
+        log_step(f"βœ… Created monitoring log: {monitoring_log_path}")
+
+        # Create other necessary log files
+        log_files = [
+            "drift_history.json",
+            "drift_alerts.json",
+            "scheduler_execution.json",
+            "scheduler_errors.json"
+        ]
 
+        for log_file in log_files:
+            log_path = path_manager.get_logs_path(log_file)
+            if not log_path.exists():
+                with open(log_path, 'w') as f:
+                    json.dump([], f)
+                log_step(f"βœ… Created {log_file}")
 
         return True
 
     except Exception as e:
         return False
 
 
+def verify_system():
+    """Verify that the system is properly initialized"""
+    log_step("Verifying system initialization...")
+
+    # Check critical files
+    critical_files = [
+        (path_manager.get_combined_dataset_path(), "Combined dataset"),
+        (path_manager.get_model_file_path(), "Model file"),
+        (path_manager.get_vectorizer_path(), "Vectorizer file"),
+        (path_manager.get_metadata_path(), "Metadata file"),
+        (path_manager.get_activity_log_path(), "Activity log")
+    ]
+
+    all_good = True
+    for file_path, description in critical_files:
+        if file_path.exists():
+            log_step(f"βœ… {description}: {file_path}")
+        else:
+            log_step(f"❌ Missing {description}: {file_path}")
+            all_good = False
+
     # Test model loading
     try:
         import joblib
+        pipeline_path = path_manager.get_pipeline_path()
+        if pipeline_path.exists():
+            pipeline = joblib.load(pipeline_path)
+            test_pred = pipeline.predict(["This is a test text"])
+            log_step(f"βœ… Model test prediction successful: {test_pred}")
+        else:
+            model_path = path_manager.get_model_file_path()
+            vectorizer_path = path_manager.get_vectorizer_path()
+            model = joblib.load(model_path)
+            vectorizer = joblib.load(vectorizer_path)
+            test_text_vec = vectorizer.transform(["This is a test text"])
+            test_pred = model.predict(test_text_vec)
+            log_step(f"βœ… Model component test prediction successful: {test_pred}")
     except Exception as e:
+        log_step(f"❌ Model test failed: {e}")
+        all_good = False
+
+    return all_good
 
 
 def main():
+    """Main initialization function"""
+    log_step("πŸš€ Starting system initialization...")
+    log_step(f"🌍 Environment: {path_manager.environment}")
+    log_step(f"πŸ“ Base directory: {path_manager.base_paths['base']}")
+    log_step(f"πŸ“Š Data directory: {path_manager.base_paths['data']}")
+    log_step(f"πŸ€– Model directory: {path_manager.base_paths['model']}")
 
     steps = [
         ("Directory Creation", create_directories),
+        ("Existing Dataset Copy", copy_existing_datasets),
+        ("Minimal Dataset Creation", create_minimal_dataset),
+        ("Model Training", run_initial_training),
+        ("Log File Creation", create_initial_logs),
+        ("System Verification", verify_system)
     ]
 
     failed_steps = []
+    completed_steps = []
 
     for step_name, step_function in steps:
         try:
+            log_step(f"πŸ”„ Starting: {step_name}")
             if step_function():
                 log_step(f"βœ… {step_name} completed")
+                completed_steps.append(step_name)
             else:
                 log_step(f"❌ {step_name} failed")
                 failed_steps.append(step_name)
             log_step(f"❌ {step_name} failed: {str(e)}")
             failed_steps.append(step_name)
 
     # Summary
+    log_step(f"\nπŸ“Š Initialization Summary:")
+    log_step(f" βœ… Completed: {len(completed_steps)}/{len(steps)} steps")
+    log_step(f" ❌ Failed: {len(failed_steps)}/{len(steps)} steps")
+
+    if completed_steps:
+        log_step(f" Completed steps: {', '.join(completed_steps)}")
+
     if failed_steps:
+        log_step(f" Failed steps: {', '.join(failed_steps)}")
+        log_step(f"⚠️ Initialization completed with {len(failed_steps)} failed steps")
     else:
         log_step("πŸŽ‰ System initialization completed successfully!")
 
+    # Environment info
+    log_step(f"\nπŸ” Environment Information:")
+    env_info = path_manager.get_environment_info()
+    log_step(f" Environment: {env_info['environment']}")
+    log_step(f" Available datasets: {sum(env_info['available_datasets'].values())}")
+    log_step(f" Available models: {sum(env_info['available_models'].values())}")
 
+    log_step("\n🎯 System ready for use!")
+
+    return len(failed_steps) == 0
 
 
 if __name__ == "__main__":
+    success = main()
+    if not success:
+        sys.exit(1)
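
Note: the updated script imports path_manager from a path_config module that is not part of this commit. The sketch below is a hypothetical reconstruction of the interface the script appears to assume; every method name, the directory layout, and the SPACE_ID-based environment check are inferred from the calls above rather than taken from the repository's actual path_config.py.

# path_config.py (hypothetical sketch, not the committed implementation)
import os
import tempfile
from pathlib import Path


class PathManager:
    """Resolve writable base directories once and hand out sub-paths on request."""

    def __init__(self):
        # Assumption: hosted Spaces expose SPACE_ID and only /tmp is writable;
        # locally, the project directory itself is used as the base.
        if os.environ.get("SPACE_ID"):
            self.environment = "huggingface_space"
            root = Path(tempfile.gettempdir())
        else:
            self.environment = "local"
            root = Path(__file__).resolve().parent

        self.base_paths = {
            "base": root,
            "data": root / "data",
            "model": root / "model",
            "logs": root / "logs",
            "cache": root / "cache",
            "temp": root / "tmp",
        }
        # Create the directories up front, matching the comment in create_directories()
        for path in self.base_paths.values():
            path.mkdir(parents=True, exist_ok=True)

    def get_data_path(self, *parts) -> Path:
        return self.base_paths["data"].joinpath(*parts)

    def get_model_path(self, *parts) -> Path:
        return self.base_paths["model"].joinpath(*parts)

    def get_logs_path(self, *parts) -> Path:
        return self.base_paths["logs"].joinpath(*parts)

    def get_cache_path(self, *parts) -> Path:
        return self.base_paths["cache"].joinpath(*parts)

    def get_temp_path(self, *parts) -> Path:
        return self.base_paths["temp"].joinpath(*parts)

    # File-level helpers used by initialize_system.py (file names are assumptions)
    def get_combined_dataset_path(self) -> Path:
        return self.get_data_path("combined_dataset.csv")

    def get_model_file_path(self) -> Path:
        return self.get_model_path("model.pkl")

    def get_vectorizer_path(self) -> Path:
        return self.get_model_path("vectorizer.pkl")

    def get_pipeline_path(self) -> Path:
        return self.get_model_path("pipeline.pkl")

    def get_metadata_path(self) -> Path:
        return self.get_model_path("metadata.json")

    def get_activity_log_path(self) -> Path:
        return self.get_logs_path("activity_log.json")

    def get_environment_info(self) -> dict:
        # Booleans sum cleanly, which is how main() reports the counts.
        datasets = {"combined_dataset.csv": self.get_combined_dataset_path().exists()}
        models = {
            "pipeline.pkl": self.get_pipeline_path().exists(),
            "model.pkl": self.get_model_file_path().exists(),
        }
        return {
            "environment": self.environment,
            "available_datasets": datasets,
            "available_models": models,
        }


# Module-level singleton, matching `from path_config import path_manager`
path_manager = PathManager()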