Commit c678ee1
Parent(s): 3a989cc

Update initialize_system.py

Update to include the model training at the start so the pipeline is available

initialize_system.py CHANGED (+336 -57)
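Background for the diff below: the commit trains the model during initialization and saves the fitted pipeline to /tmp/pipeline.pkl (plus model.pkl, vectorizer.pkl and metadata.json). The following is a minimal sketch of how a downstream process might inspect those artifacts, assuming only the paths and metadata keys written by this script; the report_initialization helper is illustrative and not part of the repository. The pipeline load is guarded the same way the script's own validate_installation() guards it.

# Illustrative sketch (not part of the commit): inspect the artifacts that
# initialize_system.py writes, mirroring its validate_installation() check.
# Assumes the script has already run and written /tmp/metadata.json and /tmp/pipeline.pkl.
import json
from pathlib import Path

import joblib

def report_initialization() -> None:
    # Metadata is plain JSON, so this part is straightforward to read.
    meta = json.loads(Path("/tmp/metadata.json").read_text())
    print("model_version:", meta.get("model_version"))
    print("test_accuracy:", meta.get("test_accuracy"))
    print("test_f1:", meta.get("test_f1"))

    # Loading the pickled pipeline may fail outside the training process,
    # so guard it like the script does in its own validation step.
    try:
        pipeline = joblib.load("/tmp/pipeline.pkl")
        label = pipeline.predict(["This is a test news article"])[0]
        print("test prediction (0 = real, 1 = fake):", int(label))
    except Exception as exc:
        print("pipeline not loadable:", exc)

if __name__ == "__main__":
    report_initialization()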
Removed in this commit (old version; several removed lines are truncated in this view):

-        "/tmp/logs"
-        ("/app/data/combined_dataset.csv", "/tmp/data/combined_dataset.csv")
-    # Create minimal training data
-        'label': [
-    log_step(f"✅ Created minimal dataset with {len(minimal_data)} samples")
-    """Run
-    log_step("Starting
-        model_path = Path("/tmp/model.pkl")
-        vectorizer_path = Path("/tmp/vectorizer.pkl")
-        if model_path.exists() and vectorizer_path.exists():
-            log_step("✅ Model files already exist")
-            return True
-        # Import required libraries
-        log_step(f"Loaded dataset with {len(df)} samples")
-        X_train_vec = vectorizer.fit_transform(X_train)
-        X_test_vec = vectorizer.transform(X_test)
-        # Evaluate
-            "model_version": "v1.
-        "event": "System initialized successfully"
-    """Main initialization function"""
-    log_step("Starting system initialization...")
-        ("Model Training", run_initial_training),
-        log_step(f"Failed: {', '.join(failed_steps)}")
-    main()
Updated file (new version; added lines are marked with +):

@@ -12,14 +12,55 @@ def log_step(message):
     print(f"[{datetime.now().strftime('%H:%M:%S')}] {message}")
 
 
+def check_model_exists():
+    """Check if trained model already exists"""
+    model_files = [
+        Path("/tmp/pipeline.pkl"),
+        Path("/tmp/model.pkl"),
+        Path("/tmp/vectorizer.pkl"),
+        Path("/tmp/metadata.json")
+    ]
+
+    existing_files = [f for f in model_files if f.exists()]
+
+    if len(existing_files) >= 2:  # At least pipeline + metadata OR model + vectorizer
+        log_step(f"✅ Found {len(existing_files)} existing model files")
+        return True, existing_files
+    else:
+        log_step(f"❌ Missing model files - only found {len(existing_files)}")
+        return False, existing_files
+
+
+def check_training_data_exists():
+    """Check if training data is available"""
+    data_files = [
+        Path("/tmp/data/combined_dataset.csv"),
+        Path("/app/data/combined_dataset.csv"),
+        Path("/tmp/data/kaggle/Fake.csv"),
+        Path("/tmp/data/kaggle/True.csv")
+    ]
+
+    existing_data = [f for f in data_files if f.exists()]
+
+    if existing_data:
+        log_step(f"✅ Found training data: {[str(f) for f in existing_data]}")
+        return True, existing_data
+    else:
+        log_step("❌ No training data found")
+        return False, []
+
+
 def create_directories():
     """Create necessary directories"""
     log_step("Creating directory structure...")
 
     directories = [
         "/tmp/data",
+        "/tmp/data/kaggle",
         "/tmp/model",
+        "/tmp/logs",
+        "/tmp/results",
+        "/tmp/backups"
     ]
 
     for dir_path in directories:

@@ -34,7 +75,10 @@ def copy_original_datasets():
     source_files = [
         ("/app/data/kaggle/Fake.csv", "/tmp/data/kaggle/Fake.csv"),
         ("/app/data/kaggle/True.csv", "/tmp/data/kaggle/True.csv"),
+        ("/app/data/combined_dataset.csv", "/tmp/data/combined_dataset.csv"),
+        ("/app/data/liar/train.tsv", "/tmp/data/liar/train.tsv"),
+        ("/app/data/liar/test.tsv", "/tmp/data/liar/test.tsv"),
+        ("/app/data/liar/valid.tsv", "/tmp/data/liar/valid.tsv")
     ]
 
     copied_count = 0

@@ -60,47 +104,92 @@ def create_minimal_dataset():
         log_step("✅ Combined dataset already exists")
         return True
 
+    # Create minimal training data with more samples for better training
     minimal_data = pd.DataFrame({
         'text': [
+            # Real news samples
+            'Scientists at Stanford University have developed a new method for detecting cancer cells using artificial intelligence',
+            'The Federal Reserve announced today a decision to maintain current interest rates amid economic uncertainty',
+            'Local authorities report significant improvements in air quality following new environmental regulations',
+            'Research published in Nature journal shows promising results for renewable energy storage technology',
+            'The United Nations climate summit concluded with new commitments from world leaders on carbon reduction',
+            'Economic indicators suggest steady growth in the manufacturing sector according to latest government data',
+            'Healthcare workers receive additional training on new medical procedures approved by regulatory agencies',
+            'Transportation department announces infrastructure improvements to major highways across the region',
+            'Educational institutions implement new digital learning platforms to enhance student engagement',
+            'Agricultural studies reveal improved crop yields through sustainable farming practices',
+            'Technology companies invest heavily in cybersecurity measures to protect user data and privacy',
+            'Municipal government approves budget for public transportation expansion project in urban areas',
+            'Medical researchers make breakthrough in understanding genetic factors contributing to heart disease',
+            'International trade agreements show positive impact on local businesses and job creation',
+            'Environmental protection agency releases report on water quality improvements in major rivers',
+
+            # Fake news samples
+            'SHOCKING: Government secretly controls weather using hidden technology, whistleblower reveals truth',
+            'EXPOSED: Celebrities caught in massive conspiracy to manipulate public opinion through social media',
+            'URGENT: New study proves that drinking water causes immediate memory loss in 99% of population',
+            'BREAKING: Scientists discover that smartphones are actually mind control devices from aliens',
+            'EXCLUSIVE: Secret documents reveal that all elections have been predetermined by shadow organization',
+            'ALERT: Doctors confirm that eating vegetables makes people 500% more likely to develop rare diseases',
+            'LEAKED: Underground network of billionaires planning to replace all humans with artificial intelligence',
+            'CONSPIRACY: Major corporations hiding cure for aging to maintain population control and profits',
+            'REVEALED: Government admits that gravity is fake and Earth is actually moving upward constantly',
+            'WARNING: New technology allows complete thought reading through WiFi signals in your home',
+            'BOMBSHELL: Ancient aliens return to Earth disguised as tech executives to harvest human energy',
+            'UNCOVERED: All news media controlled by single person living in secret underground bunker',
+            'PROOF: Time travel already exists but only available to wealthy elite who control world events',
+            'SCANDAL: Pharmaceutical companies intentionally create diseases to sell more expensive treatments',
+            'EXPOSED: Education system designed to suppress human creativity and independent thinking abilities'
         ],
+        'label': [
+            # Real news labels (0)
+            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+            # Fake news labels (1)
+            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
+        ]
     })
 
     minimal_data.to_csv(combined_path, index=False)
+    log_step(f"✅ Created enhanced minimal dataset with {len(minimal_data)} samples")
+    log_step(f"   - Real news samples: {sum(minimal_data['label'] == 0)}")
+    log_step(f"   - Fake news samples: {sum(minimal_data['label'] == 1)}")
    return True
 
 
 def run_initial_training():
+    """Run comprehensive model training for first-time setup"""
+    log_step("Starting comprehensive model training for first-time setup...")
 
     try:
+        # Import training modules
         from sklearn.feature_extraction.text import TfidfVectorizer
         from sklearn.linear_model import LogisticRegression
+        from sklearn.ensemble import RandomForestClassifier
+        from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
+        from sklearn.pipeline import Pipeline
+        from sklearn.feature_selection import SelectKBest, chi2
+        from sklearn.preprocessing import FunctionTransformer
+        from sklearn.metrics import accuracy_score, f1_score, classification_report
         import joblib
+        import re
+
+        # Text preprocessing function (same as in train.py)
+        def preprocess_text_function(texts):
+            def clean_single_text(text):
+                text = str(text)
+                text = re.sub(r'http\S+|www\S+|https\S+', '', text)
+                text = re.sub(r'\S+@\S+', '', text)
+                text = re.sub(r'[!]{2,}', '!', text)
+                text = re.sub(r'[?]{2,}', '?', text)
+                text = re.sub(r'[.]{3,}', '...', text)
+                text = re.sub(r'[^a-zA-Z\s.!?]', '', text)
+                text = re.sub(r'\s+', ' ', text)
+                return text.strip().lower()
+
+            processed = []
+            for text in texts:
+                processed.append(clean_single_text(text))
+            return processed
 
         # Load dataset
         dataset_path = Path("/tmp/data/combined_dataset.csv")

@@ -109,7 +198,14 @@ def run_initial_training():
             return False
 
         df = pd.read_csv(dataset_path)
+        log_step(f"Loaded dataset with {len(df)} samples")
+
+        # Data validation and cleaning
+        df = df.dropna(subset=['text', 'label'])
+        df = df[df['text'].astype(str).str.len() > 10]
+
+        log_step(f"After cleaning: {len(df)} samples")
+        log_step(f"Class distribution: {df['label'].value_counts().to_dict()}")
 
         # Prepare data
         X = df['text'].values

@@ -120,46 +216,125 @@ def run_initial_training():
             X, y, test_size=0.2, random_state=42, stratify=y
         )
 
+        log_step(f"Data split: {len(X_train)} train, {len(X_test)} test")
+
+        # Create comprehensive pipeline
+        text_preprocessor = FunctionTransformer(
+            func=preprocess_text_function,
+            validate=False
+        )
+
         vectorizer = TfidfVectorizer(
             max_features=5000,
+            min_df=1,
+            max_df=0.95,
+            ngram_range=(1, 2),
             stop_words='english',
+            sublinear_tf=True,
+            norm='l2'
+        )
+
+        feature_selector = SelectKBest(
+            score_func=chi2,
+            k=2000
         )
 
+        # Create pipeline with Logistic Regression
+        pipeline = Pipeline([
+            ('preprocess', text_preprocessor),
+            ('vectorize', vectorizer),
+            ('feature_select', feature_selector),
+            ('model', LogisticRegression(max_iter=500, class_weight='balanced', random_state=42))
+        ])
+
+        log_step("Training model with optimized pipeline...")
+
+        # Hyperparameter tuning for datasets with sufficient samples
+        if len(X_train) >= 20:
+            log_step("Performing hyperparameter tuning...")
+            param_grid = {
+                'model__C': [0.1, 1, 10],
+                'model__penalty': ['l2']
+            }
+
+            cv_folds = max(2, min(3, len(X_train) // 10))
+            grid_search = GridSearchCV(
+                pipeline,
+                param_grid,
+                cv=StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42),
+                scoring='f1_weighted',
+                n_jobs=1
+            )
+
+            grid_search.fit(X_train, y_train)
+            best_pipeline = grid_search.best_estimator_
+
+            log_step(f"✅ Best parameters: {grid_search.best_params_}")
+            log_step(f"✅ Best CV score: {grid_search.best_score_:.4f}")
+        else:
+            log_step("Using simple training for small dataset...")
+            pipeline.fit(X_train, y_train)
+            best_pipeline = pipeline
 
+        # Evaluate model
+        y_pred = best_pipeline.predict(X_test)
         accuracy = accuracy_score(y_test, y_pred)
+        f1 = f1_score(y_test, y_pred, average='weighted')
 
+        log_step(f"Model Performance:")
+        log_step(f"   - Accuracy: {accuracy:.4f}")
+        log_step(f"   - F1 Score: {f1:.4f}")
 
+        # Save model artifacts
+        log_step("Saving model artifacts...")
+
+        # Save the complete pipeline
+        joblib.dump(best_pipeline, "/tmp/pipeline.pkl")
+        log_step("✅ Saved complete pipeline")
+
+        # Save individual components for compatibility
+        joblib.dump(best_pipeline.named_steps['model'], "/tmp/model.pkl")
+        joblib.dump(best_pipeline.named_steps['vectorize'], "/tmp/vectorizer.pkl")
+        log_step("✅ Saved individual model components")
+
+        # Generate comprehensive metadata
         metadata = {
+            "model_version": f"v1.0_init_{datetime.now().strftime('%Y%m%d_%H%M%S')}",
+            "model_type": "logistic_regression",
+            "training_method": "initial_setup",
+            "dataset_size": len(df),
             "train_size": len(X_train),
             "test_size": len(X_test),
+            "test_accuracy": float(accuracy),
+            "test_f1": float(f1),
+            "hyperparameter_tuning": len(X_train) >= 20,
+            "cv_folds": cv_folds if len(X_train) >= 20 else "not_used",
+            "class_distribution": df['label'].value_counts().to_dict(),
+            "training_config": {
+                "max_features": 5000,
+                "ngram_range": [1, 2],
+                "feature_selection_k": 2000,
+                "test_size": 0.2
+            },
             "timestamp": datetime.now().isoformat(),
+            "initialization_notes": "Model trained during system initialization",
+            "ready_for_production": True
         }
 
+        # Save metadata
         with open("/tmp/metadata.json", 'w') as f:
             json.dump(metadata, f, indent=2)
 
+        log_step("✅ Saved comprehensive metadata")
+        log_step(f"Initial model training completed successfully!")
+        log_step(f"Final Performance - Accuracy: {accuracy:.4f}, F1: {f1:.4f}")
+
         return True
 
     except Exception as e:
         log_step(f"❌ Training failed: {str(e)}")
+        import traceback
+        log_step(f"Error details: {traceback.format_exc()}")
         return False
 
 

@@ -171,16 +346,24 @@ def create_initial_logs():
     # Activity log
     activity_log = [{
         "timestamp": datetime.now().strftime("%Y-%m-%d %I:%M %p"),
+        "event": "System initialized successfully with trained model",
+        "level": "INFO"
     }]
 
     with open("/tmp/activity_log.json", 'w') as f:
         json.dump(activity_log, f, indent=2)
 
     # Create empty monitoring logs
+    log_dirs = ["/tmp/logs"]
+    for log_dir in log_dirs:
+        Path(log_dir).mkdir(parents=True, exist_ok=True)
+
     with open("/tmp/logs/monitoring_log.json", 'w') as f:
         json.dump([], f)
 
+    with open("/tmp/logs/scheduler_execution.json", 'w') as f:
+        json.dump([], f)
+
     log_step("✅ Initial log files created")
     return True
 

@@ -189,22 +372,98 @@ def create_initial_logs():
         return False
 
 
+def validate_installation():
+    """Validate that the system is properly set up"""
+    log_step("Validating system installation...")
+
+    validation_checks = []
+
+    # Check model files
+    model_exists, model_files = check_model_exists()
+    validation_checks.append(("Model Files", model_exists, f"Found: {[str(f.name) for f in model_files]}"))
+
+    # Check data files
+    data_exists, data_files = check_training_data_exists()
+    validation_checks.append(("Training Data", data_exists, f"Found: {len(data_files)} files"))
+
+    # Check directories
+    required_dirs = ["/tmp/data", "/tmp/model", "/tmp/logs"]
+    dirs_exist = all(Path(d).exists() for d in required_dirs)
+    validation_checks.append(("Directories", dirs_exist, f"Required dirs: {required_dirs}"))
+
+    # Check logs
+    log_exists = Path("/tmp/activity_log.json").exists()
+    validation_checks.append(("Log Files", log_exists, "Activity log created"))
+
+    # Test model loading
+    model_loadable = False
+    try:
+        import joblib
+        pipeline = joblib.load("/tmp/pipeline.pkl")
+        test_prediction = pipeline.predict(["This is a test news article"])
+        model_loadable = True
+        validation_checks.append(("Model Loading", True, f"Test prediction: {test_prediction[0]}"))
+    except Exception as e:
+        validation_checks.append(("Model Loading", False, f"Error: {str(e)}"))
+
+    # Print validation results
+    log_step("Validation Results:")
+    all_passed = True
+    for check_name, passed, details in validation_checks:
+        status = "✅ PASS" if passed else "❌ FAIL"
+        log_step(f"   {status} {check_name}: {details}")
+        if not passed:
+            all_passed = False
+
+    return all_passed, validation_checks
+
+
 def main():
+    """Main initialization function with smart training logic"""
+    log_step("Starting intelligent system initialization...")
+
+    # Check if model already exists
+    model_exists, existing_model_files = check_model_exists()
+
+    if model_exists:
+        log_step("EXISTING INSTALLATION DETECTED")
+        log_step("Found existing model files - skipping training")
+
+        # Load existing metadata to show info
+        try:
+            with open("/tmp/metadata.json", 'r') as f:
+                metadata = json.load(f)
+
+            log_step(f"Existing Model Info:")
+            log_step(f"   - Version: {metadata.get('model_version', 'Unknown')}")
+            log_step(f"   - Accuracy: {metadata.get('test_accuracy', 'Unknown')}")
+            log_step(f"   - F1 Score: {metadata.get('test_f1', 'Unknown')}")
+            log_step(f"   - Created: {metadata.get('timestamp', 'Unknown')}")
+
+        except Exception as e:
+            log_step(f"⚠️ Could not read existing metadata: {e}")
+
+    else:
+        log_step("FIRST-TIME INSTALLATION DETECTED")
+        log_step("No existing model found - will train new model")
 
+    # Run initialization steps
     steps = [
         ("Directory Creation", create_directories),
         ("Dataset Copy", copy_original_datasets),
+        ("Dataset Preparation", create_minimal_dataset),
         ("Log Creation", create_initial_logs)
     ]
 
+    # Add training step only if model doesn't exist
+    if not model_exists:
+        steps.insert(-1, ("Model Training", run_initial_training))
+
     failed_steps = []
 
     for step_name, step_function in steps:
         try:
+            log_step(f"Starting: {step_name}")
             if step_function():
                 log_step(f"✅ {step_name} completed")
             else:

@@ -214,15 +473,35 @@ def main():
             log_step(f"❌ {step_name} failed: {str(e)}")
             failed_steps.append(step_name)
 
+    # Final validation
+    log_step("Running final system validation...")
+    validation_passed, validation_results = validate_installation()
+
+    # Summary
+    log_step("=" * 60)
     if failed_steps:
+        log_step(f"⚠️ Initialization completed with {len(failed_steps)} issues")
+        log_step(f"❌ Failed steps: {', '.join(failed_steps)}")
     else:
         log_step("System initialization completed successfully!")
 
+    if validation_passed:
+        log_step("✅ All validation checks passed!")
+        log_step("System is ready for use!")
+
+        if not model_exists:
+            log_step("NEW MODEL TRAINED AND READY")
+            log_step("You can now start making predictions!")
+        else:
+            log_step("EXISTING MODEL VALIDATED AND READY")
+            log_step("System restored from previous installation!")
+
+    else:
+        log_step("❌ Some validation checks failed")
+        log_step("Manual intervention may be required")
+
+    log_step("=" * 60)
 
 
 if __name__ == "__main__":
+    main()
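One detail worth noting in the main() hunk above: steps.insert(-1, ...) places the training step just before the final "Log Creation" entry, so on a first-time install the effective order is Directory Creation, Dataset Copy, Dataset Preparation, Model Training, Log Creation. A small illustrative snippet of that list behaviour (plain strings stand in for the (name, function) tuples used in the script):

# Illustrative only: list.insert(-1, x) inserts before the last element.
steps = ["Directory Creation", "Dataset Copy", "Dataset Preparation", "Log Creation"]
steps.insert(-1, "Model Training")
print(steps)
# ['Directory Creation', 'Dataset Copy', 'Dataset Preparation', 'Model Training', 'Log Creation']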