"""Initialize the fake-news detection demo: directories, datasets, a baseline model, and log files."""

import shutil
import json
import pandas as pd
from pathlib import Path
from datetime import datetime


def log_step(message):
    """Log initialization steps"""
    print(f"[{datetime.now().strftime('%H:%M:%S')}] {message}")


def create_directories():
    """Create necessary directories"""
    log_step("Creating directory structure...")
    directories = [
        "/tmp/data",
        "/tmp/model",
        "/tmp/logs"
    ]
    for dir_path in directories:
        Path(dir_path).mkdir(parents=True, exist_ok=True)
        log_step(f"✅ Created {dir_path}")
    # Return True so main() does not mark this step as failed.
    return True


def copy_original_datasets():
    """Copy original datasets from /app to /tmp"""
    log_step("Copying original datasets...")
    source_files = [
        ("/app/data/kaggle/Fake.csv", "/tmp/data/kaggle/Fake.csv"),
        ("/app/data/kaggle/True.csv", "/tmp/data/kaggle/True.csv"),
        ("/app/data/combined_dataset.csv", "/tmp/data/combined_dataset.csv")
    ]
    copied_count = 0
    for source, dest in source_files:
        if Path(source).exists():
            Path(dest).parent.mkdir(parents=True, exist_ok=True)
            shutil.copy(source, dest)
            log_step(f"✅ Copied {source} to {dest}")
            copied_count += 1
        else:
            log_step(f"⚠️ Source file not found: {source}")
    return copied_count > 0


def create_minimal_dataset():
    """Create a minimal dataset if the original doesn't exist"""
    log_step("Creating minimal dataset...")
    combined_path = Path("/tmp/data/combined_dataset.csv")
    if combined_path.exists():
        log_step("✅ Combined dataset already exists")
        return True

    # Create minimal training data
    minimal_data = pd.DataFrame({
        'text': [
            'Scientists discover new species in Amazon rainforest',
            'SHOCKING: Aliens spotted in Area 51, government confirms existence',
            'Local authorities report increase in renewable energy adoption',
            'You won\'t believe what happens when you eat this miracle fruit',
            'Economic indicators show steady growth in manufacturing sector',
            'EXCLUSIVE: Celebrity caught in secret alien communication scandal',
            'Research shows positive effects of meditation on mental health',
            'Government hiding truth about flat earth, conspiracy theorists claim',
            'New study reveals benefits of regular exercise for elderly',
            'BREAKING: Time travel confirmed by underground scientists'
        ],
        'label': [0, 1, 0, 1, 0, 1, 0, 1, 0, 1]  # 0 = Real, 1 = Fake
    })
    minimal_data.to_csv(combined_path, index=False)
    log_step(f"✅ Created minimal dataset with {len(minimal_data)} samples")
    return True


def run_initial_training():
    """Run basic model training"""
    log_step("Starting initial model training...")
    try:
        # Check if model already exists
        model_path = Path("/tmp/model.pkl")
        vectorizer_path = Path("/tmp/vectorizer.pkl")
        if model_path.exists() and vectorizer_path.exists():
            log_step("✅ Model files already exist")
            return True

        # Import required libraries
        from sklearn.feature_extraction.text import TfidfVectorizer
        from sklearn.linear_model import LogisticRegression
        from sklearn.model_selection import train_test_split
        from sklearn.metrics import accuracy_score
        import joblib

        # Load dataset
        dataset_path = Path("/tmp/data/combined_dataset.csv")
        if not dataset_path.exists():
            log_step("❌ No dataset available for training")
            return False

        df = pd.read_csv(dataset_path)
        log_step(f"Loaded dataset with {len(df)} samples")

        # Prepare data
        X = df['text'].values
        y = df['label'].values

        # Train-test split
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )

        # Vectorization
        vectorizer = TfidfVectorizer(
            max_features=5000,
            stop_words='english',
            ngram_range=(1, 2)
        )
        X_train_vec = vectorizer.fit_transform(X_train)
        X_test_vec = vectorizer.transform(X_test)

        # Train model
        model = LogisticRegression(max_iter=1000, random_state=42)
        model.fit(X_train_vec, y_train)

        # Evaluate
        y_pred = model.predict(X_test_vec)
        accuracy = accuracy_score(y_test, y_pred)

        # Save model
        joblib.dump(model, "/tmp/model.pkl")
        joblib.dump(vectorizer, "/tmp/vectorizer.pkl")

        # Save metadata
        metadata = {
            "model_version": "v1.0_init",
            "test_accuracy": float(accuracy),
            "train_size": len(X_train),
            "test_size": len(X_test),
            "timestamp": datetime.now().isoformat(),
            "training_method": "initialization"
        }
        with open("/tmp/metadata.json", 'w') as f:
            json.dump(metadata, f, indent=2)

        log_step(f"✅ Training completed successfully, accuracy: {accuracy:.4f}")
        return True

    except Exception as e:
        log_step(f"❌ Training failed: {str(e)}")
        return False


def create_initial_logs():
    """Create initial log files"""
    log_step("Creating initial log files...")
    try:
        # Activity log
        activity_log = [{
            "timestamp": datetime.now().strftime("%Y-%m-%d %I:%M %p"),
            "event": "System initialized successfully"
        }]
        with open("/tmp/activity_log.json", 'w') as f:
            json.dump(activity_log, f, indent=2)

        # Create empty monitoring logs
        with open("/tmp/logs/monitoring_log.json", 'w') as f:
            json.dump([], f)

        log_step("✅ Initial log files created")
        return True
    except Exception as e:
        log_step(f"❌ Log creation failed: {str(e)}")
        return False


def main():
    """Main initialization function"""
    log_step("🚀 Starting system initialization...")
    steps = [
        ("Directory Creation", create_directories),
        ("Dataset Copy", copy_original_datasets),
        ("Minimal Dataset", create_minimal_dataset),
        ("Model Training", run_initial_training),
        ("Log Creation", create_initial_logs)
    ]
    failed_steps = []
    for step_name, step_function in steps:
        try:
            if step_function():
                log_step(f"✅ {step_name} completed")
            else:
                log_step(f"❌ {step_name} failed")
                failed_steps.append(step_name)
        except Exception as e:
            log_step(f"❌ {step_name} failed: {str(e)}")
            failed_steps.append(step_name)

    if failed_steps:
        log_step(f"⚠️ Initialization completed with {len(failed_steps)} failed steps")
        log_step(f"Failed: {', '.join(failed_steps)}")
    else:
        log_step("🎉 System initialization completed successfully!")
        log_step("System ready for use!")


if __name__ == "__main__":
    main()
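
# ---------------------------------------------------------------------------
# Usage sketch. The script filename and the sample headline below are
# assumptions for illustration, not part of this module. After a successful
# run, the saved artifacts can be loaded for a quick smoke test:
#
#   $ python init_system.py          # assumed filename
#
#   >>> import joblib
#   >>> model = joblib.load("/tmp/model.pkl")
#   >>> vectorizer = joblib.load("/tmp/vectorizer.pkl")
#   >>> model.predict(vectorizer.transform(["Example headline to classify"]))
#
# predict() returns an array of labels using the dataset's convention:
# 0 = Real, 1 = Fake.
# ---------------------------------------------------------------------------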