Spaces:

Ahmedik95316
/

Fake-News-Detection-with-MLOps

Running

App Files Files Community

Fake-News-Detection-with-MLOps / initialize_system.py

Ahmedik95316

Update initialize_system.py

c678ee1 2 months ago

raw

history blame

20 kB

	import os
	import sys
	import shutil
	import pandas as pd
	import json
	from pathlib import Path
	from datetime import datetime


	def log_step(message):
	"""Log initialization steps"""
	print(f"[{datetime.now().strftime('%H:%M:%S')}] {message}")


	def check_model_exists():
	"""Check if trained model already exists"""
	model_files = [
	Path("/tmp/pipeline.pkl"),
	Path("/tmp/model.pkl"),
	Path("/tmp/vectorizer.pkl"),
	Path("/tmp/metadata.json")
	]

	existing_files = [f for f in model_files if f.exists()]

	if len(existing_files) >= 2: # At least pipeline + metadata OR model + vectorizer
	log_step(f"✅ Found {len(existing_files)} existing model files")
	return True, existing_files
	else:
	log_step(f"❌ Missing model files - only found {len(existing_files)}")
	return False, existing_files


	def check_training_data_exists():
	"""Check if training data is available"""
	data_files = [
	Path("/tmp/data/combined_dataset.csv"),
	Path("/app/data/combined_dataset.csv"),
	Path("/tmp/data/kaggle/Fake.csv"),
	Path("/tmp/data/kaggle/True.csv")
	]

	existing_data = [f for f in data_files if f.exists()]

	if existing_data:
	log_step(f"✅ Found training data: {[str(f) for f in existing_data]}")
	return True, existing_data
	else:
	log_step("❌ No training data found")
	return False, []


	def create_directories():
	"""Create necessary directories"""
	log_step("Creating directory structure...")

	directories = [
	"/tmp/data",
	"/tmp/data/kaggle",
	"/tmp/model",
	"/tmp/logs",
	"/tmp/results",
	"/tmp/backups"
	]

	for dir_path in directories:
	Path(dir_path).mkdir(parents=True, exist_ok=True)
	log_step(f"✅ Created {dir_path}")


	def copy_original_datasets():
	"""Copy original datasets from /app to /tmp"""
	log_step("Copying original datasets...")

	source_files = [
	("/app/data/kaggle/Fake.csv", "/tmp/data/kaggle/Fake.csv"),
	("/app/data/kaggle/True.csv", "/tmp/data/kaggle/True.csv"),
	("/app/data/combined_dataset.csv", "/tmp/data/combined_dataset.csv"),
	("/app/data/liar/train.tsv", "/tmp/data/liar/train.tsv"),
	("/app/data/liar/test.tsv", "/tmp/data/liar/test.tsv"),
	("/app/data/liar/valid.tsv", "/tmp/data/liar/valid.tsv")
	]

	copied_count = 0
	for source, dest in source_files:
	if Path(source).exists():
	Path(dest).parent.mkdir(parents=True, exist_ok=True)
	shutil.copy(source, dest)
	log_step(f"✅ Copied {source} to {dest}")
	copied_count += 1
	else:
	log_step(f"⚠️ Source file not found: {source}")

	return copied_count > 0


	def create_minimal_dataset():
	"""Create a minimal dataset if original doesn't exist"""
	log_step("Creating minimal dataset...")

	combined_path = Path("/tmp/data/combined_dataset.csv")

	if combined_path.exists():
	log_step("✅ Combined dataset already exists")
	return True

	# Create minimal training data with more samples for better training
	minimal_data = pd.DataFrame({
	'text': [
	# Real news samples
	'Scientists at Stanford University have developed a new method for detecting cancer cells using artificial intelligence',
	'The Federal Reserve announced today a decision to maintain current interest rates amid economic uncertainty',
	'Local authorities report significant improvements in air quality following new environmental regulations',
	'Research published in Nature journal shows promising results for renewable energy storage technology',
	'The United Nations climate summit concluded with new commitments from world leaders on carbon reduction',
	'Economic indicators suggest steady growth in the manufacturing sector according to latest government data',
	'Healthcare workers receive additional training on new medical procedures approved by regulatory agencies',
	'Transportation department announces infrastructure improvements to major highways across the region',
	'Educational institutions implement new digital learning platforms to enhance student engagement',
	'Agricultural studies reveal improved crop yields through sustainable farming practices',
	'Technology companies invest heavily in cybersecurity measures to protect user data and privacy',
	'Municipal government approves budget for public transportation expansion project in urban areas',
	'Medical researchers make breakthrough in understanding genetic factors contributing to heart disease',
	'International trade agreements show positive impact on local businesses and job creation',
	'Environmental protection agency releases report on water quality improvements in major rivers',

	# Fake news samples
	'SHOCKING: Government secretly controls weather using hidden technology, whistleblower reveals truth',
	'EXPOSED: Celebrities caught in massive conspiracy to manipulate public opinion through social media',
	'URGENT: New study proves that drinking water causes immediate memory loss in 99% of population',
	'BREAKING: Scientists discover that smartphones are actually mind control devices from aliens',
	'EXCLUSIVE: Secret documents reveal that all elections have been predetermined by shadow organization',
	'ALERT: Doctors confirm that eating vegetables makes people 500% more likely to develop rare diseases',
	'LEAKED: Underground network of billionaires planning to replace all humans with artificial intelligence',
	'CONSPIRACY: Major corporations hiding cure for aging to maintain population control and profits',
	'REVEALED: Government admits that gravity is fake and Earth is actually moving upward constantly',
	'WARNING: New technology allows complete thought reading through WiFi signals in your home',
	'BOMBSHELL: Ancient aliens return to Earth disguised as tech executives to harvest human energy',
	'UNCOVERED: All news media controlled by single person living in secret underground bunker',
	'PROOF: Time travel already exists but only available to wealthy elite who control world events',
	'SCANDAL: Pharmaceutical companies intentionally create diseases to sell more expensive treatments',
	'EXPOSED: Education system designed to suppress human creativity and independent thinking abilities'
	],
	'label': [
	# Real news labels (0)
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	# Fake news labels (1)
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
	]
	})

	minimal_data.to_csv(combined_path, index=False)
	log_step(f"✅ Created enhanced minimal dataset with {len(minimal_data)} samples")
	log_step(f" - Real news samples: {sum(minimal_data['label'] == 0)}")
	log_step(f" - Fake news samples: {sum(minimal_data['label'] == 1)}")
	return True


	def run_initial_training():
	"""Run comprehensive model training for first-time setup"""
	log_step("🚀 Starting comprehensive model training for first-time setup...")

	try:
	# Import training modules
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.linear_model import LogisticRegression
	from sklearn.ensemble import RandomForestClassifier
	from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
	from sklearn.pipeline import Pipeline
	from sklearn.feature_selection import SelectKBest, chi2
	from sklearn.preprocessing import FunctionTransformer
	from sklearn.metrics import accuracy_score, f1_score, classification_report
	import joblib
	import re

	# Text preprocessing function (same as in train.py)
	def preprocess_text_function(texts):
	def clean_single_text(text):
	text = str(text)
	text = re.sub(r'http\S+\|www\S+\|https\S+', '', text)
	text = re.sub(r'\S+@\S+', '', text)
	text = re.sub(r'[!]{2,}', '!', text)
	text = re.sub(r'[?]{2,}', '?', text)
	text = re.sub(r'[.]{3,}', '...', text)
	text = re.sub(r'[^a-zA-Z\s.!?]', '', text)
	text = re.sub(r'\s+', ' ', text)
	return text.strip().lower()

	processed = []
	for text in texts:
	processed.append(clean_single_text(text))
	return processed

	# Load dataset
	dataset_path = Path("/tmp/data/combined_dataset.csv")
	if not dataset_path.exists():
	log_step("❌ No dataset available for training")
	return False

	df = pd.read_csv(dataset_path)
	log_step(f"📊 Loaded dataset with {len(df)} samples")

	# Data validation and cleaning
	df = df.dropna(subset=['text', 'label'])
	df = df[df['text'].astype(str).str.len() > 10]

	log_step(f"📊 After cleaning: {len(df)} samples")
	log_step(f"📊 Class distribution: {df['label'].value_counts().to_dict()}")

	# Prepare data
	X = df['text'].values
	y = df['label'].values

	# Train-test split
	X_train, X_test, y_train, y_test = train_test_split(
	X, y, test_size=0.2, random_state=42, stratify=y
	)

	log_step(f"📊 Data split: {len(X_train)} train, {len(X_test)} test")

	# Create comprehensive pipeline
	text_preprocessor = FunctionTransformer(
	func=preprocess_text_function,
	validate=False
	)

	vectorizer = TfidfVectorizer(
	max_features=5000,
	min_df=1,
	max_df=0.95,
	ngram_range=(1, 2),
	stop_words='english',
	sublinear_tf=True,
	norm='l2'
	)

	feature_selector = SelectKBest(
	score_func=chi2,
	k=2000
	)

	# Create pipeline with Logistic Regression
	pipeline = Pipeline([
	('preprocess', text_preprocessor),
	('vectorize', vectorizer),
	('feature_select', feature_selector),
	('model', LogisticRegression(max_iter=500, class_weight='balanced', random_state=42))
	])

	log_step("🔧 Training model with optimized pipeline...")

	# Hyperparameter tuning for datasets with sufficient samples
	if len(X_train) >= 20:
	log_step("⚙️ Performing hyperparameter tuning...")
	param_grid = {
	'model__C': [0.1, 1, 10],
	'model__penalty': ['l2']
	}

	cv_folds = max(2, min(3, len(X_train) // 10))
	grid_search = GridSearchCV(
	pipeline,
	param_grid,
	cv=StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42),
	scoring='f1_weighted',
	n_jobs=1
	)

	grid_search.fit(X_train, y_train)
	best_pipeline = grid_search.best_estimator_

	log_step(f"✅ Best parameters: {grid_search.best_params_}")
	log_step(f"✅ Best CV score: {grid_search.best_score_:.4f}")
	else:
	log_step("⚙️ Using simple training for small dataset...")
	pipeline.fit(X_train, y_train)
	best_pipeline = pipeline

	# Evaluate model
	y_pred = best_pipeline.predict(X_test)
	accuracy = accuracy_score(y_test, y_pred)
	f1 = f1_score(y_test, y_pred, average='weighted')

	log_step(f"📈 Model Performance:")
	log_step(f" - Accuracy: {accuracy:.4f}")
	log_step(f" - F1 Score: {f1:.4f}")

	# Save model artifacts
	log_step("💾 Saving model artifacts...")

	# Save the complete pipeline
	joblib.dump(best_pipeline, "/tmp/pipeline.pkl")
	log_step("✅ Saved complete pipeline")

	# Save individual components for compatibility
	joblib.dump(best_pipeline.named_steps['model'], "/tmp/model.pkl")
	joblib.dump(best_pipeline.named_steps['vectorize'], "/tmp/vectorizer.pkl")
	log_step("✅ Saved individual model components")

	# Generate comprehensive metadata
	metadata = {
	"model_version": f"v1.0_init_{datetime.now().strftime('%Y%m%d_%H%M%S')}",
	"model_type": "logistic_regression",
	"training_method": "initial_setup",
	"dataset_size": len(df),
	"train_size": len(X_train),
	"test_size": len(X_test),
	"test_accuracy": float(accuracy),
	"test_f1": float(f1),
	"hyperparameter_tuning": len(X_train) >= 20,
	"cv_folds": cv_folds if len(X_train) >= 20 else "not_used",
	"class_distribution": df['label'].value_counts().to_dict(),
	"training_config": {
	"max_features": 5000,
	"ngram_range": [1, 2],
	"feature_selection_k": 2000,
	"test_size": 0.2
	},
	"timestamp": datetime.now().isoformat(),
	"initialization_notes": "Model trained during system initialization",
	"ready_for_production": True
	}

	# Save metadata
	with open("/tmp/metadata.json", 'w') as f:
	json.dump(metadata, f, indent=2)

	log_step("✅ Saved comprehensive metadata")
	log_step(f"🎉 Initial model training completed successfully!")
	log_step(f"📊 Final Performance - Accuracy: {accuracy:.4f}, F1: {f1:.4f}")

	return True

	except Exception as e:
	log_step(f"❌ Training failed: {str(e)}")
	import traceback
	log_step(f"🔍 Error details: {traceback.format_exc()}")
	return False


	def create_initial_logs():
	"""Create initial log files"""
	log_step("Creating initial log files...")

	try:
	# Activity log
	activity_log = [{
	"timestamp": datetime.now().strftime("%Y-%m-%d %I:%M %p"),
	"event": "System initialized successfully with trained model",
	"level": "INFO"
	}]

	with open("/tmp/activity_log.json", 'w') as f:
	json.dump(activity_log, f, indent=2)

	# Create empty monitoring logs
	log_dirs = ["/tmp/logs"]
	for log_dir in log_dirs:
	Path(log_dir).mkdir(parents=True, exist_ok=True)

	with open("/tmp/logs/monitoring_log.json", 'w') as f:
	json.dump([], f)

	with open("/tmp/logs/scheduler_execution.json", 'w') as f:
	json.dump([], f)

	log_step("✅ Initial log files created")
	return True

	except Exception as e:
	log_step(f"❌ Log creation failed: {str(e)}")
	return False


	def validate_installation():
	"""Validate that the system is properly set up"""
	log_step("🔍 Validating system installation...")

	validation_checks = []

	# Check model files
	model_exists, model_files = check_model_exists()
	validation_checks.append(("Model Files", model_exists, f"Found: {[str(f.name) for f in model_files]}"))

	# Check data files
	data_exists, data_files = check_training_data_exists()
	validation_checks.append(("Training Data", data_exists, f"Found: {len(data_files)} files"))

	# Check directories
	required_dirs = ["/tmp/data", "/tmp/model", "/tmp/logs"]
	dirs_exist = all(Path(d).exists() for d in required_dirs)
	validation_checks.append(("Directories", dirs_exist, f"Required dirs: {required_dirs}"))

	# Check logs
	log_exists = Path("/tmp/activity_log.json").exists()
	validation_checks.append(("Log Files", log_exists, "Activity log created"))

	# Test model loading
	model_loadable = False
	try:
	import joblib
	pipeline = joblib.load("/tmp/pipeline.pkl")
	test_prediction = pipeline.predict(["This is a test news article"])
	model_loadable = True
	validation_checks.append(("Model Loading", True, f"Test prediction: {test_prediction[0]}"))
	except Exception as e:
	validation_checks.append(("Model Loading", False, f"Error: {str(e)}"))

	# Print validation results
	log_step("📋 Validation Results:")
	all_passed = True
	for check_name, passed, details in validation_checks:
	status = "✅ PASS" if passed else "❌ FAIL"
	log_step(f" {status} {check_name}: {details}")
	if not passed:
	all_passed = False

	return all_passed, validation_checks


	def main():
	"""Main initialization function with smart training logic"""
	log_step("🚀 Starting intelligent system initialization...")

	# Check if model already exists
	model_exists, existing_model_files = check_model_exists()

	if model_exists:
	log_step("🎯 EXISTING INSTALLATION DETECTED")
	log_step("📄 Found existing model files - skipping training")

	# Load existing metadata to show info
	try:
	with open("/tmp/metadata.json", 'r') as f:
	metadata = json.load(f)

	log_step(f"📊 Existing Model Info:")
	log_step(f" - Version: {metadata.get('model_version', 'Unknown')}")
	log_step(f" - Accuracy: {metadata.get('test_accuracy', 'Unknown')}")
	log_step(f" - F1 Score: {metadata.get('test_f1', 'Unknown')}")
	log_step(f" - Created: {metadata.get('timestamp', 'Unknown')}")

	except Exception as e:
	log_step(f"⚠️ Could not read existing metadata: {e}")

	else:
	log_step("🆕 FIRST-TIME INSTALLATION DETECTED")
	log_step("🔧 No existing model found - will train new model")

	# Run initialization steps
	steps = [
	("Directory Creation", create_directories),
	("Dataset Copy", copy_original_datasets),
	("Dataset Preparation", create_minimal_dataset),
	("Log Creation", create_initial_logs)
	]

	# Add training step only if model doesn't exist
	if not model_exists:
	steps.insert(-1, ("🤖 Model Training", run_initial_training))

	failed_steps = []

	for step_name, step_function in steps:
	try:
	log_step(f"▶️ Starting: {step_name}")
	if step_function():
	log_step(f"✅ {step_name} completed")
	else:
	log_step(f"❌ {step_name} failed")
	failed_steps.append(step_name)
	except Exception as e:
	log_step(f"❌ {step_name} failed: {str(e)}")
	failed_steps.append(step_name)

	# Final validation
	log_step("🔍 Running final system validation...")
	validation_passed, validation_results = validate_installation()

	# Summary
	log_step("=" * 60)
	if failed_steps:
	log_step(f"⚠️ Initialization completed with {len(failed_steps)} issues")
	log_step(f"❌ Failed steps: {', '.join(failed_steps)}")
	else:
	log_step("🎉 System initialization completed successfully!")

	if validation_passed:
	log_step("✅ All validation checks passed!")
	log_step("🚀 System is ready for use!")

	if not model_exists:
	log_step("🤖 NEW MODEL TRAINED AND READY")
	log_step("📊 You can now start making predictions!")
	else:
	log_step("🔄 EXISTING MODEL VALIDATED AND READY")
	log_step("📊 System restored from previous installation!")

	else:
	log_step("❌ Some validation checks failed")
	log_step("🔧 Manual intervention may be required")

	log_step("=" * 60)


	if __name__ == "__main__":
	main()