"""System initialization script for Fake-News-Detection-with-MLOps (initialize_system.py)."""
import json
import re
import shutil
from datetime import datetime
from pathlib import Path

import pandas as pd


def log_step(message):
    """Log initialization steps with a timestamp."""
    print(f"[{datetime.now().strftime('%H:%M:%S')}] {message}")


def check_model_exists():
    """Check whether a usable trained model already exists."""
    model_files = [
        Path("/tmp/pipeline.pkl"),
        Path("/tmp/model.pkl"),
        Path("/tmp/vectorizer.pkl"),
        Path("/tmp/metadata.json")
    ]
    existing_files = [f for f in model_files if f.exists()]
    # "Usable" means the full pipeline plus its metadata, OR the separate
    # model plus vectorizer (not merely any two of the four files)
    pipeline_ready = (Path("/tmp/pipeline.pkl").exists()
                      and Path("/tmp/metadata.json").exists())
    components_ready = (Path("/tmp/model.pkl").exists()
                        and Path("/tmp/vectorizer.pkl").exists())
    if pipeline_ready or components_ready:
        log_step(f"✅ Found {len(existing_files)} existing model files")
        return True, existing_files
    else:
        log_step(f"❌ Missing model files - only found {len(existing_files)}")
        return False, existing_files
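
# The two artifact layouts accepted above (a sketch of the expected files):
#   /tmp/pipeline.pkl + /tmp/metadata.json   -> one fitted sklearn Pipeline plus its metadata
#   /tmp/model.pkl + /tmp/vectorizer.pkl     -> classifier and TF-IDF vectorizer saved separately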


def check_training_data_exists():
    """Check if training data is available"""
    data_files = [
        Path("/tmp/data/combined_dataset.csv"),
        Path("/app/data/combined_dataset.csv"),
        Path("/tmp/data/kaggle/Fake.csv"),
        Path("/tmp/data/kaggle/True.csv")
    ]
    existing_data = [f for f in data_files if f.exists()]
    if existing_data:
        log_step(f"✅ Found training data: {[str(f) for f in existing_data]}")
        return True, existing_data
    else:
        log_step("❌ No training data found")
        return False, []
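
# Data is looked up in both the /tmp tree and the /app tree; the assumption
# (not verified here) is that /app holds the read-only repo checkout while
# /tmp is the only writable location at runtime.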


def create_directories():
    """Create necessary directories"""
    log_step("Creating directory structure...")
    directories = [
        "/tmp/data",
        "/tmp/data/kaggle",
        "/tmp/model",
        "/tmp/logs",
        "/tmp/results",
        "/tmp/backups"
    ]
    for dir_path in directories:
        Path(dir_path).mkdir(parents=True, exist_ok=True)
        log_step(f"✅ Created {dir_path}")
    # Return True explicitly: main() treats a falsy return (including the
    # implicit None) as a failed step
    return True


def copy_original_datasets():
    """Copy original datasets from /app to /tmp"""
    log_step("Copying original datasets...")
    source_files = [
        ("/app/data/kaggle/Fake.csv", "/tmp/data/kaggle/Fake.csv"),
        ("/app/data/kaggle/True.csv", "/tmp/data/kaggle/True.csv"),
        ("/app/data/combined_dataset.csv", "/tmp/data/combined_dataset.csv"),
        ("/app/data/liar/train.tsv", "/tmp/data/liar/train.tsv"),
        ("/app/data/liar/test.tsv", "/tmp/data/liar/test.tsv"),
        ("/app/data/liar/valid.tsv", "/tmp/data/liar/valid.tsv")
    ]
    copied_count = 0
    for source, dest in source_files:
        if Path(source).exists():
            Path(dest).parent.mkdir(parents=True, exist_ok=True)
            shutil.copy(source, dest)
            log_step(f"✅ Copied {source} to {dest}")
            copied_count += 1
        else:
            log_step(f"⚠️ Source file not found: {source}")
    return copied_count > 0
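
# Expected source layout under /app, inferred from the copy list above:
#   /app/data/kaggle/Fake.csv, /app/data/kaggle/True.csv
#   /app/data/combined_dataset.csv
#   /app/data/liar/train.tsv, /app/data/liar/test.tsv, /app/data/liar/valid.tsv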


def create_minimal_dataset():
    """Create a minimal fallback dataset if the original doesn't exist"""
    log_step("Creating minimal dataset...")
    combined_path = Path("/tmp/data/combined_dataset.csv")
    if combined_path.exists():
        log_step("✅ Combined dataset already exists")
        return True
    # Minimal training data: 15 real and 15 fake samples, enough for a
    # stratified train/test split and a usable first model
    minimal_data = pd.DataFrame({
        'text': [
            # Real news samples
            'Scientists at Stanford University have developed a new method for detecting cancer cells using artificial intelligence',
            'The Federal Reserve announced today a decision to maintain current interest rates amid economic uncertainty',
            'Local authorities report significant improvements in air quality following new environmental regulations',
            'Research published in Nature journal shows promising results for renewable energy storage technology',
            'The United Nations climate summit concluded with new commitments from world leaders on carbon reduction',
            'Economic indicators suggest steady growth in the manufacturing sector according to latest government data',
            'Healthcare workers receive additional training on new medical procedures approved by regulatory agencies',
            'Transportation department announces infrastructure improvements to major highways across the region',
            'Educational institutions implement new digital learning platforms to enhance student engagement',
            'Agricultural studies reveal improved crop yields through sustainable farming practices',
            'Technology companies invest heavily in cybersecurity measures to protect user data and privacy',
            'Municipal government approves budget for public transportation expansion project in urban areas',
            'Medical researchers make breakthrough in understanding genetic factors contributing to heart disease',
            'International trade agreements show positive impact on local businesses and job creation',
            'Environmental protection agency releases report on water quality improvements in major rivers',
            # Fake news samples
            'SHOCKING: Government secretly controls weather using hidden technology, whistleblower reveals truth',
            'EXPOSED: Celebrities caught in massive conspiracy to manipulate public opinion through social media',
            'URGENT: New study proves that drinking water causes immediate memory loss in 99% of population',
            'BREAKING: Scientists discover that smartphones are actually mind control devices from aliens',
            'EXCLUSIVE: Secret documents reveal that all elections have been predetermined by shadow organization',
            'ALERT: Doctors confirm that eating vegetables makes people 500% more likely to develop rare diseases',
            'LEAKED: Underground network of billionaires planning to replace all humans with artificial intelligence',
            'CONSPIRACY: Major corporations hiding cure for aging to maintain population control and profits',
            'REVEALED: Government admits that gravity is fake and Earth is actually moving upward constantly',
            'WARNING: New technology allows complete thought reading through WiFi signals in your home',
            'BOMBSHELL: Ancient aliens return to Earth disguised as tech executives to harvest human energy',
            'UNCOVERED: All news media controlled by single person living in secret underground bunker',
            'PROOF: Time travel already exists but only available to wealthy elite who control world events',
            'SCANDAL: Pharmaceutical companies intentionally create diseases to sell more expensive treatments',
            'EXPOSED: Education system designed to suppress human creativity and independent thinking abilities'
        ],
        # 15 real samples (label 0) followed by 15 fake samples (label 1)
        'label': [0] * 15 + [1] * 15
    })
    minimal_data.to_csv(combined_path, index=False)
    log_step(f"✅ Created minimal dataset with {len(minimal_data)} samples")
    log_step(f" - Real news samples: {(minimal_data['label'] == 0).sum()}")
    log_step(f" - Fake news samples: {(minimal_data['label'] == 1).sum()}")
    return True
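
# To train on a real corpus instead of this fallback, drop a CSV with the same
# schema at /tmp/data/combined_dataset.csv before this script runs. A minimal
# sketch of the expected schema:
#   df = pd.DataFrame({'text': ['...article text...'], 'label': [0]})  # 0 = real, 1 = fake
#   df.to_csv('/tmp/data/combined_dataset.csv', index=False)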


def preprocess_text_function(texts):
    """Clean a list of raw texts for vectorization (same cleaning as train.py).

    Defined at module level rather than nested inside run_initial_training:
    pickle stores functions by qualified name, so a fitted pipeline whose
    FunctionTransformer wraps a locally scoped function could not be saved
    with joblib.dump or reloaded later.
    """
    def clean_single_text(text):
        text = str(text)
        text = re.sub(r'http\S+|www\S+|https\S+', '', text)  # strip URLs
        text = re.sub(r'\S+@\S+', '', text)                  # strip email addresses
        text = re.sub(r'[!]{2,}', '!', text)                 # collapse repeated !
        text = re.sub(r'[?]{2,}', '?', text)                 # collapse repeated ?
        text = re.sub(r'[.]{3,}', '...', text)               # collapse long ellipses
        text = re.sub(r'[^a-zA-Z\s.!?]', '', text)           # drop digits and symbols
        text = re.sub(r'\s+', ' ', text)                     # normalize whitespace
        return text.strip().lower()
    return [clean_single_text(text) for text in texts]


def run_initial_training():
    """Run comprehensive model training for first-time setup"""
    log_step("🚀 Starting comprehensive model training for first-time setup...")
    try:
        # Import training modules (deferred so an import error surfaces as a
        # logged step failure instead of killing the whole script)
        from sklearn.feature_extraction.text import TfidfVectorizer
        from sklearn.linear_model import LogisticRegression
        from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
        from sklearn.pipeline import Pipeline
        from sklearn.feature_selection import SelectKBest, chi2
        from sklearn.preprocessing import FunctionTransformer
        from sklearn.metrics import accuracy_score, f1_score
        import joblib
        # Load dataset
        dataset_path = Path("/tmp/data/combined_dataset.csv")
        if not dataset_path.exists():
            log_step("❌ No dataset available for training")
            return False
        df = pd.read_csv(dataset_path)
        log_step(f"📊 Loaded dataset with {len(df)} samples")
        # Data validation and cleaning
        df = df.dropna(subset=['text', 'label'])
        df = df[df['text'].astype(str).str.len() > 10]
        log_step(f"📊 After cleaning: {len(df)} samples")
        log_step(f"📊 Class distribution: {df['label'].value_counts().to_dict()}")
        # Prepare data
        X = df['text'].values
        y = df['label'].values
        # Train-test split (stratified to preserve class balance)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )
        log_step(f"📊 Data split: {len(X_train)} train, {len(X_test)} test")
        # Create comprehensive pipeline
        text_preprocessor = FunctionTransformer(
            func=preprocess_text_function,
            validate=False
        )
        vectorizer = TfidfVectorizer(
            max_features=5000,
            min_df=1,
            max_df=0.95,
            ngram_range=(1, 2),
            stop_words='english',
            sublinear_tf=True,
            norm='l2'
        )
        # Cap chi2 feature selection on small corpora: SelectKBest raises a
        # ValueError when k exceeds the number of TF-IDF features, which
        # happens with the 30-sample fallback dataset. The 1000-row threshold
        # is a heuristic assumption, not a tuned value.
        k_best = 2000 if len(X_train) >= 1000 else 'all'
        feature_selector = SelectKBest(
            score_func=chi2,
            k=k_best
        )
        # Pipeline: clean text -> TF-IDF -> chi2 selection -> Logistic Regression
        pipeline = Pipeline([
            ('preprocess', text_preprocessor),
            ('vectorize', vectorizer),
            ('feature_select', feature_selector),
            ('model', LogisticRegression(max_iter=500, class_weight='balanced', random_state=42))
        ])
log_step("πŸ”§ Training model with optimized pipeline...")
# Hyperparameter tuning for datasets with sufficient samples
if len(X_train) >= 20:
log_step("βš™οΈ Performing hyperparameter tuning...")
param_grid = {
'model__C': [0.1, 1, 10],
'model__penalty': ['l2']
}
cv_folds = max(2, min(3, len(X_train) // 10))
grid_search = GridSearchCV(
pipeline,
param_grid,
cv=StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42),
scoring='f1_weighted',
n_jobs=1
)
grid_search.fit(X_train, y_train)
best_pipeline = grid_search.best_estimator_
log_step(f"βœ… Best parameters: {grid_search.best_params_}")
log_step(f"βœ… Best CV score: {grid_search.best_score_:.4f}")
else:
log_step("βš™οΈ Using simple training for small dataset...")
pipeline.fit(X_train, y_train)
best_pipeline = pipeline
        # Evaluate model on the held-out test split
        y_pred = best_pipeline.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='weighted')
        log_step("📈 Model Performance:")
        log_step(f" - Accuracy: {accuracy:.4f}")
        log_step(f" - F1 Score: {f1:.4f}")
        # Save model artifacts
        log_step("💾 Saving model artifacts...")
        # Save the complete pipeline
        joblib.dump(best_pipeline, "/tmp/pipeline.pkl")
        log_step("✅ Saved complete pipeline")
        # Save individual components for compatibility
        joblib.dump(best_pipeline.named_steps['model'], "/tmp/model.pkl")
        joblib.dump(best_pipeline.named_steps['vectorize'], "/tmp/vectorizer.pkl")
        log_step("✅ Saved individual model components")
        # Generate comprehensive metadata
        metadata = {
            "model_version": f"v1.0_init_{datetime.now().strftime('%Y%m%d_%H%M%S')}",
            "model_type": "logistic_regression",
            "training_method": "initial_setup",
            "dataset_size": len(df),
            "train_size": len(X_train),
            "test_size": len(X_test),
            "test_accuracy": float(accuracy),
            "test_f1": float(f1),
            "hyperparameter_tuning": cv_folds is not None,
            "cv_folds": cv_folds if cv_folds is not None else "not_used",
            # Cast keys/values to plain ints so json.dump never chokes on numpy scalars
            "class_distribution": {int(k): int(v) for k, v in df['label'].value_counts().to_dict().items()},
            "training_config": {
                "max_features": 5000,
                "ngram_range": [1, 2],
                "feature_selection_k": k_best,
                "test_size": 0.2
            },
            "timestamp": datetime.now().isoformat(),
            "initialization_notes": "Model trained during system initialization",
            "ready_for_production": True
        }
        # Save metadata
        with open("/tmp/metadata.json", 'w') as f:
            json.dump(metadata, f, indent=2)
        log_step("✅ Saved comprehensive metadata")
        log_step("🎉 Initial model training completed successfully!")
        log_step(f"📊 Final Performance - Accuracy: {accuracy:.4f}, F1: {f1:.4f}")
        return True
    except Exception as e:
        log_step(f"❌ Training failed: {str(e)}")
        import traceback
        log_step(f"🔍 Error details: {traceback.format_exc()}")
        return False
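
# Reloading the saved artifacts elsewhere, a minimal sketch. Note that
# /tmp/pipeline.pkl stores only a reference to preprocess_text_function, so
# the loading process must have that symbol importable (e.g. by importing
# this module first) or unpickling will fail:
#   import joblib
#   pipeline = joblib.load("/tmp/pipeline.pkl")
#   print(pipeline.predict(["Some headline to classify"]))  # -> [0] or [1]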


def create_initial_logs():
    """Create initial log files"""
    log_step("Creating initial log files...")
    try:
        # Activity log
        activity_log = [{
            "timestamp": datetime.now().strftime("%Y-%m-%d %I:%M %p"),
            "event": "System initialized successfully with trained model",
            "level": "INFO"
        }]
        with open("/tmp/activity_log.json", 'w') as f:
            json.dump(activity_log, f, indent=2)
        # Create empty monitoring logs
        Path("/tmp/logs").mkdir(parents=True, exist_ok=True)
        with open("/tmp/logs/monitoring_log.json", 'w') as f:
            json.dump([], f)
        with open("/tmp/logs/scheduler_execution.json", 'w') as f:
            json.dump([], f)
        log_step("✅ Initial log files created")
        return True
    except Exception as e:
        log_step(f"❌ Log creation failed: {str(e)}")
        return False
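
# Shape of an activity-log entry written above (illustrative values):
#   {"timestamp": "2024-01-01 09:41 AM",
#    "event": "System initialized successfully with trained model",
#    "level": "INFO"}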


def validate_installation():
    """Validate that the system is properly set up"""
    log_step("🔍 Validating system installation...")
    validation_checks = []
    # Check model files
    model_exists, model_files = check_model_exists()
    validation_checks.append(("Model Files", model_exists, f"Found: {[str(f.name) for f in model_files]}"))
    # Check data files
    data_exists, data_files = check_training_data_exists()
    validation_checks.append(("Training Data", data_exists, f"Found: {len(data_files)} files"))
    # Check directories
    required_dirs = ["/tmp/data", "/tmp/model", "/tmp/logs"]
    dirs_exist = all(Path(d).exists() for d in required_dirs)
    validation_checks.append(("Directories", dirs_exist, f"Required dirs: {required_dirs}"))
    # Check logs
    log_exists = Path("/tmp/activity_log.json").exists()
    validation_checks.append(("Log Files", log_exists, "Activity log created"))
    # Smoke-test model loading and prediction
    try:
        import joblib
        pipeline = joblib.load("/tmp/pipeline.pkl")
        test_prediction = pipeline.predict(["This is a test news article"])
        validation_checks.append(("Model Loading", True, f"Test prediction: {test_prediction[0]}"))
    except Exception as e:
        validation_checks.append(("Model Loading", False, f"Error: {str(e)}"))
    # Print validation results
    log_step("📋 Validation Results:")
    all_passed = True
    for check_name, passed, details in validation_checks:
        status = "✅ PASS" if passed else "❌ FAIL"
        log_step(f" {status} {check_name}: {details}")
        if not passed:
            all_passed = False
    return all_passed, validation_checks
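
# Example validation output (illustrative):
#   [09:41:23]  ✅ PASS Model Files: Found: ['pipeline.pkl', 'metadata.json']
#   [09:41:23]  ❌ FAIL Model Loading: Error: ...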


def main():
    """Main initialization function with smart training logic"""
    log_step("🚀 Starting intelligent system initialization...")
    # Check if model already exists
    model_exists, existing_model_files = check_model_exists()
    if model_exists:
        log_step("🎯 EXISTING INSTALLATION DETECTED")
        log_step("📄 Found existing model files - skipping training")
        # Load existing metadata to show info
        try:
            with open("/tmp/metadata.json", 'r') as f:
                metadata = json.load(f)
            log_step("📊 Existing Model Info:")
            log_step(f" - Version: {metadata.get('model_version', 'Unknown')}")
            log_step(f" - Accuracy: {metadata.get('test_accuracy', 'Unknown')}")
            log_step(f" - F1 Score: {metadata.get('test_f1', 'Unknown')}")
            log_step(f" - Created: {metadata.get('timestamp', 'Unknown')}")
        except Exception as e:
            log_step(f"⚠️ Could not read existing metadata: {e}")
    else:
        log_step("🆕 FIRST-TIME INSTALLATION DETECTED")
        log_step("🔧 No existing model found - will train new model")
    # Run initialization steps
    steps = [
        ("Directory Creation", create_directories),
        ("Dataset Copy", copy_original_datasets),
        ("Dataset Preparation", create_minimal_dataset),
        ("Log Creation", create_initial_logs)
    ]
    # Train only when no model exists; insert before "Log Creation" so the
    # activity log reflects the freshly trained model
    if not model_exists:
        steps.insert(-1, ("🤖 Model Training", run_initial_training))
    failed_steps = []
    for step_name, step_function in steps:
        try:
            log_step(f"▶️ Starting: {step_name}")
            if step_function():
                log_step(f"✅ {step_name} completed")
            else:
                log_step(f"❌ {step_name} failed")
                failed_steps.append(step_name)
        except Exception as e:
            log_step(f"❌ {step_name} failed: {str(e)}")
            failed_steps.append(step_name)
    # Final validation
    log_step("🔍 Running final system validation...")
    validation_passed, validation_results = validate_installation()
    # Summary
    log_step("=" * 60)
    if failed_steps:
        log_step(f"⚠️ Initialization completed with {len(failed_steps)} issues")
        log_step(f"❌ Failed steps: {', '.join(failed_steps)}")
    else:
        log_step("🎉 System initialization completed successfully!")
    if validation_passed:
        log_step("✅ All validation checks passed!")
        log_step("🚀 System is ready for use!")
        if not model_exists:
            log_step("🤖 NEW MODEL TRAINED AND READY")
            log_step("📊 You can now start making predictions!")
        else:
            log_step("🔄 EXISTING MODEL VALIDATED AND READY")
            log_step("📊 System restored from previous installation!")
    else:
        log_step("❌ Some validation checks failed")
        log_step("🔧 Manual intervention may be required")
    log_step("=" * 60)


if __name__ == "__main__":
    main()
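
# Usage sketch: run once at container start, before launching the serving app:
#   python initialize_system.py && python app.py
# (app.py is a hypothetical entrypoint name, not defined in this file.)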