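"""Bootstrap script for the Fake-News-Detection-with-MLOps project.

Creates the /tmp directory layout, stages the Kaggle datasets (falling back
to a small built-in sample), trains a baseline TF-IDF plus logistic
regression model, and writes the initial log files.

Usage: python initialize_system.py
"""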
import shutil
import pandas as pd
import json
from pathlib import Path
from datetime import datetime


def log_step(message):
"""Log initialization steps"""
print(f"[{datetime.now().strftime('%H:%M:%S')}] {message}")


def create_directories():
"""Create necessary directories"""
log_step("Creating directory structure...")
directories = [
"/tmp/data",
"/tmp/model",
"/tmp/logs"
]
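    # parents=True/exist_ok=True make this idempotent across re-runs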
for dir_path in directories:
Path(dir_path).mkdir(parents=True, exist_ok=True)
log_step(f"βœ… Created {dir_path}")


def copy_original_datasets():
"""Copy original datasets from /app to /tmp"""
log_step("Copying original datasets...")
source_files = [
("/app/data/kaggle/Fake.csv", "/tmp/data/kaggle/Fake.csv"),
("/app/data/kaggle/True.csv", "/tmp/data/kaggle/True.csv"),
("/app/data/combined_dataset.csv", "/tmp/data/combined_dataset.csv")
]
copied_count = 0
for source, dest in source_files:
if Path(source).exists():
Path(dest).parent.mkdir(parents=True, exist_ok=True)
shutil.copy(source, dest)
log_step(f"βœ… Copied {source} to {dest}")
copied_count += 1
else:
log_step(f"⚠️ Source file not found: {source}")
return copied_count > 0


def create_minimal_dataset():
"""Create a minimal dataset if original doesn't exist"""
log_step("Creating minimal dataset...")
combined_path = Path("/tmp/data/combined_dataset.csv")
if combined_path.exists():
log_step("βœ… Combined dataset already exists")
return True
# Create minimal training data
minimal_data = pd.DataFrame({
'text': [
'Scientists discover new species in Amazon rainforest',
'SHOCKING: Aliens spotted in Area 51, government confirms existence',
'Local authorities report increase in renewable energy adoption',
'You won\'t believe what happens when you eat this miracle fruit',
'Economic indicators show steady growth in manufacturing sector',
'EXCLUSIVE: Celebrity caught in secret alien communication scandal',
'Research shows positive effects of meditation on mental health',
'Government hiding truth about flat earth, conspiracy theorists claim',
'New study reveals benefits of regular exercise for elderly',
'BREAKING: Time travel confirmed by underground scientists'
],
'label': [0, 1, 0, 1, 0, 1, 0, 1, 0, 1] # 0=Real, 1=Fake
})
    # Ensure the parent directory exists even if create_directories() was skipped
    combined_path.parent.mkdir(parents=True, exist_ok=True)
    minimal_data.to_csv(combined_path, index=False)
log_step(f"βœ… Created minimal dataset with {len(minimal_data)} samples")
return True


def run_initial_training():
"""Run basic model training"""
log_step("Starting initial model training...")
try:
# Check if model already exists
model_path = Path("/tmp/model.pkl")
vectorizer_path = Path("/tmp/vectorizer.pkl")
if model_path.exists() and vectorizer_path.exists():
log_step("βœ… Model files already exist")
return True
# Import required libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import joblib
# Load dataset
dataset_path = Path("/tmp/data/combined_dataset.csv")
if not dataset_path.exists():
log_step("❌ No dataset available for training")
return False
        df = pd.read_csv(dataset_path)
        # Drop rows with missing text or label so vectorization doesn't trip on NaN
        df = df.dropna(subset=['text', 'label'])
        log_step(f"Loaded dataset with {len(df)} samples")
# Prepare data
X = df['text'].values
y = df['label'].values
# Train-test split
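        # stratify=y preserves the real/fake class ratio in both splits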
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42, stratify=y
)
# Vectorization
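        # max_features caps the vocabulary at 5,000 terms; ngram_range=(1, 2)
        # adds bigrams to the unigrams, and common English stop words are dropped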
vectorizer = TfidfVectorizer(
max_features=5000,
stop_words='english',
ngram_range=(1, 2)
)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)
# Train model
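        # Logistic regression is a fast, solid baseline for sparse TF-IDF
        # features; max_iter=1000 gives the solver room to converge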
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train_vec, y_train)
# Evaluate
y_pred = model.predict(X_test_vec)
accuracy = accuracy_score(y_test, y_pred)
# Save model
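        # Model and vectorizer are persisted with joblib under /tmp, the same
        # writable location used for data and logs above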
joblib.dump(model, "/tmp/model.pkl")
joblib.dump(vectorizer, "/tmp/vectorizer.pkl")
# Save metadata
metadata = {
"model_version": "v1.0_init",
"test_accuracy": float(accuracy),
"train_size": len(X_train),
"test_size": len(X_test),
"timestamp": datetime.now().isoformat(),
"training_method": "initialization"
}
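        # model_version and training_method mark this as the bootstrap model,
        # so later retraining runs can be told apart from it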
with open("/tmp/metadata.json", 'w') as f:
json.dump(metadata, f, indent=2)
        log_step(f"✅ Training completed successfully, accuracy: {accuracy:.4f}")
return True
except Exception as e:
log_step(f"❌ Training failed: {str(e)}")
return False


def create_initial_logs():
"""Create initial log files"""
log_step("Creating initial log files...")
try:
# Activity log
activity_log = [{
"timestamp": datetime.now().strftime("%Y-%m-%d %I:%M %p"),
"event": "System initialized successfully"
}]
with open("/tmp/activity_log.json", 'w') as f:
json.dump(activity_log, f, indent=2)
# Create empty monitoring logs
with open("/tmp/logs/monitoring_log.json", 'w') as f:
json.dump([], f)
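        # The monitoring log starts empty; later runs of the system can append
        # entries to it (an assumption based on the file name)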
log_step("βœ… Initial log files created")
return True
except Exception as e:
log_step(f"❌ Log creation failed: {str(e)}")
return False


def main():
"""Main initialization function"""
log_step("πŸš€ Starting system initialization...")
steps = [
("Directory Creation", create_directories),
("Dataset Copy", copy_original_datasets),
("Minimal Dataset", create_minimal_dataset),
("Model Training", run_initial_training),
("Log Creation", create_initial_logs)
]
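    # Step order matters: model training relies on a dataset staged by the
    # copy/minimal-dataset steps above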
failed_steps = []
for step_name, step_function in steps:
try:
if step_function():
log_step(f"βœ… {step_name} completed")
else:
log_step(f"❌ {step_name} failed")
failed_steps.append(step_name)
except Exception as e:
log_step(f"❌ {step_name} failed: {str(e)}")
failed_steps.append(step_name)
if failed_steps:
        log_step(f"⚠️ Initialization completed with {len(failed_steps)} failed steps")
log_step(f"Failed: {', '.join(failed_steps)}")
else:
log_step("πŸŽ‰ System initialization completed successfully!")
log_step("System ready for use!")


if __name__ == "__main__":
main()