|
|
import os |
|
|
import sys |
|
|
import shutil |
|
|
import pandas as pd |
|
|
import json |
|
|
from pathlib import Path |
|
|
from datetime import datetime |
|
|
|
|
|
|
|
|
def log_step(message): |
|
|
"""Log initialization steps""" |
|
|
print(f"[{datetime.now().strftime('%H:%M:%S')}] {message}") |
|
|
|
|
|
|
|
|
def create_directories(): |
|
|
"""Create necessary directories""" |
|
|
log_step("Creating directory structure...") |
|
|
|
|
|
directories = [ |
|
|
"/tmp/data", |
|
|
"/tmp/model", |
|
|
"/tmp/logs" |
|
|
] |
|
|
|
|
|
for dir_path in directories: |
|
|
Path(dir_path).mkdir(parents=True, exist_ok=True) |
|
|
log_step(f"β
Created {dir_path}") |
|
|
|
|
|
|
|
|
def copy_original_datasets(): |
|
|
"""Copy original datasets from /app to /tmp""" |
|
|
log_step("Copying original datasets...") |
|
|
|
|
|
source_files = [ |
|
|
("/app/data/kaggle/Fake.csv", "/tmp/data/kaggle/Fake.csv"), |
|
|
("/app/data/kaggle/True.csv", "/tmp/data/kaggle/True.csv"), |
|
|
("/app/data/combined_dataset.csv", "/tmp/data/combined_dataset.csv") |
|
|
] |
|
|
|
|
|
copied_count = 0 |
|
|
for source, dest in source_files: |
|
|
if Path(source).exists(): |
|
|
Path(dest).parent.mkdir(parents=True, exist_ok=True) |
|
|
shutil.copy(source, dest) |
|
|
log_step(f"β
Copied {source} to {dest}") |
|
|
copied_count += 1 |
|
|
else: |
|
|
log_step(f"β οΈ Source file not found: {source}") |
|
|
|
|
|
return copied_count > 0 |
|
|
|
|
|
|
|
|
def create_minimal_dataset(): |
|
|
"""Create a minimal dataset if original doesn't exist""" |
|
|
log_step("Creating minimal dataset...") |
|
|
|
|
|
combined_path = Path("/tmp/data/combined_dataset.csv") |
|
|
|
|
|
if combined_path.exists(): |
|
|
log_step("β
Combined dataset already exists") |
|
|
return True |
|
|
|
|
|
|
|
|
minimal_data = pd.DataFrame({ |
|
|
'text': [ |
|
|
'Scientists discover new species in Amazon rainforest', |
|
|
'SHOCKING: Aliens spotted in Area 51, government confirms existence', |
|
|
'Local authorities report increase in renewable energy adoption', |
|
|
'You won\'t believe what happens when you eat this miracle fruit', |
|
|
'Economic indicators show steady growth in manufacturing sector', |
|
|
'EXCLUSIVE: Celebrity caught in secret alien communication scandal', |
|
|
'Research shows positive effects of meditation on mental health', |
|
|
'Government hiding truth about flat earth, conspiracy theorists claim', |
|
|
'New study reveals benefits of regular exercise for elderly', |
|
|
'BREAKING: Time travel confirmed by underground scientists' |
|
|
], |
|
|
'label': [0, 1, 0, 1, 0, 1, 0, 1, 0, 1] |
|
|
}) |
|
|
|
|
|
minimal_data.to_csv(combined_path, index=False) |
|
|
log_step(f"β
Created minimal dataset with {len(minimal_data)} samples") |
|
|
return True |
|
|
|
|
|
|
|
|
def run_initial_training(): |
|
|
"""Run basic model training""" |
|
|
log_step("Starting initial model training...") |
|
|
|
|
|
try: |
|
|
|
|
|
model_path = Path("/tmp/model.pkl") |
|
|
vectorizer_path = Path("/tmp/vectorizer.pkl") |
|
|
|
|
|
if model_path.exists() and vectorizer_path.exists(): |
|
|
log_step("β
Model files already exist") |
|
|
return True |
|
|
|
|
|
|
|
|
from sklearn.feature_extraction.text import TfidfVectorizer |
|
|
from sklearn.linear_model import LogisticRegression |
|
|
from sklearn.model_selection import train_test_split |
|
|
from sklearn.metrics import accuracy_score |
|
|
import joblib |
|
|
|
|
|
|
|
|
dataset_path = Path("/tmp/data/combined_dataset.csv") |
|
|
if not dataset_path.exists(): |
|
|
log_step("β No dataset available for training") |
|
|
return False |
|
|
|
|
|
df = pd.read_csv(dataset_path) |
|
|
log_step(f"Loaded dataset with {len(df)} samples") |
|
|
|
|
|
|
|
|
X = df['text'].values |
|
|
y = df['label'].values |
|
|
|
|
|
|
|
|
X_train, X_test, y_train, y_test = train_test_split( |
|
|
X, y, test_size=0.2, random_state=42, stratify=y |
|
|
) |
|
|
|
|
|
|
|
|
vectorizer = TfidfVectorizer( |
|
|
max_features=5000, |
|
|
stop_words='english', |
|
|
ngram_range=(1, 2) |
|
|
) |
|
|
X_train_vec = vectorizer.fit_transform(X_train) |
|
|
X_test_vec = vectorizer.transform(X_test) |
|
|
|
|
|
|
|
|
model = LogisticRegression(max_iter=1000, random_state=42) |
|
|
model.fit(X_train_vec, y_train) |
|
|
|
|
|
|
|
|
y_pred = model.predict(X_test_vec) |
|
|
accuracy = accuracy_score(y_test, y_pred) |
|
|
|
|
|
|
|
|
joblib.dump(model, "/tmp/model.pkl") |
|
|
joblib.dump(vectorizer, "/tmp/vectorizer.pkl") |
|
|
|
|
|
|
|
|
metadata = { |
|
|
"model_version": "v1.0_init", |
|
|
"test_accuracy": float(accuracy), |
|
|
"train_size": len(X_train), |
|
|
"test_size": len(X_test), |
|
|
"timestamp": datetime.now().isoformat(), |
|
|
"training_method": "initialization" |
|
|
} |
|
|
|
|
|
with open("/tmp/metadata.json", 'w') as f: |
|
|
json.dump(metadata, f, indent=2) |
|
|
|
|
|
log_step( |
|
|
f"β
Training completed successfully, accuracy: {accuracy:.4f}") |
|
|
return True |
|
|
|
|
|
except Exception as e: |
|
|
log_step(f"β Training failed: {str(e)}") |
|
|
return False |
|
|
|
|
|
|
|
|
def create_initial_logs(): |
|
|
"""Create initial log files""" |
|
|
log_step("Creating initial log files...") |
|
|
|
|
|
try: |
|
|
|
|
|
activity_log = [{ |
|
|
"timestamp": datetime.now().strftime("%Y-%m-%d %I:%M %p"), |
|
|
"event": "System initialized successfully" |
|
|
}] |
|
|
|
|
|
with open("/tmp/activity_log.json", 'w') as f: |
|
|
json.dump(activity_log, f, indent=2) |
|
|
|
|
|
|
|
|
with open("/tmp/logs/monitoring_log.json", 'w') as f: |
|
|
json.dump([], f) |
|
|
|
|
|
log_step("β
Initial log files created") |
|
|
return True |
|
|
|
|
|
except Exception as e: |
|
|
log_step(f"β Log creation failed: {str(e)}") |
|
|
return False |
|
|
|
|
|
|
|
|
def main(): |
|
|
"""Main initialization function""" |
|
|
log_step("π Starting system initialization...") |
|
|
|
|
|
steps = [ |
|
|
("Directory Creation", create_directories), |
|
|
("Dataset Copy", copy_original_datasets), |
|
|
("Minimal Dataset", create_minimal_dataset), |
|
|
("Model Training", run_initial_training), |
|
|
("Log Creation", create_initial_logs) |
|
|
] |
|
|
|
|
|
failed_steps = [] |
|
|
|
|
|
for step_name, step_function in steps: |
|
|
try: |
|
|
if step_function(): |
|
|
log_step(f"β
{step_name} completed") |
|
|
else: |
|
|
log_step(f"β {step_name} failed") |
|
|
failed_steps.append(step_name) |
|
|
except Exception as e: |
|
|
log_step(f"β {step_name} failed: {str(e)}") |
|
|
failed_steps.append(step_name) |
|
|
|
|
|
if failed_steps: |
|
|
log_step( |
|
|
f"β οΈ Initialization completed with {len(failed_steps)} failed steps") |
|
|
log_step(f"Failed: {', '.join(failed_steps)}") |
|
|
else: |
|
|
log_step("π System initialization completed successfully!") |
|
|
|
|
|
log_step("System ready for use!") |
|
|
|
|
|
|
|
|
if __name__ == "__main__": |
|
|
main() |
|
|
|