import os
import sys
import shutil
import json
import re  # used by preprocess_text_function below
from pathlib import Path
from datetime import datetime

import pandas as pd


def log_step(message):
    """Log initialization steps with a timestamp."""
    print(f"[{datetime.now().strftime('%H:%M:%S')}] {message}")

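# Layout of the artifacts this script checks and creates (paths as used below):
#   /tmp/pipeline.pkl      - full sklearn pipeline (preprocess + TF-IDF + chi2 + model)
#   /tmp/model.pkl         - classifier component only
#   /tmp/vectorizer.pkl    - fitted TfidfVectorizer only
#   /tmp/metadata.json     - training metadata (version, scores, config)
#   /tmp/data/             - training data copied from /app/data (or a minimal fallback)
#   /tmp/logs/, /tmp/results/, /tmp/backups/ - runtime log and output directories
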
def check_model_exists():
    """Check if trained model already exists"""
    model_files = [
        Path("/tmp/pipeline.pkl"),
        Path("/tmp/model.pkl"),
        Path("/tmp/vectorizer.pkl"),
        Path("/tmp/metadata.json")
    ]

    existing_files = [f for f in model_files if f.exists()]

    if len(existing_files) >= 2:
        log_step(f"✅ Found {len(existing_files)} existing model files")
        return True, existing_files
    else:
        log_step(f"❌ Missing model files - only found {len(existing_files)}")
        return False, existing_files

def check_training_data_exists():
    """Check if training data is available"""
    data_files = [
        Path("/tmp/data/combined_dataset.csv"),
        Path("/app/data/combined_dataset.csv"),
        Path("/tmp/data/kaggle/Fake.csv"),
        Path("/tmp/data/kaggle/True.csv")
    ]

    existing_data = [f for f in data_files if f.exists()]

    if existing_data:
        log_step(f"✅ Found training data: {[str(f) for f in existing_data]}")
        return True, existing_data
    else:
        log_step("❌ No training data found")
        return False, []

def create_directories():
    """Create necessary directories"""
    log_step("Creating directory structure...")

    directories = [
        "/tmp/data",
        "/tmp/data/kaggle",
        "/tmp/model",
        "/tmp/logs",
        "/tmp/results",
        "/tmp/backups"
    ]

    for dir_path in directories:
        Path(dir_path).mkdir(parents=True, exist_ok=True)
        log_step(f"✅ Created {dir_path}")

    # Explicit success value so main() does not treat this step as failed
    return True

def copy_original_datasets():
    """Copy original datasets from /app to /tmp"""
    log_step("Copying original datasets...")

    source_files = [
        ("/app/data/kaggle/Fake.csv", "/tmp/data/kaggle/Fake.csv"),
        ("/app/data/kaggle/True.csv", "/tmp/data/kaggle/True.csv"),
        ("/app/data/combined_dataset.csv", "/tmp/data/combined_dataset.csv"),
        ("/app/data/liar/train.tsv", "/tmp/data/liar/train.tsv"),
        ("/app/data/liar/test.tsv", "/tmp/data/liar/test.tsv"),
        ("/app/data/liar/valid.tsv", "/tmp/data/liar/valid.tsv")
    ]

    copied_count = 0
    for source, dest in source_files:
        if Path(source).exists():
            Path(dest).parent.mkdir(parents=True, exist_ok=True)
            shutil.copy(source, dest)
            log_step(f"✅ Copied {source} to {dest}")
            copied_count += 1
        else:
            log_step(f"⚠️ Source file not found: {source}")

    return copied_count > 0

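# If none of the /app source files are present, copy_original_datasets() returns
# False and the minimal fallback dataset written by create_minimal_dataset() below
# is what run_initial_training() ends up using.
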
def create_minimal_dataset():
    """Create a minimal dataset if the original doesn't exist"""
    log_step("Creating minimal dataset...")

    combined_path = Path("/tmp/data/combined_dataset.csv")

    if combined_path.exists():
        log_step("✅ Combined dataset already exists")
        return True

    minimal_data = pd.DataFrame({
        'text': [
            # Real news samples (label 0)
            'Scientists at Stanford University have developed a new method for detecting cancer cells using artificial intelligence',
            'The Federal Reserve announced today a decision to maintain current interest rates amid economic uncertainty',
            'Local authorities report significant improvements in air quality following new environmental regulations',
            'Research published in Nature journal shows promising results for renewable energy storage technology',
            'The United Nations climate summit concluded with new commitments from world leaders on carbon reduction',
            'Economic indicators suggest steady growth in the manufacturing sector according to latest government data',
            'Healthcare workers receive additional training on new medical procedures approved by regulatory agencies',
            'Transportation department announces infrastructure improvements to major highways across the region',
            'Educational institutions implement new digital learning platforms to enhance student engagement',
            'Agricultural studies reveal improved crop yields through sustainable farming practices',
            'Technology companies invest heavily in cybersecurity measures to protect user data and privacy',
            'Municipal government approves budget for public transportation expansion project in urban areas',
            'Medical researchers make breakthrough in understanding genetic factors contributing to heart disease',
            'International trade agreements show positive impact on local businesses and job creation',
            'Environmental protection agency releases report on water quality improvements in major rivers',
            # Fake news samples (label 1)
            'SHOCKING: Government secretly controls weather using hidden technology, whistleblower reveals truth',
            'EXPOSED: Celebrities caught in massive conspiracy to manipulate public opinion through social media',
            'URGENT: New study proves that drinking water causes immediate memory loss in 99% of population',
            'BREAKING: Scientists discover that smartphones are actually mind control devices from aliens',
            'EXCLUSIVE: Secret documents reveal that all elections have been predetermined by shadow organization',
            'ALERT: Doctors confirm that eating vegetables makes people 500% more likely to develop rare diseases',
            'LEAKED: Underground network of billionaires planning to replace all humans with artificial intelligence',
            'CONSPIRACY: Major corporations hiding cure for aging to maintain population control and profits',
            'REVEALED: Government admits that gravity is fake and Earth is actually moving upward constantly',
            'WARNING: New technology allows complete thought reading through WiFi signals in your home',
            'BOMBSHELL: Ancient aliens return to Earth disguised as tech executives to harvest human energy',
            'UNCOVERED: All news media controlled by single person living in secret underground bunker',
            'PROOF: Time travel already exists but only available to wealthy elite who control world events',
            'SCANDAL: Pharmaceutical companies intentionally create diseases to sell more expensive treatments',
            'EXPOSED: Education system designed to suppress human creativity and independent thinking abilities'
        ],
        'label': [
            # 0 = real news
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            # 1 = fake news
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
        ]
    })

    minimal_data.to_csv(combined_path, index=False)
    log_step(f"✅ Created minimal dataset with {len(minimal_data)} samples")
    log_step(f"   - Real news samples: {sum(minimal_data['label'] == 0)}")
    log_step(f"   - Fake news samples: {sum(minimal_data['label'] == 1)}")
    return True

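# Label convention used above (and assumed wherever predictions are read in this
# file): 0 = real news, 1 = fake news.
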
def preprocess_text_function(texts):
    """Basic text cleaning applied inside the sklearn pipeline.

    Defined at module level (rather than nested inside run_initial_training)
    so the fitted pipeline remains picklable with joblib.
    """
    def clean_single_text(text):
        text = str(text)
        text = re.sub(r'http\S+|www\S+|https\S+', '', text)  # strip URLs
        text = re.sub(r'\S+@\S+', '', text)                  # strip email addresses
        text = re.sub(r'[!]{2,}', '!', text)                  # collapse repeated punctuation
        text = re.sub(r'[?]{2,}', '?', text)
        text = re.sub(r'[.]{3,}', '...', text)
        text = re.sub(r'[^a-zA-Z\s.!?]', '', text)            # keep letters and basic punctuation
        text = re.sub(r'\s+', ' ', text)                      # normalize whitespace
        return text.strip().lower()

    return [clean_single_text(text) for text in texts]


def run_initial_training():
    """Run comprehensive model training for first-time setup"""
    log_step("Starting comprehensive model training for first-time setup...")

    try:
        from sklearn.feature_extraction.text import TfidfVectorizer
        from sklearn.linear_model import LogisticRegression
        from sklearn.ensemble import RandomForestClassifier
        from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
        from sklearn.pipeline import Pipeline
        from sklearn.feature_selection import SelectKBest, chi2
        from sklearn.preprocessing import FunctionTransformer
        from sklearn.metrics import accuracy_score, f1_score, classification_report
        import joblib

        # Load the dataset prepared earlier in the initialization flow
        dataset_path = Path("/tmp/data/combined_dataset.csv")
        if not dataset_path.exists():
            log_step("❌ No dataset available for training")
            return False

        df = pd.read_csv(dataset_path)
        log_step(f"Loaded dataset with {len(df)} samples")

        # Basic cleaning: drop rows with missing text/label and very short texts
        df = df.dropna(subset=['text', 'label'])
        df = df[df['text'].astype(str).str.len() > 10]

        log_step(f"After cleaning: {len(df)} samples")
        log_step(f"Class distribution: {df['label'].value_counts().to_dict()}")

        X = df['text'].values
        y = df['label'].values

        # Stratified hold-out split so both classes appear in train and test sets
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )

        log_step(f"Data split: {len(X_train)} train, {len(X_test)} test")

        # Build the pipeline components: text cleaning, TF-IDF features,
        # chi-squared feature selection, then a linear classifier.
        text_preprocessor = FunctionTransformer(
            func=preprocess_text_function,
            validate=False  # input is a list of strings, not a numeric array
        )

        vectorizer = TfidfVectorizer(
            max_features=5000,
            min_df=1,
            max_df=0.95,
            ngram_range=(1, 2),
            stop_words='english',
            sublinear_tf=True,
            norm='l2'
        )

        # Safeguard for tiny fallback datasets: the minimal dataset may produce
        # fewer than 2000 TF-IDF features, and SelectKBest cannot select more
        # features than exist.
        k_best = 2000 if len(X_train) >= 100 else 'all'
        feature_selector = SelectKBest(
            score_func=chi2,
            k=k_best
        )

        pipeline = Pipeline([
            ('preprocess', text_preprocessor),
            ('vectorize', vectorizer),
            ('feature_select', feature_selector),
            ('model', LogisticRegression(max_iter=500, class_weight='balanced', random_state=42))
        ])

        log_step("Training model with optimized pipeline...")

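        # At predict time the fitted pipeline applies the same stages in order:
        # raw text -> preprocess_text_function -> TF-IDF -> chi2 feature selection
        # -> LogisticRegression, so callers can pass plain strings, e.g.
        # pipeline.predict(["Some article text"]), as validate_installation() does below.
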
        # Hyperparameter tuning only makes sense with enough training samples
        if len(X_train) >= 20:
            log_step("Performing hyperparameter tuning...")
            param_grid = {
                'model__C': [0.1, 1, 10],
                'model__penalty': ['l2']
            }

            cv_folds = max(2, min(3, len(X_train) // 10))
            grid_search = GridSearchCV(
                pipeline,
                param_grid,
                cv=StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42),
                scoring='f1_weighted',
                n_jobs=1
            )

            grid_search.fit(X_train, y_train)
            best_pipeline = grid_search.best_estimator_

            log_step(f"✅ Best parameters: {grid_search.best_params_}")
            log_step(f"✅ Best CV score: {grid_search.best_score_:.4f}")
        else:
            log_step("Using simple training for small dataset...")
            pipeline.fit(X_train, y_train)
            best_pipeline = pipeline

        # Evaluate on the held-out test split
        y_pred = best_pipeline.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='weighted')

        log_step("Model Performance:")
        log_step(f"   - Accuracy: {accuracy:.4f}")
        log_step(f"   - F1 Score: {f1:.4f}")

        log_step("Saving model artifacts...")

        # Save the full pipeline plus the individual components
        joblib.dump(best_pipeline, "/tmp/pipeline.pkl")
        log_step("✅ Saved complete pipeline")

        joblib.dump(best_pipeline.named_steps['model'], "/tmp/model.pkl")
        joblib.dump(best_pipeline.named_steps['vectorize'], "/tmp/vectorizer.pkl")
        log_step("✅ Saved individual model components")

        # Record training metadata alongside the model artifacts
        metadata = {
            "model_version": f"v1.0_init_{datetime.now().strftime('%Y%m%d_%H%M%S')}",
            "model_type": "logistic_regression",
            "training_method": "initial_setup",
            "dataset_size": len(df),
            "train_size": len(X_train),
            "test_size": len(X_test),
            "test_accuracy": float(accuracy),
            "test_f1": float(f1),
            "hyperparameter_tuning": len(X_train) >= 20,
            "cv_folds": cv_folds if len(X_train) >= 20 else "not_used",
            # cast numpy scalars to plain ints for JSON serialization
            "class_distribution": {int(k): int(v) for k, v in df['label'].value_counts().items()},
            "training_config": {
                "max_features": 5000,
                "ngram_range": [1, 2],
                "feature_selection_k": k_best,
                "test_size": 0.2
            },
            "timestamp": datetime.now().isoformat(),
            "initialization_notes": "Model trained during system initialization",
            "ready_for_production": True
        }

        with open("/tmp/metadata.json", 'w') as f:
            json.dump(metadata, f, indent=2)

        log_step("✅ Saved comprehensive metadata")
        log_step("Initial model training completed successfully!")
        log_step(f"Final Performance - Accuracy: {accuracy:.4f}, F1: {f1:.4f}")

        return True

    except Exception as e:
        log_step(f"❌ Training failed: {str(e)}")
        import traceback
        log_step(f"Error details: {traceback.format_exc()}")
        return False

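# A minimal sketch of consuming the saved artifacts from another script
# (illustrative only; it assumes /tmp/pipeline.pkl exists and that
# preprocess_text_function is importable when unpickling):
#
#   import joblib
#   pipeline = joblib.load("/tmp/pipeline.pkl")
#   print(pipeline.predict(["Federal Reserve holds interest rates steady"])[0])  # 0 = real, 1 = fake
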
def create_initial_logs():
    """Create initial log files"""
    log_step("Creating initial log files...")

    try:
        # Seed the activity log with a single initialization entry
        activity_log = [{
            "timestamp": datetime.now().strftime("%Y-%m-%d %I:%M %p"),
            "event": "System initialized successfully with trained model",
            "level": "INFO"
        }]

        with open("/tmp/activity_log.json", 'w') as f:
            json.dump(activity_log, f, indent=2)

        # Empty JSON lists for the monitoring and scheduler logs
        log_dirs = ["/tmp/logs"]
        for log_dir in log_dirs:
            Path(log_dir).mkdir(parents=True, exist_ok=True)

        with open("/tmp/logs/monitoring_log.json", 'w') as f:
            json.dump([], f)

        with open("/tmp/logs/scheduler_execution.json", 'w') as f:
            json.dump([], f)

        log_step("✅ Initial log files created")
        return True

    except Exception as e:
        log_step(f"❌ Log creation failed: {str(e)}")
        return False

def validate_installation():
    """Validate that the system is properly set up"""
    log_step("Validating system installation...")

    validation_checks = []

    # Model artifacts on disk
    model_exists, model_files = check_model_exists()
    validation_checks.append(("Model Files", model_exists, f"Found: {[str(f.name) for f in model_files]}"))

    # Training data availability
    data_exists, data_files = check_training_data_exists()
    validation_checks.append(("Training Data", data_exists, f"Found: {len(data_files)} files"))

    # Required directories
    required_dirs = ["/tmp/data", "/tmp/model", "/tmp/logs"]
    dirs_exist = all(Path(d).exists() for d in required_dirs)
    validation_checks.append(("Directories", dirs_exist, f"Required dirs: {required_dirs}"))

    # Log files
    log_exists = Path("/tmp/activity_log.json").exists()
    validation_checks.append(("Log Files", log_exists, "Activity log created"))

    # The saved pipeline can be loaded and used for a prediction
    model_loadable = False
    try:
        import joblib
        pipeline = joblib.load("/tmp/pipeline.pkl")
        test_prediction = pipeline.predict(["This is a test news article"])
        model_loadable = True
        validation_checks.append(("Model Loading", True, f"Test prediction: {test_prediction[0]}"))
    except Exception as e:
        validation_checks.append(("Model Loading", False, f"Error: {str(e)}"))

    log_step("Validation Results:")
    all_passed = True
    for check_name, passed, details in validation_checks:
        status = "✅ PASS" if passed else "❌ FAIL"
        log_step(f"   {status} {check_name}: {details}")
        if not passed:
            all_passed = False

    return all_passed, validation_checks

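# main() distinguishes two paths:
#   - Existing installation: at least two model artifacts found -> report the
#     existing model's metadata and skip training.
#   - First-time installation: no usable model -> run the full setup, including
#     run_initial_training().
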
def main():
    """Main initialization function with smart training logic"""
    log_step("Starting intelligent system initialization...")

    # Decide between first-time setup and an existing installation
    model_exists, existing_model_files = check_model_exists()

    if model_exists:
        log_step("EXISTING INSTALLATION DETECTED")
        log_step("Found existing model files - skipping training")

        # Report details of the existing model if its metadata is readable
        try:
            with open("/tmp/metadata.json", 'r') as f:
                metadata = json.load(f)

            log_step("Existing Model Info:")
            log_step(f"   - Version: {metadata.get('model_version', 'Unknown')}")
            log_step(f"   - Accuracy: {metadata.get('test_accuracy', 'Unknown')}")
            log_step(f"   - F1 Score: {metadata.get('test_f1', 'Unknown')}")
            log_step(f"   - Created: {metadata.get('timestamp', 'Unknown')}")

        except Exception as e:
            log_step(f"⚠️ Could not read existing metadata: {e}")

    else:
        log_step("FIRST-TIME INSTALLATION DETECTED")
        log_step("No existing model found - will train new model")

    # Setup steps that always run
    steps = [
        ("Directory Creation", create_directories),
        ("Dataset Copy", copy_original_datasets),
        ("Dataset Preparation", create_minimal_dataset),
        ("Log Creation", create_initial_logs)
    ]

    # Only train when no usable model was found; training runs before log creation
    if not model_exists:
        steps.insert(-1, ("Model Training", run_initial_training))

    failed_steps = []

    for step_name, step_function in steps:
        try:
            log_step(f"Starting: {step_name}")
            if step_function():
                log_step(f"✅ {step_name} completed")
            else:
                log_step(f"❌ {step_name} failed")
                failed_steps.append(step_name)
        except Exception as e:
            log_step(f"❌ {step_name} failed: {str(e)}")
            failed_steps.append(step_name)

    # Final validation and summary
    log_step("Running final system validation...")
    validation_passed, validation_results = validate_installation()

    log_step("=" * 60)
    if failed_steps:
        log_step(f"⚠️ Initialization completed with {len(failed_steps)} issues")
        log_step(f"❌ Failed steps: {', '.join(failed_steps)}")
    else:
        log_step("System initialization completed successfully!")

    if validation_passed:
        log_step("✅ All validation checks passed!")
        log_step("System is ready for use!")

        if not model_exists:
            log_step("NEW MODEL TRAINED AND READY")
            log_step("You can now start making predictions!")
        else:
            log_step("EXISTING MODEL VALIDATED AND READY")
            log_step("System restored from previous installation!")

    else:
        log_step("❌ Some validation checks failed")
        log_step("Manual intervention may be required")

    log_step("=" * 60)

if __name__ == "__main__":
    main()
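
# Typical invocation (the filename init_system.py is an assumption; use whatever
# name this script has in the project):
#
#   python init_system.py
#
# Re-running the script is safe: when at least two of the model artifacts
# (/tmp/pipeline.pkl, /tmp/model.pkl, /tmp/vectorizer.pkl, /tmp/metadata.json)
# already exist, training is skipped and the existing installation is only
# re-validated.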