Commit c678ee1
Parent(s): 3a989cc

Update initialize_system.py

Update to include the model training at the start so the pipeline is available

initialize_system.py CHANGED (+336 -57)
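Background for the diff below: the commit trains the model during initialization and saves the fitted pipeline to /tmp/pipeline.pkl (plus model.pkl, vectorizer.pkl and metadata.json). The following is a minimal sketch of how a downstream process might inspect those artifacts, assuming only the paths and metadata keys written by this script; the report_initialization helper is illustrative and not part of the repository. The pipeline load is guarded the same way the script's own validate_installation() guards it.

# Illustrative sketch (not part of the commit): inspect the artifacts that
# initialize_system.py writes, mirroring its validate_installation() check.
# Assumes the script has already run and written /tmp/metadata.json and /tmp/pipeline.pkl.
import json
from pathlib import Path

import joblib

def report_initialization() -> None:
    # Metadata is plain JSON, so this part is straightforward to read.
    meta = json.loads(Path("/tmp/metadata.json").read_text())
    print("model_version:", meta.get("model_version"))
    print("test_accuracy:", meta.get("test_accuracy"))
    print("test_f1:", meta.get("test_f1"))

    # Loading the pickled pipeline may fail outside the training process,
    # so guard it like the script does in its own validation step.
    try:
        pipeline = joblib.load("/tmp/pipeline.pkl")
        label = pipeline.predict(["This is a test news article"])[0]
        print("test prediction (0 = real, 1 = fake):", int(label))
    except Exception as exc:
        print("pipeline not loadable:", exc)

if __name__ == "__main__":
    report_initialization()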
Removed in this commit (old version; several removed lines are truncated in this view):

-        "/tmp/logs"
-        ("/app/data/combined_dataset.csv", "/tmp/data/combined_dataset.csv")
-    # Create minimal training data
-        'label': [
-    log_step(f"✅ Created minimal dataset with {len(minimal_data)} samples")
-    """Run
-    log_step("Starting
-        model_path = Path("/tmp/model.pkl")
-        vectorizer_path = Path("/tmp/vectorizer.pkl")
-        if model_path.exists() and vectorizer_path.exists():
-            log_step("✅ Model files already exist")
-            return True
-        # Import required libraries
-        log_step(f"Loaded dataset with {len(df)} samples")
-        X_train_vec = vectorizer.fit_transform(X_train)
-        X_test_vec = vectorizer.transform(X_test)
-        # Evaluate
-            "model_version": "v1.
-        "event": "System initialized successfully"
-    """Main initialization function"""
-    log_step("Starting system initialization...")
-        ("Model Training", run_initial_training),
-        log_step(f"Failed: {', '.join(failed_steps)}")
-    main()
Updated file (new version; added lines are marked with +):

@@ -12,14 +12,55 @@ def log_step(message):
     print(f"[{datetime.now().strftime('%H:%M:%S')}] {message}")
 
 
+def check_model_exists():
+    """Check if trained model already exists"""
+    model_files = [
+        Path("/tmp/pipeline.pkl"),
+        Path("/tmp/model.pkl"),
+        Path("/tmp/vectorizer.pkl"),
+        Path("/tmp/metadata.json")
+    ]
+
+    existing_files = [f for f in model_files if f.exists()]
+
+    if len(existing_files) >= 2:  # At least pipeline + metadata OR model + vectorizer
+        log_step(f"✅ Found {len(existing_files)} existing model files")
+        return True, existing_files
+    else:
+        log_step(f"❌ Missing model files - only found {len(existing_files)}")
+        return False, existing_files
+
+
+def check_training_data_exists():
+    """Check if training data is available"""
+    data_files = [
+        Path("/tmp/data/combined_dataset.csv"),
+        Path("/app/data/combined_dataset.csv"),
+        Path("/tmp/data/kaggle/Fake.csv"),
+        Path("/tmp/data/kaggle/True.csv")
+    ]
+
+    existing_data = [f for f in data_files if f.exists()]
+
+    if existing_data:
+        log_step(f"✅ Found training data: {[str(f) for f in existing_data]}")
+        return True, existing_data
+    else:
+        log_step("❌ No training data found")
+        return False, []
+
+
 def create_directories():
     """Create necessary directories"""
     log_step("Creating directory structure...")
 
     directories = [
         "/tmp/data",
+        "/tmp/data/kaggle",
         "/tmp/model",
+        "/tmp/logs",
+        "/tmp/results",
+        "/tmp/backups"
     ]
 
     for dir_path in directories:

@@ -34,7 +75,10 @@ def copy_original_datasets():
     source_files = [
         ("/app/data/kaggle/Fake.csv", "/tmp/data/kaggle/Fake.csv"),
         ("/app/data/kaggle/True.csv", "/tmp/data/kaggle/True.csv"),
+        ("/app/data/combined_dataset.csv", "/tmp/data/combined_dataset.csv"),
+        ("/app/data/liar/train.tsv", "/tmp/data/liar/train.tsv"),
+        ("/app/data/liar/test.tsv", "/tmp/data/liar/test.tsv"),
+        ("/app/data/liar/valid.tsv", "/tmp/data/liar/valid.tsv")
     ]
 
     copied_count = 0

@@ -60,47 +104,92 @@ def create_minimal_dataset():
         log_step("✅ Combined dataset already exists")
         return True
 
+    # Create minimal training data with more samples for better training
     minimal_data = pd.DataFrame({
         'text': [
+            # Real news samples
+            'Scientists at Stanford University have developed a new method for detecting cancer cells using artificial intelligence',
+            'The Federal Reserve announced today a decision to maintain current interest rates amid economic uncertainty',
+            'Local authorities report significant improvements in air quality following new environmental regulations',
+            'Research published in Nature journal shows promising results for renewable energy storage technology',
+            'The United Nations climate summit concluded with new commitments from world leaders on carbon reduction',
+            'Economic indicators suggest steady growth in the manufacturing sector according to latest government data',
+            'Healthcare workers receive additional training on new medical procedures approved by regulatory agencies',
+            'Transportation department announces infrastructure improvements to major highways across the region',
+            'Educational institutions implement new digital learning platforms to enhance student engagement',
+            'Agricultural studies reveal improved crop yields through sustainable farming practices',
+            'Technology companies invest heavily in cybersecurity measures to protect user data and privacy',
+            'Municipal government approves budget for public transportation expansion project in urban areas',
+            'Medical researchers make breakthrough in understanding genetic factors contributing to heart disease',
+            'International trade agreements show positive impact on local businesses and job creation',
+            'Environmental protection agency releases report on water quality improvements in major rivers',
+
+            # Fake news samples
+            'SHOCKING: Government secretly controls weather using hidden technology, whistleblower reveals truth',
+            'EXPOSED: Celebrities caught in massive conspiracy to manipulate public opinion through social media',
+            'URGENT: New study proves that drinking water causes immediate memory loss in 99% of population',
+            'BREAKING: Scientists discover that smartphones are actually mind control devices from aliens',
+            'EXCLUSIVE: Secret documents reveal that all elections have been predetermined by shadow organization',
+            'ALERT: Doctors confirm that eating vegetables makes people 500% more likely to develop rare diseases',
+            'LEAKED: Underground network of billionaires planning to replace all humans with artificial intelligence',
+            'CONSPIRACY: Major corporations hiding cure for aging to maintain population control and profits',
+            'REVEALED: Government admits that gravity is fake and Earth is actually moving upward constantly',
+            'WARNING: New technology allows complete thought reading through WiFi signals in your home',
+            'BOMBSHELL: Ancient aliens return to Earth disguised as tech executives to harvest human energy',
+            'UNCOVERED: All news media controlled by single person living in secret underground bunker',
+            'PROOF: Time travel already exists but only available to wealthy elite who control world events',
+            'SCANDAL: Pharmaceutical companies intentionally create diseases to sell more expensive treatments',
+            'EXPOSED: Education system designed to suppress human creativity and independent thinking abilities'
         ],
+        'label': [
+            # Real news labels (0)
+            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+            # Fake news labels (1)
+            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
+        ]
     })
 
     minimal_data.to_csv(combined_path, index=False)
+    log_step(f"✅ Created enhanced minimal dataset with {len(minimal_data)} samples")
+    log_step(f"   - Real news samples: {sum(minimal_data['label'] == 0)}")
+    log_step(f"   - Fake news samples: {sum(minimal_data['label'] == 1)}")
    return True
 
 
 def run_initial_training():
+    """Run comprehensive model training for first-time setup"""
+    log_step("Starting comprehensive model training for first-time setup...")
 
     try:
+        # Import training modules
         from sklearn.feature_extraction.text import TfidfVectorizer
         from sklearn.linear_model import LogisticRegression
+        from sklearn.ensemble import RandomForestClassifier
+        from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
+        from sklearn.pipeline import Pipeline
+        from sklearn.feature_selection import SelectKBest, chi2
+        from sklearn.preprocessing import FunctionTransformer
+        from sklearn.metrics import accuracy_score, f1_score, classification_report
         import joblib
+        import re
+
+        # Text preprocessing function (same as in train.py)
+        def preprocess_text_function(texts):
+            def clean_single_text(text):
+                text = str(text)
+                text = re.sub(r'http\S+|www\S+|https\S+', '', text)
+                text = re.sub(r'\S+@\S+', '', text)
+                text = re.sub(r'[!]{2,}', '!', text)
+                text = re.sub(r'[?]{2,}', '?', text)
+                text = re.sub(r'[.]{3,}', '...', text)
+                text = re.sub(r'[^a-zA-Z\s.!?]', '', text)
+                text = re.sub(r'\s+', ' ', text)
+                return text.strip().lower()
+
+            processed = []
+            for text in texts:
+                processed.append(clean_single_text(text))
+            return processed
 
         # Load dataset
         dataset_path = Path("/tmp/data/combined_dataset.csv")

@@ -109,7 +198,14 @@ def run_initial_training():
             return False
 
         df = pd.read_csv(dataset_path)
+        log_step(f"Loaded dataset with {len(df)} samples")
+
+        # Data validation and cleaning
+        df = df.dropna(subset=['text', 'label'])
+        df = df[df['text'].astype(str).str.len() > 10]
+
+        log_step(f"After cleaning: {len(df)} samples")
+        log_step(f"Class distribution: {df['label'].value_counts().to_dict()}")
 
         # Prepare data
         X = df['text'].values

@@ -120,46 +216,125 @@ def run_initial_training():
             X, y, test_size=0.2, random_state=42, stratify=y
         )
 
+        log_step(f"Data split: {len(X_train)} train, {len(X_test)} test")
+
+        # Create comprehensive pipeline
+        text_preprocessor = FunctionTransformer(
+            func=preprocess_text_function,
+            validate=False
+        )
+
         vectorizer = TfidfVectorizer(
             max_features=5000,
+            min_df=1,
+            max_df=0.95,
+            ngram_range=(1, 2),
             stop_words='english',
+            sublinear_tf=True,
+            norm='l2'
+        )
+
+        feature_selector = SelectKBest(
+            score_func=chi2,
+            k=2000
         )
 
+        # Create pipeline with Logistic Regression
+        pipeline = Pipeline([
+            ('preprocess', text_preprocessor),
+            ('vectorize', vectorizer),
+            ('feature_select', feature_selector),
+            ('model', LogisticRegression(max_iter=500, class_weight='balanced', random_state=42))
+        ])
+
+        log_step("Training model with optimized pipeline...")
+
+        # Hyperparameter tuning for datasets with sufficient samples
+        if len(X_train) >= 20:
+            log_step("Performing hyperparameter tuning...")
+            param_grid = {
+                'model__C': [0.1, 1, 10],
+                'model__penalty': ['l2']
+            }
+
+            cv_folds = max(2, min(3, len(X_train) // 10))
+            grid_search = GridSearchCV(
+                pipeline,
+                param_grid,
+                cv=StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42),
+                scoring='f1_weighted',
+                n_jobs=1
+            )
+
+            grid_search.fit(X_train, y_train)
+            best_pipeline = grid_search.best_estimator_
+
+            log_step(f"✅ Best parameters: {grid_search.best_params_}")
+            log_step(f"✅ Best CV score: {grid_search.best_score_:.4f}")
+        else:
+            log_step("Using simple training for small dataset...")
+            pipeline.fit(X_train, y_train)
+            best_pipeline = pipeline
 
+        # Evaluate model
+        y_pred = best_pipeline.predict(X_test)
         accuracy = accuracy_score(y_test, y_pred)
+        f1 = f1_score(y_test, y_pred, average='weighted')
 
+        log_step(f"Model Performance:")
+        log_step(f"   - Accuracy: {accuracy:.4f}")
+        log_step(f"   - F1 Score: {f1:.4f}")
 
+        # Save model artifacts
+        log_step("Saving model artifacts...")
+
+        # Save the complete pipeline
+        joblib.dump(best_pipeline, "/tmp/pipeline.pkl")
+        log_step("✅ Saved complete pipeline")
+
+        # Save individual components for compatibility
+        joblib.dump(best_pipeline.named_steps['model'], "/tmp/model.pkl")
+        joblib.dump(best_pipeline.named_steps['vectorize'], "/tmp/vectorizer.pkl")
+        log_step("✅ Saved individual model components")
+
+        # Generate comprehensive metadata
         metadata = {
+            "model_version": f"v1.0_init_{datetime.now().strftime('%Y%m%d_%H%M%S')}",
+            "model_type": "logistic_regression",
+            "training_method": "initial_setup",
+            "dataset_size": len(df),
             "train_size": len(X_train),
             "test_size": len(X_test),
+            "test_accuracy": float(accuracy),
+            "test_f1": float(f1),
+            "hyperparameter_tuning": len(X_train) >= 20,
+            "cv_folds": cv_folds if len(X_train) >= 20 else "not_used",
+            "class_distribution": df['label'].value_counts().to_dict(),
+            "training_config": {
+                "max_features": 5000,
+                "ngram_range": [1, 2],
+                "feature_selection_k": 2000,
+                "test_size": 0.2
+            },
             "timestamp": datetime.now().isoformat(),
+            "initialization_notes": "Model trained during system initialization",
+            "ready_for_production": True
         }
 
+        # Save metadata
         with open("/tmp/metadata.json", 'w') as f:
             json.dump(metadata, f, indent=2)
 
+        log_step("✅ Saved comprehensive metadata")
+        log_step(f"Initial model training completed successfully!")
+        log_step(f"Final Performance - Accuracy: {accuracy:.4f}, F1: {f1:.4f}")
+
         return True
 
     except Exception as e:
         log_step(f"❌ Training failed: {str(e)}")
+        import traceback
+        log_step(f"Error details: {traceback.format_exc()}")
         return False
 
 

@@ -171,16 +346,24 @@ def create_initial_logs():
     # Activity log
     activity_log = [{
         "timestamp": datetime.now().strftime("%Y-%m-%d %I:%M %p"),
+        "event": "System initialized successfully with trained model",
+        "level": "INFO"
     }]
 
     with open("/tmp/activity_log.json", 'w') as f:
         json.dump(activity_log, f, indent=2)
 
     # Create empty monitoring logs
+    log_dirs = ["/tmp/logs"]
+    for log_dir in log_dirs:
+        Path(log_dir).mkdir(parents=True, exist_ok=True)
+
     with open("/tmp/logs/monitoring_log.json", 'w') as f:
         json.dump([], f)
 
+    with open("/tmp/logs/scheduler_execution.json", 'w') as f:
+        json.dump([], f)
+
     log_step("✅ Initial log files created")
     return True
 

@@ -189,22 +372,98 @@ def create_initial_logs():
         return False
 
 
+def validate_installation():
+    """Validate that the system is properly set up"""
+    log_step("Validating system installation...")
+
+    validation_checks = []
+
+    # Check model files
+    model_exists, model_files = check_model_exists()
+    validation_checks.append(("Model Files", model_exists, f"Found: {[str(f.name) for f in model_files]}"))
+
+    # Check data files
+    data_exists, data_files = check_training_data_exists()
+    validation_checks.append(("Training Data", data_exists, f"Found: {len(data_files)} files"))
+
+    # Check directories
+    required_dirs = ["/tmp/data", "/tmp/model", "/tmp/logs"]
+    dirs_exist = all(Path(d).exists() for d in required_dirs)
+    validation_checks.append(("Directories", dirs_exist, f"Required dirs: {required_dirs}"))
+
+    # Check logs
+    log_exists = Path("/tmp/activity_log.json").exists()
+    validation_checks.append(("Log Files", log_exists, "Activity log created"))
+
+    # Test model loading
+    model_loadable = False
+    try:
+        import joblib
+        pipeline = joblib.load("/tmp/pipeline.pkl")
+        test_prediction = pipeline.predict(["This is a test news article"])
+        model_loadable = True
+        validation_checks.append(("Model Loading", True, f"Test prediction: {test_prediction[0]}"))
+    except Exception as e:
+        validation_checks.append(("Model Loading", False, f"Error: {str(e)}"))
+
+    # Print validation results
+    log_step("Validation Results:")
+    all_passed = True
+    for check_name, passed, details in validation_checks:
+        status = "✅ PASS" if passed else "❌ FAIL"
+        log_step(f"   {status} {check_name}: {details}")
+        if not passed:
+            all_passed = False
+
+    return all_passed, validation_checks
+
+
 def main():
+    """Main initialization function with smart training logic"""
+    log_step("Starting intelligent system initialization...")
+
+    # Check if model already exists
+    model_exists, existing_model_files = check_model_exists()
+
+    if model_exists:
+        log_step("EXISTING INSTALLATION DETECTED")
+        log_step("Found existing model files - skipping training")
+
+        # Load existing metadata to show info
+        try:
+            with open("/tmp/metadata.json", 'r') as f:
+                metadata = json.load(f)
+
+            log_step(f"Existing Model Info:")
+            log_step(f"   - Version: {metadata.get('model_version', 'Unknown')}")
+            log_step(f"   - Accuracy: {metadata.get('test_accuracy', 'Unknown')}")
+            log_step(f"   - F1 Score: {metadata.get('test_f1', 'Unknown')}")
+            log_step(f"   - Created: {metadata.get('timestamp', 'Unknown')}")
+
+        except Exception as e:
+            log_step(f"⚠️ Could not read existing metadata: {e}")
+
+    else:
+        log_step("FIRST-TIME INSTALLATION DETECTED")
+        log_step("No existing model found - will train new model")
 
+    # Run initialization steps
     steps = [
         ("Directory Creation", create_directories),
         ("Dataset Copy", copy_original_datasets),
+        ("Dataset Preparation", create_minimal_dataset),
         ("Log Creation", create_initial_logs)
     ]
 
+    # Add training step only if model doesn't exist
+    if not model_exists:
+        steps.insert(-1, ("Model Training", run_initial_training))
+
     failed_steps = []
 
     for step_name, step_function in steps:
         try:
+            log_step(f"Starting: {step_name}")
             if step_function():
                 log_step(f"✅ {step_name} completed")
             else:

@@ -214,15 +473,35 @@ def main():
             log_step(f"❌ {step_name} failed: {str(e)}")
             failed_steps.append(step_name)
 
+    # Final validation
+    log_step("Running final system validation...")
+    validation_passed, validation_results = validate_installation()
+
+    # Summary
+    log_step("=" * 60)
     if failed_steps:
+        log_step(f"⚠️ Initialization completed with {len(failed_steps)} issues")
+        log_step(f"❌ Failed steps: {', '.join(failed_steps)}")
     else:
         log_step("System initialization completed successfully!")
 
+    if validation_passed:
+        log_step("✅ All validation checks passed!")
+        log_step("System is ready for use!")
+
+        if not model_exists:
+            log_step("NEW MODEL TRAINED AND READY")
+            log_step("You can now start making predictions!")
+        else:
+            log_step("EXISTING MODEL VALIDATED AND READY")
+            log_step("System restored from previous installation!")
+
+    else:
+        log_step("❌ Some validation checks failed")
+        log_step("Manual intervention may be required")
+
+    log_step("=" * 60)
 
 
 if __name__ == "__main__":
+    main()
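One detail worth noting in the main() hunk above: steps.insert(-1, ...) places the training step just before the final "Log Creation" entry, so on a first-time install the effective order is Directory Creation, Dataset Copy, Dataset Preparation, Model Training, Log Creation. A small illustrative snippet of that list behaviour (plain strings stand in for the (name, function) tuples used in the script):

# Illustrative only: list.insert(-1, x) inserts before the last element.
steps = ["Directory Creation", "Dataset Copy", "Dataset Preparation", "Log Creation"]
steps.insert(-1, "Model Training")
print(steps)
# ['Directory Creation', 'Dataset Copy', 'Dataset Preparation', 'Model Training', 'Log Creation']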