Ahmedik95316 committed on
Commit
83527bc
Β·
1 Parent(s): 35827c4

Update model/train.py

Browse files

Fixed the file paths to point to the `/tmp` folder, since the original paths are read-only in the deployment environment

Files changed (1) hide show
  1. model/train.py +85 -76
model/train.py CHANGED
@@ -1,76 +1,85 @@
1
- import pandas as pd
2
- from pathlib import Path
3
- from sklearn.feature_extraction.text import TfidfVectorizer
4
- from sklearn.linear_model import LogisticRegression
5
- from sklearn.metrics import accuracy_score
6
- from sklearn.model_selection import train_test_split
7
- import joblib
8
- import json
9
- import datetime
10
- import hashlib
11
-
12
- # Paths
13
- BASE_DIR = Path(__file__).resolve().parent
14
- DATA_PATH = BASE_DIR.parent / "data" / "combined_dataset.csv"
15
- MODEL_PATH = BASE_DIR / "model.pkl"
16
- VECTORIZER_PATH = BASE_DIR / "vectorizer.pkl"
17
- METADATA_PATH = BASE_DIR / "metadata.json"
18
-
19
- def hash_file(filepath):
20
- content = Path(filepath).read_bytes()
21
- return hashlib.md5(content).hexdigest()
22
-
23
- def main():
24
- # Load dataset
25
- # print('Dataset Loaded')
26
- df = pd.read_csv(DATA_PATH)
27
- X = df['text']
28
- y = df['label']
29
-
30
- # Train-test split
31
- X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)
32
-
33
- # Vectorize
34
- vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
35
- X_train_vec = vectorizer.fit_transform(X_train)
36
- X_test_vec = vectorizer.transform(X_test)
37
-
38
- # print('Train/Test Splits Created')
39
- # print('Starting Model Training')
40
-
41
- # Train model
42
- model = LogisticRegression(max_iter=1000)
43
- model.fit(X_train_vec, y_train)
44
-
45
- # print('Model Training Completed')
46
- #print('Model Evaluation Starting!')
47
-
48
- # Evaluate
49
- y_pred = model.predict(X_test_vec)
50
- acc = accuracy_score(y_test, y_pred)
51
-
52
- # Save model + vectorizer
53
- joblib.dump(model, MODEL_PATH)
54
- joblib.dump(vectorizer, VECTORIZER_PATH)
55
-
56
- # print('Model Evaluation Done')
57
- # print('Model Saved!')
58
-
59
- # Save metadata
60
- metadata = {
61
- "model_version": f"v1.0",
62
- "data_version": hash_file(DATA_PATH),
63
- "train_size": len(X_train),
64
- "test_size": len(X_test),
65
- "test_accuracy": round(acc, 4),
66
- "timestamp": datetime.datetime.now().isoformat()
67
- }
68
- with open(METADATA_PATH, 'w') as f:
69
- json.dump(metadata, f, indent=4)
70
-
71
- print(f"βœ… Model trained and saved.")
72
- print(f"πŸ“Š Test Accuracy: {acc:.4f}")
73
- print(f"πŸ“ Metadata saved to {METADATA_PATH}")
74
-
75
- if __name__ == "__main__":
76
- main()
 
 
 
 
 
 
 
 
 
 
1
# Standard library
import datetime
import hashlib
import json
from pathlib import Path

# Third-party
import joblib
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# All paths live under /tmp: the original package directory is read-only
# in the deployment environment, so both the input data and the trained
# artifacts are kept in the writable /tmp tree.
BASE_DIR = Path("/tmp")
DATA_PATH = BASE_DIR / "data" / "combined_dataset.csv"

# Model artifacts also in /tmp (move these to /app/model if they should
# persist inside the container image instead).
MODEL_PATH = BASE_DIR / "model.pkl"
VECTORIZER_PATH = BASE_DIR / "vectorizer.pkl"
METADATA_PATH = BASE_DIR / "metadata.json"
28
def hash_file(filepath):
    """Return the hex MD5 digest of *filepath*'s contents.

    The digest is used only as a data-version fingerprint in the training
    metadata, not for security, so MD5 is acceptable here.  The file is
    hashed in fixed-size chunks so a large dataset never has to fit in
    memory at once (the previous version read the whole file with
    ``read_bytes()``).
    """
    digest = hashlib.md5()
    with open(filepath, "rb") as f:
        # iter(callable, sentinel) yields chunks until read() returns b"".
        for chunk in iter(lambda: f.read(65536), b""):
            digest.update(chunk)
    return digest.hexdigest()
31
+
32
def main():
    """Train the text classifier and persist model, vectorizer and metadata.

    Reads the combined dataset from ``DATA_PATH``, fits a TF-IDF +
    LogisticRegression pipeline, measures held-out accuracy, and writes
    ``model.pkl``, ``vectorizer.pkl`` and ``metadata.json`` under /tmp.

    Raises:
        FileNotFoundError: if the dataset CSV is missing.
        KeyError: if the CSV lacks 'text'/'label' columns.
    """
    # Load dataset; assumes columns 'text' and 'label' — TODO confirm
    # against the producer of combined_dataset.csv.
    df = pd.read_csv(DATA_PATH)
    X = df['text']
    y = df['label']

    # Stratified split keeps the label balance identical in both partitions;
    # fixed random_state makes runs reproducible.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, test_size=0.2, random_state=42
    )

    # Vectorize: English stop words removed, vocabulary capped at 5000 terms.
    vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    # Train model.
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train_vec, y_train)

    # Evaluate on the held-out split.
    y_pred = model.predict(X_test_vec)
    acc = accuracy_score(y_test, y_pred)

    # Ensure the artifact directory exists before writing — robust if
    # BASE_DIR is later pointed at a path that must be created first.
    MODEL_PATH.parent.mkdir(parents=True, exist_ok=True)

    # Save model + vectorizer.
    joblib.dump(model, MODEL_PATH)
    joblib.dump(vectorizer, VECTORIZER_PATH)

    # Save metadata alongside the artifacts for reproducibility/auditing.
    metadata = {
        "model_version": "v1.0",  # plain literal; was an f-string with no placeholders
        "data_version": hash_file(DATA_PATH),
        "train_size": len(X_train),
        "test_size": len(X_test),
        "test_accuracy": round(acc, 4),
        # NOTE(review): naive local time; switch to datetime.now(timezone.utc)
        # if consumers compare timestamps across machines — confirm first.
        "timestamp": datetime.datetime.now().isoformat()
    }
    with open(METADATA_PATH, 'w') as f:
        json.dump(metadata, f, indent=4)

    print("βœ… Model trained and saved.")
    print(f"πŸ“Š Test Accuracy: {acc:.4f}")
    print(f"πŸ“ Metadata saved to {METADATA_PATH}")


if __name__ == "__main__":
    main()