# src/evaluate.py
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
from tqdm.auto import tqdm
from scipy.stats import mode

# --- Configuration ---
PROCESSED_DATA_DIR = "../datasets/processed"
MODELS_DIR = "../models"
RESULTS_DIR = "../results"
MAX_LEN = 256
BATCH_SIZE = 16

os.makedirs(RESULTS_DIR, exist_ok=True)

MODEL_CONFIG = {
    "bert": "BERT",
    "roberta": "RoBERTa",
    "electra": "ELECTRA",
    "xlnet": "XLNet"
}


# --- Custom Dataset class ---
class NewsDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text = str(self.texts[item])
        label = self.labels[item]
        # Tokenize, pad/truncate to MAX_LEN, and return fixed-size tensors
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }


# --- Function to get predictions ---
def get_predictions(model, data_loader, device):
    model = model.eval().to(device)
    predictions = []
    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Predicting", leave=False):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            # Predicted class = argmax over the output logits
            _, preds = torch.max(outputs.logits, dim=1)
            predictions.extend(preds.cpu().numpy())
    return np.array(predictions)


# --- New visualization functions ---
def plot_class_distribution(df, output_path):
    plt.figure(figsize=(8, 6))
    sns.countplot(x='label', data=df, palette='pastel')
    plt.title('Class Distribution in the Test Data', fontsize=16)
    plt.xlabel('Label', fontsize=12)
    plt.ylabel('Count', fontsize=12)
    plt.xticks(ticks=[0, 1], labels=['Fact (0)', 'Hoax (1)'])
    plt.tight_layout()
    plt.savefig(output_path, dpi=300)
    plt.close()
    print(f"Class distribution plot saved to: {output_path}")


def plot_text_length_distribution(df, output_path):
    plt.figure(figsize=(10, 6))
    text_lengths = df['text'].str.split().str.len()
    sns.histplot(text_lengths, bins=50, kde=True, color='skyblue')
    plt.title('Text Length Distribution in the Test Data', fontsize=16)
    plt.xlabel('Number of Words', fontsize=12)
    plt.ylabel('Frequency', fontsize=12)
    plt.tight_layout()
    plt.savefig(output_path, dpi=300)
    plt.close()
    print(f"Text length distribution plot saved to: {output_path}")


def plot_full_metric_comparison(summary_df, output_path):
    # 'Melt' the dataframe for grouped plotting with seaborn
    df_melted = pd.melt(
        summary_df,
        id_vars=['Model'],
        value_vars=['Accuracy', 'Precision', 'Recall', 'F1-score'],
        var_name='Metric',
        value_name='Score'
    )
    plt.figure(figsize=(14, 8))
    barplot = sns.barplot(x='Model', y='Score', hue='Metric', data=df_melted, palette='viridis')
    plt.title('Performance Metric Comparison Across Models', fontsize=16, pad=20)
    plt.xlabel('Model', fontsize=12)
    plt.ylabel('Score', fontsize=12)
    plt.xticks(rotation=15, ha="right")
    plt.ylim(0.9, 1.0)  # Focus on the small score differences
    plt.legend(title='Metric')
    # Add value labels above the bars
    for container in barplot.containers:
        barplot.bar_label(container, fmt='%.4f', fontsize=9, padding=3)
    plt.tight_layout()
    plt.savefig(output_path, dpi=300)
    plt.close()
    print(f"Full metric comparison plot saved to: {output_path}")
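
# Note: plot_full_metric_comparison hard-codes plt.ylim(0.9, 1.0) to highlight
# small differences between models; any score below 0.9 would be clipped out
# of the chart, so the limits need adjusting if weaker models are evaluated.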

# --- Main evaluation function ---
def main_evaluation():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # 1. Load the test data and create the data visualizations
    test_data_path = os.path.join(PROCESSED_DATA_DIR, "test.csv")
    if not os.path.exists(test_data_path):
        print(f"ERROR: Test data file not found at {test_data_path}")
        return
    test_df = pd.read_csv(test_data_path)
    test_df.dropna(subset=['text', 'label'], inplace=True)
    true_labels = test_df['label'].astype(int).to_numpy()
    test_texts = test_df['text'].tolist()
    if len(test_texts) == 0:
        print("Test data is empty. Stopping evaluation.")
        return

    print("\n--- Creating Test Data Visualizations ---")
    plot_class_distribution(test_df, os.path.join(RESULTS_DIR, "class_distribution.png"))
    plot_text_length_distribution(test_df, os.path.join(RESULTS_DIR, "text_length_distribution.png"))

    # 2. Get predictions from each individual model
    all_predictions = {}
    for model_folder, model_name in MODEL_CONFIG.items():
        print(f"\n--- Processing model: {model_name} ---")
        model_path = os.path.join(MODELS_DIR, model_folder)
        if not os.path.exists(model_path):
            print(f"  WARNING: Model directory {model_path} not found. Skipping.")
            continue
        try:
            tokenizer = AutoTokenizer.from_pretrained(model_path)
            model = AutoModelForSequenceClassification.from_pretrained(model_path)
            test_dataset = NewsDataset(test_texts, true_labels, tokenizer, MAX_LEN)
            test_data_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, num_workers=0)
            predictions = get_predictions(model, test_data_loader, device)
            all_predictions[model_name] = predictions
            print(f"  Predictions for model {model_name} finished.")
        except Exception as e:
            print(f"  ERROR while processing model {model_name}: {e}")

    if not all_predictions:
        print("CRITICAL: No model was processed successfully. Stopping evaluation.")
        return

    # 3. Build the ensemble (bagging) prediction with majority voting
    stacked_predictions = np.array(list(all_predictions.values()))
    ensemble_preds, _ = mode(stacked_predictions, axis=0, keepdims=False)
    all_predictions["Bagging (Ensemble)"] = ensemble_preds
    print("\n--- Ensemble (bagging) prediction via voting finished. ---")
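
    # Note: scipy.stats.mode takes the element-wise majority vote across the
    # stacked (n_models, n_samples) prediction array; with four models, a 2-2
    # tie resolves to the smallest label, i.e. class 0 ("Fact"). The
    # keepdims=False argument requires scipy >= 1.9.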

    # 4. Compute metrics and generate outputs for all models
    results_summary = []
    for model_name, predictions in all_predictions.items():
        print(f"\n--- Evaluation results for: {model_name} ---")
        report_dict = classification_report(
            true_labels, predictions,
            target_names=['Fact (0)', 'Hoax (1)'],
            output_dict=True, zero_division=0
        )
        report_text = classification_report(
            true_labels, predictions,
            target_names=['Fact (0)', 'Hoax (1)'],
            zero_division=0
        )
        print("\nClassification Report:")
        print(report_text)
        with open(os.path.join(RESULTS_DIR, f"{model_name.replace(' ', '_')}_classification_report.txt"), "w") as f:
            f.write(report_text)

        cm = confusion_matrix(true_labels, predictions)
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                    xticklabels=['Fact (0)', 'Hoax (1)'],
                    yticklabels=['Fact (0)', 'Hoax (1)'])
        plt.xlabel('Predicted Label')
        plt.ylabel('True Label')
        plt.title(f'Confusion Matrix - {model_name}')
        cm_path = os.path.join(RESULTS_DIR, f"{model_name.replace(' ', '_')}_confusion_matrix.png")
        plt.savefig(cm_path, dpi=300)
        plt.close()
        print(f"  Confusion matrix saved to {cm_path}")

        results_summary.append({
            "Model": model_name,
            "Accuracy": report_dict['accuracy'],
            "Precision": report_dict['weighted avg']['precision'],
            "Recall": report_dict['weighted avg']['recall'],
            "F1-score": report_dict['weighted avg']['f1-score']
        })

    # 5. Build and save the final comparison table and comparison chart
    if results_summary:
        summary_df = pd.DataFrame(results_summary)
        summary_df = summary_df.sort_values(by="F1-score", ascending=False).reset_index(drop=True)
        print("\n--- Final Model Comparison Summary ---")
        print(summary_df.round(4).to_string())
        summary_df.to_csv(os.path.join(RESULTS_DIR, "model_comparison_summary.csv"), index=False, float_format='%.4f')
        summary_df.to_excel(os.path.join(RESULTS_DIR, "model_comparison_summary.xlsx"), index=False, float_format='%.4f')
        print(f"\nComparison summary saved in '{RESULTS_DIR}/'")
        # Call the new metric comparison visualization function
        plot_full_metric_comparison(summary_df, os.path.join(RESULTS_DIR, "full_metric_comparison_plot.png"))


if __name__ == '__main__':
    main_evaluation()
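
# Usage sketch (assumption about the surrounding project layout): run from the
# src/ directory so the relative ../datasets, ../models and ../results paths
# resolve, with each fine-tuned checkpoint and its tokenizer saved via
# save_pretrained() into ../models/<bert|roberta|electra|xlnet>:
#
#   cd src && python evaluate.py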