import streamlit as st
import joblib
import pandas as pd
import re
import emoji
import json
import io
import unicodedata
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import os
from wordcloud import WordCloud
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from transformers import pipeline
from collections import Counter
import nltk 
from nltk.corpus import stopwords
from datetime import datetime
from huggingface_hub import hf_hub_download
from huggingface_hub import HfApi, login

huggingface_token = os.getenv("HF_TOKEN")
if huggingface_token is None:
    st.error("❌ Token Hugging Face tidak ditemukan. Set token sebagai environment variable HF_TOKEN.")
    st.stop()

login(token=huggingface_token)

# Inisialisasi API
api = HfApi()

repo_id="zakyirhab0/Klasifikasi_Komentar" 

# === Preprocessing Functions === #
candidate_list = ["Lalu Muhamad Iqbal", "Indah Dhamayanti Putri", "Zulkieflimansyah", "M Suhaili", "Sitti Rohmi Djalilah", "Musyafirin"]

# Unduh stopwords jika belum terunduh
nltk.download('stopwords')
stop_words = set(stopwords.words('indonesian'))

# Memastikan data "BA Lainnya" tersedia
if 'data_with_ba' in st.session_state:
    ba_lainnya_data = st.session_state['data_with_ba']
else:
    ba_lainnya_data = None

def translate_emojis(text):
    if not isinstance(text, str):
        return ''  # Mengembalikan string kosong jika input bukan string
    return ''.join(c for c in text if not emoji.is_emoji(c))  # Remove all emojis
  # Remove all emojis

def normalize_unicode(text):
    return unicodedata.normalize('NFKD', text)

def clean_text(text):
    text = str(text).casefold()  # Convert to lowercase
    text = re.sub(r'http\S+|www\S+', '', text)  # Remove URLs
    text = re.sub(r'[^a-z\s]', '', text)  # Remove non-alphabetic characters
    text = re.sub(r'\s+', ' ', text).strip()  # Normalize spaces
    return text

def handle_negation(text):
    negation_words = {"tidak", "bukan", "jangan", "belum", "kurang", "gagal", "sulit"}
    words = text.split()
    result = []
    skip_next = False
    for i, word in enumerate(words):
        if word in negation_words and i + 1 < len(words):
            result.append(f"{word}_{words[i + 1]}")  # Combine negation with next word
            skip_next = True
        elif skip_next:
            skip_next = False
        else:
            result.append(word)
    return ' '.join(result)

def handle_replies(text):
    text = re.sub(r'=--*@\w+', '', text)  # Remove multi-level reply patterns
    text = re.sub(r'=-*@\w+', '', text)   # Remove single-level reply patterns
    text = re.sub(r'@\w+', '', text)      # Remove standalone @username mentions
    return text

def translate_text(text, dictionary):
    words = text.split()
    return ' '.join([dictionary.get(word.lower(), word) for word in words])  # Translate words using dictionary

# Fungsi untuk menetapkan sentimen berdasarkan kata kunci
def assign_sentiment_based_on_keywords(comment, keyword_dict):
    for sentiment, keywords in keyword_dict.items():
        if any(keyword in comment for keyword in keywords):
            return sentiment
    return 'unknown'

# === Load Dictionaries === #
def load_dictionary(file_path, file_type='json'):
    if file_type == 'json':
        with open(file_path, 'r', encoding='utf-8') as file:
            return json.load(file)
    elif file_type == 'csv':
        df = pd.read_csv(file_path, names=['slang', 'formal'])
        return pd.Series(df['formal'].values, index=df['slang']).to_dict()

ntb_dict_path = hf_hub_download(repo_id="zakyirhab0/Klasifikasi_Komentar", filename="ntb_dict.json", repo_type="dataset")
slang_dict_path = hf_hub_download(repo_id="zakyirhab0/Klasifikasi_Komentar", filename="kamusalay.csv", repo_type="dataset")

ntb_dict = load_dictionary(ntb_dict_path, 'json')
slang_dict = load_dictionary(slang_dict_path, 'csv')

# === Utility Functions === #
# Fungsi untuk memperbarui file JSON dengan riwayat update
def update_history_json(history_path, komentar, link, model_data, field, date):
    # Konversi Timestamp menjadi string
    if isinstance(date, pd.Timestamp):
        date = date.strftime('%Y-%m-%d')

    # Baca histori dari file JSON
    try:
        with open(history_path, 'r') as file:
            history_data = json.load(file)
    except (FileNotFoundError, json.JSONDecodeError):
        history_data = {}

    # Kunci riwayat untuk komentar, link, dan model data tertentu
    key = f"{komentar}_{link}_{model_data}"
    
    # Tambahkan riwayat baru ke kunci yang sesuai
    if key in history_data:
        history_data[key].append({field: date})
    else:
        history_data[key] = [{field: date}]
    
    # Simpan JSON secara sementara
    with open("history_temp.json", "w") as file:
        json.dump(history_data, file, indent=4)

    # Upload ke Hugging Face Storage
    api = HfApi()
    api.upload_file(
        path_or_fileobj="history_temp.json",
        path_in_repo=f"history_{candidate.lower().replace(' ', '_')}.json",
        repo_id="zakyirhab0/Klasifikasi_Komentar",
        repo_type="dataset"
    )

# Fungsi untuk memperbarui dataset pelatihan dengan data baru
def update_training_dataset(output, candidate):
    dataset_path = hf_hub_download(repo_id="zakyirhab0/Klasifikasi_Komentar", filename=f"datasetntbnew_{selected_candidate.lower().replace(' ', '_')}.xlsx", repo_type="dataset")
    history_path = hf_hub_download(repo_id="zakyirhab0/Klasifikasi_Komentar", filename=f"history_{candidate.lower().replace(' ', '_')}.json", repo_type="dataset")

    try:
        required_columns = ['model_data', 'Platform', 'komentar', 'link', 'kandidat', 'sentimen', 'tanggal', 'tanggal_masuk']
        output = output[required_columns].copy()

        if 'predicted_category' in output.columns:
            output['sentimen'] = output['predicted_category']
            output.drop(columns=['predicted_category'], inplace=True)

        output['tanggal_masuk'] = pd.Timestamp.now()

        if os.path.exists(dataset_path):
            existing_data = pd.read_excel(dataset_path)
        else:
            existing_data = pd.DataFrame(columns=required_columns)

        # Menambahkan kolom 'update_ba' jika belum ada
        if 'update_ba' not in existing_data.columns:
            existing_data['update_ba'] = None

        # Menambahkan kolom 'missing_comment' jika belum ada
        if 'missing_comment' not in existing_data.columns:
            existing_data['missing_comment'] = False

        # Langkah 1: Memeriksa Komentar yang Hilang
        train_comments = existing_data.groupby('link')['komentar'].apply(list).to_dict()
        new_comments = output.groupby('link')['komentar'].apply(list).to_dict()

        for link, comments in train_comments.items():
            if link in new_comments:
                new_comment_set = set(new_comments[link])
                for comment in comments:
                    if comment not in new_comment_set:
                        existing_data.loc[(existing_data['link'] == link) & (existing_data['komentar'] == comment), 'missing_comment'] = True
                    else:
                        existing_data.loc[(existing_data['link'] == link) & (existing_data['komentar'] == comment), 'missing_comment'] = False

        # Fungsi untuk memperbarui data
        def update_data(existing_data, new_data, history_path):
            for index, row in new_data.iterrows():
                komentar = row['komentar']
                link = row['link']
                model_data = row['model_data']
                tanggal_klasifikasi = pd.Timestamp.now()

                # Komentar Sama Sudah Ada
                existing_entry = existing_data[(existing_data['link'] == link) & (existing_data['komentar'] == komentar)]
                if not existing_entry.empty:
                    existing_data.loc[existing_entry.index, 'update_ba'] = tanggal_klasifikasi
                    update_history_json(history_path, komentar, link, model_data, 'update_ba', tanggal_klasifikasi)
                else:
                    # Link Sama, Komentar Berbeda
                    existing_link_entry = existing_data[(existing_data['link'] == link)]
                    if not existing_link_entry.empty:
                        new_row = row.copy()
                        new_row['tanggal_masuk'] = tanggal_klasifikasi
                        new_row['update_ba'] = tanggal_klasifikasi
                        existing_data = pd.concat([existing_data, new_row.to_frame().T], ignore_index=True)
                        update_history_json(history_path, komentar, link, model_data, 'tanggal_masuk', tanggal_klasifikasi)
                        update_history_json(history_path, komentar, link, model_data, 'update_ba', tanggal_klasifikasi)
                    else:
                        # Link Baru
                        new_row = row.copy()
                        new_row['tanggal_masuk'] = tanggal_klasifikasi
                        new_row['update_ba'] = tanggal_klasifikasi
                        existing_data = pd.concat([existing_data, new_row.to_frame().T], ignore_index=True)
                        update_history_json(history_path, komentar, link, model_data, 'tanggal_masuk', tanggal_klasifikasi)
                        update_history_json(history_path, komentar, link, model_data, 'update_ba', tanggal_klasifikasi)

            # Perbarui nilai None di update_ba dengan tanggal_masuk
            existing_data['update_ba'] = pd.to_datetime(existing_data['update_ba'], errors='coerce')
            existing_data['update_ba'].fillna(existing_data['tanggal_masuk'], inplace=True)
            return existing_data

        updated_data = update_data(existing_data, output, history_path)
        updated_data.to_excel("dataset_temp.xlsx", index=False)

        api.upload_file(
            path_or_fileobj="dataset_temp.xlsx",
            path_in_repo=f"datasetntbnew_{candidate.lower().replace(' ', '_')}.xlsx",
            repo_id="zakyirhab0/Klasifikasi_Komentar",
            repo_type="dataset"
        )
        st.success(f"Data successfully updated in {candidate}'s training dataset.")
        
        if 'missing_comment' in existing_data.columns and existing_data['missing_comment'].any():
            st.subheader("Missing Comments")
            st.write("Comments that were found to be missing:")
            st.dataframe(existing_data[existing_data['missing_comment']])
    except KeyError as e:
        st.error(f"Missing column in the dataset: {e}")
    except Exception as e:
        st.error(f"An error occurred: {e}")

# Fungsi untuk menyimpan keyword dictionary ke Hugging Face
def save_keywords_to_huggingface(keyword_dict):
    file_name = "keywords.json"
    
    # Simpan ke file JSON sementara
    with open(file_name, 'w', encoding='utf-8') as f:
        json.dump(keyword_dict, f, indent=4)

    # Upload ke Hugging Face
    api = HfApi()
    api.upload_file(
        path_or_fileobj=file_name,
        path_in_repo=file_name,
        repo_id="zakyirhab0/Klasifikasi_Komentar",
        repo_type="dataset"
    )

    st.success(f"✅ Keywords updated and uploaded to Hugging Face!")

# Load keywords dari Hugging Face
def load_keywords_from_huggingface():
    try:
        keywords_path = hf_hub_download(repo_id="zakyirhab0/Klasifikasi_Komentar", filename="keywords.json", repo_type="dataset")
        with open(keywords_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except Exception as e:
        st.error(f"❌ Error loading keywords: {e}")
        return {}

def clustering_based_evaluation(df, n_clusters=10):
    st.write("Starting preprocessing...")
    df['translated_emojis'] = df['komentar'].fillna('').astype(str).apply(translate_emojis)
    df['normalized_unicode'] = df['translated_emojis'].apply(normalize_unicode)
    df['reply_handled'] = df['normalized_unicode'].apply(handle_replies)
    df['clean_text'] = df['reply_handled'].apply(clean_text)
    df['translated_ntb'] = df['clean_text'].apply(lambda x: translate_text(x, ntb_dict))
    df['translated_slang'] = df['translated_ntb'].apply(lambda x: translate_text(x, slang_dict))
    df['negation_handled'] = df['translated_slang'].apply(handle_negation)

    st.write("Generating TF-IDF vectors...")
    tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
    tfidf_matrix = tfidf_vectorizer.fit_transform(df['negation_handled'])

    st.write(f"Clustering into {n_clusters} clusters...")
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    df['Cluster'] = kmeans.fit_predict(tfidf_matrix)

    st.write("Performing PCA for visualization...")
    pca = PCA(n_components=2)
    reduced_data = pca.fit_transform(tfidf_matrix.toarray())
    df['PCA1'] = reduced_data[:, 0]
    df['PCA2'] = reduced_data[:, 1]

    st.write("Clustering completed successfully!")
    return df

def load_and_process_data(dataset_path, history_path):
    dataset_path = hf_hub_download(repo_id="zakyirhab0/Klasifikasi_Komentar", filename=f"datasetntbnew_{candidate.lower().replace(' ', '_')}.xlsx", repo_type="dataset")
    df = pd.read_excel(dataset_path)
    df['tanggal_masuk'] = pd.to_datetime(df['tanggal_masuk'], errors='coerce')

    if df['tanggal_masuk'].isnull().any():
        st.warning("Some dates could not be parsed correctly. Please check the date format in the dataset.")
        df['tanggal_masuk'].fillna(pd.Timestamp.now().strftime('%Y-%m-%d'), inplace=True)

    required_columns = ['model_data', 'Platform', 'komentar', 'sentimen', 'tanggal', 'tanggal_masuk', 'evaluated_by_cluster']
    for col in required_columns:
        if col not in df.columns:
            if col == 'tanggal_masuk':
                df[col] = pd.Timestamp.now().strftime('%Y-%m-%d')
            elif col == 'evaluated_by_cluster':
                df[col] = False
            else:
                df[col] = None

    df = df[required_columns]

    try:
        with open(history_path, "r") as f:
            history = json.load(f)
    except FileNotFoundError:
        history = []

    return df, history

# Define the function to handle Special Cluster
def handle_special_cluster(data, keywords, detector):
    for index, row in data.iterrows():
        text = row['negation_handled']

        # Cek apakah teks mengandung salah satu kata kunci
        detected = False
        for sentiment, words in keywords.items():
            if any(word in text for word in words):
                data.loc[index, 'predicted_category'] = sentiment.replace('_', ' ')
                data.loc[index, 'detected_by'] = 'keyword'
                detected = True
                break
        
        if not detected:
            # Jika tidak ada kata kunci yang cocok, gunakan deteksi sarkasme
            result = detector(text)
            if result[0]['label'] == 'SARCASM':
                data.loc[index, 'predicted_category'] = 'co sarkastic'
                data.loc[index, 'detected_by'] = 'sarcasm'
            else:
                # Jika tidak ada sentimen yang cocok
                data.loc[index, 'predicted_category'] = 'Unknown'
                data.loc[index, 'detected_by'] = 'unknown'
                
    return data

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\W+', ' ', text)  # Remove non-alphanumeric characters
    words = text.split()
    words = [word for word in words if word not in stop_words]
    return words

def display_word_frequencies(words, num_words):
    st.subheader(f"Top {num_words} Words")
    for word, freq in words:
        st.write(f"{word}: {freq}")

def update_sentiment(index, new_sentimen):
    df.loc[index, 'sentimen'] = new_sentimen
    st.write(f"Updated sentiment for comment at index {index} to {new_sentimen}")

file_name = "corrected_comments.xlsx"

# Fungsi untuk memuat data `Corrected Comments` dari Hugging Face atau lokal jika tidak ada
def load_corrected_comments():
    try:
        print("🔄 Mencoba memuat data dari Hugging Face...")
        corrected_comments_path = hf_hub_download(repo_id="zakyirhab0/Klasifikasi_Komentar", filename=file_name, repo_type="dataset")
        return pd.read_excel(corrected_comments_path, sheet_name='Corrected Comments')
    except Exception as e:
        print(f"⚠ Gagal memuat dari Hugging Face: {e}")
        
        # Coba load dari lokal jika ada
        if os.path.exists(file_name):
            print("🔄 Memuat data dari file lokal...")
            return pd.read_excel(file_name, sheet_name='Corrected Comments')

        # Jika tidak ada, return DataFrame kosong
        print("❌ Tidak ditemukan data corrected_comments. Menggunakan DataFrame kosong.")
        return pd.DataFrame(columns=['model_data', 'Platform', 'komentar', 'tanggal', 'link', 'sentimen', 'Cluster_Name', 'corrected_by', 'tanggal_masuk'])

# Fungsi untuk menyimpan data `Corrected Comments` ke Hugging Face
def save_corrected_comments(data):
    print("💾 Menyimpan corrected_comments.xlsx ke lokal...")
    data.to_excel(file_name, sheet_name='Corrected Comments', index=False)

    print("🚀 Mengunggah ke Hugging Face...")
    api = HfApi()
    api.upload_file(
        path_or_fileobj=file_name,
        path_in_repo=file_name,
        repo_id="zakyirhab0/Klasifikasi_Komentar",
        repo_type="dataset"
    )

    print(f"✅ {file_name} berhasil diperbarui di Hugging Face.")

# Load data saat pertama kali dijalankan
corrected_comments = load_corrected_comments()

# Fungsi untuk menampilkan visualisasi distribusi komentar dalam cluster
def display_cluster_visualization(ba_lainnya_data):
    st.subheader("Cluster Visualization")

    # Menghitung jumlah komentar dalam kategori `Similar Sentiment` dan `Special Cluster`
    cluster_counts = ba_lainnya_data[ba_lainnya_data['Cluster_Name'].str.contains('Similar|Special Cluster')]['Cluster_Name'].value_counts()

    # Membuat grafik batang (bar chart)
    plt.figure(figsize=(10, 6))
    plt.bar(cluster_counts.index, cluster_counts.values, color=['blue', 'green', 'orange', 'red', 'purple'])
    plt.xlabel('Cluster Name')
    plt.ylabel('Number of Comments')
    plt.title('Distribution of Comments in Similar Sentiment and Special Cluster')
    plt.xticks(rotation=45)
    plt.show()
    st.pyplot(plt)

def run_clustering_for_ba_lainnya():
    st.title("Clustering for 'BA Lainnya'")

    if 'data_with_ba' not in st.session_state:
        st.error("No 'BA Lainnya' data found from the classification model. Please classify comments first.")
        st.stop()

    ba_lainnya_data = st.session_state['data_with_ba']

    st.write(f"**'BA Lainnya' Data:** {len(ba_lainnya_data)} rows")
    # Load keywords.json dari Hugging Face
    keywords_path = hf_hub_download(repo_id="zakyirhab0/Klasifikasi_Komentar", filename="keywords.json", repo_type="dataset")

    # Buka file yang telah diunduh
    with open(keywords_path, 'r') as f:
        keyword_dict = json.load(f)

    selected_candidate = st.session_state['candidate']
    candidate_keywords = keyword_dict.get(selected_candidate.replace(' ', '_'))

    if candidate_keywords is None:
        st.error("Keywords for the selected candidate not found.")
        st.stop()

    sarcasm_detector = pipeline('sentiment-analysis', model='unitary/toxic-bert')

    dataset_path = hf_hub_download(repo_id="zakyirhab0/Klasifikasi_Komentar", filename=f"datasetntbnew_{selected_candidate.lower().replace(' ', '_')}.xlsx", repo_type="dataset")
    corrected_comments_file = hf_hub_download(repo_id="zakyirhab0/Klasifikasi_Komentar", filename=f"corrected_comments_{selected_candidate.lower().replace(' ', '_')}.xlsx", repo_type="dataset")

    try:
        train_data = pd.read_excel(dataset_path)
        labeled_data = train_data[train_data['sentimen'].isin(['Co Likes', 'Co Support', 'Co Optimism', 'Co Negative', 'Co Sarkastic'])]

        st.write(f"**Labeled Data from Training Dataset:** {len(labeled_data)} rows")

        ba_lainnya_data['Cluster'] = None
        ba_lainnya_data['detected_by'] = None
        ba_lainnya_data['Cluster_Name'] = None
        ba_lainnya_data['corrected_by'] = "Not Corrected"
        ba_lainnya_data['Sentiment'] = None  # Ubah inisialisasi ke None untuk memastikan tidak ada nilai default yang salah

        ba_lainnya_data['Cluster_Name'] = ba_lainnya_data['Cluster_Name'].astype(str)
        ba_lainnya_data['corrected_by'] = ba_lainnya_data['corrected_by'].astype(str)
        ba_lainnya_data['Sentiment'] = ba_lainnya_data['Sentiment'].astype(str)

        for data in [ba_lainnya_data, labeled_data]:
            data['translated_emojis'] = data['komentar'].fillna('').astype(str).apply(translate_emojis)
            data['normalized_unicode'] = data['translated_emojis'].apply(normalize_unicode)
            data['reply_handled'] = data['normalized_unicode'].apply(handle_replies)
            data['clean_text'] = data['reply_handled'].apply(clean_text)
            data['translated_ntb'] = data['clean_text'].apply(lambda x: translate_text(x, {}))
            data['translated_slang'] = data['translated_ntb'].apply(lambda x: translate_text(x, {}))
            data['negation_handled'] = data['translated_slang'].apply(handle_negation)
            data['negation_handled'] = data['negation_handled'].fillna('')

        combined_data = ba_lainnya_data.copy()
        combined_data['Label'] = 'BA Lainnya'

        for sentimen in ['Co Likes', 'Co Support', 'Co Optimism', 'Co Negative', 'Co Sarkastic']:
            sentimen_data = labeled_data[labeled_data['sentimen'] == sentimen].copy()
            sentimen_data['Label'] = sentimen
            combined = pd.concat([combined_data, sentimen_data], ignore_index=True)

            if len(combined) < 2:
                st.warning(f"Not enough samples to cluster for {sentimen}.")
                continue

            vectorizer = TfidfVectorizer(ngram_range=(1, 1), max_features=5000)
            tfidf_matrix = vectorizer.fit_transform(combined['negation_handled'])

            st.write(f"Clustering 'BA Lainnya' comments similar to {sentimen}...")
            kmeans = KMeans(n_clusters=2, random_state=42)
            combined['Cluster'] = kmeans.fit_predict(tfidf_matrix)

            valid_indices = combined.index[:len(ba_lainnya_data)]
            valid_indices = valid_indices.intersection(ba_lainnya_data.index)

            ba_lainnya_data.loc[valid_indices, 'Cluster'] = combined.loc[valid_indices, 'Cluster']
            ba_lainnya_data.loc[ba_lainnya_data['Cluster'] == 0, 'Cluster_Name'] = f"{sentimen} Similar"
            ba_lainnya_data.loc[ba_lainnya_data['Cluster'] == 1, 'Cluster_Name'] = f"{sentimen} Dissimilar"
            ba_lainnya_data.loc[valid_indices, 'Sentiment'] = sentimen

        for index, row in ba_lainnya_data.iterrows():
            if row['Cluster_Name'].endswith('Dissimilar') or row['Cluster_Name'] == 'None':
                dissimilar_comment = ba_lainnya_data.loc[[index]].copy()
                for sentimen in ['Co Likes', 'Co Support', 'Co Optimism', 'Co Negative', 'Co Sarkastic']:
                    sentimen_data = labeled_data[labeled_data['sentimen'] == sentimen].copy()
                    combined = pd.concat([dissimilar_comment, sentimen_data], ignore_index=True)

                    if len(combined) < 2:
                        continue

                    tfidf_matrix = vectorizer.fit_transform(combined['negation_handled'])
                    if tfidf_matrix.shape[0] == 0:
                        continue

                    kmeans = KMeans(n_clusters=2, random_state=42)
                    combined['Cluster'] = kmeans.fit_predict(tfidf_matrix)

                    if len(combined) > 0 and combined.loc[0, 'Cluster'] == 0:
                        ba_lainnya_data.loc[index, 'Cluster_Name'] = f"{sentimen} Similar"
                        ba_lainnya_data.loc[index, 'Sentiment'] = sentimen
                        break
                else:
                    ba_lainnya_data.loc[index, 'Cluster_Name'] = 'Special Cluster'
                    ba_lainnya_data.loc[index, 'corrected_by'] = 'Special Cluster'
                    ba_lainnya_data.loc[index, 'Sentiment'] = 'Special Sentiment'

        ba_lainnya_data['Cluster_Name'] = ba_lainnya_data['Cluster_Name'].apply(lambda x: 'Special Cluster' if x == 'nan' else x)

        special_cluster_data = ba_lainnya_data[ba_lainnya_data['Cluster_Name'] == 'Special Cluster']
        if not special_cluster_data.empty:
            special_cluster_data = handle_special_cluster(special_cluster_data, candidate_keywords, sarcasm_detector)
            ba_lainnya_data.update(special_cluster_data)
            ba_lainnya_data.loc[special_cluster_data.index, 'corrected_by'] = 'Special Cluster'
            ba_lainnya_data.loc[special_cluster_data.index, 'Sentiment'] = 'Special Sentiment'

            st.warning("Some comments were not captured by the current keywords. Please add new keywords in the 'Update Keywords' section.")

            st.subheader("Detection Distribution in Special Cluster")
            detection_counts = special_cluster_data['detected_by'].value_counts()
            plt.figure(figsize=(10, 6))
            plt.bar(detection_counts.index, detection_counts.values, color=['blue', 'orange', 'red'])
            plt.xlabel('Detection Method')
            plt.ylabel('Number of Comments')
            plt.title('Detection Distribution in Special Cluster')
            plt.show()
            st.pyplot(plt)

            st.write("Top Keywords in Special Cluster")
            for sentiment, keywords in candidate_keywords.items():
                st.write(f"{sentiment}: {', '.join(keywords)}")

            st.subheader("Special Cluster Details")
            st.dataframe(special_cluster_data[['komentar', 'Cluster_Name', 'detected_by']])

        corrected_comments = load_corrected_comments(corrected_comments_file)
        display_cluster_visualization(ba_lainnya_data)

        st.subheader("Edit Sentiment in Table")

        # Tambahkan kolom apakah komentar sudah masuk ke corrected_comments
        ba_lainnya_data['Already Corrected'] = ba_lainnya_data['komentar'].isin(corrected_comments['komentar'])
        ba_lainnya_data['Corrected Sentiment'] = ba_lainnya_data.apply(
            lambda row: corrected_comments[corrected_comments['komentar'] == row['komentar']]['sentimen'].values[0]
            if row['Already Corrected'] else "", axis=1
        )

        # Tambahkan kolom checkbox untuk memilih komentar yang mau diedit
        ba_lainnya_data['Select'] = False

        # Dropdown untuk memilih cluster (bulk selection)
        selected_cluster = st.selectbox("Select a cluster to edit:", sorted(ba_lainnya_data['Cluster_Name'].unique()))

        # Jika cluster dipilih, auto-select semua komentar di cluster tersebut
        if selected_cluster:
            ba_lainnya_data.loc[ba_lainnya_data['Cluster_Name'] == selected_cluster, 'Select'] = True

        # Tampilkan tabel dengan checkbox & informasi sentimen
        edited_data = st.data_editor(
            ba_lainnya_data[['Select', 'komentar', 'Cluster_Name', 'Sentiment', 'Already Corrected', 'Corrected Sentiment']],  
            column_config={
                "Select": st.column_config.CheckboxColumn("✅ Select"),
                "komentar": st.column_config.TextColumn("Komentar"),
                "Cluster_Name": st.column_config.TextColumn("Cluster"),
                "Sentiment": st.column_config.TextColumn("Sentimen"),
                "Already Corrected": st.column_config.CheckboxColumn("✅ Sudah di Corrected?"),
                "Corrected Sentiment": st.column_config.TextColumn("Sentimen Corrected"),
            },
            disabled=["komentar", "Cluster_Name", "Already Corrected", "Corrected Sentiment"],  # Kolom yang gak bisa diedit
            use_container_width=True,
        )

        # Dropdown untuk memilih sentimen baru yang akan diterapkan ke komentar yang dicentang
        new_sentimen = st.selectbox("Select new sentiment for selected comments:", 
                                    ['Co Likes', 'Co Support', 'Co Optimism', 'Co Negative', 'Co Sarkastic'])

        # Jika tombol ditekan, update sentimen komentar yang dicentang
        if st.button("Update Sentiments"):
            selected_rows = edited_data[edited_data['Select']]  # Ambil data yang dicentang
            
            if selected_rows.empty:
                st.warning("No comments selected for update!")
            else:
                for index, row in selected_rows.iterrows():
                    ba_lainnya_data.at[index, 'Sentiment'] = new_sentimen
                    ba_lainnya_data.at[index, 'corrected_by'] = 'Manual Edit'

                    if ba_lainnya_data.at[index, 'komentar'] in corrected_comments['komentar'].values:
                        corrected_comments.loc[corrected_comments['komentar'] == ba_lainnya_data.at[index, 'komentar'], 'sentimen'] = new_sentimen
                    else:
                        new_entry = ba_lainnya_data.loc[[index]].copy()
                        new_entry['sentimen'] = new_sentimen
                        corrected_comments = pd.concat([corrected_comments, new_entry])

                corrected_comments.drop_duplicates(subset=['komentar'], keep='last', inplace=True)
                save_corrected_comments(corrected_comments, corrected_comments_file)
                st.success("Sentiments updated successfully!")
                st.rerun()

        st.subheader("Clustered Data")
        selected_cluster = st.selectbox("Select a cluster to view comments:", sorted(ba_lainnya_data['Cluster_Name'].unique()))
        cluster_comments = ba_lainnya_data[ba_lainnya_data['Cluster_Name'] == selected_cluster]
        st.dataframe(cluster_comments[['komentar', 'Cluster_Name']].head(100))

        new_sentimen = st.selectbox("Select new sentiment for this cluster:", ['Co Likes', 'Co Support', 'Co Optimism', 'Co Negative', 'Co Sarkastic'])
        if st.button("Update Sentiment for this cluster"):
            ba_lainnya_data.loc[ba_lainnya_data['Cluster_Name'] == selected_cluster, 'corrected_by'] = 'Batch Cluster'
            ba_lainnya_data.loc[ba_lainnya_data['Cluster_Name'] == selected_cluster, 'sentimen'] = new_sentimen
            st.success(f"Sentiment for cluster {selected_cluster} updated to {new_sentimen}")

            # Save and refresh Corrected Comments table and Cluster Visualization
            corrected_comments = pd.concat([corrected_comments, ba_lainnya_data[ba_lainnya_data['corrected_by'] != "Not Corrected"]])
            corrected_comments.drop_duplicates(subset=['komentar'], keep='last', inplace=True)
            save_corrected_comments(corrected_comments, corrected_comments_file)
            st.subheader("Corrected Comments")
            st.dataframe(corrected_comments[['komentar', 'Cluster_Name', 'corrected_by', 'sentimen']].head(100))
            display_cluster_visualization(ba_lainnya_data)

        st.subheader("Special Rules Based on Keywords")
        keyword = st.text_input("Enter a keyword to set a rule:")
        specific_cluster = st.selectbox("Select a cluster for this keyword:", sorted(ba_lainnya_data['Cluster_Name'].unique()))

        if keyword:
            new_cluster = st.selectbox("Select sentiment for this keyword:", ['Co Likes', 'Co Support', 'Co Optimism', 'Co Negative', 'Co Sarkastic'])
            if st.button("Apply Rule"):
                ba_lainnya_data.loc[ba_lainnya_data['komentar'].str.contains(keyword, case=False, na=False), 'Cluster_Name'] = new_cluster
                ba_lainnya_data.loc[ba_lainnya_data['komentar'].str.contains(keyword, case=False, na=False), 'detected_by'] = specific_cluster
                ba_lainnya_data.loc[ba_lainnya_data['komentar'].str.contains(keyword, case=False, na=False), 'corrected_by'] = 'Keyword Rule'
                ba_lainnya_data.loc[ba_lainnya_data['komentar'].str.contains(keyword, case=False, na=False), 'sentimen'] = new_cluster
                st.success(f"All comments containing '{keyword}' have been updated to '{new_cluster}' sentiment.")

                # Update keywords.json file to avoid duplicates
                if selected_candidate.replace(' ', '_') in keyword_dict:
                    if new_cluster in keyword_dict[selected_candidate.replace(' ', '_')]:
                        if keyword not in keyword_dict[selected_candidate.replace(' ', '_')][new_cluster]:
                            keyword_dict[selected_candidate.replace(' ', '_')][new_cluster].append(keyword)
                    else:
                        keyword_dict[selected_candidate.replace(' ', '_')][new_cluster] = [keyword]
                else:
                    keyword_dict[selected_candidate.replace(' ', '_')] = {new_cluster: [keyword]}

                with open('keywords.json', 'w') as f:
                    json.dump(keyword_dict, f)

                st.success(f"Keyword '{keyword}' has been added to the keyword list.")

                # Save and refresh Corrected Comments table and Cluster Visualization
                corrected_comments = pd.concat([corrected_comments, ba_lainnya_data[ba_lainnya_data['corrected_by'] != "Not Corrected"]])
                corrected_comments.drop_duplicates(subset=['komentar'], keep='last', inplace=True)
                save_corrected_comments(corrected_comments, corrected_comments_file)
                st.subheader("Corrected Comments")
                st.dataframe(corrected_comments[['komentar', 'Cluster_Name', 'corrected_by', 'sentimen']].head(100))
                display_cluster_visualization(ba_lainnya_data)

        st.subheader("Corrected Comments")
        corrected_comments = load_corrected_comments(corrected_comments_file)
        st.dataframe(corrected_comments[['komentar', 'Cluster_Name', 'corrected_by', 'sentimen']].head(100))

        st.subheader("Visual Representation of Corrected Comments")
        sentiment_counts = corrected_comments['sentimen'].value_counts()
        plt.figure(figsize=(10, 6))
        plt.bar(sentiment_counts.index, sentiment_counts.values, color=['blue', 'green', 'orange', 'red', 'purple'])
        plt.xlabel('Sentimen')
        plt.ylabel('Number of Corrected Comments')
        plt.title('Number of Corrected Comments by Sentiment')
        plt.show()
        st.pyplot(plt)

        st.subheader("Download Options")
        # Simpan data hasil clustering ke buffer Excel
        excel_buffer_cluster = io.BytesIO()
        with pd.ExcelWriter(excel_buffer_cluster, engine='xlsxwriter') as writer:
            ba_lainnya_data.to_excel(writer, index=False, sheet_name='Clustered Data')
        excel_buffer_cluster.seek(0)

        # Unduh data yang sudah diklusterisasi
        st.download_button(
            label=f"Download Clustered Data for {selected_candidate}",
            data=excel_buffer_cluster,
            file_name=f"clustered_data_{selected_candidate}.xlsx",
            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
        )

        # Upload data hasil clustering ke Hugging Face
        clustered_data_filename = f"clustered_data_{selected_candidate}.xlsx"
        ba_lainnya_data.to_excel(clustered_data_filename, index=False, sheet_name='Clustered Data')

        try:
            print("🚀 Mengunggah Clustered Data ke Hugging Face...")
            api = HfApi()
            api.upload_file(
                path_or_fileobj=clustered_data_filename,
                path_in_repo=clustered_data_filename,
                repo_id="zakyirhab0/Klasifikasi_Komentar",
                repo_type="dataset"
            )
            st.success(f"✅ Clustered Data berhasil diunggah ke Hugging Face: {clustered_data_filename}")
        except Exception as e:
            st.error(f"⚠ Gagal mengunggah Clustered Data ke Hugging Face: {e}")

        # Simpan hasil corrected comments ke training dataset
        st.subheader("Save Corrected Comments to Training Dataset")
        if st.button("Save Corrected Comments"):
            try:
                # Tambahkan timestamp untuk data yang baru disimpan
                corrected_comments['tanggal_masuk'] = pd.Timestamp.now().strftime('%Y-%m-%d')

                # Simpan ke training dataset yang ada di Hugging Face
                update_training_dataset(corrected_comments, st.session_state['candidate'])

                # Simpan juga di Hugging Face sebagai file
                corrected_comments_file = f"corrected_comments_{selected_candidate}.xlsx"
                corrected_comments.to_excel(corrected_comments_file, index=False, sheet_name='Corrected Comments')

                print("🚀 Mengunggah Corrected Comments ke Hugging Face...")
                api.upload_file(
                    path_or_fileobj=corrected_comments_file,
                    path_in_repo=corrected_comments_file,
                    repo_id="zakyirhab0/Klasifikasi_Komentar",
                    repo_type="dataset"
                )
                st.success(f"✅ Corrected Comments berhasil disimpan & diunggah ke Hugging Face: {corrected_comments_file}")

                # Bersihkan state untuk menghindari duplikasi penyimpanan
                st.session_state['data_with_ba'] = pd.DataFrame(columns=corrected_comments.columns)

                # Refresh otomatis setelah penyimpanan
                st.rerun()
            except Exception as e:
                st.error(f"⚠ Terjadi kesalahan saat menyimpan Corrected Comments: {e}")

    except FileNotFoundError:
        st.error(f"❌ Tidak ditemukan dataset untuk {selected_candidate}. Silakan tambahkan dataset terlebih dahulu.")
    except Exception as e:
        st.error(f"⚠ Terjadi kesalahan tidak terduga: {e}")


# === Sidebar Navigation === #
menu = st.sidebar.radio("Select a Feature", ["Model-Based Classification","Clustering for 'BA Lainnya'", "Update Keywords","View Training Dataset","Evaluate Data Train","Maximize Preprocessing","Retraining Model"])
if menu == "Model-Based Classification":
    st.title("Model-Based Classification")
    candidate = st.selectbox("Choose a candidate:", candidate_list)
    model_path = hf_hub_download(repo_id="zakyirhab0/Klasifikasi_Komentar", filename=f"best_rf_model_{candidate.replace(' ', '_').lower()}.joblib", repo_type="dataset")
    vectorizer_path = hf_hub_download(repo_id="zakyirhab0/Klasifikasi_Komentar", filename=f"tfidf_vectorizer_{candidate.replace(' ', '_').lower()}.joblib", repo_type="dataset")

    # Save the selected candidate to session state
    st.session_state['candidate'] = candidate

    uploaded_file = st.file_uploader("Upload an Excel file for classification", type=['xlsx'])

    @st.cache_data
    def load_model_and_vectorizer(model_path, vectorizer_path):
        """Load model and vectorizer, cache them for efficiency."""
        try:
            model = joblib.load(model_path)
            vectorizer = joblib.load(vectorizer_path)
            return model, vectorizer
        except FileNotFoundError:
            return None, None

    model, vectorizer = load_model_and_vectorizer(model_path, vectorizer_path)

    if not model or not vectorizer:
        st.error("Model or vectorizer not found for the selected candidate.")
        st.stop()

    # Save the vectorizer and model to session state
    st.session_state['vectorizer'] = vectorizer
    st.session_state['model'] = model

    @st.cache_data
    def preprocess_data(data):
        """Preprocess comments with batching for large datasets."""
        from joblib import Parallel, delayed
        
        def preprocess_batch(batch):
            batch['translated_emojis'] = batch['komentar'].apply(translate_emojis)
            batch['normalized_unicode'] = batch['translated_emojis'].apply(normalize_unicode)
            batch['reply_handled'] = batch['normalized_unicode'].apply(handle_replies)
            batch['clean_text'] = batch['reply_handled'].apply(clean_text)
            batch['translated_ntb'] = batch['clean_text'].apply(lambda x: translate_text(x, ntb_dict))
            batch['translated_slang'] = batch['translated_ntb'].apply(lambda x: translate_text(x, slang_dict))
            batch['negation_handled'] = batch['translated_slang'].apply(handle_negation)
            return batch
        
        batch_size = 10000  # Process 10,000 rows at a time
        batches = [data.iloc[i:i+batch_size] for i in range(0, len(data), batch_size)]
        processed_batches = Parallel(n_jobs=-1)(delayed(preprocess_batch)(batch) for batch in batches)
        return pd.concat(processed_batches, ignore_index=True)

    # Process uploaded file
    if uploaded_file:
        try:
            data = pd.read_excel(uploaded_file)
            if 'komentar' not in data.columns:
                st.error("The uploaded file must include a 'komentar' column.")
                st.stop()
            data = preprocess_data(data)
        except Exception as e:
            st.error(f"An error occurred while processing the file: {e}")
            st.stop()
    elif 'model_classified_data' in st.session_state:
        data = st.session_state['model_classified_data']
    else:
        st.info("Please upload a file for classification.")
        st.stop()

    # Transform comments into TF-IDF vectors
    try:
        tfidf_data = vectorizer.transform(data['negation_handled'].fillna(''))
        data['predicted_category'] = model.predict(tfidf_data)
        data['probabilities'] = model.predict_proba(tfidf_data).tolist()
        data['max_probability'] = data['probabilities'].apply(lambda x: max(x))
    except Exception as e:
        st.error(f"An error occurred during model prediction: {e}")
        st.stop()

    # Cache classified data
    st.session_state['model_classified_data'] = data

    # Interactive threshold adjustment
    st.subheader("Set Threshold for 'BA Lainnya'")
    threshold = st.slider("Threshold for tagging 'BA Lainnya'", min_value=0.0, max_value=1.0, value=0.80, step=0.01)

    # Apply threshold to tag "BA Lainnya"
    data['tag'] = data['max_probability'].apply(lambda x: 'BA Lainnya' if x < threshold else '')

    # Separate data for visualization
    data_without_ba = data[data['tag'] != 'BA Lainnya']
    data_with_ba = data[data['tag'] == 'BA Lainnya']

    # Save updated results to session state for dynamic updates
    st.session_state['data_without_ba'] = data_without_ba
    st.session_state['data_with_ba'] = data_with_ba

    # Preview Results
    st.subheader("Preview Results")

    st.write("### 1. Hasil Klasifikasi Tanpa Tag 'BA Lainnya'")
    if not data_without_ba.empty:
        st.dataframe(data_without_ba[['komentar', 'predicted_category', 'max_probability']])
    else:
        st.info("No high-probability classifications available.")

    st.write("### 2. Hasil Klasifikasi Dengan Tag 'BA Lainnya'")
    if not data_with_ba.empty:
        st.dataframe(data_with_ba[['komentar', 'predicted_category', 'max_probability']])
    else:
        st.info("No low-probability classifications available.")

    # Visualization: Sentiment Distribution
    st.subheader("Sentiment Distribution Visualization")

    def plot_distribution(data, title):
        sentiment_counts = data['predicted_category'].value_counts()
        fig, ax = plt.subplots()
        ax.bar(sentiment_counts.index, sentiment_counts.values)
        ax.set_title(title)
        ax.set_xlabel("Sentiments")
        ax.set_ylabel("Count")
        st.pyplot(fig)

    if not data_without_ba.empty:
        plot_distribution(data_without_ba, "Sentiment Distribution (Without 'BA Lainnya')")
    if not data_with_ba.empty:
        plot_distribution(data_with_ba, "Sentiment Distribution (With 'BA Lainnya')")

    # ====== DOWNLOAD HASIL KLASIFIKASI ======
    st.subheader("Download Results")

    # Simpan hasil klasifikasi ke buffer Excel
    excel_buffer = io.BytesIO()
    with pd.ExcelWriter(excel_buffer, engine='xlsxwriter') as writer:
        data.to_excel(writer, index=False, sheet_name='Classification Results')
    excel_buffer.seek(0)

    # Download hasil klasifikasi
    st.download_button(
        label="Download All Classification Results",
        data=excel_buffer,
        file_name=f"classification_results_{candidate}.xlsx",
        mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
    )

    # ====== UNGGAH HASIL KE HUGGING FACE ======
    classification_results_file = f"classification_results_{candidate}.xlsx"
    data.to_excel(classification_results_file, index=False, sheet_name='Classification Results')

    api = HfApi()
    api.upload_file(
        path_or_fileobj=classification_results_file,
        path_in_repo=classification_results_file,
        repo_id="zakyirhab0/Klasifikasi_Komentar",
        repo_type="dataset"
    )

    st.success(f"✅ {classification_results_file} berhasil diunggah ke Hugging Face!")

    # ====== SIMPAN NON-'BA LAINNYA' KE TRAINING DATASET ======
    st.subheader("Save Classified Data")

    if not data_without_ba.empty:
        st.info("Only Non-'BA Lainnya' comments will be saved to the training dataset.")
        
        if st.button("Save Non-'BA Lainnya' Data to Training Dataset"):
            try:
                # Persiapkan data untuk disimpan
                data_to_save = data_without_ba[['model_data', 'Platform', 'komentar', 'link', 'kandidat', 'tanggal']].copy()
                data_to_save['sentimen'] = data_without_ba['predicted_category']
                data_to_save['tanggal_masuk'] = pd.Timestamp.now().strftime('%Y-%m-%d')

                # Simpan dataset ke Hugging Face
                training_dataset_file = f"datasetntbnew_{candidate.lower().replace(' ', '_')}.xlsx"
                data_to_save.to_excel(training_dataset_file, index=False)

                api.upload_file(
                    path_or_fileobj=training_dataset_file,
                    path_in_repo=training_dataset_file,
                    repo_id="zakyirhab0/Klasifikasi_Komentar",
                    repo_type="dataset"
                )

                st.success(f"✅ Data berhasil disimpan ke training dataset di Hugging Face: {training_dataset_file}")

            except Exception as e:
                st.error(f"❌ Terjadi kesalahan saat menyimpan data: {e}")

    else:
        st.info("No Non-'BA Lainnya' data available to save.")

# Integrasi fungsi dalam halaman "Clustering for 'BA Lainnya'"
if menu == "Clustering for 'BA Lainnya'":
    selected_candidate = st.session_state['selected_candidate'] if 'selected_candidate' in st.session_state else None
    run_clustering_for_ba_lainnya()
    pass  # Placeholder

# Memastikan kode ini hanya dijalankan jika menu yang dipilih adalah "View Training Dataset"
if menu == "View Training Dataset":
    st.title("View Training Dataset")

    # Header untuk memilih kandidat
    st.header("Options")
    selected_candidate = st.selectbox("Choose a candidate:", list(candidate_list), key='candidate_select_view')

    # Path dataset
    dataset_path = hf_hub_download(repo_id="zakyirhab0/Klasifikasi_Komentar", filename=f"datasetntbnew_{selected_candidate.lower().replace(' ', '_')}.xlsx", repo_type="dataset")
    history_path = hf_hub_download(repo_id="zakyirhab0/Klasifikasi_Komentar", filename=f"history_{selected_candidate.lower().replace(' ', '_')}.json", repo_type="dataset")

    # Memuat dataset
    try:
        dataset_path = hf_hub_download(repo_id="zakyirhab0/Klasifikasi_Komentar", filename=f"datasetntbnew_{candidate.lower().replace(' ', '_')}.xlsx", repo_type="dataset")
        df = pd.read_excel(dataset_path)

        # Memastikan kolom yang diperlukan ada
        required_columns = ['model_data', 'Platform', 'komentar', 'sentimen', 'tanggal', 'tanggal_masuk', 'link', 'evaluated_by_data_train']
        for col in required_columns:
            if col not in df.columns:
                if col == 'evaluated_by_data_train':
                    df[col] = False

        # Menambahkan kolom 'update_ba' jika belum ada
        if 'update_ba' not in df.columns:
            df['update_ba'] = None

        # Menambahkan kolom 'missing_comment' jika belum ada
        if 'missing_comment' not in df.columns:
            df['missing_comment'] = False

        # Pastikan 'tanggal_masuk' dan 'tanggal' dalam format datetime yang benar
        df['tanggal_masuk'] = pd.to_datetime(df['tanggal_masuk'], errors='coerce')
        df['tanggal'] = pd.to_datetime(df['tanggal'], errors='coerce')

        # Konversi kolom tanggal agar hanya menyimpan bagian tanggal tanpa waktu
        df['tanggal'] = df['tanggal'].dt.date
        df['tanggal_masuk'] = df['tanggal_masuk'].dt.date
        df['update_ba'] = pd.to_datetime(df['update_ba'], errors='coerce').dt.date

        # Menangani nilai NaT (Not a Time) jika ada
        if df['tanggal_masuk'].isnull().any():
            st.warning("Some dates 'tanggal_masuk' could not be parsed correctly. Please check the date format in the dataset.")
            df['tanggal_masuk'].fillna(pd.Timestamp.now().date(), inplace=True)

        if df['tanggal'].isnull().any():
            st.warning("Some dates 'tanggal' could not be parsed correctly. Please check the date format in the dataset.")
            df['tanggal'].fillna(pd.Timestamp.now().date(), inplace=True)

        # Menambahkan kolom 'kandidat' jika belum ada dan mengisinya
        if 'kandidat' not in df.columns:
            df['kandidat'] = selected_candidate

        # Mengambil subset kolom yang diperlukan
        df = df[required_columns + ['update_ba', 'kandidat', 'missing_comment']]

        # Perbarui nilai None di update_ba dengan tanggal_masuk
        df['update_ba'].fillna(df['tanggal_masuk'], inplace=True)

        # Menampilkan statistik dasar
        st.subheader(f"Training Dataset for {selected_candidate}")
        st.write(f"**Total rows in dataset:** {len(df)}")

        if not df.empty:
            # Visualisasi sebaran update BA
            st.subheader("Visualisasi Postingan Berdasarkan Update BA")
            ba_update_counts = df['update_ba'].value_counts().sort_index()
            fig, ax = plt.subplots(figsize=(10, 6))
            ba_update_counts.plot(kind='bar', ax=ax, color='blue')
            ax.set_title('Sebaran Postingan Berdasarkan Update BA')
            ax.set_xlabel('Tanggal Update BA')
            ax.set_ylabel('Jumlah Postingan')
            plt.xticks(rotation=45)
            plt.tight_layout()
            st.pyplot(fig)

            # Visualisasi tambahan sebaran platform
            st.subheader("Sebaran Platform Berdasarkan Update BA")
            platform_counts = df['Platform'].value_counts()
            fig, ax = plt.subplots(figsize=(10, 6))
            platform_counts.plot(kind='bar', ax=ax, color='green')
            ax.set_title('Sebaran Platform Berdasarkan Update BA')
            ax.set_xlabel('Platform')
            ax.set_ylabel('Jumlah Postingan')
            plt.xticks(rotation=45)
            plt.tight_layout()
            st.pyplot(fig)

            # Visualisasi jumlah komentar hilang berdasarkan platform
            st.subheader("Jumlah Komentar Hilang Berdasarkan Platform")
            missing_comments_by_platform = df.groupby('Platform')['missing_comment'].sum().sort_index()
            fig, ax = plt.subplots(figsize=(10, 6))
            missing_comments_by_platform.plot(kind='bar', ax=ax, color='red')
            ax.set_title('Jumlah Komentar Hilang Berdasarkan Platform')
            ax.set_xlabel('Platform')
            ax.set_ylabel('Jumlah Komentar Hilang')
            plt.xticks(rotation=45)
            plt.tight_layout()
            st.pyplot(fig)

            # Filter berdasarkan status validasi
            st.subheader("Filter Data")
            validation_filter = st.radio(
                "Choose data type to view:",
                ["All Data", "Validated Data", "Non-Validated Data"],
                key='validation_filter'
            )

            if validation_filter == "Validated Data":
                filtered_data = df[df['evaluated_by_data_train'] == True]
            elif validation_filter == "Non-Validated Data":
                filtered_data = df[df['evaluated_by_data_train'] == False]
            else:
                filtered_data = df

            if not filtered_data.empty:
                st.subheader(f"Filtered Data: {validation_filter}")
                st.dataframe(filtered_data)  # Menampilkan semua data yang sesuai dengan filter
            else:
                st.warning("Tidak ada data yang sesuai dengan filter yang dipilih.")

            # Menampilkan riwayat penambahan data
            st.subheader("History of Data Additions")

            try:
                history_path = hf_hub_download(repo_id="zakyirhab0/Klasifikasi_Komentar", filename=f"history_{selected_candidate.lower().replace(' ', '_')}.json", repo_type="dataset")
                
                with open(history_path, "r") as f:
                    history = json.load(f)
                
                history_list = []
                for key, value in history.items():
                    for entry in value:
                        for k, v in entry.items():
                            history_list.append({
                                'key': key,
                                'field': k,
                                'date': v
                            })

                history_df = pd.DataFrame(history_list)
                st.dataframe(history_df)

            except Exception as e:
                st.warning("No addition history available.")
                st.error(f"Error: {e}")

            # Opsi untuk mengunduh dataset yang telah difilter
            st.subheader("Download Options")
            if not filtered_data.empty:
                filtered_dataset_file = f"filtered_training_dataset_{selected_candidate}.xlsx"
                filtered_data.to_excel(filtered_dataset_file, index=False, sheet_name='Filtered Dataset')

                # Streamlit Download
                st.download_button(
                    label=f"Download Filtered Dataset for {selected_candidate}",
                    data=open(filtered_dataset_file, "rb"),
                    file_name=filtered_dataset_file,
                    mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
                )

                # Upload ke Hugging Face
                api = HfApi()
                api.upload_file(
                    path_or_fileobj=filtered_dataset_file,
                    path_in_repo=filtered_dataset_file,
                    repo_id="zakyirhab0/Klasifikasi_Komentar",
                    repo_type="dataset"
                )

                st.success(f"✅ Filtered dataset berhasil diunggah ke Hugging Face: {filtered_dataset_file}")

            # Opsi untuk mengunduh seluruh dataset
            if not df.empty:
                full_dataset_file = f"training_dataset_{selected_candidate}.xlsx"
                df.to_excel(full_dataset_file, index=False, sheet_name='Training Dataset')

                # Streamlit Download
                st.download_button(
                    label=f"Download Full Training Dataset for {selected_candidate}",
                    data=open(full_dataset_file, "rb"),
                    file_name=full_dataset_file,
                    mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
                )

                # Upload ke Hugging Face
                api.upload_file(
                    path_or_fileobj=full_dataset_file,
                    path_in_repo=full_dataset_file,
                    repo_id="zakyirhab0/Klasifikasi_Komentar",
                    repo_type="dataset"
                )

                st.success(f"✅ Full training dataset berhasil diunggah ke Hugging Face: {full_dataset_file}")

            # Fitur untuk mengunduh dataset berdasarkan filter tanggal
            st.subheader("Download Aggregated Data by Date")

            # Mendapatkan nilai maksimum dan minimum tanggal untuk default date_input
            if not df['tanggal'].empty:
                min_date = df['tanggal'].min()
                max_date = df['tanggal'].max()

                selected_start_date = st.date_input("Select start date for aggregation:", value=min_date)
                selected_end_date = st.date_input("Select end date for aggregation:", value=max_date)

                agg_filtered_data = df[(df['tanggal'] >= selected_start_date) & (df['tanggal'] <= selected_end_date)]

                if not agg_filtered_data.empty:
                    aggregated_data = agg_filtered_data.groupby(['kandidat', 'link', 'tanggal', 'Platform', 'model_data', 'update_ba']).agg(
                        co_likes=('sentimen', lambda x: (x == 'Co Likes').sum()),
                        co_support=('sentimen', lambda x: (x == 'Co Support').sum()),
                        co_optimism=('sentimen', lambda x: (x == 'Co Optimism').sum()),
                        co_sarkastic=('sentimen', lambda x: (x == 'Co Sarkastic').sum()),
                        co_negative=('sentimen', lambda x: (x == 'Co Negative').sum()),
                        missing_comments=('missing_comment', 'sum')
                    ).reset_index()

                    aggregated_data['jumlah_komentar_positif'] = aggregated_data['co_likes'] + aggregated_data['co_support'] + aggregated_data['co_optimism']
                    aggregated_data['jumlah_komentar_negatif'] = aggregated_data['co_sarkastic'] + aggregated_data['co_negative']
                    aggregated_data['jumlah_komentar'] = aggregated_data[['co_likes', 'co_support', 'co_optimism', 'co_sarkastic', 'co_negative']].sum(axis=1)

                    st.dataframe(aggregated_data)

                    # Simpan dataset yang telah diagregasi
                    aggregated_dataset_file = f"aggregated_data_{selected_candidate}.xlsx"
                    aggregated_data.to_excel(aggregated_dataset_file, index=False, sheet_name='Aggregated Data')

                    # Streamlit Download
                    st.download_button(
                        label=f"Download Aggregated Data by Date for {selected_candidate}",
                        data=open(aggregated_dataset_file, "rb"),
                        file_name=aggregated_dataset_file,
                        mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
                    )

                    # Upload ke Hugging Face
                    api.upload_file(
                        path_or_fileobj=aggregated_dataset_file,
                        path_in_repo=aggregated_dataset_file,
                        repo_id="zakyirhab0/Klasifikasi_Komentar",
                        repo_type="dataset"
                    )

                    st.success(f"✅ Aggregated dataset berhasil diunggah ke Hugging Face: {aggregated_dataset_file}")
                else:
                    st.warning("Tidak ada data yang sesuai dengan rentang tanggal yang dipilih untuk diagregasi.")
                    st.dataframe(aggregated_data)

                    # Visualisasi jumlah postingan yang diupdate BA dengan sebaran platform berdasarkan rentang tanggal
                    st.subheader("Visualisasi Postingan yang Diupdate BA Berdasarkan Rentang Tanggal")
                    ba_update_range = aggregated_data[aggregated_data['update_ba'] != 'Belum diupdate']
                    if not ba_update_range.empty:
                        plt.figure(figsize=(10, 6))
                        ba_update_range['Platform'].value_counts().plot(kind='bar', title='Sebaran Platform - Diupdate BA (Rentang Tanggal)')
                        plt.xlabel('Platform')
                        plt.ylabel('Jumlah Postingan')
                        st.pyplot(plt)

                        # Tambahan visualisasi sebaran postingan berdasarkan tanggal
                        st.subheader("Sebaran Postingan Berdasarkan Tanggal")
                        plt.figure(figsize=(10, 6))
                        ba_update_range['tanggal'].value_counts().sort_index().plot(kind='bar', title='Sebaran Postingan Berdasarkan Tanggal')
                        plt.xlabel('Tanggal')
                        plt.ylabel('Jumlah Postingan')
                        plt.xticks(rotation=45)
                        plt.tight_layout()
                        st.pyplot(plt)

                        # Tambahan visualisasi sebaran update BA
                        st.subheader("Sebaran Update BA")
                        plt.figure(figsize=(10, 6))
                        ba_update_range['update_ba'].value_counts().sort_index().plot(kind='bar', title='Sebaran Update BA')
                        plt.xlabel('Tanggal Update BA')
                        plt.ylabel('Jumlah Postingan')
                        plt.xticks(rotation=45)
                        plt.tight_layout()
                        st.pyplot(plt)

                        # Visualisasi jumlah komentar berdasarkan tanggal_masuk
                        st.subheader("Jumlah Komentar Berdasarkan Tanggal Masuk")
                        plt.figure(figsize=(10, 6))
                        agg_filtered_data.groupby('tanggal_masuk')['komentar'].count().sort_index().plot(kind='bar', title='Jumlah Komentar Berdasarkan Tanggal Masuk')
                        plt.xlabel('Tanggal Masuk')
                        plt.ylabel('Jumlah Komentar')
                        plt.xticks(rotation=45)
                        plt.tight_layout()
                        st.pyplot(plt)

                        # Visualisasi sebaran komentar di tiap platform
                        st.subheader("Sebaran Komentar di Tiap Platform")
                        plt.figure(figsize=(10, 6))
                        agg_filtered_data['Platform'].value_counts().plot(kind='bar', title='Sebaran Komentar di Tiap Platform')
                        plt.xlabel('Platform')
                        plt.ylabel('Jumlah Komentar')
                        plt.xticks(rotation=45)
                        plt.tight_layout()
                        st.pyplot(plt)

                        # Visualisasi jumlah missing comments berdasarkan postingan
                        st.subheader("Jumlah Komentar Hilang Berdasarkan Postingan")
                        plt.figure(figsize=(10, 6))
                        aggregated_data.groupby('link')['missing_comments'].sum().sort_index().plot(kind='bar', title='Jumlah Komentar Hilang Berdasarkan Postingan')
                        plt.xlabel('Link')
                        plt.ylabel('Jumlah Komentar Hilang')
                        plt.xticks(rotation=45)
                        plt.tight_layout()
                        st.pyplot(plt)
                    else:
                        st.warning("Tidak ada data yang diupdate BA untuk rentang tanggal yang dipilih.")

                    # Opsi untuk mengunduh dataset yang telah diagregasi
                    excel_buffer_aggregated = io.BytesIO()
                    with pd.ExcelWriter(excel_buffer_aggregated, engine='xlsxwriter') as writer:
                        aggregated_data.to_excel(writer, index=False, sheet_name='Aggregated Data')
                    excel_buffer_aggregated.seek(0)

                    st.download_button(
                        label=f"Download Aggregated Data by Date for {selected_candidate}",
                        data=excel_buffer_aggregated,
                        file_name=f"aggregated_data_{selected_candidate}.xlsx",
                        mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
                    )
            else:
                st.warning("Tidak ada data yang sesuai dengan rentang tanggal yang dipilih untuk diagregasi.")
        else:
            st.warning("Tidak ada data dalam dataset untuk divisualisasikan.")
    
    except FileNotFoundError:
        st.error(f"No training dataset found for {selected_candidate}. Please add data to create the dataset.")
    except Exception as e:
        st.error(f"An error occurred: {e}")
pass

if menu == "Evaluate Data Train":
    st.title("Evaluate Data Train")

    selected_candidate = st.selectbox("Choose a candidate:", list(candidate_list), key='candidate_select_evaluate')

    try:
        # **📥 Load Dataset dari Hugging Face**
        dataset_path = hf_hub_download(repo_id="zakyirhab0/Klasifikasi_Komentar", filename=f"datasetntbnew_{selected_candidate.lower().replace(' ', '_')}.xlsx", repo_type="dataset")
        df = pd.read_excel(dataset_path)

        # **✅ Load existing keyword dictionary dari Hugging Face**
        try:
            keywords_path = hf_hub_download(repo_id="zakyirhab0/Klasifikasi_Komentar", filename="keywords.json", repo_type="dataset")
            with open(keywords_path, 'r', encoding="utf-8") as f:
                keyword_dict = json.load(f)

            st.success("✅ Keywords.json loaded successfully.")

        except FileNotFoundError:
            st.error("❌ Keywords.json file not found in Hugging Face. Please upload it first.")
            st.stop()

        except json.JSONDecodeError:
            st.error("❌ Keywords.json file is not a valid JSON format. Please check the file.")
            st.stop()

        # **🔹 Pastikan dataset memiliki data**
        if df.empty:
            st.warning(f"⚠ Dataset for {selected_candidate} is empty.")
        else:
            st.success(f"✅ Dataset for {selected_candidate} loaded successfully.")
            st.dataframe(df.head())  # Tampilkan 5 data pertama

    except FileNotFoundError:
        st.error(f"❌ No training dataset found for {selected_candidate}. Please upload the dataset to Hugging Face.")
    except Exception as e:
        st.error(f"⚠ An error occurred: {e}")

        # Select candidate-specific keywords
        candidate_key = selected_candidate.replace(' ', '_')
        candidate_keywords = keyword_dict.get(candidate_key)

        if not candidate_keywords:
            st.error(f"No keywords found for the selected candidate '{selected_candidate}'. Please update the 'keywords.json' file with appropriate keywords.")
            st.stop()

        keywords = [kw for sentiment_keywords in candidate_keywords.values() for kw in sentiment_keywords]

        # Validasi Konsistensi Data
        st.subheader("Data Consistency Validation")
        missing_values = df.isnull().sum()
        st.write("Missing values in each column:")
        st.write(missing_values)

        # Menghapus baris dengan nilai yang hilang
        st.write("Removing rows with missing values...")
        df.dropna(inplace=True)

        # Distribusi Sentimen
        st.subheader("Sentiment Distribution")
        sentiment_counts = df['sentimen'].value_counts()
        st.write("Number of comments for each sentiment:")
        st.write(sentiment_counts)

        # Kualitas Data
        st.subheader("Data Quality Check")
        invalid_entries = df[df['komentar'].str.len() == 0]
        st.write(f"Number of invalid comments (empty): {len(invalid_entries)}")
        if len(invalid_entries) > 0:
            st.write("Invalid comments (empty):")
            st.dataframe(invalid_entries.head(100))  # Preview 100 baris pertama

        # Deteksi Inkonistensi Sentimen
        st.subheader("Inconsistent Sentiment Labels")
        duplicate_comments = df[df.duplicated(subset=['komentar'], keep=False)]
        inconsistent_labels = duplicate_comments.groupby('komentar')['sentimen'].nunique()
        inconsistent_labels = inconsistent_labels[inconsistent_labels > 1]
        if not inconsistent_labels.empty:
            inconsistent_labels_df = duplicate_comments[duplicate_comments['komentar'].isin(inconsistent_labels.index)]
            st.write(f"Number of comments with inconsistent sentiment labels: {len(inconsistent_labels_df)}")
            st.dataframe(inconsistent_labels_df.head(100))  # Preview 100 baris pertama
        else:
            st.write("No comments with inconsistent sentiment labels found.")

        # Penanganan Masalah
        st.subheader("Problem Handling")

        # Menghapus komentar yang kosong
        st.write("Removing invalid (empty) comments...")
        df = df[df['komentar'].str.len() > 0]

        # Interaktif: Menangani komentar dengan label sentimen yang tidak konsisten
        st.write("Resolving inconsistent sentiment labels...")
        if not inconsistent_labels.empty:
            for index, row in inconsistent_labels_df.iterrows():
                st.write(f"Comment: {row['komentar']}")
                sentimen_options = df[df['komentar'] == row['komentar']]['sentimen'].unique().tolist()
                new_sentimen = st.selectbox("Select correct sentiment", sentimen_options, key=f'sentimen_{index}')
                if st.button("Update Sentiment", key=f'update_{index}'):
                    update_sentiment(index, new_sentimen)

        # Clustering menggunakan Keyword dan Model Sarkas
        st.write("Clustering comments using keywords and sarcasm model...")
        keyword_vectorizer = TfidfVectorizer(vocabulary=keywords)
        X_keywords = keyword_vectorizer.fit_transform(df['komentar'])
        kmeans = KMeans(n_clusters=10, random_state=0).fit(X_keywords)
        df['cluster'] = kmeans.labels_

        # Identifikasi kluster yang perlu ditinjau
        review_clusters = df[df['cluster'].isin(df['cluster'].value_counts()[df['cluster'].value_counts() > 10].index)]
        st.write("Clusters identified for review:")
        st.dataframe(review_clusters.head(100))  # Preview 100 baris pertama

        # Notifikasi Tambahkan Kata Kunci
        st.warning("Some comments were not captured by the current keywords. Please add new keywords in the 'Update Keywords' section.")

        # Visualisasi Klaster
        cluster_counts = df['cluster'].value_counts()
        st.write("Number of comments in each cluster:")
        st.write(cluster_counts)

        # Menampilkan hasil clustering
        st.write("Comments clustered by patterns:")
        st.dataframe(df.head(100))  # Preview 100 baris pertama

        # Export hasil analisis dan penanganan
        st.subheader("Export Final Data")

        # **1️⃣ Menyimpan data ke JSON dalam buffer**
        json_buffer = io.BytesIO()
        df.to_json(json_buffer, orient='records', lines=True)
        json_buffer.seek(0)

        # **2️⃣ Opsi untuk Download File Secara Lokal**
        st.download_button(
            label=f"⬇ Download Final Data for {selected_candidate}",
            data=json_buffer,
            file_name=f"final_data_{selected_candidate}.json",
            mime="application/json"
        )

        # **3️⃣ Opsi untuk Menyimpan ke Hugging Face**
        file_name = f"final_data_{selected_candidate}.json"

        if st.button("📤 Upload Final Data to Hugging Face"):
            try:
                # Simpan file sementara sebelum diunggah
                with open(file_name, "w", encoding="utf-8") as f:
                    df.to_json(f, orient='records', lines=True)

                # Upload ke Hugging Face
                api = HfApi()
                api.upload_file(
                    path_or_fileobj=file_name,
                    path_in_repo=file_name,
                    repo_id="zakyirhab0/Klasifikasi_Komentar",
                    repo_type="dataset"
                )

                st.success(f"✅ {file_name} berhasil diunggah ke Hugging Face.")

            except Exception as e:
                st.error(f"❌ Gagal mengunggah ke Hugging Face: {e}")

        # **4️⃣ Menangani FileNotFoundError & Error Lainnya**
        try:
            # Load dataset
            dataset_path = hf_hub_download(repo_id="zakyirhab0/Klasifikasi_Komentar", filename=file_name, repo_type="dataset")
            df = pd.read_json(dataset_path, lines=True)
            st.dataframe(df)
        except FileNotFoundError:
            st.error(f"❌ No training dataset found for {selected_candidate}. Please add data to create the dataset.")
        except Exception as e:
            st.error(f"⚠ An error occurred: {e}")
    pass

if menu == "Retraining Model":
    st.title("Retrain Model")
    selected_candidate = st.selectbox("Select a candidate to retrain the model:", list(candidate_list))

    # Load model & vectorizer dari Hugging Face Hub
    model_path = hf_hub_download(repo_id="zakyirhab0/Klasifikasi_Komentar", filename=f"best_rf_model_{selected_candidate.lower().replace(' ', '_')}.joblib", repo_type="dataset")
    vectorizer_path = hf_hub_download(repo_id="zakyirhab0/Klasifikasi_Komentar", filename=f"tfidf_vectorizer_{selected_candidate.lower().replace(' ', '_')}.joblib", repo_type="dataset")

    # Load dataset dari Hugging Face
    dataset_path = hf_hub_download(repo_id="zakyirhab0/Klasifikasi_Komentar", filename=f"datasetntbnew_{selected_candidate.lower().replace(' ', '_')}.xlsx", repo_type="dataset")
    retrain_history_path = hf_hub_download(repo_id="zakyirhab0/Klasifikasi_Komentar", filename=f"retrain_history_{selected_candidate.lower().replace(' ', '_')}.json", repo_type="dataset")  # Jika ingin menyimpan history retrain secara lokal

    try:
        # Load dataset
        data = pd.read_excel(dataset_path)

        # Ensure 'komentar' is string
        if 'komentar' not in data.columns:
            st.error("Dataset must include a 'komentar' column.")
            st.stop()
        else:
            # Convert all comments to string
            data['komentar'] = data['komentar'].fillna('').astype(str)

        # Separate validated and unvalidated data
        if 'evaluated_by_cluster' in data.columns:
            validated_data = data[data['evaluated_by_cluster'] == True]
            unvalidated_data = data[data['evaluated_by_cluster'] == False]
        else:
            validated_data = pd.DataFrame(columns=data.columns)
            unvalidated_data = data

        st.write(f"**Validated Data:** {len(validated_data)} rows")
        st.write(f"**Unvalidated Data:** {len(unvalidated_data)} rows")

        # Check if all data is validated
        if len(unvalidated_data) > 0:
            st.warning("Model retraining is only allowed if all data has been validated through 'Evaluate Clustering'. Please ensure all data is validated before retraining the model.")
            st.stop()

        # Combine all data for preprocessing
        combined_data = validated_data  # Only use validated data

        # Preprocessing Function
        @st.cache_data(show_spinner=True)
        def preprocess_data(data):
            from joblib import Parallel, delayed

            def preprocess_comment(comment):
                comment = translate_emojis(comment)
                comment = normalize_unicode(comment)
                comment = handle_replies(comment)
                comment = clean_text(comment)
                comment = translate_text(comment, ntb_dict)
                comment = translate_text(comment, slang_dict)
                comment = handle_negation(comment)
                return comment

            data['processed_comments'] = Parallel(n_jobs=-1)(
                delayed(preprocess_comment)(c) for c in data['komentar']
            )
            return data

        # Preprocessing
        st.write("Starting preprocessing...")
        combined_data = preprocess_data(combined_data)

        if st.button("Retrain Model"):
            # Vectorization
            st.write("Vectorizing data...")
            vectorizer = TfidfVectorizer(ngram_range=(1, 1), max_features=5000)
            X = vectorizer.fit_transform(combined_data['processed_comments'])
            y = combined_data['sentimen']

            # Split Data
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

            # Handle Class Imbalance with SMOTE
            st.write("Balancing data with SMOTE...")
            smote = SMOTE(random_state=42, n_jobs=-1)
            X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

            # Train Random Forest Model
            st.write("Training Random Forest model...")
            rf_model = RandomForestClassifier(n_estimators=200, max_depth=20, random_state=42)
            rf_model.fit(X_train_res, y_train_res)

            # Evaluate on Training Data
            st.write("Evaluating model...")
            y_pred_train = rf_model.predict(X_train)
            accuracy_train = accuracy_score(y_train, y_pred_train)
            report_train = classification_report(y_train, y_pred_train, output_dict=True)

            # Evaluate on Test Data
            y_pred_test = rf_model.predict(X_test)
            accuracy_test = accuracy_score(y_test, y_pred_test)
            report_test = classification_report(y_test, y_pred_test, output_dict=True)

            # Save Model and Vectorizer
            st.write("Saving model and vectorizer...")
            joblib.dump(rf_model, model_path)
            joblib.dump(vectorizer, vectorizer_path)

            # Log Retraining History
            st.write("Logging retraining history...")
            try:
                with open(retrain_history_path, "r") as f:
                    retrain_history = json.load(f)
            except FileNotFoundError:
                retrain_history = []

            retrain_history.append({
                "date_retrained": pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S'),
                "accuracy_on_train": accuracy_train,
                "accuracy_on_test": accuracy_test,
                "f1_score_on_train": report_train['weighted avg']['f1-score'],
                "f1_score_on_test": report_test['weighted avg']['f1-score'],
            })

            with open(retrain_history_path, "w") as f:
                json.dump(retrain_history, f, indent=4)

            # Display Results
            st.success(f"Model retrained successfully! Accuracy on training data: {accuracy_train:.4f}, Accuracy on test data: {accuracy_test:.4f}")
            st.subheader("Model Metrics on Training Data")
            st.table(pd.DataFrame(report_train).T)
            st.subheader("Model Metrics on Test Data")
            st.table(pd.DataFrame(report_test).T)

            # Show Retrain History
            st.subheader("Retrain History")
            st.json(retrain_history)

    except FileNotFoundError:
        st.error(f"No training dataset found for {selected_candidate}. Please add data to create the dataset.")
    except Exception as e:
        st.error(f"An unexpected error occurred: {e}")

    pass  # Placeholder

if menu == "Maximize Preprocessing":
    st.title("Maximize Preprocessing")
    
    # Load Dataset Train
    candidate = st.selectbox("Choose a candidate:", list(candidate_list))
    dataset_path = hf_hub_download(repo_id="zakyirhab0/Klasifikasi_Komentar", filename=f"datasetntbnew_{candidate.lower().replace(' ', '_')}.xlsx", repo_type="dataset")
    try:
        # Load dataset
        data = pd.read_excel(dataset_path)

        # Ensure 'komentar' is string
        if 'komentar' not in data.columns:
            st.error("Dataset must include a 'komentar' column.")
            st.stop()
        data['komentar'] = data['komentar'].fillna('').astype(str)

        # Preprocessing Steps
        @st.cache_data(show_spinner=True)
        def preprocess_data(data):
            st.write("Starting preprocessing...")
            data['translated_emojis'] = data['komentar'].apply(translate_emojis)
            data['normalized_unicode'] = data['translated_emojis'].apply(normalize_unicode)
            data['reply_handled'] = data['normalized_unicode'].apply(handle_replies)
            data['clean_text'] = data['reply_handled'].apply(clean_text)
            data['translated_ntb'] = data['clean_text'].apply(lambda x: translate_text(x, ntb_dict))
            data['translated_slang'] = data['translated_ntb'].apply(lambda x: translate_text(x, slang_dict))
            data['negation_handled'] = data['translated_slang'].apply(handle_negation)
            return data

        data = preprocess_data(data)

        # Check Unmapped Words
        st.subheader("Check for Unmapped Words")
        all_words = (word.lower() for comment in data['negation_handled'] for word in comment.split())
        unique_words = set(all_words)
        ntb_dict_keys = set(ntb_dict.keys())
        slang_dict_keys = set(slang_dict.keys())
        mapped_words = ntb_dict_keys.union(slang_dict_keys)
        unmapped_words = sorted(unique_words - mapped_words)

        if unmapped_words:
            st.write(f"Found **{len(unmapped_words)} unmapped words.**")

            # Pilihan jumlah kata yang ingin ditampilkan
            max_words = st.slider(
                "Select number of words to display:",
                min_value=10,
                max_value=len(unmapped_words),
                value=min(50, len(unmapped_words)),
                step=10,
            )

            # Buat DataFrame untuk tampilan tabel
            unmapped_df = pd.DataFrame(unmapped_words, columns=["Unmapped Words"])
            st.dataframe(unmapped_df.head(max_words))

            # Tampilkan jumlah total kata jika dibutuhkan
            st.caption(f"Showing {min(max_words, len(unmapped_words))} out of {len(unmapped_words)} unmapped words.")
        else:
            st.success("No unmapped words found!")

        # Add Words to Dictionary
        st.subheader("Add New Words to Dictionary")
        new_word = st.text_input("Enter new word:")
        normalized_word = st.text_input("Enter normalized form:")
        dictionary_choice = st.radio("Select dictionary to update:", ["Kamus Alay", "Kamus ntb"])

        if st.button("Add to Dictionary"):
            if new_word and normalized_word:
                if dictionary_choice == "Kamus Alay":
                    slang_dict[new_word.lower()] = normalized_word
                    st.success(f"Added '{new_word}' -> '{normalized_word}' to Kamus Alay.")
                elif dictionary_choice == "Kamus ntb":
                    ntb_dict[new_word.lower()] = normalized_word
                    st.success(f"Added '{new_word}' -> '{normalized_word}' to Kamus ntb.")
            else:
                st.warning("Please enter both the new word and its normalized form.")

        # Save Updates to Hugging Face
        st.subheader("Save Updated Dictionaries")

        if st.button("Save Kamus Alay"):
            kamus_alay_filename = "kamusalay.csv"

            # Simpan DataFrame ke file CSV sementara
            slang_df = pd.DataFrame(list(slang_dict.items()), columns=["slang", "formal"])
            slang_df.to_csv(kamus_alay_filename, index=False)

            # **📤 Upload ke Hugging Face**
            try:
                api = HfApi()
                api.upload_file(
                    path_or_fileobj=kamus_alay_filename,
                    path_in_repo=kamus_alay_filename,
                    repo_id="zakyirhab0/Klasifikasi_Komentar",
                    repo_type="dataset"
                )
                st.success(f"✅ Kamus Alay successfully saved and uploaded to Hugging Face!")
            except Exception as e:
                st.error(f"❌ Failed to upload Kamus Alay: {e}")

        if st.button("Save Kamus NTB"):
            kamus_ntb_filename = "ntb_dict.json"

            # Simpan dictionary ke JSON sementara
            try:
                with open(kamus_ntb_filename, 'w', encoding='utf-8') as f:
                    json.dump(ntb_dict, f, indent=4)

                # **📤 Upload ke Hugging Face**
                api.upload_file(
                    path_or_fileobj=kamus_ntb_filename,
                    path_in_repo=kamus_ntb_filename,
                    repo_id="zakyirhab0/Klasifikasi_Komentar",
                    repo_type="dataset"
                )
                st.success(f"✅ Kamus NTB successfully saved and uploaded to Hugging Face!")
            except Exception as e:
                st.error(f"❌ Failed to upload Kamus NTB: {e}")

    except FileNotFoundError:
        st.error(f"❌ No training dataset found for {candidate}. Please ensure the dataset is available.")
    except Exception as e:
        st.error(f"⚠ An unexpected error occurred: {e}")
    pass  # Placeholder

if menu == "Update Keywords":
    st.title("Update Keywords")

    # Load existing keyword dictionary from Hugging Face
    try:
        keywords_path = hf_hub_download(repo_id="zakyirhab0/Klasifikasi_Komentar", filename="keywords.json", repo_type="dataset")

        with open(keywords_path, 'r', encoding='utf-8') as f:
            keyword_dict = json.load(f)

        st.success("✅ keywords.json loaded successfully from Hugging Face!")

    except Exception as e:
        st.error(f"❌ Failed to load keywords.json from Hugging Face: {e}")
        keyword_dict = {}  # Set dictionary kosong jika gagal

    # Show current keywords
    st.subheader("Current Keywords")
    candidate = st.selectbox("Select candidate", list(keyword_dict.keys()))
    for sentiment, keywords in keyword_dict[candidate].items():
        st.write(f"{sentiment}: {', '.join(keywords)}")

    # Add new keyword
    st.subheader("Add New Keyword")
    new_keyword = st.text_input("Enter new keyword")
    selected_sentiment = st.selectbox("Select sentiment for new keyword", list(keyword_dict[candidate].keys()))

    # Tambah keyword dan unggah ke Hugging Face
    if st.button("Add Keyword"):
        if new_keyword and selected_sentiment:
            keyword_dict[candidate][selected_sentiment].append(new_keyword)
            save_keywords_to_huggingface(keyword_dict)  # Simpan ke Hugging Face
            st.success(f"✅ Keyword '{new_keyword}' added to {selected_sentiment} for {candidate}")
        else:
            st.error("❌ Please enter a keyword and select a sentiment")

    # Analyze Special Cluster
    st.subheader("Analyze Special Cluster")
    if 'ba_lainnya_data' in st.session_state:
        try:
            # Langsung Memuat Data `Special Cluster`
            special_cluster_data = st.session_state['ba_lainnya_data'][st.session_state['ba_lainnya_data']['Cluster_Name'] == 'Special Cluster']
            if special_cluster_data.empty:
                st.warning("No data found in Special Cluster.")
            else:
                st.write(f"Total comments in Special Cluster: {len(special_cluster_data)}")

                all_words_special = []
                for comment in special_cluster_data['negation_handled']:
                    comment = translate_emojis(comment)
                    comment = normalize_unicode(comment)
                    comment = handle_replies(comment)
                    comment = clean_text(comment)
                    comment = translate_text(comment, {})  # Adjust based on your dictionary
                    comment = handle_negation(comment)
                    words = preprocess_text(comment)
                    all_words_special.extend(words)

                # Calculate word frequencies
                word_freq_special = Counter(all_words_special)

                # Add slider to select number of words to display
                num_words_special = st.slider("Number of words to display (Special Cluster)", min_value=5, max_value=50, value=20)
                most_common_words_special = word_freq_special.most_common(num_words_special)

                # Display word frequencies as a table
                st.subheader(f"Top {num_words_special} Word Frequencies in Special Cluster")
                word_freq_df_special = pd.DataFrame(most_common_words_special, columns=['Word', 'Frequency'])
                st.dataframe(word_freq_df_special)

        except Exception as e:
            st.error(f"An error occurred: {e}")
    else:
        st.warning("No 'BA Lainnya' data found. Please classify comments first.")

    # Analyze Training Data
    st.subheader("Analyze Training Data")
    dataset_path = hf_hub_download(repo_id="zakyirhab0/Klasifikasi_Komentar", filename=f"datasetntbnew_{candidate.lower().replace(' ', '_')}.xlsx", repo_type="dataset")
    try:
        train_data = pd.read_excel(dataset_path)
        if train_data.empty:
            st.warning("Training dataset is empty.")
        else:
            all_words_train = []
            for comment in train_data['komentar'].astype(str):
                comment = translate_emojis(comment)
                comment = normalize_unicode(comment)
                comment = handle_replies(comment)
                comment = clean_text(comment)
                comment = translate_text(comment, {})  # Adjust based on your dictionary
                comment = handle_negation(comment)
                words = preprocess_text(comment)
                all_words_train.extend(words)

            # Calculate word frequencies
            word_freq_train = Counter(all_words_train)

            # Add slider to select number of words to display
            num_words_train = st.slider("Number of words to display (Training Data)", min_value=5, max_value=50, value=20)
            most_common_words_train = word_freq_train.most_common(num_words_train)

            # Display word frequencies as a table
            st.subheader(f"Top {num_words_train} Word Frequencies in Training Data")
            word_freq_df_train = pd.DataFrame(most_common_words_train, columns=['Word', 'Frequency'])
            st.dataframe(word_freq_df_train)

    except FileNotFoundError:
        st.error(f"Training dataset for {candidate} not found.")
    except Exception as e:
        st.error(f"An error occurred: {e}")

    # Ekspor Keywords (Download)
    st.subheader("Export Keywords")
    json_buffer = io.BytesIO()
    json_buffer.write(json.dumps(keyword_dict, indent=4).encode('utf-8'))
    json_buffer.seek(0)
    st.download_button(
        label="Download Keywords",
        data=json_buffer,
        file_name="keywords.json",
        mime="application/json"
    )

    # Impor Keywords (Upload & Sync ke Hugging Face)
    st.subheader("Import Keywords")
    uploaded_file = st.file_uploader("Choose a JSON file", type="json")
    if uploaded_file is not None:
        try:
            imported_keywords = json.load(uploaded_file)
            keyword_dict.update(imported_keywords)  # Update dictionary
            
            # Simpan kembali ke Hugging Face
            save_keywords_to_huggingface(keyword_dict)
            st.success("✅ Keywords imported and updated successfully!")
        except Exception as e:
            st.error(f"❌ Error importing keywords: {e}")
    pass