import io
import json
import os
import re
import unicodedata
from collections import Counter
from datetime import datetime

import emoji
import joblib
import matplotlib.pyplot as plt
import nltk
import pandas as pd
import streamlit as st
from huggingface_hub import HfApi, hf_hub_download, login
from imblearn.over_sampling import SMOTE
from nltk.corpus import stopwords
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from transformers import pipeline
from wordcloud import WordCloud

huggingface_token = os.getenv("HF_TOKEN")
if huggingface_token is None:
    st.error("❌ Token Hugging Face tidak ditemukan. Set token sebagai environment variable HF_TOKEN.")
    st.stop()
login(token=huggingface_token)

# Initialise the Hugging Face API client
api = HfApi()
repo_id = "zakyirhab0/Klasifikasi_Komentar"

# === Preprocessing Functions === #
candidate_list = ["Lalu Muhamad Iqbal", "Indah Dhamayanti Putri", "Zulkieflimansyah", "M Suhaili", "Sitti Rohmi Djalilah", "Musyafirin"]

# Download the stopwords list if it is not available yet
nltk.download('stopwords')
stop_words = set(stopwords.words('indonesian'))

# Make sure the "BA Lainnya" data is available
if 'data_with_ba' in st.session_state:
    ba_lainnya_data = st.session_state['data_with_ba']
else:
    ba_lainnya_data = None

def translate_emojis(text):
    if not isinstance(text, str):
        return ''  # Return an empty string if the input is not a string
    return ''.join(c for c in text if not emoji.is_emoji(c))  # Remove all emojis

def normalize_unicode(text):
    return unicodedata.normalize('NFKD', text)

def clean_text(text):
    text = str(text).casefold()  # Convert to lowercase
    text = re.sub(r'http\S+|www\S+', '', text)  # Remove URLs
    text = re.sub(r'[^a-z\s]', '', text)  # Remove non-alphabetic characters
    text = re.sub(r'\s+', ' ', text).strip()  # Normalize spaces
    return text

def handle_negation(text):
    negation_words = {"tidak", "bukan", "jangan", "belum", "kurang", "gagal", "sulit"}
    words = text.split()
    result = []
    skip_next = False
    for i, word in enumerate(words):
        if word in negation_words and i + 1 < len(words):
            result.append(f"{word}_{words[i + 1]}")  # Combine negation with next word
            skip_next = True
        elif skip_next:
            skip_next = False
        else:
            result.append(word)
    return ' '.join(result)

def handle_replies(text):
    text = re.sub(r'=--*@\w+', '', text)  # Remove multi-level reply patterns
    text = re.sub(r'=-*@\w+', '', text)  # Remove single-level reply patterns
    text = re.sub(r'@\w+', '', text)  # Remove standalone @username mentions
    return text

def translate_text(text, dictionary):
    words = text.split()
    return ' '.join([dictionary.get(word.lower(), word) for word in words])  # Translate words using dictionary

# Assign a sentiment to a comment based on keyword matches
def assign_sentiment_based_on_keywords(comment, keyword_dict):
    for sentiment, keywords in keyword_dict.items():
        if any(keyword in comment for keyword in keywords):
            return sentiment
    return 'unknown'

# === Load Dictionaries === #
def load_dictionary(file_path, file_type='json'):
    if file_type == 'json':
        with open(file_path, 'r', encoding='utf-8') as file:
            return
json.load(file) elif file_type == 'csv': df = pd.read_csv(file_path, names=['slang', 'formal']) return pd.Series(df['formal'].values, index=df['slang']).to_dict() ntb_dict_path = hf_hub_download(repo_id="zakyirhab0/Klasifikasi_Komentar", filename="ntb_dict.json", repo_type="dataset") slang_dict_path = hf_hub_download(repo_id="zakyirhab0/Klasifikasi_Komentar", filename="kamusalay.csv", repo_type="dataset") ntb_dict = load_dictionary(ntb_dict_path, 'json') slang_dict = load_dictionary(slang_dict_path, 'csv') # === Utility Functions === # # Fungsi untuk memperbarui file JSON dengan riwayat update def update_history_json(history_path, komentar, link, model_data, field, date): # Konversi Timestamp menjadi string if isinstance(date, pd.Timestamp): date = date.strftime('%Y-%m-%d') # Baca histori dari file JSON try: with open(history_path, 'r') as file: history_data = json.load(file) except (FileNotFoundError, json.JSONDecodeError): history_data = {} # Kunci riwayat untuk komentar, link, dan model data tertentu key = f"{komentar}_{link}_{model_data}" # Tambahkan riwayat baru ke kunci yang sesuai if key in history_data: history_data[key].append({field: date}) else: history_data[key] = [{field: date}] # Simpan JSON secara sementara with open("history_temp.json", "w") as file: json.dump(history_data, file, indent=4) # Upload ke Hugging Face Storage api = HfApi() api.upload_file( path_or_fileobj="history_temp.json", path_in_repo=f"history_{candidate.lower().replace(' ', '_')}.json", repo_id="zakyirhab0/Klasifikasi_Komentar", repo_type="dataset" ) # Fungsi untuk memperbarui dataset pelatihan dengan data baru def update_training_dataset(output, candidate): dataset_path = hf_hub_download(repo_id="zakyirhab0/Klasifikasi_Komentar", filename=f"datasetntbnew_{selected_candidate.lower().replace(' ', '_')}.xlsx", repo_type="dataset") history_path = hf_hub_download(repo_id="zakyirhab0/Klasifikasi_Komentar", filename=f"history_{candidate.lower().replace(' ', '_')}.json", repo_type="dataset") try: required_columns = ['model_data', 'Platform', 'komentar', 'link', 'kandidat', 'sentimen', 'tanggal', 'tanggal_masuk'] output = output[required_columns].copy() if 'predicted_category' in output.columns: output['sentimen'] = output['predicted_category'] output.drop(columns=['predicted_category'], inplace=True) output['tanggal_masuk'] = pd.Timestamp.now() if os.path.exists(dataset_path): existing_data = pd.read_excel(dataset_path) else: existing_data = pd.DataFrame(columns=required_columns) # Menambahkan kolom 'update_ba' jika belum ada if 'update_ba' not in existing_data.columns: existing_data['update_ba'] = None # Menambahkan kolom 'missing_comment' jika belum ada if 'missing_comment' not in existing_data.columns: existing_data['missing_comment'] = False # Langkah 1: Memeriksa Komentar yang Hilang train_comments = existing_data.groupby('link')['komentar'].apply(list).to_dict() new_comments = output.groupby('link')['komentar'].apply(list).to_dict() for link, comments in train_comments.items(): if link in new_comments: new_comment_set = set(new_comments[link]) for comment in comments: if comment not in new_comment_set: existing_data.loc[(existing_data['link'] == link) & (existing_data['komentar'] == comment), 'missing_comment'] = True else: existing_data.loc[(existing_data['link'] == link) & (existing_data['komentar'] == comment), 'missing_comment'] = False # Fungsi untuk memperbarui data def update_data(existing_data, new_data, history_path): for index, row in new_data.iterrows(): komentar = row['komentar'] link = 
row['link'] model_data = row['model_data'] tanggal_klasifikasi = pd.Timestamp.now() # Komentar Sama Sudah Ada existing_entry = existing_data[(existing_data['link'] == link) & (existing_data['komentar'] == komentar)] if not existing_entry.empty: existing_data.loc[existing_entry.index, 'update_ba'] = tanggal_klasifikasi update_history_json(history_path, komentar, link, model_data, 'update_ba', tanggal_klasifikasi) else: # Link Sama, Komentar Berbeda existing_link_entry = existing_data[(existing_data['link'] == link)] if not existing_link_entry.empty: new_row = row.copy() new_row['tanggal_masuk'] = tanggal_klasifikasi new_row['update_ba'] = tanggal_klasifikasi existing_data = pd.concat([existing_data, new_row.to_frame().T], ignore_index=True) update_history_json(history_path, komentar, link, model_data, 'tanggal_masuk', tanggal_klasifikasi) update_history_json(history_path, komentar, link, model_data, 'update_ba', tanggal_klasifikasi) else: # Link Baru new_row = row.copy() new_row['tanggal_masuk'] = tanggal_klasifikasi new_row['update_ba'] = tanggal_klasifikasi existing_data = pd.concat([existing_data, new_row.to_frame().T], ignore_index=True) update_history_json(history_path, komentar, link, model_data, 'tanggal_masuk', tanggal_klasifikasi) update_history_json(history_path, komentar, link, model_data, 'update_ba', tanggal_klasifikasi) # Perbarui nilai None di update_ba dengan tanggal_masuk existing_data['update_ba'] = pd.to_datetime(existing_data['update_ba'], errors='coerce') existing_data['update_ba'].fillna(existing_data['tanggal_masuk'], inplace=True) return existing_data updated_data = update_data(existing_data, output, history_path) updated_data.to_excel("dataset_temp.xlsx", index=False) api.upload_file( path_or_fileobj="dataset_temp.xlsx", path_in_repo=f"datasetntbnew_{candidate.lower().replace(' ', '_')}.xlsx", repo_id="zakyirhab0/Klasifikasi_Komentar", repo_type="dataset" ) st.success(f"Data successfully updated in {candidate}'s training dataset.") if 'missing_comment' in existing_data.columns and existing_data['missing_comment'].any(): st.subheader("Missing Comments") st.write("Comments that were found to be missing:") st.dataframe(existing_data[existing_data['missing_comment']]) except KeyError as e: st.error(f"Missing column in the dataset: {e}") except Exception as e: st.error(f"An error occurred: {e}") # Fungsi untuk menyimpan keyword dictionary ke Hugging Face def save_keywords_to_huggingface(keyword_dict): file_name = "keywords.json" # Simpan ke file JSON sementara with open(file_name, 'w', encoding='utf-8') as f: json.dump(keyword_dict, f, indent=4) # Upload ke Hugging Face api = HfApi() api.upload_file( path_or_fileobj=file_name, path_in_repo=file_name, repo_id="zakyirhab0/Klasifikasi_Komentar", repo_type="dataset" ) st.success(f"✅ Keywords updated and uploaded to Hugging Face!") # Load keywords dari Hugging Face def load_keywords_from_huggingface(): try: keywords_path = hf_hub_download(repo_id="zakyirhab0/Klasifikasi_Komentar", filename="keywords.json", repo_type="dataset") with open(keywords_path, 'r', encoding='utf-8') as f: return json.load(f) except Exception as e: st.error(f"❌ Error loading keywords: {e}") return {} def clustering_based_evaluation(df, n_clusters=10): st.write("Starting preprocessing...") df['translated_emojis'] = df['komentar'].fillna('').astype(str).apply(translate_emojis) df['normalized_unicode'] = df['translated_emojis'].apply(normalize_unicode) df['reply_handled'] = df['normalized_unicode'].apply(handle_replies) df['clean_text'] = 
df['reply_handled'].apply(clean_text) df['translated_ntb'] = df['clean_text'].apply(lambda x: translate_text(x, ntb_dict)) df['translated_slang'] = df['translated_ntb'].apply(lambda x: translate_text(x, slang_dict)) df['negation_handled'] = df['translated_slang'].apply(handle_negation) st.write("Generating TF-IDF vectors...") tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english') tfidf_matrix = tfidf_vectorizer.fit_transform(df['negation_handled']) st.write(f"Clustering into {n_clusters} clusters...") kmeans = KMeans(n_clusters=n_clusters, random_state=42) df['Cluster'] = kmeans.fit_predict(tfidf_matrix) st.write("Performing PCA for visualization...") pca = PCA(n_components=2) reduced_data = pca.fit_transform(tfidf_matrix.toarray()) df['PCA1'] = reduced_data[:, 0] df['PCA2'] = reduced_data[:, 1] st.write("Clustering completed successfully!") return df def load_and_process_data(dataset_path, history_path): dataset_path = hf_hub_download(repo_id="zakyirhab0/Klasifikasi_Komentar", filename=f"datasetntbnew_{candidate.lower().replace(' ', '_')}.xlsx", repo_type="dataset") df = pd.read_excel(dataset_path) df['tanggal_masuk'] = pd.to_datetime(df['tanggal_masuk'], errors='coerce') if df['tanggal_masuk'].isnull().any(): st.warning("Some dates could not be parsed correctly. Please check the date format in the dataset.") df['tanggal_masuk'].fillna(pd.Timestamp.now().strftime('%Y-%m-%d'), inplace=True) required_columns = ['model_data', 'Platform', 'komentar', 'sentimen', 'tanggal', 'tanggal_masuk', 'evaluated_by_cluster'] for col in required_columns: if col not in df.columns: if col == 'tanggal_masuk': df[col] = pd.Timestamp.now().strftime('%Y-%m-%d') elif col == 'evaluated_by_cluster': df[col] = False else: df[col] = None df = df[required_columns] try: with open(history_path, "r") as f: history = json.load(f) except FileNotFoundError: history = [] return df, history # Define the function to handle Special Cluster def handle_special_cluster(data, keywords, detector): for index, row in data.iterrows(): text = row['negation_handled'] # Cek apakah teks mengandung salah satu kata kunci detected = False for sentiment, words in keywords.items(): if any(word in text for word in words): data.loc[index, 'predicted_category'] = sentiment.replace('_', ' ') data.loc[index, 'detected_by'] = 'keyword' detected = True break if not detected: # Jika tidak ada kata kunci yang cocok, gunakan deteksi sarkasme result = detector(text) if result[0]['label'] == 'SARCASM': data.loc[index, 'predicted_category'] = 'co sarkastic' data.loc[index, 'detected_by'] = 'sarcasm' else: # Jika tidak ada sentimen yang cocok data.loc[index, 'predicted_category'] = 'Unknown' data.loc[index, 'detected_by'] = 'unknown' return data def preprocess_text(text): text = text.lower() text = re.sub(r'\W+', ' ', text) # Remove non-alphanumeric characters words = text.split() words = [word for word in words if word not in stop_words] return words def display_word_frequencies(words, num_words): st.subheader(f"Top {num_words} Words") for word, freq in words: st.write(f"{word}: {freq}") def update_sentiment(index, new_sentimen): df.loc[index, 'sentimen'] = new_sentimen st.write(f"Updated sentiment for comment at index {index} to {new_sentimen}") file_name = "corrected_comments.xlsx" # Fungsi untuk memuat data `Corrected Comments` dari Hugging Face atau lokal jika tidak ada def load_corrected_comments(): try: print("🔄 Mencoba memuat data dari Hugging Face...") corrected_comments_path = 
hf_hub_download(repo_id="zakyirhab0/Klasifikasi_Komentar", filename=file_name, repo_type="dataset") return pd.read_excel(corrected_comments_path, sheet_name='Corrected Comments') except Exception as e: print(f"⚠ Gagal memuat dari Hugging Face: {e}") # Coba load dari lokal jika ada if os.path.exists(file_name): print("🔄 Memuat data dari file lokal...") return pd.read_excel(file_name, sheet_name='Corrected Comments') # Jika tidak ada, return DataFrame kosong print("❌ Tidak ditemukan data corrected_comments. Menggunakan DataFrame kosong.") return pd.DataFrame(columns=['model_data', 'Platform', 'komentar', 'tanggal', 'link', 'sentimen', 'Cluster_Name', 'corrected_by', 'tanggal_masuk']) # Fungsi untuk menyimpan data `Corrected Comments` ke Hugging Face def save_corrected_comments(data): print("💾 Menyimpan corrected_comments.xlsx ke lokal...") data.to_excel(file_name, sheet_name='Corrected Comments', index=False) print("🚀 Mengunggah ke Hugging Face...") api = HfApi() api.upload_file( path_or_fileobj=file_name, path_in_repo=file_name, repo_id="zakyirhab0/Klasifikasi_Komentar", repo_type="dataset" ) print(f"✅ {file_name} berhasil diperbarui di Hugging Face.") # Load data saat pertama kali dijalankan corrected_comments = load_corrected_comments() # Fungsi untuk menampilkan visualisasi distribusi komentar dalam cluster def display_cluster_visualization(ba_lainnya_data): st.subheader("Cluster Visualization") # Menghitung jumlah komentar dalam kategori `Similar Sentiment` dan `Special Cluster` cluster_counts = ba_lainnya_data[ba_lainnya_data['Cluster_Name'].str.contains('Similar|Special Cluster')]['Cluster_Name'].value_counts() # Membuat grafik batang (bar chart) plt.figure(figsize=(10, 6)) plt.bar(cluster_counts.index, cluster_counts.values, color=['blue', 'green', 'orange', 'red', 'purple']) plt.xlabel('Cluster Name') plt.ylabel('Number of Comments') plt.title('Distribution of Comments in Similar Sentiment and Special Cluster') plt.xticks(rotation=45) plt.show() st.pyplot(plt) def run_clustering_for_ba_lainnya(): st.title("Clustering for 'BA Lainnya'") if 'data_with_ba' not in st.session_state: st.error("No 'BA Lainnya' data found from the classification model. 
Please classify comments first.") st.stop() ba_lainnya_data = st.session_state['data_with_ba'] st.write(f"**'BA Lainnya' Data:** {len(ba_lainnya_data)} rows") # Load keywords.json dari Hugging Face keywords_path = hf_hub_download(repo_id="zakyirhab0/Klasifikasi_Komentar", filename="keywords.json", repo_type="dataset") # Buka file yang telah diunduh with open(keywords_path, 'r') as f: keyword_dict = json.load(f) selected_candidate = st.session_state['candidate'] candidate_keywords = keyword_dict.get(selected_candidate.replace(' ', '_')) if candidate_keywords is None: st.error("Keywords for the selected candidate not found.") st.stop() sarcasm_detector = pipeline('sentiment-analysis', model='unitary/toxic-bert') dataset_path = hf_hub_download(repo_id="zakyirhab0/Klasifikasi_Komentar", filename=f"datasetntbnew_{selected_candidate.lower().replace(' ', '_')}.xlsx", repo_type="dataset") corrected_comments_file = hf_hub_download(repo_id="zakyirhab0/Klasifikasi_Komentar", filename=f"corrected_comments_{selected_candidate.lower().replace(' ', '_')}.xlsx", repo_type="dataset") try: train_data = pd.read_excel(dataset_path) labeled_data = train_data[train_data['sentimen'].isin(['Co Likes', 'Co Support', 'Co Optimism', 'Co Negative', 'Co Sarkastic'])] st.write(f"**Labeled Data from Training Dataset:** {len(labeled_data)} rows") ba_lainnya_data['Cluster'] = None ba_lainnya_data['detected_by'] = None ba_lainnya_data['Cluster_Name'] = None ba_lainnya_data['corrected_by'] = "Not Corrected" ba_lainnya_data['Sentiment'] = None # Ubah inisialisasi ke None untuk memastikan tidak ada nilai default yang salah ba_lainnya_data['Cluster_Name'] = ba_lainnya_data['Cluster_Name'].astype(str) ba_lainnya_data['corrected_by'] = ba_lainnya_data['corrected_by'].astype(str) ba_lainnya_data['Sentiment'] = ba_lainnya_data['Sentiment'].astype(str) for data in [ba_lainnya_data, labeled_data]: data['translated_emojis'] = data['komentar'].fillna('').astype(str).apply(translate_emojis) data['normalized_unicode'] = data['translated_emojis'].apply(normalize_unicode) data['reply_handled'] = data['normalized_unicode'].apply(handle_replies) data['clean_text'] = data['reply_handled'].apply(clean_text) data['translated_ntb'] = data['clean_text'].apply(lambda x: translate_text(x, {})) data['translated_slang'] = data['translated_ntb'].apply(lambda x: translate_text(x, {})) data['negation_handled'] = data['translated_slang'].apply(handle_negation) data['negation_handled'] = data['negation_handled'].fillna('') combined_data = ba_lainnya_data.copy() combined_data['Label'] = 'BA Lainnya' for sentimen in ['Co Likes', 'Co Support', 'Co Optimism', 'Co Negative', 'Co Sarkastic']: sentimen_data = labeled_data[labeled_data['sentimen'] == sentimen].copy() sentimen_data['Label'] = sentimen combined = pd.concat([combined_data, sentimen_data], ignore_index=True) if len(combined) < 2: st.warning(f"Not enough samples to cluster for {sentimen}.") continue vectorizer = TfidfVectorizer(ngram_range=(1, 1), max_features=5000) tfidf_matrix = vectorizer.fit_transform(combined['negation_handled']) st.write(f"Clustering 'BA Lainnya' comments similar to {sentimen}...") kmeans = KMeans(n_clusters=2, random_state=42) combined['Cluster'] = kmeans.fit_predict(tfidf_matrix) valid_indices = combined.index[:len(ba_lainnya_data)] valid_indices = valid_indices.intersection(ba_lainnya_data.index) ba_lainnya_data.loc[valid_indices, 'Cluster'] = combined.loc[valid_indices, 'Cluster'] ba_lainnya_data.loc[ba_lainnya_data['Cluster'] == 0, 'Cluster_Name'] = f"{sentimen} Similar" 
ba_lainnya_data.loc[ba_lainnya_data['Cluster'] == 1, 'Cluster_Name'] = f"{sentimen} Dissimilar" ba_lainnya_data.loc[valid_indices, 'Sentiment'] = sentimen for index, row in ba_lainnya_data.iterrows(): if row['Cluster_Name'].endswith('Dissimilar') or row['Cluster_Name'] == 'None': dissimilar_comment = ba_lainnya_data.loc[[index]].copy() for sentimen in ['Co Likes', 'Co Support', 'Co Optimism', 'Co Negative', 'Co Sarkastic']: sentimen_data = labeled_data[labeled_data['sentimen'] == sentimen].copy() combined = pd.concat([dissimilar_comment, sentimen_data], ignore_index=True) if len(combined) < 2: continue tfidf_matrix = vectorizer.fit_transform(combined['negation_handled']) if tfidf_matrix.shape[0] == 0: continue kmeans = KMeans(n_clusters=2, random_state=42) combined['Cluster'] = kmeans.fit_predict(tfidf_matrix) if len(combined) > 0 and combined.loc[0, 'Cluster'] == 0: ba_lainnya_data.loc[index, 'Cluster_Name'] = f"{sentimen} Similar" ba_lainnya_data.loc[index, 'Sentiment'] = sentimen break else: ba_lainnya_data.loc[index, 'Cluster_Name'] = 'Special Cluster' ba_lainnya_data.loc[index, 'corrected_by'] = 'Special Cluster' ba_lainnya_data.loc[index, 'Sentiment'] = 'Special Sentiment' ba_lainnya_data['Cluster_Name'] = ba_lainnya_data['Cluster_Name'].apply(lambda x: 'Special Cluster' if x == 'nan' else x) special_cluster_data = ba_lainnya_data[ba_lainnya_data['Cluster_Name'] == 'Special Cluster'] if not special_cluster_data.empty: special_cluster_data = handle_special_cluster(special_cluster_data, candidate_keywords, sarcasm_detector) ba_lainnya_data.update(special_cluster_data) ba_lainnya_data.loc[special_cluster_data.index, 'corrected_by'] = 'Special Cluster' ba_lainnya_data.loc[special_cluster_data.index, 'Sentiment'] = 'Special Sentiment' st.warning("Some comments were not captured by the current keywords. 
Please add new keywords in the 'Update Keywords' section.") st.subheader("Detection Distribution in Special Cluster") detection_counts = special_cluster_data['detected_by'].value_counts() plt.figure(figsize=(10, 6)) plt.bar(detection_counts.index, detection_counts.values, color=['blue', 'orange', 'red']) plt.xlabel('Detection Method') plt.ylabel('Number of Comments') plt.title('Detection Distribution in Special Cluster') plt.show() st.pyplot(plt) st.write("Top Keywords in Special Cluster") for sentiment, keywords in candidate_keywords.items(): st.write(f"{sentiment}: {', '.join(keywords)}") st.subheader("Special Cluster Details") st.dataframe(special_cluster_data[['komentar', 'Cluster_Name', 'detected_by']]) corrected_comments = load_corrected_comments(corrected_comments_file) display_cluster_visualization(ba_lainnya_data) st.subheader("Edit Sentiment in Table") # Tambahkan kolom apakah komentar sudah masuk ke corrected_comments ba_lainnya_data['Already Corrected'] = ba_lainnya_data['komentar'].isin(corrected_comments['komentar']) ba_lainnya_data['Corrected Sentiment'] = ba_lainnya_data.apply( lambda row: corrected_comments[corrected_comments['komentar'] == row['komentar']]['sentimen'].values[0] if row['Already Corrected'] else "", axis=1 ) # Tambahkan kolom checkbox untuk memilih komentar yang mau diedit ba_lainnya_data['Select'] = False # Dropdown untuk memilih cluster (bulk selection) selected_cluster = st.selectbox("Select a cluster to edit:", sorted(ba_lainnya_data['Cluster_Name'].unique())) # Jika cluster dipilih, auto-select semua komentar di cluster tersebut if selected_cluster: ba_lainnya_data.loc[ba_lainnya_data['Cluster_Name'] == selected_cluster, 'Select'] = True # Tampilkan tabel dengan checkbox & informasi sentimen edited_data = st.data_editor( ba_lainnya_data[['Select', 'komentar', 'Cluster_Name', 'Sentiment', 'Already Corrected', 'Corrected Sentiment']], column_config={ "Select": st.column_config.CheckboxColumn("✅ Select"), "komentar": st.column_config.TextColumn("Komentar"), "Cluster_Name": st.column_config.TextColumn("Cluster"), "Sentiment": st.column_config.TextColumn("Sentimen"), "Already Corrected": st.column_config.CheckboxColumn("✅ Sudah di Corrected?"), "Corrected Sentiment": st.column_config.TextColumn("Sentimen Corrected"), }, disabled=["komentar", "Cluster_Name", "Already Corrected", "Corrected Sentiment"], # Kolom yang gak bisa diedit use_container_width=True, ) # Dropdown untuk memilih sentimen baru yang akan diterapkan ke komentar yang dicentang new_sentimen = st.selectbox("Select new sentiment for selected comments:", ['Co Likes', 'Co Support', 'Co Optimism', 'Co Negative', 'Co Sarkastic']) # Jika tombol ditekan, update sentimen komentar yang dicentang if st.button("Update Sentiments"): selected_rows = edited_data[edited_data['Select']] # Ambil data yang dicentang if selected_rows.empty: st.warning("No comments selected for update!") else: for index, row in selected_rows.iterrows(): ba_lainnya_data.at[index, 'Sentiment'] = new_sentimen ba_lainnya_data.at[index, 'corrected_by'] = 'Manual Edit' if ba_lainnya_data.at[index, 'komentar'] in corrected_comments['komentar'].values: corrected_comments.loc[corrected_comments['komentar'] == ba_lainnya_data.at[index, 'komentar'], 'sentimen'] = new_sentimen else: new_entry = ba_lainnya_data.loc[[index]].copy() new_entry['sentimen'] = new_sentimen corrected_comments = pd.concat([corrected_comments, new_entry]) corrected_comments.drop_duplicates(subset=['komentar'], keep='last', inplace=True) 
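# ---------------------------------------------------------------------------
# Illustrative sketch (standalone, not wired into the app): the button handler
# above "upserts" rows into corrected_comments — overwrite the sentiment when
# the comment already exists, append it otherwise, then deduplicate on
# 'komentar' keeping the latest row. The helper name upsert_corrections and
# the minimal column set are ours; the pattern mirrors the code above.
# ---------------------------------------------------------------------------
import pandas as pd

def upsert_corrections(corrected: pd.DataFrame, edits: pd.DataFrame) -> pd.DataFrame:
    """Merge manual sentiment edits into the corrected-comments table.

    Both frames are expected to carry at least 'komentar' and 'sentimen';
    the most recent edit for each comment wins.
    """
    merged = pd.concat([corrected, edits], ignore_index=True)
    merged = merged.drop_duplicates(subset=['komentar'], keep='last')
    return merged.reset_index(drop=True)

# Example usage with toy data:
# corrected = pd.DataFrame({'komentar': ['bagus sekali'], 'sentimen': ['Co Likes']})
# edits = pd.DataFrame({'komentar': ['bagus sekali', 'tidak setuju'],
#                       'sentimen': ['Co Support', 'Co Negative']})
# corrected = upsert_corrections(corrected, edits)  # 2 rows, 'Co Support' wins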
save_corrected_comments(corrected_comments, corrected_comments_file) st.success("Sentiments updated successfully!") st.rerun() st.subheader("Clustered Data") selected_cluster = st.selectbox("Select a cluster to view comments:", sorted(ba_lainnya_data['Cluster_Name'].unique())) cluster_comments = ba_lainnya_data[ba_lainnya_data['Cluster_Name'] == selected_cluster] st.dataframe(cluster_comments[['komentar', 'Cluster_Name']].head(100)) new_sentimen = st.selectbox("Select new sentiment for this cluster:", ['Co Likes', 'Co Support', 'Co Optimism', 'Co Negative', 'Co Sarkastic']) if st.button("Update Sentiment for this cluster"): ba_lainnya_data.loc[ba_lainnya_data['Cluster_Name'] == selected_cluster, 'corrected_by'] = 'Batch Cluster' ba_lainnya_data.loc[ba_lainnya_data['Cluster_Name'] == selected_cluster, 'sentimen'] = new_sentimen st.success(f"Sentiment for cluster {selected_cluster} updated to {new_sentimen}") # Save and refresh Corrected Comments table and Cluster Visualization corrected_comments = pd.concat([corrected_comments, ba_lainnya_data[ba_lainnya_data['corrected_by'] != "Not Corrected"]]) corrected_comments.drop_duplicates(subset=['komentar'], keep='last', inplace=True) save_corrected_comments(corrected_comments, corrected_comments_file) st.subheader("Corrected Comments") st.dataframe(corrected_comments[['komentar', 'Cluster_Name', 'corrected_by', 'sentimen']].head(100)) display_cluster_visualization(ba_lainnya_data) st.subheader("Special Rules Based on Keywords") keyword = st.text_input("Enter a keyword to set a rule:") specific_cluster = st.selectbox("Select a cluster for this keyword:", sorted(ba_lainnya_data['Cluster_Name'].unique())) if keyword: new_cluster = st.selectbox("Select sentiment for this keyword:", ['Co Likes', 'Co Support', 'Co Optimism', 'Co Negative', 'Co Sarkastic']) if st.button("Apply Rule"): ba_lainnya_data.loc[ba_lainnya_data['komentar'].str.contains(keyword, case=False, na=False), 'Cluster_Name'] = new_cluster ba_lainnya_data.loc[ba_lainnya_data['komentar'].str.contains(keyword, case=False, na=False), 'detected_by'] = specific_cluster ba_lainnya_data.loc[ba_lainnya_data['komentar'].str.contains(keyword, case=False, na=False), 'corrected_by'] = 'Keyword Rule' ba_lainnya_data.loc[ba_lainnya_data['komentar'].str.contains(keyword, case=False, na=False), 'sentimen'] = new_cluster st.success(f"All comments containing '{keyword}' have been updated to '{new_cluster}' sentiment.") # Update keywords.json file to avoid duplicates if selected_candidate.replace(' ', '_') in keyword_dict: if new_cluster in keyword_dict[selected_candidate.replace(' ', '_')]: if keyword not in keyword_dict[selected_candidate.replace(' ', '_')][new_cluster]: keyword_dict[selected_candidate.replace(' ', '_')][new_cluster].append(keyword) else: keyword_dict[selected_candidate.replace(' ', '_')][new_cluster] = [keyword] else: keyword_dict[selected_candidate.replace(' ', '_')] = {new_cluster: [keyword]} with open('keywords.json', 'w') as f: json.dump(keyword_dict, f) st.success(f"Keyword '{keyword}' has been added to the keyword list.") # Save and refresh Corrected Comments table and Cluster Visualization corrected_comments = pd.concat([corrected_comments, ba_lainnya_data[ba_lainnya_data['corrected_by'] != "Not Corrected"]]) corrected_comments.drop_duplicates(subset=['komentar'], keep='last', inplace=True) save_corrected_comments(corrected_comments, corrected_comments_file) st.subheader("Corrected Comments") st.dataframe(corrected_comments[['komentar', 'Cluster_Name', 'corrected_by', 
'sentimen']].head(100)) display_cluster_visualization(ba_lainnya_data) st.subheader("Corrected Comments") corrected_comments = load_corrected_comments(corrected_comments_file) st.dataframe(corrected_comments[['komentar', 'Cluster_Name', 'corrected_by', 'sentimen']].head(100)) st.subheader("Visual Representation of Corrected Comments") sentiment_counts = corrected_comments['sentimen'].value_counts() plt.figure(figsize=(10, 6)) plt.bar(sentiment_counts.index, sentiment_counts.values, color=['blue', 'green', 'orange', 'red', 'purple']) plt.xlabel('Sentimen') plt.ylabel('Number of Corrected Comments') plt.title('Number of Corrected Comments by Sentiment') plt.show() st.pyplot(plt) st.subheader("Download Options") # Simpan data hasil clustering ke buffer Excel excel_buffer_cluster = io.BytesIO() with pd.ExcelWriter(excel_buffer_cluster, engine='xlsxwriter') as writer: ba_lainnya_data.to_excel(writer, index=False, sheet_name='Clustered Data') excel_buffer_cluster.seek(0) # Unduh data yang sudah diklusterisasi st.download_button( label=f"Download Clustered Data for {selected_candidate}", data=excel_buffer_cluster, file_name=f"clustered_data_{selected_candidate}.xlsx", mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" ) # Upload data hasil clustering ke Hugging Face clustered_data_filename = f"clustered_data_{selected_candidate}.xlsx" ba_lainnya_data.to_excel(clustered_data_filename, index=False, sheet_name='Clustered Data') try: print("🚀 Mengunggah Clustered Data ke Hugging Face...") api = HfApi() api.upload_file( path_or_fileobj=clustered_data_filename, path_in_repo=clustered_data_filename, repo_id="zakyirhab0/Klasifikasi_Komentar", repo_type="dataset" ) st.success(f"✅ Clustered Data berhasil diunggah ke Hugging Face: {clustered_data_filename}") except Exception as e: st.error(f"⚠ Gagal mengunggah Clustered Data ke Hugging Face: {e}") # Simpan hasil corrected comments ke training dataset st.subheader("Save Corrected Comments to Training Dataset") if st.button("Save Corrected Comments"): try: # Tambahkan timestamp untuk data yang baru disimpan corrected_comments['tanggal_masuk'] = pd.Timestamp.now().strftime('%Y-%m-%d') # Simpan ke training dataset yang ada di Hugging Face update_training_dataset(corrected_comments, st.session_state['candidate']) # Simpan juga di Hugging Face sebagai file corrected_comments_file = f"corrected_comments_{selected_candidate}.xlsx" corrected_comments.to_excel(corrected_comments_file, index=False, sheet_name='Corrected Comments') print("🚀 Mengunggah Corrected Comments ke Hugging Face...") api.upload_file( path_or_fileobj=corrected_comments_file, path_in_repo=corrected_comments_file, repo_id="zakyirhab0/Klasifikasi_Komentar", repo_type="dataset" ) st.success(f"✅ Corrected Comments berhasil disimpan & diunggah ke Hugging Face: {corrected_comments_file}") # Bersihkan state untuk menghindari duplikasi penyimpanan st.session_state['data_with_ba'] = pd.DataFrame(columns=corrected_comments.columns) # Refresh otomatis setelah penyimpanan st.rerun() except Exception as e: st.error(f"⚠ Terjadi kesalahan saat menyimpan Corrected Comments: {e}") except FileNotFoundError: st.error(f"❌ Tidak ditemukan dataset untuk {selected_candidate}. 
Silakan tambahkan dataset terlebih dahulu.") except Exception as e: st.error(f"⚠ Terjadi kesalahan tidak terduga: {e}") # === Sidebar Navigation === # menu = st.sidebar.radio("Select a Feature", ["Model-Based Classification","Clustering for 'BA Lainnya'", "Update Keywords","View Training Dataset","Evaluate Data Train","Maximize Preprocessing","Retraining Model"]) if menu == "Model-Based Classification": st.title("Model-Based Classification") candidate = st.selectbox("Choose a candidate:", candidate_list) model_path = hf_hub_download(repo_id="zakyirhab0/Klasifikasi_Komentar", filename=f"best_rf_model_{candidate.replace(' ', '_').lower()}.joblib", repo_type="dataset") vectorizer_path = hf_hub_download(repo_id="zakyirhab0/Klasifikasi_Komentar", filename=f"tfidf_vectorizer_{candidate.replace(' ', '_').lower()}.joblib", repo_type="dataset") # Save the selected candidate to session state st.session_state['candidate'] = candidate uploaded_file = st.file_uploader("Upload an Excel file for classification", type=['xlsx']) @st.cache_data def load_model_and_vectorizer(model_path, vectorizer_path): """Load model and vectorizer, cache them for efficiency.""" try: model = joblib.load(model_path) vectorizer = joblib.load(vectorizer_path) return model, vectorizer except FileNotFoundError: return None, None model, vectorizer = load_model_and_vectorizer(model_path, vectorizer_path) if not model or not vectorizer: st.error("Model or vectorizer not found for the selected candidate.") st.stop() # Save the vectorizer and model to session state st.session_state['vectorizer'] = vectorizer st.session_state['model'] = model @st.cache_data def preprocess_data(data): """Preprocess comments with batching for large datasets.""" from joblib import Parallel, delayed def preprocess_batch(batch): batch['translated_emojis'] = batch['komentar'].apply(translate_emojis) batch['normalized_unicode'] = batch['translated_emojis'].apply(normalize_unicode) batch['reply_handled'] = batch['normalized_unicode'].apply(handle_replies) batch['clean_text'] = batch['reply_handled'].apply(clean_text) batch['translated_ntb'] = batch['clean_text'].apply(lambda x: translate_text(x, ntb_dict)) batch['translated_slang'] = batch['translated_ntb'].apply(lambda x: translate_text(x, slang_dict)) batch['negation_handled'] = batch['translated_slang'].apply(handle_negation) return batch batch_size = 10000 # Process 10,000 rows at a time batches = [data.iloc[i:i+batch_size] for i in range(0, len(data), batch_size)] processed_batches = Parallel(n_jobs=-1)(delayed(preprocess_batch)(batch) for batch in batches) return pd.concat(processed_batches, ignore_index=True) # Process uploaded file if uploaded_file: try: data = pd.read_excel(uploaded_file) if 'komentar' not in data.columns: st.error("The uploaded file must include a 'komentar' column.") st.stop() data = preprocess_data(data) except Exception as e: st.error(f"An error occurred while processing the file: {e}") st.stop() elif 'model_classified_data' in st.session_state: data = st.session_state['model_classified_data'] else: st.info("Please upload a file for classification.") st.stop() # Transform comments into TF-IDF vectors try: tfidf_data = vectorizer.transform(data['negation_handled'].fillna('')) data['predicted_category'] = model.predict(tfidf_data) data['probabilities'] = model.predict_proba(tfidf_data).tolist() data['max_probability'] = data['probabilities'].apply(lambda x: max(x)) except Exception as e: st.error(f"An error occurred during model prediction: {e}") st.stop() # Cache classified data 
st.session_state['model_classified_data'] = data # Interactive threshold adjustment st.subheader("Set Threshold for 'BA Lainnya'") threshold = st.slider("Threshold for tagging 'BA Lainnya'", min_value=0.0, max_value=1.0, value=0.80, step=0.01) # Apply threshold to tag "BA Lainnya" data['tag'] = data['max_probability'].apply(lambda x: 'BA Lainnya' if x < threshold else '') # Separate data for visualization data_without_ba = data[data['tag'] != 'BA Lainnya'] data_with_ba = data[data['tag'] == 'BA Lainnya'] # Save updated results to session state for dynamic updates st.session_state['data_without_ba'] = data_without_ba st.session_state['data_with_ba'] = data_with_ba # Preview Results st.subheader("Preview Results") st.write("### 1. Hasil Klasifikasi Tanpa Tag 'BA Lainnya'") if not data_without_ba.empty: st.dataframe(data_without_ba[['komentar', 'predicted_category', 'max_probability']]) else: st.info("No high-probability classifications available.") st.write("### 2. Hasil Klasifikasi Dengan Tag 'BA Lainnya'") if not data_with_ba.empty: st.dataframe(data_with_ba[['komentar', 'predicted_category', 'max_probability']]) else: st.info("No low-probability classifications available.") # Visualization: Sentiment Distribution st.subheader("Sentiment Distribution Visualization") def plot_distribution(data, title): sentiment_counts = data['predicted_category'].value_counts() fig, ax = plt.subplots() ax.bar(sentiment_counts.index, sentiment_counts.values) ax.set_title(title) ax.set_xlabel("Sentiments") ax.set_ylabel("Count") st.pyplot(fig) if not data_without_ba.empty: plot_distribution(data_without_ba, "Sentiment Distribution (Without 'BA Lainnya')") if not data_with_ba.empty: plot_distribution(data_with_ba, "Sentiment Distribution (With 'BA Lainnya')") # ====== DOWNLOAD HASIL KLASIFIKASI ====== st.subheader("Download Results") # Simpan hasil klasifikasi ke buffer Excel excel_buffer = io.BytesIO() with pd.ExcelWriter(excel_buffer, engine='xlsxwriter') as writer: data.to_excel(writer, index=False, sheet_name='Classification Results') excel_buffer.seek(0) # Download hasil klasifikasi st.download_button( label="Download All Classification Results", data=excel_buffer, file_name=f"classification_results_{candidate}.xlsx", mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" ) # ====== UNGGAH HASIL KE HUGGING FACE ====== classification_results_file = f"classification_results_{candidate}.xlsx" data.to_excel(classification_results_file, index=False, sheet_name='Classification Results') api = HfApi() api.upload_file( path_or_fileobj=classification_results_file, path_in_repo=classification_results_file, repo_id="zakyirhab0/Klasifikasi_Komentar", repo_type="dataset" ) st.success(f"✅ {classification_results_file} berhasil diunggah ke Hugging Face!") # ====== SIMPAN NON-'BA LAINNYA' KE TRAINING DATASET ====== st.subheader("Save Classified Data") if not data_without_ba.empty: st.info("Only Non-'BA Lainnya' comments will be saved to the training dataset.") if st.button("Save Non-'BA Lainnya' Data to Training Dataset"): try: # Persiapkan data untuk disimpan data_to_save = data_without_ba[['model_data', 'Platform', 'komentar', 'link', 'kandidat', 'tanggal']].copy() data_to_save['sentimen'] = data_without_ba['predicted_category'] data_to_save['tanggal_masuk'] = pd.Timestamp.now().strftime('%Y-%m-%d') # Simpan dataset ke Hugging Face training_dataset_file = f"datasetntbnew_{candidate.lower().replace(' ', '_')}.xlsx" data_to_save.to_excel(training_dataset_file, index=False) api.upload_file( 
path_or_fileobj=training_dataset_file, path_in_repo=training_dataset_file, repo_id="zakyirhab0/Klasifikasi_Komentar", repo_type="dataset" ) st.success(f"✅ Data berhasil disimpan ke training dataset di Hugging Face: {training_dataset_file}") except Exception as e: st.error(f"❌ Terjadi kesalahan saat menyimpan data: {e}") else: st.info("No Non-'BA Lainnya' data available to save.") # Integrasi fungsi dalam halaman "Clustering for 'BA Lainnya'" if menu == "Clustering for 'BA Lainnya'": selected_candidate = st.session_state['selected_candidate'] if 'selected_candidate' in st.session_state else None run_clustering_for_ba_lainnya() pass # Placeholder # Memastikan kode ini hanya dijalankan jika menu yang dipilih adalah "View Training Dataset" if menu == "View Training Dataset": st.title("View Training Dataset") # Header untuk memilih kandidat st.header("Options") selected_candidate = st.selectbox("Choose a candidate:", list(candidate_list), key='candidate_select_view') # Path dataset dataset_path = hf_hub_download(repo_id="zakyirhab0/Klasifikasi_Komentar", filename=f"datasetntbnew_{selected_candidate.lower().replace(' ', '_')}.xlsx", repo_type="dataset") history_path = hf_hub_download(repo_id="zakyirhab0/Klasifikasi_Komentar", filename=f"history_{selected_candidate.lower().replace(' ', '_')}.json", repo_type="dataset") # Memuat dataset try: dataset_path = hf_hub_download(repo_id="zakyirhab0/Klasifikasi_Komentar", filename=f"datasetntbnew_{candidate.lower().replace(' ', '_')}.xlsx", repo_type="dataset") df = pd.read_excel(dataset_path) # Memastikan kolom yang diperlukan ada required_columns = ['model_data', 'Platform', 'komentar', 'sentimen', 'tanggal', 'tanggal_masuk', 'link', 'evaluated_by_data_train'] for col in required_columns: if col not in df.columns: if col == 'evaluated_by_data_train': df[col] = False # Menambahkan kolom 'update_ba' jika belum ada if 'update_ba' not in df.columns: df['update_ba'] = None # Menambahkan kolom 'missing_comment' jika belum ada if 'missing_comment' not in df.columns: df['missing_comment'] = False # Pastikan 'tanggal_masuk' dan 'tanggal' dalam format datetime yang benar df['tanggal_masuk'] = pd.to_datetime(df['tanggal_masuk'], errors='coerce') df['tanggal'] = pd.to_datetime(df['tanggal'], errors='coerce') # Konversi kolom tanggal agar hanya menyimpan bagian tanggal tanpa waktu df['tanggal'] = df['tanggal'].dt.date df['tanggal_masuk'] = df['tanggal_masuk'].dt.date df['update_ba'] = pd.to_datetime(df['update_ba'], errors='coerce').dt.date # Menangani nilai NaT (Not a Time) jika ada if df['tanggal_masuk'].isnull().any(): st.warning("Some dates 'tanggal_masuk' could not be parsed correctly. Please check the date format in the dataset.") df['tanggal_masuk'].fillna(pd.Timestamp.now().date(), inplace=True) if df['tanggal'].isnull().any(): st.warning("Some dates 'tanggal' could not be parsed correctly. 
Please check the date format in the dataset.") df['tanggal'].fillna(pd.Timestamp.now().date(), inplace=True) # Menambahkan kolom 'kandidat' jika belum ada dan mengisinya if 'kandidat' not in df.columns: df['kandidat'] = selected_candidate # Mengambil subset kolom yang diperlukan df = df[required_columns + ['update_ba', 'kandidat', 'missing_comment']] # Perbarui nilai None di update_ba dengan tanggal_masuk df['update_ba'].fillna(df['tanggal_masuk'], inplace=True) # Menampilkan statistik dasar st.subheader(f"Training Dataset for {selected_candidate}") st.write(f"**Total rows in dataset:** {len(df)}") if not df.empty: # Visualisasi sebaran update BA st.subheader("Visualisasi Postingan Berdasarkan Update BA") ba_update_counts = df['update_ba'].value_counts().sort_index() fig, ax = plt.subplots(figsize=(10, 6)) ba_update_counts.plot(kind='bar', ax=ax, color='blue') ax.set_title('Sebaran Postingan Berdasarkan Update BA') ax.set_xlabel('Tanggal Update BA') ax.set_ylabel('Jumlah Postingan') plt.xticks(rotation=45) plt.tight_layout() st.pyplot(fig) # Visualisasi tambahan sebaran platform st.subheader("Sebaran Platform Berdasarkan Update BA") platform_counts = df['Platform'].value_counts() fig, ax = plt.subplots(figsize=(10, 6)) platform_counts.plot(kind='bar', ax=ax, color='green') ax.set_title('Sebaran Platform Berdasarkan Update BA') ax.set_xlabel('Platform') ax.set_ylabel('Jumlah Postingan') plt.xticks(rotation=45) plt.tight_layout() st.pyplot(fig) # Visualisasi jumlah komentar hilang berdasarkan platform st.subheader("Jumlah Komentar Hilang Berdasarkan Platform") missing_comments_by_platform = df.groupby('Platform')['missing_comment'].sum().sort_index() fig, ax = plt.subplots(figsize=(10, 6)) missing_comments_by_platform.plot(kind='bar', ax=ax, color='red') ax.set_title('Jumlah Komentar Hilang Berdasarkan Platform') ax.set_xlabel('Platform') ax.set_ylabel('Jumlah Komentar Hilang') plt.xticks(rotation=45) plt.tight_layout() st.pyplot(fig) # Filter berdasarkan status validasi st.subheader("Filter Data") validation_filter = st.radio( "Choose data type to view:", ["All Data", "Validated Data", "Non-Validated Data"], key='validation_filter' ) if validation_filter == "Validated Data": filtered_data = df[df['evaluated_by_data_train'] == True] elif validation_filter == "Non-Validated Data": filtered_data = df[df['evaluated_by_data_train'] == False] else: filtered_data = df if not filtered_data.empty: st.subheader(f"Filtered Data: {validation_filter}") st.dataframe(filtered_data) # Menampilkan semua data yang sesuai dengan filter else: st.warning("Tidak ada data yang sesuai dengan filter yang dipilih.") # Menampilkan riwayat penambahan data st.subheader("History of Data Additions") try: history_path = hf_hub_download(repo_id="zakyirhab0/Klasifikasi_Komentar", filename=f"history_{selected_candidate.lower().replace(' ', '_')}.json", repo_type="dataset") with open(history_path, "r") as f: history = json.load(f) history_list = [] for key, value in history.items(): for entry in value: for k, v in entry.items(): history_list.append({ 'key': key, 'field': k, 'date': v }) history_df = pd.DataFrame(history_list) st.dataframe(history_df) except Exception as e: st.warning("No addition history available.") st.error(f"Error: {e}") # Opsi untuk mengunduh dataset yang telah difilter st.subheader("Download Options") if not filtered_data.empty: filtered_dataset_file = f"filtered_training_dataset_{selected_candidate}.xlsx" filtered_data.to_excel(filtered_dataset_file, index=False, sheet_name='Filtered Dataset') # 
Streamlit Download st.download_button( label=f"Download Filtered Dataset for {selected_candidate}", data=open(filtered_dataset_file, "rb"), file_name=filtered_dataset_file, mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" ) # Upload ke Hugging Face api = HfApi() api.upload_file( path_or_fileobj=filtered_dataset_file, path_in_repo=filtered_dataset_file, repo_id="zakyirhab0/Klasifikasi_Komentar", repo_type="dataset" ) st.success(f"✅ Filtered dataset berhasil diunggah ke Hugging Face: {filtered_dataset_file}") # Opsi untuk mengunduh seluruh dataset if not df.empty: full_dataset_file = f"training_dataset_{selected_candidate}.xlsx" df.to_excel(full_dataset_file, index=False, sheet_name='Training Dataset') # Streamlit Download st.download_button( label=f"Download Full Training Dataset for {selected_candidate}", data=open(full_dataset_file, "rb"), file_name=full_dataset_file, mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" ) # Upload ke Hugging Face api.upload_file( path_or_fileobj=full_dataset_file, path_in_repo=full_dataset_file, repo_id="zakyirhab0/Klasifikasi_Komentar", repo_type="dataset" ) st.success(f"✅ Full training dataset berhasil diunggah ke Hugging Face: {full_dataset_file}") # Fitur untuk mengunduh dataset berdasarkan filter tanggal st.subheader("Download Aggregated Data by Date") # Mendapatkan nilai maksimum dan minimum tanggal untuk default date_input if not df['tanggal'].empty: min_date = df['tanggal'].min() max_date = df['tanggal'].max() selected_start_date = st.date_input("Select start date for aggregation:", value=min_date) selected_end_date = st.date_input("Select end date for aggregation:", value=max_date) agg_filtered_data = df[(df['tanggal'] >= selected_start_date) & (df['tanggal'] <= selected_end_date)] if not agg_filtered_data.empty: aggregated_data = agg_filtered_data.groupby(['kandidat', 'link', 'tanggal', 'Platform', 'model_data', 'update_ba']).agg( co_likes=('sentimen', lambda x: (x == 'Co Likes').sum()), co_support=('sentimen', lambda x: (x == 'Co Support').sum()), co_optimism=('sentimen', lambda x: (x == 'Co Optimism').sum()), co_sarkastic=('sentimen', lambda x: (x == 'Co Sarkastic').sum()), co_negative=('sentimen', lambda x: (x == 'Co Negative').sum()), missing_comments=('missing_comment', 'sum') ).reset_index() aggregated_data['jumlah_komentar_positif'] = aggregated_data['co_likes'] + aggregated_data['co_support'] + aggregated_data['co_optimism'] aggregated_data['jumlah_komentar_negatif'] = aggregated_data['co_sarkastic'] + aggregated_data['co_negative'] aggregated_data['jumlah_komentar'] = aggregated_data[['co_likes', 'co_support', 'co_optimism', 'co_sarkastic', 'co_negative']].sum(axis=1) st.dataframe(aggregated_data) # Simpan dataset yang telah diagregasi aggregated_dataset_file = f"aggregated_data_{selected_candidate}.xlsx" aggregated_data.to_excel(aggregated_dataset_file, index=False, sheet_name='Aggregated Data') # Streamlit Download st.download_button( label=f"Download Aggregated Data by Date for {selected_candidate}", data=open(aggregated_dataset_file, "rb"), file_name=aggregated_dataset_file, mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" ) # Upload ke Hugging Face api.upload_file( path_or_fileobj=aggregated_dataset_file, path_in_repo=aggregated_dataset_file, repo_id="zakyirhab0/Klasifikasi_Komentar", repo_type="dataset" ) st.success(f"✅ Aggregated dataset berhasil diunggah ke Hugging Face: {aggregated_dataset_file}") else: st.warning("Tidak ada data yang sesuai dengan rentang 
tanggal yang dipilih untuk diagregasi.") st.dataframe(aggregated_data) # Visualisasi jumlah postingan yang diupdate BA dengan sebaran platform berdasarkan rentang tanggal st.subheader("Visualisasi Postingan yang Diupdate BA Berdasarkan Rentang Tanggal") ba_update_range = aggregated_data[aggregated_data['update_ba'] != 'Belum diupdate'] if not ba_update_range.empty: plt.figure(figsize=(10, 6)) ba_update_range['Platform'].value_counts().plot(kind='bar', title='Sebaran Platform - Diupdate BA (Rentang Tanggal)') plt.xlabel('Platform') plt.ylabel('Jumlah Postingan') st.pyplot(plt) # Tambahan visualisasi sebaran postingan berdasarkan tanggal st.subheader("Sebaran Postingan Berdasarkan Tanggal") plt.figure(figsize=(10, 6)) ba_update_range['tanggal'].value_counts().sort_index().plot(kind='bar', title='Sebaran Postingan Berdasarkan Tanggal') plt.xlabel('Tanggal') plt.ylabel('Jumlah Postingan') plt.xticks(rotation=45) plt.tight_layout() st.pyplot(plt) # Tambahan visualisasi sebaran update BA st.subheader("Sebaran Update BA") plt.figure(figsize=(10, 6)) ba_update_range['update_ba'].value_counts().sort_index().plot(kind='bar', title='Sebaran Update BA') plt.xlabel('Tanggal Update BA') plt.ylabel('Jumlah Postingan') plt.xticks(rotation=45) plt.tight_layout() st.pyplot(plt) # Visualisasi jumlah komentar berdasarkan tanggal_masuk st.subheader("Jumlah Komentar Berdasarkan Tanggal Masuk") plt.figure(figsize=(10, 6)) agg_filtered_data.groupby('tanggal_masuk')['komentar'].count().sort_index().plot(kind='bar', title='Jumlah Komentar Berdasarkan Tanggal Masuk') plt.xlabel('Tanggal Masuk') plt.ylabel('Jumlah Komentar') plt.xticks(rotation=45) plt.tight_layout() st.pyplot(plt) # Visualisasi sebaran komentar di tiap platform st.subheader("Sebaran Komentar di Tiap Platform") plt.figure(figsize=(10, 6)) agg_filtered_data['Platform'].value_counts().plot(kind='bar', title='Sebaran Komentar di Tiap Platform') plt.xlabel('Platform') plt.ylabel('Jumlah Komentar') plt.xticks(rotation=45) plt.tight_layout() st.pyplot(plt) # Visualisasi jumlah missing comments berdasarkan postingan st.subheader("Jumlah Komentar Hilang Berdasarkan Postingan") plt.figure(figsize=(10, 6)) aggregated_data.groupby('link')['missing_comments'].sum().sort_index().plot(kind='bar', title='Jumlah Komentar Hilang Berdasarkan Postingan') plt.xlabel('Link') plt.ylabel('Jumlah Komentar Hilang') plt.xticks(rotation=45) plt.tight_layout() st.pyplot(plt) else: st.warning("Tidak ada data yang diupdate BA untuk rentang tanggal yang dipilih.") # Opsi untuk mengunduh dataset yang telah diagregasi excel_buffer_aggregated = io.BytesIO() with pd.ExcelWriter(excel_buffer_aggregated, engine='xlsxwriter') as writer: aggregated_data.to_excel(writer, index=False, sheet_name='Aggregated Data') excel_buffer_aggregated.seek(0) st.download_button( label=f"Download Aggregated Data by Date for {selected_candidate}", data=excel_buffer_aggregated, file_name=f"aggregated_data_{selected_candidate}.xlsx", mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" ) else: st.warning("Tidak ada data yang sesuai dengan rentang tanggal yang dipilih untuk diagregasi.") else: st.warning("Tidak ada data dalam dataset untuk divisualisasikan.") except FileNotFoundError: st.error(f"No training dataset found for {selected_candidate}. 
Please add data to create the dataset.") except Exception as e: st.error(f"An error occurred: {e}") pass if menu == "Evaluate Data Train": st.title("Evaluate Data Train") selected_candidate = st.selectbox("Choose a candidate:", list(candidate_list), key='candidate_select_evaluate') try: # **📥 Load Dataset dari Hugging Face** dataset_path = hf_hub_download(repo_id="zakyirhab0/Klasifikasi_Komentar", filename=f"datasetntbnew_{selected_candidate.lower().replace(' ', '_')}.xlsx", repo_type="dataset") df = pd.read_excel(dataset_path) # **✅ Load existing keyword dictionary dari Hugging Face** try: keywords_path = hf_hub_download(repo_id="zakyirhab0/Klasifikasi_Komentar", filename="keywords.json", repo_type="dataset") with open(keywords_path, 'r', encoding="utf-8") as f: keyword_dict = json.load(f) st.success("✅ Keywords.json loaded successfully.") except FileNotFoundError: st.error("❌ Keywords.json file not found in Hugging Face. Please upload it first.") st.stop() except json.JSONDecodeError: st.error("❌ Keywords.json file is not a valid JSON format. Please check the file.") st.stop() # **🔹 Pastikan dataset memiliki data** if df.empty: st.warning(f"⚠ Dataset for {selected_candidate} is empty.") else: st.success(f"✅ Dataset for {selected_candidate} loaded successfully.") st.dataframe(df.head()) # Tampilkan 5 data pertama except FileNotFoundError: st.error(f"❌ No training dataset found for {selected_candidate}. Please upload the dataset to Hugging Face.") except Exception as e: st.error(f"⚠ An error occurred: {e}") # Select candidate-specific keywords candidate_key = selected_candidate.replace(' ', '_') candidate_keywords = keyword_dict.get(candidate_key) if not candidate_keywords: st.error(f"No keywords found for the selected candidate '{selected_candidate}'. 
Please update the 'keywords.json' file with appropriate keywords.") st.stop() keywords = [kw for sentiment_keywords in candidate_keywords.values() for kw in sentiment_keywords] # Validasi Konsistensi Data st.subheader("Data Consistency Validation") missing_values = df.isnull().sum() st.write("Missing values in each column:") st.write(missing_values) # Menghapus baris dengan nilai yang hilang st.write("Removing rows with missing values...") df.dropna(inplace=True) # Distribusi Sentimen st.subheader("Sentiment Distribution") sentiment_counts = df['sentimen'].value_counts() st.write("Number of comments for each sentiment:") st.write(sentiment_counts) # Kualitas Data st.subheader("Data Quality Check") invalid_entries = df[df['komentar'].str.len() == 0] st.write(f"Number of invalid comments (empty): {len(invalid_entries)}") if len(invalid_entries) > 0: st.write("Invalid comments (empty):") st.dataframe(invalid_entries.head(100)) # Preview 100 baris pertama # Deteksi Inkonistensi Sentimen st.subheader("Inconsistent Sentiment Labels") duplicate_comments = df[df.duplicated(subset=['komentar'], keep=False)] inconsistent_labels = duplicate_comments.groupby('komentar')['sentimen'].nunique() inconsistent_labels = inconsistent_labels[inconsistent_labels > 1] if not inconsistent_labels.empty: inconsistent_labels_df = duplicate_comments[duplicate_comments['komentar'].isin(inconsistent_labels.index)] st.write(f"Number of comments with inconsistent sentiment labels: {len(inconsistent_labels_df)}") st.dataframe(inconsistent_labels_df.head(100)) # Preview 100 baris pertama else: st.write("No comments with inconsistent sentiment labels found.") # Penanganan Masalah st.subheader("Problem Handling") # Menghapus komentar yang kosong st.write("Removing invalid (empty) comments...") df = df[df['komentar'].str.len() > 0] # Interaktif: Menangani komentar dengan label sentimen yang tidak konsisten st.write("Resolving inconsistent sentiment labels...") if not inconsistent_labels.empty: for index, row in inconsistent_labels_df.iterrows(): st.write(f"Comment: {row['komentar']}") sentimen_options = df[df['komentar'] == row['komentar']]['sentimen'].unique().tolist() new_sentimen = st.selectbox("Select correct sentiment", sentimen_options, key=f'sentimen_{index}') if st.button("Update Sentiment", key=f'update_{index}'): update_sentiment(index, new_sentimen) # Clustering menggunakan Keyword dan Model Sarkas st.write("Clustering comments using keywords and sarcasm model...") keyword_vectorizer = TfidfVectorizer(vocabulary=keywords) X_keywords = keyword_vectorizer.fit_transform(df['komentar']) kmeans = KMeans(n_clusters=10, random_state=0).fit(X_keywords) df['cluster'] = kmeans.labels_ # Identifikasi kluster yang perlu ditinjau review_clusters = df[df['cluster'].isin(df['cluster'].value_counts()[df['cluster'].value_counts() > 10].index)] st.write("Clusters identified for review:") st.dataframe(review_clusters.head(100)) # Preview 100 baris pertama # Notifikasi Tambahkan Kata Kunci st.warning("Some comments were not captured by the current keywords. 
    # Cluster visualization
    cluster_counts = df['cluster'].value_counts()
    st.write("Number of comments in each cluster:")
    st.write(cluster_counts)

    # Show the clustering result
    st.write("Comments clustered by patterns:")
    st.dataframe(df.head(100))  # Preview the first 100 rows

    # Export the analysed and cleaned data
    st.subheader("Export Final Data")

    # 1. Serialize the data to JSON in an in-memory buffer
    json_buffer = io.BytesIO()
    df.to_json(json_buffer, orient='records', lines=True)
    json_buffer.seek(0)

    # 2. Option to download the file locally
    st.download_button(
        label=f"⬇ Download Final Data for {selected_candidate}",
        data=json_buffer,
        file_name=f"final_data_{selected_candidate}.json",
        mime="application/json"
    )

    # 3. Option to save to Hugging Face
    file_name = f"final_data_{selected_candidate}.json"
    if st.button("📤 Upload Final Data to Hugging Face"):
        try:
            # Save a temporary file before uploading
            with open(file_name, "w", encoding="utf-8") as f:
                df.to_json(f, orient='records', lines=True)

            # Upload to Hugging Face
            api = HfApi()
            api.upload_file(
                path_or_fileobj=file_name,
                path_in_repo=file_name,
                repo_id="zakyirhab0/Klasifikasi_Komentar",
                repo_type="dataset"
            )
            st.success(f"✅ {file_name} successfully uploaded to Hugging Face.")
        except Exception as e:
            st.error(f"❌ Failed to upload to Hugging Face: {e}")

    # 4. Handle FileNotFoundError and other errors when reloading the exported file
    try:
        # Load the exported dataset back from the Hub for display
        dataset_path = hf_hub_download(repo_id="zakyirhab0/Klasifikasi_Komentar", filename=file_name, repo_type="dataset")
        df = pd.read_json(dataset_path, lines=True)
        st.dataframe(df)
    except FileNotFoundError:
        st.error(f"❌ No training dataset found for {selected_candidate}. Please add data to create the dataset.")
    except Exception as e:
        st.error(f"⚠ An error occurred: {e}")
    pass

if menu == "Retraining Model":
    st.title("Retrain Model")
    selected_candidate = st.selectbox("Select a candidate to retrain the model:", list(candidate_list))

    # Load model & vectorizer from the Hugging Face Hub
    model_path = hf_hub_download(repo_id="zakyirhab0/Klasifikasi_Komentar", filename=f"best_rf_model_{selected_candidate.lower().replace(' ', '_')}.joblib", repo_type="dataset")
    vectorizer_path = hf_hub_download(repo_id="zakyirhab0/Klasifikasi_Komentar", filename=f"tfidf_vectorizer_{selected_candidate.lower().replace(' ', '_')}.joblib", repo_type="dataset")

    # Load the dataset from Hugging Face
    dataset_path = hf_hub_download(repo_id="zakyirhab0/Klasifikasi_Komentar", filename=f"datasetntbnew_{selected_candidate.lower().replace(' ', '_')}.xlsx", repo_type="dataset")
    # Retrain history (the Hub copy is cached locally, so it can also be kept/updated locally)
    retrain_history_path = hf_hub_download(repo_id="zakyirhab0/Klasifikasi_Komentar", filename=f"retrain_history_{selected_candidate.lower().replace(' ', '_')}.json", repo_type="dataset")

    try:
        # Load dataset
        data = pd.read_excel(dataset_path)

        # Ensure 'komentar' is present and of string type
        if 'komentar' not in data.columns:
            st.error("Dataset must include a 'komentar' column.")
            st.stop()
        else:
            # Convert all comments to string
            data['komentar'] = data['komentar'].fillna('').astype(str)

            # Separate validated and unvalidated data
            if 'evaluated_by_cluster' in data.columns:
                validated_data = data[data['evaluated_by_cluster'] == True]
                unvalidated_data = data[data['evaluated_by_cluster'] == False]
            else:
                validated_data = pd.DataFrame(columns=data.columns)
                unvalidated_data = data

            st.write(f"**Validated Data:** {len(validated_data)} rows")
            st.write(f"**Unvalidated Data:** {len(unvalidated_data)} rows")
            # Check if all data is validated
            if len(unvalidated_data) > 0:
                st.warning("Model retraining is only allowed if all data has been validated through 'Evaluate Clustering'. Please ensure all data is validated before retraining the model.")
                st.stop()

            # Combine all data for preprocessing
            combined_data = validated_data  # Only use validated data

            # Preprocessing function
            @st.cache_data(show_spinner=True)
            def preprocess_data(data):
                from joblib import Parallel, delayed

                def preprocess_comment(comment):
                    comment = translate_emojis(comment)
                    comment = normalize_unicode(comment)
                    comment = handle_replies(comment)
                    comment = clean_text(comment)
                    comment = translate_text(comment, ntb_dict)
                    comment = translate_text(comment, slang_dict)
                    comment = handle_negation(comment)
                    return comment

                data['processed_comments'] = Parallel(n_jobs=-1)(
                    delayed(preprocess_comment)(c) for c in data['komentar']
                )
                return data

            # Preprocessing
            st.write("Starting preprocessing...")
            combined_data = preprocess_data(combined_data)

            if st.button("Retrain Model"):
                # Vectorization
                st.write("Vectorizing data...")
                vectorizer = TfidfVectorizer(ngram_range=(1, 1), max_features=5000)
                X = vectorizer.fit_transform(combined_data['processed_comments'])
                y = combined_data['sentimen']

                # Split data
                X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

                # Handle class imbalance with SMOTE
                st.write("Balancing data with SMOTE...")
                smote = SMOTE(random_state=42, n_jobs=-1)
                X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

                # Train the Random Forest model
                st.write("Training Random Forest model...")
                rf_model = RandomForestClassifier(n_estimators=200, max_depth=20, random_state=42)
                rf_model.fit(X_train_res, y_train_res)

                # Evaluate on training data
                st.write("Evaluating model...")
                y_pred_train = rf_model.predict(X_train)
                accuracy_train = accuracy_score(y_train, y_pred_train)
                report_train = classification_report(y_train, y_pred_train, output_dict=True)

                # Evaluate on test data
                y_pred_test = rf_model.predict(X_test)
                accuracy_test = accuracy_score(y_test, y_pred_test)
                report_test = classification_report(y_test, y_pred_test, output_dict=True)

                # Save model and vectorizer
                # NOTE: this only overwrites the locally cached copies downloaded from the Hub;
                # see the optional upload helper sketched after this menu block to push them back.
                st.write("Saving model and vectorizer...")
                joblib.dump(rf_model, model_path)
                joblib.dump(vectorizer, vectorizer_path)

                # Log retraining history
                st.write("Logging retraining history...")
                try:
                    with open(retrain_history_path, "r") as f:
                        retrain_history = json.load(f)
                except FileNotFoundError:
                    retrain_history = []

                retrain_history.append({
                    "date_retrained": pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S'),
                    "accuracy_on_train": accuracy_train,
                    "accuracy_on_test": accuracy_test,
                    "f1_score_on_train": report_train['weighted avg']['f1-score'],
                    "f1_score_on_test": report_test['weighted avg']['f1-score'],
                })
                with open(retrain_history_path, "w") as f:
                    json.dump(retrain_history, f, indent=4)

                # Display results
                st.success(f"Model retrained successfully! Accuracy on training data: {accuracy_train:.4f}, Accuracy on test data: {accuracy_test:.4f}")
                st.subheader("Model Metrics on Training Data")
                st.table(pd.DataFrame(report_train).T)
                st.subheader("Model Metrics on Test Data")
                st.table(pd.DataFrame(report_test).T)

                # Show retrain history
                st.subheader("Retrain History")
                st.json(retrain_history)
    except FileNotFoundError:
        st.error(f"No training dataset found for {selected_candidate}. Please add data to create the dataset.")
    except Exception as e:
        st.error(f"An unexpected error occurred: {e}")
    pass  # Placeholder
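# The retraining step above only refreshes the cached joblib files on disk; it does not push the
# retrained model or vectorizer back to the dataset repo. The helper below is a hypothetical
# sketch (its name, signature, and the idea of calling it right after joblib.dump are assumptions,
# not part of the original flow); it only reuses the HfApi.upload_file pattern and the artifact
# filenames already used elsewhere in this script.
def upload_retrained_artifacts(local_model_path, local_vectorizer_path, candidate_name):
    api = HfApi()
    suffix = candidate_name.lower().replace(' ', '_')
    for local_path, repo_name in [
        (local_model_path, f"best_rf_model_{suffix}.joblib"),
        (local_vectorizer_path, f"tfidf_vectorizer_{suffix}.joblib"),
    ]:
        # Push the retrained artifact back to the same dataset repo it was downloaded from
        api.upload_file(
            path_or_fileobj=local_path,
            path_in_repo=repo_name,
            repo_id="zakyirhab0/Klasifikasi_Komentar",
            repo_type="dataset"
        )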
Please add data to create the dataset.") except Exception as e: st.error(f"An unexpected error occurred: {e}") pass # Placeholder if menu == "Maximize Preprocessing": st.title("Maximize Preprocessing") # Load Dataset Train candidate = st.selectbox("Choose a candidate:", list(candidate_list)) dataset_path = hf_hub_download(repo_id="zakyirhab0/Klasifikasi_Komentar", filename=f"datasetntbnew_{candidate.lower().replace(' ', '_')}.xlsx", repo_type="dataset") try: # Load dataset data = pd.read_excel(dataset_path) # Ensure 'komentar' is string if 'komentar' not in data.columns: st.error("Dataset must include a 'komentar' column.") st.stop() data['komentar'] = data['komentar'].fillna('').astype(str) # Preprocessing Steps @st.cache_data(show_spinner=True) def preprocess_data(data): st.write("Starting preprocessing...") data['translated_emojis'] = data['komentar'].apply(translate_emojis) data['normalized_unicode'] = data['translated_emojis'].apply(normalize_unicode) data['reply_handled'] = data['normalized_unicode'].apply(handle_replies) data['clean_text'] = data['reply_handled'].apply(clean_text) data['translated_ntb'] = data['clean_text'].apply(lambda x: translate_text(x, ntb_dict)) data['translated_slang'] = data['translated_ntb'].apply(lambda x: translate_text(x, slang_dict)) data['negation_handled'] = data['translated_slang'].apply(handle_negation) return data data = preprocess_data(data) # Check Unmapped Words st.subheader("Check for Unmapped Words") all_words = (word.lower() for comment in data['negation_handled'] for word in comment.split()) unique_words = set(all_words) ntb_dict_keys = set(ntb_dict.keys()) slang_dict_keys = set(slang_dict.keys()) mapped_words = ntb_dict_keys.union(slang_dict_keys) unmapped_words = sorted(unique_words - mapped_words) if unmapped_words: st.write(f"Found **{len(unmapped_words)} unmapped words.**") # Pilihan jumlah kata yang ingin ditampilkan max_words = st.slider( "Select number of words to display:", min_value=10, max_value=len(unmapped_words), value=min(50, len(unmapped_words)), step=10, ) # Buat DataFrame untuk tampilan tabel unmapped_df = pd.DataFrame(unmapped_words, columns=["Unmapped Words"]) st.dataframe(unmapped_df.head(max_words)) # Tampilkan jumlah total kata jika dibutuhkan st.caption(f"Showing {min(max_words, len(unmapped_words))} out of {len(unmapped_words)} unmapped words.") else: st.success("No unmapped words found!") # Add Words to Dictionary st.subheader("Add New Words to Dictionary") new_word = st.text_input("Enter new word:") normalized_word = st.text_input("Enter normalized form:") dictionary_choice = st.radio("Select dictionary to update:", ["Kamus Alay", "Kamus ntb"]) if st.button("Add to Dictionary"): if new_word and normalized_word: if dictionary_choice == "Kamus Alay": slang_dict[new_word.lower()] = normalized_word st.success(f"Added '{new_word}' -> '{normalized_word}' to Kamus Alay.") elif dictionary_choice == "Kamus ntb": ntb_dict[new_word.lower()] = normalized_word st.success(f"Added '{new_word}' -> '{normalized_word}' to Kamus ntb.") else: st.warning("Please enter both the new word and its normalized form.") # Save Updates to Hugging Face st.subheader("Save Updated Dictionaries") if st.button("Save Kamus Alay"): kamus_alay_filename = "kamusalay.csv" # Simpan DataFrame ke file CSV sementara slang_df = pd.DataFrame(list(slang_dict.items()), columns=["slang", "formal"]) slang_df.to_csv(kamus_alay_filename, index=False) # **📤 Upload ke Hugging Face** try: api = HfApi() api.upload_file( path_or_fileobj=kamus_alay_filename, 
if menu == "Update Keywords":
    st.title("Update Keywords")

    # Load existing keyword dictionary from Hugging Face
    try:
        keywords_path = hf_hub_download(repo_id="zakyirhab0/Klasifikasi_Komentar", filename="keywords.json", repo_type="dataset")
        with open(keywords_path, 'r', encoding='utf-8') as f:
            keyword_dict = json.load(f)
        st.success("✅ keywords.json loaded successfully from Hugging Face!")
    except Exception as e:
        st.error(f"❌ Failed to load keywords.json from Hugging Face: {e}")
        keyword_dict = {}  # Fall back to an empty dictionary on failure
    if not keyword_dict:
        st.stop()  # Nothing to show or edit without a keyword dictionary

    # Show current keywords
    st.subheader("Current Keywords")
    candidate = st.selectbox("Select candidate", list(keyword_dict.keys()))
    for sentiment, keywords in keyword_dict[candidate].items():
        st.write(f"{sentiment}: {', '.join(keywords)}")

    # Add new keyword
    st.subheader("Add New Keyword")
    new_keyword = st.text_input("Enter new keyword")
    selected_sentiment = st.selectbox("Select sentiment for new keyword", list(keyword_dict[candidate].keys()))

    # Add the keyword and upload to Hugging Face
    if st.button("Add Keyword"):
        if new_keyword and selected_sentiment:
            keyword_dict[candidate][selected_sentiment].append(new_keyword)
            save_keywords_to_huggingface(keyword_dict)  # Persist to Hugging Face
            st.success(f"✅ Keyword '{new_keyword}' added to {selected_sentiment} for {candidate}")
        else:
            st.error("❌ Please enter a keyword and select a sentiment")

    # Analyze Special Cluster
    st.subheader("Analyze Special Cluster")
    if 'ba_lainnya_data' in st.session_state:
        try:
            # Load the 'Special Cluster' rows directly from session state
            special_cluster_data = st.session_state['ba_lainnya_data'][st.session_state['ba_lainnya_data']['Cluster_Name'] == 'Special Cluster']
            if special_cluster_data.empty:
                st.warning("No data found in Special Cluster.")
            else:
                st.write(f"Total comments in Special Cluster: {len(special_cluster_data)}")
                all_words_special = []
                for comment in special_cluster_data['negation_handled']:
                    comment = translate_emojis(comment)
                    comment = normalize_unicode(comment)
                    comment = handle_replies(comment)
                    comment = clean_text(comment)
                    comment = translate_text(comment, {})  # Adjust based on your dictionary
                    comment = handle_negation(comment)
                    words = preprocess_text(comment)
                    all_words_special.extend(words)

                # Calculate word frequencies
                word_freq_special = Counter(all_words_special)

                # Slider to select the number of words to display
                num_words_special = st.slider("Number of words to display (Special Cluster)", min_value=5, max_value=50, value=20)
                most_common_words_special = word_freq_special.most_common(num_words_special)

                # Display word frequencies as a table
                st.subheader(f"Top {num_words_special} Word Frequencies in Special Cluster")
                word_freq_df_special = pd.DataFrame(most_common_words_special, columns=['Word', 'Frequency'])
                st.dataframe(word_freq_df_special)
        except Exception as e:
            st.error(f"An error occurred: {e}")
    else:
        st.warning("No 'BA Lainnya' data found. Please classify comments first.")
    # Analyze training data
    st.subheader("Analyze Training Data")
    dataset_path = hf_hub_download(repo_id="zakyirhab0/Klasifikasi_Komentar", filename=f"datasetntbnew_{candidate.lower().replace(' ', '_')}.xlsx", repo_type="dataset")
    try:
        train_data = pd.read_excel(dataset_path)
        if train_data.empty:
            st.warning("Training dataset is empty.")
        else:
            all_words_train = []
            for comment in train_data['komentar'].astype(str):
                comment = translate_emojis(comment)
                comment = normalize_unicode(comment)
                comment = handle_replies(comment)
                comment = clean_text(comment)
                comment = translate_text(comment, {})  # Adjust based on your dictionary
                comment = handle_negation(comment)
                words = preprocess_text(comment)
                all_words_train.extend(words)

            # Calculate word frequencies
            word_freq_train = Counter(all_words_train)

            # Slider to select the number of words to display
            num_words_train = st.slider("Number of words to display (Training Data)", min_value=5, max_value=50, value=20)
            most_common_words_train = word_freq_train.most_common(num_words_train)

            # Display word frequencies as a table
            st.subheader(f"Top {num_words_train} Word Frequencies in Training Data")
            word_freq_df_train = pd.DataFrame(most_common_words_train, columns=['Word', 'Frequency'])
            st.dataframe(word_freq_df_train)
    except FileNotFoundError:
        st.error(f"Training dataset for {candidate} not found.")
    except Exception as e:
        st.error(f"An error occurred: {e}")

    # Export keywords (download)
    st.subheader("Export Keywords")
    json_buffer = io.BytesIO()
    json_buffer.write(json.dumps(keyword_dict, indent=4).encode('utf-8'))
    json_buffer.seek(0)
    st.download_button(
        label="Download Keywords",
        data=json_buffer,
        file_name="keywords.json",
        mime="application/json"
    )

    # Import keywords (upload & sync to Hugging Face)
    st.subheader("Import Keywords")
    uploaded_file = st.file_uploader("Choose a JSON file", type="json")
    if uploaded_file is not None:
        try:
            imported_keywords = json.load(uploaded_file)
            keyword_dict.update(imported_keywords)  # Merge into the current dictionary

            # Save back to Hugging Face
            save_keywords_to_huggingface(keyword_dict)
            st.success("✅ Keywords imported and updated successfully!")
        except Exception as e:
            st.error(f"❌ Error importing keywords: {e}")
    pass