import streamlit as st
import joblib
import pandas as pd
import re
import emoji
import json
import io
import unicodedata
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import os
from wordcloud import WordCloud
from sklearn.metrics import classification_report
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from transformers import pipeline
from collections import Counter
import nltk
from nltk.corpus import stopwords
from datetime import datetime
# === Preprocessing Functions === #
candidate_list = ["Lalu Muhamad Iqbal", "Indah Dhamayanti Putri", "Zulkieflimansyah", "M Suhaili", "Sitti Rohmi Djalilah", "Musyafirin"]
# Download the stopword list if it has not been downloaded yet
nltk.download('stopwords')
stop_words = set(stopwords.words('indonesian'))
# Make sure the "BA Lainnya" data is available
if 'data_with_ba' in st.session_state:
ba_lainnya_data = st.session_state['data_with_ba']
else:
ba_lainnya_data = None
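# NOTE: despite its name, translate_emojis below does not convert emojis into text;
# it simply strips every emoji character from the input.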
def translate_emojis(text):
if not isinstance(text, str):
return ''  # Return an empty string if the input is not a string
return ''.join(c for c in text if not emoji.is_emoji(c))  # Strip all emojis
def normalize_unicode(text):
return unicodedata.normalize('NFKD', text)
def clean_text(text):
text = str(text).casefold() # Convert to lowercase
text = re.sub(r'http\S+|www\S+', '', text) # Remove URLs
text = re.sub(r'[^a-z\s]', '', text) # Remove non-alphabetic characters
text = re.sub(r'\s+', ' ', text).strip() # Normalize spaces
return text
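# handle_negation merges a negation word with the word that follows it into one token,
# e.g. "tidak bagus" becomes "tidak_bagus", so the vectorizer can treat the negated
# phrase as a feature distinct from the bare word.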
def handle_negation(text):
negation_words = {"tidak", "bukan", "jangan", "belum", "kurang", "gagal", "sulit"}
words = text.split()
result = []
skip_next = False
for i, word in enumerate(words):
if word in negation_words and i + 1 < len(words):
result.append(f"{word}_{words[i + 1]}") # Combine negation with next word
skip_next = True
elif skip_next:
skip_next = False
else:
result.append(word)
return ' '.join(result)
def handle_replies(text):
text = re.sub(r'=--*@\w+', '', text) # Remove multi-level reply patterns
text = re.sub(r'=-*@\w+', '', text) # Remove single-level reply patterns
text = re.sub(r'@\w+', '', text) # Remove standalone @username mentions
return text
def translate_text(text, dictionary):
words = text.split()
return ' '.join([dictionary.get(word.lower(), word) for word in words]) # Translate words using dictionary
# Assign a sentiment label based on keyword matches
def assign_sentiment_based_on_keywords(comment, keyword_dict):
for sentiment, keywords in keyword_dict.items():
if any(keyword in comment for keyword in keywords):
return sentiment
return 'unknown'
# === Load Dictionaries === #
def load_dictionary(file_path, file_type='json'):
if file_type == 'json':
with open(file_path, 'r', encoding='utf-8') as file:
return json.load(file)
elif file_type == 'csv':
df = pd.read_csv(file_path, names=['slang', 'formal'])
return pd.Series(df['formal'].values, index=df['slang']).to_dict()
ntb_dict = load_dictionary('ntb_dict.json', 'json')
slang_dict = load_dictionary('kamusalay.csv', 'csv')
# === Utility Functions === #
# Update the JSON file that keeps a history of updates
def update_history_json(history_path, komentar, link, model_data, field, date):
# Convert the Timestamp to a string
if isinstance(date, pd.Timestamp):
date = date.strftime('%Y-%m-%d')
# Read the existing history from the JSON file
try:
with open(history_path, 'r') as file:
history_data = json.load(file)
except (FileNotFoundError, json.JSONDecodeError):
history_data = {}
# History key for this particular comment, link, and model_data
key = f"{komentar}_{link}_{model_data}"
# Append the new entry under the matching key
if key in history_data:
history_data[key].append({field: date})
else:
history_data[key] = [{field: date}]
# Write the history back to the JSON file
with open(history_path, 'w') as file:
json.dump(history_data, file, indent=4)
# Update the training dataset with newly classified data
def update_training_dataset(output, candidate):
dataset_path = f"datasetntbnew_{candidate.lower().replace(' ', '_')}.xlsx"
history_path = f"history_{candidate.lower().replace(' ', '_')}.json"
try:
required_columns = ['model_data', 'Platform', 'komentar', 'link', 'kandidat', 'sentimen', 'tanggal', 'tanggal_masuk']
# If the caller passed model predictions, use them as the sentiment label
# (checked before the column subset so 'predicted_category' is still present)
if 'predicted_category' in output.columns:
output = output.copy()
output['sentimen'] = output['predicted_category']
output = output.drop(columns=['predicted_category'])
output = output[required_columns].copy()
output['tanggal_masuk'] = pd.Timestamp.now()
if os.path.exists(dataset_path):
existing_data = pd.read_excel(dataset_path)
else:
existing_data = pd.DataFrame(columns=required_columns)
# Add the 'update_ba' column if it does not exist yet
if 'update_ba' not in existing_data.columns:
existing_data['update_ba'] = None
# Add the 'missing_comment' column if it does not exist yet
if 'missing_comment' not in existing_data.columns:
existing_data['missing_comment'] = False
# Step 1: check for comments that have gone missing from the new data
train_comments = existing_data.groupby('link')['komentar'].apply(list).to_dict()
new_comments = output.groupby('link')['komentar'].apply(list).to_dict()
for link, comments in train_comments.items():
if link in new_comments:
new_comment_set = set(new_comments[link])
for comment in comments:
if comment not in new_comment_set:
existing_data.loc[(existing_data['link'] == link) & (existing_data['komentar'] == comment), 'missing_comment'] = True
else:
existing_data.loc[(existing_data['link'] == link) & (existing_data['komentar'] == comment), 'missing_comment'] = False
# Helper that merges the new data into the existing dataset
def update_data(existing_data, new_data, history_path):
for index, row in new_data.iterrows():
komentar = row['komentar']
link = row['link']
model_data = row['model_data']
tanggal_klasifikasi = pd.Timestamp.now()
# The same comment already exists for this link
existing_entry = existing_data[(existing_data['link'] == link) & (existing_data['komentar'] == komentar)]
if not existing_entry.empty:
existing_data.loc[existing_entry.index, 'update_ba'] = tanggal_klasifikasi
update_history_json(history_path, komentar, link, model_data, 'update_ba', tanggal_klasifikasi)
else:
# Same link, different comment
existing_link_entry = existing_data[(existing_data['link'] == link)]
if not existing_link_entry.empty:
new_row = row.copy()
new_row['tanggal_masuk'] = tanggal_klasifikasi
new_row['update_ba'] = tanggal_klasifikasi
existing_data = pd.concat([existing_data, new_row.to_frame().T], ignore_index=True)
update_history_json(history_path, komentar, link, model_data, 'tanggal_masuk', tanggal_klasifikasi)
update_history_json(history_path, komentar, link, model_data, 'update_ba', tanggal_klasifikasi)
else:
# New link
new_row = row.copy()
new_row['tanggal_masuk'] = tanggal_klasifikasi
new_row['update_ba'] = tanggal_klasifikasi
existing_data = pd.concat([existing_data, new_row.to_frame().T], ignore_index=True)
update_history_json(history_path, komentar, link, model_data, 'tanggal_masuk', tanggal_klasifikasi)
update_history_json(history_path, komentar, link, model_data, 'update_ba', tanggal_klasifikasi)
# Replace missing update_ba values with tanggal_masuk
existing_data['update_ba'] = pd.to_datetime(existing_data['update_ba'], errors='coerce')
existing_data['update_ba'] = existing_data['update_ba'].fillna(existing_data['tanggal_masuk'])
return existing_data
updated_data = update_data(existing_data, output, history_path)
updated_data.to_excel(dataset_path, index=False)
st.success(f"Data successfully updated in {candidate}'s training dataset.")
if 'missing_comment' in existing_data.columns and existing_data['missing_comment'].any():
st.subheader("Missing Comments")
st.write("Comments that were found to be missing:")
st.dataframe(existing_data[existing_data['missing_comment']])
except KeyError as e:
st.error(f"Missing column in the dataset: {e}")
except Exception as e:
st.error(f"An error occurred: {e}")
def clustering_based_evaluation(df, n_clusters=10):
st.write("Starting preprocessing...")
df['translated_emojis'] = df['komentar'].fillna('').astype(str).apply(translate_emojis)
df['normalized_unicode'] = df['translated_emojis'].apply(normalize_unicode)
df['reply_handled'] = df['normalized_unicode'].apply(handle_replies)
df['clean_text'] = df['reply_handled'].apply(clean_text)
df['translated_ntb'] = df['clean_text'].apply(lambda x: translate_text(x, ntb_dict))
df['translated_slang'] = df['translated_ntb'].apply(lambda x: translate_text(x, slang_dict))
df['negation_handled'] = df['translated_slang'].apply(handle_negation)
st.write("Generating TF-IDF vectors...")
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words=list(stop_words))  # use the Indonesian stopword list loaded above
tfidf_matrix = tfidf_vectorizer.fit_transform(df['negation_handled'])
st.write(f"Clustering into {n_clusters} clusters...")
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
df['Cluster'] = kmeans.fit_predict(tfidf_matrix)
st.write("Performing PCA for visualization...")
pca = PCA(n_components=2)
reduced_data = pca.fit_transform(tfidf_matrix.toarray())
df['PCA1'] = reduced_data[:, 0]
df['PCA2'] = reduced_data[:, 1]
st.write("Clustering completed successfully!")
return df
def load_and_process_data(dataset_path, history_path):
df = pd.read_excel(dataset_path)
df['tanggal_masuk'] = pd.to_datetime(df['tanggal_masuk'], errors='coerce')
if df['tanggal_masuk'].isnull().any():
st.warning("Some dates could not be parsed correctly. Please check the date format in the dataset.")
df['tanggal_masuk'].fillna(pd.Timestamp.now().strftime('%Y-%m-%d'), inplace=True)
required_columns = ['model_data', 'Platform', 'komentar', 'sentimen', 'tanggal', 'tanggal_masuk', 'evaluated_by_cluster']
for col in required_columns:
if col not in df.columns:
if col == 'tanggal_masuk':
df[col] = pd.Timestamp.now().strftime('%Y-%m-%d')
elif col == 'evaluated_by_cluster':
df[col] = False
else:
df[col] = None
df = df[required_columns]
try:
with open(history_path, "r") as f:
history = json.load(f)
except FileNotFoundError:
history = []
return df, history
# Define the function to handle Special Cluster
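# For each comment the keyword dictionary is checked first; if any keyword matches, that
# sentiment is assigned. Otherwise the comment is passed to the sarcasm detector, and if
# that also fails the comment is marked 'Unknown'. Caveat: the detector loaded further
# below is 'unitary/toxic-bert', which emits toxicity labels rather than a 'SARCASM'
# label, so the sarcasm branch is unlikely to ever match as configured.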
def handle_special_cluster(data, keywords, detector):
for index, row in data.iterrows():
text = row['negation_handled']
# Check whether the text contains any of the keywords
detected = False
for sentiment, words in keywords.items():
if any(word in text for word in words):
data.loc[index, 'predicted_category'] = sentiment.replace('_', ' ')
data.loc[index, 'detected_by'] = 'keyword'
detected = True
break
if not detected:
# If no keyword matches, fall back to sarcasm detection
result = detector(text)
if result[0]['label'] == 'SARCASM':
data.loc[index, 'predicted_category'] = 'co sarkastic'
data.loc[index, 'detected_by'] = 'sarcasm'
else:
# No matching sentiment at all
data.loc[index, 'predicted_category'] = 'Unknown'
data.loc[index, 'detected_by'] = 'unknown'
return data
def preprocess_text(text):
text = text.lower()
text = re.sub(r'\W+', ' ', text) # Remove non-alphanumeric characters
words = text.split()
words = [word for word in words if word not in stop_words]
return words
def display_word_frequencies(words, num_words):
st.subheader(f"Top {num_words} Words")
for word, freq in words:
st.write(f"{word}: {freq}")
def update_sentiment(index, new_sentimen):
df.loc[index, 'sentimen'] = new_sentimen
st.write(f"Updated sentiment for comment at index {index} to {new_sentimen}")
# Load the `Corrected Comments` sheet from an Excel file
def load_corrected_comments_from_excel(file_path):
try:
return pd.read_excel(file_path, sheet_name='Corrected Comments')
except FileNotFoundError:
return pd.DataFrame(columns=['model_data', 'Platform', 'komentar', 'tanggal', 'link', 'sentimen', 'Cluster_Name', 'corrected_by', 'tanggal_masuk'])
except Exception as e:
print(f"Error loading corrected comments: {e}")
return pd.DataFrame(columns=['model_data', 'Platform', 'komentar', 'tanggal', 'link', 'sentimen', 'Cluster_Name', 'corrected_by', 'tanggal_masuk'])
# Save the `Corrected Comments` data to an Excel file
def save_corrected_comments_to_excel(data, file_path):
with pd.ExcelWriter(file_path, engine='xlsxwriter') as writer:
data.to_excel(writer, sheet_name='Corrected Comments', index=False)
# Excel file used to store the `Corrected Comments` data
corrected_comments_file = 'corrected_comments.xlsx'
# Show how comments are distributed across the clusters
def display_cluster_visualization(ba_lainnya_data):
st.subheader("Cluster Visualization")
# Count comments in the `Similar Sentiment` and `Special Cluster` categories
cluster_counts = ba_lainnya_data[ba_lainnya_data['Cluster_Name'].str.contains('Similar|Special Cluster')]['Cluster_Name'].value_counts()
# Draw a bar chart
plt.figure(figsize=(10, 6))
plt.bar(cluster_counts.index, cluster_counts.values, color=['blue', 'green', 'orange', 'red', 'purple'])
plt.xlabel('Cluster Name')
plt.ylabel('Number of Comments')
plt.title('Distribution of Comments in Similar Sentiment and Special Cluster')
plt.xticks(rotation=45)
plt.show()
st.pyplot(plt)
def run_clustering_for_ba_lainnya():
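# Workflow: take the low-confidence ('BA Lainnya') comments stored in session state,
# preprocess them, and compare them against each labeled sentiment from the training set
# via pairwise KMeans clustering. Comments that stay dissimilar to every sentiment end up
# in a 'Special Cluster' handled by keywords and the sarcasm detector. The page then
# offers manual, per-cluster, and keyword-rule corrections, and finally lets the user
# push the corrected comments back into the training dataset.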
st.title("Clustering for 'BA Lainnya'")
if 'data_with_ba' not in st.session_state:
st.error("No 'BA Lainnya' data found from the classification model. Please classify comments first.")
st.stop()
ba_lainnya_data = st.session_state['data_with_ba']
st.write(f"**'BA Lainnya' Data:** {len(ba_lainnya_data)} rows")
with open('keywords.json', 'r') as f:
keyword_dict = json.load(f)
selected_candidate = st.session_state['candidate']
candidate_keywords = keyword_dict.get(selected_candidate.replace(' ', '_'))
if candidate_keywords is None:
st.error("Keywords for the selected candidate not found.")
st.stop()
sarcasm_detector = pipeline('sentiment-analysis', model='unitary/toxic-bert')
dataset_path = f"datasetntbnew_{selected_candidate.lower().replace(' ', '_')}.xlsx"
corrected_comments_file = f"corrected_comments_{selected_candidate.lower().replace(' ', '_')}.xlsx"
try:
train_data = pd.read_excel(dataset_path)
labeled_data = train_data[train_data['sentimen'].isin(['Co Likes', 'Co Support', 'Co Optimism', 'Co Negative', 'Co Sarkastic'])]
st.write(f"**Labeled Data from Training Dataset:** {len(labeled_data)} rows")
ba_lainnya_data['Cluster'] = None
ba_lainnya_data['detected_by'] = None
ba_lainnya_data['Cluster_Name'] = None
ba_lainnya_data['corrected_by'] = "Not Corrected"
ba_lainnya_data['Sentiment'] = None  # Initialize to None so no incorrect default value is carried over
ba_lainnya_data['Cluster_Name'] = ba_lainnya_data['Cluster_Name'].astype(str)
ba_lainnya_data['corrected_by'] = ba_lainnya_data['corrected_by'].astype(str)
ba_lainnya_data['Sentiment'] = ba_lainnya_data['Sentiment'].astype(str)
for data in [ba_lainnya_data, labeled_data]:
data['translated_emojis'] = data['komentar'].fillna('').astype(str).apply(translate_emojis)
data['normalized_unicode'] = data['translated_emojis'].apply(normalize_unicode)
data['reply_handled'] = data['normalized_unicode'].apply(handle_replies)
data['clean_text'] = data['reply_handled'].apply(clean_text)
data['translated_ntb'] = data['clean_text'].apply(lambda x: translate_text(x, ntb_dict))
data['translated_slang'] = data['translated_ntb'].apply(lambda x: translate_text(x, slang_dict))
data['negation_handled'] = data['translated_slang'].apply(handle_negation)
data['negation_handled'] = data['negation_handled'].fillna('')
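# Pairwise clustering: for every labeled sentiment, the 'BA Lainnya' comments are pooled
# with that sentiment's labeled comments and split into two KMeans clusters. Cluster 0 is
# then labelled "<sentiment> Similar" and cluster 1 "<sentiment> Dissimilar". Keep in mind
# that KMeans cluster IDs are arbitrary, so treating cluster 0 as the "similar" group is a
# heuristic rather than a guarantee.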
combined_data = ba_lainnya_data.copy()
combined_data['Label'] = 'BA Lainnya'
for sentimen in ['Co Likes', 'Co Support', 'Co Optimism', 'Co Negative', 'Co Sarkastic']:
sentimen_data = labeled_data[labeled_data['sentimen'] == sentimen].copy()
sentimen_data['Label'] = sentimen
combined = pd.concat([combined_data, sentimen_data], ignore_index=True)
if len(combined) < 2:
st.warning(f"Not enough samples to cluster for {sentimen}.")
continue
vectorizer = TfidfVectorizer(ngram_range=(1, 1), max_features=5000)
tfidf_matrix = vectorizer.fit_transform(combined['negation_handled'])
st.write(f"Clustering 'BA Lainnya' comments similar to {sentimen}...")
kmeans = KMeans(n_clusters=2, random_state=42)
combined['Cluster'] = kmeans.fit_predict(tfidf_matrix)
valid_indices = combined.index[:len(ba_lainnya_data)]
valid_indices = valid_indices.intersection(ba_lainnya_data.index)
ba_lainnya_data.loc[valid_indices, 'Cluster'] = combined.loc[valid_indices, 'Cluster']
ba_lainnya_data.loc[ba_lainnya_data['Cluster'] == 0, 'Cluster_Name'] = f"{sentimen} Similar"
ba_lainnya_data.loc[ba_lainnya_data['Cluster'] == 1, 'Cluster_Name'] = f"{sentimen} Dissimilar"
ba_lainnya_data.loc[valid_indices, 'Sentiment'] = sentimen
for index, row in ba_lainnya_data.iterrows():
if row['Cluster_Name'].endswith('Dissimilar') or row['Cluster_Name'] == 'None':
dissimilar_comment = ba_lainnya_data.loc[[index]].copy()
for sentimen in ['Co Likes', 'Co Support', 'Co Optimism', 'Co Negative', 'Co Sarkastic']:
sentimen_data = labeled_data[labeled_data['sentimen'] == sentimen].copy()
combined = pd.concat([dissimilar_comment, sentimen_data], ignore_index=True)
if len(combined) < 2:
continue
tfidf_matrix = vectorizer.fit_transform(combined['negation_handled'])
if tfidf_matrix.shape[0] == 0:
continue
kmeans = KMeans(n_clusters=2, random_state=42)
combined['Cluster'] = kmeans.fit_predict(tfidf_matrix)
if len(combined) > 0 and combined.loc[0, 'Cluster'] == 0:
ba_lainnya_data.loc[index, 'Cluster_Name'] = f"{sentimen} Similar"
ba_lainnya_data.loc[index, 'Sentiment'] = sentimen
break
else:
ba_lainnya_data.loc[index, 'Cluster_Name'] = 'Special Cluster'
ba_lainnya_data.loc[index, 'corrected_by'] = 'Special Cluster'
ba_lainnya_data.loc[index, 'Sentiment'] = 'Special Sentiment'
ba_lainnya_data['Cluster_Name'] = ba_lainnya_data['Cluster_Name'].apply(lambda x: 'Special Cluster' if x == 'nan' else x)
special_cluster_data = ba_lainnya_data[ba_lainnya_data['Cluster_Name'] == 'Special Cluster']
if not special_cluster_data.empty:
special_cluster_data = handle_special_cluster(special_cluster_data, candidate_keywords, sarcasm_detector)
ba_lainnya_data.update(special_cluster_data)
ba_lainnya_data.loc[special_cluster_data.index, 'corrected_by'] = 'Special Cluster'
ba_lainnya_data.loc[special_cluster_data.index, 'Sentiment'] = 'Special Sentiment'
st.warning("Some comments were not captured by the current keywords. Please add new keywords in the 'Update Keywords' section.")
st.subheader("Detection Distribution in Special Cluster")
detection_counts = special_cluster_data['detected_by'].value_counts()
plt.figure(figsize=(10, 6))
plt.bar(detection_counts.index, detection_counts.values, color=['blue', 'orange', 'red'])
plt.xlabel('Detection Method')
plt.ylabel('Number of Comments')
plt.title('Detection Distribution in Special Cluster')
plt.show()
st.pyplot(plt)
st.write("Top Keywords in Special Cluster")
for sentiment, keywords in candidate_keywords.items():
st.write(f"{sentiment}: {', '.join(keywords)}")
st.subheader("Special Cluster Details")
st.dataframe(special_cluster_data[['komentar', 'Cluster_Name', 'detected_by']])
corrected_comments = load_corrected_comments_from_excel(corrected_comments_file)
display_cluster_visualization(ba_lainnya_data)
st.subheader("Edit Sentiment in Table")
# Flag whether the comment is already present in corrected_comments
ba_lainnya_data['Already Corrected'] = ba_lainnya_data['komentar'].isin(corrected_comments['komentar'])
ba_lainnya_data['Corrected Sentiment'] = ba_lainnya_data.apply(
lambda row: corrected_comments[corrected_comments['komentar'] == row['komentar']]['sentimen'].values[0]
if row['Already Corrected'] else "", axis=1
)
# Add a checkbox column for selecting the comments to edit
ba_lainnya_data['Select'] = False
# Dropdown to pick a cluster (bulk selection)
selected_cluster = st.selectbox("Select a cluster to edit:", sorted(ba_lainnya_data['Cluster_Name'].unique()))
# If a cluster is selected, auto-select every comment in that cluster
if selected_cluster:
ba_lainnya_data.loc[ba_lainnya_data['Cluster_Name'] == selected_cluster, 'Select'] = True
# Show the table with checkboxes and sentiment information
edited_data = st.data_editor(
ba_lainnya_data[['Select', 'komentar', 'Cluster_Name', 'Sentiment', 'Already Corrected', 'Corrected Sentiment']],
column_config={
"Select": st.column_config.CheckboxColumn("✅ Select"),
"komentar": st.column_config.TextColumn("Komentar"),
"Cluster_Name": st.column_config.TextColumn("Cluster"),
"Sentiment": st.column_config.TextColumn("Sentimen"),
"Already Corrected": st.column_config.CheckboxColumn("✅ Sudah di Corrected?"),
"Corrected Sentiment": st.column_config.TextColumn("Sentimen Corrected"),
},
disabled=["komentar", "Cluster_Name", "Already Corrected", "Corrected Sentiment"], # Kolom yang gak bisa diedit
use_container_width=True,
)
# Dropdown to pick the new sentiment to apply to the checked comments
new_sentimen = st.selectbox("Select new sentiment for selected comments:",
['Co Likes', 'Co Support', 'Co Optimism', 'Co Negative', 'Co Sarkastic'])
# When the button is pressed, update the sentiment of the checked comments
if st.button("Update Sentiments"):
selected_rows = edited_data[edited_data['Select']]  # Take only the checked rows
if selected_rows.empty:
st.warning("No comments selected for update!")
else:
for index, row in selected_rows.iterrows():
ba_lainnya_data.at[index, 'Sentiment'] = new_sentimen
ba_lainnya_data.at[index, 'corrected_by'] = 'Manual Edit'
if ba_lainnya_data.at[index, 'komentar'] in corrected_comments['komentar'].values:
corrected_comments.loc[corrected_comments['komentar'] == ba_lainnya_data.at[index, 'komentar'], 'sentimen'] = new_sentimen
else:
new_entry = ba_lainnya_data.loc[[index]].copy()
new_entry['sentimen'] = new_sentimen
corrected_comments = pd.concat([corrected_comments, new_entry])
corrected_comments.drop_duplicates(subset=['komentar'], keep='last', inplace=True)
save_corrected_comments_to_excel(corrected_comments, corrected_comments_file)
st.success("Sentiments updated successfully!")
st.rerun()
st.subheader("Clustered Data")
selected_cluster = st.selectbox("Select a cluster to view comments:", sorted(ba_lainnya_data['Cluster_Name'].unique()))
cluster_comments = ba_lainnya_data[ba_lainnya_data['Cluster_Name'] == selected_cluster]
st.dataframe(cluster_comments[['komentar', 'Cluster_Name']].head(100))
new_sentimen = st.selectbox("Select new sentiment for this cluster:", ['Co Likes', 'Co Support', 'Co Optimism', 'Co Negative', 'Co Sarkastic'])
if st.button("Update Sentiment for this cluster"):
ba_lainnya_data.loc[ba_lainnya_data['Cluster_Name'] == selected_cluster, 'corrected_by'] = 'Batch Cluster'
ba_lainnya_data.loc[ba_lainnya_data['Cluster_Name'] == selected_cluster, 'sentimen'] = new_sentimen
st.success(f"Sentiment for cluster {selected_cluster} updated to {new_sentimen}")
# Save and refresh Corrected Comments table and Cluster Visualization
corrected_comments = pd.concat([corrected_comments, ba_lainnya_data[ba_lainnya_data['corrected_by'] != "Not Corrected"]])
corrected_comments.drop_duplicates(subset=['komentar'], keep='last', inplace=True)
save_corrected_comments_to_excel(corrected_comments, corrected_comments_file)
st.subheader("Corrected Comments")
st.dataframe(corrected_comments[['komentar', 'Cluster_Name', 'corrected_by', 'sentimen']].head(100))
display_cluster_visualization(ba_lainnya_data)
st.subheader("Special Rules Based on Keywords")
keyword = st.text_input("Enter a keyword to set a rule:")
specific_cluster = st.selectbox("Select a cluster for this keyword:", sorted(ba_lainnya_data['Cluster_Name'].unique()))
if keyword:
new_cluster = st.selectbox("Select sentiment for this keyword:", ['Co Likes', 'Co Support', 'Co Optimism', 'Co Negative', 'Co Sarkastic'])
if st.button("Apply Rule"):
ba_lainnya_data.loc[ba_lainnya_data['komentar'].str.contains(keyword, case=False, na=False), 'Cluster_Name'] = new_cluster
ba_lainnya_data.loc[ba_lainnya_data['komentar'].str.contains(keyword, case=False, na=False), 'detected_by'] = specific_cluster
ba_lainnya_data.loc[ba_lainnya_data['komentar'].str.contains(keyword, case=False, na=False), 'corrected_by'] = 'Keyword Rule'
ba_lainnya_data.loc[ba_lainnya_data['komentar'].str.contains(keyword, case=False, na=False), 'sentimen'] = new_cluster
st.success(f"All comments containing '{keyword}' have been updated to '{new_cluster}' sentiment.")
# Update keywords.json file to avoid duplicates
if selected_candidate.replace(' ', '_') in keyword_dict:
if new_cluster in keyword_dict[selected_candidate.replace(' ', '_')]:
if keyword not in keyword_dict[selected_candidate.replace(' ', '_')][new_cluster]:
keyword_dict[selected_candidate.replace(' ', '_')][new_cluster].append(keyword)
else:
keyword_dict[selected_candidate.replace(' ', '_')][new_cluster] = [keyword]
else:
keyword_dict[selected_candidate.replace(' ', '_')] = {new_cluster: [keyword]}
with open('keywords.json', 'w') as f:
json.dump(keyword_dict, f)
st.success(f"Keyword '{keyword}' has been added to the keyword list.")
# Save and refresh Corrected Comments table and Cluster Visualization
corrected_comments = pd.concat([corrected_comments, ba_lainnya_data[ba_lainnya_data['corrected_by'] != "Not Corrected"]])
corrected_comments.drop_duplicates(subset=['komentar'], keep='last', inplace=True)
save_corrected_comments_to_excel(corrected_comments, corrected_comments_file)
st.subheader("Corrected Comments")
st.dataframe(corrected_comments[['komentar', 'Cluster_Name', 'corrected_by', 'sentimen']].head(100))
display_cluster_visualization(ba_lainnya_data)
st.subheader("Corrected Comments")
corrected_comments = load_corrected_comments_from_excel(corrected_comments_file)
st.dataframe(corrected_comments[['komentar', 'Cluster_Name', 'corrected_by', 'sentimen']].head(100))
st.subheader("Visual Representation of Corrected Comments")
sentiment_counts = corrected_comments['sentimen'].value_counts()
plt.figure(figsize=(10, 6))
plt.bar(sentiment_counts.index, sentiment_counts.values, color=['blue', 'green', 'orange', 'red', 'purple'])
plt.xlabel('Sentimen')
plt.ylabel('Number of Corrected Comments')
plt.title('Number of Corrected Comments by Sentiment')
plt.show()
st.pyplot(plt)
st.subheader("Download Options")
excel_buffer_cluster = io.BytesIO()
with pd.ExcelWriter(excel_buffer_cluster, engine='xlsxwriter') as writer:
ba_lainnya_data.to_excel(writer, index=False, sheet_name='Clustered Data')
excel_buffer_cluster.seek(0)
st.download_button(
label=f"Download Clustered Data for {selected_candidate}",
data=excel_buffer_cluster,
file_name=f"clustered_data_{selected_candidate}.xlsx",
mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
)
st.subheader("Save Corrected Comments to Training Dataset")
if st.button("Save Corrected Comments"):
try:
# Save only the corrected results
corrected_comments['tanggal_masuk'] = pd.Timestamp.now().strftime('%Y-%m-%d')
update_training_dataset(corrected_comments, st.session_state['candidate'])
st.success("Corrected comments have been successfully saved to the training dataset.")
# Clear corrected comments from session state to avoid duplicate saves
st.session_state['data_with_ba'] = pd.DataFrame(columns=corrected_comments.columns)
# Rerun automatically after saving to refresh the state
st.rerun()
except Exception as e:
st.error(f"An error occurred while saving the corrected comments: {e}")
except FileNotFoundError:
st.error(f"No dataset found for {selected_candidate}. Please add data to create the dataset.")
except Exception as e:
st.error(f"An unexpected error occurred: {e}")
# === Sidebar Navigation === #
menu = st.sidebar.radio("Select a Feature", ["Model-Based Classification", "Clustering for 'BA Lainnya'", "Update Keywords", "View Training Dataset", "Evaluate Data Train", "Maximize Preprocessing", "Retraining Model"])
if menu == "Model-Based Classification":
st.title("Model-Based Classification")
candidate = st.selectbox("Choose a candidate:", candidate_list)
model_path = f"best_rf_model_{candidate.replace(' ', '_').lower()}.joblib"
vectorizer_path = f"tfidf_vectorizer_{candidate.replace(' ', '_').lower()}.joblib"
# Save the selected candidate to session state
st.session_state['candidate'] = candidate
uploaded_file = st.file_uploader("Upload an Excel file for classification", type=['xlsx'])
@st.cache_data
def load_model_and_vectorizer(model_path, vectorizer_path):
"""Load model and vectorizer, cache them for efficiency."""
try:
model = joblib.load(model_path)
vectorizer = joblib.load(vectorizer_path)
return model, vectorizer
except FileNotFoundError:
return None, None
model, vectorizer = load_model_and_vectorizer(model_path, vectorizer_path)
if model is None or vectorizer is None:
st.error("Model or vectorizer not found for the selected candidate.")
st.stop()
# Save the vectorizer and model to session state
st.session_state['vectorizer'] = vectorizer
st.session_state['model'] = model
@st.cache_data
def preprocess_data(data):
"""Preprocess comments with batching for large datasets."""
from joblib import Parallel, delayed
def preprocess_batch(batch):
batch['translated_emojis'] = batch['komentar'].apply(translate_emojis)
batch['normalized_unicode'] = batch['translated_emojis'].apply(normalize_unicode)
batch['reply_handled'] = batch['normalized_unicode'].apply(handle_replies)
batch['clean_text'] = batch['reply_handled'].apply(clean_text)
batch['translated_ntb'] = batch['clean_text'].apply(lambda x: translate_text(x, ntb_dict))
batch['translated_slang'] = batch['translated_ntb'].apply(lambda x: translate_text(x, slang_dict))
batch['negation_handled'] = batch['translated_slang'].apply(handle_negation)
return batch
batch_size = 10000 # Process 10,000 rows at a time
batches = [data.iloc[i:i+batch_size] for i in range(0, len(data), batch_size)]
processed_batches = Parallel(n_jobs=-1)(delayed(preprocess_batch)(batch) for batch in batches)
return pd.concat(processed_batches, ignore_index=True)
# Process uploaded file
if uploaded_file:
try:
data = pd.read_excel(uploaded_file)
if 'komentar' not in data.columns:
st.error("The uploaded file must include a 'komentar' column.")
st.stop()
data = preprocess_data(data)
except Exception as e:
st.error(f"An error occurred while processing the file: {e}")
st.stop()
elif 'model_classified_data' in st.session_state:
data = st.session_state['model_classified_data']
else:
st.info("Please upload a file for classification.")
st.stop()
# Transform comments into TF-IDF vectors
try:
tfidf_data = vectorizer.transform(data['negation_handled'].fillna(''))
data['predicted_category'] = model.predict(tfidf_data)
data['probabilities'] = model.predict_proba(tfidf_data).tolist()
data['max_probability'] = data['probabilities'].apply(lambda x: max(x))
except Exception as e:
st.error(f"An error occurred during model prediction: {e}")
st.stop()
# Cache classified data
st.session_state['model_classified_data'] = data
# Interactive threshold adjustment
st.subheader("Set Threshold for 'BA Lainnya'")
threshold = st.slider("Threshold for tagging 'BA Lainnya'", min_value=0.0, max_value=1.0, value=0.80, step=0.01)
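# Comments whose highest class probability falls below this threshold are tagged
# 'BA Lainnya' (low confidence) and handled later by the clustering workflow; the rest
# keep the category predicted by the model.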
# Apply threshold to tag "BA Lainnya"
data['tag'] = data['max_probability'].apply(lambda x: 'BA Lainnya' if x < threshold else '')
# Separate data for visualization
data_without_ba = data[data['tag'] != 'BA Lainnya']
data_with_ba = data[data['tag'] == 'BA Lainnya']
# Save updated results to session state for dynamic updates
st.session_state['data_without_ba'] = data_without_ba
st.session_state['data_with_ba'] = data_with_ba
# Preview Results
st.subheader("Preview Results")
st.write("### 1. Hasil Klasifikasi Tanpa Tag 'BA Lainnya'")
if not data_without_ba.empty:
st.dataframe(data_without_ba[['komentar', 'predicted_category', 'max_probability']])
else:
st.info("No high-probability classifications available.")
st.write("### 2. Hasil Klasifikasi Dengan Tag 'BA Lainnya'")
if not data_with_ba.empty:
st.dataframe(data_with_ba[['komentar', 'predicted_category', 'max_probability']])
else:
st.info("No low-probability classifications available.")
# Visualization: Sentiment Distribution
st.subheader("Sentiment Distribution Visualization")
def plot_distribution(data, title):
sentiment_counts = data['predicted_category'].value_counts()
fig, ax = plt.subplots()
ax.bar(sentiment_counts.index, sentiment_counts.values)
ax.set_title(title)
ax.set_xlabel("Sentiments")
ax.set_ylabel("Count")
st.pyplot(fig)
if not data_without_ba.empty:
plot_distribution(data_without_ba, "Sentiment Distribution (Without 'BA Lainnya')")
if not data_with_ba.empty:
plot_distribution(data_with_ba, "Sentiment Distribution (With 'BA Lainnya')")
# Download Results
st.subheader("Download Results")
excel_buffer = io.BytesIO()
with pd.ExcelWriter(excel_buffer, engine='xlsxwriter') as writer:
data.to_excel(writer, index=False, sheet_name='Classification Results')
excel_buffer.seek(0)
st.download_button(
label="Download All Classification Results",
data=excel_buffer,
file_name=f"classification_results_{candidate}.xlsx",
mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
)
# Save Non-'BA Lainnya' Data to Training Dataset
st.subheader("Save Classified Data")
if not data_without_ba.empty:
st.info("Only Non-'BA Lainnya' comments will be saved to the training dataset.")
if st.button("Save Non-'BA Lainnya' Data to Training Dataset"):
try:
data_to_save = data_without_ba[['model_data', 'Platform', 'komentar', 'link', 'kandidat', 'tanggal']].copy()
data_to_save['sentimen'] = data_without_ba['predicted_category']
data_to_save['tanggal_masuk'] = pd.Timestamp.now().strftime('%Y-%m-%d')
update_training_dataset(data_to_save, candidate)
st.success("Data successfully saved to the training dataset.")
except Exception as e:
st.error(f"An error occurred while saving the data: {e}")
else:
st.info("No Non-'BA Lainnya' data available to save.")
pass # Placeholder
# Integration of the clustering workflow into the "Clustering for 'BA Lainnya'" page
if menu == "Clustering for 'BA Lainnya'":
selected_candidate = st.session_state.get('selected_candidate')  # not used further; run_clustering_for_ba_lainnya reads st.session_state['candidate'] itself
run_clustering_for_ba_lainnya()
pass # Placeholder
# Only run this code when the selected menu is "View Training Dataset"
if menu == "View Training Dataset":
st.title("View Training Dataset")
# Header for choosing a candidate
st.header("Options")
selected_candidate = st.selectbox("Choose a candidate:", list(candidate_list), key='candidate_select_view')
# Dataset paths
dataset_path = f"datasetntbnew_{selected_candidate.lower().replace(' ', '_')}.xlsx"
history_path = f"history_{selected_candidate.lower().replace(' ', '_')}.json"
# Load the dataset
try:
df = pd.read_excel(dataset_path)
# Make sure the required columns exist
required_columns = ['model_data', 'Platform', 'komentar', 'sentimen', 'tanggal', 'tanggal_masuk', 'link', 'evaluated_by_data_train']
for col in required_columns:
if col not in df.columns:
if col == 'evaluated_by_data_train':
df[col] = False
else:
df[col] = None
# Add the 'update_ba' column if it does not exist yet
if 'update_ba' not in df.columns:
df['update_ba'] = None
# Add the 'missing_comment' column if it does not exist yet
if 'missing_comment' not in df.columns:
df['missing_comment'] = False
# Make sure 'tanggal_masuk' and 'tanggal' are in a proper datetime format
df['tanggal_masuk'] = pd.to_datetime(df['tanggal_masuk'], errors='coerce')
df['tanggal'] = pd.to_datetime(df['tanggal'], errors='coerce')
# Keep only the date part of the date columns, dropping the time component
df['tanggal'] = df['tanggal'].dt.date
df['tanggal_masuk'] = df['tanggal_masuk'].dt.date
df['update_ba'] = pd.to_datetime(df['update_ba'], errors='coerce').dt.date
# Handle NaT (Not a Time) values, if any
if df['tanggal_masuk'].isnull().any():
st.warning("Some dates 'tanggal_masuk' could not be parsed correctly. Please check the date format in the dataset.")
df['tanggal_masuk'] = df['tanggal_masuk'].fillna(pd.Timestamp.now().date())
if df['tanggal'].isnull().any():
st.warning("Some dates 'tanggal' could not be parsed correctly. Please check the date format in the dataset.")
df['tanggal'] = df['tanggal'].fillna(pd.Timestamp.now().date())
# Add the 'kandidat' column if it is missing, and fill it
if 'kandidat' not in df.columns:
df['kandidat'] = selected_candidate
# Keep only the required subset of columns
df = df[required_columns + ['update_ba', 'kandidat', 'missing_comment']]
# Replace missing update_ba values with tanggal_masuk
df['update_ba'] = df['update_ba'].fillna(df['tanggal_masuk'])
# Show basic statistics
st.subheader(f"Training Dataset for {selected_candidate}")
st.write(f"**Total rows in dataset:** {len(df)}")
if not df.empty:
# Visualization of posts by BA update date
st.subheader("Visualisasi Postingan Berdasarkan Update BA")
ba_update_counts = df['update_ba'].value_counts().sort_index()
fig, ax = plt.subplots(figsize=(10, 6))
ba_update_counts.plot(kind='bar', ax=ax, color='blue')
ax.set_title('Sebaran Postingan Berdasarkan Update BA')
ax.set_xlabel('Tanggal Update BA')
ax.set_ylabel('Jumlah Postingan')
plt.xticks(rotation=45)
plt.tight_layout()
st.pyplot(fig)
# Additional visualization of the platform distribution
st.subheader("Sebaran Platform Berdasarkan Update BA")
platform_counts = df['Platform'].value_counts()
fig, ax = plt.subplots(figsize=(10, 6))
platform_counts.plot(kind='bar', ax=ax, color='green')
ax.set_title('Sebaran Platform Berdasarkan Update BA')
ax.set_xlabel('Platform')
ax.set_ylabel('Jumlah Postingan')
plt.xticks(rotation=45)
plt.tight_layout()
st.pyplot(fig)
# Missing comments per platform
st.subheader("Jumlah Komentar Hilang Berdasarkan Platform")
missing_comments_by_platform = df.groupby('Platform')['missing_comment'].sum().sort_index()
fig, ax = plt.subplots(figsize=(10, 6))
missing_comments_by_platform.plot(kind='bar', ax=ax, color='red')
ax.set_title('Jumlah Komentar Hilang Berdasarkan Platform')
ax.set_xlabel('Platform')
ax.set_ylabel('Jumlah Komentar Hilang')
plt.xticks(rotation=45)
plt.tight_layout()
st.pyplot(fig)
# Filter by validation status
st.subheader("Filter Data")
validation_filter = st.radio(
"Choose data type to view:",
["All Data", "Validated Data", "Non-Validated Data"],
key='validation_filter'
)
if validation_filter == "Validated Data":
filtered_data = df[df['evaluated_by_data_train'] == True]
elif validation_filter == "Non-Validated Data":
filtered_data = df[df['evaluated_by_data_train'] == False]
else:
filtered_data = df
if not filtered_data.empty:
st.subheader(f"Filtered Data: {validation_filter}")
st.dataframe(filtered_data)  # Show every row that matches the filter
else:
st.warning("Tidak ada data yang sesuai dengan filter yang dipilih.")
# Show the history of data additions
st.subheader("History of Data Additions")
try:
with open(history_path, "r") as f:
history = json.load(f)
history_list = []
for key, value in history.items():
for entry in value:
for k, v in entry.items():
history_list.append({
'key': key,
'field': k,
'date': v
})
history_df = pd.DataFrame(history_list)
st.dataframe(history_df)
except FileNotFoundError:
st.write("No addition history available.")
except ValueError as e:
st.error(f"An error occurred while loading history data: {e}")
# Option to download the filtered dataset
st.subheader("Download Options")
if not filtered_data.empty:
excel_buffer = io.BytesIO()
with pd.ExcelWriter(excel_buffer, engine='xlsxwriter') as writer:
filtered_data.to_excel(writer, index=False, sheet_name='Filtered Dataset')
excel_buffer.seek(0)
st.download_button(
label=f"Download Filtered Dataset for {selected_candidate}",
data=excel_buffer,
file_name=f"filtered_training_dataset_{selected_candidate}.xlsx",
mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
)
# Option to download the full dataset
if not df.empty:
excel_buffer_full = io.BytesIO()
with pd.ExcelWriter(excel_buffer_full, engine='xlsxwriter') as writer:
df.to_excel(writer, index=False, sheet_name='Training Dataset')
excel_buffer_full.seek(0)
st.download_button(
label=f"Download Full Training Dataset for {selected_candidate}",
data=excel_buffer_full,
file_name=f"training_dataset_{selected_candidate}.xlsx",
mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
)
# Feature to download the dataset filtered by date range
st.subheader("Download Aggregated Data by Date")
# Use the min and max dates as defaults for the date_input widgets
if not df['tanggal'].empty:
min_date = df['tanggal'].min()
max_date = df['tanggal'].max()
selected_start_date = st.date_input("Select start date for aggregation:", value=min_date)
selected_end_date = st.date_input("Select end date for aggregation:", value=max_date)
agg_filtered_data = df[(df['tanggal'] >= selected_start_date) & (df['tanggal'] <= selected_end_date)]
if not agg_filtered_data.empty:
# Make sure the 'kandidat' column exists and is filled
agg_filtered_data['kandidat'] = selected_candidate
aggregated_data = agg_filtered_data.groupby(['kandidat', 'link', 'tanggal', 'Platform', 'model_data', 'update_ba']).agg(
co_likes=('sentimen', lambda x: (x == 'Co Likes').sum()),
co_support=('sentimen', lambda x: (x == 'Co Support').sum()),
co_optimism=('sentimen', lambda x: (x == 'Co Optimism').sum()),
co_sarkastic=('sentimen', lambda x: (x == 'Co Sarkastic').sum()),
co_negative=('sentimen', lambda x: (x == 'Co Negative').sum()),
missing_comments=('missing_comment', 'sum')  # Aggregate the missing_comment flags
).reset_index()
# Add new columns for the comment counts
aggregated_data['jumlah_komentar_positif'] = aggregated_data['co_likes'] + aggregated_data['co_support'] + aggregated_data['co_optimism']
aggregated_data['jumlah_komentar_negatif'] = aggregated_data['co_sarkastic'] + aggregated_data['co_negative']
aggregated_data['jumlah_komentar'] = aggregated_data[['co_likes', 'co_support', 'co_optimism', 'co_sarkastic', 'co_negative']].sum(axis=1)
st.dataframe(aggregated_data)
# Posts updated by BA, with platform distribution, for the selected date range
st.subheader("Visualisasi Postingan yang Diupdate BA Berdasarkan Rentang Tanggal")
ba_update_range = aggregated_data[aggregated_data['update_ba'] != 'Belum diupdate']
if not ba_update_range.empty:
plt.figure(figsize=(10, 6))
ba_update_range['Platform'].value_counts().plot(kind='bar', title='Sebaran Platform - Diupdate BA (Rentang Tanggal)')
plt.xlabel('Platform')
plt.ylabel('Jumlah Postingan')
st.pyplot(plt)
# Additional visualization of posts by date
st.subheader("Sebaran Postingan Berdasarkan Tanggal")
plt.figure(figsize=(10, 6))
ba_update_range['tanggal'].value_counts().sort_index().plot(kind='bar', title='Sebaran Postingan Berdasarkan Tanggal')
plt.xlabel('Tanggal')
plt.ylabel('Jumlah Postingan')
plt.xticks(rotation=45)
plt.tight_layout()
st.pyplot(plt)
# Additional visualization of BA updates
st.subheader("Sebaran Update BA")
plt.figure(figsize=(10, 6))
ba_update_range['update_ba'].value_counts().sort_index().plot(kind='bar', title='Sebaran Update BA')
plt.xlabel('Tanggal Update BA')
plt.ylabel('Jumlah Postingan')
plt.xticks(rotation=45)
plt.tight_layout()
st.pyplot(plt)
# Comment counts by tanggal_masuk
st.subheader("Jumlah Komentar Berdasarkan Tanggal Masuk")
plt.figure(figsize=(10, 6))
agg_filtered_data.groupby('tanggal_masuk')['komentar'].count().sort_index().plot(kind='bar', title='Jumlah Komentar Berdasarkan Tanggal Masuk')
plt.xlabel('Tanggal Masuk')
plt.ylabel('Jumlah Komentar')
plt.xticks(rotation=45)
plt.tight_layout()
st.pyplot(plt)
# Comment distribution per platform
st.subheader("Sebaran Komentar di Tiap Platform")
plt.figure(figsize=(10, 6))
agg_filtered_data['Platform'].value_counts().plot(kind='bar', title='Sebaran Komentar di Tiap Platform')
plt.xlabel('Platform')
plt.ylabel('Jumlah Komentar')
plt.xticks(rotation=45)
plt.tight_layout()
st.pyplot(plt)
# Missing comments per post
st.subheader("Jumlah Komentar Hilang Berdasarkan Postingan")
plt.figure(figsize=(10, 6))
aggregated_data.groupby('link')['missing_comments'].sum().sort_index().plot(kind='bar', title='Jumlah Komentar Hilang Berdasarkan Postingan')
plt.xlabel('Link')
plt.ylabel('Jumlah Komentar Hilang')
plt.xticks(rotation=45)
plt.tight_layout()
st.pyplot(plt)
else:
st.warning("Tidak ada data yang diupdate BA untuk rentang tanggal yang dipilih.")
# Option to download the aggregated dataset
excel_buffer_aggregated = io.BytesIO()
with pd.ExcelWriter(excel_buffer_aggregated, engine='xlsxwriter') as writer:
aggregated_data.to_excel(writer, index=False, sheet_name='Aggregated Data')
excel_buffer_aggregated.seek(0)
st.download_button(
label=f"Download Aggregated Data by Date for {selected_candidate}",
data=excel_buffer_aggregated,
file_name=f"aggregated_data_{selected_candidate}.xlsx",
mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
)
else:
st.warning("Tidak ada data yang sesuai dengan rentang tanggal yang dipilih untuk diagregasi.")
else:
st.warning("Tidak ada data dalam dataset untuk divisualisasikan.")
else:
st.warning("Tidak ada data dalam dataset untuk divisualisasikan.")
except FileNotFoundError:
st.error(f"No training dataset found for {selected_candidate}. Please add data to create the dataset.")
except Exception as e:
st.error(f"An error occurred: {e}")
pass
if menu == "Evaluate Data Train":
st.title("Evaluate Data Train")
selected_candidate = st.selectbox("Choose a candidate:", list(candidate_list), key='candidate_select_evaluate')
dataset_path = f"datasetntbnew_{selected_candidate.lower().replace(' ', '_')}.xlsx"
try:
df = pd.read_excel(dataset_path)
# Load existing keyword dictionary
try:
with open('keywords.json', 'r') as f:
keyword_dict = json.load(f)
st.success("keywords.json loaded successfully.")
except FileNotFoundError:
st.error("keywords.json file not found. Please ensure the file is in the correct directory.")
st.stop()
except json.JSONDecodeError:
st.error("keywords.json file is not a valid JSON. Please check the file format.")
st.stop()
# Select candidate-specific keywords
candidate_key = selected_candidate.replace(' ', '_')
candidate_keywords = keyword_dict.get(candidate_key)
if not candidate_keywords:
st.error(f"No keywords found for the selected candidate '{selected_candidate}'. Please update the 'keywords.json' file with appropriate keywords.")
st.stop()
keywords = [kw for sentiment_keywords in candidate_keywords.values() for kw in sentiment_keywords]
# Data consistency validation
st.subheader("Data Consistency Validation")
missing_values = df.isnull().sum()
st.write("Missing values in each column:")
st.write(missing_values)
# Drop rows with missing values
st.write("Removing rows with missing values...")
df.dropna(inplace=True)
# Sentiment distribution
st.subheader("Sentiment Distribution")
sentiment_counts = df['sentimen'].value_counts()
st.write("Number of comments for each sentiment:")
st.write(sentiment_counts)
# Data quality
st.subheader("Data Quality Check")
invalid_entries = df[df['komentar'].str.len() == 0]
st.write(f"Number of invalid comments (empty): {len(invalid_entries)}")
if len(invalid_entries) > 0:
st.write("Invalid comments (empty):")
st.dataframe(invalid_entries.head(100))  # Preview the first 100 rows
# Detect inconsistent sentiment labels
st.subheader("Inconsistent Sentiment Labels")
duplicate_comments = df[df.duplicated(subset=['komentar'], keep=False)]
inconsistent_labels = duplicate_comments.groupby('komentar')['sentimen'].nunique()
inconsistent_labels = inconsistent_labels[inconsistent_labels > 1]
if not inconsistent_labels.empty:
inconsistent_labels_df = duplicate_comments[duplicate_comments['komentar'].isin(inconsistent_labels.index)]
st.write(f"Number of comments with inconsistent sentiment labels: {len(inconsistent_labels_df)}")
st.dataframe(inconsistent_labels_df.head(100))  # Preview the first 100 rows
else:
st.write("No comments with inconsistent sentiment labels found.")
# Problem handling
st.subheader("Problem Handling")
# Remove empty comments
st.write("Removing invalid (empty) comments...")
df = df[df['komentar'].str.len() > 0]
# Interactive: resolve comments whose sentiment labels are inconsistent
st.write("Resolving inconsistent sentiment labels...")
if not inconsistent_labels.empty:
for index, row in inconsistent_labels_df.iterrows():
st.write(f"Comment: {row['komentar']}")
sentimen_options = df[df['komentar'] == row['komentar']]['sentimen'].unique().tolist()
new_sentimen = st.selectbox("Select correct sentiment", sentimen_options, key=f'sentimen_{index}')
if st.button("Update Sentiment", key=f'update_{index}'):
update_sentiment(index, new_sentimen)
# Clustering using keywords and the sarcasm model
st.write("Clustering comments using keywords and sarcasm model...")
keyword_vectorizer = TfidfVectorizer(vocabulary=keywords)
X_keywords = keyword_vectorizer.fit_transform(df['komentar'])
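# The vocabulary is restricted to the candidate's keyword list, so each feature counts a
# specific keyword. Comments containing none of the keywords become all-zero vectors and
# tend to collapse into the same cluster, which is why a keyword-update step follows.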
kmeans = KMeans(n_clusters=10, random_state=0).fit(X_keywords)
df['cluster'] = kmeans.labels_
# Identify clusters that need review
review_clusters = df[df['cluster'].isin(df['cluster'].value_counts()[df['cluster'].value_counts() > 10].index)]
st.write("Clusters identified for review:")
st.dataframe(review_clusters.head(100))  # Preview the first 100 rows
# Notification to add new keywords
st.warning("Some comments were not captured by the current keywords. Please add new keywords in the 'Update Keywords' section.")
# Cluster visualization
cluster_counts = df['cluster'].value_counts()
st.write("Number of comments in each cluster:")
st.write(cluster_counts)
# Show the clustering results
st.write("Comments clustered by patterns:")
st.dataframe(df.head(100))  # Preview the first 100 rows
# Export the analysis and handling results
st.subheader("Export Final Data")
json_buffer = io.BytesIO()
df.to_json(json_buffer, orient='records', lines=True)
json_buffer.seek(0)
st.download_button(
label=f"Download Final Data for {selected_candidate}",
data=json_buffer,
file_name=f"final_data_{selected_candidate}.json",
mime="application/json"
)
except FileNotFoundError:
st.error(f"No training dataset found for {selected_candidate}. Please add data to create the dataset.")
except Exception as e:
st.error(f"An error occurred: {e}")
pass # Placeholder
if menu == "Retraining Model":
st.title("Retrain Model")
selected_candidate = st.selectbox("Select a candidate to retrain the model:", list(candidate_list))
dataset_path = f"datasetntbnew_{selected_candidate.lower().replace(' ', '_')}.xlsx"
model_path = f"best_rf_model_{selected_candidate.lower().replace(' ', '_')}.joblib"
vectorizer_path = f"tfidf_vectorizer_{selected_candidate.lower().replace(' ', '_')}.joblib"
retrain_history_path = f"retrain_history_{selected_candidate.lower().replace(' ', '_')}.json"
try:
# Load dataset
data = pd.read_excel(dataset_path)
# Ensure 'komentar' is string
if 'komentar' not in data.columns:
st.error("Dataset must include a 'komentar' column.")
st.stop()
else:
# Convert all comments to string
data['komentar'] = data['komentar'].fillna('').astype(str)
# Separate validated and unvalidated data
if 'evaluated_by_cluster' in data.columns:
validated_data = data[data['evaluated_by_cluster'] == True]
unvalidated_data = data[data['evaluated_by_cluster'] == False]
else:
validated_data = pd.DataFrame(columns=data.columns)
unvalidated_data = data
st.write(f"**Validated Data:** {len(validated_data)} rows")
st.write(f"**Unvalidated Data:** {len(unvalidated_data)} rows")
# Check if all data is validated
if len(unvalidated_data) > 0:
st.warning("Model retraining is only allowed if all data has been validated through 'Evaluate Clustering'. Please ensure all data is validated before retraining the model.")
st.stop()
# Combine all data for preprocessing
combined_data = validated_data # Only use validated data
# Preprocessing Function
@st.cache_data(show_spinner=True)
def preprocess_data(data):
from joblib import Parallel, delayed
def preprocess_comment(comment):
comment = translate_emojis(comment)
comment = normalize_unicode(comment)
comment = handle_replies(comment)
comment = clean_text(comment)
comment = translate_text(comment, ntb_dict)
comment = translate_text(comment, slang_dict)
comment = handle_negation(comment)
return comment
data['processed_comments'] = Parallel(n_jobs=-1)(
delayed(preprocess_comment)(c) for c in data['komentar']
)
return data
# Preprocessing
st.write("Starting preprocessing...")
combined_data = preprocess_data(combined_data)
if st.button("Retrain Model"):
# Vectorization
st.write("Vectorizing data...")
vectorizer = TfidfVectorizer(ngram_range=(1, 1), max_features=5000)
X = vectorizer.fit_transform(combined_data['processed_comments'])
y = combined_data['sentimen']
# Split Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Handle Class Imbalance with SMOTE
st.write("Balancing data with SMOTE...")
smote = SMOTE(random_state=42, n_jobs=-1)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
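# SMOTE is applied to the training split only, so the test split keeps its original class
# balance. Note that the training-set metrics further below are computed on the original
# (non-resampled) X_train. Also, the n_jobs argument to SMOTE is accepted only by older
# imbalanced-learn releases and may need to be dropped on newer ones.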
# Train Random Forest Model
st.write("Training Random Forest model...")
rf_model = RandomForestClassifier(n_estimators=200, max_depth=20, random_state=42)
rf_model.fit(X_train_res, y_train_res)
# Evaluate on Training Data
st.write("Evaluating model...")
y_pred_train = rf_model.predict(X_train)
accuracy_train = accuracy_score(y_train, y_pred_train)
report_train = classification_report(y_train, y_pred_train, output_dict=True)
# Evaluate on Test Data
y_pred_test = rf_model.predict(X_test)
accuracy_test = accuracy_score(y_test, y_pred_test)
report_test = classification_report(y_test, y_pred_test, output_dict=True)
# Save Model and Vectorizer
st.write("Saving model and vectorizer...")
joblib.dump(rf_model, model_path)
joblib.dump(vectorizer, vectorizer_path)
# Log Retraining History
st.write("Logging retraining history...")
try:
with open(retrain_history_path, "r") as f:
retrain_history = json.load(f)
except FileNotFoundError:
retrain_history = []
retrain_history.append({
"date_retrained": pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S'),
"accuracy_on_train": accuracy_train,
"accuracy_on_test": accuracy_test,
"f1_score_on_train": report_train['weighted avg']['f1-score'],
"f1_score_on_test": report_test['weighted avg']['f1-score'],
})
with open(retrain_history_path, "w") as f:
json.dump(retrain_history, f, indent=4)
# Display Results
st.success(f"Model retrained successfully! Accuracy on training data: {accuracy_train:.4f}, Accuracy on test data: {accuracy_test:.4f}")
st.subheader("Model Metrics on Training Data")
st.table(pd.DataFrame(report_train).T)
st.subheader("Model Metrics on Test Data")
st.table(pd.DataFrame(report_test).T)
# Show Retrain History
st.subheader("Retrain History")
st.json(retrain_history)
except FileNotFoundError:
st.error(f"No training dataset found for {selected_candidate}. Please add data to create the dataset.")
except Exception as e:
st.error(f"An unexpected error occurred: {e}")
pass # Placeholder
if menu == "Maximize Preprocessing":
st.title("Maximize Preprocessing")
# Load the training dataset
candidate = st.selectbox("Choose a candidate:", list(candidate_list))
dataset_path = f"datasetntbnew_{candidate.lower().replace(' ', '_')}.xlsx"
try:
# Load dataset
data = pd.read_excel(dataset_path)
# Ensure 'komentar' is string
if 'komentar' not in data.columns:
st.error("Dataset must include a 'komentar' column.")
st.stop()
data['komentar'] = data['komentar'].fillna('').astype(str)
# Preprocessing Steps
@st.cache_data(show_spinner=True)
def preprocess_data(data):
st.write("Starting preprocessing...")
data['translated_emojis'] = data['komentar'].apply(translate_emojis)
data['normalized_unicode'] = data['translated_emojis'].apply(normalize_unicode)
data['reply_handled'] = data['normalized_unicode'].apply(handle_replies)
data['clean_text'] = data['reply_handled'].apply(clean_text)
data['translated_ntb'] = data['clean_text'].apply(lambda x: translate_text(x, ntb_dict))
data['translated_slang'] = data['translated_ntb'].apply(lambda x: translate_text(x, slang_dict))
data['negation_handled'] = data['translated_slang'].apply(handle_negation)
return data
data = preprocess_data(data)
# Check Unmapped Words
st.subheader("Check for Unmapped Words")
all_words = (word.lower() for comment in data['negation_handled'] for word in comment.split())
unique_words = set(all_words)
ntb_dict_keys = set(ntb_dict.keys())
slang_dict_keys = set(slang_dict.keys())
mapped_words = ntb_dict_keys.union(slang_dict_keys)
unmapped_words = sorted(unique_words - mapped_words)
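# "Unmapped" words are tokens that survive the preprocessing chain yet appear as a key in
# neither ntb_dict nor slang_dict. The form below lets the user add them to one of the
# dictionaries; the additions live only in memory until the corresponding save button is
# pressed.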
if unmapped_words:
st.write(f"Found **{len(unmapped_words)} unmapped words.**")
# Choose how many words to display
max_words = st.slider(
"Select number of words to display:",
min_value=10,
max_value=len(unmapped_words),
value=min(50, len(unmapped_words)),
step=10,
)
# Build a DataFrame for the table view
unmapped_df = pd.DataFrame(unmapped_words, columns=["Unmapped Words"])
st.dataframe(unmapped_df.head(max_words))
# Show how many words are displayed out of the total
st.caption(f"Showing {min(max_words, len(unmapped_words))} out of {len(unmapped_words)} unmapped words.")
else:
st.success("No unmapped words found!")
# Add Words to Dictionary
st.subheader("Add New Words to Dictionary")
new_word = st.text_input("Enter new word:")
normalized_word = st.text_input("Enter normalized form:")
dictionary_choice = st.radio("Select dictionary to update:", ["Kamus Alay", "Kamus ntb"])
if st.button("Add to Dictionary"):
if new_word and normalized_word:
if dictionary_choice == "Kamus Alay":
slang_dict[new_word.lower()] = normalized_word
st.success(f"Added '{new_word}' -> '{normalized_word}' to Kamus Alay.")
elif dictionary_choice == "Kamus ntb":
ntb_dict[new_word.lower()] = normalized_word
st.success(f"Added '{new_word}' -> '{normalized_word}' to Kamus ntb.")
else:
st.warning("Please enter both the new word and its normalized form.")
# Save Updates to File
st.subheader("Save Updated Dictionaries")
if st.button("Save Kamus Alay"):
kamus_alay_path = 'kamusalay.csv' # Adjust the path as needed
pd.DataFrame(list(slang_dict.items()), columns=["slang", "formal"]).to_csv(kamus_alay_path, index=False)
st.success(f"Kamus Alay saved successfully to {kamus_alay_path}.")
if st.button("Save Kamus ntb"):
kamus_ntb_path = 'ntb_dict.json' # Adjust the path as needed
with open(kamus_ntb_path, 'w', encoding='utf-8') as f:
json.dump(ntb_dict, f, indent=4)
st.success(f"Kamus ntb saved successfully to {kamus_ntb_path}.")
except FileNotFoundError:
st.error(f"No training dataset found for {candidate}. Please ensure the dataset is available.")
except Exception as e:
st.error(f"An unexpected error occurred: {e}")
pass # Placeholder
if menu == "Update Keywords":
st.title("Update Keywords")
# Load existing keyword dictionary
with open('keywords.json', 'r') as f:
keyword_dict = json.load(f)
# Show current keywords
st.subheader("Current Keywords")
candidate = st.selectbox("Select candidate", list(keyword_dict.keys()))
for sentiment, keywords in keyword_dict[candidate].items():
st.write(f"{sentiment}: {', '.join(keywords)}")
# Add new keyword
st.subheader("Add New Keyword")
new_keyword = st.text_input("Enter new keyword")
selected_sentiment = st.selectbox("Select sentiment for new keyword", list(keyword_dict[candidate].keys()))
if st.button("Add Keyword"):
if new_keyword and selected_sentiment:
keyword_dict[candidate][selected_sentiment].append(new_keyword)
with open('keywords.json', 'w') as f:
json.dump(keyword_dict, f, indent=4)
st.success(f"Keyword '{new_keyword}' added to {selected_sentiment} for {candidate}")
else:
st.error("Please enter a keyword and select a sentiment")
# Analyze Special Cluster
st.subheader("Analyze Special Cluster")
if 'ba_lainnya_data' in st.session_state:
try:
# Load the `Special Cluster` data directly
special_cluster_data = st.session_state['ba_lainnya_data'][st.session_state['ba_lainnya_data']['Cluster_Name'] == 'Special Cluster']
if special_cluster_data.empty:
st.warning("No data found in Special Cluster.")
else:
st.write(f"Total comments in Special Cluster: {len(special_cluster_data)}")
all_words_special = []
for comment in special_cluster_data['negation_handled']:
comment = translate_emojis(comment)
comment = normalize_unicode(comment)
comment = handle_replies(comment)
comment = clean_text(comment)
comment = translate_text(translate_text(comment, ntb_dict), slang_dict)  # Apply both normalization dictionaries, mirroring the main preprocessing chain
comment = handle_negation(comment)
words = preprocess_text(comment)
all_words_special.extend(words)
# Calculate word frequencies
word_freq_special = Counter(all_words_special)
# Add slider to select number of words to display
num_words_special = st.slider("Number of words to display (Special Cluster)", min_value=5, max_value=50, value=20)
most_common_words_special = word_freq_special.most_common(num_words_special)
# Display word frequencies as a table
st.subheader(f"Top {num_words_special} Word Frequencies in Special Cluster")
word_freq_df_special = pd.DataFrame(most_common_words_special, columns=['Word', 'Frequency'])
st.dataframe(word_freq_df_special)
except Exception as e:
st.error(f"An error occurred: {e}")
else:
st.warning("No 'BA Lainnya' data found. Please classify comments first.")
# Analyze Training Data
st.subheader("Analyze Training Data")
dataset_path = f"datasetntbnew_{candidate.lower().replace(' ', '_')}.xlsx"
try:
train_data = pd.read_excel(dataset_path)
if train_data.empty:
st.warning("Training dataset is empty.")
else:
all_words_train = []
for comment in train_data['komentar'].astype(str):
comment = translate_emojis(comment)
comment = normalize_unicode(comment)
comment = handle_replies(comment)
comment = clean_text(comment)
comment = translate_text(translate_text(comment, ntb_dict), slang_dict)  # Apply both normalization dictionaries, mirroring the main preprocessing chain
comment = handle_negation(comment)
words = preprocess_text(comment)
all_words_train.extend(words)
# Calculate word frequencies
word_freq_train = Counter(all_words_train)
# Add slider to select number of words to display
num_words_train = st.slider("Number of words to display (Training Data)", min_value=5, max_value=50, value=20)
most_common_words_train = word_freq_train.most_common(num_words_train)
# Display word frequencies as a table
st.subheader(f"Top {num_words_train} Word Frequencies in Training Data")
word_freq_df_train = pd.DataFrame(most_common_words_train, columns=['Word', 'Frequency'])
st.dataframe(word_freq_df_train)
except FileNotFoundError:
st.error(f"Training dataset for {candidate} not found.")
except Exception as e:
st.error(f"An error occurred: {e}")
# Option to export keywords
st.subheader("Export Keywords")
json_buffer = io.BytesIO()
json_buffer.write(json.dumps(keyword_dict).encode('utf-8'))
json_buffer.seek(0)
st.download_button(
label="Export Keywords",
data=json_buffer,
file_name="keywords.json",
mime="application/json"
)
# Option to import keywords
st.subheader("Import Keywords")
uploaded_file = st.file_uploader("Choose a JSON file", type="json")
if uploaded_file is not None:
imported_keywords = json.load(uploaded_file)
keyword_dict.update(imported_keywords)
with open('keywords.json', 'w') as f:
json.dump(keyword_dict, f, indent=4)
st.success("Keywords imported successfully")
pass