import streamlit as st
import joblib
import pandas as pd
import re
import emoji
import json
import io
import unicodedata
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import os
from wordcloud import WordCloud
from sklearn.metrics import classification_report
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from transformers import pipeline
from collections import Counter
import nltk
from nltk.corpus import stopwords
from datetime import datetime
# === Preprocessing Functions === #
candidate_list = ["Lalu Muhamad Iqbal", "Indah Dhamayanti Putri", "Zulkieflimansyah", "M Suhaili", "Sitti Rohmi Djalilah", "Musyafirin"]
# Download the stopword list if it has not been downloaded yet
nltk.download('stopwords')
stop_words = set(stopwords.words('indonesian'))
# Make sure the "BA Lainnya" data is available
if 'data_with_ba' in st.session_state:
ba_lainnya_data = st.session_state['data_with_ba']
else:
ba_lainnya_data = None
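# NOTE: despite its name, translate_emojis below does not convert emojis into text;
# it simply strips every emoji character from the input.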
def translate_emojis(text):
if not isinstance(text, str):
return ''  # Return an empty string if the input is not a string
return ''.join(c for c in text if not emoji.is_emoji(c))  # Strip all emojis
def normalize_unicode(text):
return unicodedata.normalize('NFKD', text)
def clean_text(text):
text = str(text).casefold() # Convert to lowercase
text = re.sub(r'http\S+|www\S+', '', text) # Remove URLs
text = re.sub(r'[^a-z\s]', '', text) # Remove non-alphabetic characters
text = re.sub(r'\s+', ' ', text).strip() # Normalize spaces
return text
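# handle_negation merges a negation word with the word that follows it into one token,
# e.g. "tidak bagus" becomes "tidak_bagus", so the vectorizer can treat the negated
# phrase as a feature distinct from the bare word.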
def handle_negation(text):
negation_words = {"tidak", "bukan", "jangan", "belum", "kurang", "gagal", "sulit"}
words = text.split()
result = []
skip_next = False
for i, word in enumerate(words):
if word in negation_words and i + 1 < len(words):
result.append(f"{word}_{words[i + 1]}") # Combine negation with next word
skip_next = True
elif skip_next:
skip_next = False
else:
result.append(word)
return ' '.join(result)
def handle_replies(text):
text = re.sub(r'=--*@\w+', '', text) # Remove multi-level reply patterns
text = re.sub(r'=-*@\w+', '', text) # Remove single-level reply patterns
text = re.sub(r'@\w+', '', text) # Remove standalone @username mentions
return text
def translate_text(text, dictionary):
words = text.split()
return ' '.join([dictionary.get(word.lower(), word) for word in words]) # Translate words using dictionary
# Assign a sentiment label based on keyword matches
def assign_sentiment_based_on_keywords(comment, keyword_dict):
for sentiment, keywords in keyword_dict.items():
if any(keyword in comment for keyword in keywords):
return sentiment
return 'unknown'
# === Load Dictionaries === #
def load_dictionary(file_path, file_type='json'):
if file_type == 'json':
with open(file_path, 'r', encoding='utf-8') as file:
return json.load(file)
elif file_type == 'csv':
df = pd.read_csv(file_path, names=['slang', 'formal'])
return pd.Series(df['formal'].values, index=df['slang']).to_dict()
ntb_dict = load_dictionary('ntb_dict.json', 'json')
slang_dict = load_dictionary('kamusalay.csv', 'csv')
# === Utility Functions === #
# Update the JSON file that keeps a history of updates
def update_history_json(history_path, komentar, link, model_data, field, date):
# Convert the Timestamp to a string
if isinstance(date, pd.Timestamp):
date = date.strftime('%Y-%m-%d')
# Read the existing history from the JSON file
try:
with open(history_path, 'r') as file:
history_data = json.load(file)
except (FileNotFoundError, json.JSONDecodeError):
history_data = {}
# History key for this particular comment, link, and model_data
key = f"{komentar}_{link}_{model_data}"
# Append the new entry under the matching key
if key in history_data:
history_data[key].append({field: date})
else:
history_data[key] = [{field: date}]
# Write the history back to the JSON file
with open(history_path, 'w') as file:
json.dump(history_data, file, indent=4)
# Update the training dataset with newly classified data
def update_training_dataset(output, candidate):
dataset_path = f"datasetntbnew_{candidate.lower().replace(' ', '_')}.xlsx"
history_path = f"history_{candidate.lower().replace(' ', '_')}.json"
try:
required_columns = ['model_data', 'Platform', 'komentar', 'link', 'kandidat', 'sentimen', 'tanggal', 'tanggal_masuk']
# If the caller passed model predictions, use them as the sentiment label
# (checked before the column subset so 'predicted_category' is still present)
if 'predicted_category' in output.columns:
output = output.copy()
output['sentimen'] = output['predicted_category']
output = output.drop(columns=['predicted_category'])
output = output[required_columns].copy()
output['tanggal_masuk'] = pd.Timestamp.now()
if os.path.exists(dataset_path):
existing_data = pd.read_excel(dataset_path)
else:
existing_data = pd.DataFrame(columns=required_columns)
# Add the 'update_ba' column if it does not exist yet
if 'update_ba' not in existing_data.columns:
existing_data['update_ba'] = None
# Add the 'missing_comment' column if it does not exist yet
if 'missing_comment' not in existing_data.columns:
existing_data['missing_comment'] = False
# Step 1: check for comments that have gone missing from the new data
train_comments = existing_data.groupby('link')['komentar'].apply(list).to_dict()
new_comments = output.groupby('link')['komentar'].apply(list).to_dict()
for link, comments in train_comments.items():
if link in new_comments:
new_comment_set = set(new_comments[link])
for comment in comments:
if comment not in new_comment_set:
existing_data.loc[(existing_data['link'] == link) & (existing_data['komentar'] == comment), 'missing_comment'] = True
else:
existing_data.loc[(existing_data['link'] == link) & (existing_data['komentar'] == comment), 'missing_comment'] = False
# Helper that merges the new data into the existing dataset
def update_data(existing_data, new_data, history_path):
for index, row in new_data.iterrows():
komentar = row['komentar']
link = row['link']
model_data = row['model_data']
tanggal_klasifikasi = pd.Timestamp.now()
# The same comment already exists for this link
existing_entry = existing_data[(existing_data['link'] == link) & (existing_data['komentar'] == komentar)]
if not existing_entry.empty:
existing_data.loc[existing_entry.index, 'update_ba'] = tanggal_klasifikasi
update_history_json(history_path, komentar, link, model_data, 'update_ba', tanggal_klasifikasi)
else:
# Same link, different comment
existing_link_entry = existing_data[(existing_data['link'] == link)]
if not existing_link_entry.empty:
new_row = row.copy()
new_row['tanggal_masuk'] = tanggal_klasifikasi
new_row['update_ba'] = tanggal_klasifikasi
existing_data = pd.concat([existing_data, new_row.to_frame().T], ignore_index=True)
update_history_json(history_path, komentar, link, model_data, 'tanggal_masuk', tanggal_klasifikasi)
update_history_json(history_path, komentar, link, model_data, 'update_ba', tanggal_klasifikasi)
else:
# New link
new_row = row.copy()
new_row['tanggal_masuk'] = tanggal_klasifikasi
new_row['update_ba'] = tanggal_klasifikasi
existing_data = pd.concat([existing_data, new_row.to_frame().T], ignore_index=True)
update_history_json(history_path, komentar, link, model_data, 'tanggal_masuk', tanggal_klasifikasi)
update_history_json(history_path, komentar, link, model_data, 'update_ba', tanggal_klasifikasi)
# Replace missing update_ba values with tanggal_masuk
existing_data['update_ba'] = pd.to_datetime(existing_data['update_ba'], errors='coerce')
existing_data['update_ba'] = existing_data['update_ba'].fillna(existing_data['tanggal_masuk'])
return existing_data
updated_data = update_data(existing_data, output, history_path)
updated_data.to_excel(dataset_path, index=False)
st.success(f"Data successfully updated in {candidate}'s training dataset.")
if 'missing_comment' in existing_data.columns and existing_data['missing_comment'].any():
st.subheader("Missing Comments")
st.write("Comments that were found to be missing:")
st.dataframe(existing_data[existing_data['missing_comment']])
except KeyError as e:
st.error(f"Missing column in the dataset: {e}")
except Exception as e:
st.error(f"An error occurred: {e}")
def clustering_based_evaluation(df, n_clusters=10):
st.write("Starting preprocessing...")
df['translated_emojis'] = df['komentar'].fillna('').astype(str).apply(translate_emojis)
df['normalized_unicode'] = df['translated_emojis'].apply(normalize_unicode)
df['reply_handled'] = df['normalized_unicode'].apply(handle_replies)
df['clean_text'] = df['reply_handled'].apply(clean_text)
df['translated_ntb'] = df['clean_text'].apply(lambda x: translate_text(x, ntb_dict))
df['translated_slang'] = df['translated_ntb'].apply(lambda x: translate_text(x, slang_dict))
df['negation_handled'] = df['translated_slang'].apply(handle_negation)
st.write("Generating TF-IDF vectors...")
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words=list(stop_words))  # use the Indonesian stopword list loaded above
tfidf_matrix = tfidf_vectorizer.fit_transform(df['negation_handled'])
st.write(f"Clustering into {n_clusters} clusters...")
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
df['Cluster'] = kmeans.fit_predict(tfidf_matrix)
st.write("Performing PCA for visualization...")
pca = PCA(n_components=2)
reduced_data = pca.fit_transform(tfidf_matrix.toarray())
df['PCA1'] = reduced_data[:, 0]
df['PCA2'] = reduced_data[:, 1]
st.write("Clustering completed successfully!")
return df
def load_and_process_data(dataset_path, history_path):
df = pd.read_excel(dataset_path)
df['tanggal_masuk'] = pd.to_datetime(df['tanggal_masuk'], errors='coerce')
if df['tanggal_masuk'].isnull().any():
st.warning("Some dates could not be parsed correctly. Please check the date format in the dataset.")
df['tanggal_masuk'].fillna(pd.Timestamp.now().strftime('%Y-%m-%d'), inplace=True)
required_columns = ['model_data', 'Platform', 'komentar', 'sentimen', 'tanggal', 'tanggal_masuk', 'evaluated_by_cluster']
for col in required_columns:
if col not in df.columns:
if col == 'tanggal_masuk':
df[col] = pd.Timestamp.now().strftime('%Y-%m-%d')
elif col == 'evaluated_by_cluster':
df[col] = False
else:
df[col] = None
df = df[required_columns]
try:
with open(history_path, "r") as f:
history = json.load(f)
except FileNotFoundError:
history = []
return df, history
# Define the function to handle Special Cluster
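# For each comment the keyword dictionary is checked first; if any keyword matches, that
# sentiment is assigned. Otherwise the comment is passed to the sarcasm detector, and if
# that also fails the comment is marked 'Unknown'. Caveat: the detector loaded further
# below is 'unitary/toxic-bert', which emits toxicity labels rather than a 'SARCASM'
# label, so the sarcasm branch is unlikely to ever match as configured.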
def handle_special_cluster(data, keywords, detector):
for index, row in data.iterrows():
text = row['negation_handled']
# Check whether the text contains any of the keywords
detected = False
for sentiment, words in keywords.items():
if any(word in text for word in words):
data.loc[index, 'predicted_category'] = sentiment.replace('_', ' ')
data.loc[index, 'detected_by'] = 'keyword'
detected = True
break
if not detected:
# If no keyword matches, fall back to sarcasm detection
result = detector(text)
if result[0]['label'] == 'SARCASM':
data.loc[index, 'predicted_category'] = 'co sarkastic'
data.loc[index, 'detected_by'] = 'sarcasm'
else:
# No matching sentiment at all
data.loc[index, 'predicted_category'] = 'Unknown'
data.loc[index, 'detected_by'] = 'unknown'
return data
def preprocess_text(text):
text = text.lower()
text = re.sub(r'\W+', ' ', text) # Remove non-alphanumeric characters
words = text.split()
words = [word for word in words if word not in stop_words]
return words
def display_word_frequencies(words, num_words):
st.subheader(f"Top {num_words} Words")
for word, freq in words:
st.write(f"{word}: {freq}")
def update_sentiment(index, new_sentimen):
df.loc[index, 'sentimen'] = new_sentimen
st.write(f"Updated sentiment for comment at index {index} to {new_sentimen}")
# Load the `Corrected Comments` sheet from an Excel file
def load_corrected_comments_from_excel(file_path):
try:
return pd.read_excel(file_path, sheet_name='Corrected Comments')
except FileNotFoundError:
return pd.DataFrame(columns=['model_data', 'Platform', 'komentar', 'tanggal', 'link', 'sentimen', 'Cluster_Name', 'corrected_by', 'tanggal_masuk'])
except Exception as e:
print(f"Error loading corrected comments: {e}")
return pd.DataFrame(columns=['model_data', 'Platform', 'komentar', 'tanggal', 'link', 'sentimen', 'Cluster_Name', 'corrected_by', 'tanggal_masuk'])
# Save the `Corrected Comments` data to an Excel file
def save_corrected_comments_to_excel(data, file_path):
with pd.ExcelWriter(file_path, engine='xlsxwriter') as writer:
data.to_excel(writer, sheet_name='Corrected Comments', index=False)
# Excel file used to store the `Corrected Comments` data
corrected_comments_file = 'corrected_comments.xlsx'
# Show how comments are distributed across the clusters
def display_cluster_visualization(ba_lainnya_data):
st.subheader("Cluster Visualization")
# Count comments in the `Similar Sentiment` and `Special Cluster` categories
cluster_counts = ba_lainnya_data[ba_lainnya_data['Cluster_Name'].str.contains('Similar|Special Cluster')]['Cluster_Name'].value_counts()
# Draw a bar chart
plt.figure(figsize=(10, 6))
plt.bar(cluster_counts.index, cluster_counts.values, color=['blue', 'green', 'orange', 'red', 'purple'])
plt.xlabel('Cluster Name')
plt.ylabel('Number of Comments')
plt.title('Distribution of Comments in Similar Sentiment and Special Cluster')
plt.xticks(rotation=45)
plt.show()
st.pyplot(plt)
def run_clustering_for_ba_lainnya():
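# Workflow: take the low-confidence ('BA Lainnya') comments stored in session state,
# preprocess them, and compare them against each labeled sentiment from the training set
# via pairwise KMeans clustering. Comments that stay dissimilar to every sentiment end up
# in a 'Special Cluster' handled by keywords and the sarcasm detector. The page then
# offers manual, per-cluster, and keyword-rule corrections, and finally lets the user
# push the corrected comments back into the training dataset.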
st.title("Clustering for 'BA Lainnya'")
if 'data_with_ba' not in st.session_state:
st.error("No 'BA Lainnya' data found from the classification model. Please classify comments first.")
st.stop()
ba_lainnya_data = st.session_state['data_with_ba']
st.write(f"**'BA Lainnya' Data:** {len(ba_lainnya_data)} rows")
with open('keywords.json', 'r') as f:
keyword_dict = json.load(f)
selected_candidate = st.session_state['candidate']
candidate_keywords = keyword_dict.get(selected_candidate.replace(' ', '_'))
if candidate_keywords is None:
st.error("Keywords for the selected candidate not found.")
st.stop()
sarcasm_detector = pipeline('sentiment-analysis', model='unitary/toxic-bert')
dataset_path = f"datasetntbnew_{selected_candidate.lower().replace(' ', '_')}.xlsx"
corrected_comments_file = f"corrected_comments_{selected_candidate.lower().replace(' ', '_')}.xlsx"
try:
train_data = pd.read_excel(dataset_path)
labeled_data = train_data[train_data['sentimen'].isin(['Co Likes', 'Co Support', 'Co Optimism', 'Co Negative', 'Co Sarkastic'])]
st.write(f"**Labeled Data from Training Dataset:** {len(labeled_data)} rows")
ba_lainnya_data['Cluster'] = None
ba_lainnya_data['detected_by'] = None
ba_lainnya_data['Cluster_Name'] = None
ba_lainnya_data['corrected_by'] = "Not Corrected"
ba_lainnya_data['Sentiment'] = None  # Initialize to None so no incorrect default value is carried over
ba_lainnya_data['Cluster_Name'] = ba_lainnya_data['Cluster_Name'].astype(str)
ba_lainnya_data['corrected_by'] = ba_lainnya_data['corrected_by'].astype(str)
ba_lainnya_data['Sentiment'] = ba_lainnya_data['Sentiment'].astype(str)
for data in [ba_lainnya_data, labeled_data]:
data['translated_emojis'] = data['komentar'].fillna('').astype(str).apply(translate_emojis)
data['normalized_unicode'] = data['translated_emojis'].apply(normalize_unicode)
data['reply_handled'] = data['normalized_unicode'].apply(handle_replies)
data['clean_text'] = data['reply_handled'].apply(clean_text)
data['translated_ntb'] = data['clean_text'].apply(lambda x: translate_text(x, ntb_dict))
data['translated_slang'] = data['translated_ntb'].apply(lambda x: translate_text(x, slang_dict))
data['negation_handled'] = data['translated_slang'].apply(handle_negation)
data['negation_handled'] = data['negation_handled'].fillna('')
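# Pairwise clustering: for every labeled sentiment, the 'BA Lainnya' comments are pooled
# with that sentiment's labeled comments and split into two KMeans clusters. Cluster 0 is
# then labelled "<sentiment> Similar" and cluster 1 "<sentiment> Dissimilar". Keep in mind
# that KMeans cluster IDs are arbitrary, so treating cluster 0 as the "similar" group is a
# heuristic rather than a guarantee.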
combined_data = ba_lainnya_data.copy()
combined_data['Label'] = 'BA Lainnya'
for sentimen in ['Co Likes', 'Co Support', 'Co Optimism', 'Co Negative', 'Co Sarkastic']:
sentimen_data = labeled_data[labeled_data['sentimen'] == sentimen].copy()
sentimen_data['Label'] = sentimen
combined = pd.concat([combined_data, sentimen_data], ignore_index=True)
if len(combined) < 2:
st.warning(f"Not enough samples to cluster for {sentimen}.")
continue
vectorizer = TfidfVectorizer(ngram_range=(1, 1), max_features=5000)
tfidf_matrix = vectorizer.fit_transform(combined['negation_handled'])
st.write(f"Clustering 'BA Lainnya' comments similar to {sentimen}...")
kmeans = KMeans(n_clusters=2, random_state=42)
combined['Cluster'] = kmeans.fit_predict(tfidf_matrix)
valid_indices = combined.index[:len(ba_lainnya_data)]
valid_indices = valid_indices.intersection(ba_lainnya_data.index)
ba_lainnya_data.loc[valid_indices, 'Cluster'] = combined.loc[valid_indices, 'Cluster']
ba_lainnya_data.loc[ba_lainnya_data['Cluster'] == 0, 'Cluster_Name'] = f"{sentimen} Similar"
ba_lainnya_data.loc[ba_lainnya_data['Cluster'] == 1, 'Cluster_Name'] = f"{sentimen} Dissimilar"
ba_lainnya_data.loc[valid_indices, 'Sentiment'] = sentimen
for index, row in ba_lainnya_data.iterrows():
if row['Cluster_Name'].endswith('Dissimilar') or row['Cluster_Name'] == 'None':
dissimilar_comment = ba_lainnya_data.loc[[index]].copy()
for sentimen in ['Co Likes', 'Co Support', 'Co Optimism', 'Co Negative', 'Co Sarkastic']:
sentimen_data = labeled_data[labeled_data['sentimen'] == sentimen].copy()
combined = pd.concat([dissimilar_comment, sentimen_data], ignore_index=True)
if len(combined) < 2:
continue
tfidf_matrix = vectorizer.fit_transform(combined['negation_handled'])
if tfidf_matrix.shape[0] == 0:
continue
kmeans = KMeans(n_clusters=2, random_state=42)
combined['Cluster'] = kmeans.fit_predict(tfidf_matrix)
if len(combined) > 0 and combined.loc[0, 'Cluster'] == 0:
ba_lainnya_data.loc[index, 'Cluster_Name'] = f"{sentimen} Similar"
ba_lainnya_data.loc[index, 'Sentiment'] = sentimen
break
else:
ba_lainnya_data.loc[index, 'Cluster_Name'] = 'Special Cluster'
ba_lainnya_data.loc[index, 'corrected_by'] = 'Special Cluster'
ba_lainnya_data.loc[index, 'Sentiment'] = 'Special Sentiment'
ba_lainnya_data['Cluster_Name'] = ba_lainnya_data['Cluster_Name'].apply(lambda x: 'Special Cluster' if x == 'nan' else x)
special_cluster_data = ba_lainnya_data[ba_lainnya_data['Cluster_Name'] == 'Special Cluster']
if not special_cluster_data.empty:
special_cluster_data = handle_special_cluster(special_cluster_data, candidate_keywords, sarcasm_detector)
ba_lainnya_data.update(special_cluster_data)
ba_lainnya_data.loc[special_cluster_data.index, 'corrected_by'] = 'Special Cluster'
ba_lainnya_data.loc[special_cluster_data.index, 'Sentiment'] = 'Special Sentiment'
st.warning("Some comments were not captured by the current keywords. Please add new keywords in the 'Update Keywords' section.")
st.subheader("Detection Distribution in Special Cluster")
detection_counts = special_cluster_data['detected_by'].value_counts()
plt.figure(figsize=(10, 6))
plt.bar(detection_counts.index, detection_counts.values, color=['blue', 'orange', 'red'])
plt.xlabel('Detection Method')
plt.ylabel('Number of Comments')
plt.title('Detection Distribution in Special Cluster')
plt.show()
st.pyplot(plt)
st.write("Top Keywords in Special Cluster")
for sentiment, keywords in candidate_keywords.items():
st.write(f"{sentiment}: {', '.join(keywords)}")
st.subheader("Special Cluster Details")
st.dataframe(special_cluster_data[['komentar', 'Cluster_Name', 'detected_by']])
corrected_comments = load_corrected_comments_from_excel(corrected_comments_file)
display_cluster_visualization(ba_lainnya_data)
st.subheader("Edit Sentiment in Table")
# Flag whether the comment is already present in corrected_comments
ba_lainnya_data['Already Corrected'] = ba_lainnya_data['komentar'].isin(corrected_comments['komentar'])
ba_lainnya_data['Corrected Sentiment'] = ba_lainnya_data.apply(
lambda row: corrected_comments[corrected_comments['komentar'] == row['komentar']]['sentimen'].values[0]
if row['Already Corrected'] else "", axis=1
)
# Add a checkbox column for selecting the comments to edit
ba_lainnya_data['Select'] = False
# Dropdown to pick a cluster (bulk selection)
selected_cluster = st.selectbox("Select a cluster to edit:", sorted(ba_lainnya_data['Cluster_Name'].unique()))
# If a cluster is selected, auto-select every comment in that cluster
if selected_cluster:
ba_lainnya_data.loc[ba_lainnya_data['Cluster_Name'] == selected_cluster, 'Select'] = True
# Show the table with checkboxes and sentiment information
edited_data = st.data_editor(
ba_lainnya_data[['Select', 'komentar', 'Cluster_Name', 'Sentiment', 'Already Corrected', 'Corrected Sentiment']],
column_config={
"Select": st.column_config.CheckboxColumn("✅ Select"),
"komentar": st.column_config.TextColumn("Komentar"),
"Cluster_Name": st.column_config.TextColumn("Cluster"),
"Sentiment": st.column_config.TextColumn("Sentimen"),
"Already Corrected": st.column_config.CheckboxColumn("✅ Sudah di Corrected?"),
"Corrected Sentiment": st.column_config.TextColumn("Sentimen Corrected"),
},
disabled=["komentar", "Cluster_Name", "Already Corrected", "Corrected Sentiment"], # Kolom yang gak bisa diedit
use_container_width=True,
)
# Dropdown to pick the new sentiment to apply to the checked comments
new_sentimen = st.selectbox("Select new sentiment for selected comments:",
['Co Likes', 'Co Support', 'Co Optimism', 'Co Negative', 'Co Sarkastic'])
# When the button is pressed, update the sentiment of the checked comments
if st.button("Update Sentiments"):
selected_rows = edited_data[edited_data['Select']]  # Take only the checked rows
if selected_rows.empty:
st.warning("No comments selected for update!")
else:
for index, row in selected_rows.iterrows():
ba_lainnya_data.at[index, 'Sentiment'] = new_sentimen
ba_lainnya_data.at[index, 'corrected_by'] = 'Manual Edit'
if ba_lainnya_data.at[index, 'komentar'] in corrected_comments['komentar'].values:
corrected_comments.loc[corrected_comments['komentar'] == ba_lainnya_data.at[index, 'komentar'], 'sentimen'] = new_sentimen
else:
new_entry = ba_lainnya_data.loc[[index]].copy()
new_entry['sentimen'] = new_sentimen
corrected_comments = pd.concat([corrected_comments, new_entry])
corrected_comments.drop_duplicates(subset=['komentar'], keep='last', inplace=True)
save_corrected_comments_to_excel(corrected_comments, corrected_comments_file)
st.success("Sentiments updated successfully!")
st.rerun()
st.subheader("Clustered Data")
selected_cluster = st.selectbox("Select a cluster to view comments:", sorted(ba_lainnya_data['Cluster_Name'].unique()))
cluster_comments = ba_lainnya_data[ba_lainnya_data['Cluster_Name'] == selected_cluster]
st.dataframe(cluster_comments[['komentar', 'Cluster_Name']].head(100))
new_sentimen = st.selectbox("Select new sentiment for this cluster:", ['Co Likes', 'Co Support', 'Co Optimism', 'Co Negative', 'Co Sarkastic'])
if st.button("Update Sentiment for this cluster"):
ba_lainnya_data.loc[ba_lainnya_data['Cluster_Name'] == selected_cluster, 'corrected_by'] = 'Batch Cluster'
ba_lainnya_data.loc[ba_lainnya_data['Cluster_Name'] == selected_cluster, 'sentimen'] = new_sentimen
st.success(f"Sentiment for cluster {selected_cluster} updated to {new_sentimen}")
# Save and refresh Corrected Comments table and Cluster Visualization
corrected_comments = pd.concat([corrected_comments, ba_lainnya_data[ba_lainnya_data['corrected_by'] != "Not Corrected"]])
corrected_comments.drop_duplicates(subset=['komentar'], keep='last', inplace=True)
save_corrected_comments_to_excel(corrected_comments, corrected_comments_file)
st.subheader("Corrected Comments")
st.dataframe(corrected_comments[['komentar', 'Cluster_Name', 'corrected_by', 'sentimen']].head(100))
display_cluster_visualization(ba_lainnya_data)
st.subheader("Special Rules Based on Keywords")
keyword = st.text_input("Enter a keyword to set a rule:")
specific_cluster = st.selectbox("Select a cluster for this keyword:", sorted(ba_lainnya_data['Cluster_Name'].unique()))
if keyword:
new_cluster = st.selectbox("Select sentiment for this keyword:", ['Co Likes', 'Co Support', 'Co Optimism', 'Co Negative', 'Co Sarkastic'])
if st.button("Apply Rule"):
ba_lainnya_data.loc[ba_lainnya_data['komentar'].str.contains(keyword, case=False, na=False), 'Cluster_Name'] = new_cluster
ba_lainnya_data.loc[ba_lainnya_data['komentar'].str.contains(keyword, case=False, na=False), 'detected_by'] = specific_cluster
ba_lainnya_data.loc[ba_lainnya_data['komentar'].str.contains(keyword, case=False, na=False), 'corrected_by'] = 'Keyword Rule'
ba_lainnya_data.loc[ba_lainnya_data['komentar'].str.contains(keyword, case=False, na=False), 'sentimen'] = new_cluster
st.success(f"All comments containing '{keyword}' have been updated to '{new_cluster}' sentiment.")
# Update keywords.json file to avoid duplicates
if selected_candidate.replace(' ', '_') in keyword_dict:
if new_cluster in keyword_dict[selected_candidate.replace(' ', '_')]:
if keyword not in keyword_dict[selected_candidate.replace(' ', '_')][new_cluster]:
keyword_dict[selected_candidate.replace(' ', '_')][new_cluster].append(keyword)
else:
keyword_dict[selected_candidate.replace(' ', '_')][new_cluster] = [keyword]
else:
keyword_dict[selected_candidate.replace(' ', '_')] = {new_cluster: [keyword]}
with open('keywords.json', 'w') as f:
json.dump(keyword_dict, f)
st.success(f"Keyword '{keyword}' has been added to the keyword list.")
# Save and refresh Corrected Comments table and Cluster Visualization
corrected_comments = pd.concat([corrected_comments, ba_lainnya_data[ba_lainnya_data['corrected_by'] != "Not Corrected"]])
corrected_comments.drop_duplicates(subset=['komentar'], keep='last', inplace=True)
save_corrected_comments_to_excel(corrected_comments, corrected_comments_file)
st.subheader("Corrected Comments")
st.dataframe(corrected_comments[['komentar', 'Cluster_Name', 'corrected_by', 'sentimen']].head(100))
display_cluster_visualization(ba_lainnya_data)
st.subheader("Corrected Comments")
corrected_comments = load_corrected_comments_from_excel(corrected_comments_file)
st.dataframe(corrected_comments[['komentar', 'Cluster_Name', 'corrected_by', 'sentimen']].head(100))
st.subheader("Visual Representation of Corrected Comments")
sentiment_counts = corrected_comments['sentimen'].value_counts()
plt.figure(figsize=(10, 6))
plt.bar(sentiment_counts.index, sentiment_counts.values, color=['blue', 'green', 'orange', 'red', 'purple'])
plt.xlabel('Sentimen')
plt.ylabel('Number of Corrected Comments')
plt.title('Number of Corrected Comments by Sentiment')
plt.show()
st.pyplot(plt)
st.subheader("Download Options")
excel_buffer_cluster = io.BytesIO()
with pd.ExcelWriter(excel_buffer_cluster, engine='xlsxwriter') as writer:
ba_lainnya_data.to_excel(writer, index=False, sheet_name='Clustered Data')
excel_buffer_cluster.seek(0)
st.download_button(
label=f"Download Clustered Data for {selected_candidate}",
data=excel_buffer_cluster,
file_name=f"clustered_data_{selected_candidate}.xlsx",
mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
)
st.subheader("Save Corrected Comments to Training Dataset")
if st.button("Save Corrected Comments"):
try:
# Save only the corrected results
corrected_comments['tanggal_masuk'] = pd.Timestamp.now().strftime('%Y-%m-%d')
update_training_dataset(corrected_comments, st.session_state['candidate'])
st.success("Corrected comments have been successfully saved to the training dataset.")
# Clear corrected comments from session state to avoid duplicate saves
st.session_state['data_with_ba'] = pd.DataFrame(columns=corrected_comments.columns)
# Rerun automatically after saving to refresh the state
st.rerun()
except Exception as e:
st.error(f"An error occurred while saving the corrected comments: {e}")
except FileNotFoundError:
st.error(f"No dataset found for {selected_candidate}. Please add data to create the dataset.")
except Exception as e:
st.error(f"An unexpected error occurred: {e}")
# === Sidebar Navigation === #
menu = st.sidebar.radio("Select a Feature", ["Model-Based Classification", "Clustering for 'BA Lainnya'", "Update Keywords", "View Training Dataset", "Evaluate Data Train", "Maximize Preprocessing", "Retraining Model"])
if menu == "Model-Based Classification":
st.title("Model-Based Classification")
candidate = st.selectbox("Choose a candidate:", candidate_list)
model_path = f"best_rf_model_{candidate.replace(' ', '_').lower()}.joblib"
vectorizer_path = f"tfidf_vectorizer_{candidate.replace(' ', '_').lower()}.joblib"
# Save the selected candidate to session state
st.session_state['candidate'] = candidate
uploaded_file = st.file_uploader("Upload an Excel file for classification", type=['xlsx'])
@st.cache_data
def load_model_and_vectorizer(model_path, vectorizer_path):
"""Load model and vectorizer, cache them for efficiency."""
try:
model = joblib.load(model_path)
vectorizer = joblib.load(vectorizer_path)
return model, vectorizer
except FileNotFoundError:
return None, None
model, vectorizer = load_model_and_vectorizer(model_path, vectorizer_path)
if model is None or vectorizer is None:
st.error("Model or vectorizer not found for the selected candidate.")
st.stop()
# Save the vectorizer and model to session state
st.session_state['vectorizer'] = vectorizer
st.session_state['model'] = model
@st.cache_data
def preprocess_data(data):
"""Preprocess comments with batching for large datasets."""
from joblib import Parallel, delayed
def preprocess_batch(batch):
batch['translated_emojis'] = batch['komentar'].apply(translate_emojis)
batch['normalized_unicode'] = batch['translated_emojis'].apply(normalize_unicode)
batch['reply_handled'] = batch['normalized_unicode'].apply(handle_replies)
batch['clean_text'] = batch['reply_handled'].apply(clean_text)
batch['translated_ntb'] = batch['clean_text'].apply(lambda x: translate_text(x, ntb_dict))
batch['translated_slang'] = batch['translated_ntb'].apply(lambda x: translate_text(x, slang_dict))
batch['negation_handled'] = batch['translated_slang'].apply(handle_negation)
return batch
batch_size = 10000 # Process 10,000 rows at a time
batches = [data.iloc[i:i+batch_size] for i in range(0, len(data), batch_size)]
processed_batches = Parallel(n_jobs=-1)(delayed(preprocess_batch)(batch) for batch in batches)
return pd.concat(processed_batches, ignore_index=True)
# Process uploaded file
if uploaded_file:
try:
data = pd.read_excel(uploaded_file)
if 'komentar' not in data.columns:
st.error("The uploaded file must include a 'komentar' column.")
st.stop()
data = preprocess_data(data)
except Exception as e:
st.error(f"An error occurred while processing the file: {e}")
st.stop()
elif 'model_classified_data' in st.session_state:
data = st.session_state['model_classified_data']
else:
st.info("Please upload a file for classification.")
st.stop()
# Transform comments into TF-IDF vectors
try:
tfidf_data = vectorizer.transform(data['negation_handled'].fillna(''))
data['predicted_category'] = model.predict(tfidf_data)
data['probabilities'] = model.predict_proba(tfidf_data).tolist()
data['max_probability'] = data['probabilities'].apply(lambda x: max(x))
except Exception as e:
st.error(f"An error occurred during model prediction: {e}")
st.stop()
# Cache classified data
st.session_state['model_classified_data'] = data
# Interactive threshold adjustment
st.subheader("Set Threshold for 'BA Lainnya'")
threshold = st.slider("Threshold for tagging 'BA Lainnya'", min_value=0.0, max_value=1.0, value=0.80, step=0.01)
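# Comments whose highest class probability falls below this threshold are tagged
# 'BA Lainnya' (low confidence) and handled later by the clustering workflow; the rest
# keep the category predicted by the model.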
# Apply threshold to tag "BA Lainnya"
data['tag'] = data['max_probability'].apply(lambda x: 'BA Lainnya' if x < threshold else '')
# Separate data for visualization
data_without_ba = data[data['tag'] != 'BA Lainnya']
data_with_ba = data[data['tag'] == 'BA Lainnya']
# Save updated results to session state for dynamic updates
st.session_state['data_without_ba'] = data_without_ba
st.session_state['data_with_ba'] = data_with_ba
# Preview Results
st.subheader("Preview Results")
st.write("### 1. Hasil Klasifikasi Tanpa Tag 'BA Lainnya'")
if not data_without_ba.empty:
st.dataframe(data_without_ba[['komentar', 'predicted_category', 'max_probability']])
else:
st.info("No high-probability classifications available.")
st.write("### 2. Hasil Klasifikasi Dengan Tag 'BA Lainnya'")
if not data_with_ba.empty:
st.dataframe(data_with_ba[['komentar', 'predicted_category', 'max_probability']])
else:
st.info("No low-probability classifications available.")
# Visualization: Sentiment Distribution
st.subheader("Sentiment Distribution Visualization")
def plot_distribution(data, title):
sentiment_counts = data['predicted_category'].value_counts()
fig, ax = plt.subplots()
ax.bar(sentiment_counts.index, sentiment_counts.values)
ax.set_title(title)
ax.set_xlabel("Sentiments")
ax.set_ylabel("Count")
st.pyplot(fig)
if not data_without_ba.empty:
plot_distribution(data_without_ba, "Sentiment Distribution (Without 'BA Lainnya')")
if not data_with_ba.empty:
plot_distribution(data_with_ba, "Sentiment Distribution (With 'BA Lainnya')")
# Download Results
st.subheader("Download Results")
excel_buffer = io.BytesIO()
with pd.ExcelWriter(excel_buffer, engine='xlsxwriter') as writer:
data.to_excel(writer, index=False, sheet_name='Classification Results')
excel_buffer.seek(0)
st.download_button(
label="Download All Classification Results",
data=excel_buffer,
file_name=f"classification_results_{candidate}.xlsx",
mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
)
# Save Non-'BA Lainnya' Data to Training Dataset
st.subheader("Save Classified Data")
if not data_without_ba.empty:
st.info("Only Non-'BA Lainnya' comments will be saved to the training dataset.")
if st.button("Save Non-'BA Lainnya' Data to Training Dataset"):
try:
data_to_save = data_without_ba[['model_data', 'Platform', 'komentar', 'link', 'kandidat', 'tanggal']].copy()
data_to_save['sentimen'] = data_without_ba['predicted_category']
data_to_save['tanggal_masuk'] = pd.Timestamp.now().strftime('%Y-%m-%d')
update_training_dataset(data_to_save, candidate)
st.success("Data successfully saved to the training dataset.")
except Exception as e:
st.error(f"An error occurred while saving the data: {e}")
else:
st.info("No Non-'BA Lainnya' data available to save.")
pass # Placeholder
# Integration of the clustering workflow into the "Clustering for 'BA Lainnya'" page
if menu == "Clustering for 'BA Lainnya'":
selected_candidate = st.session_state.get('selected_candidate')  # not used further; run_clustering_for_ba_lainnya reads st.session_state['candidate'] itself
run_clustering_for_ba_lainnya()
pass # Placeholder
# Only run this code when the selected menu is "View Training Dataset"
if menu == "View Training Dataset":
st.title("View Training Dataset")
# Header for choosing a candidate
st.header("Options")
selected_candidate = st.selectbox("Choose a candidate:", list(candidate_list), key='candidate_select_view')
# Dataset paths
dataset_path = f"datasetntbnew_{selected_candidate.lower().replace(' ', '_')}.xlsx"
history_path = f"history_{selected_candidate.lower().replace(' ', '_')}.json"
# Load the dataset
try:
df = pd.read_excel(dataset_path)
# Make sure the required columns exist
required_columns = ['model_data', 'Platform', 'komentar', 'sentimen', 'tanggal', 'tanggal_masuk', 'link', 'evaluated_by_data_train']
for col in required_columns:
if col not in df.columns:
if col == 'evaluated_by_data_train':
df[col] = False
else:
df[col] = None
# Add the 'update_ba' column if it does not exist yet
if 'update_ba' not in df.columns:
df['update_ba'] = None
# Add the 'missing_comment' column if it does not exist yet
if 'missing_comment' not in df.columns:
df['missing_comment'] = False
# Make sure 'tanggal_masuk' and 'tanggal' are in a proper datetime format
df['tanggal_masuk'] = pd.to_datetime(df['tanggal_masuk'], errors='coerce')
df['tanggal'] = pd.to_datetime(df['tanggal'], errors='coerce')
# Keep only the date part of the date columns, dropping the time component
df['tanggal'] = df['tanggal'].dt.date
df['tanggal_masuk'] = df['tanggal_masuk'].dt.date
df['update_ba'] = pd.to_datetime(df['update_ba'], errors='coerce').dt.date
# Handle NaT (Not a Time) values, if any
if df['tanggal_masuk'].isnull().any():
st.warning("Some dates 'tanggal_masuk' could not be parsed correctly. Please check the date format in the dataset.")
df['tanggal_masuk'] = df['tanggal_masuk'].fillna(pd.Timestamp.now().date())
if df['tanggal'].isnull().any():
st.warning("Some dates 'tanggal' could not be parsed correctly. Please check the date format in the dataset.")
df['tanggal'] = df['tanggal'].fillna(pd.Timestamp.now().date())
# Add the 'kandidat' column if it is missing, and fill it
if 'kandidat' not in df.columns:
df['kandidat'] = selected_candidate
# Keep only the required subset of columns
df = df[required_columns + ['update_ba', 'kandidat', 'missing_comment']]
# Replace missing update_ba values with tanggal_masuk
df['update_ba'] = df['update_ba'].fillna(df['tanggal_masuk'])
# Show basic statistics
st.subheader(f"Training Dataset for {selected_candidate}")
st.write(f"**Total rows in dataset:** {len(df)}")
if not df.empty:
# Visualization of posts by BA update date
st.subheader("Visualisasi Postingan Berdasarkan Update BA")
ba_update_counts = df['update_ba'].value_counts().sort_index()
fig, ax = plt.subplots(figsize=(10, 6))
ba_update_counts.plot(kind='bar', ax=ax, color='blue')
ax.set_title('Sebaran Postingan Berdasarkan Update BA')
ax.set_xlabel('Tanggal Update BA')
ax.set_ylabel('Jumlah Postingan')
plt.xticks(rotation=45)
plt.tight_layout()
st.pyplot(fig)
# Additional visualization of the platform distribution
st.subheader("Sebaran Platform Berdasarkan Update BA")
platform_counts = df['Platform'].value_counts()
fig, ax = plt.subplots(figsize=(10, 6))
platform_counts.plot(kind='bar', ax=ax, color='green')
ax.set_title('Sebaran Platform Berdasarkan Update BA')
ax.set_xlabel('Platform')
ax.set_ylabel('Jumlah Postingan')
plt.xticks(rotation=45)
plt.tight_layout()
st.pyplot(fig)
# Missing comments per platform
st.subheader("Jumlah Komentar Hilang Berdasarkan Platform")
missing_comments_by_platform = df.groupby('Platform')['missing_comment'].sum().sort_index()
fig, ax = plt.subplots(figsize=(10, 6))
missing_comments_by_platform.plot(kind='bar', ax=ax, color='red')
ax.set_title('Jumlah Komentar Hilang Berdasarkan Platform')
ax.set_xlabel('Platform')
ax.set_ylabel('Jumlah Komentar Hilang')
plt.xticks(rotation=45)
plt.tight_layout()
st.pyplot(fig)
# Filter by validation status
st.subheader("Filter Data")
validation_filter = st.radio(
"Choose data type to view:",
["All Data", "Validated Data", "Non-Validated Data"],
key='validation_filter'
)
if validation_filter == "Validated Data":
filtered_data = df[df['evaluated_by_data_train'] == True]
elif validation_filter == "Non-Validated Data":
filtered_data = df[df['evaluated_by_data_train'] == False]
else:
filtered_data = df
if not filtered_data.empty:
st.subheader(f"Filtered Data: {validation_filter}")
st.dataframe(filtered_data)  # Show every row that matches the filter
else:
st.warning("Tidak ada data yang sesuai dengan filter yang dipilih.")
# Show the history of data additions
st.subheader("History of Data Additions")
try:
with open(history_path, "r") as f:
history = json.load(f)
history_list = []
for key, value in history.items():
for entry in value:
for k, v in entry.items():
history_list.append({
'key': key,
'field': k,
'date': v
})
history_df = pd.DataFrame(history_list)
st.dataframe(history_df)
except FileNotFoundError:
st.write("No addition history available.")
except ValueError as e:
st.error(f"An error occurred while loading history data: {e}")
# Option to download the filtered dataset
st.subheader("Download Options")
if not filtered_data.empty:
excel_buffer = io.BytesIO()
with pd.ExcelWriter(excel_buffer, engine='xlsxwriter') as writer:
filtered_data.to_excel(writer, index=False, sheet_name='Filtered Dataset')
excel_buffer.seek(0)
st.download_button(
label=f"Download Filtered Dataset for {selected_candidate}",
data=excel_buffer,
file_name=f"filtered_training_dataset_{selected_candidate}.xlsx",
mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
)
# Option to download the full dataset
if not df.empty:
excel_buffer_full = io.BytesIO()
with pd.ExcelWriter(excel_buffer_full, engine='xlsxwriter') as writer:
df.to_excel(writer, index=False, sheet_name='Training Dataset')
excel_buffer_full.seek(0)
st.download_button(
label=f"Download Full Training Dataset for {selected_candidate}",
data=excel_buffer_full,
file_name=f"training_dataset_{selected_candidate}.xlsx",
mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
)
# Feature to download the dataset filtered by date range
st.subheader("Download Aggregated Data by Date")
# Use the min and max dates as defaults for the date_input widgets
if not df['tanggal'].empty:
min_date = df['tanggal'].min()
max_date = df['tanggal'].max()
selected_start_date = st.date_input("Select start date for aggregation:", value=min_date)
selected_end_date = st.date_input("Select end date for aggregation:", value=max_date)
agg_filtered_data = df[(df['tanggal'] >= selected_start_date) & (df['tanggal'] <= selected_end_date)]
if not agg_filtered_data.empty:
# Make sure the 'kandidat' column exists and is filled
agg_filtered_data['kandidat'] = selected_candidate
aggregated_data = agg_filtered_data.groupby(['kandidat', 'link', 'tanggal', 'Platform', 'model_data', 'update_ba']).agg(
co_likes=('sentimen', lambda x: (x == 'Co Likes').sum()),
co_support=('sentimen', lambda x: (x == 'Co Support').sum()),
co_optimism=('sentimen', lambda x: (x == 'Co Optimism').sum()),
co_sarkastic=('sentimen', lambda x: (x == 'Co Sarkastic').sum()),
co_negative=('sentimen', lambda x: (x == 'Co Negative').sum()),
missing_comments=('missing_comment', 'sum')  # Aggregate the missing_comment flags
).reset_index()
# Add new columns for the comment counts
aggregated_data['jumlah_komentar_positif'] = aggregated_data['co_likes'] + aggregated_data['co_support'] + aggregated_data['co_optimism']
aggregated_data['jumlah_komentar_negatif'] = aggregated_data['co_sarkastic'] + aggregated_data['co_negative']
aggregated_data['jumlah_komentar'] = aggregated_data[['co_likes', 'co_support', 'co_optimism', 'co_sarkastic', 'co_negative']].sum(axis=1)
st.dataframe(aggregated_data)
# Posts updated by BA, with platform distribution, for the selected date range
st.subheader("Visualisasi Postingan yang Diupdate BA Berdasarkan Rentang Tanggal")
ba_update_range = aggregated_data[aggregated_data['update_ba'] != 'Belum diupdate']
if not ba_update_range.empty:
plt.figure(figsize=(10, 6))
ba_update_range['Platform'].value_counts().plot(kind='bar', title='Sebaran Platform - Diupdate BA (Rentang Tanggal)')
plt.xlabel('Platform')
plt.ylabel('Jumlah Postingan')
st.pyplot(plt)
# Additional visualization of posts by date
st.subheader("Sebaran Postingan Berdasarkan Tanggal")
plt.figure(figsize=(10, 6))
ba_update_range['tanggal'].value_counts().sort_index().plot(kind='bar', title='Sebaran Postingan Berdasarkan Tanggal')
plt.xlabel('Tanggal')
plt.ylabel('Jumlah Postingan')
plt.xticks(rotation=45)
plt.tight_layout()
st.pyplot(plt)
# Additional visualization of BA updates
st.subheader("Sebaran Update BA")
plt.figure(figsize=(10, 6))
ba_update_range['update_ba'].value_counts().sort_index().plot(kind='bar', title='Sebaran Update BA')
plt.xlabel('Tanggal Update BA')
plt.ylabel('Jumlah Postingan')
plt.xticks(rotation=45)
plt.tight_layout()
st.pyplot(plt)
# Comment counts by tanggal_masuk
st.subheader("Jumlah Komentar Berdasarkan Tanggal Masuk")
plt.figure(figsize=(10, 6))
agg_filtered_data.groupby('tanggal_masuk')['komentar'].count().sort_index().plot(kind='bar', title='Jumlah Komentar Berdasarkan Tanggal Masuk')
plt.xlabel('Tanggal Masuk')
plt.ylabel('Jumlah Komentar')
plt.xticks(rotation=45)
plt.tight_layout()
st.pyplot(plt)
# Comment distribution per platform
st.subheader("Sebaran Komentar di Tiap Platform")
plt.figure(figsize=(10, 6))
agg_filtered_data['Platform'].value_counts().plot(kind='bar', title='Sebaran Komentar di Tiap Platform')
plt.xlabel('Platform')
plt.ylabel('Jumlah Komentar')
plt.xticks(rotation=45)
plt.tight_layout()
st.pyplot(plt)
# Missing comments per post
st.subheader("Jumlah Komentar Hilang Berdasarkan Postingan")
plt.figure(figsize=(10, 6))
aggregated_data.groupby('link')['missing_comments'].sum().sort_index().plot(kind='bar', title='Jumlah Komentar Hilang Berdasarkan Postingan')
plt.xlabel('Link')
plt.ylabel('Jumlah Komentar Hilang')
plt.xticks(rotation=45)
plt.tight_layout()
st.pyplot(plt)
else:
st.warning("Tidak ada data yang diupdate BA untuk rentang tanggal yang dipilih.")
# Option to download the aggregated dataset
excel_buffer_aggregated = io.BytesIO()
with pd.ExcelWriter(excel_buffer_aggregated, engine='xlsxwriter') as writer:
aggregated_data.to_excel(writer, index=False, sheet_name='Aggregated Data')
excel_buffer_aggregated.seek(0)
st.download_button(
label=f"Download Aggregated Data by Date for {selected_candidate}",
data=excel_buffer_aggregated,
file_name=f"aggregated_data_{selected_candidate}.xlsx",
mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
)
else:
st.warning("Tidak ada data yang sesuai dengan rentang tanggal yang dipilih untuk diagregasi.")
else:
st.warning("Tidak ada data dalam dataset untuk divisualisasikan.")
else:
st.warning("Tidak ada data dalam dataset untuk divisualisasikan.")
except FileNotFoundError:
st.error(f"No training dataset found for {selected_candidate}. Please add data to create the dataset.")
except Exception as e:
st.error(f"An error occurred: {e}")
pass
if menu == "Evaluate Data Train":
st.title("Evaluate Data Train")
selected_candidate = st.selectbox("Choose a candidate:", list(candidate_list), key='candidate_select_evaluate')
dataset_path = f"datasetntbnew_{selected_candidate.lower().replace(' ', '_')}.xlsx"
try:
df = pd.read_excel(dataset_path)
# Load existing keyword dictionary
try:
with open('keywords.json', 'r') as f:
keyword_dict = json.load(f)
st.success("keywords.json loaded successfully.")
except FileNotFoundError:
st.error("keywords.json file not found. Please ensure the file is in the correct directory.")
st.stop()
except json.JSONDecodeError:
st.error("keywords.json file is not a valid JSON. Please check the file format.")
st.stop()
# Select candidate-specific keywords
candidate_key = selected_candidate.replace(' ', '_')
candidate_keywords = keyword_dict.get(candidate_key)
if not candidate_keywords:
st.error(f"No keywords found for the selected candidate '{selected_candidate}'. Please update the 'keywords.json' file with appropriate keywords.")
st.stop()
keywords = [kw for sentiment_keywords in candidate_keywords.values() for kw in sentiment_keywords]
# Data consistency validation
st.subheader("Data Consistency Validation")
missing_values = df.isnull().sum()
st.write("Missing values in each column:")
st.write(missing_values)
# Drop rows with missing values
st.write("Removing rows with missing values...")
df.dropna(inplace=True)
# Sentiment distribution
st.subheader("Sentiment Distribution")
sentiment_counts = df['sentimen'].value_counts()
st.write("Number of comments for each sentiment:")
st.write(sentiment_counts)
# Data quality
st.subheader("Data Quality Check")
invalid_entries = df[df['komentar'].str.len() == 0]
st.write(f"Number of invalid comments (empty): {len(invalid_entries)}")
if len(invalid_entries) > 0:
st.write("Invalid comments (empty):")
st.dataframe(invalid_entries.head(100))  # Preview the first 100 rows
# Detect inconsistent sentiment labels
st.subheader("Inconsistent Sentiment Labels")
duplicate_comments = df[df.duplicated(subset=['komentar'], keep=False)]
inconsistent_labels = duplicate_comments.groupby('komentar')['sentimen'].nunique()
inconsistent_labels = inconsistent_labels[inconsistent_labels > 1]
if not inconsistent_labels.empty:
inconsistent_labels_df = duplicate_comments[duplicate_comments['komentar'].isin(inconsistent_labels.index)]
st.write(f"Number of comments with inconsistent sentiment labels: {len(inconsistent_labels_df)}")
st.dataframe(inconsistent_labels_df.head(100))  # Preview the first 100 rows
else:
st.write("No comments with inconsistent sentiment labels found.")
# Problem handling
st.subheader("Problem Handling")
# Remove empty comments
st.write("Removing invalid (empty) comments...")
df = df[df['komentar'].str.len() > 0]
# Interactive: resolve comments whose sentiment labels are inconsistent
st.write("Resolving inconsistent sentiment labels...")
if not inconsistent_labels.empty:
for index, row in inconsistent_labels_df.iterrows():
st.write(f"Comment: {row['komentar']}")
sentimen_options = df[df['komentar'] == row['komentar']]['sentimen'].unique().tolist()
new_sentimen = st.selectbox("Select correct sentiment", sentimen_options, key=f'sentimen_{index}')
if st.button("Update Sentiment", key=f'update_{index}'):
update_sentiment(index, new_sentimen)
# Clustering using keywords and the sarcasm model
st.write("Clustering comments using keywords and sarcasm model...")
keyword_vectorizer = TfidfVectorizer(vocabulary=keywords)
X_keywords = keyword_vectorizer.fit_transform(df['komentar'])
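# The vocabulary is restricted to the candidate's keyword list, so each feature counts a
# specific keyword. Comments containing none of the keywords become all-zero vectors and
# tend to collapse into the same cluster, which is why a keyword-update step follows.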
kmeans = KMeans(n_clusters=10, random_state=0).fit(X_keywords)
df['cluster'] = kmeans.labels_
# Identify clusters that need review
review_clusters = df[df['cluster'].isin(df['cluster'].value_counts()[df['cluster'].value_counts() > 10].index)]
st.write("Clusters identified for review:")
st.dataframe(review_clusters.head(100))  # Preview the first 100 rows
# Notification to add new keywords
st.warning("Some comments were not captured by the current keywords. Please add new keywords in the 'Update Keywords' section.")
# Cluster visualization
cluster_counts = df['cluster'].value_counts()
st.write("Number of comments in each cluster:")
st.write(cluster_counts)
# Show the clustering results
st.write("Comments clustered by patterns:")
st.dataframe(df.head(100))  # Preview the first 100 rows
# Export the analysis and handling results
st.subheader("Export Final Data")
json_buffer = io.BytesIO()
df.to_json(json_buffer, orient='records', lines=True)
json_buffer.seek(0)
st.download_button(
label=f"Download Final Data for {selected_candidate}",
data=json_buffer,
file_name=f"final_data_{selected_candidate}.json",
mime="application/json"
)
except FileNotFoundError:
st.error(f"No training dataset found for {selected_candidate}. Please add data to create the dataset.")
except Exception as e:
st.error(f"An error occurred: {e}")
pass # Placeholder
if menu == "Retraining Model":
st.title("Retrain Model")
selected_candidate = st.selectbox("Select a candidate to retrain the model:", list(candidate_list))
dataset_path = f"datasetntbnew_{selected_candidate.lower().replace(' ', '_')}.xlsx"
model_path = f"best_rf_model_{selected_candidate.lower().replace(' ', '_')}.joblib"
vectorizer_path = f"tfidf_vectorizer_{selected_candidate.lower().replace(' ', '_')}.joblib"
retrain_history_path = f"retrain_history_{selected_candidate.lower().replace(' ', '_')}.json"
try:
# Load dataset
data = pd.read_excel(dataset_path)
# Ensure 'komentar' is string
if 'komentar' not in data.columns:
st.error("Dataset must include a 'komentar' column.")
st.stop()
else:
# Convert all comments to string
data['komentar'] = data['komentar'].fillna('').astype(str)
# Separate validated and unvalidated data
if 'evaluated_by_cluster' in data.columns:
validated_data = data[data['evaluated_by_cluster'] == True]
unvalidated_data = data[data['evaluated_by_cluster'] == False]
else:
validated_data = pd.DataFrame(columns=data.columns)
unvalidated_data = data
st.write(f"**Validated Data:** {len(validated_data)} rows")
st.write(f"**Unvalidated Data:** {len(unvalidated_data)} rows")
# Check if all data is validated
if len(unvalidated_data) > 0:
st.warning("Model retraining is only allowed if all data has been validated through 'Evaluate Clustering'. Please ensure all data is validated before retraining the model.")
st.stop()
# Combine all data for preprocessing
combined_data = validated_data # Only use validated data
# Preprocessing Function
@st.cache_data(show_spinner=True)
def preprocess_data(data):
from joblib import Parallel, delayed
def preprocess_comment(comment):
comment = translate_emojis(comment)
comment = normalize_unicode(comment)
comment = handle_replies(comment)
comment = clean_text(comment)
comment = translate_text(comment, ntb_dict)
comment = translate_text(comment, slang_dict)
comment = handle_negation(comment)
return comment
data['processed_comments'] = Parallel(n_jobs=-1)(
delayed(preprocess_comment)(c) for c in data['komentar']
)
return data
# Preprocessing
st.write("Starting preprocessing...")
combined_data = preprocess_data(combined_data)
if st.button("Retrain Model"):
# Vectorization
st.write("Vectorizing data...")
vectorizer = TfidfVectorizer(ngram_range=(1, 1), max_features=5000)
X = vectorizer.fit_transform(combined_data['processed_comments'])
y = combined_data['sentimen']
# Split Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Handle Class Imbalance with SMOTE
st.write("Balancing data with SMOTE...")
smote = SMOTE(random_state=42, n_jobs=-1)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
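# SMOTE is applied to the training split only, so the test split keeps its original class
# balance. Note that the training-set metrics further below are computed on the original
# (non-resampled) X_train. Also, the n_jobs argument to SMOTE is accepted only by older
# imbalanced-learn releases and may need to be dropped on newer ones.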
# Train Random Forest Model
st.write("Training Random Forest model...")
rf_model = RandomForestClassifier(n_estimators=200, max_depth=20, random_state=42)
rf_model.fit(X_train_res, y_train_res)
# Evaluate on Training Data
st.write("Evaluating model...")
y_pred_train = rf_model.predict(X_train)
accuracy_train = accuracy_score(y_train, y_pred_train)
report_train = classification_report(y_train, y_pred_train, output_dict=True)
# Evaluate on Test Data
y_pred_test = rf_model.predict(X_test)
accuracy_test = accuracy_score(y_test, y_pred_test)
report_test = classification_report(y_test, y_pred_test, output_dict=True)
# Save Model and Vectorizer
st.write("Saving model and vectorizer...")
joblib.dump(rf_model, model_path)
joblib.dump(vectorizer, vectorizer_path)
# Log Retraining History
st.write("Logging retraining history...")
try:
with open(retrain_history_path, "r") as f:
retrain_history = json.load(f)
except FileNotFoundError:
retrain_history = []
retrain_history.append({
"date_retrained": pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S'),
"accuracy_on_train": accuracy_train,
"accuracy_on_test": accuracy_test,
"f1_score_on_train": report_train['weighted avg']['f1-score'],
"f1_score_on_test": report_test['weighted avg']['f1-score'],
})
with open(retrain_history_path, "w") as f:
json.dump(retrain_history, f, indent=4)
# Display Results
st.success(f"Model retrained successfully! Accuracy on training data: {accuracy_train:.4f}, Accuracy on test data: {accuracy_test:.4f}")
st.subheader("Model Metrics on Training Data")
st.table(pd.DataFrame(report_train).T)
st.subheader("Model Metrics on Test Data")
st.table(pd.DataFrame(report_test).T)
# Show Retrain History
st.subheader("Retrain History")
st.json(retrain_history)
except FileNotFoundError:
st.error(f"No training dataset found for {selected_candidate}. Please add data to create the dataset.")
except Exception as e:
st.error(f"An unexpected error occurred: {e}")
pass # Placeholder
if menu == "Maximize Preprocessing":
st.title("Maximize Preprocessing")
# Load the training dataset
candidate = st.selectbox("Choose a candidate:", list(candidate_list))
dataset_path = f"datasetntbnew_{candidate.lower().replace(' ', '_')}.xlsx"
try:
# Load dataset
data = pd.read_excel(dataset_path)
# Ensure 'komentar' is string
if 'komentar' not in data.columns:
st.error("Dataset must include a 'komentar' column.")
st.stop()
data['komentar'] = data['komentar'].fillna('').astype(str)
# Preprocessing Steps
@st.cache_data(show_spinner=True)
def preprocess_data(data):
st.write("Starting preprocessing...")
data['translated_emojis'] = data['komentar'].apply(translate_emojis)
data['normalized_unicode'] = data['translated_emojis'].apply(normalize_unicode)
data['reply_handled'] = data['normalized_unicode'].apply(handle_replies)
data['clean_text'] = data['reply_handled'].apply(clean_text)
data['translated_ntb'] = data['clean_text'].apply(lambda x: translate_text(x, ntb_dict))
data['translated_slang'] = data['translated_ntb'].apply(lambda x: translate_text(x, slang_dict))
data['negation_handled'] = data['translated_slang'].apply(handle_negation)
return data
data = preprocess_data(data)
# Check Unmapped Words
st.subheader("Check for Unmapped Words")
all_words = (word.lower() for comment in data['negation_handled'] for word in comment.split())
unique_words = set(all_words)
ntb_dict_keys = set(ntb_dict.keys())
slang_dict_keys = set(slang_dict.keys())
mapped_words = ntb_dict_keys.union(slang_dict_keys)
unmapped_words = sorted(unique_words - mapped_words)
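# "Unmapped" words are tokens that survive the preprocessing chain yet appear as a key in
# neither ntb_dict nor slang_dict. The form below lets the user add them to one of the
# dictionaries; the additions live only in memory until the corresponding save button is
# pressed.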
if unmapped_words:
st.write(f"Found **{len(unmapped_words)} unmapped words.**")
# Choose how many words to display
max_words = st.slider(
"Select number of words to display:",
min_value=10,
max_value=len(unmapped_words),
value=min(50, len(unmapped_words)),
step=10,
)
# Build a DataFrame for the table view
unmapped_df = pd.DataFrame(unmapped_words, columns=["Unmapped Words"])
st.dataframe(unmapped_df.head(max_words))
# Show how many words are displayed out of the total
st.caption(f"Showing {min(max_words, len(unmapped_words))} out of {len(unmapped_words)} unmapped words.")
else:
st.success("No unmapped words found!")
# Add Words to Dictionary
st.subheader("Add New Words to Dictionary")
new_word = st.text_input("Enter new word:")
normalized_word = st.text_input("Enter normalized form:")
dictionary_choice = st.radio("Select dictionary to update:", ["Kamus Alay", "Kamus ntb"])
if st.button("Add to Dictionary"):
if new_word and normalized_word:
if dictionary_choice == "Kamus Alay":
slang_dict[new_word.lower()] = normalized_word
st.success(f"Added '{new_word}' -> '{normalized_word}' to Kamus Alay.")
elif dictionary_choice == "Kamus ntb":
ntb_dict[new_word.lower()] = normalized_word
st.success(f"Added '{new_word}' -> '{normalized_word}' to Kamus ntb.")
else:
st.warning("Please enter both the new word and its normalized form.")
# Save Updates to File
st.subheader("Save Updated Dictionaries")
if st.button("Save Kamus Alay"):
kamus_alay_path = 'kamusalay.csv' # Adjust the path as needed
pd.DataFrame(list(slang_dict.items()), columns=["slang", "formal"]).to_csv(kamus_alay_path, index=False)
st.success(f"Kamus Alay saved successfully to {kamus_alay_path}.")
if st.button("Save Kamus ntb"):
kamus_ntb_path = 'ntb_dict.json' # Adjust the path as needed
with open(kamus_ntb_path, 'w', encoding='utf-8') as f:
json.dump(ntb_dict, f, indent=4)
st.success(f"Kamus ntb saved successfully to {kamus_ntb_path}.")
except FileNotFoundError:
st.error(f"No training dataset found for {candidate}. Please ensure the dataset is available.")
except Exception as e:
st.error(f"An unexpected error occurred: {e}")
pass # Placeholder
if menu == "Update Keywords":
st.title("Update Keywords")
# Load existing keyword dictionary
with open('keywords.json', 'r') as f:
keyword_dict = json.load(f)
# Show current keywords
st.subheader("Current Keywords")
candidate = st.selectbox("Select candidate", list(keyword_dict.keys()))
for sentiment, keywords in keyword_dict[candidate].items():
st.write(f"{sentiment}: {', '.join(keywords)}")
# Add new keyword
st.subheader("Add New Keyword")
new_keyword = st.text_input("Enter new keyword")
selected_sentiment = st.selectbox("Select sentiment for new keyword", list(keyword_dict[candidate].keys()))
if st.button("Add Keyword"):
if new_keyword and selected_sentiment:
keyword_dict[candidate][selected_sentiment].append(new_keyword)
with open('keywords.json', 'w') as f:
json.dump(keyword_dict, f, indent=4)
st.success(f"Keyword '{new_keyword}' added to {selected_sentiment} for {candidate}")
else:
st.error("Please enter a keyword and select a sentiment")
# Analyze Special Cluster
st.subheader("Analyze Special Cluster")
if 'ba_lainnya_data' in st.session_state:
try:
# Load the `Special Cluster` data directly
special_cluster_data = st.session_state['ba_lainnya_data'][st.session_state['ba_lainnya_data']['Cluster_Name'] == 'Special Cluster']
if special_cluster_data.empty:
st.warning("No data found in Special Cluster.")
else:
st.write(f"Total comments in Special Cluster: {len(special_cluster_data)}")
all_words_special = []
for comment in special_cluster_data['negation_handled']:
comment = translate_emojis(comment)
comment = normalize_unicode(comment)
comment = handle_replies(comment)
comment = clean_text(comment)
comment = translate_text(translate_text(comment, ntb_dict), slang_dict)  # Apply both normalization dictionaries, mirroring the main preprocessing chain
comment = handle_negation(comment)
words = preprocess_text(comment)
all_words_special.extend(words)
# Calculate word frequencies
word_freq_special = Counter(all_words_special)
# Add slider to select number of words to display
num_words_special = st.slider("Number of words to display (Special Cluster)", min_value=5, max_value=50, value=20)
most_common_words_special = word_freq_special.most_common(num_words_special)
# Display word frequencies as a table
st.subheader(f"Top {num_words_special} Word Frequencies in Special Cluster")
word_freq_df_special = pd.DataFrame(most_common_words_special, columns=['Word', 'Frequency'])
st.dataframe(word_freq_df_special)
except Exception as e:
st.error(f"An error occurred: {e}")
else:
st.warning("No 'BA Lainnya' data found. Please classify comments first.")
# Analyze Training Data
st.subheader("Analyze Training Data")
dataset_path = f"datasetntbnew_{candidate.lower().replace(' ', '_')}.xlsx"
try:
train_data = pd.read_excel(dataset_path)
if train_data.empty:
st.warning("Training dataset is empty.")
else:
all_words_train = []
for comment in train_data['komentar'].astype(str):
comment = translate_emojis(comment)
comment = normalize_unicode(comment)
comment = handle_replies(comment)
comment = clean_text(comment)
comment = translate_text(translate_text(comment, ntb_dict), slang_dict)  # Apply both normalization dictionaries, mirroring the main preprocessing chain
comment = handle_negation(comment)
words = preprocess_text(comment)
all_words_train.extend(words)
# Calculate word frequencies
word_freq_train = Counter(all_words_train)
# Add slider to select number of words to display
num_words_train = st.slider("Number of words to display (Training Data)", min_value=5, max_value=50, value=20)
most_common_words_train = word_freq_train.most_common(num_words_train)
# Display word frequencies as a table
st.subheader(f"Top {num_words_train} Word Frequencies in Training Data")
word_freq_df_train = pd.DataFrame(most_common_words_train, columns=['Word', 'Frequency'])
st.dataframe(word_freq_df_train)
except FileNotFoundError:
st.error(f"Training dataset for {candidate} not found.")
except Exception as e:
st.error(f"An error occurred: {e}")
# Option to export keywords
st.subheader("Export Keywords")
json_buffer = io.BytesIO()
json_buffer.write(json.dumps(keyword_dict).encode('utf-8'))
json_buffer.seek(0)
st.download_button(
label="Export Keywords",
data=json_buffer,
file_name="keywords.json",
mime="application/json"
)
# Option to import keywords
st.subheader("Import Keywords")
uploaded_file = st.file_uploader("Choose a JSON file", type="json")
if uploaded_file is not None:
imported_keywords = json.load(uploaded_file)
keyword_dict.update(imported_keywords)
with open('keywords.json', 'w') as f:
json.dump(keyword_dict, f, indent=4)
st.success("Keywords imported successfully")
pass