import streamlit as st
import joblib
import pandas as pd
import re
import emoji
import json
import io
import unicodedata
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import os
from wordcloud import WordCloud
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from transformers import pipeline
from collections import Counter
import nltk
from nltk.corpus import stopwords
from datetime import datetime
# === Preprocessing Functions === #
candidate_list = ["Lalu Muhamad Iqbal", "Indah Dhamayanti Putri", "Zulkieflimansyah", "M Suhaili", "Sitti Rohmi Djalilah", "Musyafirin"]
# Download the Indonesian stopwords if they are not available yet
nltk.download('stopwords')
stop_words = set(stopwords.words('indonesian'))
# Make sure the "BA Lainnya" data is available
if 'data_with_ba' in st.session_state:
    ba_lainnya_data = st.session_state['data_with_ba']
else:
    ba_lainnya_data = None
def translate_emojis(text):
    if not isinstance(text, str):
        return ''  # Return an empty string if the input is not a string
    # Despite its name, this strips all emojis rather than translating them
    return ''.join(c for c in text if not emoji.is_emoji(c))
def normalize_unicode(text): | |
return unicodedata.normalize('NFKD', text) | |
def clean_text(text): | |
text = str(text).casefold() # Convert to lowercase | |
text = re.sub(r'http\S+|www\S+', '', text) # Remove URLs | |
text = re.sub(r'[^a-z\s]', '', text) # Remove non-alphabetic characters | |
text = re.sub(r'\s+', ' ', text).strip() # Normalize spaces | |
return text | |
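# Illustrative example (not executed), assuming the pipeline order used later in this file:
#   raw = "Mantap 👍 cek https://contoh.id !!"          # made-up comment and URL
#   translate_emojis(raw) -> "Mantap  cek https://contoh.id !!"
#   clean_text(_)         -> "mantap cek"
# URLs, digits, punctuation, and extra whitespace are stripped; text is casefolded.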
def handle_negation(text):
    negation_words = {"tidak", "bukan", "jangan", "belum", "kurang", "gagal", "sulit"}
    words = text.split()
    result = []
    skip_next = False
    for i, word in enumerate(words):
        if skip_next:
            # This word was already merged into the previous negation token
            skip_next = False
        elif word in negation_words and i + 1 < len(words):
            result.append(f"{word}_{words[i + 1]}")  # Combine negation with the next word
            skip_next = True
        else:
            result.append(word)
    return ' '.join(result)
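# Illustrative example (not executed): negation words are fused with the word that
# follows them so the classifier sees them as a single token, e.g.
#   handle_negation("tidak bagus dan belum jelas") -> "tidak_bagus dan belum_jelas"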
def handle_replies(text): | |
text = re.sub(r'=--*@\w+', '', text) # Remove multi-level reply patterns | |
text = re.sub(r'=-*@\w+', '', text) # Remove single-level reply patterns | |
text = re.sub(r'@\w+', '', text) # Remove standalone @username mentions | |
return text | |
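# Illustrative example (not executed): reply markers and @username mentions are removed,
#   handle_replies("=--@budi setuju sekali @andi") -> " setuju sekali "
# The leftover spaces are collapsed later by clean_text.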
def translate_text(text, dictionary): | |
words = text.split() | |
return ' '.join([dictionary.get(word.lower(), word) for word in words]) # Translate words using dictionary | |
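# Illustrative example (not executed), with a made-up dictionary:
#   translate_text("sy ndak tau", {"sy": "saya", "ndak": "tidak", "tau": "tahu"}) -> "saya tidak tahu"
# Lookup is case-insensitive on the input word; unknown words pass through unchanged.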
# Assign a sentiment based on keyword matches
def assign_sentiment_based_on_keywords(comment, keyword_dict): | |
for sentiment, keywords in keyword_dict.items(): | |
if any(keyword in comment for keyword in keywords): | |
return sentiment | |
return 'unknown' | |
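# Illustrative example (not executed), with a made-up keyword dictionary:
#   assign_sentiment_based_on_keywords("semoga menang", {"Co Optimism": ["semoga"], "Co Negative": ["gagal"]})
#   -> "Co Optimism"
# The first sentiment whose keyword list matches wins; otherwise 'unknown' is returned.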
# === Load Dictionaries === # | |
def load_dictionary(file_path, file_type='json'): | |
if file_type == 'json': | |
with open(file_path, 'r', encoding='utf-8') as file: | |
return json.load(file) | |
elif file_type == 'csv': | |
df = pd.read_csv(file_path, names=['slang', 'formal']) | |
return pd.Series(df['formal'].values, index=df['slang']).to_dict() | |
ntb_dict = load_dictionary('ntb_dict.json', 'json') | |
slang_dict = load_dictionary('kamusalay.csv', 'csv') | |
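# Expected formats (inferred from how the files are read above):
#   ntb_dict.json  - a flat JSON object mapping regional/NTB terms to standard Indonesian words
#   kamusalay.csv  - a two-column CSV (slang, formal) converted into a slang -> formal dict
# Both are applied word-by-word through translate_text().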
# === Utility Functions === #
# Append an entry to the JSON file that stores the update history
def update_history_json(history_path, komentar, link, model_data, field, date):
    # Convert a Timestamp to a plain date string
    if isinstance(date, pd.Timestamp):
        date = date.strftime('%Y-%m-%d')
    # Read the existing history from the JSON file
    try:
        with open(history_path, 'r') as file:
            history_data = json.load(file)
    except (FileNotFoundError, json.JSONDecodeError):
        history_data = {}
    # History key for this specific comment, link, and model_data
    key = f"{komentar}_{link}_{model_data}"
    # Append the new entry under the matching key
    if key in history_data:
        history_data[key].append({field: date})
    else:
        history_data[key] = [{field: date}]
    # Write the history back to the JSON file
    with open(history_path, 'w') as file:
        json.dump(history_data, file, indent=4)
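# Resulting history file layout (sketch): one key per comment/link/model_data combination,
# each holding a list of {field: date} entries, e.g.
#   {"<komentar>_<link>_<model_data>": [{"tanggal_masuk": "YYYY-MM-DD"}, {"update_ba": "YYYY-MM-DD"}]}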
# Update the training dataset with newly classified data
def update_training_dataset(output, candidate): | |
dataset_path = f"datasetntbnew_{candidate.lower().replace(' ', '_')}.xlsx" | |
history_path = f"history_{candidate.lower().replace(' ', '_')}.json" | |
try: | |
required_columns = ['model_data', 'Platform', 'komentar', 'link', 'kandidat', 'sentimen', 'tanggal', 'tanggal_masuk'] | |
output = output[required_columns].copy() | |
if 'predicted_category' in output.columns: | |
output['sentimen'] = output['predicted_category'] | |
output.drop(columns=['predicted_category'], inplace=True) | |
output['tanggal_masuk'] = pd.Timestamp.now() | |
if os.path.exists(dataset_path): | |
existing_data = pd.read_excel(dataset_path) | |
else: | |
existing_data = pd.DataFrame(columns=required_columns) | |
# Add the 'update_ba' column if it does not exist yet
if 'update_ba' not in existing_data.columns: | |
existing_data['update_ba'] = None | |
# Add the 'missing_comment' column if it does not exist yet
if 'missing_comment' not in existing_data.columns: | |
existing_data['missing_comment'] = False | |
# Step 1: check for missing comments
train_comments = existing_data.groupby('link')['komentar'].apply(list).to_dict() | |
new_comments = output.groupby('link')['komentar'].apply(list).to_dict() | |
for link, comments in train_comments.items(): | |
if link in new_comments: | |
new_comment_set = set(new_comments[link]) | |
for comment in comments: | |
if comment not in new_comment_set: | |
existing_data.loc[(existing_data['link'] == link) & (existing_data['komentar'] == comment), 'missing_comment'] = True | |
else: | |
existing_data.loc[(existing_data['link'] == link) & (existing_data['komentar'] == comment), 'missing_comment'] = False | |
# Merge the new rows into the existing data
def update_data(existing_data, new_data, history_path): | |
for index, row in new_data.iterrows(): | |
komentar = row['komentar'] | |
link = row['link'] | |
model_data = row['model_data'] | |
tanggal_klasifikasi = pd.Timestamp.now() | |
# The same comment already exists for this link
existing_entry = existing_data[(existing_data['link'] == link) & (existing_data['komentar'] == komentar)] | |
if not existing_entry.empty: | |
existing_data.loc[existing_entry.index, 'update_ba'] = tanggal_klasifikasi | |
update_history_json(history_path, komentar, link, model_data, 'update_ba', tanggal_klasifikasi) | |
else: | |
# Same link, different comment
existing_link_entry = existing_data[(existing_data['link'] == link)] | |
if not existing_link_entry.empty: | |
new_row = row.copy() | |
new_row['tanggal_masuk'] = tanggal_klasifikasi | |
new_row['update_ba'] = tanggal_klasifikasi | |
existing_data = pd.concat([existing_data, new_row.to_frame().T], ignore_index=True) | |
update_history_json(history_path, komentar, link, model_data, 'tanggal_masuk', tanggal_klasifikasi) | |
update_history_json(history_path, komentar, link, model_data, 'update_ba', tanggal_klasifikasi) | |
else: | |
# New link
new_row = row.copy() | |
new_row['tanggal_masuk'] = tanggal_klasifikasi | |
new_row['update_ba'] = tanggal_klasifikasi | |
existing_data = pd.concat([existing_data, new_row.to_frame().T], ignore_index=True) | |
update_history_json(history_path, komentar, link, model_data, 'tanggal_masuk', tanggal_klasifikasi) | |
update_history_json(history_path, komentar, link, model_data, 'update_ba', tanggal_klasifikasi) | |
# Fill any missing update_ba values with tanggal_masuk
existing_data['update_ba'] = pd.to_datetime(existing_data['update_ba'], errors='coerce')
existing_data['update_ba'] = existing_data['update_ba'].fillna(existing_data['tanggal_masuk'])
return existing_data | |
updated_data = update_data(existing_data, output, history_path) | |
updated_data.to_excel(dataset_path, index=False) | |
st.success(f"Data successfully updated in {candidate}'s training dataset.") | |
if 'missing_comment' in existing_data.columns and existing_data['missing_comment'].any(): | |
st.subheader("Missing Comments") | |
st.write("Comments that were found to be missing:") | |
st.dataframe(existing_data[existing_data['missing_comment']]) | |
except KeyError as e: | |
st.error(f"Missing column in the dataset: {e}") | |
except Exception as e: | |
st.error(f"An error occurred: {e}") | |
def clustering_based_evaluation(df, n_clusters=10): | |
st.write("Starting preprocessing...") | |
df['translated_emojis'] = df['komentar'].fillna('').astype(str).apply(translate_emojis) | |
df['normalized_unicode'] = df['translated_emojis'].apply(normalize_unicode) | |
df['reply_handled'] = df['normalized_unicode'].apply(handle_replies) | |
df['clean_text'] = df['reply_handled'].apply(clean_text) | |
df['translated_ntb'] = df['clean_text'].apply(lambda x: translate_text(x, ntb_dict)) | |
df['translated_slang'] = df['translated_ntb'].apply(lambda x: translate_text(x, slang_dict)) | |
df['negation_handled'] = df['translated_slang'].apply(handle_negation) | |
st.write("Generating TF-IDF vectors...") | |
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english') | |
tfidf_matrix = tfidf_vectorizer.fit_transform(df['negation_handled']) | |
st.write(f"Clustering into {n_clusters} clusters...") | |
kmeans = KMeans(n_clusters=n_clusters, random_state=42) | |
df['Cluster'] = kmeans.fit_predict(tfidf_matrix) | |
st.write("Performing PCA for visualization...") | |
pca = PCA(n_components=2) | |
reduced_data = pca.fit_transform(tfidf_matrix.toarray()) | |
df['PCA1'] = reduced_data[:, 0] | |
df['PCA2'] = reduced_data[:, 1] | |
st.write("Clustering completed successfully!") | |
return df | |
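# Usage sketch (assumes a DataFrame with a 'komentar' column):
#   df = clustering_based_evaluation(df, n_clusters=10)
# adds 'Cluster', 'PCA1', and 'PCA2' columns that can be plotted or inspected per cluster.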
def load_and_process_data(dataset_path, history_path): | |
df = pd.read_excel(dataset_path) | |
df['tanggal_masuk'] = pd.to_datetime(df['tanggal_masuk'], errors='coerce') | |
if df['tanggal_masuk'].isnull().any(): | |
st.warning("Some dates could not be parsed correctly. Please check the date format in the dataset.") | |
df['tanggal_masuk'] = df['tanggal_masuk'].fillna(pd.Timestamp.now().normalize())  # keep the column as datetimes
required_columns = ['model_data', 'Platform', 'komentar', 'sentimen', 'tanggal', 'tanggal_masuk', 'evaluated_by_cluster'] | |
for col in required_columns: | |
if col not in df.columns: | |
if col == 'tanggal_masuk': | |
df[col] = pd.Timestamp.now().strftime('%Y-%m-%d') | |
elif col == 'evaluated_by_cluster': | |
df[col] = False | |
else: | |
df[col] = None | |
df = df[required_columns] | |
try: | |
with open(history_path, "r") as f: | |
history = json.load(f) | |
except FileNotFoundError: | |
history = [] | |
return df, history | |
# Define the function to handle Special Cluster | |
def handle_special_cluster(data, keywords, detector): | |
for index, row in data.iterrows(): | |
text = row['negation_handled'] | |
# Check whether the text contains any of the keywords
detected = False | |
for sentiment, words in keywords.items(): | |
if any(word in text for word in words): | |
data.loc[index, 'predicted_category'] = sentiment.replace('_', ' ') | |
data.loc[index, 'detected_by'] = 'keyword' | |
detected = True | |
break | |
if not detected: | |
# If no keyword matches, fall back to sarcasm detection
result = detector(text) | |
if result[0]['label'] == 'SARCASM': | |
data.loc[index, 'predicted_category'] = 'Co Sarkastic'  # keep the label casing consistent with the other sentiment labels
data.loc[index, 'detected_by'] = 'sarcasm' | |
else: | |
# No matching sentiment found
data.loc[index, 'predicted_category'] = 'Unknown' | |
data.loc[index, 'detected_by'] = 'unknown' | |
return data | |
def preprocess_text(text): | |
text = text.lower() | |
text = re.sub(r'\W+', ' ', text) # Remove non-alphanumeric characters | |
words = text.split() | |
words = [word for word in words if word not in stop_words] | |
return words | |
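# Illustrative example (not executed): lowercases, strips non-alphanumerics, and drops
# Indonesian stopwords, returning a token list, e.g.
#   preprocess_text("Kami sangat mendukung!") -> ["mendukung"]   # assuming "kami"/"sangat" are in the NLTK stopword list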
def display_word_frequencies(words, num_words): | |
st.subheader(f"Top {num_words} Words") | |
for word, freq in words: | |
st.write(f"{word}: {freq}") | |
def update_sentiment(index, new_sentimen): | |
df.loc[index, 'sentimen'] = new_sentimen | |
st.write(f"Updated sentiment for comment at index {index} to {new_sentimen}") | |
# Load the `Corrected Comments` data from an Excel file
def load_corrected_comments_from_excel(file_path): | |
try: | |
return pd.read_excel(file_path, sheet_name='Corrected Comments') | |
except FileNotFoundError: | |
return pd.DataFrame(columns=['model_data', 'Platform', 'komentar', 'tanggal', 'link', 'sentimen', 'Cluster_Name', 'corrected_by', 'tanggal_masuk']) | |
except Exception as e: | |
print(f"Error loading corrected comments: {e}") | |
return pd.DataFrame(columns=['model_data', 'Platform', 'komentar', 'tanggal', 'link', 'sentimen', 'Cluster_Name', 'corrected_by', 'tanggal_masuk']) | |
# Save the `Corrected Comments` data to an Excel file
def save_corrected_comments_to_excel(data, file_path): | |
with pd.ExcelWriter(file_path, engine='xlsxwriter') as writer: | |
data.to_excel(writer, sheet_name='Corrected Comments', index=False) | |
# Excel file location for the `Corrected Comments` data
corrected_comments_file = 'corrected_comments.xlsx' | |
# Visualize how comments are distributed across clusters
def display_cluster_visualization(ba_lainnya_data): | |
st.subheader("Cluster Visualization") | |
# Count comments in the `Similar Sentiment` and `Special Cluster` categories
cluster_counts = ba_lainnya_data[ba_lainnya_data['Cluster_Name'].str.contains('Similar|Special Cluster')]['Cluster_Name'].value_counts() | |
# Build the bar chart
plt.figure(figsize=(10, 6)) | |
plt.bar(cluster_counts.index, cluster_counts.values, color=['blue', 'green', 'orange', 'red', 'purple']) | |
plt.xlabel('Cluster Name') | |
plt.ylabel('Number of Comments') | |
plt.title('Distribution of Comments in Similar Sentiment and Special Cluster') | |
plt.xticks(rotation=45) | |
plt.show() | |
st.pyplot(plt) | |
def run_clustering_for_ba_lainnya(): | |
st.title("Clustering for 'BA Lainnya'") | |
if 'data_with_ba' not in st.session_state: | |
st.error("No 'BA Lainnya' data found from the classification model. Please classify comments first.") | |
st.stop() | |
ba_lainnya_data = st.session_state['data_with_ba'] | |
st.write(f"**'BA Lainnya' Data:** {len(ba_lainnya_data)} rows") | |
with open('keywords.json', 'r') as f: | |
keyword_dict = json.load(f) | |
selected_candidate = st.session_state['candidate'] | |
candidate_keywords = keyword_dict.get(selected_candidate.replace(' ', '_')) | |
if candidate_keywords is None: | |
st.error("Keywords for the selected candidate not found.") | |
st.stop() | |
sarcasm_detector = pipeline('sentiment-analysis', model='unitary/toxic-bert') | |
dataset_path = f"datasetntbnew_{selected_candidate.lower().replace(' ', '_')}.xlsx" | |
corrected_comments_file = f"corrected_comments_{selected_candidate.lower().replace(' ', '_')}.xlsx" | |
try: | |
train_data = pd.read_excel(dataset_path) | |
labeled_data = train_data[train_data['sentimen'].isin(['Co Likes', 'Co Support', 'Co Optimism', 'Co Negative', 'Co Sarkastic'])] | |
st.write(f"**Labeled Data from Training Dataset:** {len(labeled_data)} rows") | |
ba_lainnya_data['Cluster'] = None | |
ba_lainnya_data['detected_by'] = None | |
ba_lainnya_data['Cluster_Name'] = None | |
ba_lainnya_data['corrected_by'] = "Not Corrected" | |
ba_lainnya_data['Sentiment'] = None  # Initialize to None so no wrong default value slips in
ba_lainnya_data['Cluster_Name'] = ba_lainnya_data['Cluster_Name'].astype(str) | |
ba_lainnya_data['corrected_by'] = ba_lainnya_data['corrected_by'].astype(str) | |
ba_lainnya_data['Sentiment'] = ba_lainnya_data['Sentiment'].astype(str) | |
for data in [ba_lainnya_data, labeled_data]: | |
data['translated_emojis'] = data['komentar'].fillna('').astype(str).apply(translate_emojis) | |
data['normalized_unicode'] = data['translated_emojis'].apply(normalize_unicode) | |
data['reply_handled'] = data['normalized_unicode'].apply(handle_replies) | |
data['clean_text'] = data['reply_handled'].apply(clean_text) | |
data['translated_ntb'] = data['clean_text'].apply(lambda x: translate_text(x, {})) | |
data['translated_slang'] = data['translated_ntb'].apply(lambda x: translate_text(x, {})) | |
data['negation_handled'] = data['translated_slang'].apply(handle_negation) | |
data['negation_handled'] = data['negation_handled'].fillna('') | |
combined_data = ba_lainnya_data.copy() | |
combined_data['Label'] = 'BA Lainnya' | |
for sentimen in ['Co Likes', 'Co Support', 'Co Optimism', 'Co Negative', 'Co Sarkastic']: | |
sentimen_data = labeled_data[labeled_data['sentimen'] == sentimen].copy() | |
sentimen_data['Label'] = sentimen | |
combined = pd.concat([combined_data, sentimen_data], ignore_index=True) | |
if len(combined) < 2: | |
st.warning(f"Not enough samples to cluster for {sentimen}.") | |
continue | |
vectorizer = TfidfVectorizer(ngram_range=(1, 1), max_features=5000) | |
tfidf_matrix = vectorizer.fit_transform(combined['negation_handled']) | |
st.write(f"Clustering 'BA Lainnya' comments similar to {sentimen}...") | |
kmeans = KMeans(n_clusters=2, random_state=42) | |
combined['Cluster'] = kmeans.fit_predict(tfidf_matrix) | |
valid_indices = combined.index[:len(ba_lainnya_data)] | |
valid_indices = valid_indices.intersection(ba_lainnya_data.index) | |
ba_lainnya_data.loc[valid_indices, 'Cluster'] = combined.loc[valid_indices, 'Cluster'] | |
ba_lainnya_data.loc[ba_lainnya_data['Cluster'] == 0, 'Cluster_Name'] = f"{sentimen} Similar" | |
ba_lainnya_data.loc[ba_lainnya_data['Cluster'] == 1, 'Cluster_Name'] = f"{sentimen} Dissimilar" | |
ba_lainnya_data.loc[valid_indices, 'Sentiment'] = sentimen | |
for index, row in ba_lainnya_data.iterrows(): | |
if row['Cluster_Name'].endswith('Dissimilar') or row['Cluster_Name'] == 'None': | |
dissimilar_comment = ba_lainnya_data.loc[[index]].copy() | |
for sentimen in ['Co Likes', 'Co Support', 'Co Optimism', 'Co Negative', 'Co Sarkastic']: | |
sentimen_data = labeled_data[labeled_data['sentimen'] == sentimen].copy() | |
combined = pd.concat([dissimilar_comment, sentimen_data], ignore_index=True) | |
if len(combined) < 2: | |
continue | |
tfidf_matrix = vectorizer.fit_transform(combined['negation_handled']) | |
if tfidf_matrix.shape[0] == 0: | |
continue | |
kmeans = KMeans(n_clusters=2, random_state=42) | |
combined['Cluster'] = kmeans.fit_predict(tfidf_matrix) | |
if len(combined) > 0 and combined.loc[0, 'Cluster'] == 0: | |
ba_lainnya_data.loc[index, 'Cluster_Name'] = f"{sentimen} Similar" | |
ba_lainnya_data.loc[index, 'Sentiment'] = sentimen | |
break | |
else: | |
ba_lainnya_data.loc[index, 'Cluster_Name'] = 'Special Cluster' | |
ba_lainnya_data.loc[index, 'corrected_by'] = 'Special Cluster' | |
ba_lainnya_data.loc[index, 'Sentiment'] = 'Special Sentiment' | |
ba_lainnya_data['Cluster_Name'] = ba_lainnya_data['Cluster_Name'].apply(lambda x: 'Special Cluster' if x == 'nan' else x) | |
special_cluster_data = ba_lainnya_data[ba_lainnya_data['Cluster_Name'] == 'Special Cluster'] | |
if not special_cluster_data.empty: | |
special_cluster_data = handle_special_cluster(special_cluster_data, candidate_keywords, sarcasm_detector) | |
ba_lainnya_data.update(special_cluster_data) | |
ba_lainnya_data.loc[special_cluster_data.index, 'corrected_by'] = 'Special Cluster' | |
ba_lainnya_data.loc[special_cluster_data.index, 'Sentiment'] = 'Special Sentiment' | |
st.warning("Some comments were not captured by the current keywords. Please add new keywords in the 'Update Keywords' section.") | |
st.subheader("Detection Distribution in Special Cluster") | |
detection_counts = special_cluster_data['detected_by'].value_counts() | |
plt.figure(figsize=(10, 6)) | |
plt.bar(detection_counts.index, detection_counts.values, color=['blue', 'orange', 'red']) | |
plt.xlabel('Detection Method') | |
plt.ylabel('Number of Comments') | |
plt.title('Detection Distribution in Special Cluster') | |
plt.show() | |
st.pyplot(plt) | |
st.write("Top Keywords in Special Cluster") | |
for sentiment, keywords in candidate_keywords.items(): | |
st.write(f"{sentiment}: {', '.join(keywords)}") | |
st.subheader("Special Cluster Details") | |
st.dataframe(special_cluster_data[['komentar', 'Cluster_Name', 'detected_by']]) | |
corrected_comments = load_corrected_comments_from_excel(corrected_comments_file) | |
display_cluster_visualization(ba_lainnya_data) | |
st.subheader("Edit Sentiment in Table") | |
# Flag whether each comment is already in corrected_comments
ba_lainnya_data['Already Corrected'] = ba_lainnya_data['komentar'].isin(corrected_comments['komentar']) | |
ba_lainnya_data['Corrected Sentiment'] = ba_lainnya_data.apply( | |
lambda row: corrected_comments[corrected_comments['komentar'] == row['komentar']]['sentimen'].values[0] | |
if row['Already Corrected'] else "", axis=1 | |
) | |
# Add a checkbox column for selecting comments to edit
ba_lainnya_data['Select'] = False | |
# Dropdown to pick a cluster (bulk selection)
selected_cluster = st.selectbox("Select a cluster to edit:", sorted(ba_lainnya_data['Cluster_Name'].unique())) | |
# If a cluster is chosen, auto-select every comment in that cluster
if selected_cluster: | |
ba_lainnya_data.loc[ba_lainnya_data['Cluster_Name'] == selected_cluster, 'Select'] = True | |
# Show the table with checkboxes and sentiment information
edited_data = st.data_editor( | |
ba_lainnya_data[['Select', 'komentar', 'Cluster_Name', 'Sentiment', 'Already Corrected', 'Corrected Sentiment']], | |
column_config={ | |
"Select": st.column_config.CheckboxColumn("✅ Select"), | |
"komentar": st.column_config.TextColumn("Komentar"), | |
"Cluster_Name": st.column_config.TextColumn("Cluster"), | |
"Sentiment": st.column_config.TextColumn("Sentimen"), | |
"Already Corrected": st.column_config.CheckboxColumn("✅ Sudah di Corrected?"), | |
"Corrected Sentiment": st.column_config.TextColumn("Sentimen Corrected"), | |
}, | |
disabled=["komentar", "Cluster_Name", "Already Corrected", "Corrected Sentiment"],  # Columns that cannot be edited
use_container_width=True, | |
) | |
# Dropdown to pick the new sentiment applied to the checked comments
new_sentimen = st.selectbox("Select new sentiment for selected comments:", | |
['Co Likes', 'Co Support', 'Co Optimism', 'Co Negative', 'Co Sarkastic']) | |
# When the button is pressed, update the sentiment of the checked comments
if st.button("Update Sentiments"): | |
selected_rows = edited_data[edited_data['Select']]  # Take the rows that were checked
if selected_rows.empty: | |
st.warning("No comments selected for update!") | |
else: | |
for index, row in selected_rows.iterrows(): | |
ba_lainnya_data.at[index, 'Sentiment'] = new_sentimen | |
ba_lainnya_data.at[index, 'corrected_by'] = 'Manual Edit' | |
if ba_lainnya_data.at[index, 'komentar'] in corrected_comments['komentar'].values: | |
corrected_comments.loc[corrected_comments['komentar'] == ba_lainnya_data.at[index, 'komentar'], 'sentimen'] = new_sentimen | |
else: | |
new_entry = ba_lainnya_data.loc[[index]].copy() | |
new_entry['sentimen'] = new_sentimen | |
corrected_comments = pd.concat([corrected_comments, new_entry]) | |
corrected_comments.drop_duplicates(subset=['komentar'], keep='last', inplace=True) | |
save_corrected_comments_to_excel(corrected_comments, corrected_comments_file) | |
st.success("Sentiments updated successfully!") | |
st.rerun() | |
st.subheader("Clustered Data") | |
selected_cluster = st.selectbox("Select a cluster to view comments:", sorted(ba_lainnya_data['Cluster_Name'].unique())) | |
cluster_comments = ba_lainnya_data[ba_lainnya_data['Cluster_Name'] == selected_cluster] | |
st.dataframe(cluster_comments[['komentar', 'Cluster_Name']].head(100)) | |
new_sentimen = st.selectbox("Select new sentiment for this cluster:", ['Co Likes', 'Co Support', 'Co Optimism', 'Co Negative', 'Co Sarkastic']) | |
if st.button("Update Sentiment for this cluster"): | |
ba_lainnya_data.loc[ba_lainnya_data['Cluster_Name'] == selected_cluster, 'corrected_by'] = 'Batch Cluster' | |
ba_lainnya_data.loc[ba_lainnya_data['Cluster_Name'] == selected_cluster, 'sentimen'] = new_sentimen | |
st.success(f"Sentiment for cluster {selected_cluster} updated to {new_sentimen}") | |
# Save and refresh Corrected Comments table and Cluster Visualization | |
corrected_comments = pd.concat([corrected_comments, ba_lainnya_data[ba_lainnya_data['corrected_by'] != "Not Corrected"]]) | |
corrected_comments.drop_duplicates(subset=['komentar'], keep='last', inplace=True) | |
save_corrected_comments_to_excel(corrected_comments, corrected_comments_file) | |
st.subheader("Corrected Comments") | |
st.dataframe(corrected_comments[['komentar', 'Cluster_Name', 'corrected_by', 'sentimen']].head(100)) | |
display_cluster_visualization(ba_lainnya_data) | |
st.subheader("Special Rules Based on Keywords") | |
keyword = st.text_input("Enter a keyword to set a rule:") | |
specific_cluster = st.selectbox("Select a cluster for this keyword:", sorted(ba_lainnya_data['Cluster_Name'].unique())) | |
if keyword: | |
new_cluster = st.selectbox("Select sentiment for this keyword:", ['Co Likes', 'Co Support', 'Co Optimism', 'Co Negative', 'Co Sarkastic']) | |
if st.button("Apply Rule"): | |
ba_lainnya_data.loc[ba_lainnya_data['komentar'].str.contains(keyword, case=False, na=False), 'Cluster_Name'] = new_cluster | |
ba_lainnya_data.loc[ba_lainnya_data['komentar'].str.contains(keyword, case=False, na=False), 'detected_by'] = specific_cluster | |
ba_lainnya_data.loc[ba_lainnya_data['komentar'].str.contains(keyword, case=False, na=False), 'corrected_by'] = 'Keyword Rule' | |
ba_lainnya_data.loc[ba_lainnya_data['komentar'].str.contains(keyword, case=False, na=False), 'sentimen'] = new_cluster | |
st.success(f"All comments containing '{keyword}' have been updated to '{new_cluster}' sentiment.") | |
# Update keywords.json file to avoid duplicates | |
if selected_candidate.replace(' ', '_') in keyword_dict: | |
if new_cluster in keyword_dict[selected_candidate.replace(' ', '_')]: | |
if keyword not in keyword_dict[selected_candidate.replace(' ', '_')][new_cluster]: | |
keyword_dict[selected_candidate.replace(' ', '_')][new_cluster].append(keyword) | |
else: | |
keyword_dict[selected_candidate.replace(' ', '_')][new_cluster] = [keyword] | |
else: | |
keyword_dict[selected_candidate.replace(' ', '_')] = {new_cluster: [keyword]} | |
with open('keywords.json', 'w') as f: | |
json.dump(keyword_dict, f) | |
st.success(f"Keyword '{keyword}' has been added to the keyword list.") | |
# Save and refresh Corrected Comments table and Cluster Visualization | |
corrected_comments = pd.concat([corrected_comments, ba_lainnya_data[ba_lainnya_data['corrected_by'] != "Not Corrected"]]) | |
corrected_comments.drop_duplicates(subset=['komentar'], keep='last', inplace=True) | |
save_corrected_comments_to_excel(corrected_comments, corrected_comments_file) | |
st.subheader("Corrected Comments") | |
st.dataframe(corrected_comments[['komentar', 'Cluster_Name', 'corrected_by', 'sentimen']].head(100)) | |
display_cluster_visualization(ba_lainnya_data) | |
st.subheader("Corrected Comments") | |
corrected_comments = load_corrected_comments_from_excel(corrected_comments_file) | |
st.dataframe(corrected_comments[['komentar', 'Cluster_Name', 'corrected_by', 'sentimen']].head(100)) | |
st.subheader("Visual Representation of Corrected Comments") | |
sentiment_counts = corrected_comments['sentimen'].value_counts() | |
plt.figure(figsize=(10, 6)) | |
plt.bar(sentiment_counts.index, sentiment_counts.values, color=['blue', 'green', 'orange', 'red', 'purple']) | |
plt.xlabel('Sentimen') | |
plt.ylabel('Number of Corrected Comments') | |
plt.title('Number of Corrected Comments by Sentiment') | |
plt.show() | |
st.pyplot(plt) | |
st.subheader("Download Options") | |
excel_buffer_cluster = io.BytesIO() | |
with pd.ExcelWriter(excel_buffer_cluster, engine='xlsxwriter') as writer: | |
ba_lainnya_data.to_excel(writer, index=False, sheet_name='Clustered Data') | |
excel_buffer_cluster.seek(0) | |
st.download_button( | |
label=f"Download Clustered Data for {selected_candidate}", | |
data=excel_buffer_cluster, | |
file_name=f"clustered_data_{selected_candidate}.xlsx", | |
mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" | |
) | |
st.subheader("Save Corrected Comments to Training Dataset") | |
if st.button("Save Corrected Comments"): | |
try: | |
# Save only the corrected results
corrected_comments['tanggal_masuk'] = pd.Timestamp.now().strftime('%Y-%m-%d') | |
update_training_dataset(corrected_comments, st.session_state['candidate']) | |
st.success("Corrected comments have been successfully saved to the training dataset.") | |
# Clear corrected comments from session state to avoid duplicate saves | |
st.session_state['data_with_ba'] = pd.DataFrame(columns=corrected_comments.columns) | |
# Rerun automatically after saving to refresh the state | |
st.rerun() | |
except Exception as e: | |
st.error(f"An error occurred while saving the corrected comments: {e}") | |
except FileNotFoundError: | |
st.error(f"No dataset found for {selected_candidate}. Please add data to create the dataset.") | |
except Exception as e: | |
st.error(f"An unexpected error occurred: {e}") | |
# === Sidebar Navigation === # | |
menu = st.sidebar.radio("Select a Feature", ["Model-Based Classification","Clustering for 'BA Lainnya'", "Update Keywords","View Training Dataset","Evaluate Data Train","Maximize Preprocessing","Retraining Model"]) | |
if menu == "Model-Based Classification": | |
st.title("Model-Based Classification") | |
candidate = st.selectbox("Choose a candidate:", candidate_list) | |
model_path = f"best_rf_model_{candidate.replace(' ', '_').lower()}.joblib" | |
vectorizer_path = f"tfidf_vectorizer_{candidate.replace(' ', '_').lower()}.joblib" | |
# Save the selected candidate to session state | |
st.session_state['candidate'] = candidate | |
uploaded_file = st.file_uploader("Upload an Excel file for classification", type=['xlsx']) | |
def load_model_and_vectorizer(model_path, vectorizer_path): | |
"""Load model and vectorizer, cache them for efficiency.""" | |
try: | |
model = joblib.load(model_path) | |
vectorizer = joblib.load(vectorizer_path) | |
return model, vectorizer | |
except FileNotFoundError: | |
return None, None | |
model, vectorizer = load_model_and_vectorizer(model_path, vectorizer_path) | |
if not model or not vectorizer: | |
st.error("Model or vectorizer not found for the selected candidate.") | |
st.stop() | |
# Save the vectorizer and model to session state | |
st.session_state['vectorizer'] = vectorizer | |
st.session_state['model'] = model | |
def preprocess_data(data): | |
"""Preprocess comments with batching for large datasets.""" | |
from joblib import Parallel, delayed | |
def preprocess_batch(batch): | |
batch['translated_emojis'] = batch['komentar'].apply(translate_emojis) | |
batch['normalized_unicode'] = batch['translated_emojis'].apply(normalize_unicode) | |
batch['reply_handled'] = batch['normalized_unicode'].apply(handle_replies) | |
batch['clean_text'] = batch['reply_handled'].apply(clean_text) | |
batch['translated_ntb'] = batch['clean_text'].apply(lambda x: translate_text(x, ntb_dict)) | |
batch['translated_slang'] = batch['translated_ntb'].apply(lambda x: translate_text(x, slang_dict)) | |
batch['negation_handled'] = batch['translated_slang'].apply(handle_negation) | |
return batch | |
batch_size = 10000 # Process 10,000 rows at a time | |
batches = [data.iloc[i:i+batch_size] for i in range(0, len(data), batch_size)] | |
processed_batches = Parallel(n_jobs=-1)(delayed(preprocess_batch)(batch) for batch in batches) | |
return pd.concat(processed_batches, ignore_index=True) | |
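# The batching above is a simple throughput optimization: the upload is split into
# chunks of batch_size rows (e.g. 25,000 rows -> batches of 10,000 / 10,000 / 5,000)
# and each chunk is preprocessed in parallel via joblib before being re-concatenated.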
# Process uploaded file | |
if uploaded_file: | |
try: | |
data = pd.read_excel(uploaded_file) | |
if 'komentar' not in data.columns: | |
st.error("The uploaded file must include a 'komentar' column.") | |
st.stop() | |
data = preprocess_data(data) | |
except Exception as e: | |
st.error(f"An error occurred while processing the file: {e}") | |
st.stop() | |
elif 'model_classified_data' in st.session_state: | |
data = st.session_state['model_classified_data'] | |
else: | |
st.info("Please upload a file for classification.") | |
st.stop() | |
# Transform comments into TF-IDF vectors | |
try: | |
tfidf_data = vectorizer.transform(data['negation_handled'].fillna('')) | |
data['predicted_category'] = model.predict(tfidf_data) | |
data['probabilities'] = model.predict_proba(tfidf_data).tolist() | |
data['max_probability'] = data['probabilities'].apply(lambda x: max(x)) | |
except Exception as e: | |
st.error(f"An error occurred during model prediction: {e}") | |
st.stop() | |
# Cache classified data | |
st.session_state['model_classified_data'] = data | |
# Interactive threshold adjustment | |
st.subheader("Set Threshold for 'BA Lainnya'") | |
threshold = st.slider("Threshold for tagging 'BA Lainnya'", min_value=0.0, max_value=1.0, value=0.80, step=0.01) | |
# Apply threshold to tag "BA Lainnya" | |
data['tag'] = data['max_probability'].apply(lambda x: 'BA Lainnya' if x < threshold else '') | |
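# Example: with the default threshold of 0.80, a comment whose highest class probability
# is 0.65 is tagged 'BA Lainnya' (low confidence), while one at 0.92 keeps its predicted label.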
# Separate data for visualization | |
data_without_ba = data[data['tag'] != 'BA Lainnya'] | |
data_with_ba = data[data['tag'] == 'BA Lainnya'] | |
# Save updated results to session state for dynamic updates | |
st.session_state['data_without_ba'] = data_without_ba | |
st.session_state['data_with_ba'] = data_with_ba | |
# Preview Results | |
st.subheader("Preview Results") | |
st.write("### 1. Hasil Klasifikasi Tanpa Tag 'BA Lainnya'") | |
if not data_without_ba.empty: | |
st.dataframe(data_without_ba[['komentar', 'predicted_category', 'max_probability']]) | |
else: | |
st.info("No high-probability classifications available.") | |
st.write("### 2. Hasil Klasifikasi Dengan Tag 'BA Lainnya'") | |
if not data_with_ba.empty: | |
st.dataframe(data_with_ba[['komentar', 'predicted_category', 'max_probability']]) | |
else: | |
st.info("No low-probability classifications available.") | |
# Visualization: Sentiment Distribution | |
st.subheader("Sentiment Distribution Visualization") | |
def plot_distribution(data, title): | |
sentiment_counts = data['predicted_category'].value_counts() | |
fig, ax = plt.subplots() | |
ax.bar(sentiment_counts.index, sentiment_counts.values) | |
ax.set_title(title) | |
ax.set_xlabel("Sentiments") | |
ax.set_ylabel("Count") | |
st.pyplot(fig) | |
if not data_without_ba.empty: | |
plot_distribution(data_without_ba, "Sentiment Distribution (Without 'BA Lainnya')") | |
if not data_with_ba.empty: | |
plot_distribution(data_with_ba, "Sentiment Distribution (With 'BA Lainnya')") | |
# Download Results | |
st.subheader("Download Results") | |
excel_buffer = io.BytesIO() | |
with pd.ExcelWriter(excel_buffer, engine='xlsxwriter') as writer: | |
data.to_excel(writer, index=False, sheet_name='Classification Results') | |
excel_buffer.seek(0) | |
st.download_button( | |
label="Download All Classification Results", | |
data=excel_buffer, | |
file_name=f"classification_results_{candidate}.xlsx", | |
mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" | |
) | |
# Save Non-'BA Lainnya' Data to Training Dataset | |
st.subheader("Save Classified Data") | |
if not data_without_ba.empty: | |
st.info("Only Non-'BA Lainnya' comments will be saved to the training dataset.") | |
if st.button("Save Non-'BA Lainnya' Data to Training Dataset"): | |
try: | |
data_to_save = data_without_ba[['model_data', 'Platform', 'komentar', 'link', 'kandidat', 'tanggal']].copy() | |
data_to_save['sentimen'] = data_without_ba['predicted_category'] | |
data_to_save['tanggal_masuk'] = pd.Timestamp.now().strftime('%Y-%m-%d') | |
update_training_dataset(data_to_save, candidate) | |
st.success("Data successfully saved to the training dataset.") | |
except Exception as e: | |
st.error(f"An error occurred while saving the data: {e}") | |
else: | |
st.info("No Non-'BA Lainnya' data available to save.") | |
pass # Placeholder | |
# Hook the clustering workflow into the "Clustering for 'BA Lainnya'" page
if menu == "Clustering for 'BA Lainnya'": | |
selected_candidate = st.session_state.get('candidate')  # the candidate chosen on the "Model-Based Classification" page
run_clustering_for_ba_lainnya() | |
pass # Placeholder | |
# Only run this block when the selected menu is "View Training Dataset"
if menu == "View Training Dataset": | |
st.title("View Training Dataset") | |
# Candidate selection header
st.header("Options") | |
selected_candidate = st.selectbox("Choose a candidate:", list(candidate_list), key='candidate_select_view') | |
# Dataset and history file paths
dataset_path = f"datasetntbnew_{selected_candidate.lower().replace(' ', '_')}.xlsx" | |
history_path = f"history_{selected_candidate.lower().replace(' ', '_')}.json" | |
# Load the dataset
try: | |
df = pd.read_excel(dataset_path) | |
# Make sure the required columns exist
required_columns = ['model_data', 'Platform', 'komentar', 'sentimen', 'tanggal', 'tanggal_masuk', 'link', 'evaluated_by_data_train'] | |
for col in required_columns:
    if col not in df.columns:
        if col == 'evaluated_by_data_train':
            df[col] = False
        else:
            df[col] = None
# Add the 'update_ba' column if it does not exist yet
if 'update_ba' not in df.columns: | |
df['update_ba'] = None | |
# Add the 'missing_comment' column if it does not exist yet
if 'missing_comment' not in df.columns: | |
df['missing_comment'] = False | |
# Make sure 'tanggal_masuk' and 'tanggal' are in proper datetime format
df['tanggal_masuk'] = pd.to_datetime(df['tanggal_masuk'], errors='coerce') | |
df['tanggal'] = pd.to_datetime(df['tanggal'], errors='coerce') | |
# Keep only the date part of the date columns, dropping the time component
df['tanggal'] = df['tanggal'].dt.date | |
df['tanggal_masuk'] = df['tanggal_masuk'].dt.date | |
df['update_ba'] = pd.to_datetime(df['update_ba'], errors='coerce').dt.date | |
# Handle any NaT (Not a Time) values
if df['tanggal_masuk'].isnull().any(): | |
st.warning("Some dates 'tanggal_masuk' could not be parsed correctly. Please check the date format in the dataset.") | |
df['tanggal_masuk'] = df['tanggal_masuk'].fillna(pd.Timestamp.now().date())
if df['tanggal'].isnull().any(): | |
st.warning("Some dates 'tanggal' could not be parsed correctly. Please check the date format in the dataset.") | |
df['tanggal'] = df['tanggal'].fillna(pd.Timestamp.now().date())
# Add the 'kandidat' column if it does not exist yet and fill it
if 'kandidat' not in df.columns: | |
df['kandidat'] = selected_candidate | |
# Keep only the required subset of columns
df = df[required_columns + ['update_ba', 'kandidat', 'missing_comment']] | |
# Fill missing update_ba values with tanggal_masuk
df['update_ba'] = df['update_ba'].fillna(df['tanggal_masuk'])
# Show basic statistics
st.subheader(f"Training Dataset for {selected_candidate}") | |
st.write(f"**Total rows in dataset:** {len(df)}") | |
if not df.empty: | |
# Visualize the distribution of BA updates
st.subheader("Visualisasi Postingan Berdasarkan Update BA") | |
ba_update_counts = df['update_ba'].value_counts().sort_index() | |
fig, ax = plt.subplots(figsize=(10, 6)) | |
ba_update_counts.plot(kind='bar', ax=ax, color='blue') | |
ax.set_title('Sebaran Postingan Berdasarkan Update BA') | |
ax.set_xlabel('Tanggal Update BA') | |
ax.set_ylabel('Jumlah Postingan') | |
plt.xticks(rotation=45) | |
plt.tight_layout() | |
st.pyplot(fig) | |
# Additional visualization: platform distribution
st.subheader("Sebaran Platform Berdasarkan Update BA") | |
platform_counts = df['Platform'].value_counts() | |
fig, ax = plt.subplots(figsize=(10, 6)) | |
platform_counts.plot(kind='bar', ax=ax, color='green') | |
ax.set_title('Sebaran Platform Berdasarkan Update BA') | |
ax.set_xlabel('Platform') | |
ax.set_ylabel('Jumlah Postingan') | |
plt.xticks(rotation=45) | |
plt.tight_layout() | |
st.pyplot(fig) | |
# Visualize the number of missing comments per platform
st.subheader("Jumlah Komentar Hilang Berdasarkan Platform") | |
missing_comments_by_platform = df.groupby('Platform')['missing_comment'].sum().sort_index() | |
fig, ax = plt.subplots(figsize=(10, 6)) | |
missing_comments_by_platform.plot(kind='bar', ax=ax, color='red') | |
ax.set_title('Jumlah Komentar Hilang Berdasarkan Platform') | |
ax.set_xlabel('Platform') | |
ax.set_ylabel('Jumlah Komentar Hilang') | |
plt.xticks(rotation=45) | |
plt.tight_layout() | |
st.pyplot(fig) | |
# Filter by validation status
st.subheader("Filter Data") | |
validation_filter = st.radio( | |
"Choose data type to view:", | |
["All Data", "Validated Data", "Non-Validated Data"], | |
key='validation_filter' | |
) | |
if validation_filter == "Validated Data": | |
filtered_data = df[df['evaluated_by_data_train'] == True] | |
elif validation_filter == "Non-Validated Data": | |
filtered_data = df[df['evaluated_by_data_train'] == False] | |
else: | |
filtered_data = df | |
if not filtered_data.empty: | |
st.subheader(f"Filtered Data: {validation_filter}") | |
st.dataframe(filtered_data)  # Show every row that matches the filter
else: | |
st.warning("Tidak ada data yang sesuai dengan filter yang dipilih.") | |
# Show the history of data additions
st.subheader("History of Data Additions") | |
try: | |
with open(history_path, "r") as f: | |
history = json.load(f) | |
history_list = [] | |
for key, value in history.items(): | |
for entry in value: | |
for k, v in entry.items(): | |
history_list.append({ | |
'key': key, | |
'field': k, | |
'date': v | |
}) | |
history_df = pd.DataFrame(history_list) | |
st.dataframe(history_df) | |
except FileNotFoundError: | |
st.write("No addition history available.") | |
except ValueError as e: | |
st.error(f"An error occurred while loading history data: {e}") | |
# Option to download the filtered dataset
st.subheader("Download Options") | |
if not filtered_data.empty: | |
excel_buffer = io.BytesIO() | |
with pd.ExcelWriter(excel_buffer, engine='xlsxwriter') as writer: | |
filtered_data.to_excel(writer, index=False, sheet_name='Filtered Dataset') | |
excel_buffer.seek(0) | |
st.download_button( | |
label=f"Download Filtered Dataset for {selected_candidate}", | |
data=excel_buffer, | |
file_name=f"filtered_training_dataset_{selected_candidate}.xlsx", | |
mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" | |
) | |
# Option to download the full dataset
if not df.empty: | |
excel_buffer_full = io.BytesIO() | |
with pd.ExcelWriter(excel_buffer_full, engine='xlsxwriter') as writer: | |
df.to_excel(writer, index=False, sheet_name='Training Dataset') | |
excel_buffer_full.seek(0) | |
st.download_button( | |
label=f"Download Full Training Dataset for {selected_candidate}", | |
data=excel_buffer_full, | |
file_name=f"training_dataset_{selected_candidate}.xlsx", | |
mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" | |
) | |
# Download the dataset aggregated by a date filter
st.subheader("Download Aggregated Data by Date") | |
# Use the min and max dates as defaults for the date_input widgets
if not df['tanggal'].empty: | |
min_date = df['tanggal'].min() | |
max_date = df['tanggal'].max() | |
selected_start_date = st.date_input("Select start date for aggregation:", value=min_date) | |
selected_end_date = st.date_input("Select end date for aggregation:", value=max_date) | |
agg_filtered_data = df[(df['tanggal'] >= selected_start_date) & (df['tanggal'] <= selected_end_date)].copy()  # copy so the columns added below do not trigger SettingWithCopyWarning
if not agg_filtered_data.empty: | |
# Make sure the 'kandidat' column exists and is filled
agg_filtered_data['kandidat'] = selected_candidate | |
aggregated_data = agg_filtered_data.groupby(['kandidat', 'link', 'tanggal', 'Platform', 'model_data', 'update_ba']).agg( | |
co_likes=('sentimen', lambda x: (x == 'Co Likes').sum()), | |
co_support=('sentimen', lambda x: (x == 'Co Support').sum()), | |
co_optimism=('sentimen', lambda x: (x == 'Co Optimism').sum()), | |
co_sarkastic=('sentimen', lambda x: (x == 'Co Sarkastic').sum()), | |
co_negative=('sentimen', lambda x: (x == 'Co Negative').sum()), | |
missing_comments=('missing_comment', 'sum')  # Also aggregate the missing_comment flags
).reset_index() | |
# Add derived columns for the comment counts
aggregated_data['jumlah_komentar_positif'] = aggregated_data['co_likes'] + aggregated_data['co_support'] + aggregated_data['co_optimism'] | |
aggregated_data['jumlah_komentar_negatif'] = aggregated_data['co_sarkastic'] + aggregated_data['co_negative'] | |
aggregated_data['jumlah_komentar'] = aggregated_data[['co_likes', 'co_support', 'co_optimism', 'co_sarkastic', 'co_negative']].sum(axis=1) | |
st.dataframe(aggregated_data) | |
# Visualize BA-updated posts by platform within the selected date range
st.subheader("Visualisasi Postingan yang Diupdate BA Berdasarkan Rentang Tanggal") | |
ba_update_range = aggregated_data[aggregated_data['update_ba'] != 'Belum diupdate'] | |
if not ba_update_range.empty: | |
plt.figure(figsize=(10, 6)) | |
ba_update_range['Platform'].value_counts().plot(kind='bar', title='Sebaran Platform - Diupdate BA (Rentang Tanggal)') | |
plt.xlabel('Platform') | |
plt.ylabel('Jumlah Postingan') | |
st.pyplot(plt) | |
# Additional visualization: post distribution by date
st.subheader("Sebaran Postingan Berdasarkan Tanggal") | |
plt.figure(figsize=(10, 6)) | |
ba_update_range['tanggal'].value_counts().sort_index().plot(kind='bar', title='Sebaran Postingan Berdasarkan Tanggal') | |
plt.xlabel('Tanggal') | |
plt.ylabel('Jumlah Postingan') | |
plt.xticks(rotation=45) | |
plt.tight_layout() | |
st.pyplot(plt) | |
# Additional visualization: BA update distribution
st.subheader("Sebaran Update BA") | |
plt.figure(figsize=(10, 6)) | |
ba_update_range['update_ba'].value_counts().sort_index().plot(kind='bar', title='Sebaran Update BA') | |
plt.xlabel('Tanggal Update BA') | |
plt.ylabel('Jumlah Postingan') | |
plt.xticks(rotation=45) | |
plt.tight_layout() | |
st.pyplot(plt) | |
# Visualize the number of comments by tanggal_masuk
st.subheader("Jumlah Komentar Berdasarkan Tanggal Masuk") | |
plt.figure(figsize=(10, 6)) | |
agg_filtered_data.groupby('tanggal_masuk')['komentar'].count().sort_index().plot(kind='bar', title='Jumlah Komentar Berdasarkan Tanggal Masuk') | |
plt.xlabel('Tanggal Masuk') | |
plt.ylabel('Jumlah Komentar') | |
plt.xticks(rotation=45) | |
plt.tight_layout() | |
st.pyplot(plt) | |
# Visualize the comment distribution per platform
st.subheader("Sebaran Komentar di Tiap Platform") | |
plt.figure(figsize=(10, 6)) | |
agg_filtered_data['Platform'].value_counts().plot(kind='bar', title='Sebaran Komentar di Tiap Platform') | |
plt.xlabel('Platform') | |
plt.ylabel('Jumlah Komentar') | |
plt.xticks(rotation=45) | |
plt.tight_layout() | |
st.pyplot(plt) | |
# Visualize the number of missing comments per post
st.subheader("Jumlah Komentar Hilang Berdasarkan Postingan") | |
plt.figure(figsize=(10, 6)) | |
aggregated_data.groupby('link')['missing_comments'].sum().sort_index().plot(kind='bar', title='Jumlah Komentar Hilang Berdasarkan Postingan') | |
plt.xlabel('Link') | |
plt.ylabel('Jumlah Komentar Hilang') | |
plt.xticks(rotation=45) | |
plt.tight_layout() | |
st.pyplot(plt) | |
else: | |
st.warning("Tidak ada data yang diupdate BA untuk rentang tanggal yang dipilih.") | |
# Option to download the aggregated dataset
excel_buffer_aggregated = io.BytesIO() | |
with pd.ExcelWriter(excel_buffer_aggregated, engine='xlsxwriter') as writer: | |
aggregated_data.to_excel(writer, index=False, sheet_name='Aggregated Data') | |
excel_buffer_aggregated.seek(0) | |
st.download_button( | |
label=f"Download Aggregated Data by Date for {selected_candidate}", | |
data=excel_buffer_aggregated, | |
file_name=f"aggregated_data_{selected_candidate}.xlsx", | |
mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" | |
) | |
else: | |
st.warning("Tidak ada data yang sesuai dengan rentang tanggal yang dipilih untuk diagregasi.") | |
else: | |
st.warning("Tidak ada data dalam dataset untuk divisualisasikan.") | |
else: | |
st.warning("Tidak ada data dalam dataset untuk divisualisasikan.") | |
except FileNotFoundError: | |
st.error(f"No training dataset found for {selected_candidate}. Please add data to create the dataset.") | |
except Exception as e: | |
st.error(f"An error occurred: {e}") | |
pass | |
if menu == "Evaluate Data Train": | |
st.title("Evaluate Data Train") | |
selected_candidate = st.selectbox("Choose a candidate:", list(candidate_list), key='candidate_select_evaluate') | |
dataset_path = f"datasetntbnew_{selected_candidate.lower().replace(' ', '_')}.xlsx" | |
try: | |
df = pd.read_excel(dataset_path) | |
# Load existing keyword dictionary | |
try: | |
with open('keywords.json', 'r') as f: | |
keyword_dict = json.load(f) | |
st.success("keywords.json loaded successfully.") | |
except FileNotFoundError: | |
st.error("keywords.json file not found. Please ensure the file is in the correct directory.") | |
st.stop() | |
except json.JSONDecodeError: | |
st.error("keywords.json file is not a valid JSON. Please check the file format.") | |
st.stop() | |
# Select candidate-specific keywords | |
candidate_key = selected_candidate.replace(' ', '_') | |
candidate_keywords = keyword_dict.get(candidate_key) | |
if not candidate_keywords: | |
st.error(f"No keywords found for the selected candidate '{selected_candidate}'. Please update the 'keywords.json' file with appropriate keywords.") | |
st.stop() | |
keywords = sorted({kw for sentiment_keywords in candidate_keywords.values() for kw in sentiment_keywords})  # de-duplicate so TfidfVectorizer(vocabulary=...) does not reject repeated terms
# Data consistency validation
st.subheader("Data Consistency Validation") | |
missing_values = df.isnull().sum() | |
st.write("Missing values in each column:") | |
st.write(missing_values) | |
# Drop rows with missing values
st.write("Removing rows with missing values...") | |
df.dropna(inplace=True) | |
# Sentiment distribution
st.subheader("Sentiment Distribution") | |
sentiment_counts = df['sentimen'].value_counts() | |
st.write("Number of comments for each sentiment:") | |
st.write(sentiment_counts) | |
# Data quality check
st.subheader("Data Quality Check") | |
invalid_entries = df[df['komentar'].str.len() == 0] | |
st.write(f"Number of invalid comments (empty): {len(invalid_entries)}") | |
if len(invalid_entries) > 0: | |
st.write("Invalid comments (empty):") | |
st.dataframe(invalid_entries.head(100))  # Preview the first 100 rows
# Detect inconsistent sentiment labels
st.subheader("Inconsistent Sentiment Labels") | |
duplicate_comments = df[df.duplicated(subset=['komentar'], keep=False)] | |
inconsistent_labels = duplicate_comments.groupby('komentar')['sentimen'].nunique() | |
inconsistent_labels = inconsistent_labels[inconsistent_labels > 1] | |
if not inconsistent_labels.empty: | |
inconsistent_labels_df = duplicate_comments[duplicate_comments['komentar'].isin(inconsistent_labels.index)] | |
st.write(f"Number of comments with inconsistent sentiment labels: {len(inconsistent_labels_df)}") | |
st.dataframe(inconsistent_labels_df.head(100))  # Preview the first 100 rows
else: | |
st.write("No comments with inconsistent sentiment labels found.") | |
# Problem handling
st.subheader("Problem Handling") | |
# Remove empty comments
st.write("Removing invalid (empty) comments...") | |
df = df[df['komentar'].str.len() > 0] | |
# Interactive: resolve comments with inconsistent sentiment labels
st.write("Resolving inconsistent sentiment labels...") | |
if not inconsistent_labels.empty: | |
for index, row in inconsistent_labels_df.iterrows(): | |
st.write(f"Comment: {row['komentar']}") | |
sentimen_options = df[df['komentar'] == row['komentar']]['sentimen'].unique().tolist() | |
new_sentimen = st.selectbox("Select correct sentiment", sentimen_options, key=f'sentimen_{index}') | |
if st.button("Update Sentiment", key=f'update_{index}'): | |
update_sentiment(index, new_sentimen) | |
# Clustering using keywords and the sarcasm model
st.write("Clustering comments using keywords and sarcasm model...") | |
keyword_vectorizer = TfidfVectorizer(vocabulary=keywords) | |
X_keywords = keyword_vectorizer.fit_transform(df['komentar']) | |
kmeans = KMeans(n_clusters=10, random_state=0).fit(X_keywords) | |
df['cluster'] = kmeans.labels_ | |
# Identify clusters that need review
review_clusters = df[df['cluster'].isin(df['cluster'].value_counts()[df['cluster'].value_counts() > 10].index)] | |
st.write("Clusters identified for review:") | |
st.dataframe(review_clusters.head(100))  # Preview the first 100 rows
# Prompt to add new keywords
st.warning("Some comments were not captured by the current keywords. Please add new keywords in the 'Update Keywords' section.") | |
# Cluster visualization
cluster_counts = df['cluster'].value_counts() | |
st.write("Number of comments in each cluster:") | |
st.write(cluster_counts) | |
# Show the clustering results
st.write("Comments clustered by patterns:") | |
st.dataframe(df.head(100))  # Preview the first 100 rows
# Export the analysis and handling results
st.subheader("Export Final Data") | |
json_buffer = io.BytesIO() | |
df.to_json(json_buffer, orient='records', lines=True) | |
json_buffer.seek(0) | |
st.download_button( | |
label=f"Download Final Data for {selected_candidate}", | |
data=json_buffer, | |
file_name=f"final_data_{selected_candidate}.json", | |
mime="application/json" | |
) | |
except FileNotFoundError: | |
st.error(f"No training dataset found for {selected_candidate}. Please add data to create the dataset.") | |
except Exception as e: | |
st.error(f"An error occurred: {e}") | |
pass # Placeholder | |
if menu == "Retraining Model": | |
st.title("Retrain Model") | |
selected_candidate = st.selectbox("Select a candidate to retrain the model:", list(candidate_list)) | |
dataset_path = f"datasetntbnew_{selected_candidate.lower().replace(' ', '_')}.xlsx" | |
model_path = f"best_rf_model_{selected_candidate.lower().replace(' ', '_')}.joblib" | |
vectorizer_path = f"tfidf_vectorizer_{selected_candidate.lower().replace(' ', '_')}.joblib" | |
retrain_history_path = f"retrain_history_{selected_candidate.lower().replace(' ', '_')}.json" | |
try: | |
# Load dataset | |
data = pd.read_excel(dataset_path) | |
# Ensure 'komentar' is string | |
if 'komentar' not in data.columns: | |
st.error("Dataset must include a 'komentar' column.") | |
st.stop() | |
else: | |
# Convert all comments to string | |
data['komentar'] = data['komentar'].fillna('').astype(str) | |
# Separate validated and unvalidated data | |
if 'evaluated_by_cluster' in data.columns: | |
validated_data = data[data['evaluated_by_cluster'] == True] | |
unvalidated_data = data[data['evaluated_by_cluster'] == False] | |
else: | |
validated_data = pd.DataFrame(columns=data.columns) | |
unvalidated_data = data | |
st.write(f"**Validated Data:** {len(validated_data)} rows") | |
st.write(f"**Unvalidated Data:** {len(unvalidated_data)} rows") | |
# Check if all data is validated | |
if len(unvalidated_data) > 0: | |
st.warning("Model retraining is only allowed if all data has been validated through 'Evaluate Clustering'. Please ensure all data is validated before retraining the model.") | |
st.stop() | |
# Combine all data for preprocessing | |
combined_data = validated_data # Only use validated data | |
# Preprocessing Function | |
def preprocess_data(data): | |
from joblib import Parallel, delayed | |
def preprocess_comment(comment): | |
comment = translate_emojis(comment) | |
comment = normalize_unicode(comment) | |
comment = handle_replies(comment) | |
comment = clean_text(comment) | |
comment = translate_text(comment, ntb_dict) | |
comment = translate_text(comment, slang_dict) | |
comment = handle_negation(comment) | |
return comment | |
data['processed_comments'] = Parallel(n_jobs=-1)( | |
delayed(preprocess_comment)(c) for c in data['komentar'] | |
) | |
return data | |
# Preprocessing | |
st.write("Starting preprocessing...") | |
combined_data = preprocess_data(combined_data) | |
if st.button("Retrain Model"): | |
# Vectorization | |
st.write("Vectorizing data...") | |
vectorizer = TfidfVectorizer(ngram_range=(1, 1), max_features=5000) | |
X = vectorizer.fit_transform(combined_data['processed_comments']) | |
y = combined_data['sentimen'] | |
# Split Data | |
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) | |
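# The split above is not stratified; if the sentiment labels are heavily imbalanced, a stratified | |
# split keeps label ratios comparable across train/test, e.g. (alternative sketch, not what runs here): | |
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y) | |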
# Handle Class Imbalance with SMOTE | |
st.write("Balancing data with SMOTE...") | |
smote = SMOTE(random_state=42, n_jobs=-1) | |
X_train_res, y_train_res = smote.fit_resample(X_train, y_train) | |
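# SMOTE is applied to the training split only, so the test set keeps its original class distribution. | |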
# Train Random Forest Model | |
st.write("Training Random Forest model...") | |
rf_model = RandomForestClassifier(n_estimators=200, max_depth=20, random_state=42) | |
rf_model.fit(X_train_res, y_train_res) | |
# Evaluate on Training Data | |
st.write("Evaluating model...") | |
y_pred_train = rf_model.predict(X_train) | |
accuracy_train = accuracy_score(y_train, y_pred_train) | |
report_train = classification_report(y_train, y_pred_train, output_dict=True) | |
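# Training metrics are computed on the original (pre-SMOTE) training split, not the resampled data. | |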
# Evaluate on Test Data | |
y_pred_test = rf_model.predict(X_test) | |
accuracy_test = accuracy_score(y_test, y_pred_test) | |
report_test = classification_report(y_test, y_pred_test, output_dict=True) | |
# Save Model and Vectorizer | |
st.write("Saving model and vectorizer...") | |
joblib.dump(rf_model, model_path) | |
joblib.dump(vectorizer, vectorizer_path) | |
# Log Retraining History | |
st.write("Logging retraining history...") | |
try: | |
with open(retrain_history_path, "r") as f: | |
retrain_history = json.load(f) | |
except (FileNotFoundError, json.JSONDecodeError): | |
# Start a fresh history if the file is missing or unreadable | |
retrain_history = [] | |
retrain_history.append({ | |
"date_retrained": pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S'), | |
"accuracy_on_train": accuracy_train, | |
"accuracy_on_test": accuracy_test, | |
"f1_score_on_train": report_train['weighted avg']['f1-score'], | |
"f1_score_on_test": report_test['weighted avg']['f1-score'], | |
}) | |
with open(retrain_history_path, "w") as f: | |
json.dump(retrain_history, f, indent=4) | |
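# Each history entry is a flat record, e.g. (illustrative values only): | |
# {"date_retrained": "2025-01-01 12:00:00", "accuracy_on_train": 0.98, | |
#  "accuracy_on_test": 0.85, "f1_score_on_train": 0.98, "f1_score_on_test": 0.84} | |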
# Display Results | |
st.success(f"Model retrained successfully! Accuracy on training data: {accuracy_train:.4f}, Accuracy on test data: {accuracy_test:.4f}") | |
st.subheader("Model Metrics on Training Data") | |
st.table(pd.DataFrame(report_train).T) | |
st.subheader("Model Metrics on Test Data") | |
st.table(pd.DataFrame(report_test).T) | |
# Show Retrain History | |
st.subheader("Retrain History") | |
st.json(retrain_history) | |
except FileNotFoundError: | |
st.error(f"No training dataset found for {selected_candidate}. Please add data to create the dataset.") | |
except Exception as e: | |
st.error(f"An unexpected error occurred: {e}") | |
pass # Placeholder | |
if menu == "Maximize Preprocessing": | |
st.title("Maximize Preprocessing") | |
# Load Dataset Train | |
candidate = st.selectbox("Choose a candidate:", list(candidate_list)) | |
dataset_path = f"datasetntbnew_{candidate.lower().replace(' ', '_')}.xlsx" | |
try: | |
# Load dataset | |
data = pd.read_excel(dataset_path) | |
# Ensure 'komentar' is string | |
if 'komentar' not in data.columns: | |
st.error("Dataset must include a 'komentar' column.") | |
st.stop() | |
data['komentar'] = data['komentar'].fillna('').astype(str) | |
# Preprocessing Steps | |
def preprocess_data(data): | |
st.write("Starting preprocessing...") | |
data['translated_emojis'] = data['komentar'].apply(translate_emojis) | |
data['normalized_unicode'] = data['translated_emojis'].apply(normalize_unicode) | |
data['reply_handled'] = data['normalized_unicode'].apply(handle_replies) | |
data['clean_text'] = data['reply_handled'].apply(clean_text) | |
data['translated_ntb'] = data['clean_text'].apply(lambda x: translate_text(x, ntb_dict)) | |
data['translated_slang'] = data['translated_ntb'].apply(lambda x: translate_text(x, slang_dict)) | |
data['negation_handled'] = data['translated_slang'].apply(handle_negation) | |
return data | |
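# Each stage writes its own column, so intermediate results can be inspected alongside the final 'negation_handled' text. | |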
data = preprocess_data(data) | |
# Check Unmapped Words | |
st.subheader("Check for Unmapped Words") | |
all_words = (word.lower() for comment in data['negation_handled'] for word in comment.split()) | |
unique_words = set(all_words) | |
ntb_dict_keys = set(ntb_dict.keys()) | |
slang_dict_keys = set(slang_dict.keys()) | |
mapped_words = ntb_dict_keys.union(slang_dict_keys) | |
unmapped_words = sorted(unique_words - mapped_words) | |
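# "Unmapped" means the token appears in neither Kamus ntb nor Kamus Alay; these are candidates for the dictionary form below. | |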
if unmapped_words: | |
st.write(f"Found **{len(unmapped_words)} unmapped words.**") | |
# Choose how many words to display; guard the slider bounds when only a few words are found | |
max_display = max(len(unmapped_words), 20) | |
max_words = st.slider( | |
"Select number of words to display:", | |
min_value=10, | |
max_value=max_display, | |
value=min(50, max_display), | |
step=10, | |
) | |
# Build a DataFrame for tabular display | |
unmapped_df = pd.DataFrame(unmapped_words, columns=["Unmapped Words"]) | |
st.dataframe(unmapped_df.head(max_words)) | |
# Show how many of the unmapped words are currently displayed | |
st.caption(f"Showing {min(max_words, len(unmapped_words))} out of {len(unmapped_words)} unmapped words.") | |
else: | |
st.success("No unmapped words found!") | |
# Add Words to Dictionary | |
st.subheader("Add New Words to Dictionary") | |
new_word = st.text_input("Enter new word:") | |
normalized_word = st.text_input("Enter normalized form:") | |
dictionary_choice = st.radio("Select dictionary to update:", ["Kamus Alay", "Kamus ntb"]) | |
if st.button("Add to Dictionary"): | |
if new_word and normalized_word: | |
if dictionary_choice == "Kamus Alay": | |
slang_dict[new_word.lower()] = normalized_word | |
st.success(f"Added '{new_word}' -> '{normalized_word}' to Kamus Alay.") | |
elif dictionary_choice == "Kamus ntb": | |
ntb_dict[new_word.lower()] = normalized_word | |
st.success(f"Added '{new_word}' -> '{normalized_word}' to Kamus ntb.") | |
else: | |
st.warning("Please enter both the new word and its normalized form.") | |
# Save Updates to File | |
st.subheader("Save Updated Dictionaries") | |
if st.button("Save Kamus Alay"): | |
kamus_alay_path = 'kamusalay.csv' # Adjust the path as needed | |
pd.DataFrame(list(slang_dict.items()), columns=["slang", "formal"]).to_csv(kamus_alay_path, index=False) | |
st.success(f"Kamus Alay saved successfully to {kamus_alay_path}.") | |
if st.button("Save Kamus ntb"): | |
kamus_ntb_path = 'ntb_dict.json' # Adjust the path as needed | |
with open(kamus_ntb_path, 'w', encoding='utf-8') as f: | |
json.dump(ntb_dict, f, indent=4) | |
st.success(f"Kamus ntb saved successfully to {kamus_ntb_path}.") | |
except FileNotFoundError: | |
st.error(f"No training dataset found for {candidate}. Please ensure the dataset is available.") | |
except Exception as e: | |
st.error(f"An unexpected error occurred: {e}") | |
pass # Placeholder | |
if menu == "Update Keywords": | |
st.title("Update Keywords") | |
# Load existing keyword dictionary | |
with open('keywords.json', 'r', encoding='utf-8') as f: | |
keyword_dict = json.load(f) | |
# Show current keywords | |
st.subheader("Current Keywords") | |
candidate = st.selectbox("Select candidate", list(keyword_dict.keys())) | |
for sentiment, keywords in keyword_dict[candidate].items(): | |
st.write(f"{sentiment}: {', '.join(keywords)}") | |
# Add new keyword | |
st.subheader("Add New Keyword") | |
new_keyword = st.text_input("Enter new keyword") | |
selected_sentiment = st.selectbox("Select sentiment for new keyword", list(keyword_dict[candidate].keys())) | |
if st.button("Add Keyword"): | |
if new_keyword and selected_sentiment: | |
keyword_dict[candidate][selected_sentiment].append(new_keyword) | |
with open('keywords.json', 'w') as f: | |
json.dump(keyword_dict, f, indent=4) | |
st.success(f"Keyword '{new_keyword}' added to {selected_sentiment} for {candidate}") | |
else: | |
st.error("Please enter a keyword and select a sentiment") | |
# Analyze Special Cluster | |
st.subheader("Analyze Special Cluster") | |
if 'ba_lainnya_data' in st.session_state: | |
try: | |
# Load the `Special Cluster` data directly | |
special_cluster_data = st.session_state['ba_lainnya_data'][st.session_state['ba_lainnya_data']['Cluster_Name'] == 'Special Cluster'] | |
if special_cluster_data.empty: | |
st.warning("No data found in Special Cluster.") | |
else: | |
st.write(f"Total comments in Special Cluster: {len(special_cluster_data)}") | |
all_words_special = [] | |
for comment in special_cluster_data['negation_handled']: | |
comment = translate_emojis(comment) | |
comment = normalize_unicode(comment) | |
comment = handle_replies(comment) | |
comment = clean_text(comment) | |
comment = translate_text(comment, ntb_dict) # Mirror the main preprocessing pipeline | |
comment = translate_text(comment, slang_dict) | |
comment = handle_negation(comment) | |
words = preprocess_text(comment) | |
all_words_special.extend(words) | |
# Calculate word frequencies | |
word_freq_special = Counter(all_words_special) | |
# Add slider to select number of words to display | |
num_words_special = st.slider("Number of words to display (Special Cluster)", min_value=5, max_value=50, value=20) | |
most_common_words_special = word_freq_special.most_common(num_words_special) | |
# Display word frequencies as a table | |
st.subheader(f"Top {num_words_special} Word Frequencies in Special Cluster") | |
word_freq_df_special = pd.DataFrame(most_common_words_special, columns=['Word', 'Frequency']) | |
st.dataframe(word_freq_df_special) | |
except Exception as e: | |
st.error(f"An error occurred: {e}") | |
else: | |
st.warning("No 'BA Lainnya' data found. Please classify comments first.") | |
# Analyze Training Data | |
st.subheader("Analyze Training Data") | |
dataset_path = f"datasetntbnew_{candidate.lower().replace(' ', '_')}.xlsx" | |
try: | |
train_data = pd.read_excel(dataset_path) | |
if train_data.empty: | |
st.warning("Training dataset is empty.") | |
else: | |
all_words_train = [] | |
for comment in train_data['komentar'].astype(str): | |
comment = translate_emojis(comment) | |
comment = normalize_unicode(comment) | |
comment = handle_replies(comment) | |
comment = clean_text(comment) | |
comment = translate_text(comment, ntb_dict) # Mirror the main preprocessing pipeline | |
comment = translate_text(comment, slang_dict) | |
comment = handle_negation(comment) | |
words = preprocess_text(comment) | |
all_words_train.extend(words) | |
# Calculate word frequencies | |
word_freq_train = Counter(all_words_train) | |
# Add slider to select number of words to display | |
num_words_train = st.slider("Number of words to display (Training Data)", min_value=5, max_value=50, value=20) | |
most_common_words_train = word_freq_train.most_common(num_words_train) | |
# Display word frequencies as a table | |
st.subheader(f"Top {num_words_train} Word Frequencies in Training Data") | |
word_freq_df_train = pd.DataFrame(most_common_words_train, columns=['Word', 'Frequency']) | |
st.dataframe(word_freq_df_train) | |
except FileNotFoundError: | |
st.error(f"Training dataset for {candidate} not found.") | |
except Exception as e: | |
st.error(f"An error occurred: {e}") | |
# Option to export keywords | |
st.subheader("Export Keywords") | |
json_buffer = io.BytesIO() | |
json_buffer.write(json.dumps(keyword_dict).encode('utf-8')) | |
json_buffer.seek(0) | |
st.download_button( | |
label="Export Keywords", | |
data=json_buffer, | |
file_name="keywords.json", | |
mime="application/json" | |
) | |
# Option to import keywords | |
st.subheader("Import Keywords") | |
uploaded_file = st.file_uploader("Choose a JSON file", type="json") | |
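# Note: dict.update() below overwrites any candidate entry that also appears in the uploaded file. | |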
if uploaded_file is not None: | |
imported_keywords = json.load(uploaded_file) | |
keyword_dict.update(imported_keywords) | |
with open('keywords.json', 'w') as f: | |
json.dump(keyword_dict, f, indent=4) | |
st.success("Keywords imported successfully") | |
pass | |