import streamlit as st
import json
import random
import numpy as np
from gensim import corpora, models
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis
import pandas as pd
import streamlit.components.v1 as components
from MIND_utils import df_to_self_states_json, element_short_desc_map
# ---------------------------
# Streamlit App Layout
# ---------------------------
st.set_page_config(layout="wide")
st.title("Prototypical Self-States via Topic Modeling")
uploaded_file = st.file_uploader("Upload your own data file (CSV)", type="csv")
st.header("Model Parameters")
lda_document_is = st.radio("A 'Document' in the topic model will correspond to a:", ("self-state", "segment"))
num_topics = st.slider("Number of Topics", min_value=2, max_value=20, value=5)
num_passes = st.slider("Number of Passes", min_value=5, max_value=50, value=10)
seed_value = st.number_input("Random Seed", value=42)
st.subheader("Beta -- dispersion of words in a topic - lower means less words in each topic")
is_set_beta = st.checkbox("Set custom Beta (default: 1 / num_topics)? ")
if is_set_beta:
beta = st.number_input("Beta", min_value=0.0, max_value=1.0, value=1/num_topics, step=0.05, format="%.3f")
else:
beta = 1 / num_topics
st.subheader("Alpha -- dispersion of topics in a document - lower means less topics in each document")
is_set_alpha = st.checkbox("Set custom Alpha (default: dynamic per document)? ")
if is_set_alpha:
alpha = st.number_input("Alpha", min_value=0.0, max_value=1.0, value=1/num_topics, step=0.05, format="%.3f")
else:
alpha = "auto"
st.header("Display")
num_top_elements_to_show = st.slider("# top elements to show per topic", min_value=2, max_value=15, value=5)
show_long_elements = st.checkbox("Show full element name")
# ---------------------------
# Load Data
# ---------------------------
@st.cache_data
def load_data(csv):
    return pd.read_csv(csv)

df = load_data(uploaded_file or "clean_annotations_safe.csv")
# ---------------------------
# Preprocess Data: Build Documents
# ---------------------------
# Set random seeds for reproducibility
random.seed(seed_value)
np.random.seed(seed_value)
# Functions to extract "words" (elements -- <dim>:<category>) from a segment / self-state
def extract_elements_from_selfstate(selfstate):
    words = []
    for dim, dim_obj in selfstate.items():
        if dim == "is_adaptive":
            continue
        if "Category" in dim_obj and not pd.isna(dim_obj["Category"]):
            word = f"{dim}:{dim_obj['Category']}"
            words.append(word)
    return words
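# Illustrative example (hypothetical dimension/category names): a self-state such as
#   {"Affect": {"Category": "sadness"}, "Behavior": {"Category": "avoidance"}, "is_adaptive": False}
# would yield the "words" ["Affect:sadness", "Behavior:avoidance"].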
def extract_elements_from_segment(segment):
    words = []
    for selfstate in segment["self-states"]:
        words += extract_elements_from_selfstate(selfstate)
    return words
# Build a list of "documents" (one per segment)
lda_documents = []
lda_document_ids = []
for (doc_id, annotator), df_ in df.groupby(["document", "annotator"]):
    doc_json = df_to_self_states_json(df_, doc_id, annotator)
    ### * for Segment-level LDA-documents:
    if lda_document_is == "segment":
        for segment in doc_json["segments"]:
            lda_doc = extract_elements_from_segment(segment)
            if lda_doc:  # only add if non-empty
                lda_documents.append(lda_doc)
                lda_document_ids.append(f"{doc_id}_seg{segment['segment']}")
    ### * for SelfState-level LDA-documents:
    elif lda_document_is == "self-state":
        for segment in doc_json["segments"]:
            for i, selfstate in enumerate(segment["self-states"]):
                lda_doc = extract_elements_from_selfstate(selfstate)
                if lda_doc:
                    lda_documents.append(lda_doc)
                    lda_document_ids.append(f"{doc_id}_seg{segment['segment']}_state{i+1}")
# Create a dictionary and corpus for LDA
dictionary = corpora.Dictionary(lda_documents)
corpus = [dictionary.doc2bow(doc) for doc in lda_documents]
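# Sketch of the gensim structures built above (with the hypothetical element names from before):
#   dictionary.token2id -> {"Affect:sadness": 0, "Behavior:avoidance": 1, ...}
#   dictionary.doc2bow(["Affect:sadness", "Affect:sadness", "Behavior:avoidance"])
#       -> [(0, 2), (1, 1)]   # (token id, count) pairs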
# ---------------------------
# Run LDA Model
# ---------------------------
lda_model = models.LdaModel(corpus,
                            num_topics=num_topics,
                            id2word=dictionary,
                            passes=num_passes,
                            eta=beta,
                            alpha=alpha,
                            random_state=seed_value)
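# Note: gensim's `eta` is the topic-word prior (the "Beta" set above) and `alpha`
# is the document-topic prior; alpha="auto" makes the model learn an asymmetric
# prior from the corpus during training.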
# ---------------------------
# Display Pretty Printed Topics
# ---------------------------
st.header("Pretty Printed Topics")
# Build a mapping for each topic to the list of (document index, topic probability)
topic_docs = {topic_id: [] for topic_id in range(lda_model.num_topics)}
# Iterate over the corpus to get topic distributions for each document
for i, doc_bow in enumerate(corpus):
    # Get the full topic distribution (with minimum_probability=0 so every topic is included)
    doc_topics = lda_model.get_document_topics(doc_bow, minimum_probability=0)
    for topic_id, prob in doc_topics:
        topic_docs[topic_id].append((i, prob))
# For each topic, sort the documents by probability in descending order and keep the top 3
top_docs = {}
for topic_id, doc_list in topic_docs.items():
    sorted_docs = sorted(doc_list, key=lambda x: x[1], reverse=True)
    top_docs[topic_id] = sorted_docs[:3]
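# Resulting shape (illustrative values): top_docs = {0: [(12, 0.91), (3, 0.87), (44, 0.80)], ...},
# i.e. for each topic id, the three corpus indices with the highest topic probability.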
# Aggregate output into a single string
output_str = "Identified Prototypical Self-States (Topics):\n\n"
for topic_id, topic_str in lda_model.print_topics(num_words=num_top_elements_to_show):
    output_str += f"Topic {topic_id}:\n"
    # print_topics yields strings like '0.123*"token" + 0.045*"other"'; parse them apart
    terms = topic_str.split(" + ")
    for term in terms:
        weight, token = term.split("*")
        token = token.strip().replace('"', '')
        output_str += f"  {float(weight):.3f} -> {token}\n"
    output_str += "  Top 3 Documents (Segment Indices) for this topic:\n"
    for doc_index, prob in top_docs[topic_id]:
        # lda_document_ids maps a corpus index back to its document/segment identifier
        output_str += f"    Doc {doc_index} ({lda_document_ids[doc_index]}) with probability {prob:.3f}\n"
    output_str += "-" * 60 + "\n"
# Display the aggregated string in Streamlit
st.text(output_str)
# ---------------------------
# Prepare and Display pyLDAvis Visualization
# ---------------------------
st.header("Interactive Topic Visualization")
if not show_long_elements:
    # Relabel tokens with their short descriptions for the visualization. This rebuilds
    # a Dictionary from one single-token document per relabeled term; note that it
    # assumes the new ids are assigned in the same order as the original token ids.
    vis_dict = {i: element_short_desc_map[v] for i, v in dictionary.items()}
    vis_dictionary = corpora.dictionary.Dictionary([[new_token] for new_token in vis_dict.values()])
    vis_data = gensimvis.prepare(lda_model, corpus, vis_dictionary)
else:
    vis_data = gensimvis.prepare(lda_model, corpus, dictionary)
html_string = pyLDAvis.prepared_data_to_html(vis_data)
components.html(html_string, width=2300, height=800, scrolling=True)
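# To run the app locally (assuming this file is saved as app.py and Streamlit is installed):
#   streamlit run app.py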