Commit 685d696 · Dana Atzil committed · Parent: 0f1ab6c

add files
Files changed:
- MIND_utils.py +120 -0
- clean_annotations_safe.csv +0 -0
- streamlit_app_LDA.py +145 -0
MIND_utils.py
ADDED
@@ -0,0 +1,120 @@
import numpy as np, pandas as pd, json

dimensions = ["A", "B-S", "B-O", "C-S", "C-O", "D"]
dimension_to_layer_name = {
    "B-S": "Segment Patient B-S",
    "C-S": "Segment Patient C-S",
    "D": "Segment Patient Desire (D)",
    "C-O": "Segment Patient C-O",
    "B-O": "Segment Patient B-O",
    "A": "Segment Patient Affect (A)",
}

def df_to_self_states_json(df, doc_name, annotator=None):
    """Convert an annotation dataframe into a JSON object that is easier to use for visualization.

    df        -- the dataframe of annotations
    doc_name  -- the name of the document
    annotator -- the name of the annotator (optional)
    """
    def get_evidence_obj(evidence_df):
        """Assumes evidence_df is a partial dataframe holding the annotations of a single evidence span."""
        evidence_obj = {k: v.value.iloc[0] for k, v in evidence_df.groupby("feature")}
        # evidence_obj["text"] = evidence_df.span_text.iloc[0]
        # evidence_obj["span_index_begin"] = evidence_df.begin.iloc[0]
        # evidence_obj["span_index_end"] = evidence_df.end.iloc[0]
        return evidence_obj

    doc_object = {"document": doc_name, "annotator": annotator, "segments": []}
    doc_df = df[df.document == doc_name]
    if annotator:
        doc_df = doc_df[doc_df.annotator == annotator]

    # Add the segments. Iterating over doc_df (not df) keeps the annotator filter above in effect.
    for segment_index, segment_group in doc_df.groupby("segment"):
        # add Segment Summary features into the segment object
        segment_object = {"segment": segment_index}
        segment_summary_df = segment_group[segment_group.layer == "Segment Summary"]
        # # to skip segments that have no summary annotations:
        # if segment_summary_df.empty:
        #     continue
        segment_object["Segment Summary"] = {k: v.value.iloc[0] for k, v in segment_summary_df.groupby("feature")}

        state1_df = segment_group[segment_group.self_state_index == 1]
        state2_df = segment_group[segment_group.self_state_index == 2]
        states_list = []
        state1_obj = {}
        state2_obj = {}
        # set is_adaptive for each self-state that is present in the segment
        if not state1_df.empty:
            state1_obj["is_adaptive"] = state1_df.is_adaptive.dropna().iloc[0]
            states_list.append(state1_obj)
        if not state2_df.empty:
            state2_obj["is_adaptive"] = state2_df.is_adaptive.dropna().iloc[0]
            states_list.append(state2_obj)

        # collect elements per dimension (the same logic applies to both self-states)
        for dimension in dimensions:
            segment_dim_layer_name = dimension_to_layer_name[dimension]
            # evidence layers that match the same segment and dimension
            dim_evidence_rows = segment_group[segment_group.layer == f"Patient_{dimension}_evidence"]
            for state_df, state_obj in ((state1_df, state1_obj), (state2_df, state2_obj)):
                state_dimension_df = state_df[state_df.layer == segment_dim_layer_name]
                if state_dimension_df.empty:
                    continue
                # features are e.g. "Category", "Adaptivity", "Presence"
                state_obj[dimension] = {k: v.value.iloc[0] for k, v in state_dimension_df.groupby("feature")}
                evidences_obj = []
                # each (begin, end) pair identifies one evidence span
                for _, evidence_df in dim_evidence_rows.groupby(["begin", "end"]):
                    # keep only evidence whose category matches the segment-level element of the same dimension
                    category_rows = evidence_df[evidence_df.feature == "Category"]
                    if not category_rows.empty and category_rows.value.iloc[0] == state_obj[dimension].get("Category"):
                        evidences_obj.append(get_evidence_obj(evidence_df))
                if evidences_obj:
                    state_obj[dimension]["evidences"] = evidences_obj

        segment_object["self-states"] = states_list

        # add the segment object to the document object
        doc_object["segments"].append(segment_object)
    return doc_object

# Map full element labels to short display labels.
# The keys must match the raw annotation values exactly, including their original spelling.
element_short_desc_map = {
    'A:(11) Proud': 'A:(11) Proud',
    'B-O:(1) Relating behavior': 'B-O:(1) Relating',
    'C-S:(1) Self-acceptance and compassion': 'C-S:(1) Self-compassion',
    'D:(1) Relatedness': 'D:(1) Relatedness',
    'A:(4) Depressed, despair, hopeless': 'A:(4) Depressed',
    'C-O:(4) Perception of the other as blocking autonomy needs': 'C-O:(4) Other blocks autonomy',
    'C-S:(2) Self criticism': 'C-S:(2) Self-criticism',
    'C-O:(2) Perception of the other as detached or over attached': 'C-O:(2) Other detached/overattached',
    'C-O:(1) Perception of the other as related': 'C-O:(1) Other related',
    'A:(3) Sad, emotional pain, grieving': 'A:(3) Sadness',
    'B-O:(2) Fight or flight behavior': 'B-O:(2) Fight/flight',
    'A:(14) Feel lonely': 'A:(14) Lonely',
    'D:(2) Expectation that relatedness needs will not be met': 'D:(2) Relatedness (-)',
    'B-S:(2) Self harm, neglect and avoidance': 'B-S:(2) Self-harm',
    'A:(10) Angry (aggression), disgust, contempt': 'A:(10) Angry/Aggressive',
    'A:(8) Apathic, don’t care, blunted': 'A:(8) Apathetic',
    'B-S:(1) Self care and improvement': 'B-S:(1) Self-care',
    'D:(5) Competence, self esteem, self-care': 'D:(5) Competence',
    'D:(6) Expectation that competence needs will not be met': 'D:(6) Competence (-)',
    'C-O:(3) Perception of the other as facilitating autonomy needs': 'C-O:(3) Other supports autonomy',
    'A:(2) Anxious/ fearful/ tense': 'A:(2) Anxious',
    'A:(12) Ashamed, guilty': 'A:(12) Ashamed/Guilty',
    'B-O:(4) Over controlled or controlling behavior': 'B-O:(4) Controlling',
    'A:(1) Calm/ laid back': 'A:(1) Calm',
    'D:(4) Expectation that autonomy needs will not be met': 'D:(4) Autonomy (-)',
    'D:(3) Autonomy and adaptive control': 'D:(3) Autonomy',
    'A:(5) Content, happy, joy, hopeful': 'A:(5) Happy',
    'B-O:(3) Autonomous or adaptive control behavior': 'B-O:(3) Adaptive control',
    'A:(9) Justifiable anger/ assertive anger, justifiable outrage': 'A:(9) Justified anger',
    'A:(13) Feel loved, belong': 'A:(13) Loved/Belonging',
    'A:(7) Vigor / energetic': 'A:(7) Vigor'
}
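For orientation, here is a minimal usage sketch for df_to_self_states_json (not part of the commit). Only the column names are taken from the code above; the row values and the "Dominant" summary feature are invented stand-ins:

    import pandas as pd
    from MIND_utils import df_to_self_states_json

    # Toy rows using the columns the function reads; all values are made up.
    rows = [
        # a segment-level element on dimension A, belonging to self-state 1
        {"document": "doc1", "annotator": "ann1", "segment": 1,
         "layer": "Segment Patient Affect (A)", "feature": "Category",
         "value": "(3) Sad, emotional pain, grieving",
         "self_state_index": 1, "is_adaptive": "no", "begin": 0, "end": 10},
        # a segment-summary row ("Dominant" is a hypothetical feature name)
        {"document": "doc1", "annotator": "ann1", "segment": 1,
         "layer": "Segment Summary", "feature": "Dominant", "value": "state1",
         "self_state_index": 1, "is_adaptive": "no", "begin": 0, "end": 0},
    ]
    doc_json = df_to_self_states_json(pd.DataFrame(rows), "doc1", "ann1")
    # doc_json == {"document": "doc1", "annotator": "ann1",
    #              "segments": [{"segment": 1,
    #                            "Segment Summary": {"Dominant": "state1"},
    #                            "self-states": [{"is_adaptive": "no",
    #                                             "A": {"Category": "(3) Sad, emotional pain, grieving"}}]}]}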
clean_annotations_safe.csv
ADDED
The diff for this file is too large to render. See raw diff.
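Although this diff is not rendered, the code in this commit implies the minimum schema of clean_annotations_safe.csv: one row per annotated feature, with at least the columns document, annotator, segment, layer, feature, value, self_state_index, is_adaptive, begin, and end. A quick sanity check, as a sketch (not part of the commit):

    import pandas as pd

    df = pd.read_csv("clean_annotations_safe.csv")
    expected = {"document", "annotator", "segment", "layer", "feature",
                "value", "self_state_index", "is_adaptive", "begin", "end"}
    missing = expected - set(df.columns)
    assert not missing, f"missing columns: {missing}"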
streamlit_app_LDA.py
ADDED
@@ -0,0 +1,145 @@
import streamlit as st
import json
import random
import numpy as np
from gensim import corpora, models
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis
import pandas as pd
import streamlit.components.v1 as components

from MIND_utils import df_to_self_states_json, element_short_desc_map


# ---------------------------
# Streamlit App Layout
# ---------------------------
st.title("Prototypical Self-States via Topic Modeling")

st.sidebar.header("Model Parameters")
num_topics = st.sidebar.slider("Number of Topics", min_value=2, max_value=20, value=5)
num_passes = st.sidebar.slider("Number of Passes", min_value=5, max_value=50, value=10)
lda_document_is = st.radio("A 'Document' in the topic model will correspond to a:", ("self-state", "segment"))
seed_value = st.sidebar.number_input("Random Seed", value=42)
num_top_elements_to_show = st.sidebar.slider("# top elements to show in a topic", min_value=2, max_value=15, value=5)

# ---------------------------
# Load Data
# ---------------------------
# You could also allow users to upload their own file via st.file_uploader.
# @st.cache(allow_output_mutation=True)
def load_data():
    return pd.read_csv("clean_annotations_safe.csv")

df = load_data()

# ---------------------------
# Preprocess Data: Build Documents
# ---------------------------
# Set random seeds for reproducibility
random.seed(seed_value)
np.random.seed(seed_value)

# Functions to extract "words" (elements -- <dim>:<category>) from a self-state / segment
def extract_elements_from_selfstate(selfstate):
    words = []
    for dim, dim_obj in selfstate.items():
        if dim == "is_adaptive":  # scalar flag, not a dimension object
            continue
        if "Category" in dim_obj and not pd.isna(dim_obj["Category"]):
            words.append(f"{dim}:{dim_obj['Category']}")
    return words

def extract_elements_from_segment(segment):
    words = []
    for selfstate in segment["self-states"]:
        words += extract_elements_from_selfstate(selfstate)
    return words

# Build the list of "documents" (one per segment or per self-state, depending on the radio button)
lda_documents = []
lda_document_ids = []
for (doc_id, annotator), df_ in df.groupby(["document", "annotator"]):
    doc_json = df_to_self_states_json(df_, doc_id, annotator)
    if lda_document_is == "segment":
        # segment-level LDA documents
        for segment in doc_json["segments"]:
            lda_doc = extract_elements_from_segment(segment)
            if lda_doc:  # only add if non-empty
                lda_documents.append(lda_doc)
                lda_document_ids.append(f"{doc_id}_seg{segment['segment']}")
    elif lda_document_is == "self-state":
        # self-state-level LDA documents
        for segment in doc_json["segments"]:
            for i, selfstate in enumerate(segment["self-states"]):
                lda_doc = extract_elements_from_selfstate(selfstate)
                if lda_doc:
                    lda_documents.append(lda_doc)
                    lda_document_ids.append(f"{doc_id}_seg{segment['segment']}_state{i+1}")

# Create a dictionary and corpus for LDA
dictionary = corpora.Dictionary(lda_documents)
corpus = [dictionary.doc2bow(doc) for doc in lda_documents]


# ---------------------------
# Run LDA Model
# ---------------------------
lda_model = models.LdaModel(corpus,
                            num_topics=num_topics,
                            id2word=dictionary,
                            passes=num_passes,
                            random_state=seed_value)

# ---------------------------
# Display Pretty-Printed Topics
# ---------------------------
st.header("Pretty Printed Topics")

# Map each topic to a list of (document index, topic probability) pairs
topic_docs = {topic_id: [] for topic_id in range(lda_model.num_topics)}

# Iterate over the corpus to get the topic distribution of each document
for i, doc_bow in enumerate(corpus):
    # minimum_probability=0 so that every topic is included
    doc_topics = lda_model.get_document_topics(doc_bow, minimum_probability=0)
    for topic_id, prob in doc_topics:
        topic_docs[topic_id].append((i, prob))

# For each topic, sort the documents by probability in descending order and keep the top 3
top_docs = {}
for topic_id, doc_list in topic_docs.items():
    sorted_docs = sorted(doc_list, key=lambda x: x[1], reverse=True)
    top_docs[topic_id] = sorted_docs[:3]

# Aggregate the output into a single string
output_str = "Identified Prototypical Self-States (Topics):\n\n"
for topic_id, topic_str in lda_model.print_topics(num_words=num_top_elements_to_show):
    output_str += f"Topic {topic_id}:\n"
    terms = topic_str.split(" + ")
    for term in terms:
        weight, token = term.split("*")
        token = token.strip().replace('"', '')
        output_str += f"  {float(weight):.3f} -> {token}\n"

    output_str += "  Top 3 Documents for this topic:\n"
    for doc_index, prob in top_docs[topic_id]:
        # lda_document_ids maps document indices to human-readable identifiers
        output_str += f"    Doc {doc_index} ({lda_document_ids[doc_index]}) with probability {prob:.3f}\n"
    output_str += "-" * 60 + "\n"

# Display the aggregated string in Streamlit
st.text(output_str)


# ---------------------------
# Prepare and Display pyLDAvis Visualization
# ---------------------------
st.header("Interactive Topic Visualization")
# vis_dict = {i: element_short_desc_map[v] for i, v in dictionary.items()}
# vis_dictionary = corpora.dictionary.Dictionary([[new_token] for new_token in vis_dict.values()])
vis_data = gensimvis.prepare(lda_model, corpus, dictionary)
html_string = pyLDAvis.prepared_data_to_html(vis_data)
components.html(html_string, width=1300, height=800)
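To try the app locally (assuming streamlit, gensim, pyLDAvis, numpy, and pandas are installed, and clean_annotations_safe.csv sits next to the script), the standard entry point is: streamlit run streamlit_app_LDA.py

Each LDA "word" is an element token of the form <dim>:<category>, so a "document" is just a bag of such tokens. A minimal sketch with two made-up documents shows the corpus format models.LdaModel consumes:

    from gensim import corpora

    lda_documents = [
        ["A:(3) Sad, emotional pain, grieving", "C-S:(2) Self criticism"],
        ["A:(5) Content, happy, joy, hopeful"],
    ]
    dictionary = corpora.Dictionary(lda_documents)               # token <-> integer id
    corpus = [dictionary.doc2bow(doc) for doc in lda_documents]  # [(token_id, count), ...] per document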