Dana Atzil committed on
Commit
685d696
·
1 Parent(s): 0f1ab6c
Files changed (3) hide show
  1. MIND_utils.py +120 -0
  2. clean_annotations_safe.csv +0 -0
  3. streamlit_app_LDA.py +145 -0
MIND_utils.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np, pandas as pd, json
2
+
3
+ dimensions = ["A", "B-S", "B-O", "C-S", "C-O", "D"]
4
+ dimension_to_layer_name = {
5
+ "B-S": "Segment Patient B-S",
6
+ "C-S": "Segment Patient C-S",
7
+ "D": "Segment Patient Desire (D)",
8
+ "C-O": "Segment Patient C-O",
9
+ "B-O": "Segment Patient B-O",
10
+ "A": "Segment Patient Affect (A)",
11
+ }
12
+
13
+ def df_to_self_states_json(df, doc_name, annotator = None):
14
+ """Convert a dataframe into a json object that can be more easily used for visualization."""
15
+ # df is the dataframe of annotations
16
+ # doc_name is the name of the document
17
+ # annotator is the name of the annotator (optional)
18
+ def get_evidence_obj(evidence_df):
19
+ "Assume that the evidence_df is a partial dataframe including only annotation of a single evidence span."
20
+ evidence_obj = {k: v.value.iloc[0] for k, v in evidence_df.groupby("feature")}
21
+ # evidence_obj["text"] = evidence_df.span_text.iloc[0]
22
+ # evidence_obj["span_index_begin"] = evidence_df.begin.iloc[0]
23
+ # evidence_obj["span_index_end"] = evidence_df.end.iloc[0]
24
+ return evidence_obj
25
+
26
+ doc_object = {"document": doc_name, "annotator": annotator}
27
+ doc_object["segments"] = []
28
+ doc_df = df[df.document == doc_name]
29
+ if annotator:
30
+ doc_df = doc_df[doc_df.annotator == annotator]
31
+
32
+ # now add the segments
33
+ for segment_index, segment_group in df[df.document == doc_name].groupby("segment"):
34
+ # add Segment Summary features into segment object
35
+ segment_object = {"segment": segment_index}
36
+ segment_summary_df = segment_group[segment_group.layer == "Segment Summary"]
37
+ # # if not post-summary, skip this post
38
+ # if segment_summary_df.empty:
39
+ # continue
40
+ segment_object["Segment Summary"] = {k: v.value.iloc[0] for k, v in segment_summary_df.groupby("feature")}
41
+
42
+ state1_df = segment_group[segment_group.self_state_index == 1]
43
+ state2_df = segment_group[segment_group.self_state_index == 2]
44
+ states_list = list()
45
+ state1_obj = dict()
46
+ state2_obj = dict()
47
+ # set is_adaptive for each state
48
+ if not state1_df.empty:
49
+ state1_obj["is_adaptive"] = state1_df.is_adaptive.dropna().iloc[0]
50
+ states_list.append(state1_obj)
51
+ if not state2_df.empty:
52
+ state2_obj["is_adaptive"] = state2_df.is_adaptive.dropna().iloc[0]
53
+ states_list.append(state2_obj)
54
+ # collect elements per dimension
55
+ for dimension in dimensions:
56
+ segment_dim_layer_name = dimension_to_layer_name[dimension]
57
+ state1_dimension_df = state1_df[state1_df.layer == segment_dim_layer_name]
58
+ state2_dimension_df = state2_df[state2_df.layer == segment_dim_layer_name]
59
+ # search for evidence layers that match the same segment and dimension
60
+ dim_evidence_rows = segment_group[segment_group.layer == f"Patient_{dimension}_evidence"]
61
+ if not state1_dimension_df.empty:
62
+ state1_obj[dimension] = {k: v.value.iloc[0] for k, v in state1_dimension_df.groupby("feature")} # "Category", "Adaptivity", "Presence"
63
+ evidences_obj = []
64
+ # for _, evidence_df in dim_evidence_rows.groupby("span_text"):
65
+ for _, evidence_df in dim_evidence_rows.groupby(["begin", "end"]):
66
+ # take only the evidence that matches the category of the segment-level element of the same dimension
67
+ if not evidence_df.empty and evidence_df[evidence_df.feature == "Category"].value.iloc[0] == state1_obj[dimension]["Category"]:
68
+ evidences_obj.append(get_evidence_obj(evidence_df))
69
+ if evidences_obj:
70
+ state1_obj[dimension]["evidences"] = evidences_obj
71
+ if not state2_dimension_df.empty:
72
+ state2_obj[dimension] = {k: v.value.iloc[0] for k, v in state2_dimension_df.groupby("feature")} # "Category", "Adaptivity", "Presence"
73
+ evidences_obj = []
74
+ # for _, evidence_df in dim_evidence_rows.groupby("span_text"):
75
+ for _, evidence_df in dim_evidence_rows.groupby(["begin", "end"]):
76
+ # take only the evidence that matches the category of the segment-level element of the same dimension
77
+ if not evidence_df.empty and evidence_df[evidence_df.feature == "Category"].value.iloc[0] == state2_obj[dimension]["Category"]:
78
+ evidences_obj.append(get_evidence_obj(evidence_df))
79
+ if evidences_obj:
80
+ state2_obj[dimension]["evidences"] = evidences_obj
81
+
82
+ segment_object["self-states"] = states_list
83
+
84
+ # add the post object to the document object
85
+ doc_object["segments"].append(segment_object)
86
+ return doc_object
87
+
88
# Maps each full element label ("<dim>:(<n>) <long description>") to a short
# label for compact display (e.g. topic listings / visualizations). Keys must
# match the "<dim>:<Category>" tokens built elsewhere from the annotations,
# so neither side of the mapping should be edited cosmetically.
element_short_desc_map = {
    'A:(11) Proud': 'A:(11) Proud',
    'B-O:(1) Relating behavior': 'B-O:(1) Relating',
    'C-S:(1) Self-acceptance and compassion': 'C-S:(1) Self-compassion',
    'D:(1) Relatedness': 'D:(1) Relatedness',
    'A:(4) Depressed, despair, hopeless': 'A:(4) Depressed',
    'C-O:(4) Perception of the other as blocking autonomy needs': 'C-O:(4) Other blocks autonomy',
    'C-S:(2) Self criticism': 'C-S:(2) Self-criticism',
    'C-O:(2) Perception of the other as detached or over attached': 'C-O:(2) Other detached/overattached',
    'C-O:(1) Perception of the other as related': 'C-O:(1) Other related',
    'A:(3) Sad, emotional pain, grieving': 'A:(3) Sadness',
    'B-O:(2) Fight or flight behavior': 'B-O:(2) Fight/flight',
    'A:(14) Feel lonely': 'A:(14) Lonely',
    'D:(2) Expectation that relatedness needs will not be met': 'D:(2) Relatedness (-)',
    'B-S:(2) Self harm, neglect and avoidance': 'B-S:(2) Self-harm',
    'A:(10) Angry (aggression), disgust, contempt': 'A:(10) Angry/Aggressive',
    'A:(8) Apathic, don’t care, blunted': 'A:(8) Apathetic',
    'B-S:(1) Self care and improvement': 'B-S:(1) Self-care',
    'D:(5) Competence, self esteem, self-care': 'D:(5) Competence',
    'D:(6) Expectation that competence needs will not be met': 'D:(6) Competence (-)',
    'C-O:(3) Perception of the other as facilitating autonomy needs': 'C-O:(3) Other supports autonomy',
    'A:(2) Anxious/ fearful/ tense': 'A:(2) Anxious',
    'A:(12) Ashamed, guilty': 'A:(12) Ashamed/Guilty',
    'B-O:(4) Over controlled or controlling behavior': 'B-O:(4) Controlling',
    'A:(1) Calm/ laid back': 'A:(1) Calm',
    'D:(4) Expectation that autonomy needs will not be met': 'D:(4) Autonomy (-)',
    'D:(3) Autonomy and adaptive control': 'D:(3) Autonomy',
    'A:(5) Content, happy, joy, hopeful': 'A:(5) Happy',
    'B-O:(3) Autonomous or adaptive control behavior': 'B-O:(3) Adaptive control',
    'A:(9) Justifiable anger/ assertive anger, justifiable outrage': 'A:(9) Justified anger',
    'A:(13) Feel loved, belong': 'A:(13) Loved/Belonging',
    'A:(7) Vigor / energetic': 'A:(7) Vigor'
}
clean_annotations_safe.csv ADDED
The diff for this file is too large to render. See raw diff
 
streamlit_app_LDA.py ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import json
3
+ import random
4
+ import numpy as np
5
+ from gensim import corpora, models
6
+ import pyLDAvis.gensim_models as gensimvis
7
+ import pyLDAvis
8
+ import pandas as pd
9
+ import streamlit.components.v1 as components
10
+
11
+ from MIND_utils import df_to_self_states_json, element_short_desc_map
12
+
13
+
14
# ---------------------------
# Streamlit App Layout
# ---------------------------
st.title("Prototypical Self-States via Topic Modeling")

st.sidebar.header("Model Parameters")
# Number of LDA topics (candidate prototypical self-states) to fit.
num_topics = st.sidebar.slider("Number of Topics", min_value=2, max_value=20, value=5)
# Number of training passes for gensim's LdaModel.
num_passes = st.sidebar.slider("Number of Passes", min_value=5, max_value=50, value=10)
# NOTE(review): this radio is placed on the main page, unlike the other
# parameters which live in the sidebar — confirm that placement is intentional.
lda_document_is = st.radio("A 'Document' in the topic model will correspond to a:", ("self-state", "segment"))
# Seed used both for python/numpy RNGs and as LdaModel's random_state.
seed_value = st.sidebar.number_input("Random Seed", value=42)
# How many of the highest-weighted elements to print per topic.
num_top_elements_to_show = st.sidebar.slider("# top element to show in a topic", min_value=2, max_value=15, value=5)
25
+
26
+ # ---------------------------
27
+ # Load Data
28
+ # ---------------------------
29
+ # You can also allow users to upload their file via st.file_uploader.
30
+ # @st.cache(allow_output_mutation=True)
31
def load_data():
    """Read the annotation export that ships alongside the app."""
    annotations = pd.read_csv("clean_annotations_safe.csv")
    return annotations
33
+
34
df = load_data()

# ---------------------------
# Preprocess Data: Build Documents
# ---------------------------
# Set random seeds for reproducibility. LdaModel additionally receives
# random_state=seed_value below; these cover any other stochastic steps.
random.seed(seed_value)
np.random.seed(seed_value)
42
+
43
+ # Functions to extract "words" (elements -- <dim>:<category>) from a segment / self-state
44
def extract_elements_from_selfstate(selfstate):
    """Return the "word" tokens ("<dim>:<category>") of a single self-state.

    Dimensions without an annotated (non-NaN) "Category" feature are skipped;
    the "is_adaptive" entry is metadata, not a dimension.
    """
    return [
        f"{dim}:{dim_obj['Category']}"
        for dim, dim_obj in selfstate.items()
        if dim != "is_adaptive"
        and "Category" in dim_obj
        and not pd.isna(dim_obj["Category"])
    ]
53
+
54
def extract_elements_from_segment(segment):
    """Concatenate the element tokens of every self-state in a segment."""
    return [
        token
        for selfstate in segment["self-states"]
        for token in extract_elements_from_selfstate(selfstate)
    ]
59
+
60
+ # Build a list of "documents" (one per segment)
61
+ lda_documents = []
62
+ lda_document_ids = []
63
+ for (doc_id, annotator), df_ in df.groupby(["document", "annotator"]):
64
+ doc_json = df_to_self_states_json(df_, doc_id, annotator)
65
+ ### * for Segment-level LDA-documents:
66
+ if lda_document_is == "segment":
67
+ for segment in doc_json["segments"]:
68
+ lda_doc = extract_elements_from_segment(segment)
69
+ if lda_doc: # only add if non-empty
70
+ lda_documents.append(lda_doc)
71
+ lda_document_ids.append(f"{doc_id}_seg{segment['segment']}")
72
+ ### * for SelfState-level LDA-documents:
73
+ elif lda_document_is == "self-state":
74
+ for segment in doc_json["segments"]:
75
+ for i, selfstate in enumerate(segment["self-states"]):
76
+ lda_doc = extract_elements_from_selfstate(selfstate)
77
+ if lda_doc:
78
+ lda_documents.append(lda_doc)
79
+ lda_document_ids.append(f"{doc_id}_seg{segment['segment']}_state{i+1}")
80
+
81
+ # Create a dictionary and corpus for LDA
82
+ dictionary = corpora.Dictionary(lda_documents)
83
+ corpus = [dictionary.doc2bow(doc) for doc in lda_documents]
84
+
85
+
86
+ # ---------------------------
87
+ # Run LDA Model
88
+ # ---------------------------
89
+ lda_model = models.LdaModel(corpus,
90
+ num_topics=num_topics,
91
+ id2word=dictionary,
92
+ passes=num_passes,
93
+ random_state=seed_value)
94
+
95
# ---------------------------
# Display Pretty Printed Topics
# ---------------------------
st.header("Pretty Printed Topics")

# Map each topic id to a list of (document index, topic probability).
topic_docs = {topic_id: [] for topic_id in range(lda_model.num_topics)}

# Collect the full topic distribution of every LDA document
# (minimum_probability=0 so every topic appears for every document).
for i, doc_bow in enumerate(corpus):
    for topic_id, prob in lda_model.get_document_topics(doc_bow, minimum_probability=0):
        topic_docs[topic_id].append((i, prob))

# For each topic, keep the 3 documents with the highest topic probability.
top_docs = {}
for topic_id, doc_list in topic_docs.items():
    top_docs[topic_id] = sorted(doc_list, key=lambda x: x[1], reverse=True)[:3]

# Aggregate the report into a single string.
output_str = "Identified Prototypical Self-States (Topics):\n\n"
for topic_id, topic_str in lda_model.print_topics(num_words=num_top_elements_to_show):
    output_str += f"Topic {topic_id}:\n"
    for term in topic_str.split(" + "):
        # gensim terms look like '0.123*"token"'. Fix: split only on the first
        # '*' so a token containing '*' cannot break the unpacking.
        weight, token = term.split("*", 1)
        token = token.strip().replace('"', '')
        output_str += f" {float(weight):.3f} -> {token}\n"

    output_str += " Top 3 Documents (Segment Indices) for this topic:\n"
    for doc_index, prob in top_docs[topic_id]:
        # lda_document_ids maps corpus positions back to human-readable ids.
        output_str += f" Doc {doc_index} ({lda_document_ids[doc_index]}) with probability {prob:.3f}\n"
    output_str += "-" * 60 + "\n"

# Display the aggregated report. (Fix: removed a redundant mid-file
# 'import streamlit as st' — st is already imported at the top of the file.)
st.text(output_str)
135
+
136
+
137
# ---------------------------
# Prepare and Display pyLDAvis Visualization
# ---------------------------
st.header("Interactive Topic Visualization")
# (left as reference) alternative dictionary using shortened element labels:
# vis_dict = {i: element_short_desc_map[v] for i, v in dictionary.items()}
# vis_dictionary = corpora.dictionary.Dictionary([[new_token] for new_token in vis_dict.values()])
# Compute the topic/term layout, render it as a self-contained HTML bundle,
# and embed it in the Streamlit page.
vis_data = gensimvis.prepare(lda_model, corpus, dictionary)
html_string = pyLDAvis.prepared_data_to_html(vis_data)
components.html(html_string, width=1300, height=800)