Dana Atzil committed on
Commit
685d696
·
1 Parent(s): 0f1ab6c
Files changed (3) hide show
  1. MIND_utils.py +120 -0
  2. clean_annotations_safe.csv +0 -0
  3. streamlit_app_LDA.py +145 -0
MIND_utils.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np, pandas as pd, json
2
+
3
+ dimensions = ["A", "B-S", "B-O", "C-S", "C-O", "D"]
4
+ dimension_to_layer_name = {
5
+ "B-S": "Segment Patient B-S",
6
+ "C-S": "Segment Patient C-S",
7
+ "D": "Segment Patient Desire (D)",
8
+ "C-O": "Segment Patient C-O",
9
+ "B-O": "Segment Patient B-O",
10
+ "A": "Segment Patient Affect (A)",
11
+ }
12
+
13
+ def df_to_self_states_json(df, doc_name, annotator = None):
14
+ """Convert a dataframe into a json object that can be more easily used for visualization."""
15
+ # df is the dataframe of annotations
16
+ # doc_name is the name of the document
17
+ # annotator is the name of the annotator (optional)
18
+ def get_evidence_obj(evidence_df):
19
+ "Assume that the evidence_df is a partial dataframe including only annotation of a single evidence span."
20
+ evidence_obj = {k: v.value.iloc[0] for k, v in evidence_df.groupby("feature")}
21
+ # evidence_obj["text"] = evidence_df.span_text.iloc[0]
22
+ # evidence_obj["span_index_begin"] = evidence_df.begin.iloc[0]
23
+ # evidence_obj["span_index_end"] = evidence_df.end.iloc[0]
24
+ return evidence_obj
25
+
26
+ doc_object = {"document": doc_name, "annotator": annotator}
27
+ doc_object["segments"] = []
28
+ doc_df = df[df.document == doc_name]
29
+ if annotator:
30
+ doc_df = doc_df[doc_df.annotator == annotator]
31
+
32
+ # now add the segments
33
+ for segment_index, segment_group in df[df.document == doc_name].groupby("segment"):
34
+ # add Segment Summary features into segment object
35
+ segment_object = {"segment": segment_index}
36
+ segment_summary_df = segment_group[segment_group.layer == "Segment Summary"]
37
+ # # if not post-summary, skip this post
38
+ # if segment_summary_df.empty:
39
+ # continue
40
+ segment_object["Segment Summary"] = {k: v.value.iloc[0] for k, v in segment_summary_df.groupby("feature")}
41
+
42
+ state1_df = segment_group[segment_group.self_state_index == 1]
43
+ state2_df = segment_group[segment_group.self_state_index == 2]
44
+ states_list = list()
45
+ state1_obj = dict()
46
+ state2_obj = dict()
47
+ # set is_adaptive for each state
48
+ if not state1_df.empty:
49
+ state1_obj["is_adaptive"] = state1_df.is_adaptive.dropna().iloc[0]
50
+ states_list.append(state1_obj)
51
+ if not state2_df.empty:
52
+ state2_obj["is_adaptive"] = state2_df.is_adaptive.dropna().iloc[0]
53
+ states_list.append(state2_obj)
54
+ # collect elements per dimension
55
+ for dimension in dimensions:
56
+ segment_dim_layer_name = dimension_to_layer_name[dimension]
57
+ state1_dimension_df = state1_df[state1_df.layer == segment_dim_layer_name]
58
+ state2_dimension_df = state2_df[state2_df.layer == segment_dim_layer_name]
59
+ # search for evidence layers that match the same segment and dimension
60
+ dim_evidence_rows = segment_group[segment_group.layer == f"Patient_{dimension}_evidence"]
61
+ if not state1_dimension_df.empty:
62
+ state1_obj[dimension] = {k: v.value.iloc[0] for k, v in state1_dimension_df.groupby("feature")} # "Category", "Adaptivity", "Presence"
63
+ evidences_obj = []
64
+ # for _, evidence_df in dim_evidence_rows.groupby("span_text"):
65
+ for _, evidence_df in dim_evidence_rows.groupby(["begin", "end"]):
66
+ # take only the evidence that matches the category of the segment-level element of the same dimension
67
+ if not evidence_df.empty and evidence_df[evidence_df.feature == "Category"].value.iloc[0] == state1_obj[dimension]["Category"]:
68
+ evidences_obj.append(get_evidence_obj(evidence_df))
69
+ if evidences_obj:
70
+ state1_obj[dimension]["evidences"] = evidences_obj
71
+ if not state2_dimension_df.empty:
72
+ state2_obj[dimension] = {k: v.value.iloc[0] for k, v in state2_dimension_df.groupby("feature")} # "Category", "Adaptivity", "Presence"
73
+ evidences_obj = []
74
+ # for _, evidence_df in dim_evidence_rows.groupby("span_text"):
75
+ for _, evidence_df in dim_evidence_rows.groupby(["begin", "end"]):
76
+ # take only the evidence that matches the category of the segment-level element of the same dimension
77
+ if not evidence_df.empty and evidence_df[evidence_df.feature == "Category"].value.iloc[0] == state2_obj[dimension]["Category"]:
78
+ evidences_obj.append(get_evidence_obj(evidence_df))
79
+ if evidences_obj:
80
+ state2_obj[dimension]["evidences"] = evidences_obj
81
+
82
+ segment_object["self-states"] = states_list
83
+
84
+ # add the post object to the document object
85
+ doc_object["segments"].append(segment_object)
86
+ return doc_object
87
+
88
# Maps each full element label ("<dim>:(<n>) <long description>") to a short
# label for compact display (e.g. topic listings / visualizations). Keys must
# match the "<dim>:<Category>" tokens built elsewhere from the annotations,
# so neither side of the mapping should be edited cosmetically.
element_short_desc_map = {
    'A:(11) Proud': 'A:(11) Proud',
    'B-O:(1) Relating behavior': 'B-O:(1) Relating',
    'C-S:(1) Self-acceptance and compassion': 'C-S:(1) Self-compassion',
    'D:(1) Relatedness': 'D:(1) Relatedness',
    'A:(4) Depressed, despair, hopeless': 'A:(4) Depressed',
    'C-O:(4) Perception of the other as blocking autonomy needs': 'C-O:(4) Other blocks autonomy',
    'C-S:(2) Self criticism': 'C-S:(2) Self-criticism',
    'C-O:(2) Perception of the other as detached or over attached': 'C-O:(2) Other detached/overattached',
    'C-O:(1) Perception of the other as related': 'C-O:(1) Other related',
    'A:(3) Sad, emotional pain, grieving': 'A:(3) Sadness',
    'B-O:(2) Fight or flight behavior': 'B-O:(2) Fight/flight',
    'A:(14) Feel lonely': 'A:(14) Lonely',
    'D:(2) Expectation that relatedness needs will not be met': 'D:(2) Relatedness (-)',
    'B-S:(2) Self harm, neglect and avoidance': 'B-S:(2) Self-harm',
    'A:(10) Angry (aggression), disgust, contempt': 'A:(10) Angry/Aggressive',
    'A:(8) Apathic, don’t care, blunted': 'A:(8) Apathetic',
    'B-S:(1) Self care and improvement': 'B-S:(1) Self-care',
    'D:(5) Competence, self esteem, self-care': 'D:(5) Competence',
    'D:(6) Expectation that competence needs will not be met': 'D:(6) Competence (-)',
    'C-O:(3) Perception of the other as facilitating autonomy needs': 'C-O:(3) Other supports autonomy',
    'A:(2) Anxious/ fearful/ tense': 'A:(2) Anxious',
    'A:(12) Ashamed, guilty': 'A:(12) Ashamed/Guilty',
    'B-O:(4) Over controlled or controlling behavior': 'B-O:(4) Controlling',
    'A:(1) Calm/ laid back': 'A:(1) Calm',
    'D:(4) Expectation that autonomy needs will not be met': 'D:(4) Autonomy (-)',
    'D:(3) Autonomy and adaptive control': 'D:(3) Autonomy',
    'A:(5) Content, happy, joy, hopeful': 'A:(5) Happy',
    'B-O:(3) Autonomous or adaptive control behavior': 'B-O:(3) Adaptive control',
    'A:(9) Justifiable anger/ assertive anger, justifiable outrage': 'A:(9) Justified anger',
    'A:(13) Feel loved, belong': 'A:(13) Loved/Belonging',
    'A:(7) Vigor / energetic': 'A:(7) Vigor'
}
clean_annotations_safe.csv ADDED
The diff for this file is too large to render. See raw diff
 
streamlit_app_LDA.py ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import json
3
+ import random
4
+ import numpy as np
5
+ from gensim import corpora, models
6
+ import pyLDAvis.gensim_models as gensimvis
7
+ import pyLDAvis
8
+ import pandas as pd
9
+ import streamlit.components.v1 as components
10
+
11
+ from MIND_utils import df_to_self_states_json, element_short_desc_map
12
+
13
+
14
# ---------------------------
# Streamlit App Layout
# ---------------------------
st.title("Prototypical Self-States via Topic Modeling")

st.sidebar.header("Model Parameters")
# Number of LDA topics (candidate prototypical self-states) to fit.
num_topics = st.sidebar.slider("Number of Topics", min_value=2, max_value=20, value=5)
# Number of training passes for gensim's LdaModel.
num_passes = st.sidebar.slider("Number of Passes", min_value=5, max_value=50, value=10)
# NOTE(review): this radio is placed on the main page, unlike the other
# parameters which live in the sidebar — confirm that placement is intentional.
lda_document_is = st.radio("A 'Document' in the topic model will correspond to a:", ("self-state", "segment"))
# Seed used both for python/numpy RNGs and as LdaModel's random_state.
seed_value = st.sidebar.number_input("Random Seed", value=42)
# How many of the highest-weighted elements to print per topic.
num_top_elements_to_show = st.sidebar.slider("# top element to show in a topic", min_value=2, max_value=15, value=5)
25
+
26
+ # ---------------------------
27
+ # Load Data
28
+ # ---------------------------
29
+ # You can also allow users to upload their file via st.file_uploader.
30
+ # @st.cache(allow_output_mutation=True)
31
def load_data():
    """Read the annotation export that ships alongside the app."""
    annotations = pd.read_csv("clean_annotations_safe.csv")
    return annotations
33
+
34
df = load_data()

# ---------------------------
# Preprocess Data: Build Documents
# ---------------------------
# Set random seeds for reproducibility. LdaModel additionally receives
# random_state=seed_value below; these cover any other stochastic steps.
random.seed(seed_value)
np.random.seed(seed_value)
42
+
43
+ # Functions to extract "words" (elements -- <dim>:<category>) from a segment / self-state
44
def extract_elements_from_selfstate(selfstate):
    """Return the "word" tokens ("<dim>:<category>") of a single self-state.

    Dimensions without an annotated (non-NaN) "Category" feature are skipped;
    the "is_adaptive" entry is metadata, not a dimension.
    """
    return [
        f"{dim}:{dim_obj['Category']}"
        for dim, dim_obj in selfstate.items()
        if dim != "is_adaptive"
        and "Category" in dim_obj
        and not pd.isna(dim_obj["Category"])
    ]
53
+
54
def extract_elements_from_segment(segment):
    """Concatenate the element tokens of every self-state in a segment."""
    return [
        token
        for selfstate in segment["self-states"]
        for token in extract_elements_from_selfstate(selfstate)
    ]
59
+
60
+ # Build a list of "documents" (one per segment)
61
+ lda_documents = []
62
+ lda_document_ids = []
63
+ for (doc_id, annotator), df_ in df.groupby(["document", "annotator"]):
64
+ doc_json = df_to_self_states_json(df_, doc_id, annotator)
65
+ ### * for Segment-level LDA-documents:
66
+ if lda_document_is == "segment":
67
+ for segment in doc_json["segments"]:
68
+ lda_doc = extract_elements_from_segment(segment)
69
+ if lda_doc: # only add if non-empty
70
+ lda_documents.append(lda_doc)
71
+ lda_document_ids.append(f"{doc_id}_seg{segment['segment']}")
72
+ ### * for SelfState-level LDA-documents:
73
+ elif lda_document_is == "self-state":
74
+ for segment in doc_json["segments"]:
75
+ for i, selfstate in enumerate(segment["self-states"]):
76
+ lda_doc = extract_elements_from_selfstate(selfstate)
77
+ if lda_doc:
78
+ lda_documents.append(lda_doc)
79
+ lda_document_ids.append(f"{doc_id}_seg{segment['segment']}_state{i+1}")
80
+
81
+ # Create a dictionary and corpus for LDA
82
+ dictionary = corpora.Dictionary(lda_documents)
83
+ corpus = [dictionary.doc2bow(doc) for doc in lda_documents]
84
+
85
+
86
+ # ---------------------------
87
+ # Run LDA Model
88
+ # ---------------------------
89
+ lda_model = models.LdaModel(corpus,
90
+ num_topics=num_topics,
91
+ id2word=dictionary,
92
+ passes=num_passes,
93
+ random_state=seed_value)
94
+
95
# ---------------------------
# Display Pretty Printed Topics
# ---------------------------
st.header("Pretty Printed Topics")

# Map each topic id to a list of (document index, topic probability).
topic_docs = {topic_id: [] for topic_id in range(lda_model.num_topics)}

# Collect the full topic distribution of every LDA document
# (minimum_probability=0 so every topic appears for every document).
for i, doc_bow in enumerate(corpus):
    for topic_id, prob in lda_model.get_document_topics(doc_bow, minimum_probability=0):
        topic_docs[topic_id].append((i, prob))

# For each topic, keep the 3 documents with the highest topic probability.
top_docs = {}
for topic_id, doc_list in topic_docs.items():
    top_docs[topic_id] = sorted(doc_list, key=lambda x: x[1], reverse=True)[:3]

# Aggregate the report into a single string.
output_str = "Identified Prototypical Self-States (Topics):\n\n"
for topic_id, topic_str in lda_model.print_topics(num_words=num_top_elements_to_show):
    output_str += f"Topic {topic_id}:\n"
    for term in topic_str.split(" + "):
        # gensim terms look like '0.123*"token"'. Fix: split only on the first
        # '*' so a token containing '*' cannot break the unpacking.
        weight, token = term.split("*", 1)
        token = token.strip().replace('"', '')
        output_str += f" {float(weight):.3f} -> {token}\n"

    output_str += " Top 3 Documents (Segment Indices) for this topic:\n"
    for doc_index, prob in top_docs[topic_id]:
        # lda_document_ids maps corpus positions back to human-readable ids.
        output_str += f" Doc {doc_index} ({lda_document_ids[doc_index]}) with probability {prob:.3f}\n"
    output_str += "-" * 60 + "\n"

# Display the aggregated report. (Fix: removed a redundant mid-file
# 'import streamlit as st' — st is already imported at the top of the file.)
st.text(output_str)
135
+
136
+
137
# ---------------------------
# Prepare and Display pyLDAvis Visualization
# ---------------------------
st.header("Interactive Topic Visualization")
# (left as reference) alternative dictionary using shortened element labels:
# vis_dict = {i: element_short_desc_map[v] for i, v in dictionary.items()}
# vis_dictionary = corpora.dictionary.Dictionary([[new_token] for new_token in vis_dict.values()])
# Compute the topic/term layout, render it as a self-contained HTML bundle,
# and embed it in the Streamlit page.
vis_data = gensimvis.prepare(lda_model, corpus, dictionary)
html_string = pyLDAvis.prepared_data_to_html(vis_data)
components.html(html_string, width=1300, height=800)