bjorn-hommel committed
Commit 818f654 · 1 Parent(s): 3f83c9c

init commit

Files changed (10)
  1. .gitattributes +1 -0
  2. .gitignore +8 -0
  3. .streamlit/config.toml +8 -0
  4. README.md +6 -8
  5. all-mpnet-base-v2.enc +3 -0
  6. app.py +152 -0
  7. config.yaml +8 -0
  8. modeling.py +76 -0
  9. psisent.enc +3 -0
  10. surveybot3000.enc +3 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.enc filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,8 @@
+ .env
+ db.parquet
+ preprocess.py
+ encrypt.py
+ surveybot3000.parquet
+ psisent.parquet
+ all-mpnet-base-v2.parquet
+ __pycache__
.streamlit/config.toml ADDED
@@ -0,0 +1,8 @@
+ [theme]
+ primaryColor="#4361ee"
+ backgroundColor="#FFFFFF"
+ secondaryBackgroundColor="#F0F2F6"
+ textColor="#262730"
+ font="sans serif"
+ [server]
+ enableStaticServing = true
README.md CHANGED
@@ -1,12 +1,10 @@
  ---
- title: Synth Net
- emoji: 👁
- colorFrom: blue
- colorTo: blue
+ title: Synthetic Nomological Net
+ emoji: 🕸️
+ colorFrom: indigo
+ colorTo: green
  sdk: streamlit
  sdk_version: 1.39.0
  app_file: app.py
- pinned: false
+ pinned: true
  ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
all-mpnet-base-v2.enc ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1835bad0cd24cc9019803c0589430fa4af320a16da456c364c22f21d51590ed2
+ size 97110456
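The three `*.enc` files are Git LFS pointers to encrypted item-embedding databases. `modeling.py` (below) decrypts them with a Fernet key and unpickles the result, so they were presumably produced by the gitignored `encrypt.py` from the likewise gitignored `*.parquet` files. A minimal sketch of that hypothetical encryption step, assuming a parquet source; only the Fernet-over-pickle layering is taken from `modeling.py`, which reverses these exact steps:

```python
# Hypothetical counterpart to the gitignored encrypt.py -- file names and the
# parquet source are assumptions, not taken from the repo.
import pickle
import pandas as pd
from cryptography.fernet import Fernet

key = Fernet.generate_key()              # URL-safe base64-encoded 32-byte key
cipher = Fernet(key)

db = pd.read_parquet("all-mpnet-base-v2.parquet")   # embeddings + scale metadata
with open("all-mpnet-base-v2.enc", "wb") as f:
    f.write(cipher.encrypt(pickle.dumps(db)))       # pickle first, then encrypt

print(key.decode())  # this key is what the app's "Encryption key" field expects
```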
app.py ADDED
@@ -0,0 +1,152 @@
+ import streamlit as st
+ import yaml
+ import pandas as pd
+ from cryptography.fernet import Fernet
+ from dotenv import load_dotenv
+ from io import StringIO
+
+ import modeling
+
+ def df_to_csv(df):
+     csv = StringIO()
+     df.to_csv(csv, index=True)
+     csv.seek(0)
+     csv_data = csv.getvalue()
+     return(csv_data)
+
+ def dict_to_yaml(data):
+     return yaml.dump(data, default_flow_style=False)
+
+ def yaml_to_dict(yaml_str):
+     return yaml.safe_load(yaml_str)
+
+ def initialize():
+     load_dotenv()
+     st.session_state.setdefault('model_names', ['SurveyBot3000', 'PsiSent', 'all-mpnet-base-v2'])
+     st.session_state.setdefault('loaded_model_name', None)
+     st.session_state.setdefault('search_query', None)
+     st.session_state.setdefault('db', None)
+     st.session_state.setdefault('results', pd.DataFrame())
+     st.session_state.setdefault('decrypt_key', None)
+     st.session_state.setdefault('valid_decrypt_key', False)
+
+     with open('config.yaml', 'r') as stream:
+         st.session_state['config'] = yaml.safe_load(stream)
+
+ def main():
+     st.set_page_config(page_title='Synth-Net')
+
+     st.markdown("# The Synthetic Nomological Net")
+     # st.markdown("#### This is a demo on how to extract trait information from responses to open-ended questions.")
+
+     st.markdown("""
+ Psychological science is experiencing rapid growth in constructs and measures, partly due to refinement and new research areas,
+ but also due to excessive proliferation. This proliferation, driven by academic incentives for novelty, may lead to redundant
+ constructs with different names (jangle fallacy) and seemingly similar constructs with little content overlap (jingle fallacy).
+
+ This web application uses state-of-the-art models and methods in natural language processing to search for semantic overlap in measures.
+ It analyzes textual data from over 21,000 scales (containing more than 330,000 items) in an effort to reduce redundancies in measures used in the behavioral sciences.
+
+ - 📖 **Preprint (Open Access)**: NA
+ - 🖊️ **Cite**: NA
+ - 🌐 **Project website**: NA
+ - 💾 **Data**: NA
+ - #️⃣ **Social Media**: NA
+
+ The web application is maintained by [magnolia psychometrics](https://www.magnolia-psychometrics.com/).
+ """, unsafe_allow_html=True)
+
+     placeholder_demo = st.empty()
+
+     show_demo(placeholder_demo)
+
+ def show_demo(placeholder):
+
+     with placeholder:
+         with st.container():
+             st.divider()
+             st.markdown("""
+ ## Try it yourself!
+ Define a scale by entering individual items in YAML format.
+ After form submission, a vector representation for the scale is calculated using the selected encoder model.
+ Cosine similarities between this vector and the representations of existing scales are then computed.
+ The resulting table outputs measures with high semantic overlap.
+ """)
+
+             with st.form("submission_form"):
+
+                 if not st.session_state['valid_decrypt_key']:
+                     with st.expander(label="Authentication", expanded=True, icon="🔑"):
+                         st.text_input(
+                             label="Encryption key",
+                             value="",
+                             max_chars=None,
+                             key='decrypt_key',
+                             placeholder="A URL-safe base64-encoded 32-byte key"
+                         )
+
+                 with st.expander(label="Model", expanded=False, icon="🧠"):
+
+                     if st.session_state['loaded_model_name'] is not None:
+                         input_model_index = st.session_state['model_names'].index(st.session_state['input_model_name'])
+                     else:
+                         input_model_index = 0
+
+                     st.selectbox(
+                         label="Select model",
+                         options=st.session_state['model_names'],
+                         index=input_model_index,
+                         key='input_model_name'
+                     )
+
+                 with st.expander(label="Search Query", expanded=True, icon="🔎"):
+                     if 'input_items' not in st.session_state:
+                         st.session_state['input_items'] = dict_to_yaml(st.session_state['config']['input_items'])
+
+                     st.text_area(
+                         label="Search for similar measures by entering items that constitute the scale (YAML-Formatted):",
+                         height=175,
+                         key='input_items'
+                     )
+
+                 submitted = st.form_submit_button(
+                     label="Search Synth-Net",
+                     type="primary",
+                     use_container_width=True
+                 )
+
+             if submitted:
+
+                 try:
+                     st.session_state['search_query'] = yaml_to_dict(st.session_state['input_items'])
+                 except yaml.YAMLError as e:
+                     st.error(f"Yikes, you better get your YAML straight! Check https://yaml.org/ for help! \n {e}")
+                     return
+
+                 try:
+                     modeling.load_model()
+                     modeling.search()
+                 except Exception as error:
+                     st.error(f"Error while loading model: {error}")
+                     return
+
+             with st.container():
+                 if not st.session_state['results'].empty:
+                     df = st.session_state['results'].style.format({
+                         'Match': '{:.2f}'.format,
+                         'Scale': str.capitalize,
+                         'Instrument': str.capitalize,
+                     })
+                     st.dataframe(df, use_container_width=True)
+
+                     st.download_button(
+                         label="Download References",
+                         data=df_to_csv(st.session_state['results']),
+                         file_name='scored_survey_responses.csv',
+                         mime='text/csv',
+                         use_container_width=True
+                     )
+
+ if __name__ == '__main__':
+     initialize()
+     main()
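The paragraph in `show_demo` above summarizes the scoring pipeline: the entered items are embedded, mean-pooled into one scale vector, and compared against stored scale representations by cosine similarity. Below is a self-contained sketch of that idea using the public `all-mpnet-base-v2` checkpoint (SurveyBot3000 and PsiSent are loaded from private paths); the stored-scale matrix is random noise standing in for the encrypted database:

```python
# Minimal sketch of the search described in show_demo(); "existing_scales" is a
# made-up stand-in for the encrypted 'ItemStemEmbeddings' database.
import numpy as np
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

# A scale is represented by the mean of its item embeddings.
items = [
    "Receiving compliments is important to me.",
    "I consider myself to be truly exceptional.",
]
query_embedding = model.encode(sentences=items).mean(axis=0)

# Two random vectors of matching dimensionality standing in for stored scales.
existing_scales = np.random.rand(2, query_embedding.shape[0]).astype(np.float32)

# One cosine similarity per stored scale; higher means more semantic overlap.
scores = util.cos_sim(a=query_embedding, b=existing_scales).squeeze()
print(scores)
```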
config.yaml ADDED
@@ -0,0 +1,8 @@
+ input_items:
+ - Receiving compliments is important to me.
+ - I rarely doubt my decisions or actions.
+ - I'm skilled at persuading others to believe what I say.
+ - Being humble doesn't suit my personality.
+ - I consider myself to be truly exceptional.
+ - Gaining recognition and status is important to me.
+ - I'm destined for remarkable achievements.
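`config.yaml` only ships the default example items shown in the "Search Query" text area. The round trip through app.py's `dict_to_yaml` / `yaml_to_dict` helpers is plain PyYAML; a quick sketch (variable names here are illustrative only):

```python
import yaml

# initialize() reads config.yaml into session state.
with open("config.yaml") as stream:
    config = yaml.safe_load(stream)

# dict_to_yaml(): renders the default items as the text-area content.
default_text = yaml.dump(config["input_items"], default_flow_style=False)

# yaml_to_dict(): despite the name, a YAML sequence parses back to a Python list,
# which is what modeling.search() passes to the encoder as `sentences`.
search_query = yaml.safe_load(default_text)
assert isinstance(search_query, list) and len(search_query) == 7
```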
modeling.py ADDED
@@ -0,0 +1,76 @@
+ import os
+ import streamlit as st
+ import logging
+ import pandas as pd
+ import numpy as np
+ import pickle
+ from cryptography.fernet import Fernet
+ from sentence_transformers import SentenceTransformer, util
+
+ def load_model():
+
+     no_model = st.session_state.get('model') is None
+     swap_model = st.session_state.get('input_model_name') != st.session_state['loaded_model_name']
+
+     if swap_model or no_model:
+
+         with st.spinner('Loading the model might take a couple of seconds...'):
+             env_local = st.session_state['input_model_name'].lower() + '_path'
+             env_remote = st.session_state['input_model_name'].lower() + '_remote_path'
+
+             if os.environ.get(env_remote):
+                 model_path = os.environ.get(env_remote)
+             else:
+                 model_path = os.getenv(env_local)
+
+             auth_token = os.environ.get('read_models') or True
+
+             st.session_state['model'] = SentenceTransformer(
+                 model_name_or_path=model_path,
+                 use_auth_token=auth_token
+             )
+
+             st.session_state['loaded_model_name'] = st.session_state['input_model_name']
+
+             logging.info(f"Loaded {st.session_state['input_model_name']}!")
+
+     with st.spinner('Loading embeddings...'):
+
+         file_path = f"./{st.session_state['input_model_name'].lower()}.enc"
+
+         with open(file_path, 'rb') as f:
+             encrypted_data = f.read()
+
+         try:
+             cipher = Fernet(st.session_state['decrypt_key'])
+             decrypted_df = cipher.decrypt(encrypted_data)
+             st.session_state['db'] = pickle.loads(decrypted_df)
+             st.session_state['valid_decrypt_key'] = True
+         except Exception as e:
+             st.error(body="Error: No valid encryption key!", icon="🔑")
+             logging.error(e)
+             return
+
+
+         # st.session_state['db'] = pd.read_parquet(
+         #     path=f"./{st.session_state['input_model_name'].lower()}.parquet"
+         # )
+
+         #decrypt_key
+
+ def search():
+
+     with st.spinner('Searching...'):
+         query_embeddings = st.session_state['model'].encode(sentences=st.session_state['search_query']).mean(axis=0)
+
+         query_scores = util.cos_sim(
+             a=np.array(query_embeddings),
+             b=st.session_state['db']['ItemStemEmbeddings']
+         ).squeeze()
+
+         st.session_state['results'] = pd.DataFrame({
+             'Match': query_scores,
+             'Scale': st.session_state['db']['ScaleName'],
+             'Instrument': st.session_state['db']['InstrumentName'],
+             'Reference': st.session_state['db']['InstrumentApaReference'],
+         }).sort_values(by='Match', ascending=False)
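`load_model()` resolves checkpoints from environment variables whose names are derived from the selected model, preferring a remote path over a local one and passing `read_models` as an auth token for private repositories. The concrete values live in the gitignored `.env` file, so the ones in this sketch are placeholders:

```python
import os

# Placeholder values only -- the real entries sit in the gitignored .env file.
os.environ["surveybot3000_remote_path"] = "some-org/surveybot3000"  # hypothetical Hub repo id
os.environ["read_models"] = "hf_placeholder_token"                  # hypothetical read token

name = "SurveyBot3000"
# Same resolution order as load_model(): remote path first, local path as fallback.
model_path = os.environ.get(f"{name.lower()}_remote_path") or os.getenv(f"{name.lower()}_path")
# `or True` falls back to whatever token huggingface_hub has cached locally.
auth_token = os.environ.get("read_models") or True

print(model_path)  # some-org/surveybot3000
```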
psisent.enc ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ed65f3ce71eae48a6f732ca92a3ffea50872c8b348272e94659c1ce74a6d2b82
+ size 97110456
surveybot3000.enc ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:53dbcafa17705290f4bd04262bf8bbedbee2133b0608004ad03cbc83444c6bda
+ size 97110456