bjorn-hommel committed
Commit 818f654 · 1 Parent(s): 3f83c9c

init commit

Files changed (10)
  1. .gitattributes +1 -0
  2. .gitignore +8 -0
  3. .streamlit/config.toml +8 -0
  4. README.md +6 -8
  5. all-mpnet-base-v2.enc +3 -0
  6. app.py +152 -0
  7. config.yaml +8 -0
  8. modeling.py +76 -0
  9. psisent.enc +3 -0
  10. surveybot3000.enc +3 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.enc filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,8 @@
+ .env
+ db.parquet
+ preprocess.py
+ encrypt.py
+ surveybot3000.parquet
+ psisent.parquet
+ all-mpnet-base-v2.parquet
+ __pycache__
.streamlit/config.toml ADDED
@@ -0,0 +1,8 @@
+ [theme]
+ primaryColor="#4361ee"
+ backgroundColor="#FFFFFF"
+ secondaryBackgroundColor="#F0F2F6"
+ textColor="#262730"
+ font="sans serif"
+ [server]
+ enableStaticServing = true
README.md CHANGED
@@ -1,12 +1,10 @@
  ---
- title: Synth Net
- emoji: 👁
- colorFrom: blue
- colorTo: blue
+ title: Synthetic Nomological Net
+ emoji: 🕸️
+ colorFrom: indigo
+ colorTo: green
  sdk: streamlit
  sdk_version: 1.39.0
  app_file: app.py
- pinned: false
+ pinned: true
  ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
all-mpnet-base-v2.enc ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1835bad0cd24cc9019803c0589430fa4af320a16da456c364c22f21d51590ed2
+ size 97110456
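The three `*.enc` files are Git LFS pointers to encrypted item-embedding databases. `modeling.py` (below) decrypts them with a Fernet key and unpickles the result, so they were presumably produced by the gitignored `encrypt.py` from the likewise gitignored `*.parquet` files. A minimal sketch of that hypothetical encryption step, assuming a parquet source; only the Fernet-over-pickle layering is taken from `modeling.py`, which reverses these exact steps:

```python
# Hypothetical counterpart to the gitignored encrypt.py -- file names and the
# parquet source are assumptions, not taken from the repo.
import pickle
import pandas as pd
from cryptography.fernet import Fernet

key = Fernet.generate_key()              # URL-safe base64-encoded 32-byte key
cipher = Fernet(key)

db = pd.read_parquet("all-mpnet-base-v2.parquet")   # embeddings + scale metadata
with open("all-mpnet-base-v2.enc", "wb") as f:
    f.write(cipher.encrypt(pickle.dumps(db)))       # pickle first, then encrypt

print(key.decode())  # this key is what the app's "Encryption key" field expects
```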
app.py ADDED
@@ -0,0 +1,152 @@
+ import streamlit as st
+ import yaml
+ import pandas as pd
+ from cryptography.fernet import Fernet
+ from dotenv import load_dotenv
+ from io import StringIO
+
+ import modeling
+
+ def df_to_csv(df):
+     csv = StringIO()
+     df.to_csv(csv, index=True)
+     csv.seek(0)
+     csv_data = csv.getvalue()
+     return(csv_data)
+
+ def dict_to_yaml(data):
+     return yaml.dump(data, default_flow_style=False)
+
+ def yaml_to_dict(yaml_str):
+     return yaml.safe_load(yaml_str)
+
+ def initialize():
+     load_dotenv()
+     st.session_state.setdefault('model_names', ['SurveyBot3000', 'PsiSent', 'all-mpnet-base-v2'])
+     st.session_state.setdefault('loaded_model_name', None)
+     st.session_state.setdefault('search_query', None)
+     st.session_state.setdefault('db', None)
+     st.session_state.setdefault('results', pd.DataFrame())
+     st.session_state.setdefault('decrypt_key', None)
+     st.session_state.setdefault('valid_decrypt_key', False)
+
+     with open('config.yaml', 'r') as stream:
+         st.session_state['config'] = yaml.safe_load(stream)
+
+ def main():
+     st.set_page_config(page_title='Synth-Net')
+
+     st.markdown("# The Synthetic Nomological Net")
+     # st.markdown("#### This is a demo on how to extract trait information from responses to open-ended questions.")
+
+     st.markdown("""
+ Psychological science is experiencing rapid growth in constructs and measures, partly due to refinement and new research areas,
+ but also due to excessive proliferation. This proliferation, driven by academic incentives for novelty, may lead to redundant
+ constructs with different names (jangle fallacy) and seemingly similar constructs with little content overlap (jingle fallacy).
+
+ This web application uses state-of-the-art models and methods in natural language processing to search for semantic overlap in measures.
+ It analyzes textual data from over 21,000 scales (containing more than 330,000 items) in an effort to reduce redundancies in measures used in the behavioral sciences.
+
+ - 📖 **Preprint (Open Access)**: NA
+ - 🖊️ **Cite**: NA
+ - 🌐 **Project website**: NA
+ - 💾 **Data**: NA
+ - #️⃣ **Social Media**: NA
+
+ The web application is maintained by [magnolia psychometrics](https://www.magnolia-psychometrics.com/).
+ """, unsafe_allow_html=True)
+
+     placeholder_demo = st.empty()
+
+     show_demo(placeholder_demo)
+
+ def show_demo(placeholder):
+
+     with placeholder:
+         with st.container():
+             st.divider()
+             st.markdown("""
+ ## Try it yourself!
+ Define a scale by entering individual items in YAML format.
+ After form submission, a vector representation for the scale is calculated using the selected encoder model.
+ Cosine similarities between this vector and the representations of existing scales are then computed.
+ The resulting table outputs measures with high semantic overlap.
+ """)
+
+             with st.form("submission_form"):
+
+                 if not st.session_state['valid_decrypt_key']:
+                     with st.expander(label="Authentication", expanded=True, icon="🔑"):
+                         st.text_input(
+                             label="Encryption key",
+                             value="",
+                             max_chars=None,
+                             key='decrypt_key',
+                             placeholder="A URL-safe base64-encoded 32-byte key"
+                         )
+
+                 with st.expander(label="Model", expanded=False, icon="🧠"):
+
+                     if st.session_state['loaded_model_name'] is not None:
+                         input_model_index = st.session_state['model_names'].index(st.session_state['input_model_name'])
+                     else:
+                         input_model_index = 0
+
+                     st.selectbox(
+                         label="Select model",
+                         options=st.session_state['model_names'],
+                         index=input_model_index,
+                         key='input_model_name'
+                     )
+
+                 with st.expander(label="Search Query", expanded=True, icon="🔎"):
+                     if 'input_items' not in st.session_state:
+                         st.session_state['input_items'] = dict_to_yaml(st.session_state['config']['input_items'])
+
+                     st.text_area(
+                         label="Search for similar measures by entering items that constitute the scale (YAML-Formatted):",
+                         height=175,
+                         key='input_items'
+                     )
+
+                 submitted = st.form_submit_button(
+                     label="Search Synth-Net",
+                     type="primary",
+                     use_container_width=True
+                 )
+
+             if submitted:
+
+                 try:
+                     st.session_state['search_query'] = yaml_to_dict(st.session_state['input_items'])
+                 except yaml.YAMLError as e:
+                     st.error(f"Yikes, you better get your YAML straight! Check https://yaml.org/ for help! \n {e}")
+                     return
+
+                 try:
+                     modeling.load_model()
+                     modeling.search()
+                 except Exception as error:
+                     st.error(f"Error while loading model: {error}")
+                     return
+
+             with st.container():
+                 if not st.session_state['results'].empty:
+                     df = st.session_state['results'].style.format({
+                         'Match': '{:.2f}'.format,
+                         'Scale': str.capitalize,
+                         'Instrument': str.capitalize,
+                     })
+                     st.dataframe(df, use_container_width=True)
+
+                     st.download_button(
+                         label="Download References",
+                         data=df_to_csv(st.session_state['results']),
+                         file_name='scored_survey_responses.csv',
+                         mime='text/csv',
+                         use_container_width=True
+                     )
+
+ if __name__ == '__main__':
+     initialize()
+     main()
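The paragraph in `show_demo` above summarizes the scoring pipeline: the entered items are embedded, mean-pooled into one scale vector, and compared against stored scale representations by cosine similarity. Below is a self-contained sketch of that idea using the public `all-mpnet-base-v2` checkpoint (SurveyBot3000 and PsiSent are loaded from private paths); the stored-scale matrix is random noise standing in for the encrypted database:

```python
# Minimal sketch of the search described in show_demo(); "existing_scales" is a
# made-up stand-in for the encrypted 'ItemStemEmbeddings' database.
import numpy as np
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

# A scale is represented by the mean of its item embeddings.
items = [
    "Receiving compliments is important to me.",
    "I consider myself to be truly exceptional.",
]
query_embedding = model.encode(sentences=items).mean(axis=0)

# Two random vectors of matching dimensionality standing in for stored scales.
existing_scales = np.random.rand(2, query_embedding.shape[0]).astype(np.float32)

# One cosine similarity per stored scale; higher means more semantic overlap.
scores = util.cos_sim(a=query_embedding, b=existing_scales).squeeze()
print(scores)
```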
config.yaml ADDED
@@ -0,0 +1,8 @@
+ input_items:
+ - Receiving compliments is important to me.
+ - I rarely doubt my decisions or actions.
+ - I'm skilled at persuading others to believe what I say.
+ - Being humble doesn't suit my personality.
+ - I consider myself to be truly exceptional.
+ - Gaining recognition and status is important to me.
+ - I'm destined for remarkable achievements.
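`config.yaml` only ships the default example items shown in the "Search Query" text area. The round trip through app.py's `dict_to_yaml` / `yaml_to_dict` helpers is plain PyYAML; a quick sketch (variable names here are illustrative only):

```python
import yaml

# initialize() reads config.yaml into session state.
with open("config.yaml") as stream:
    config = yaml.safe_load(stream)

# dict_to_yaml(): renders the default items as the text-area content.
default_text = yaml.dump(config["input_items"], default_flow_style=False)

# yaml_to_dict(): despite the name, a YAML sequence parses back to a Python list,
# which is what modeling.search() passes to the encoder as `sentences`.
search_query = yaml.safe_load(default_text)
assert isinstance(search_query, list) and len(search_query) == 7
```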
modeling.py ADDED
@@ -0,0 +1,76 @@
+ import os
+ import streamlit as st
+ import logging
+ import pandas as pd
+ import numpy as np
+ import pickle
+ from cryptography.fernet import Fernet
+ from sentence_transformers import SentenceTransformer, util
+
+ def load_model():
+
+     no_model = st.session_state.get('model') is None
+     swap_model = st.session_state.get('input_model_name') != st.session_state['loaded_model_name']
+
+     if swap_model or no_model:
+
+         with st.spinner('Loading the model might take a couple of seconds...'):
+             env_local = st.session_state['input_model_name'].lower() + '_path'
+             env_remote = st.session_state['input_model_name'].lower() + '_remote_path'
+
+             if os.environ.get(env_remote):
+                 model_path = os.environ.get(env_remote)
+             else:
+                 model_path = os.getenv(env_local)
+
+             auth_token = os.environ.get('read_models') or True
+
+             st.session_state['model'] = SentenceTransformer(
+                 model_name_or_path=model_path,
+                 use_auth_token=auth_token
+             )
+
+             st.session_state['loaded_model_name'] = st.session_state['input_model_name']
+
+             logging.info(f"Loaded {st.session_state['input_model_name']}!")
+
+     with st.spinner('Loading embeddings...'):
+
+         file_path = f"./{st.session_state['input_model_name'].lower()}.enc"
+
+         with open(file_path, 'rb') as f:
+             encrypted_data = f.read()
+
+         try:
+             cipher = Fernet(st.session_state['decrypt_key'])
+             decrypted_df = cipher.decrypt(encrypted_data)
+             st.session_state['db'] = pickle.loads(decrypted_df)
+             st.session_state['valid_decrypt_key'] = True
+         except Exception as e:
+             st.error(body="Error: No valid encryption key!", icon="🔑")
+             logging.error(e)
+             return
+
+
+         # st.session_state['db'] = pd.read_parquet(
+         #     path=f"./{st.session_state['input_model_name'].lower()}.parquet"
+         # )
+
+         #decrypt_key
+
+ def search():
+
+     with st.spinner('Searching...'):
+         query_embeddings = st.session_state['model'].encode(sentences=st.session_state['search_query']).mean(axis=0)
+
+         query_scores = util.cos_sim(
+             a=np.array(query_embeddings),
+             b=st.session_state['db']['ItemStemEmbeddings']
+         ).squeeze()
+
+         st.session_state['results'] = pd.DataFrame({
+             'Match': query_scores,
+             'Scale': st.session_state['db']['ScaleName'],
+             'Instrument': st.session_state['db']['InstrumentName'],
+             'Reference': st.session_state['db']['InstrumentApaReference'],
+         }).sort_values(by='Match', ascending=False)
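`load_model()` resolves checkpoints from environment variables whose names are derived from the selected model, preferring a remote path over a local one and passing `read_models` as an auth token for private repositories. The concrete values live in the gitignored `.env` file, so the ones in this sketch are placeholders:

```python
import os

# Placeholder values only -- the real entries sit in the gitignored .env file.
os.environ["surveybot3000_remote_path"] = "some-org/surveybot3000"  # hypothetical Hub repo id
os.environ["read_models"] = "hf_placeholder_token"                  # hypothetical read token

name = "SurveyBot3000"
# Same resolution order as load_model(): remote path first, local path as fallback.
model_path = os.environ.get(f"{name.lower()}_remote_path") or os.getenv(f"{name.lower()}_path")
# `or True` falls back to whatever token huggingface_hub has cached locally.
auth_token = os.environ.get("read_models") or True

print(model_path)  # some-org/surveybot3000
```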
psisent.enc ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ed65f3ce71eae48a6f732ca92a3ffea50872c8b348272e94659c1ce74a6d2b82
+ size 97110456
surveybot3000.enc ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:53dbcafa17705290f4bd04262bf8bbedbee2133b0608004ad03cbc83444c6bda
+ size 97110456