bjorn-hommel committed on
Commit
cf1362c
·
1 Parent(s): e49e13a

added psyctest references

Browse files
Files changed (7) hide show
  1. .gitignore +3 -1
  2. all_mpnet_base_v2.enc +2 -2
  3. app.py +132 -73
  4. modeling.py +86 -45
  5. psisent.enc +2 -2
  6. requirements.txt +2 -0
  7. surveybot3000.enc +2 -2
.gitignore CHANGED
@@ -1,3 +1,4 @@
 
1
  .env
2
  db.parquet
3
  preprocess.py
@@ -5,4 +6,5 @@ encrypt.py
5
  surveybot3000.parquet
6
  psisent.parquet
7
  all_mpnet_base_v2.parquet
8
- __pycache__
 
 
1
+ psyctest_doi.parquet
2
  .env
3
  db.parquet
4
  preprocess.py
 
6
  surveybot3000.parquet
7
  psisent.parquet
8
  all_mpnet_base_v2.parquet
9
+ __pycache__
10
+ **tmp.**
all_mpnet_base_v2.enc CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1835bad0cd24cc9019803c0589430fa4af320a16da456c364c22f21d51590ed2
3
- size 97110456
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:87cc6e38ec15b4ac377d3e85c6b17e403a5ec9e57ee1b3371e543ecf998e09a0
3
+ size 91447948
app.py CHANGED
@@ -1,8 +1,9 @@
1
  import os
2
  import streamlit as st
3
  import yaml
 
4
  import pandas as pd
5
- from cryptography.fernet import Fernet
6
  from dotenv import load_dotenv
7
  from io import StringIO
8
 
@@ -22,36 +23,51 @@ def yaml_to_dict(yaml_str):
22
  return yaml.safe_load(yaml_str)
23
 
24
  def initialize():
 
 
25
  load_dotenv()
 
26
  st.session_state.setdefault('model_names', ['SurveyBot3000', 'PsiSent', 'all_mpnet_base_v2'])
 
27
  st.session_state.setdefault('loaded_model_name', None)
28
  st.session_state.setdefault('search_query', None)
29
  st.session_state.setdefault('db', None)
30
- st.session_state.setdefault('results', pd.DataFrame())
 
 
31
 
32
  with open('config.yaml', 'r') as stream:
33
  st.session_state['config'] = yaml.safe_load(stream)
34
 
35
- if os.environ.get('decrypt_key'):
36
- decrypt_key = os.environ.get('decrypt_key')
37
- st.session_state.setdefault('decrypt_key', decrypt_key)
 
38
  else:
39
- st.session_state.setdefault('decrypt_key', None)
40
 
41
  def main():
42
  st.set_page_config(page_title='Synth-Net')
43
 
44
  st.markdown("# The Synthetic Nomological Net")
45
- # st.markdown("#### This is a demo on how to extract trait information from responses to open-ended questions.")
46
-
47
  st.markdown("""
48
- Psychological science is experiencing rapid growth in constructs and measures, partly due to refinement and new research areas,
49
- but also due to excessive proliferation. This proliferation, driven by academic incentives for novelty, may lead to redundant
50
  constructs with different names (jangle fallacy) and seemingly similar constructs with little content overlap (jingle fallacy).
51
 
52
  This web application uses state-of-the-art models and methods in natural language processing to search for semantic overlap in measures.
53
  It analyzes textual data from over 21,000 scales (containing more than 330,000 items) in an effort to reduce redundancies in measures used in the behavioral sciences.
 
 
 
 
54
 
 
 
 
 
 
 
55
  - 📖 **Preprint (Open Access)**: NA
56
  - 🖊️ **Cite**: NA
57
  - 🌐 **Project website**: NA
@@ -61,95 +77,138 @@ def main():
61
  The web application is maintained by [magnolia psychometrics](https://www.magnolia-psychometrics.com/).
62
  """, unsafe_allow_html=True)
63
 
64
- placeholder_demo = st.empty()
 
 
65
 
66
- show_demo(placeholder_demo)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
 
68
  def show_demo(placeholder):
69
 
70
  with placeholder:
71
  with st.container():
72
  st.divider()
73
- st.markdown("""
74
  ## Try it yourself!
75
- Define a scale by entering individual items in YAML format.
76
  After form submission, a vector representation for the scale is calculated using the selected encoder model.
77
  Cosine similarities between this vector and the representations of existing scales are then computed.
78
- The resulting table outputs measures with high semantic overlap.
79
  """)
80
 
81
- with st.form("submission_form"):
82
-
83
- with st.expander(label="Authentication", expanded=True, icon="🔑"):
84
- st.text_input(
85
- label="Encryption key",
86
- value="",
87
- max_chars=None,
88
- key='decrypt_key',
89
- placeholder="A URL-safe base64-encoded 32-byte key"
90
- )
91
 
92
- with st.expander(label="Model", expanded=False, icon="🧠"):
 
 
 
 
 
 
93
 
94
- if st.session_state['loaded_model_name'] is not None:
95
- input_model_index = st.session_state['model_names'].index(st.session_state['input_model_name'])
96
- else:
97
- input_model_index = 0
98
 
99
- st.selectbox(
100
- label="Select model",
101
- options=st.session_state['model_names'],
102
- index=input_model_index,
103
- key='input_model_name'
104
- )
105
-
106
- with st.expander(label="Search Query", expanded=True, icon="🔎"):
107
- if 'input_items' not in st.session_state:
108
- st.session_state['input_items'] = dict_to_yaml(st.session_state['config']['input_items'])
109
 
 
110
  st.text_area(
111
  label="Search for similar measures by entering items that constitute the scale (YAML-Formatted):",
112
  height=175,
113
  key='input_items'
114
  )
115
 
116
- submitted = st.form_submit_button(
117
- label="Search Synth-Net",
118
- type="primary",
119
- use_container_width=True
120
- )
121
 
122
- if submitted:
123
 
124
- try:
125
- st.session_state['search_query'] = yaml_to_dict(st.session_state['input_items'])
126
- except yaml.YAMLError as e:
127
- st.error(f"Yikes, you better get your YAML straight! Check https://yaml.org/ for help! \n {e}")
128
- return
 
 
 
 
 
 
 
129
 
130
- try:
131
- modeling.load_model()
132
  modeling.search()
133
- except Exception as error:
134
- st.error(f"Error while loading model: {error}")
135
- return
136
-
137
- with st.container():
138
- if not st.session_state['results'].empty:
139
- df = st.session_state['results'].style.format({
140
- 'Match': '{:.2f}'.format,
141
- 'Scale': str.capitalize,
142
- 'Instrument': str.capitalize,
143
- })
144
- st.dataframe(df, use_container_width=True, hide_index=True)
145
-
146
- st.download_button(
147
- label="Download References",
148
- data=df_to_csv(st.session_state['results']),
149
- file_name='scored_survey_responses.csv',
150
- mime='text/csv',
151
- use_container_width=True
152
- )
 
 
 
 
 
 
 
 
 
153
 
154
  if __name__ == '__main__':
155
  initialize()
 
1
  import os
2
  import streamlit as st
3
  import yaml
4
+ import logging
5
  import pandas as pd
6
+ from cryptography.fernet import Fernet, InvalidToken
7
  from dotenv import load_dotenv
8
  from io import StringIO
9
 
 
23
  return yaml.safe_load(yaml_str)
24
 
25
  def initialize():
26
+
27
+ logging.basicConfig(level=logging.INFO)
28
  load_dotenv()
29
+
30
  st.session_state.setdefault('model_names', ['SurveyBot3000', 'PsiSent', 'all_mpnet_base_v2'])
31
+ st.session_state.setdefault('input_model_name', st.session_state['model_names'][0])
32
  st.session_state.setdefault('loaded_model_name', None)
33
  st.session_state.setdefault('search_query', None)
34
  st.session_state.setdefault('db', None)
35
+ st.session_state.setdefault('search_results', pd.DataFrame())
36
+ st.session_state.setdefault('explore_plot', None)
37
+ st.session_state.setdefault('is_authenticated', False)
38
 
39
  with open('config.yaml', 'r') as stream:
40
  st.session_state['config'] = yaml.safe_load(stream)
41
 
42
+ if os.environ.get('encryption_key'):
43
+ encryption_key = os.environ.get('encryption_key')
44
+ st.session_state.setdefault('encryption_key', encryption_key)
45
+ # st.session_state.setdefault('encryption_key', None)
46
  else:
47
+ st.session_state.setdefault('encryption_key', None)
48
 
49
  def main():
50
  st.set_page_config(page_title='Synth-Net')
51
 
52
  st.markdown("# The Synthetic Nomological Net")
 
 
53
  st.markdown("""
54
+ Psychological science is experiencing rapid growth in constructs and measures, partly due to refinement and new research areas,
55
+ but also due to excessive proliferation. This proliferation, driven by academic incentives for novelty, may lead to redundant
56
  constructs with different names (jangle fallacy) and seemingly similar constructs with little content overlap (jingle fallacy).
57
 
58
  This web application uses state-of-the-art models and methods in natural language processing to search for semantic overlap in measures.
59
  It analyzes textual data from over 21,000 scales (containing more than 330,000 items) in an effort to reduce redundancies in measures used in the behavioral sciences.
60
+ """, unsafe_allow_html=True)
61
+
62
+ placeholder_authentication = st.empty()
63
+ placeholder_demo = st.empty()
64
 
65
+ if st.session_state['is_authenticated']:
66
+ show_demo(placeholder_demo)
67
+ else:
68
+ show_authentication(placeholder_authentication)
69
+
70
+ st.markdown("""
71
  - 📖 **Preprint (Open Access)**: NA
72
  - 🖊️ **Cite**: NA
73
  - 🌐 **Project website**: NA
 
77
  The web application is maintained by [magnolia psychometrics](https://www.magnolia-psychometrics.com/).
78
  """, unsafe_allow_html=True)
79
 
80
+ def show_authentication(placeholder):
81
+ with placeholder:
82
+ with st.container():
83
 
84
+ with st.form("authentication_form"):
85
+
86
+ st.markdown("""
87
+ ## Authentication
88
+ This app is a research preview and requires authentication.
89
+ All data is encrypted. Please use your 32-byte encryption key to proceed!
90
+ """)
91
+
92
+ st.text_input(
93
+ label="🔑 Encryption key",
94
+ value="",
95
+ max_chars=None,
96
+ key='encryption_key',
97
+ placeholder="A URL-safe base64-encoded 32-byte key"
98
+ )
99
+
100
+ submitted = st.form_submit_button(
101
+ label="Authenticate",
102
+ type="primary",
103
+ use_container_width=True
104
+ )
105
+
106
+ if submitted:
107
+ try:
108
+ modeling.load_db()
109
+ st.rerun()
110
+ except InvalidToken:
111
+ error = f"Error: The encryption key you have entered is invalid (**{st.session_state['encryption_key']}**)!"
112
+ st.error(body=error, icon="🔑")
113
+ logging.error(error)
114
+ st.session_state['is_authenticated'] = False
115
+ return
116
+ except ValueError as error:
117
+ st.error(body=error, icon="🔑")
118
+ logging.error(error)
119
+ st.session_state['is_authenticated'] = False
120
+ return
121
 
122
  def show_demo(placeholder):
123
 
124
  with placeholder:
125
  with st.container():
126
  st.divider()
127
+ st.markdown("""
128
  ## Try it yourself!
129
+ Define a scale by entering individual items in YAML format.
130
  After form submission, a vector representation for the scale is calculated using the selected encoder model.
131
  Cosine similarities between this vector and the representations of existing scales are then computed.
132
+ The resulting table outputs measures with high semantic overlap.
133
  """)
134
 
135
+ if st.session_state['loaded_model_name'] is not None:
136
+ input_model_index = st.session_state['model_names'].index(st.session_state['input_model_name'])
137
+ else:
138
+ input_model_index = 0
 
 
 
 
 
 
139
 
140
+ st.selectbox(
141
+ label="Select model",
142
+ options=st.session_state['model_names'],
143
+ index=input_model_index,
144
+ placeholder="Choose a model",
145
+ key='input_model_name'
146
+ )
147
 
148
+ tab1, tab2 = st.tabs(["🔎 Search for scales", "🕸️ Explore the synthetic nomological net"])
 
 
 
149
 
150
+ with tab1:
151
+ if 'input_items' not in st.session_state:
152
+ st.session_state['input_items'] = dict_to_yaml(st.session_state['config']['input_items'])
 
 
 
 
 
 
 
153
 
154
+ with st.form("submission_form"):
155
  st.text_area(
156
  label="Search for similar measures by entering items that constitute the scale (YAML-Formatted):",
157
  height=175,
158
  key='input_items'
159
  )
160
 
161
+ submitted = st.form_submit_button(
162
+ label="Search Synth-Net",
163
+ type="primary",
164
+ use_container_width=True
165
+ )
166
 
167
+ if submitted:
168
 
169
+ try:
170
+ st.session_state['search_query'] = yaml_to_dict(st.session_state['input_items'])
171
+ except yaml.YAMLError as e:
172
+ st.error(f"Yikes, you better get your YAML straight! Check https://yaml.org/ for help! \n {e}")
173
+ return
174
+
175
+ no_model = st.session_state.get('model') is None
176
+ swap_model = st.session_state.get('input_model_name') != st.session_state['loaded_model_name']
177
+
178
+ if swap_model or no_model:
179
+ modeling.load_db()
180
+ modeling.load_model()
181
 
 
 
182
  modeling.search()
183
+
184
+
185
+ with st.container():
186
+ if not st.session_state['search_results'].empty:
187
+ with st.spinner('Rendering search results...'):
188
+ df = st.session_state['search_results'].style.format({
189
+ 'Match': '{:.2f}'.format,
190
+ 'Scale': str.capitalize,
191
+ 'Instrument': str.capitalize,
192
+ })
193
+ st.dataframe(df, use_container_width=True, hide_index=True)
194
+
195
+ with tab2:
196
+ with st.container():
197
+ modeling.explore()
198
+ if st.session_state['explore_plot']:
199
+ st.plotly_chart(
200
+ figure_or_data=st.session_state['explore_plot'],
201
+ use_container_width=True
202
+ )
203
+
204
+ # if not st.session_state['search_results'].empty:
205
+ # st.download_button(
206
+ # label="Download References",
207
+ # data=df_to_csv(st.session_state['search_results']),
208
+ # file_name='scored_survey_responses.csv',
209
+ # mime='text/csv',
210
+ # use_container_width=True
211
+ # )
212
 
213
  if __name__ == '__main__':
214
  initialize()
modeling.py CHANGED
@@ -4,72 +4,113 @@ import logging
4
  import pandas as pd
5
  import numpy as np
6
  import pickle
 
 
 
7
  from cryptography.fernet import Fernet
8
  from sentence_transformers import SentenceTransformer, util
 
9
 
10
- def load_model():
11
 
12
- no_model = st.session_state.get('model') is None
13
- swap_model = st.session_state.get('input_model_name') != st.session_state['loaded_model_name']
14
 
15
- if swap_model or no_model:
16
-
17
- with st.spinner('Loading the model might take a couple of seconds...'):
18
- env_local = st.session_state['input_model_name'].lower() + '_path'
19
- env_remote = st.session_state['input_model_name'].lower() + '_remote_path'
20
-
21
- if os.environ.get(env_remote):
22
- model_path = os.environ.get(env_remote)
23
- else:
24
- model_path = os.getenv(env_local)
25
 
26
- auth_token = os.environ.get('read_models') or True
27
 
28
- st.session_state['model'] = SentenceTransformer(
29
- model_name_or_path=model_path,
30
- use_auth_token=auth_token
31
- )
32
 
33
- st.session_state['loaded_model_name'] = st.session_state['input_model_name']
 
 
34
 
35
- logging.info(f"Loaded {st.session_state['input_model_name']}!")
36
-
37
- with st.spinner('Loading embeddings...'):
38
 
39
- file_path = f"./{st.session_state['input_model_name'].lower()}.enc"
40
 
41
- with open(file_path, 'rb') as f:
42
- encrypted_data = f.read()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
 
44
- try:
45
- cipher = Fernet(st.session_state['decrypt_key'])
46
- decrypted_df = cipher.decrypt(encrypted_data)
47
- st.session_state['db'] = pickle.loads(decrypted_df)
48
- except Exception as e:
49
- st.error(body="Error: No valid encryption key!", icon="🔑")
50
- logging.error(e)
51
- return
52
 
53
-
54
- # st.session_state['db'] = pd.read_parquet(
55
- # path=f"./{st.session_state['input_model_name'].lower()}.parquet"
56
- # )
57
 
58
- #decrypt_key
59
 
60
  def search():
61
-
62
- with st.spinner('Searching...'):
63
- query_embeddings = st.session_state['model'].encode(sentences=st.session_state['search_query']).mean(axis=0)
64
 
 
 
 
 
 
 
65
  query_scores = util.cos_sim(
66
  a=np.array(query_embeddings),
67
- b=st.session_state['db']['ItemStemEmbeddings']
68
  ).squeeze()
69
 
70
- st.session_state['results'] = pd.DataFrame({
71
  'Match': query_scores,
72
  'Scale': st.session_state['db']['ScaleName'],
73
  'Instrument': st.session_state['db']['InstrumentName'],
74
- 'Reference': st.session_state['db']['InstrumentApaReference'],
75
- }).sort_values(by='Match', ascending=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  import pandas as pd
5
  import numpy as np
6
  import pickle
7
+ import numpy as np
8
+ from bertopic import BERTopic
9
+ from umap import UMAP
10
  from cryptography.fernet import Fernet
11
  from sentence_transformers import SentenceTransformer, util
12
+ from pdb import set_trace as trace
13
 
 
14
 
15
+ def load_db():
 
16
 
17
+ with st.spinner('Loading pre-computed embeddings...'):
18
+ if st.session_state['input_model_name']:
19
+ file_path = f"./{st.session_state['input_model_name'].lower()}.enc"
20
+ else:
21
+ file_path = f"./{st.session_state['model_names'][0].lower()}.enc"
 
 
 
 
 
22
 
23
+ logging.info(f"Loading data from {file_path}!")
24
 
25
+ with open(file_path, 'rb') as f:
26
+ encrypted_data = f.read()
 
 
27
 
28
+ cipher = Fernet(st.session_state['encryption_key'])
29
+ decrypted_df = cipher.decrypt(encrypted_data)
30
+ st.session_state['db'] = pickle.loads(decrypted_df)
31
 
32
+ st.session_state['is_authenticated'] = True
33
+ logging.info(f"Loaded {file_path}!")
 
34
 
 
35
 
36
+ def load_model():
37
+
38
+ with st.spinner('Loading the model...'):
39
+ env_local = st.session_state['input_model_name'].lower() + '_path'
40
+ env_remote = st.session_state['input_model_name'].lower(
41
+ ) + '_remote_path'
42
+
43
+ if os.environ.get(env_remote):
44
+ model_path = os.environ.get(env_remote)
45
+ else:
46
+ model_path = os.getenv(env_local)
47
+
48
+ logging.info(f"Loading model from {model_path}!")
49
+
50
+ auth_token = os.environ.get('read_models') or True
51
+
52
+ st.session_state['model'] = SentenceTransformer(
53
+ model_name_or_path=model_path,
54
+ token=auth_token
55
+ )
56
 
57
+ st.session_state['loaded_model_name'] = st.session_state['input_model_name']
 
 
 
 
 
 
 
58
 
59
+ logging.info(f"Loaded {st.session_state['input_model_name']}!")
 
 
 
60
 
 
61
 
62
  def search():
 
 
 
63
 
64
+ with st.spinner('Searching the synthetic net...'):
65
+ query_embeddings = st.session_state['model'].encode(
66
+ sentences=st.session_state['search_query']).mean(axis=0)
67
+
68
+ item_embeddings = np.vstack(
69
+ st.session_state['db']['ItemStemEmbeddings'])
70
  query_scores = util.cos_sim(
71
  a=np.array(query_embeddings),
72
+ b=item_embeddings
73
  ).squeeze()
74
 
75
+ st.session_state['search_results'] = pd.DataFrame({
76
  'Match': query_scores,
77
  'Scale': st.session_state['db']['ScaleName'],
78
  'Instrument': st.session_state['db']['InstrumentName'],
79
+ 'Reference': st.session_state['db']['psyctest_doi'],
80
+ }).sort_values(by='Match', ascending=False)
81
+
82
+
83
+ def explore():
84
+
85
+ df = st.session_state['db']
86
+ message = f'Modeling synthetic construct space for {df.shape[0]} scales...'
87
+ logging.info(message)
88
+
89
+ with st.spinner(message):
90
+
91
+ documents = [f'{x}\n{y}' for x, y in zip(
92
+ df.ScaleName.tolist(), df.InstrumentName.tolist())]
93
+ embeddings = np.stack(df.ItemStemEmbeddings.to_numpy())
94
+
95
+ topic_model = BERTopic().fit(
96
+ documents=documents,
97
+ embeddings=embeddings
98
+ )
99
+
100
+ reduced_embeddings = UMAP(
101
+ n_neighbors=10,
102
+ n_components=2,
103
+ min_dist=0.0,
104
+ metric='cosine'
105
+ ).fit_transform(embeddings)
106
+
107
+ st.session_state['explore_plot'] = topic_model.visualize_documents(
108
+ docs=documents,
109
+ reduced_embeddings=reduced_embeddings,
110
+ hide_annotations=True,
111
+ hide_document_hover=False,
112
+ custom_labels=False,
113
+ title="The Synthetic Nomological Net",
114
+ width=1500,
115
+ height=1500
116
+ )
psisent.enc CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ed65f3ce71eae48a6f732ca92a3ffea50872c8b348272e94659c1ce74a6d2b82
3
- size 97110456
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3c278cff21708369f5026353cb18d001eb6a4707702f27ef9e410988dc297247
3
+ size 91447948
requirements.txt CHANGED
@@ -5,4 +5,6 @@ sentence_transformers==2.7.0
5
  sentencepiece==0.1.99
6
  altair==4.2.2
7
  cryptography==41.0.1
 
 
8
  python-dotenv
 
5
  sentencepiece==0.1.99
6
  altair==4.2.2
7
  cryptography==41.0.1
8
+ matplotlib_venn==1.1.1
9
+ bertopic==0.16.1
10
  python-dotenv
surveybot3000.enc CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:53dbcafa17705290f4bd04262bf8bbedbee2133b0608004ad03cbc83444c6bda
3
- size 97110456
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b2817f75bf07ae5bba7c02af4494ead8a904847955e0a94bf8cdb4b6fd90004f
3
+ size 91447948