Commit
·
818f654
1
Parent(s):
3f83c9c
init commit
Browse files- .gitattributes +1 -0
- .gitignore +8 -0
- .streamlit/config.toml +8 -0
- README.md +6 -8
- all-mpnet-base-v2.enc +3 -0
- app.py +152 -0
- config.yaml +8 -0
- modeling.py +76 -0
- psisent.enc +3 -0
- surveybot3000.enc +3 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
*.enc filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
.env
|
2 |
+
db.parquet
|
3 |
+
preprocess.py
|
4 |
+
encrypt.py
|
5 |
+
surveybot3000.parquet
|
6 |
+
psisent.parquet
|
7 |
+
all-mpnet-base-v2.parquet
|
8 |
+
__pycache__
|
.streamlit/config.toml
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[theme]
|
2 |
+
primaryColor="#4361ee"
|
3 |
+
backgroundColor="#FFFFFF"
|
4 |
+
secondaryBackgroundColor="#F0F2F6"
|
5 |
+
textColor="#262730"
|
6 |
+
font="sans serif"
|
7 |
+
[server]
|
8 |
+
enableStaticServing = true
|
README.md
CHANGED
@@ -1,12 +1,10 @@
|
|
1 |
---
|
2 |
-
title:
|
3 |
-
emoji:
|
4 |
-
colorFrom:
|
5 |
-
colorTo:
|
6 |
sdk: streamlit
|
7 |
sdk_version: 1.39.0
|
8 |
app_file: app.py
|
9 |
-
pinned:
|
10 |
-
---
|
11 |
-
|
12 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
1 |
---
|
2 |
+
title: Synthetic Nomological Net
|
3 |
+
emoji: 🕸️
|
4 |
+
colorFrom: indigo
|
5 |
+
colorTo: green
|
6 |
sdk: streamlit
|
7 |
sdk_version: 1.39.0
|
8 |
app_file: app.py
|
9 |
+
pinned: true
|
10 |
+
---
|
|
|
|
all-mpnet-base-v2.enc
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1835bad0cd24cc9019803c0589430fa4af320a16da456c364c22f21d51590ed2
|
3 |
+
size 97110456
|
app.py
ADDED
@@ -0,0 +1,152 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import yaml
|
3 |
+
import pandas as pd
|
4 |
+
from cryptography.fernet import Fernet
|
5 |
+
from dotenv import load_dotenv
|
6 |
+
from io import StringIO
|
7 |
+
|
8 |
+
import modeling
|
9 |
+
|
10 |
+
def df_to_csv(df):
    """Serialize a DataFrame to CSV text, index column included.

    Args:
        df: Object exposing a pandas-style ``to_csv(buffer, index=...)`` method.

    Returns:
        str: Everything ``df.to_csv`` wrote into the in-memory buffer.
    """
    with StringIO() as buffer:
        df.to_csv(buffer, index=True)
        # getvalue() returns the whole buffer regardless of stream position.
        return buffer.getvalue()
|
16 |
+
|
17 |
+
def dict_to_yaml(data):
    """Render ``data`` as YAML text in block (non-flow) style.

    Args:
        data: Any object ``yaml.dump`` can serialize.

    Returns:
        str: The YAML document produced by ``yaml.dump``.
    """
    yaml_text = yaml.dump(data, default_flow_style=False)
    return yaml_text
|
19 |
+
|
20 |
+
def yaml_to_dict(yaml_str):
    """Parse YAML text with the safe loader.

    Args:
        yaml_str: YAML document as text.

    Returns:
        Whatever object the document describes; despite the function name
        this is not necessarily a dict (the bundled default query is a list).

    Raises:
        yaml.YAMLError: If the text is not valid YAML.
    """
    parsed = yaml.safe_load(yaml_str)
    return parsed
|
22 |
+
|
23 |
+
def initialize():
    """Prepare environment variables, session-state defaults and the config.

    Loads variables from a ``.env`` file, seeds ``st.session_state`` with a
    default for every key the app reads, and stores the parsed ``config.yaml``
    under ``st.session_state['config']``.
    """
    load_dotenv()

    # setdefault only fills in keys that are missing, so values already held
    # in the session state are left untouched.
    session_defaults = {
        'model_names': ['SurveyBot3000', 'PsiSent', 'all-mpnet-base-v2'],
        'loaded_model_name': None,
        'search_query': None,
        'db': None,
        'results': pd.DataFrame(),
        'decrypt_key': None,
        'valid_decrypt_key': False,
    }
    for state_key, default_value in session_defaults.items():
        st.session_state.setdefault(state_key, default_value)

    with open('config.yaml', 'r') as config_file:
        st.session_state['config'] = yaml.safe_load(config_file)
|
35 |
+
|
36 |
+
def main():
    """Render the page title and introduction, then hand over to the demo."""
    st.set_page_config(page_title='Synth-Net')
    st.markdown("# The Synthetic Nomological Net")

    # The literal is kept flush-left so its content matches the markdown
    # source exactly.
    intro_markdown = """
Psychological science is experiencing rapid growth in constructs and measures, partly due to refinement and new research areas,
but also due to excessive proliferation. This proliferation, driven by academic incentives for novelty, may lead to redundant
constructs with different names (jangle fallacy) and seemingly similar constructs with little content overlap (jingle fallacy).

This web application uses state-of-the-art models and methods in natural language processing to search for semantic overlap in measures.
It analyzes textual data from over 21,000 scales (containing more than 330,000 items) in an effort to reduce redundancies in measures used in the behavioral sciences.

- 📖 **Preprint (Open Access)**: NA
- 🖊️ **Cite**: NA
- 🌐 **Project website**: NA
- 💾 **Data**: NA
- #️⃣ **Social Media**: NA

The web application is maintained by [magnolia psychometrics](https://www.magnolia-psychometrics.com/).
"""
    st.markdown(intro_markdown, unsafe_allow_html=True)

    # Single-element slot that show_demo() draws the interactive part into.
    demo_slot = st.empty()
    show_demo(demo_slot)
|
62 |
+
|
63 |
+
def _normalize_query(parsed):
    """Return parsed YAML as a non-empty list of item strings, else None."""
    if isinstance(parsed, str):
        # A single un-dashed sentence is treated as a one-item scale.
        parsed = [parsed]
    if not isinstance(parsed, list) or not parsed:
        return None
    if not all(isinstance(item, str) and item.strip() for item in parsed):
        return None
    return parsed


def show_demo(placeholder):
    """Render the search form, run a submitted query and show the results.

    Reads ``valid_decrypt_key``, ``loaded_model_name``, ``model_names``,
    ``config``, ``input_items`` and ``results`` from ``st.session_state`` and
    writes ``search_query`` (plus the widget-backed keys ``decrypt_key``,
    ``input_model_name`` and ``input_items``).

    Args:
        placeholder: Streamlit element usable as a context manager (``main``
            passes the result of ``st.empty()``); the demo is drawn inside it.

    Returns:
        None. Returns early, after showing an error, when the query is not
        valid YAML, is not a list of item strings, or when loading/searching
        fails; in those cases the results table is not rendered.
    """
    with placeholder:
        with st.container():
            st.divider()
            st.markdown("""
## Try it yourself!
Define a scale by entering individual items in YAML format.
After form submission, a vector representation for the scale is calculated using the selected encoder model.
Cosine similarities between this vector and the representations of existing scales are then computed.
The resulting table outputs measures with high semantic overlap.
""")

            with st.form("submission_form"):

                # The key is only requested until a decryption has succeeded.
                if not st.session_state['valid_decrypt_key']:
                    with st.expander(label="Authentication", expanded=True, icon="🔑"):
                        st.text_input(
                            label="Encryption key",
                            value="",
                            max_chars=None,
                            key='decrypt_key',
                            placeholder="A URL-safe base64-encoded 32-byte key"
                        )

                with st.expander(label="Model", expanded=False, icon="🧠"):
                    model_names = st.session_state['model_names']
                    previous_choice = st.session_state.get('input_model_name')

                    # Pre-select the earlier choice once a model has been
                    # loaded; fall back to the first entry when that choice
                    # is missing or unknown instead of raising.
                    model_was_loaded = st.session_state['loaded_model_name'] is not None
                    if model_was_loaded and previous_choice in model_names:
                        input_model_index = model_names.index(previous_choice)
                    else:
                        input_model_index = 0

                    st.selectbox(
                        label="Select model",
                        options=model_names,
                        index=input_model_index,
                        key='input_model_name'
                    )

                with st.expander(label="Search Query", expanded=True, icon="🔎"):
                    # Seed the text area with the example items from the config.
                    if 'input_items' not in st.session_state:
                        st.session_state['input_items'] = dict_to_yaml(
                            st.session_state['config']['input_items']
                        )

                    st.text_area(
                        label="Search for similar measures by entering items that constitute the scale (YAML-Formatted):",
                        height=175,
                        key='input_items'
                    )

                submitted = st.form_submit_button(
                    label="Search Synth-Net",
                    type="primary",
                    use_container_width=True
                )

            if submitted:

                try:
                    parsed_query = yaml_to_dict(st.session_state['input_items'])
                except yaml.YAMLError as e:
                    st.error(f"Yikes, you better get your YAML straight! Check https://yaml.org/ for help! \n {e}")
                    return

                # Valid YAML can still be a mapping, a number or empty; the
                # search averages one embedding per item, so it needs a list
                # of sentences.
                search_query = _normalize_query(parsed_query)
                if search_query is None:
                    st.error("Please enter the items as a YAML list of sentences, one '- item text' per line.")
                    return
                st.session_state['search_query'] = search_query

                try:
                    modeling.load_model()
                except Exception as error:
                    st.error(f"Error while loading model: {error}")
                    return

                # load_model() reports an invalid key itself and returns
                # without raising; without embeddings there is nothing to
                # search, so stop instead of failing on a missing database.
                if st.session_state.get('db') is None:
                    return

                try:
                    modeling.search()
                except Exception as error:
                    st.error(f"Error while searching: {error}")
                    return

            with st.container():
                if not st.session_state['results'].empty:
                    styled_results = st.session_state['results'].style.format({
                        'Match': '{:.2f}'.format,
                        'Scale': str.capitalize,
                        'Instrument': str.capitalize,
                    })
                    st.dataframe(styled_results, use_container_width=True)

                    st.download_button(
                        label="Download References",
                        data=df_to_csv(st.session_state['results']),
                        file_name='scored_survey_responses.csv',
                        mime='text/csv',
                        use_container_width=True
                    )
|
149 |
+
|
150 |
+
if __name__ == '__main__':
    # Session-state defaults and the config must exist before main() renders
    # the widgets that read them.
    initialize()
    main()
|
config.yaml
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
input_items:
|
2 |
+
- Receiving compliments is important to me.
|
3 |
+
- I rarely doubt my decisions or actions.
|
4 |
+
- I'm skilled at persuading others to believe what I say.
|
5 |
+
- Being humble doesn't suit my personality.
|
6 |
+
- I consider myself to be truly exceptional.
|
7 |
+
- Gaining recognition and status is important to me.
|
8 |
+
- I'm destined for remarkable achievements.
|
modeling.py
ADDED
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import streamlit as st
|
3 |
+
import logging
|
4 |
+
import pandas as pd
|
5 |
+
import numpy as np
|
6 |
+
import pickle
|
7 |
+
from cryptography.fernet import Fernet
|
8 |
+
from sentence_transformers import SentenceTransformer, util
|
9 |
+
|
10 |
+
def load_model():
    """Load the selected encoder model and its encrypted embedding database.

    Reads ``input_model_name``, ``loaded_model_name``, ``model``, ``db`` and
    ``decrypt_key`` from ``st.session_state``; writes ``model``,
    ``loaded_model_name``, ``db`` and ``valid_decrypt_key``.

    The model location comes from the environment variable
    ``<model name in lower case>_remote_path`` or, when that is unset or
    empty, ``<model name in lower case>_path``. The embeddings are read from
    ``./<model name in lower case>.enc`` and decrypted with the session's
    Fernet key.

    The model is (re)loaded when none is loaded or the selection changed. The
    embeddings are (re)loaded in the same cases and whenever no database is
    held, so a retry after a wrong key only repeats the decryption step.

    Returns:
        None. When decryption or unpickling fails the error is shown with
        ``st.error`` and logged, ``db`` is cleared, ``valid_decrypt_key`` is
        reset, and the function returns without raising.

    Raises:
        RuntimeError: If neither model path environment variable is set.
        OSError: If the encrypted embeddings file cannot be read.
    """
    model_name = st.session_state.get('input_model_name')
    no_model = st.session_state.get('model') is None
    swap_model = model_name != st.session_state['loaded_model_name']
    no_db = st.session_state.get('db') is None

    if swap_model or no_model:

        with st.spinner('Loading the model might take a couple of seconds...'):
            env_local = model_name.lower() + '_path'
            env_remote = model_name.lower() + '_remote_path'

            # The remote location wins; the local one is the fallback.
            model_path = os.environ.get(env_remote) or os.getenv(env_local)
            if not model_path:
                raise RuntimeError(
                    f"No model path configured: set {env_remote} or {env_local}"
                )

            # Use the configured token if present, otherwise pass True.
            auth_token = os.environ.get('read_models') or True

            st.session_state['model'] = SentenceTransformer(
                model_name_or_path=model_path,
                use_auth_token=auth_token
            )

        st.session_state['loaded_model_name'] = model_name

        logging.info(f"Loaded {model_name}!")

    if not (swap_model or no_model or no_db):
        # The embeddings for the active model are already in memory.
        return

    with st.spinner('Loading embeddings...'):

        file_path = f"./{model_name.lower()}.enc"

        with open(file_path, 'rb') as f:
            encrypted_data = f.read()

        try:
            cipher = Fernet(st.session_state['decrypt_key'])
            decrypted_bytes = cipher.decrypt(encrypted_data)
            # NOTE(security): pickle.loads can execute arbitrary code. It is
            # only reached after Fernet has authenticated the payload, so the
            # file must have been produced by someone holding the key.
            st.session_state['db'] = pickle.loads(decrypted_bytes)
            st.session_state['valid_decrypt_key'] = True
        except Exception as e:
            # Drop any database belonging to a previously selected model so
            # it is never searched with the new encoder, and make the key
            # field visible again for a retry.
            st.session_state['db'] = None
            st.session_state['valid_decrypt_key'] = False
            st.error(body="Error: No valid encryption key!", icon="🔑")
            logging.error(e)
            return
|
53 |
+
|
54 |
+
|
55 |
+
# st.session_state['db'] = pd.read_parquet(
|
56 |
+
# path=f"./{st.session_state['input_model_name'].lower()}.parquet"
|
57 |
+
# )
|
58 |
+
|
59 |
+
#decrypt_key
|
60 |
+
|
61 |
+
def search():
    """Score every stored scale against the current query and rank them.

    Reads ``model``, ``search_query`` and ``db`` from ``st.session_state`` and
    writes ``results``: a DataFrame with the columns ``Match``, ``Scale``,
    ``Instrument`` and ``Reference``, sorted by ``Match`` in descending order.
    """
    state = st.session_state

    with st.spinner('Searching...'):
        item_embeddings = state['model'].encode(sentences=state['search_query'])
        # Collapse the per-item vectors into one vector for the whole scale.
        query_embeddings = item_embeddings.mean(axis=0)

        db = state['db']
        similarity = util.cos_sim(
            a=np.array(query_embeddings),
            b=db['ItemStemEmbeddings']
        )
        query_scores = similarity.squeeze()

        ranking = pd.DataFrame({
            'Match': query_scores,
            'Scale': db['ScaleName'],
            'Instrument': db['InstrumentName'],
            'Reference': db['InstrumentApaReference'],
        })
        state['results'] = ranking.sort_values(by='Match', ascending=False)
|
psisent.enc
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ed65f3ce71eae48a6f732ca92a3ffea50872c8b348272e94659c1ce74a6d2b82
|
3 |
+
size 97110456
|
surveybot3000.enc
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:53dbcafa17705290f4bd04262bf8bbedbee2133b0608004ad03cbc83444c6bda
|
3 |
+
size 97110456
|