Commit
·
cf1362c
1
Parent(s):
e49e13a
added psyctest references
Browse files- .gitignore +3 -1
- all_mpnet_base_v2.enc +2 -2
- app.py +132 -73
- modeling.py +86 -45
- psisent.enc +2 -2
- requirements.txt +2 -0
- surveybot3000.enc +2 -2
.gitignore
CHANGED
@@ -1,3 +1,4 @@
|
|
|
|
1 |
.env
|
2 |
db.parquet
|
3 |
preprocess.py
|
@@ -5,4 +6,5 @@ encrypt.py
|
|
5 |
surveybot3000.parquet
|
6 |
psisent.parquet
|
7 |
all_mpnet_base_v2.parquet
|
8 |
-
__pycache__
|
|
|
|
1 |
+
psyctest_doi.parquet
|
2 |
.env
|
3 |
db.parquet
|
4 |
preprocess.py
|
|
|
6 |
surveybot3000.parquet
|
7 |
psisent.parquet
|
8 |
all_mpnet_base_v2.parquet
|
9 |
+
__pycache__
|
10 |
+
**tmp.**
|
all_mpnet_base_v2.enc
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:87cc6e38ec15b4ac377d3e85c6b17e403a5ec9e57ee1b3371e543ecf998e09a0
|
3 |
+
size 91447948
|
app.py
CHANGED
@@ -1,8 +1,9 @@
|
|
1 |
import os
|
2 |
import streamlit as st
|
3 |
import yaml
|
|
|
4 |
import pandas as pd
|
5 |
-
from cryptography.fernet import Fernet
|
6 |
from dotenv import load_dotenv
|
7 |
from io import StringIO
|
8 |
|
@@ -22,36 +23,51 @@ def yaml_to_dict(yaml_str):
|
|
22 |
return yaml.safe_load(yaml_str)
|
23 |
|
24 |
def initialize():
|
|
|
|
|
25 |
load_dotenv()
|
|
|
26 |
st.session_state.setdefault('model_names', ['SurveyBot3000', 'PsiSent', 'all_mpnet_base_v2'])
|
|
|
27 |
st.session_state.setdefault('loaded_model_name', None)
|
28 |
st.session_state.setdefault('search_query', None)
|
29 |
st.session_state.setdefault('db', None)
|
30 |
-
st.session_state.setdefault('
|
|
|
|
|
31 |
|
32 |
with open('config.yaml', 'r') as stream:
|
33 |
st.session_state['config'] = yaml.safe_load(stream)
|
34 |
|
35 |
-
if os.environ.get('
|
36 |
-
|
37 |
-
st.session_state.setdefault('
|
|
|
38 |
else:
|
39 |
-
st.session_state.setdefault('
|
40 |
|
41 |
def main():
|
42 |
st.set_page_config(page_title='Synth-Net')
|
43 |
|
44 |
st.markdown("# The Synthetic Nomological Net")
|
45 |
-
# st.markdown("#### This is a demo on how to extract trait information from responses to open-ended questions.")
|
46 |
-
|
47 |
st.markdown("""
|
48 |
-
Psychological science is experiencing rapid growth in constructs and measures, partly due to refinement and new research areas,
|
49 |
-
but also due to excessive proliferation. This proliferation, driven by academic incentives for novelty, may lead to redundant
|
50 |
constructs with different names (jangle fallacy) and seemingly similar constructs with little content overlap (jingle fallacy).
|
51 |
|
52 |
This web application uses state-of-the-art models and methods in natural language processing to search for semantic overlap in measures.
|
53 |
It analyzes textual data from over 21,000 scales (containing more than 330,000 items) in an effort to reduce redundancies in measures used in the behavioral sciences.
|
|
|
|
|
|
|
|
|
54 |
|
|
|
|
|
|
|
|
|
|
|
|
|
55 |
- 📖 **Preprint (Open Access)**: NA
|
56 |
- 🖊️ **Cite**: NA
|
57 |
- 🌐 **Project website**: NA
|
@@ -61,95 +77,138 @@ def main():
|
|
61 |
The web application is maintained by [magnolia psychometrics](https://www.magnolia-psychometrics.com/).
|
62 |
""", unsafe_allow_html=True)
|
63 |
|
64 |
-
|
|
|
|
|
65 |
|
66 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
67 |
|
68 |
def show_demo(placeholder):
|
69 |
|
70 |
with placeholder:
|
71 |
with st.container():
|
72 |
st.divider()
|
73 |
-
st.markdown("""
|
74 |
## Try it yourself!
|
75 |
-
Define a scale by entering individual items in YAML format.
|
76 |
After form submission, a vector representation for the scale is calculated using the selected encoder model.
|
77 |
Cosine similarities between this vector and the representations of existing scales are then computed.
|
78 |
-
The resulting table outputs measures with high semantic overlap.
|
79 |
""")
|
80 |
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
label="Encryption key",
|
86 |
-
value="",
|
87 |
-
max_chars=None,
|
88 |
-
key='decrypt_key',
|
89 |
-
placeholder="A URL-safe base64-encoded 32-byte key"
|
90 |
-
)
|
91 |
|
92 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
93 |
|
94 |
-
|
95 |
-
input_model_index = st.session_state['model_names'].index(st.session_state['input_model_name'])
|
96 |
-
else:
|
97 |
-
input_model_index = 0
|
98 |
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
index=input_model_index,
|
103 |
-
key='input_model_name'
|
104 |
-
)
|
105 |
-
|
106 |
-
with st.expander(label="Search Query", expanded=True, icon="🔎"):
|
107 |
-
if 'input_items' not in st.session_state:
|
108 |
-
st.session_state['input_items'] = dict_to_yaml(st.session_state['config']['input_items'])
|
109 |
|
|
|
110 |
st.text_area(
|
111 |
label="Search for similar measures by entering items that constitute the scale (YAML-Formatted):",
|
112 |
height=175,
|
113 |
key='input_items'
|
114 |
)
|
115 |
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
|
122 |
-
|
123 |
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
129 |
|
130 |
-
try:
|
131 |
-
modeling.load_model()
|
132 |
modeling.search()
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
153 |
|
154 |
if __name__ == '__main__':
|
155 |
initialize()
|
|
|
1 |
import os
|
2 |
import streamlit as st
|
3 |
import yaml
|
4 |
+
import logging
|
5 |
import pandas as pd
|
6 |
+
from cryptography.fernet import Fernet, InvalidToken
|
7 |
from dotenv import load_dotenv
|
8 |
from io import StringIO
|
9 |
|
|
|
23 |
return yaml.safe_load(yaml_str)
|
24 |
|
25 |
def initialize():
    """Prepare logging, environment variables and session-state defaults.

    Loads variables from a ``.env`` file, seeds every session-state key the
    app reads later (existing values are left untouched), parses
    ``config.yaml`` into ``st.session_state['config']`` and pre-fills the
    encryption key from the ``encryption_key`` environment variable when
    that variable is set to a non-empty value.
    """
    logging.basicConfig(level=logging.INFO)
    load_dotenv()

    state = st.session_state

    # 'model_names' has to exist first: the default model is its first entry.
    state.setdefault('model_names', ['SurveyBot3000', 'PsiSent', 'all_mpnet_base_v2'])

    defaults = {
        'input_model_name': state['model_names'][0],
        'loaded_model_name': None,
        'search_query': None,
        'db': None,
        'search_results': pd.DataFrame(),
        'explore_plot': None,
        'is_authenticated': False,
    }
    for name, value in defaults.items():
        state.setdefault(name, value)

    # The config is re-read and overwritten on every call (plain assignment).
    with open('config.yaml', 'r') as config_file:
        state['config'] = yaml.safe_load(config_file)

    # An unset or empty environment variable both end up as None.
    state.setdefault('encryption_key', os.environ.get('encryption_key') or None)
|
48 |
|
49 |
def main():
|
50 |
st.set_page_config(page_title='Synth-Net')
|
51 |
|
52 |
st.markdown("# The Synthetic Nomological Net")
|
|
|
|
|
53 |
st.markdown("""
|
54 |
+
Psychological science is experiencing rapid growth in constructs and measures, partly due to refinement and new research areas,
|
55 |
+
but also due to excessive proliferation. This proliferation, driven by academic incentives for novelty, may lead to redundant
|
56 |
constructs with different names (jangle fallacy) and seemingly similar constructs with little content overlap (jingle fallacy).
|
57 |
|
58 |
This web application uses state-of-the-art models and methods in natural language processing to search for semantic overlap in measures.
|
59 |
It analyzes textual data from over 21,000 scales (containing more than 330,000 items) in an effort to reduce redundancies in measures used in the behavioral sciences.
|
60 |
+
""", unsafe_allow_html=True)
|
61 |
+
|
62 |
+
placeholder_authentication = st.empty()
|
63 |
+
placeholder_demo = st.empty()
|
64 |
|
65 |
+
if st.session_state['is_authenticated']:
|
66 |
+
show_demo(placeholder_demo)
|
67 |
+
else:
|
68 |
+
show_authentication(placeholder_authentication)
|
69 |
+
|
70 |
+
st.markdown("""
|
71 |
- 📖 **Preprint (Open Access)**: NA
|
72 |
- 🖊️ **Cite**: NA
|
73 |
- 🌐 **Project website**: NA
|
|
|
77 |
The web application is maintained by [magnolia psychometrics](https://www.magnolia-psychometrics.com/).
|
78 |
""", unsafe_allow_html=True)
|
79 |
|
80 |
+
def show_authentication(placeholder):
    """Render the authentication form and unlock the app on a valid key.

    The form stores the entered key in ``st.session_state['encryption_key']``.
    On submission ``modeling.load_db()`` is called; when it succeeds the
    script is rerun, otherwise an error is displayed and logged and
    ``st.session_state['is_authenticated']`` is reset to ``False``.

    Args:
        placeholder: Streamlit container (created with ``st.empty()`` by the
            caller) that the form is rendered into.
    """
    with placeholder:
        with st.container():

            with st.form("authentication_form"):

                st.markdown("""
                    ## Authentication
                    This app is a research preview and requires authentication.
                    All data is encrypted. Please use your 32-byte encryption key to proceed!
                """)

                st.text_input(
                    label="🔑 Encryption key",
                    value="",
                    max_chars=None,
                    key='encryption_key',
                    placeholder="A URL-safe base64-encoded 32-byte key"
                )

                submitted = st.form_submit_button(
                    label="Authenticate",
                    type="primary",
                    use_container_width=True
                )

                if not submitted:
                    return

                try:
                    modeling.load_db()
                except InvalidToken:
                    # Never echo the entered key back: it is a secret and
                    # would otherwise end up on screen and in the log files.
                    error = "Error: The encryption key you have entered is invalid!"
                    st.error(body=error, icon="🔑")
                    logging.error(error)
                    st.session_state['is_authenticated'] = False
                    return
                except ValueError as error:
                    # Raised for keys that are not well-formed Fernet keys.
                    st.error(body=str(error), icon="🔑")
                    logging.error(error)
                    st.session_state['is_authenticated'] = False
                    return
                else:
                    # Only the load itself is guarded; rerun once it succeeded
                    # so the caller can switch to the authenticated view.
                    st.rerun()
|
121 |
|
122 |
def show_demo(placeholder):
    """Render the interactive demo: model picker, search tab and explore tab.

    The search tab parses the YAML items entered by the user, (re)loads the
    database and the model when none is loaded or the selection changed,
    runs ``modeling.search()`` and shows ``st.session_state['search_results']``.
    The explore tab calls ``modeling.explore()`` and shows the resulting plot.

    Args:
        placeholder: Streamlit container (created with ``st.empty()`` by the
            caller) that the demo is rendered into.
    """
    with placeholder:
        with st.container():
            st.divider()
            st.markdown("""
                ## Try it yourself!
                Define a scale by entering individual items in YAML format.
                After form submission, a vector representation for the scale is calculated using the selected encoder model.
                Cosine similarities between this vector and the representations of existing scales are then computed.
                The resulting table outputs measures with high semantic overlap.
            """)

            # Pre-select the current choice once a model has been loaded.
            if st.session_state['loaded_model_name'] is not None:
                input_model_index = st.session_state['model_names'].index(st.session_state['input_model_name'])
            else:
                input_model_index = 0

            st.selectbox(
                label="Select model",
                options=st.session_state['model_names'],
                index=input_model_index,
                placeholder="Choose a model",
                key='input_model_name'
            )

            tab1, tab2 = st.tabs(["🔎 Search for scales", "🕸️ Explore the synthetic nomological net"])

            with tab1:
                if 'input_items' not in st.session_state:
                    st.session_state['input_items'] = dict_to_yaml(st.session_state['config']['input_items'])

                with st.form("submission_form"):
                    st.text_area(
                        label="Search for similar measures by entering items that constitute the scale (YAML-Formatted):",
                        height=175,
                        key='input_items'
                    )

                    submitted = st.form_submit_button(
                        label="Search Synth-Net",
                        type="primary",
                        use_container_width=True
                    )

                    if submitted:

                        try:
                            st.session_state['search_query'] = yaml_to_dict(st.session_state['input_items'])
                        except yaml.YAMLError as e:
                            st.error(f"Yikes, you better get your YAML straight! Check https://yaml.org/ for help! \n {e}")
                            return

                        # An empty text area parses to None (or an empty
                        # collection); there is nothing to encode in that case.
                        if not st.session_state['search_query']:
                            st.error("Please enter at least one item before searching.")
                            return

                        no_model = st.session_state.get('model') is None
                        swap_model = st.session_state.get('input_model_name') != st.session_state['loaded_model_name']

                        # The embeddings file is model-specific, so the
                        # database is reloaded together with the model.
                        if swap_model or no_model:
                            modeling.load_db()
                            modeling.load_model()

                        modeling.search()

                with st.container():
                    if not st.session_state['search_results'].empty:
                        with st.spinner('Rendering search results...'):
                            df = st.session_state['search_results'].style.format({
                                'Match': '{:.2f}'.format,
                                'Scale': str.capitalize,
                                'Instrument': str.capitalize,
                            })
                            st.dataframe(df, use_container_width=True, hide_index=True)

            with tab2:
                with st.container():
                    # NOTE(review): explore() is invoked on every script run;
                    # consider caching its result per loaded database.
                    modeling.explore()
                    # Test for presence explicitly instead of relying on the
                    # truthiness of the figure object.
                    if st.session_state['explore_plot'] is not None:
                        st.plotly_chart(
                            figure_or_data=st.session_state['explore_plot'],
                            use_container_width=True
                        )
|
203 |
+
|
204 |
+
# if not st.session_state['search_results'].empty:
|
205 |
+
# st.download_button(
|
206 |
+
# label="Download References",
|
207 |
+
# data=df_to_csv(st.session_state['search_results']),
|
208 |
+
# file_name='scored_survey_responses.csv',
|
209 |
+
# mime='text/csv',
|
210 |
+
# use_container_width=True
|
211 |
+
# )
|
212 |
|
213 |
if __name__ == '__main__':
|
214 |
initialize()
|
modeling.py
CHANGED
@@ -4,72 +4,113 @@ import logging
|
|
4 |
import pandas as pd
|
5 |
import numpy as np
|
6 |
import pickle
|
|
|
|
|
|
|
7 |
from cryptography.fernet import Fernet
|
8 |
from sentence_transformers import SentenceTransformer, util
|
|
|
9 |
|
10 |
-
def load_model():
|
11 |
|
12 |
-
|
13 |
-
swap_model = st.session_state.get('input_model_name') != st.session_state['loaded_model_name']
|
14 |
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
if os.environ.get(env_remote):
|
22 |
-
model_path = os.environ.get(env_remote)
|
23 |
-
else:
|
24 |
-
model_path = os.getenv(env_local)
|
25 |
|
26 |
-
|
27 |
|
28 |
-
|
29 |
-
|
30 |
-
use_auth_token=auth_token
|
31 |
-
)
|
32 |
|
33 |
-
|
|
|
|
|
34 |
|
35 |
-
|
36 |
-
|
37 |
-
with st.spinner('Loading embeddings...'):
|
38 |
|
39 |
-
file_path = f"./{st.session_state['input_model_name'].lower()}.enc"
|
40 |
|
41 |
-
|
42 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
43 |
|
44 |
-
|
45 |
-
cipher = Fernet(st.session_state['decrypt_key'])
|
46 |
-
decrypted_df = cipher.decrypt(encrypted_data)
|
47 |
-
st.session_state['db'] = pickle.loads(decrypted_df)
|
48 |
-
except Exception as e:
|
49 |
-
st.error(body="Error: No valid encryption key!", icon="🔑")
|
50 |
-
logging.error(e)
|
51 |
-
return
|
52 |
|
53 |
-
|
54 |
-
# st.session_state['db'] = pd.read_parquet(
|
55 |
-
# path=f"./{st.session_state['input_model_name'].lower()}.parquet"
|
56 |
-
# )
|
57 |
|
58 |
-
#decrypt_key
|
59 |
|
60 |
def search():
|
61 |
-
|
62 |
-
with st.spinner('Searching...'):
|
63 |
-
query_embeddings = st.session_state['model'].encode(sentences=st.session_state['search_query']).mean(axis=0)
|
64 |
|
|
|
|
|
|
|
|
|
|
|
|
|
65 |
query_scores = util.cos_sim(
|
66 |
a=np.array(query_embeddings),
|
67 |
-
b=
|
68 |
).squeeze()
|
69 |
|
70 |
-
st.session_state['
|
71 |
'Match': query_scores,
|
72 |
'Scale': st.session_state['db']['ScaleName'],
|
73 |
'Instrument': st.session_state['db']['InstrumentName'],
|
74 |
-
'Reference': st.session_state['db']['
|
75 |
-
}).sort_values(by='Match', ascending=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4 |
import pandas as pd
|
5 |
import numpy as np
|
6 |
import pickle
|
7 |
+
import numpy as np
|
8 |
+
from bertopic import BERTopic
|
9 |
+
from umap import UMAP
|
10 |
from cryptography.fernet import Fernet
|
11 |
from sentence_transformers import SentenceTransformer, util
|
12 |
+
from pdb import set_trace as trace
|
13 |
|
|
|
14 |
|
15 |
+
def load_db():
    """Decrypt the embeddings file of the selected model into session state.

    Reads ``./<model name in lower case>.enc`` (falling back to the first
    entry of ``model_names`` when no model is selected), decrypts it with the
    Fernet key stored in ``st.session_state['encryption_key']``, unpickles the
    result into ``st.session_state['db']`` and marks the session as
    authenticated. Errors from the key or the decryption are not handled
    here and propagate to the caller.
    """
    with st.spinner('Loading pre-computed embeddings...'):
        selected = st.session_state['input_model_name']
        model_name = selected if selected else st.session_state['model_names'][0]
        file_path = f"./{model_name.lower()}.enc"

        logging.info(f"Loading data from {file_path}!")

        with open(file_path, 'rb') as handle:
            ciphertext = handle.read()

        fernet = Fernet(st.session_state['encryption_key'])
        plaintext = fernet.decrypt(ciphertext)

        # NOTE(review): pickle.loads can execute arbitrary code; this is only
        # acceptable because the payload must first decrypt with the key.
        st.session_state['db'] = pickle.loads(plaintext)

        # Reaching this point means the key decrypted the file successfully.
        st.session_state['is_authenticated'] = True
        logging.info(f"Loaded {file_path}!")
|
|
|
34 |
|
|
|
35 |
|
36 |
+
def load_model():
    """Load the selected sentence-transformer model into session state.

    The model location is read from the environment variable
    ``<model>_remote_path`` and, when that is unset or empty, from
    ``<model>_path`` (``<model>`` being the selected model name in lower
    case). The loaded model is stored in ``st.session_state['model']`` and
    its name in ``st.session_state['loaded_model_name']``.
    """
    with st.spinner('Loading the model...'):
        selected = st.session_state['input_model_name']
        prefix = selected.lower()
        env_local = prefix + '_path'
        env_remote = prefix + '_remote_path'

        # The remote location wins whenever it is configured.
        model_path = os.environ.get(env_remote) or os.getenv(env_local)

        logging.info(f"Loading model from {model_path}!")

        # Falls back to True when no 'read_models' token is configured.
        auth_token = os.environ.get('read_models') or True

        st.session_state['model'] = SentenceTransformer(
            model_name_or_path=model_path,
            token=auth_token
        )

        st.session_state['loaded_model_name'] = st.session_state['input_model_name']

        logging.info(f"Loaded {st.session_state['input_model_name']}!")
|
|
|
|
|
|
|
60 |
|
|
|
61 |
|
62 |
def search():
    """Score every database row against the current search query.

    Encodes ``st.session_state['search_query']`` with the loaded model,
    averages the resulting embeddings along axis 0 and computes the cosine
    similarity between that mean vector and the stacked
    ``ItemStemEmbeddings`` column. The result is written to
    ``st.session_state['search_results']``, sorted by descending match.
    """
    with st.spinner('Searching the synthetic net...'):
        db = st.session_state['db']

        encoded_items = st.session_state['model'].encode(
            sentences=st.session_state['search_query'])
        query_embeddings = encoded_items.mean(axis=0)

        item_embeddings = np.vstack(db['ItemStemEmbeddings'])

        similarities = util.cos_sim(
            a=np.array(query_embeddings),
            b=item_embeddings
        )
        query_scores = similarities.squeeze()

        results = pd.DataFrame({
            'Match': query_scores,
            'Scale': db['ScaleName'],
            'Instrument': db['InstrumentName'],
            'Reference': db['psyctest_doi'],
        })
        st.session_state['search_results'] = results.sort_values(
            by='Match', ascending=False)
|
81 |
+
|
82 |
+
|
83 |
+
def explore():
    """Build the document plot of the loaded database.

    Fits a BERTopic model on one document per database row (scale name and
    instrument name joined by a newline) using the stored
    ``ItemStemEmbeddings``, projects those embeddings to two dimensions with
    UMAP and stores the resulting figure in
    ``st.session_state['explore_plot']``.
    """
    db = st.session_state['db']
    message = f'Modeling synthetic construct space for {db.shape[0]} scales...'
    logging.info(message)

    with st.spinner(message):

        scale_names = db.ScaleName.tolist()
        instrument_names = db.InstrumentName.tolist()
        documents = [
            f'{scale}\n{instrument}'
            for scale, instrument in zip(scale_names, instrument_names)
        ]
        embeddings = np.stack(db.ItemStemEmbeddings.to_numpy())

        topic_model = BERTopic().fit(
            documents=documents,
            embeddings=embeddings
        )

        # Separate 2-D projection that is used only for plotting.
        reducer = UMAP(
            n_neighbors=10,
            n_components=2,
            min_dist=0.0,
            metric='cosine'
        )
        reduced_embeddings = reducer.fit_transform(embeddings)

        figure = topic_model.visualize_documents(
            docs=documents,
            reduced_embeddings=reduced_embeddings,
            hide_annotations=True,
            hide_document_hover=False,
            custom_labels=False,
            title="The Synthetic Nomological Net",
            width=1500,
            height=1500
        )
        st.session_state['explore_plot'] = figure
|
psisent.enc
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3c278cff21708369f5026353cb18d001eb6a4707702f27ef9e410988dc297247
|
3 |
+
size 91447948
|
requirements.txt
CHANGED
@@ -5,4 +5,6 @@ sentence_transformers==2.7.0
|
|
5 |
sentencepiece==0.1.99
|
6 |
altair==4.2.2
|
7 |
cryptography==41.0.1
|
|
|
|
|
8 |
python-dotenv
|
|
|
5 |
sentencepiece==0.1.99
|
6 |
altair==4.2.2
|
7 |
cryptography==41.0.1
|
8 |
+
matplotlib_venn==1.1.1
|
9 |
+
bertopic==0.16.1
|
10 |
python-dotenv
|
surveybot3000.enc
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b2817f75bf07ae5bba7c02af4494ead8a904847955e0a94bf8cdb4b6fd90004f
|
3 |
+
size 91447948
|