Milad Alshomary
committed on
Commit
·
001c059
1
Parent(s):
c54eddb
updates
Browse files
- app.py +2 -2
- config/config.yaml +2 -2
- utils/interp_space_utils.py +1 -1
- utils/visualizations.py +12 -10
app.py
CHANGED
@@ -58,8 +58,8 @@ def app(share=False, use_cluster_feats=False):
|
|
58 |
instances, instance_ids = get_instances(cfg['instances_to_explain_path'])
|
59 |
|
60 |
interp = load_interp_space(cfg)
|
61 |
-
clustered_authors_df = interp['clustered_authors_df']
|
62 |
-
clustered_authors_df['fullText'] = clustered_authors_df['fullText']
|
63 |
|
64 |
with gr.Blocks(title="Author Attribution Explainability Tool") as demo:
|
65 |
# ── Big Centered Title ──────────────────────────────────────────
|
|
|
58 |
instances, instance_ids = get_instances(cfg['instances_to_explain_path'])
|
59 |
|
60 |
interp = load_interp_space(cfg)
|
61 |
+
clustered_authors_df = interp['clustered_authors_df'][:1000]
|
62 |
+
clustered_authors_df['fullText'] = clustered_authors_df['fullText']
|
63 |
|
64 |
with gr.Blocks(title="Author Attribution Explainability Tool") as demo:
|
65 |
# ── Big Centered Title ──────────────────────────────────────────
|
config/config.yaml
CHANGED
@@ -1,8 +1,8 @@
|
|
1 |
# config.yaml
|
2 |
instances_to_explain_path: "./datasets/hrs_explanations.json"
|
3 |
instances_to_explain_url: "https://huggingface.co/datasets/miladalsh/explanation_tool_files/resolve/main/hrs_explanations_luar_clusters_18_balanced.json?/download=true"
|
4 |
-
interp_space_path: "./datasets/
|
5 |
-
interp_space_url: "https://huggingface.co/datasets/miladalsh/explanation_tool_files/resolve/main/
|
6 |
gram2vec_feats_path: "./datasets/gram2vec_feats.csv"
|
7 |
gram2vec_feats_url: "https://huggingface.co/datasets/miladalsh/explanation_tool_files/resolve/main/gram2vec_feats.csv?download=true"
|
8 |
|
|
|
1 |
# config.yaml
|
2 |
instances_to_explain_path: "./datasets/hrs_explanations.json"
|
3 |
instances_to_explain_url: "https://huggingface.co/datasets/miladalsh/explanation_tool_files/resolve/main/hrs_explanations_luar_clusters_18_balanced.json?/download=true"
|
4 |
+
interp_space_path: "./datasets/sentence_luar_interp_space_2_35/"
|
5 |
+
interp_space_url: "https://huggingface.co/datasets/miladalsh/explanation_tool_files/resolve/main/sentence_luar_interp_space_2_35.zip?download=true"
|
6 |
gram2vec_feats_path: "./datasets/gram2vec_feats.csv"
|
7 |
gram2vec_feats_url: "https://huggingface.co/datasets/miladalsh/explanation_tool_files/resolve/main/gram2vec_feats.csv?download=true"
|
8 |
|
utils/interp_space_utils.py
CHANGED
@@ -78,7 +78,7 @@ def compute_g2v_features(clustered_authors_df: pd.DataFrame, task_authors_df: pd
|
|
78 |
clustered_authors_df = pickle.load(f)
|
79 |
|
80 |
else: # Else compute and cache
|
81 |
-
g2v_feats_df = vectorizer.from_documents(author_texts, batch_size=
|
82 |
|
83 |
print(f"Number of g2v features: {len(g2v_feats_df)}")
|
84 |
print(f"Number of clustered_authors_df.authorID.tolist(): {len(clustered_authors_df.authorID.tolist())}")
|
|
|
78 |
clustered_authors_df = pickle.load(f)
|
79 |
|
80 |
else: # Else compute and cache
|
81 |
+
g2v_feats_df = vectorizer.from_documents(author_texts, batch_size=8)
|
82 |
|
83 |
print(f"Number of g2v features: {len(g2v_feats_df)}")
|
84 |
print(f"Number of clustered_authors_df.authorID.tolist(): {len(clustered_authors_df.authorID.tolist())}")
|
utils/visualizations.py
CHANGED
@@ -14,7 +14,7 @@ from utils.interp_space_utils import compute_clusters_style_representation_3, co
|
|
14 |
from utils.llm_feat_utils import split_features
|
15 |
from utils.gram2vec_feat_utils import get_shorthand, get_fullform
|
16 |
from gram2vec.feature_locator import find_feature_spans
|
17 |
-
|
18 |
import plotly.io as pio
|
19 |
|
20 |
def clean_text(text: str) -> str:
|
@@ -132,8 +132,10 @@ def compute_tsne_with_cache(embeddings: np.ndarray, cache_path: str = 'datasets/
|
|
132 |
return cache[hash_key]
|
133 |
else:
|
134 |
print("Computing t-SNE")
|
135 |
-
tsne_result = TSNE(n_components=2, learning_rate='auto',
|
136 |
-
|
|
|
|
|
137 |
cache[hash_key] = tsne_result
|
138 |
with open(cache_path, 'wb') as f:
|
139 |
pkl.dump(cache, f)
|
@@ -147,7 +149,7 @@ def load_interp_space(cfg):
|
|
147 |
|
148 |
# Load authors embeddings and their cluster labels
|
149 |
clustered_authors_df = pd.read_pickle(clustered_authors_path)
|
150 |
-
clustered_authors_df = clustered_authors_df[clustered_authors_df.cluster_label != -1]
|
151 |
author_embedding = clustered_authors_df.author_embedding.tolist()
|
152 |
author_labels = clustered_authors_df.cluster_label.tolist()
|
153 |
author_ids = clustered_authors_df.authorID.tolist()
|
@@ -276,12 +278,12 @@ def handle_zoom(event_json, bg_proj, bg_lbls, clustered_authors_df, task_authors
|
|
276 |
print(f"Task authors: {len(task_authors_df)}, Clustered authors: {len(clustered_authors_df)}")
|
277 |
merged_authors_df = pd.concat([task_authors_df, clustered_authors_df])
|
278 |
print(f"Merged authors DataFrame:\n{len(merged_authors_df)}")
|
279 |
-
|
280 |
-
style_analysis_response = compute_clusters_style_representation_3(
|
281 |
-
|
282 |
-
|
283 |
-
|
284 |
-
)
|
285 |
|
286 |
llm_feats = ['None'] + style_analysis_response['features']
|
287 |
|
|
|
14 |
from utils.llm_feat_utils import split_features
|
15 |
from utils.gram2vec_feat_utils import get_shorthand, get_fullform
|
16 |
from gram2vec.feature_locator import find_feature_spans
|
17 |
+
import umap
|
18 |
import plotly.io as pio
|
19 |
|
20 |
def clean_text(text: str) -> str:
|
|
|
132 |
return cache[hash_key]
|
133 |
else:
|
134 |
print("Computing t-SNE")
|
135 |
+
# tsne_result = TSNE(n_components=2, learning_rate='auto',
|
136 |
+
# init='random', perplexity=3).fit_transform(embeddings)
|
137 |
+
tsne_result = umap.UMAP(n_components=2, n_neighbors=15, min_dist=0.0, metric='cosine').fit_transform(embeddings)
|
138 |
+
|
139 |
cache[hash_key] = tsne_result
|
140 |
with open(cache_path, 'wb') as f:
|
141 |
pkl.dump(cache, f)
|
|
|
149 |
|
150 |
# Load authors embeddings and their cluster labels
|
151 |
clustered_authors_df = pd.read_pickle(clustered_authors_path)
|
152 |
+
#clustered_authors_df = clustered_authors_df[clustered_authors_df.cluster_label != -1]
|
153 |
author_embedding = clustered_authors_df.author_embedding.tolist()
|
154 |
author_labels = clustered_authors_df.cluster_label.tolist()
|
155 |
author_ids = clustered_authors_df.authorID.tolist()
|
|
|
278 |
print(f"Task authors: {len(task_authors_df)}, Clustered authors: {len(clustered_authors_df)}")
|
279 |
merged_authors_df = pd.concat([task_authors_df, clustered_authors_df])
|
280 |
print(f"Merged authors DataFrame:\n{len(merged_authors_df)}")
|
281 |
+
style_analysis_response = {'features': [], 'spans': []}
|
282 |
+
# style_analysis_response = compute_clusters_style_representation_3(
|
283 |
+
# background_corpus_df=merged_authors_df,
|
284 |
+
# cluster_ids=visible_authors,
|
285 |
+
# cluster_label_clm_name='authorID',
|
286 |
+
# )
|
287 |
|
288 |
llm_feats = ['None'] + style_analysis_response['features']
|
289 |
|