Milad Alshomary committed on
Commit
001c059
Β·
1 Parent(s): c54eddb
app.py CHANGED
@@ -58,8 +58,8 @@ def app(share=False, use_cluster_feats=False):
58
  instances, instance_ids = get_instances(cfg['instances_to_explain_path'])
59
 
60
  interp = load_interp_space(cfg)
61
- clustered_authors_df = interp['clustered_authors_df']
62
- clustered_authors_df['fullText'] = clustered_authors_df['fullText'].map(lambda l: l[:5]) # Take at most 5 texts per author
63
 
64
  with gr.Blocks(title="Author Attribution Explainability Tool") as demo:
65
  # ── Big Centered Title ──────────────────────────────────────────
 
58
  instances, instance_ids = get_instances(cfg['instances_to_explain_path'])
59
 
60
  interp = load_interp_space(cfg)
61
+ clustered_authors_df = interp['clustered_authors_df'][:1000]
62
+ clustered_authors_df['fullText'] = clustered_authors_df['fullText']
63
 
64
  with gr.Blocks(title="Author Attribution Explainability Tool") as demo:
65
  # ── Big Centered Title ──────────────────────────────────────────
config/config.yaml CHANGED
@@ -1,8 +1,8 @@
1
  # config.yaml
2
  instances_to_explain_path: "./datasets/hrs_explanations.json"
3
 instances_to_explain_url: "https://huggingface.co/datasets/miladalsh/explanation_tool_files/resolve/main/hrs_explanations_luar_clusters_18_balanced.json?download=true"
4
- interp_space_path: "./datasets/sentence_luar_interp_clusters_25/"
5
- interp_space_url: "https://huggingface.co/datasets/miladalsh/explanation_tool_files/resolve/main/sentence_luar_interp_clusters_25.zip?download=true"
6
  gram2vec_feats_path: "./datasets/gram2vec_feats.csv"
7
  gram2vec_feats_url: "https://huggingface.co/datasets/miladalsh/explanation_tool_files/resolve/main/gram2vec_feats.csv?download=true"
8
 
 
1
  # config.yaml
2
  instances_to_explain_path: "./datasets/hrs_explanations.json"
3
 instances_to_explain_url: "https://huggingface.co/datasets/miladalsh/explanation_tool_files/resolve/main/hrs_explanations_luar_clusters_18_balanced.json?download=true"
4
+ interp_space_path: "./datasets/sentence_luar_interp_space_2_35/"
5
+ interp_space_url: "https://huggingface.co/datasets/miladalsh/explanation_tool_files/resolve/main/sentence_luar_interp_space_2_35.zip?download=true"
6
  gram2vec_feats_path: "./datasets/gram2vec_feats.csv"
7
  gram2vec_feats_url: "https://huggingface.co/datasets/miladalsh/explanation_tool_files/resolve/main/gram2vec_feats.csv?download=true"
8
 
utils/interp_space_utils.py CHANGED
@@ -78,7 +78,7 @@ def compute_g2v_features(clustered_authors_df: pd.DataFrame, task_authors_df: pd
78
  clustered_authors_df = pickle.load(f)
79
 
80
  else: # Else compute and cache
81
- g2v_feats_df = vectorizer.from_documents(author_texts, batch_size=16)
82
 
83
  print(f"Number of g2v features: {len(g2v_feats_df)}")
84
  print(f"Number of clustered_authors_df.authorID.tolist(): {len(clustered_authors_df.authorID.tolist())}")
 
78
  clustered_authors_df = pickle.load(f)
79
 
80
  else: # Else compute and cache
81
+ g2v_feats_df = vectorizer.from_documents(author_texts, batch_size=8)
82
 
83
  print(f"Number of g2v features: {len(g2v_feats_df)}")
84
  print(f"Number of clustered_authors_df.authorID.tolist(): {len(clustered_authors_df.authorID.tolist())}")
utils/visualizations.py CHANGED
@@ -14,7 +14,7 @@ from utils.interp_space_utils import compute_clusters_style_representation_3, co
14
  from utils.llm_feat_utils import split_features
15
  from utils.gram2vec_feat_utils import get_shorthand, get_fullform
16
  from gram2vec.feature_locator import find_feature_spans
17
-
18
  import plotly.io as pio
19
 
20
  def clean_text(text: str) -> str:
@@ -132,8 +132,10 @@ def compute_tsne_with_cache(embeddings: np.ndarray, cache_path: str = 'datasets/
132
  return cache[hash_key]
133
  else:
134
  print("Computing t-SNE")
135
- tsne_result = TSNE(n_components=2, learning_rate='auto',
136
- init='random', perplexity=3).fit_transform(embeddings)
 
 
137
  cache[hash_key] = tsne_result
138
  with open(cache_path, 'wb') as f:
139
  pkl.dump(cache, f)
@@ -147,7 +149,7 @@ def load_interp_space(cfg):
147
 
148
  # Load authors embeddings and their cluster labels
149
  clustered_authors_df = pd.read_pickle(clustered_authors_path)
150
- clustered_authors_df = clustered_authors_df[clustered_authors_df.cluster_label != -1]
151
  author_embedding = clustered_authors_df.author_embedding.tolist()
152
  author_labels = clustered_authors_df.cluster_label.tolist()
153
  author_ids = clustered_authors_df.authorID.tolist()
@@ -276,12 +278,12 @@ def handle_zoom(event_json, bg_proj, bg_lbls, clustered_authors_df, task_authors
276
  print(f"Task authors: {len(task_authors_df)}, Clustered authors: {len(clustered_authors_df)}")
277
  merged_authors_df = pd.concat([task_authors_df, clustered_authors_df])
278
  print(f"Merged authors DataFrame:\n{len(merged_authors_df)}")
279
- #style_analysis_response = {'features': [], 'spans': []}
280
- style_analysis_response = compute_clusters_style_representation_3(
281
- background_corpus_df=merged_authors_df,
282
- cluster_ids=visible_authors,
283
- cluster_label_clm_name='authorID',
284
- )
285
 
286
  llm_feats = ['None'] + style_analysis_response['features']
287
 
 
14
  from utils.llm_feat_utils import split_features
15
  from utils.gram2vec_feat_utils import get_shorthand, get_fullform
16
  from gram2vec.feature_locator import find_feature_spans
17
+ import umap
18
  import plotly.io as pio
19
 
20
  def clean_text(text: str) -> str:
 
132
  return cache[hash_key]
133
  else:
134
  print("Computing t-SNE")
135
+ # tsne_result = TSNE(n_components=2, learning_rate='auto',
136
+ # init='random', perplexity=3).fit_transform(embeddings)
137
+ tsne_result = umap.UMAP(n_components=2, n_neighbors=15, min_dist=0.0, metric='cosine').fit_transform(embeddings)
138
+
139
  cache[hash_key] = tsne_result
140
  with open(cache_path, 'wb') as f:
141
  pkl.dump(cache, f)
 
149
 
150
  # Load authors embeddings and their cluster labels
151
  clustered_authors_df = pd.read_pickle(clustered_authors_path)
152
+ #clustered_authors_df = clustered_authors_df[clustered_authors_df.cluster_label != -1]
153
  author_embedding = clustered_authors_df.author_embedding.tolist()
154
  author_labels = clustered_authors_df.cluster_label.tolist()
155
  author_ids = clustered_authors_df.authorID.tolist()
 
278
  print(f"Task authors: {len(task_authors_df)}, Clustered authors: {len(clustered_authors_df)}")
279
  merged_authors_df = pd.concat([task_authors_df, clustered_authors_df])
280
  print(f"Merged authors DataFrame:\n{len(merged_authors_df)}")
281
+ style_analysis_response = {'features': [], 'spans': []}
282
+ # style_analysis_response = compute_clusters_style_representation_3(
283
+ # background_corpus_df=merged_authors_df,
284
+ # cluster_ids=visible_authors,
285
+ # cluster_label_clm_name='authorID',
286
+ # )
287
 
288
  llm_feats = ['None'] + style_analysis_response['features']
289