Milad Alshomary committed
Commit 87e3b98 · 1 Parent(s): 74947b9
Files changed (4)
  1. app.py +1 -1
  2. config/config.yaml +2 -2
  3. utils/interp_space_utils.py +34 -12
  4. utils/ui.py +6 -9
app.py CHANGED
@@ -58,7 +58,7 @@ def app(share=False, use_cluster_feats=False):
     instances, instance_ids = get_instances(cfg['instances_to_explain_path'])
 
     interp = load_interp_space(cfg)
-    clustered_authors_df = interp['clustered_authors_df'][:1000]
+    clustered_authors_df = interp['clustered_authors_df'][:200]
     clustered_authors_df['fullText'] = clustered_authors_df['fullText'].map(lambda l: l[:3]) # Take at most 3 texts per author
 
     with gr.Blocks(title="Author Attribution Explainability Tool") as demo:
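For orientation, a minimal sketch (toy data, hypothetical author IDs) of what the two slicing operations in this hunk do: `[:200]` keeps only the first 200 author rows, and the `map` caps each author's text list at three entries.

```python
import pandas as pd

# Hypothetical stand-in for interp['clustered_authors_df']
clustered_authors_df = pd.DataFrame({
    'authorID': ['a1', 'a2'],
    'fullText': [['t1', 't2', 't3', 't4'], ['t5']],
})

clustered_authors_df = clustered_authors_df[:200]  # row slice: at most 200 authors
clustered_authors_df['fullText'] = clustered_authors_df['fullText'].map(lambda l: l[:3])

print(clustered_authors_df['fullText'].tolist())  # [['t1', 't2', 't3'], ['t5']]
```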
config/config.yaml CHANGED
@@ -1,8 +1,8 @@
 # config.yaml
 instances_to_explain_path: "./datasets/hrs_explanations.json"
 instances_to_explain_url: "https://huggingface.co/datasets/miladalsh/explanation_tool_files/resolve/main/hrs_explanations_luar_clusters_18_balanced.json?/download=true"
-interp_space_path: "./datasets/sentence_luar_interp_clusters_25/"
-interp_space_url: "https://huggingface.co/datasets/miladalsh/explanation_tool_files/resolve/main/sentence_luar_interp_clusters_25.zip?download=true"
+interp_space_path: "./datasets/sentence_luar_interp_clusters_18.zip/"
+interp_space_url: "https://huggingface.co/datasets/miladalsh/explanation_tool_files/resolve/main/luar_interp_space_cluster_18.zip?download=true"
 gram2vec_feats_path: "./datasets/gram2vec_feats.csv"
 gram2vec_feats_url: "https://huggingface.co/datasets/miladalsh/explanation_tool_files/resolve/main/gram2vec_feats.csv?download=true"
 
utils/interp_space_utils.py CHANGED
@@ -20,6 +20,7 @@ from utils.llm_feat_utils import generate_feature_spans_cached
 from collections import Counter
 import numpy as np
 from sklearn.metrics.pairwise import cosine_similarity
+from sklearn.decomposition import PCA
 
 CACHE_DIR = "datasets/embeddings_cache"
 ZOOM_CACHE = "datasets/zoom_cache/features_cache.json"
@@ -140,7 +141,7 @@ def instance_to_df(instance, predicted_author=None, ground_truth_author=None):
     return task_authos_df
 
 
-def generate_style_embedding(background_corpus_df: pd.DataFrame, text_clm: str, model_name: str) -> pd.DataFrame:
+def generate_style_embedding(background_corpus_df: pd.DataFrame, text_clm: str, model_name: str, dimensionality_reduction: bool = True, dimensions: int = 100) -> pd.DataFrame:
     """
     Generates style embeddings for documents in a background corpus using a specified model.
     If a row in `text_clm` contains a list of strings, the final embedding for that row
@@ -218,22 +219,33 @@ def generate_style_embedding(background_corpus_df: pd.DataFrame, text_clm: str,
     embeddings = model.encode(texts, show_progress_bar=True)
     final_embeddings = list(embeddings)
 
-    # Create a clean column name from the model name
-    col_name = f'{model_name.split("/")[-1]}_style_embedding'
-    background_corpus_df[col_name] = final_embeddings
+    # Apply PCA over the embeddings to reduce the dimentionality
+    if dimensionality_reduction:
+        if len(final_embeddings) > 0 and len(final_embeddings[0]) > dimensions: # Only apply PCA if embeddings exist and dim > dimensions
+            pca = PCA(n_components=dimensions)
+            final_embeddings = pca.fit_transform(final_embeddings)
 
-    return background_corpus_df
+    return list(final_embeddings)
 
 # ── wrapper with caching ───────────────────────────────────────
 def cached_generate_style_embedding(background_corpus_df: pd.DataFrame,
                                     text_clm: str,
-                                    model_name: str) -> pd.DataFrame:
+                                    model_name: str,
+                                    task_authors_df: pd.DataFrame = None) -> pd.DataFrame:
     """
     Wraps `generate_style_embedding`, caching its output in pickle files
     keyed by an MD5 of (model_name + text list). If the cache exists,
     loads and returns it instead of recomputing.
     """
 
+    if task_authors_df is not None:
+        print(f"concatenating task authors and background corpus authors")
+        print(f"Number of task authors: {len(task_authors_df)}")
+        print(f"task authors author_ids: {task_authors_df.authorID.tolist()}")
+        print(f"Number of background corpus authors: {len(background_corpus_df)}")
+        background_corpus_df = pd.concat([task_authors_df, background_corpus_df])
+        print(f"Number of authors after concatenation: {len(background_corpus_df)}")
+
     # Gather the input texts (preserves list-of-strings if any)
     texts = background_corpus_df[text_clm].fillna("").tolist()
 
@@ -255,12 +267,22 @@ def cached_generate_style_embedding(background_corpus_df: pd.DataFrame,
         with open(cache_path, "rb") as f:
             return pickle.load(f)
 
-    # Otherwise, compute, cache, and return
-    df_with_emb = generate_style_embedding(background_corpus_df, text_clm, model_name)
-    print(f"Computing embeddings for {model_name} on column '{text_clm}', saving to {cache_path}")
-    with open(cache_path, "wb") as f:
-        pickle.dump(df_with_emb, f)
-    return df_with_emb
+    else:
+        # Otherwise, compute, cache, and return
+        print(f"Computing embeddings for {model_name} on column '{text_clm}', saving to {cache_path}")
+        task_and_background_embeddings = generate_style_embedding(background_corpus_df, text_clm, model_name, dimensionality_reduction=True)
+        # Create a clean column name from the model name
+        col_name = f'{model_name.split("/")[-1]}_style_embedding'
+        background_corpus_df[col_name] = task_and_background_embeddings
+
+        with open(cache_path, "wb") as f:
+            pickle.dump(background_corpus_df, f)
+
+    if task_authors_df is not None:
+        task_authors_df = background_corpus_df[background_corpus_df.authorID.isin(task_authors_df.authorID.tolist())]
+        background_corpus_df = background_corpus_df[~background_corpus_df.authorID.isin(task_authors_df.authorID.tolist())]
+
+    return background_corpus_df, task_authors_df
 
 def get_style_feats_distribution(documentIDs, style_feats_dict):
     style_feats = []
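The substantive change above is the PCA step: after encoding, the embeddings are reduced to at most `dimensions` components, but only when the vectors are wider than the target. A self-contained sketch of that guard-then-reduce logic, using random vectors in place of real `model.encode` output (the shapes and the 100-dimension default mirror the diff; the data is made up):

```python
import numpy as np
from sklearn.decomposition import PCA

dimensions = 100  # target size, matching the new `dimensions` default

# Stand-in for model.encode(texts): 500 documents, 512-dim embeddings
final_embeddings = list(np.random.rand(500, 512))

# Only reduce when embeddings exist and are wider than the target
if len(final_embeddings) > 0 and len(final_embeddings[0]) > dimensions:
    pca = PCA(n_components=dimensions)
    final_embeddings = list(pca.fit_transform(np.vstack(final_embeddings)))

print(len(final_embeddings), final_embeddings[0].shape)  # 500 (100,)
```

One design consequence worth noting: the projection is fit per call, so vectors reduced in separate calls live in different PCA bases. That is presumably why the wrapper now embeds the task authors and the background corpus in a single pass instead of two.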
utils/ui.py CHANGED
@@ -102,7 +102,6 @@ def update_task_display(mode, iid, instances, background_df, mystery_file, cand1
         #create a dataframe of the task authors
         task_authors_df = instance_to_df(instances[iid], predicted_author=predicted_author, ground_truth_author=ground_truth_author)
         print(f"\n\n\n ----> Loaded task {iid} with {len(task_authors_df)} authors\n\n\n")
-        print(task_authors_df)
     else:
         header_html = "<h3>Custom Uploaded Task</h3>"
         mystery_txt = read_txt(mystery_file)
@@ -119,15 +118,15 @@ def update_task_display(mode, iid, instances, background_df, mystery_file, cand1
             'a2_fullText': c3_txt
         }
         task_authors_df = instance_to_df(custom_task_instance)
-        print(task_authors_df)
 
-    print(f"Generating embeddings for {model_name} on task authors")
-    task_authors_df = cached_generate_style_embedding(task_authors_df, 'fullText', model_name)
-    print("Task authors after embedding generation:")
-    print(task_authors_df)
+    #print(f"Generating embeddings for {model_name} on task authors")
+    # task_authors_df = cached_generate_style_embedding(task_authors_df, 'fullText', model_name)
+    # print("Task authors after embedding generation:")
+    # print(task_authors_df)
+
     # Generate the new embedding of all the background_df authors
     print(f"Generating embeddings for {model_name} on background corpus")
-    background_df = cached_generate_style_embedding(background_df, 'fullText', model_name)
+    background_df, task_authors_df = cached_generate_style_embedding(background_df, 'fullText', model_name, task_authors_df=task_authors_df)
     print(f"Generated embeddings for {len(background_df)} texts using model '{model_name}'")
 
     # computing g2v features
@@ -137,8 +136,6 @@ def update_task_display(mode, iid, instances, background_df, mystery_file, cand1
     task_authors_df['g2v_vector'] = task_authors_g2v
     print(f"Gram2Vec feature generation complete")
 
-    print(background_df.columns)
-
    if mode != "Predefined HRS Task":
         # Computing predicted author by checking pairwise cosine similarity over luar embeddings
         col_name = f'{model_name.split("/")[-1]}_style_embedding'
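The updated call in `update_task_display` now receives a `(background_df, task_authors_df)` tuple: both author sets are embedded together and then split back apart by `authorID`. A rough sketch of that concat-and-split pattern with toy frames (column names from the diff; the data is hypothetical):

```python
import pandas as pd

task_authors_df = pd.DataFrame({'authorID': ['mystery', 'cand_0'], 'fullText': ['...', '...']})
background_df = pd.DataFrame({'authorID': ['bg_0', 'bg_1'], 'fullText': ['...', '...']})

# Concatenate so both sets are encoded (and PCA-reduced) in one shared space
combined = pd.concat([task_authors_df, background_df])
# ... the embedding column would be computed over `combined` here ...

# Split back out by authorID, as the updated wrapper does before returning
task_ids = task_authors_df.authorID.tolist()
task_authors_df = combined[combined.authorID.isin(task_ids)]
background_df = combined[~combined.authorID.isin(task_ids)]
```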