Anisha Bhatnagar committed on
Commit
8367823
·
1 Parent(s): 6fc987a

showing g2v with z scores

Browse files
Files changed (3) hide show
  1. app.py +7 -0
  2. utils/interp_space_utils.py +21 -9
  3. utils/visualizations.py +46 -4
app.py CHANGED
@@ -424,6 +424,13 @@ def app(share=False, use_cluster_feats=False):
424
  ">
425
  Gram2Vec Features prominent in the zoomed-in region
426
  </div>
 
 
 
 
 
 
 
427
  """)
428
  gram2vec_rb = gr.Radio(choices=[], label="Gram2Vec features for this zoomed-in region")#, label="Top-10 Gram2Vec Features most likely to occur in Mystery Author", info="Most prominent Gram2Vec features in the mystery text")
429
  gram2vec_state = gr.State()
 
424
  ">
425
  Gram2Vec Features prominent in the zoomed-in region
426
  </div>
427
+ <div style="
428
+ font-size: 0.9em;
429
+ color: #666;
430
+ margin-bottom: 1em;
431
+ ">
432
+ Features shown with normalized z-scores
433
+ </div>
434
  """)
435
  gram2vec_rb = gr.Radio(choices=[], label="Gram2Vec features for this zoomed-in region")#, label="Top-10 Gram2Vec Features most likely to occur in Mystery Author", info="Most prominent Gram2Vec features in the mystery text")
436
  gram2vec_state = gr.State()
utils/interp_space_utils.py CHANGED
@@ -571,7 +571,7 @@ def compute_clusters_g2v_representation(
571
  mode: str = "contrastive",
572
  sharedness_method: str = "mean_minus_alpha_std",
573
  alpha: float = 0.5
574
- ) -> List[str]:
575
 
576
 
577
  selected_mask = background_corpus_df['authorID'].isin(author_ids).to_numpy()
@@ -600,10 +600,10 @@ def compute_clusters_g2v_representation(
600
  stds = selected_matrix.std(axis=0)
601
  scores = means - float(alpha) * stds
602
 
603
- # Rank and return
604
  feature_scores = [(feat, score) for feat, score in zip(all_g2v_feats, scores) if score > 0]
605
  feature_scores.sort(key=lambda x: x[1], reverse=True)
606
- return [feat for feat, _ in feature_scores[:top_n]]
607
 
608
 
609
  # Contrastive mode (default): compute target mean and subtract contrast mean
@@ -626,11 +626,23 @@ def compute_clusters_g2v_representation(
626
 
627
  final_g2v_feats_values = all_g2v_values - all_g2v_other_values
628
 
 
 
 
 
 
 
 
 
 
 
 
 
629
 
630
  # Keep only features that have a positive contrastive score
631
  top_g2v_feats = sorted(
632
- [(feat, val) for feat, val in zip(all_g2v_feats, final_g2v_feats_values) if val > 0],
633
- key=lambda x: -x[1]
634
  )
635
 
636
  # Filter out features that are not present in any of the authors
@@ -638,18 +650,18 @@ def compute_clusters_g2v_representation(
638
  print('Filtering in g2v features for only the following authors: ', selected_authors)
639
  authors_g2v_feats = background_corpus_df[background_corpus_df['authorID'].isin(selected_authors)][features_clm_name].tolist()
640
  filtered_features = []
641
- for feature, score in top_g2v_feats:
642
  found_in_any_author = False
643
  for author_g2v_feats in authors_g2v_feats:
644
  if author_g2v_feats[feature] > 0:
645
  found_in_any_author = True
646
  break
647
  if found_in_any_author:
648
- filtered_features.append(feature)
649
 
650
- print('Filtered G2V features: ', filtered_features)
651
 
652
- return filtered_features[:top_n]
653
 
654
  def generate_interpretable_space_representation(interp_space_path, styles_df_path, feat_clm, output_clm, num_feats=5):
655
 
 
571
  mode: str = "contrastive",
572
  sharedness_method: str = "mean_minus_alpha_std",
573
  alpha: float = 0.5
574
+ ) -> List[tuple]: # Changed return type to List[tuple] to include scores
575
 
576
 
577
  selected_mask = background_corpus_df['authorID'].isin(author_ids).to_numpy()
 
600
  stds = selected_matrix.std(axis=0)
601
  scores = means - float(alpha) * stds
602
 
603
+ # Rank and return with scores
604
  feature_scores = [(feat, score) for feat, score in zip(all_g2v_feats, scores) if score > 0]
605
  feature_scores.sort(key=lambda x: x[1], reverse=True)
606
+ return feature_scores[:top_n] # Return tuples instead of just features
607
 
608
 
609
  # Contrastive mode (default): compute target mean and subtract contrast mean
 
626
 
627
  final_g2v_feats_values = all_g2v_values - all_g2v_other_values
628
 
629
+ # Compute z-scores for normalization
630
+ # Get population statistics from all features (both selected and contrast)
631
+ all_feats = background_corpus_df[features_clm_name].tolist()
632
+ population_matrix = np.array([list(x.values()) for x in all_feats])
633
+ population_mean = population_matrix.mean(axis=0)
634
+ population_std = population_matrix.std(axis=0)
635
+
636
+ # Avoid division by zero
637
+ population_std = np.where(population_std == 0, 1, population_std)
638
+
639
+ # Calculate z-scores for the contrastive values
640
+ z_scores = (final_g2v_feats_values - population_mean) / population_std
641
 
642
  # Keep only features that have a positive contrastive score
643
  top_g2v_feats = sorted(
644
+ [(feat, val, z_score) for feat, val, z_score in zip(all_g2v_feats, final_g2v_feats_values, z_scores) if val > 0],
645
+ key=lambda x: -x[1] # Sort by contrastive score
646
  )
647
 
648
  # Filter out features that are not present in any of the authors
 
650
  print('Filtering in g2v features for only the following authors: ', selected_authors)
651
  authors_g2v_feats = background_corpus_df[background_corpus_df['authorID'].isin(selected_authors)][features_clm_name].tolist()
652
  filtered_features = []
653
+ for feature, score, z_score in top_g2v_feats:
654
  found_in_any_author = False
655
  for author_g2v_feats in authors_g2v_feats:
656
  if author_g2v_feats[feature] > 0:
657
  found_in_any_author = True
658
  break
659
  if found_in_any_author:
660
+ filtered_features.append((feature, score, z_score))
661
 
662
+ print('Filtered G2V features: ', [(f[0], f[2]) for f in filtered_features]) # Print feature names and z-scores
663
 
664
+ return filtered_features[:top_n] # Return tuples with z-scores
665
 
666
  def generate_interpretable_space_representation(interp_space_path, styles_df_path, feat_clm, output_clm, num_feats=5):
667
 
utils/visualizations.py CHANGED
@@ -194,6 +194,47 @@ def load_interp_space(cfg):
194
 
195
  }
196
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
197
  #function to handle zoom events
198
  def handle_zoom(event_json, bg_proj, bg_lbls, clustered_authors_df, task_authors_df):
199
  """
@@ -268,7 +309,7 @@ def handle_zoom(event_json, bg_proj, bg_lbls, clustered_authors_df, task_authors
268
  for feat in g2v_feats:
269
  try:
270
  # `feat` is shorthand already (e.g., 'pos_bigrams:NOUN PROPN')
271
- if any(find_feature_spans(txt, feat) for txt in task_texts):
272
  filtered_g2v_feats.append(feat)
273
  else:
274
  print(f"[INFO] Dropping G2V feature with no spans in task texts: {feat}")
@@ -278,19 +319,20 @@ def handle_zoom(event_json, bg_proj, bg_lbls, clustered_authors_df, task_authors
278
  # Convert to human readable for display
279
  HR_g2v_list = []
280
  for feat in filtered_g2v_feats:
281
- HR_g2v = get_fullform(feat)
282
  print(f"\n\n feat: {feat} ---> Human Readable: {HR_g2v}")
283
  if HR_g2v is None:
284
  print(f"Skipping Gram2Vec feature without human readable form: {feat}")
285
  else:
286
- HR_g2v_list.append(HR_g2v)
287
 
288
- HR_g2v_list = ["None"] + HR_g2v_list
289
 
290
  print(f"[INFO] Found {len(llm_feats)} LLM features and {len(g2v_feats)} Gram2Vec features in the zoomed region.")
291
  print(f"[INFO] unfiltered g2v features: {g2v_feats}")
292
 
293
  print(f"[INFO] LLM features: {llm_feats}")
 
294
  print(f"[INFO] Gram2Vec features: {HR_g2v_list}")
295
 
296
  return (
 
194
 
195
  }
196
 
197
+ # Function to process G2V features and create display choices
198
def format_g2v_features_for_display(g2v_features_with_scores):
    """
    Convert G2V features with z-scores into display format for Gradio radio buttons.

    Args:
        g2v_features_with_scores: Iterable of (feature_name, z_score) pairs, e.g.
            [('None', None), ('Feature Name', z_score), ...].
            Any entry that is not a two-item, non-string sequence is treated as
            an unexpected format and passed through as ``str(entry)``.

    Returns:
        tuple: (display_choices, original_values) -- two parallel lists of the
        same length. ``display_choices`` holds the label for each entry
        ("<feature name> | Z=<score to 2 decimals>", or "None");
        ``original_values`` holds the matching bare feature name (or "None").
    """
    display_choices = []
    original_values = []

    for item in g2v_features_with_scores:
        # A (name, score) pair must be a sized, two-item, non-string sequence.
        # Without the string check a two-character string would be unpacked
        # into its characters; without the __len__ check an unsized item would
        # raise instead of reaching the fallback below.
        is_pair = (
            not isinstance(item, (str, bytes))
            and hasattr(item, "__len__")
            and len(item) == 2
        )
        if not is_pair:
            # Unexpected format: show it verbatim rather than failing.
            fallback = str(item)
            display_choices.append(fallback)
            original_values.append(fallback)
            continue

        feature_name, z_score = item

        # The "None" placeholder (or any entry without a score) carries no
        # number to format, so it is shown as the plain "None" choice.
        if feature_name == "None" or z_score is None:
            display_choices.append("None")
            original_values.append("None")
            continue

        # Objects exposing .item() (e.g. numpy scalars) are unwrapped first so
        # the format spec below always receives a plain Python float.
        if hasattr(z_score, "item"):
            z_value = float(z_score.item())
        else:
            z_value = float(z_score)

        # The previous label ended with a stray, unbalanced "]".
        display_choices.append(f"{feature_name} | Z={z_value:.2f}")
        original_values.append(feature_name)

    return display_choices, original_values
237
+
238
  #function to handle zoom events
239
  def handle_zoom(event_json, bg_proj, bg_lbls, clustered_authors_df, task_authors_df):
240
  """
 
309
  for feat in g2v_feats:
310
  try:
311
  # `feat` is shorthand already (e.g., 'pos_bigrams:NOUN PROPN')
312
+ if any(find_feature_spans(txt, feat[0]) for txt in task_texts):
313
  filtered_g2v_feats.append(feat)
314
  else:
315
  print(f"[INFO] Dropping G2V feature with no spans in task texts: {feat}")
 
319
  # Convert to human readable for display
320
  HR_g2v_list = []
321
  for feat in filtered_g2v_feats:
322
+ HR_g2v = get_fullform(feat[0])
323
  print(f"\n\n feat: {feat} ---> Human Readable: {HR_g2v}")
324
  if HR_g2v is None:
325
  print(f"Skipping Gram2Vec feature without human readable form: {feat}")
326
  else:
327
+ HR_g2v_list.append((HR_g2v, feat[1])) #get the score
328
 
329
+ HR_g2v_list = [("None", None)] + HR_g2v_list
330
 
331
  print(f"[INFO] Found {len(llm_feats)} LLM features and {len(g2v_feats)} Gram2Vec features in the zoomed region.")
332
  print(f"[INFO] unfiltered g2v features: {g2v_feats}")
333
 
334
  print(f"[INFO] LLM features: {llm_feats}")
335
+ HR_g2v_list, _ = format_g2v_features_for_display(HR_g2v_list)
336
  print(f"[INFO] Gram2Vec features: {HR_g2v_list}")
337
 
338
  return (