Anisha Bhatnagar
committed on
Commit
·
8367823
1
Parent(s):
6fc987a
showing g2v with z scores
Browse files- app.py +7 -0
- utils/interp_space_utils.py +21 -9
- utils/visualizations.py +46 -4
app.py
CHANGED
@@ -424,6 +424,13 @@ def app(share=False, use_cluster_feats=False):
|
|
424 |
">
|
425 |
Gram2Vec Features prominent in the zoomed-in region
|
426 |
</div>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
427 |
""")
|
428 |
gram2vec_rb = gr.Radio(choices=[], label="Gram2Vec features for this zoomed-in region")#, label="Top-10 Gram2Vec Features most likely to occur in Mystery Author", info="Most prominent Gram2Vec features in the mystery text")
|
429 |
gram2vec_state = gr.State()
|
|
|
424 |
">
|
425 |
Gram2Vec Features prominent in the zoomed-in region
|
426 |
</div>
|
427 |
+
<div style="
|
428 |
+
font-size: 0.9em;
|
429 |
+
color: #666;
|
430 |
+
margin-bottom: 1em;
|
431 |
+
">
|
432 |
+
Features shown with normalized z-scores
|
433 |
+
</div>
|
434 |
""")
|
435 |
gram2vec_rb = gr.Radio(choices=[], label="Gram2Vec features for this zoomed-in region")#, label="Top-10 Gram2Vec Features most likely to occur in Mystery Author", info="Most prominent Gram2Vec features in the mystery text")
|
436 |
gram2vec_state = gr.State()
|
utils/interp_space_utils.py
CHANGED
@@ -571,7 +571,7 @@ def compute_clusters_g2v_representation(
|
|
571 |
mode: str = "contrastive",
|
572 |
sharedness_method: str = "mean_minus_alpha_std",
|
573 |
alpha: float = 0.5
|
574 |
-
) -> List[
|
575 |
|
576 |
|
577 |
selected_mask = background_corpus_df['authorID'].isin(author_ids).to_numpy()
|
@@ -600,10 +600,10 @@ def compute_clusters_g2v_representation(
|
|
600 |
stds = selected_matrix.std(axis=0)
|
601 |
scores = means - float(alpha) * stds
|
602 |
|
603 |
-
# Rank and return
|
604 |
feature_scores = [(feat, score) for feat, score in zip(all_g2v_feats, scores) if score > 0]
|
605 |
feature_scores.sort(key=lambda x: x[1], reverse=True)
|
606 |
-
return [
|
607 |
|
608 |
|
609 |
# Contrastive mode (default): compute target mean and subtract contrast mean
|
@@ -626,11 +626,23 @@ def compute_clusters_g2v_representation(
|
|
626 |
|
627 |
final_g2v_feats_values = all_g2v_values - all_g2v_other_values
|
628 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
629 |
|
630 |
# Keep only features that have a positive contrastive score
|
631 |
top_g2v_feats = sorted(
|
632 |
-
[(feat, val) for feat, val in zip(all_g2v_feats, final_g2v_feats_values) if val > 0],
|
633 |
-
key=lambda x: -x[1]
|
634 |
)
|
635 |
|
636 |
# Filter out features that are not present in any of the authors
|
@@ -638,18 +650,18 @@ def compute_clusters_g2v_representation(
|
|
638 |
print('Filtering in g2v features for only the following authors: ', selected_authors)
|
639 |
authors_g2v_feats = background_corpus_df[background_corpus_df['authorID'].isin(selected_authors)][features_clm_name].tolist()
|
640 |
filtered_features = []
|
641 |
-
for feature, score in top_g2v_feats:
|
642 |
found_in_any_author = False
|
643 |
for author_g2v_feats in authors_g2v_feats:
|
644 |
if author_g2v_feats[feature] > 0:
|
645 |
found_in_any_author = True
|
646 |
break
|
647 |
if found_in_any_author:
|
648 |
-
filtered_features.append(feature)
|
649 |
|
650 |
-
print('Filtered G2V features: ', filtered_features)
|
651 |
|
652 |
-
return filtered_features[:top_n]
|
653 |
|
654 |
def generate_interpretable_space_representation(interp_space_path, styles_df_path, feat_clm, output_clm, num_feats=5):
|
655 |
|
|
|
571 |
mode: str = "contrastive",
|
572 |
sharedness_method: str = "mean_minus_alpha_std",
|
573 |
alpha: float = 0.5
|
574 |
+
) -> List[tuple]: # Changed return type to List[tuple] to include scores
|
575 |
|
576 |
|
577 |
selected_mask = background_corpus_df['authorID'].isin(author_ids).to_numpy()
|
|
|
600 |
stds = selected_matrix.std(axis=0)
|
601 |
scores = means - float(alpha) * stds
|
602 |
|
603 |
+
# Rank and return with scores
|
604 |
feature_scores = [(feat, score) for feat, score in zip(all_g2v_feats, scores) if score > 0]
|
605 |
feature_scores.sort(key=lambda x: x[1], reverse=True)
|
606 |
+
return feature_scores[:top_n] # Return tuples instead of just features
|
607 |
|
608 |
|
609 |
# Contrastive mode (default): compute target mean and subtract contrast mean
|
|
|
626 |
|
627 |
final_g2v_feats_values = all_g2v_values - all_g2v_other_values
|
628 |
|
629 |
+
# Compute z-scores for normalization
|
630 |
+
# Get population statistics from all features (both selected and contrast)
|
631 |
+
all_feats = background_corpus_df[features_clm_name].tolist()
|
632 |
+
population_matrix = np.array([list(x.values()) for x in all_feats])
|
633 |
+
population_mean = population_matrix.mean(axis=0)
|
634 |
+
population_std = population_matrix.std(axis=0)
|
635 |
+
|
636 |
+
# Avoid division by zero
|
637 |
+
population_std = np.where(population_std == 0, 1, population_std)
|
638 |
+
|
639 |
+
# Calculate z-scores for the contrastive values
|
640 |
+
z_scores = (final_g2v_feats_values - population_mean) / population_std
|
641 |
|
642 |
# Keep only features that have a positive contrastive score
|
643 |
top_g2v_feats = sorted(
|
644 |
+
[(feat, val, z_score) for feat, val, z_score in zip(all_g2v_feats, final_g2v_feats_values, z_scores) if val > 0],
|
645 |
+
key=lambda x: -x[1] # Sort by contrastive score
|
646 |
)
|
647 |
|
648 |
# Filter out features that are not present in any of the authors
|
|
|
650 |
print('Filtering in g2v features for only the following authors: ', selected_authors)
|
651 |
authors_g2v_feats = background_corpus_df[background_corpus_df['authorID'].isin(selected_authors)][features_clm_name].tolist()
|
652 |
filtered_features = []
|
653 |
+
for feature, score, z_score in top_g2v_feats:
|
654 |
found_in_any_author = False
|
655 |
for author_g2v_feats in authors_g2v_feats:
|
656 |
if author_g2v_feats[feature] > 0:
|
657 |
found_in_any_author = True
|
658 |
break
|
659 |
if found_in_any_author:
|
660 |
+
filtered_features.append((feature, score, z_score))
|
661 |
|
662 |
+
print('Filtered G2V features: ', [(f[0], f[2]) for f in filtered_features]) # Print feature names and z-scores
|
663 |
|
664 |
+
return filtered_features[:top_n] # Return tuples with z-scores
|
665 |
|
666 |
def generate_interpretable_space_representation(interp_space_path, styles_df_path, feat_clm, output_clm, num_feats=5):
|
667 |
|
utils/visualizations.py
CHANGED
@@ -194,6 +194,47 @@ def load_interp_space(cfg):
|
|
194 |
|
195 |
}
|
196 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
197 |
#function to handle zoom events
|
198 |
def handle_zoom(event_json, bg_proj, bg_lbls, clustered_authors_df, task_authors_df):
|
199 |
"""
|
@@ -268,7 +309,7 @@ def handle_zoom(event_json, bg_proj, bg_lbls, clustered_authors_df, task_authors
|
|
268 |
for feat in g2v_feats:
|
269 |
try:
|
270 |
# `feat` is shorthand already (e.g., 'pos_bigrams:NOUN PROPN')
|
271 |
-
if any(find_feature_spans(txt, feat) for txt in task_texts):
|
272 |
filtered_g2v_feats.append(feat)
|
273 |
else:
|
274 |
print(f"[INFO] Dropping G2V feature with no spans in task texts: {feat}")
|
@@ -278,19 +319,20 @@ def handle_zoom(event_json, bg_proj, bg_lbls, clustered_authors_df, task_authors
|
|
278 |
# Convert to human readable for display
|
279 |
HR_g2v_list = []
|
280 |
for feat in filtered_g2v_feats:
|
281 |
-
HR_g2v = get_fullform(feat)
|
282 |
print(f"\n\n feat: {feat} ---> Human Readable: {HR_g2v}")
|
283 |
if HR_g2v is None:
|
284 |
print(f"Skipping Gram2Vec feature without human readable form: {feat}")
|
285 |
else:
|
286 |
-
HR_g2v_list.append(HR_g2v)
|
287 |
|
288 |
-
HR_g2v_list = ["None"] + HR_g2v_list
|
289 |
|
290 |
print(f"[INFO] Found {len(llm_feats)} LLM features and {len(g2v_feats)} Gram2Vec features in the zoomed region.")
|
291 |
print(f"[INFO] unfiltered g2v features: {g2v_feats}")
|
292 |
|
293 |
print(f"[INFO] LLM features: {llm_feats}")
|
|
|
294 |
print(f"[INFO] Gram2Vec features: {HR_g2v_list}")
|
295 |
|
296 |
return (
|
|
|
194 |
|
195 |
}
|
196 |
|
197 |
+
# Function to process G2V features and create display choices
|
198 |
+
def format_g2v_features_for_display(g2v_features_with_scores):
    """
    Convert G2V features with z-scores into display format for Gradio radio buttons.

    Args:
        g2v_features_with_scores: Iterable of ``(feature_name, z_score)`` pairs, e.g.
            [('None', None), ('Feature Name', z_score), ...]
            ``z_score`` may be a plain number or any object exposing ``.item()``
            (such as a numpy scalar).

    Returns:
        tuple: (display_choices, original_values) -- two parallel lists.
            ``display_choices`` holds the labels to show ("<name> | Z=<score>"
            with two decimals, or "None"); ``original_values`` holds the bare
            feature name for the entry at the same index.

    Notes:
        * A pair whose name is "None" or whose score is ``None`` is emitted as
          the literal "None" in both lists.
        * Anything that is not a 2-element tuple/list is passed through as
          ``str(item)`` in both lists rather than raising.
    """
    display_choices = []
    original_values = []

    for item in g2v_features_with_scores:
        # Require a real 2-element sequence: a bare 2-character string also has
        # len() == 2 and would otherwise be unpacked into single characters.
        if not (isinstance(item, (tuple, list)) and len(item) == 2):
            # Handle unexpected format
            fallback = str(item)
            display_choices.append(fallback)
            original_values.append(fallback)
            continue

        feature_name, z_score = item

        # Handle None case (the "no selection" placeholder entry)
        if feature_name == "None" or z_score is None:
            display_choices.append("None")
            original_values.append("None")
            continue

        # Convert numpy-style scalars to a regular Python float if needed
        if hasattr(z_score, "item"):
            z_score = z_score.item()
        z_score = float(z_score)

        # Create display string with z-score (no stray closing bracket)
        display_choices.append(f"{feature_name} | Z={z_score:.2f}")
        original_values.append(feature_name)

    return display_choices, original_values
|
237 |
+
|
238 |
#function to handle zoom events
|
239 |
def handle_zoom(event_json, bg_proj, bg_lbls, clustered_authors_df, task_authors_df):
|
240 |
"""
|
|
|
309 |
for feat in g2v_feats:
|
310 |
try:
|
311 |
# `feat` is shorthand already (e.g., 'pos_bigrams:NOUN PROPN')
|
312 |
+
if any(find_feature_spans(txt, feat[0]) for txt in task_texts):
|
313 |
filtered_g2v_feats.append(feat)
|
314 |
else:
|
315 |
print(f"[INFO] Dropping G2V feature with no spans in task texts: {feat}")
|
|
|
319 |
# Convert to human readable for display
|
320 |
HR_g2v_list = []
|
321 |
for feat in filtered_g2v_feats:
|
322 |
+
HR_g2v = get_fullform(feat[0])
|
323 |
print(f"\n\n feat: {feat} ---> Human Readable: {HR_g2v}")
|
324 |
if HR_g2v is None:
|
325 |
print(f"Skipping Gram2Vec feature without human readable form: {feat}")
|
326 |
else:
|
327 |
+
HR_g2v_list.append((HR_g2v, feat[1])) #get the score
|
328 |
|
329 |
+
HR_g2v_list = [("None", None)] + HR_g2v_list
|
330 |
|
331 |
print(f"[INFO] Found {len(llm_feats)} LLM features and {len(g2v_feats)} Gram2Vec features in the zoomed region.")
|
332 |
print(f"[INFO] unfiltered g2v features: {g2v_feats}")
|
333 |
|
334 |
print(f"[INFO] LLM features: {llm_feats}")
|
335 |
+
HR_g2v_list, _ = format_g2v_features_for_display(HR_g2v_list)
|
336 |
print(f"[INFO] Gram2Vec features: {HR_g2v_list}")
|
337 |
|
338 |
return (
|