Spaces:

ExplainabiliyForAATeam
/

explainability-tool-for-aa

Sleeping

App Files Community

Anisha Bhatnagar committed 28 days ago

Commit

74947b9

1 Parent(s): 8367823

show span in background authors issue fixed

Browse files

Files changed (4) hide show

utils/gram2vec_feat_utils.py +18 -7
utils/interp_space_utils.py +4 -3
utils/llm_feat_utils.py +3 -0
utils/visualizations.py +1 -1

utils/gram2vec_feat_utils.py CHANGED Viewed

@@ -126,7 +126,7 @@ def highlight_both_spans(text, llm_spans, gram_spans):
 def show_combined_spans_all(selected_feature_llm, selected_feature_g2v,
-                            llm_style_feats_analysis, background_authors_embeddings_df, task_authors_embeddings_df, visible_authors, predicted_author=None, ground_truth_author=None, max_num_authors=4):
     """
     For mystery + 3 candidates:
      1. get llm spans via your existing cache+API
@@ -152,9 +152,11 @@ def show_combined_spans_all(selected_feature_llm, selected_feature_g2v,
     if selected_feature_llm and selected_feature_llm != "None":
         # print(llm_style_feats_analysis)
         author_list = list(llm_style_feats_analysis['spans'].values())
         llm_spans_list = []
         for i, (_, txt) in enumerate(texts):
             author_spans_list = []
             for txt_span in author_list[i][selected_feature_llm]:
                     author_spans_list.append(Span(txt.find(txt_span), txt.find(txt_span) + len(txt_span)))
@@ -167,6 +169,8 @@ def show_combined_spans_all(selected_feature_llm, selected_feature_g2v,
     if selected_feature_g2v and selected_feature_g2v != "None":
         # get gram2vec spans
         gram_spans_list = []
         print(f"Selected Gram2Vec feature: {selected_feature_g2v}")
         short = get_shorthand(selected_feature_g2v)
         print(f"short hand: {short}")
@@ -199,14 +203,19 @@ def show_combined_spans_all(selected_feature_llm, selected_feature_g2v,
     )
     combined_html = "<div>" + "\n<hr>\n".join(html_task_authors) + "</div>"
     # Filter background authors to those with at least one Gram2Vec span
     bg_start = 4
     bg_indices = list(range(bg_start, len(texts)))
     kept_indices = [i for i in bg_indices if gram_spans_list[i]]
     filtered_texts_bg = [texts[i] for i in kept_indices]
     filtered_llm_bg   = [llm_spans_list[i] for i in kept_indices]
     filtered_gram_bg  = [gram_spans_list[i] for i in kept_indices]
     html_background_authors = create_html(
         filtered_texts_bg,
         filtered_llm_bg,
@@ -219,6 +228,7 @@ def show_combined_spans_all(selected_feature_llm, selected_feature_g2v,
         ground_truth_author=ground_truth_author
     )
     background_html = "<div>" + "\n<hr>\n".join(html_background_authors) + "</div>"
     return combined_html, background_html
 def get_label(label: str, predicted_author=None, ground_truth_author=None, bg_id: int=0) -> str:
@@ -230,26 +240,27 @@ def get_label(label: str, predicted_author=None, ground_truth_author=None, bg_id
         return "Mystery Author"
     elif label.startswith("a0_author") or label.startswith("a1_author") or label.startswith("a2_author") or label.startswith("Candidate"):
         if label.startswith("Candidate"):
-            id = int(label.split(" ")[2])  # Get the number after 'Candidate Author'
         else:
             id = label.split("_")[0][-1] # Get the last character of the first part (a0, a1, a2)
         if predicted_author is not None and ground_truth_author is not None:
             if int(id) == predicted_author and int(id) == ground_truth_author:
-                return f"Candidate {int(id)} (Predicted & Ground Truth)"
             elif int(id) == predicted_author:
-                return f"Candidate {int(id)} (Predicted)"
             elif int(id) == ground_truth_author:
-                return f"Candidate {int(id)} (Ground Truth)"
             else:
-                return f"Candidate {int(id)}"
         else:
-            return f"Candidate {int(id)}"
     else:
         return f"Background Author {bg_id+1}"
 def create_html(texts, llm_spans_list, gram_spans_list, selected_feature_llm, selected_feature_g2v, short=None, background = False, predicted_author=None, ground_truth_author=None):
     html = []
     for i, (label, txt) in enumerate(texts):
         label = get_label(label, predicted_author, ground_truth_author,  i) if background else get_label(label, predicted_author, ground_truth_author)
         combined = highlight_both_spans(txt, llm_spans_list[i], gram_spans_list[i])
         notice = ""

 def show_combined_spans_all(selected_feature_llm, selected_feature_g2v,
+                            llm_style_feats_analysis, background_authors_embeddings_df, task_authors_embeddings_df, visible_authors, predicted_author=None, ground_truth_author=None, max_num_authors=7):
     """
     For mystery + 3 candidates:
      1. get llm spans via your existing cache+API
     if selected_feature_llm and selected_feature_llm != "None":
         # print(llm_style_feats_analysis)
+        print(f"{len(llm_style_feats_analysis['spans'].values())}")
         author_list = list(llm_style_feats_analysis['spans'].values())
         llm_spans_list = []
         for i, (_, txt) in enumerate(texts):
+            print(f"{i}/{len(texts)}")
             author_spans_list = []
             for txt_span in author_list[i][selected_feature_llm]:
                     author_spans_list.append(Span(txt.find(txt_span), txt.find(txt_span) + len(txt_span)))
     if selected_feature_g2v and selected_feature_g2v != "None":
         # get gram2vec spans
         gram_spans_list = []
+        # clean the display string and get the feature name without the zscore
+        selected_feature_g2v = selected_feature_g2v.split(" | [Z=")[0].strip()
         print(f"Selected Gram2Vec feature: {selected_feature_g2v}")
         short = get_shorthand(selected_feature_g2v)
         print(f"short hand: {short}")
     )
     combined_html = "<div>" + "\n<hr>\n".join(html_task_authors) + "</div>"
+    # print(f"\n\n\n\n{texts[4:]}")
     # Filter background authors to those with at least one Gram2Vec span
     bg_start = 4
     bg_indices = list(range(bg_start, len(texts)))
     kept_indices = [i for i in bg_indices if gram_spans_list[i]]
+    print(f"\n---> {kept_indices}")
     filtered_texts_bg = [texts[i] for i in kept_indices]
     filtered_llm_bg   = [llm_spans_list[i] for i in kept_indices]
     filtered_gram_bg  = [gram_spans_list[i] for i in kept_indices]
+    print(filtered_texts_bg)
     html_background_authors = create_html(
         filtered_texts_bg,
         filtered_llm_bg,
         ground_truth_author=ground_truth_author
     )
     background_html = "<div>" + "\n<hr>\n".join(html_background_authors) + "</div>"
+    # print(f"Background HTML: {background_html}")
     return combined_html, background_html
 def get_label(label: str, predicted_author=None, ground_truth_author=None, bg_id: int=0) -> str:
         return "Mystery Author"
     elif label.startswith("a0_author") or label.startswith("a1_author") or label.startswith("a2_author") or label.startswith("Candidate"):
         if label.startswith("Candidate"):
+            id = int(label.split(" ")[2])-1  # Get the number after 'Candidate Author'; convert to 0 index
         else:
             id = label.split("_")[0][-1] # Get the last character of the first part (a0, a1, a2)
         if predicted_author is not None and ground_truth_author is not None:
             if int(id) == predicted_author and int(id) == ground_truth_author:
+                return f"Candidate {int(id)+1} (Predicted & Ground Truth)"
             elif int(id) == predicted_author:
+                return f"Candidate {int(id)+1} (Predicted)"
             elif int(id) == ground_truth_author:
+                return f"Candidate {int(id)+1} (Ground Truth)"
             else:
+                return f"Candidate {int(id)+1}"
         else:
+            return f"Candidate {int(id)+1}"
     else:
         return f"Background Author {bg_id+1}"
 def create_html(texts, llm_spans_list, gram_spans_list, selected_feature_llm, selected_feature_g2v, short=None, background = False, predicted_author=None, ground_truth_author=None):
     html = []
     for i, (label, txt) in enumerate(texts):
+        print(i, label, txt[:30])
         label = get_label(label, predicted_author, ground_truth_author,  i) if background else get_label(label, predicted_author, ground_truth_author)
         combined = highlight_both_spans(txt, llm_spans_list[i], gram_spans_list[i])
         notice = ""

utils/interp_space_utils.py CHANGED Viewed

@@ -521,7 +521,8 @@ def compute_clusters_style_representation_3(
     cluster_label_clm_name: str = 'authorID',
     max_num_feats: int = 10,
     max_num_documents_per_author=3,
-    max_num_authors=5
     ):
     print(f"Computing style representation for visible clusters: {len(cluster_ids)}")
@@ -537,8 +538,8 @@ def compute_clusters_style_representation_3(
     features = identify_style_features(author_texts, author_names, max_num_feats=max_num_feats)
     # STEP 2: Prepare author pool for span extraction
-    span_df = background_corpus_df.iloc[:4]
-    author_names = span_df[cluster_label_clm_name].tolist()[:4]
     print(f"Number of authors for span detection : {len(span_df)}")
     print(author_names)
     spans_by_author = extract_all_spans(span_df, features, cluster_label_clm_name)

     cluster_label_clm_name: str = 'authorID',
     max_num_feats: int = 10,
     max_num_documents_per_author=3,
+    max_num_authors=5,
+    max_authors_for_span_extraction=7
     ):
     print(f"Computing style representation for visible clusters: {len(cluster_ids)}")
     features = identify_style_features(author_texts, author_names, max_num_feats=max_num_feats)
     # STEP 2: Prepare author pool for span extraction
+    span_df = background_corpus_df.iloc[:max_authors_for_span_extraction]
+    author_names = span_df[cluster_label_clm_name].tolist()[:max_authors_for_span_extraction]
     print(f"Number of authors for span detection : {len(span_df)}")
     print(author_names)
     spans_by_author = extract_all_spans(span_df, features, cluster_label_clm_name)

utils/llm_feat_utils.py CHANGED Viewed

@@ -3,6 +3,7 @@ import os
 import hashlib
 import time
 from json import JSONDecodeError
 CACHE_DIR = "datasets/feature_spans_cache"
 os.makedirs(CACHE_DIR, exist_ok=True)
@@ -66,10 +67,12 @@ def generate_feature_spans_with_retries(client, text: str, features: list[str])
     for attempt in range(MAX_ATTEMPTS):
         try:
             response_str = generate_feature_spans(client, text, features)
             result = json.loads(response_str)
             return result
         except (JSONDecodeError, ValueError) as e:
             print(f"Attempt {attempt+1} failed: {e}")
             if attempt < MAX_ATTEMPTS - 1:
                 wait_sec = WAIT_SECONDS * (2 ** attempt)
                 print(f"Retrying after {wait_sec} seconds...")

 import hashlib
 import time
 from json import JSONDecodeError
+import traceback
 CACHE_DIR = "datasets/feature_spans_cache"
 os.makedirs(CACHE_DIR, exist_ok=True)
     for attempt in range(MAX_ATTEMPTS):
         try:
             response_str = generate_feature_spans(client, text, features)
+            print(response_str)
             result = json.loads(response_str)
             return result
         except (JSONDecodeError, ValueError) as e:
             print(f"Attempt {attempt+1} failed: {e}")
+            traceback.print_exc()
             if attempt < MAX_ATTEMPTS - 1:
                 wait_sec = WAIT_SECONDS * (2 ** attempt)
                 print(f"Retrying after {wait_sec} seconds...")

utils/visualizations.py CHANGED Viewed

@@ -225,7 +225,7 @@ def format_g2v_features_for_display(g2v_features_with_scores):
                     z_score = float(z_score)
                 # Create display string with z-score
-                display_string = f"{feature_name} | Z={z_score:.2f}]"
                 display_choices.append(display_string)
                 original_values.append(feature_name)
         else:

                     z_score = float(z_score)
                 # Create display string with z-score
+                display_string = f"{feature_name} | [Z={z_score:.2f}]"
                 display_choices.append(display_string)
                 original_values.append(feature_name)
         else: