Milad Alshomary committed
Commit 87e3b98 · 1 Parent(s): 74947b9
Files changed (4)
  1. app.py +1 -1
  2. config/config.yaml +2 -2
  3. utils/interp_space_utils.py +34 -12
  4. utils/ui.py +6 -9
app.py CHANGED
@@ -58,7 +58,7 @@ def app(share=False, use_cluster_feats=False):
     instances, instance_ids = get_instances(cfg['instances_to_explain_path'])
 
     interp = load_interp_space(cfg)
-    clustered_authors_df = interp['clustered_authors_df'][:1000]
+    clustered_authors_df = interp['clustered_authors_df'][:200]
     clustered_authors_df['fullText'] = clustered_authors_df['fullText'].map(lambda l: l[:3]) # Take at most 3 texts per author
 
     with gr.Blocks(title="Author Attribution Explainability Tool") as demo:
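For orientation, a minimal sketch (toy data, hypothetical author IDs) of what the two slicing operations in this hunk do: `[:200]` keeps only the first 200 author rows, and the `map` caps each author's text list at three entries.

```python
import pandas as pd

# Hypothetical stand-in for interp['clustered_authors_df']
clustered_authors_df = pd.DataFrame({
    'authorID': ['a1', 'a2'],
    'fullText': [['t1', 't2', 't3', 't4'], ['t5']],
})

clustered_authors_df = clustered_authors_df[:200]  # row slice: at most 200 authors
clustered_authors_df['fullText'] = clustered_authors_df['fullText'].map(lambda l: l[:3])

print(clustered_authors_df['fullText'].tolist())  # [['t1', 't2', 't3'], ['t5']]
```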
config/config.yaml CHANGED
@@ -1,8 +1,8 @@
 # config.yaml
 instances_to_explain_path: "./datasets/hrs_explanations.json"
 instances_to_explain_url: "https://huggingface.co/datasets/miladalsh/explanation_tool_files/resolve/main/hrs_explanations_luar_clusters_18_balanced.json?/download=true"
-interp_space_path: "./datasets/sentence_luar_interp_clusters_25/"
-interp_space_url: "https://huggingface.co/datasets/miladalsh/explanation_tool_files/resolve/main/sentence_luar_interp_clusters_25.zip?download=true"
+interp_space_path: "./datasets/sentence_luar_interp_clusters_18.zip/"
+interp_space_url: "https://huggingface.co/datasets/miladalsh/explanation_tool_files/resolve/main/luar_interp_space_cluster_18.zip?download=true"
 gram2vec_feats_path: "./datasets/gram2vec_feats.csv"
 gram2vec_feats_url: "https://huggingface.co/datasets/miladalsh/explanation_tool_files/resolve/main/gram2vec_feats.csv?download=true"
 
utils/interp_space_utils.py CHANGED
@@ -20,6 +20,7 @@ from utils.llm_feat_utils import generate_feature_spans_cached
 from collections import Counter
 import numpy as np
 from sklearn.metrics.pairwise import cosine_similarity
+from sklearn.decomposition import PCA
 
 CACHE_DIR = "datasets/embeddings_cache"
 ZOOM_CACHE = "datasets/zoom_cache/features_cache.json"
@@ -140,7 +141,7 @@ def instance_to_df(instance, predicted_author=None, ground_truth_author=None):
     return task_authos_df
 
 
-def generate_style_embedding(background_corpus_df: pd.DataFrame, text_clm: str, model_name: str) -> pd.DataFrame:
+def generate_style_embedding(background_corpus_df: pd.DataFrame, text_clm: str, model_name: str, dimensionality_reduction: bool = True, dimensions: int = 100) -> pd.DataFrame:
     """
     Generates style embeddings for documents in a background corpus using a specified model.
     If a row in `text_clm` contains a list of strings, the final embedding for that row
@@ -218,22 +219,33 @@ def generate_style_embedding(background_corpus_df: pd.DataFrame, text_clm: str,
     embeddings = model.encode(texts, show_progress_bar=True)
     final_embeddings = list(embeddings)
 
-    # Create a clean column name from the model name
-    col_name = f'{model_name.split("/")[-1]}_style_embedding'
-    background_corpus_df[col_name] = final_embeddings
+    # Apply PCA over the embeddings to reduce the dimentionality
+    if dimensionality_reduction:
+        if len(final_embeddings) > 0 and len(final_embeddings[0]) > dimensions: # Only apply PCA if embeddings exist and dim > dimensions
+            pca = PCA(n_components=dimensions)
+            final_embeddings = pca.fit_transform(final_embeddings)
 
-    return background_corpus_df
+    return list(final_embeddings)
 
 # ── wrapper with caching ───────────────────────────────────────
 def cached_generate_style_embedding(background_corpus_df: pd.DataFrame,
                                     text_clm: str,
-                                    model_name: str) -> pd.DataFrame:
+                                    model_name: str,
+                                    task_authors_df: pd.DataFrame = None) -> pd.DataFrame:
     """
     Wraps `generate_style_embedding`, caching its output in pickle files
     keyed by an MD5 of (model_name + text list). If the cache exists,
     loads and returns it instead of recomputing.
     """
 
+    if task_authors_df is not None:
+        print(f"concatenating task authors and background corpus authors")
+        print(f"Number of task authors: {len(task_authors_df)}")
+        print(f"task authors author_ids: {task_authors_df.authorID.tolist()}")
+        print(f"Number of background corpus authors: {len(background_corpus_df)}")
+        background_corpus_df = pd.concat([task_authors_df, background_corpus_df])
+        print(f"Number of authors after concatenation: {len(background_corpus_df)}")
+
     # Gather the input texts (preserves list-of-strings if any)
     texts = background_corpus_df[text_clm].fillna("").tolist()
 
@@ -255,12 +267,22 @@ def cached_generate_style_embedding(background_corpus_df: pd.DataFrame,
         with open(cache_path, "rb") as f:
             return pickle.load(f)
 
-    # Otherwise, compute, cache, and return
-    df_with_emb = generate_style_embedding(background_corpus_df, text_clm, model_name)
-    print(f"Computing embeddings for {model_name} on column '{text_clm}', saving to {cache_path}")
-    with open(cache_path, "wb") as f:
-        pickle.dump(df_with_emb, f)
-    return df_with_emb
+    else:
+        # Otherwise, compute, cache, and return
+        print(f"Computing embeddings for {model_name} on column '{text_clm}', saving to {cache_path}")
+        task_and_background_embeddings = generate_style_embedding(background_corpus_df, text_clm, model_name, dimensionality_reduction=True)
+        # Create a clean column name from the model name
+        col_name = f'{model_name.split("/")[-1]}_style_embedding'
+        background_corpus_df[col_name] = task_and_background_embeddings
+
+        with open(cache_path, "wb") as f:
+            pickle.dump(background_corpus_df, f)
+
+    if task_authors_df is not None:
+        task_authors_df = background_corpus_df[background_corpus_df.authorID.isin(task_authors_df.authorID.tolist())]
+        background_corpus_df = background_corpus_df[~background_corpus_df.authorID.isin(task_authors_df.authorID.tolist())]
+
+    return background_corpus_df, task_authors_df
 
 def get_style_feats_distribution(documentIDs, style_feats_dict):
     style_feats = []
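The substantive change above is the PCA step: after encoding, the embeddings are reduced to at most `dimensions` components, but only when the vectors are wider than the target. A self-contained sketch of that guard-then-reduce logic, using random vectors in place of real `model.encode` output (the shapes and the 100-dimension default mirror the diff; the data is made up):

```python
import numpy as np
from sklearn.decomposition import PCA

dimensions = 100  # target size, matching the new `dimensions` default

# Stand-in for model.encode(texts): 500 documents, 512-dim embeddings
final_embeddings = list(np.random.rand(500, 512))

# Only reduce when embeddings exist and are wider than the target
if len(final_embeddings) > 0 and len(final_embeddings[0]) > dimensions:
    pca = PCA(n_components=dimensions)
    final_embeddings = list(pca.fit_transform(np.vstack(final_embeddings)))

print(len(final_embeddings), final_embeddings[0].shape)  # 500 (100,)
```

One design consequence worth noting: the projection is fit per call, so vectors reduced in separate calls live in different PCA bases. That is presumably why the wrapper now embeds the task authors and the background corpus in a single pass instead of two.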
utils/ui.py CHANGED
@@ -102,7 +102,6 @@ def update_task_display(mode, iid, instances, background_df, mystery_file, cand1
         #create a dataframe of the task authors
         task_authors_df = instance_to_df(instances[iid], predicted_author=predicted_author, ground_truth_author=ground_truth_author)
         print(f"\n\n\n ----> Loaded task {iid} with {len(task_authors_df)} authors\n\n\n")
-        print(task_authors_df)
     else:
         header_html = "<h3>Custom Uploaded Task</h3>"
         mystery_txt = read_txt(mystery_file)
@@ -119,15 +118,15 @@ def update_task_display(mode, iid, instances, background_df, mystery_file, cand1
             'a2_fullText': c3_txt
         }
         task_authors_df = instance_to_df(custom_task_instance)
-        print(task_authors_df)
 
-    print(f"Generating embeddings for {model_name} on task authors")
-    task_authors_df = cached_generate_style_embedding(task_authors_df, 'fullText', model_name)
-    print("Task authors after embedding generation:")
-    print(task_authors_df)
+    #print(f"Generating embeddings for {model_name} on task authors")
+    # task_authors_df = cached_generate_style_embedding(task_authors_df, 'fullText', model_name)
+    # print("Task authors after embedding generation:")
+    # print(task_authors_df)
+
     # Generate the new embedding of all the background_df authors
     print(f"Generating embeddings for {model_name} on background corpus")
-    background_df = cached_generate_style_embedding(background_df, 'fullText', model_name)
+    background_df, task_authors_df = cached_generate_style_embedding(background_df, 'fullText', model_name, task_authors_df=task_authors_df)
     print(f"Generated embeddings for {len(background_df)} texts using model '{model_name}'")
 
     # computing g2v features
@@ -137,8 +136,6 @@ def update_task_display(mode, iid, instances, background_df, mystery_file, cand1
     task_authors_df['g2v_vector'] = task_authors_g2v
     print(f"Gram2Vec feature generation complete")
 
-    print(background_df.columns)
-
    if mode != "Predefined HRS Task":
         # Computing predicted author by checking pairwise cosine similarity over luar embeddings
         col_name = f'{model_name.split("/")[-1]}_style_embedding'
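The updated call in `update_task_display` now receives a `(background_df, task_authors_df)` tuple: both author sets are embedded together and then split back apart by `authorID`. A rough sketch of that concat-and-split pattern with toy frames (column names from the diff; the data is hypothetical):

```python
import pandas as pd

task_authors_df = pd.DataFrame({'authorID': ['mystery', 'cand_0'], 'fullText': ['...', '...']})
background_df = pd.DataFrame({'authorID': ['bg_0', 'bg_1'], 'fullText': ['...', '...']})

# Concatenate so both sets are encoded (and PCA-reduced) in one shared space
combined = pd.concat([task_authors_df, background_df])
# ... the embedding column would be computed over `combined` here ...

# Split back out by authorID, as the updated wrapper does before returning
task_ids = task_authors_df.authorID.tolist()
task_authors_df = combined[combined.authorID.isin(task_ids)]
background_df = combined[~combined.authorID.isin(task_ids)]
```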