Milad Alshomary committed
Commit 87e3b98 · 1 Parent(s): 74947b9
updates

Files changed:
- app.py +1 -1
- config/config.yaml +2 -2
- utils/interp_space_utils.py +34 -12
- utils/ui.py +6 -9
app.py CHANGED
@@ -58,7 +58,7 @@ def app(share=False, use_cluster_feats=False):
     instances, instance_ids = get_instances(cfg['instances_to_explain_path'])

     interp = load_interp_space(cfg)
-    clustered_authors_df = interp['clustered_authors_df'][:
+    clustered_authors_df = interp['clustered_authors_df'][:200]
     clustered_authors_df['fullText'] = clustered_authors_df['fullText'].map(lambda l: l[:3]) # Take at most 3 texts per author

     with gr.Blocks(title="Author Attribution Explainability Tool") as demo:
config/config.yaml CHANGED
@@ -1,8 +1,8 @@
 # config.yaml
 instances_to_explain_path: "./datasets/hrs_explanations.json"
 instances_to_explain_url: "https://huggingface.co/datasets/miladalsh/explanation_tool_files/resolve/main/hrs_explanations_luar_clusters_18_balanced.json?/download=true"
-interp_space_path: "./datasets/
-interp_space_url: "https://huggingface.co/datasets/miladalsh/explanation_tool_files/resolve/main/
+interp_space_path: "./datasets/sentence_luar_interp_clusters_18.zip/"
+interp_space_url: "https://huggingface.co/datasets/miladalsh/explanation_tool_files/resolve/main/luar_interp_space_cluster_18.zip?download=true"
 gram2vec_feats_path: "./datasets/gram2vec_feats.csv"
 gram2vec_feats_url: "https://huggingface.co/datasets/miladalsh/explanation_tool_files/resolve/main/gram2vec_feats.csv?download=true"
utils/interp_space_utils.py CHANGED
@@ -20,6 +20,7 @@ from utils.llm_feat_utils import generate_feature_spans_cached
 from collections import Counter
 import numpy as np
 from sklearn.metrics.pairwise import cosine_similarity
+from sklearn.decomposition import PCA

 CACHE_DIR = "datasets/embeddings_cache"
 ZOOM_CACHE = "datasets/zoom_cache/features_cache.json"
@@ -140,7 +141,7 @@ def instance_to_df(instance, predicted_author=None, ground_truth_author=None):
     return task_authos_df


-def generate_style_embedding(background_corpus_df: pd.DataFrame, text_clm: str, model_name: str) -> pd.DataFrame:
+def generate_style_embedding(background_corpus_df: pd.DataFrame, text_clm: str, model_name: str, dimensionality_reduction: bool = True, dimensions: int = 100) -> pd.DataFrame:
     """
     Generates style embeddings for documents in a background corpus using a specified model.
     If a row in `text_clm` contains a list of strings, the final embedding for that row
@@ -218,22 +219,33 @@ def generate_style_embedding(background_corpus_df: pd.DataFrame, text_clm: str,
     embeddings = model.encode(texts, show_progress_bar=True)
     final_embeddings = list(embeddings)

-    #
-
-
+    # Apply PCA over the embeddings to reduce the dimensionality
+    if dimensionality_reduction:
+        if len(final_embeddings) > 0 and len(final_embeddings[0]) > dimensions: # Only apply PCA if embeddings exist and dim > dimensions
+            pca = PCA(n_components=dimensions)
+            final_embeddings = pca.fit_transform(final_embeddings)

-    return
+    return list(final_embeddings)

 # ── wrapper with caching ───────────────────────────────────────
 def cached_generate_style_embedding(background_corpus_df: pd.DataFrame,
                                     text_clm: str,
-                                    model_name: str
+                                    model_name: str,
+                                    task_authors_df: pd.DataFrame = None) -> pd.DataFrame:
     """
     Wraps `generate_style_embedding`, caching its output in pickle files
     keyed by an MD5 of (model_name + text list). If the cache exists,
     loads and returns it instead of recomputing.
     """

+    if task_authors_df is not None:
+        print(f"concatenating task authors and background corpus authors")
+        print(f"Number of task authors: {len(task_authors_df)}")
+        print(f"task authors author_ids: {task_authors_df.authorID.tolist()}")
+        print(f"Number of background corpus authors: {len(background_corpus_df)}")
+        background_corpus_df = pd.concat([task_authors_df, background_corpus_df])
+        print(f"Number of authors after concatenation: {len(background_corpus_df)}")
+
     # Gather the input texts (preserves list-of-strings if any)
     texts = background_corpus_df[text_clm].fillna("").tolist()

@@ -255,12 +267,22 @@ def cached_generate_style_embedding(background_corpus_df: pd.DataFrame,
         with open(cache_path, "rb") as f:
             return pickle.load(f)

-
-
-
-
-
-
+    else:
+        # Otherwise, compute, cache, and return
+        print(f"Computing embeddings for {model_name} on column '{text_clm}', saving to {cache_path}")
+        task_and_background_embeddings = generate_style_embedding(background_corpus_df, text_clm, model_name, dimensionality_reduction=True)
+        # Create a clean column name from the model name
+        col_name = f'{model_name.split("/")[-1]}_style_embedding'
+        background_corpus_df[col_name] = task_and_background_embeddings
+
+        with open(cache_path, "wb") as f:
+            pickle.dump(background_corpus_df, f)
+
+    if task_authors_df is not None:
+        task_authors_df = background_corpus_df[background_corpus_df.authorID.isin(task_authors_df.authorID.tolist())]
+        background_corpus_df = background_corpus_df[~background_corpus_df.authorID.isin(task_authors_df.authorID.tolist())]
+
+    return background_corpus_df, task_authors_df

 def get_style_feats_distribution(documentIDs, style_feats_dict):
     style_feats = []
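The PCA step added to generate_style_embedding is the core of this file's change. Below is a minimal, self-contained sketch of that reduction in isolation: reduce_embeddings is a hypothetical helper written for illustration, the random vectors stand in for model.encode output, and the min(...) cap on n_components is an extra safeguard in the sketch (scikit-learn requires n_components <= n_samples), not something the diff itself adds.

# Sketch (not repo code): mirrors the dimensionality-reduction guard added above.
import numpy as np
from sklearn.decomposition import PCA

def reduce_embeddings(final_embeddings, dimensions=100):
    # Only reduce if embeddings exist and are wider than the target size.
    if len(final_embeddings) > 0 and len(final_embeddings[0]) > dimensions:
        # PCA needs n_components <= n_samples, so cap it for small corpora.
        pca = PCA(n_components=min(dimensions, len(final_embeddings)))
        final_embeddings = pca.fit_transform(np.vstack(final_embeddings))
    return list(final_embeddings)

# Stand-in for model.encode(texts): 300 documents with 512-dim style vectors.
embeddings = list(np.random.rand(300, 512))
reduced = reduce_embeddings(embeddings)
print(len(reduced), reduced[0].shape)  # 300 (100,)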
utils/ui.py CHANGED
@@ -102,7 +102,6 @@ def update_task_display(mode, iid, instances, background_df, mystery_file, cand1
         #create a dataframe of the task authors
         task_authors_df = instance_to_df(instances[iid], predicted_author=predicted_author, ground_truth_author=ground_truth_author)
         print(f"\n\n\n ----> Loaded task {iid} with {len(task_authors_df)} authors\n\n\n")
-        print(task_authors_df)
     else:
         header_html = "<h3>Custom Uploaded Task</h3>"
         mystery_txt = read_txt(mystery_file)
@@ -119,15 +118,15 @@ def update_task_display(mode, iid, instances, background_df, mystery_file, cand1
             'a2_fullText': c3_txt
         }
         task_authors_df = instance_to_df(custom_task_instance)
-        print(task_authors_df)

-    print(f"Generating embeddings for {model_name} on task authors")
-    task_authors_df = cached_generate_style_embedding(task_authors_df, 'fullText', model_name)
-    print("Task authors after embedding generation:")
-    print(task_authors_df)
+    #print(f"Generating embeddings for {model_name} on task authors")
+    # task_authors_df = cached_generate_style_embedding(task_authors_df, 'fullText', model_name)
+    # print("Task authors after embedding generation:")
+    # print(task_authors_df)
+
     # Generate the new embedding of all the background_df authors
     print(f"Generating embeddings for {model_name} on background corpus")
-    background_df = cached_generate_style_embedding(background_df, 'fullText', model_name)
+    background_df, task_authors_df = cached_generate_style_embedding(background_df, 'fullText', model_name, task_authors_df=task_authors_df)
     print(f"Generated embeddings for {len(background_df)} texts using model '{model_name}'")

     # computing g2v features
@@ -137,8 +136,6 @@ def update_task_display(mode, iid, instances, background_df, mystery_file, cand1
     task_authors_df['g2v_vector'] = task_authors_g2v
     print(f"Gram2Vec feature generation complete")

-    print(background_df.columns)
-
     if mode != "Predefined HRS Task":
         # Computing predicted author by checking pairwise cosine similarity over luar embeddings
         col_name = f'{model_name.split("/")[-1]}_style_embedding'
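The ui.py change replaces two separate embedding calls with a single combined call that returns both frames. A standalone sketch of that concatenate / embed-once / split-by-authorID pattern follows; embed_frame, the author IDs, and the column name are illustrative stand-ins, not the repo's actual values.

# Sketch (not repo code): the pattern cached_generate_style_embedding now follows.
import numpy as np
import pandas as pd

def embed_frame(df, text_clm):
    # Stand-in for generate_style_embedding: one random 100-dim vector per row.
    return [np.random.rand(100) for _ in range(len(df))]

task_authors_df = pd.DataFrame({
    "authorID": ["mystery_author", "candidate_1"],
    "fullText": [["the disputed document"], ["a known writing sample"]],
})
background_df = pd.DataFrame({
    "authorID": ["bg_author_1", "bg_author_2"],
    "fullText": [["a first background text"], ["a second background text"]],
})

# Embed task authors and background corpus together in one shared pass.
combined_df = pd.concat([task_authors_df, background_df])
combined_df["style_embedding"] = embed_frame(combined_df, "fullText")

# Split back apart by authorID, as the wrapper does before returning.
task_ids = task_authors_df.authorID.tolist()
task_authors_df = combined_df[combined_df.authorID.isin(task_ids)]
background_df = combined_df[~combined_df.authorID.isin(task_ids)]
print(len(task_authors_df), len(background_df))  # 2 2

Embedding both frames in one pass keeps the task authors and the background corpus in the same (PCA-reduced) space, which is what makes the pairwise cosine-similarity comparison mentioned in the last hunk meaningful across the two frames.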