Spaces:

ExplainabiliyForAATeam
/

explainability-tool-for-aa

Running

App Files Files Community

Anisha Bhatnagar committed 19 days ago

Commit

ad938d7

1 Parent(s): d1e7150

Peter's changes: g2v filtering to ensure displayed features appear in the texts.

Browse files

Files changed (1) hide show

utils/interp_space_utils.py +72 -2

utils/interp_space_utils.py CHANGED Viewed

@@ -667,14 +667,84 @@ def compute_clusters_g2v_representation(
     # Filter in only features that are present in selected_authors
     selected_authors = {'Mystery author', 'Candidate Author 1', 'Candidate Author 2', 'Candidate Author 3'}.intersection(set(author_ids))
     # Filter in only features that are present in selected_authors
     selected_authors_g2v_data = background_corpus_df[background_corpus_df['authorID'].isin(selected_authors)][features_clm_name].tolist()
     filtered_features = []
     for feature, score, z_score in top_g2v_feats:
         # Check if the feature has a non-zero value in all of the selected authors
-        if all(author_g2v_feats.get(feature, 0) > 0 for author_g2v_feats in selected_authors_g2v_data):
-            filtered_features.append((feature, score, z_score)) # Only return feature and z-score
     print('Filtered G2V features: ', [(f[0], f[2]) for f in filtered_features])  # Print feature names and z-scores

     # Filter in only features that are present in selected_authors
     selected_authors = {'Mystery author', 'Candidate Author 1', 'Candidate Author 2', 'Candidate Author 3'}.intersection(set(author_ids))
+    # DEBUG: Print what we're actually working with
+    print(f"[DEBUG] author_ids parameter: {author_ids}")
+    print(f"[DEBUG] Hardcoded selected_authors set: {{'Mystery author', 'Candidate Author 1', 'Candidate Author 2', 'Candidate Author 3'}}")
+    print(f"[DEBUG] Intersection result: {selected_authors}")
+    print(f"[DEBUG] Is selected_authors empty? {len(selected_authors) == 0}")
     # Filter in only features that are present in selected_authors
     selected_authors_g2v_data = background_corpus_df[background_corpus_df['authorID'].isin(selected_authors)][features_clm_name].tolist()
+    print(f"[DEBUG] selected_authors_g2v_data length: {len(selected_authors_g2v_data)}")
+    print(f"[DEBUG] selected_authors_g2v_data content: {selected_authors_g2v_data}")
+    # Get the actual text documents for the selected authors to verify feature presence
+    selected_authors_docs = background_corpus_df[background_corpus_df['authorID'].isin(selected_authors)]['fullText'].tolist()
+    print(f"[DEBUG] Found {len(selected_authors_docs)} documents for selected authors")
+    # Import find_feature_spans for text-based feature verification
+    try:
+        from gram2vec.feature_locator import find_feature_spans
+        print("[DEBUG] Successfully imported find_feature_spans")
+    except ImportError:
+        print("[WARNING] Could not import find_feature_spans, falling back to vector-based filtering")
+        find_feature_spans = None
     filtered_features = []
     for feature, score, z_score in top_g2v_feats:
+        # DEBUG: Print what we're checking for this feature
+        print(f"[DEBUG] Checking feature: {feature}")
+        print(f"[DEBUG] Feature score: {score}, z_score: {z_score}")
         # Check if the feature has a non-zero value in all of the selected authors
+        feature_presence = []
+        for i, author_g2v_feats in enumerate(selected_authors_g2v_data):
+            feature_value = author_g2v_feats.get(feature, 0)
+            feature_presence.append(feature_value)
+            print(f"[DEBUG] Author {i} has feature '{feature}' = {feature_value}")
+        print(f"[DEBUG] All feature values: {feature_presence}")
+        print(f"[DEBUG] All values > 0? {[v > 0 for v in feature_presence]}")
+        print(f"[DEBUG] All values > 0? {all(v > 0 for v in feature_presence)}")
+        # First check: feature must be present in Gram2Vec vectors
+        vector_present = all(author_g2v_feats.get(feature, 0) > 0 for author_g2v_feats in selected_authors_g2v_data)
+        # Second check: feature must be present in actual text documents
+        text_present = True
+        if find_feature_spans and selected_authors_docs:
+            try:
+                # Check if feature appears in at least one document from each selected author
+                for i, doc in enumerate(selected_authors_docs):
+                    if isinstance(doc, list):
+                        doc_text = '\n\n'.join(doc)
+                    else:
+                        doc_text = str(doc)
+                    spans = find_feature_spans(doc_text, feature)
+                    if not spans:  # No spans found in this document
+                        print(f"[DEBUG] ✗ Feature '{feature}' not found in document {i} of selected author")
+                        text_present = False
+                        break
+                    else:
+                        print(f"[DEBUG] ✓ Feature '{feature}' found in document {i} with {len(spans)} spans")
+            except Exception as e:
+                print(f"[WARNING] Error checking text presence for feature '{feature}': {e}")
+                # Fall back to vector-based filtering if text checking fails
+                text_present = vector_present
+        # Feature must pass BOTH checks
+        if vector_present and text_present:
+            filtered_features.append((feature, score, z_score))
+            print(f"[DEBUG] ✓ Feature '{feature}' PASSED both vector and text checks")
+        else:
+            if not vector_present:
+                print(f"[DEBUG] ✗ Feature '{feature}' FAILED vector check")
+            if not text_present:
+                print(f"[DEBUG] ✗ Feature '{feature}' FAILED text check")
+            print(f"[DEBUG] ✗ Feature '{feature}' FAILED the filter")
     print('Filtered G2V features: ', [(f[0], f[2]) for f in filtered_features])  # Print feature names and z-scores