Anisha Bhatnagar
committed on
Commit
·
ad938d7
1
Parent(s):
d1e7150
peter's changes : g2v filtering to ensure displayed features appear in the texts.
Browse files
- utils/interp_space_utils.py +72 -2
utils/interp_space_utils.py
CHANGED
@@ -667,14 +667,84 @@ def compute_clusters_g2v_representation(
|
|
667 |
|
668 |
# Filter in only features that are present in selected_authors
|
669 |
selected_authors = {'Mystery author', 'Candidate Author 1', 'Candidate Author 2', 'Candidate Author 3'}.intersection(set(author_ids))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
670 |
# Filter in only features that are present in selected_authors
|
671 |
selected_authors_g2v_data = background_corpus_df[background_corpus_df['authorID'].isin(selected_authors)][features_clm_name].tolist()
|
672 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
673 |
filtered_features = []
|
674 |
for feature, score, z_score in top_g2v_feats:
|
|
|
|
|
|
|
|
|
675 |
# Check if the feature has a non-zero value in all of the selected authors
|
676 |
-
|
677 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
678 |
|
679 |
|
680 |
print('Filtered G2V features: ', [(f[0], f[2]) for f in filtered_features]) # Print feature names and z-scores
|
|
|
667 |
|
668 |
# Filter in only features that are present in selected_authors
|
669 |
selected_authors = {'Mystery author', 'Candidate Author 1', 'Candidate Author 2', 'Candidate Author 3'}.intersection(set(author_ids))
|
670 |
+
|
671 |
+
# DEBUG: Print what we're actually working with
|
672 |
+
print(f"[DEBUG] author_ids parameter: {author_ids}")
|
673 |
+
print(f"[DEBUG] Hardcoded selected_authors set: {{'Mystery author', 'Candidate Author 1', 'Candidate Author 2', 'Candidate Author 3'}}")
|
674 |
+
print(f"[DEBUG] Intersection result: {selected_authors}")
|
675 |
+
print(f"[DEBUG] Is selected_authors empty? {len(selected_authors) == 0}")
|
676 |
+
|
677 |
# Filter in only features that are present in selected_authors
|
678 |
selected_authors_g2v_data = background_corpus_df[background_corpus_df['authorID'].isin(selected_authors)][features_clm_name].tolist()
|
679 |
|
680 |
+
print(f"[DEBUG] selected_authors_g2v_data length: {len(selected_authors_g2v_data)}")
|
681 |
+
print(f"[DEBUG] selected_authors_g2v_data content: {selected_authors_g2v_data}")
|
682 |
+
|
683 |
+
# Get the actual text documents for the selected authors to verify feature presence
|
684 |
+
selected_authors_docs = background_corpus_df[background_corpus_df['authorID'].isin(selected_authors)]['fullText'].tolist()
|
685 |
+
print(f"[DEBUG] Found {len(selected_authors_docs)} documents for selected authors")
|
686 |
+
|
687 |
+
# Import find_feature_spans for text-based feature verification
|
688 |
+
try:
|
689 |
+
from gram2vec.feature_locator import find_feature_spans
|
690 |
+
print("[DEBUG] Successfully imported find_feature_spans")
|
691 |
+
except ImportError:
|
692 |
+
print("[WARNING] Could not import find_feature_spans, falling back to vector-based filtering")
|
693 |
+
find_feature_spans = None
|
694 |
+
|
695 |
filtered_features = []
|
696 |
for feature, score, z_score in top_g2v_feats:
|
697 |
+
# DEBUG: Print what we're checking for this feature
|
698 |
+
print(f"[DEBUG] Checking feature: {feature}")
|
699 |
+
print(f"[DEBUG] Feature score: {score}, z_score: {z_score}")
|
700 |
+
|
701 |
# Check if the feature has a non-zero value in all of the selected authors
|
702 |
+
feature_presence = []
|
703 |
+
for i, author_g2v_feats in enumerate(selected_authors_g2v_data):
|
704 |
+
feature_value = author_g2v_feats.get(feature, 0)
|
705 |
+
feature_presence.append(feature_value)
|
706 |
+
print(f"[DEBUG] Author {i} has feature '{feature}' = {feature_value}")
|
707 |
+
|
708 |
+
print(f"[DEBUG] All feature values: {feature_presence}")
|
709 |
+
print(f"[DEBUG] All values > 0? {[v > 0 for v in feature_presence]}")
|
710 |
+
print(f"[DEBUG] All values > 0? {all(v > 0 for v in feature_presence)}")
|
711 |
+
|
712 |
+
# First check: feature must be present in Gram2Vec vectors
|
713 |
+
vector_present = all(author_g2v_feats.get(feature, 0) > 0 for author_g2v_feats in selected_authors_g2v_data)
|
714 |
+
|
715 |
+
# Second check: feature must be present in actual text documents
|
716 |
+
text_present = True
|
717 |
+
if find_feature_spans and selected_authors_docs:
|
718 |
+
try:
|
719 |
+
# Check if feature appears in at least one document from each selected author
|
720 |
+
for i, doc in enumerate(selected_authors_docs):
|
721 |
+
if isinstance(doc, list):
|
722 |
+
doc_text = '\n\n'.join(doc)
|
723 |
+
else:
|
724 |
+
doc_text = str(doc)
|
725 |
+
|
726 |
+
spans = find_feature_spans(doc_text, feature)
|
727 |
+
if not spans: # No spans found in this document
|
728 |
+
print(f"[DEBUG] β Feature '{feature}' not found in document {i} of selected author")
|
729 |
+
text_present = False
|
730 |
+
break
|
731 |
+
else:
|
732 |
+
print(f"[DEBUG] β Feature '{feature}' found in document {i} with {len(spans)} spans")
|
733 |
+
except Exception as e:
|
734 |
+
print(f"[WARNING] Error checking text presence for feature '{feature}': {e}")
|
735 |
+
# Fall back to vector-based filtering if text checking fails
|
736 |
+
text_present = vector_present
|
737 |
+
|
738 |
+
# Feature must pass BOTH checks
|
739 |
+
if vector_present and text_present:
|
740 |
+
filtered_features.append((feature, score, z_score))
|
741 |
+
print(f"[DEBUG] β Feature '{feature}' PASSED both vector and text checks")
|
742 |
+
else:
|
743 |
+
if not vector_present:
|
744 |
+
print(f"[DEBUG] β Feature '{feature}' FAILED vector check")
|
745 |
+
if not text_present:
|
746 |
+
print(f"[DEBUG] β Feature '{feature}' FAILED text check")
|
747 |
+
print(f"[DEBUG] β Feature '{feature}' FAILED the filter")
|
748 |
|
749 |
|
750 |
print('Filtered G2V features: ', [(f[0], f[2]) for f in filtered_features]) # Print feature names and z-scores
|