Anisha Bhatnagar committed on
Commit
ad938d7
·
1 Parent(s): d1e7150

Peter's changes: g2v filtering to ensure displayed features appear in the texts.

Browse files
Files changed (1) hide show
  1. utils/interp_space_utils.py +72 -2
utils/interp_space_utils.py CHANGED
@@ -667,14 +667,84 @@ def compute_clusters_g2v_representation(
667
 
668
  # Filter in only features that are present in selected_authors
669
  selected_authors = {'Mystery author', 'Candidate Author 1', 'Candidate Author 2', 'Candidate Author 3'}.intersection(set(author_ids))
 
 
 
 
 
 
 
670
  # Filter in only features that are present in selected_authors
671
  selected_authors_g2v_data = background_corpus_df[background_corpus_df['authorID'].isin(selected_authors)][features_clm_name].tolist()
672
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
673
  filtered_features = []
674
  for feature, score, z_score in top_g2v_feats:
 
 
 
 
675
  # Check if the feature has a non-zero value in all of the selected authors
676
- if all(author_g2v_feats.get(feature, 0) > 0 for author_g2v_feats in selected_authors_g2v_data):
677
- filtered_features.append((feature, score, z_score)) # Only return feature and z-score
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
678
 
679
 
680
  print('Filtered G2V features: ', [(f[0], f[2]) for f in filtered_features]) # Print feature names and z-scores
 
667
 
668
  # Filter in only features that are present in selected_authors
669
  selected_authors = {'Mystery author', 'Candidate Author 1', 'Candidate Author 2', 'Candidate Author 3'}.intersection(set(author_ids))
670
+
671
+ # DEBUG: Print what we're actually working with
672
+ print(f"[DEBUG] author_ids parameter: {author_ids}")
673
+ print(f"[DEBUG] Hardcoded selected_authors set: {{'Mystery author', 'Candidate Author 1', 'Candidate Author 2', 'Candidate Author 3'}}")
674
+ print(f"[DEBUG] Intersection result: {selected_authors}")
675
+ print(f"[DEBUG] Is selected_authors empty? {len(selected_authors) == 0}")
676
+
677
  # Filter in only features that are present in selected_authors
678
  selected_authors_g2v_data = background_corpus_df[background_corpus_df['authorID'].isin(selected_authors)][features_clm_name].tolist()
679
 
680
+ print(f"[DEBUG] selected_authors_g2v_data length: {len(selected_authors_g2v_data)}")
681
+ print(f"[DEBUG] selected_authors_g2v_data content: {selected_authors_g2v_data}")
682
+
683
+ # Get the actual text documents for the selected authors to verify feature presence
684
+ selected_authors_docs = background_corpus_df[background_corpus_df['authorID'].isin(selected_authors)]['fullText'].tolist()
685
+ print(f"[DEBUG] Found {len(selected_authors_docs)} documents for selected authors")
686
+
687
+ # Import find_feature_spans for text-based feature verification
688
+ try:
689
+ from gram2vec.feature_locator import find_feature_spans
690
+ print("[DEBUG] Successfully imported find_feature_spans")
691
+ except ImportError:
692
+ print("[WARNING] Could not import find_feature_spans, falling back to vector-based filtering")
693
+ find_feature_spans = None
694
+
695
  filtered_features = []
696
  for feature, score, z_score in top_g2v_feats:
697
+ # DEBUG: Print what we're checking for this feature
698
+ print(f"[DEBUG] Checking feature: {feature}")
699
+ print(f"[DEBUG] Feature score: {score}, z_score: {z_score}")
700
+
701
  # Check if the feature has a non-zero value in all of the selected authors
702
+ feature_presence = []
703
+ for i, author_g2v_feats in enumerate(selected_authors_g2v_data):
704
+ feature_value = author_g2v_feats.get(feature, 0)
705
+ feature_presence.append(feature_value)
706
+ print(f"[DEBUG] Author {i} has feature '{feature}' = {feature_value}")
707
+
708
+ print(f"[DEBUG] All feature values: {feature_presence}")
709
+ print(f"[DEBUG] All values > 0? {[v > 0 for v in feature_presence]}")
710
+ print(f"[DEBUG] All values > 0? {all(v > 0 for v in feature_presence)}")
711
+
712
+ # First check: feature must be present in Gram2Vec vectors
713
+ vector_present = all(author_g2v_feats.get(feature, 0) > 0 for author_g2v_feats in selected_authors_g2v_data)
714
+
715
+ # Second check: feature must be present in actual text documents
716
+ text_present = True
717
+ if find_feature_spans and selected_authors_docs:
718
+ try:
719
+ # Check if feature appears in at least one document from each selected author
720
+ for i, doc in enumerate(selected_authors_docs):
721
+ if isinstance(doc, list):
722
+ doc_text = '\n\n'.join(doc)
723
+ else:
724
+ doc_text = str(doc)
725
+
726
+ spans = find_feature_spans(doc_text, feature)
727
+ if not spans: # No spans found in this document
728
+ print(f"[DEBUG] ✗ Feature '{feature}' not found in document {i} of selected author")
729
+ text_present = False
730
+ break
731
+ else:
732
+ print(f"[DEBUG] ✓ Feature '{feature}' found in document {i} with {len(spans)} spans")
733
+ except Exception as e:
734
+ print(f"[WARNING] Error checking text presence for feature '{feature}': {e}")
735
+ # Fall back to vector-based filtering if text checking fails
736
+ text_present = vector_present
737
+
738
+ # Feature must pass BOTH checks
739
+ if vector_present and text_present:
740
+ filtered_features.append((feature, score, z_score))
741
+ print(f"[DEBUG] ✓ Feature '{feature}' PASSED both vector and text checks")
742
+ else:
743
+ if not vector_present:
744
+ print(f"[DEBUG] ✗ Feature '{feature}' FAILED vector check")
745
+ if not text_present:
746
+ print(f"[DEBUG] ✗ Feature '{feature}' FAILED text check")
747
+ print(f"[DEBUG] ✗ Feature '{feature}' FAILED the filter")
748
 
749
 
750
  print('Filtered G2V features: ', [(f[0], f[2]) for f in filtered_features]) # Print feature names and z-scores