Ahmedik95316 committed
Commit dff1572 · 1 Parent(s): ed2e413

Update features/feature_engineer.py

Files changed (1)
  1. features/feature_engineer.py +254 -322
features/feature_engineer.py CHANGED
@@ -1,4 +1,5 @@
-# Fixed features/feature_engineer.py addressing the IndexError and regex issues
+# File: features/feature_engineer.py
+# Enhanced Feature Engineering Pipeline for Priority 6
 
 import json
 import joblib
@@ -19,16 +20,11 @@ from sklearn.preprocessing import StandardScaler, FunctionTransformer
 import warnings
 warnings.filterwarnings('ignore')
 
-# Import feature analyzers with error handling
-try:
-    from features.sentiment_analyzer import SentimentAnalyzer
-    from features.readability_analyzer import ReadabilityAnalyzer
-    from features.entity_analyzer import EntityAnalyzer
-    from features.linguistic_analyzer import LinguisticAnalyzer
-    FEATURE_ANALYZERS_AVAILABLE = True
-except ImportError:
-    FEATURE_ANALYZERS_AVAILABLE = False
-    logging.warning("Advanced feature analyzers not available - using basic features only")
+# Import feature analyzers
+from features.sentiment_analyzer import SentimentAnalyzer
+from features.readability_analyzer import ReadabilityAnalyzer
+from features.entity_analyzer import EntityAnalyzer
+from features.linguistic_analyzer import LinguisticAnalyzer
 
 # Configure logging
 logging.basicConfig(level=logging.INFO)
@@ -37,7 +33,8 @@ logger = logging.getLogger(__name__)
 
 class AdvancedFeatureEngineer(BaseEstimator, TransformerMixin):
     """
-    Fixed advanced feature engineering pipeline with proper error handling
+    Advanced feature engineering pipeline combining multiple NLP feature extractors
+    for enhanced fake news detection performance.
     """
 
     def __init__(self,
@@ -52,39 +49,33 @@ class AdvancedFeatureEngineer(BaseEstimator, TransformerMixin):
                  max_df: float = 0.95):
         """
         Initialize the advanced feature engineering pipeline.
+
+        Args:
+            enable_sentiment: Enable sentiment analysis features
+            enable_readability: Enable readability/complexity features
+            enable_entities: Enable named entity recognition features
+            enable_linguistic: Enable advanced linguistic features
+            feature_selection_k: Number of features to select
+            tfidf_max_features: Maximum TF-IDF features
+            ngram_range: N-gram range for TF-IDF
+            min_df: Minimum document frequency for TF-IDF
+            max_df: Maximum document frequency for TF-IDF
         """
-        self.enable_sentiment = enable_sentiment and FEATURE_ANALYZERS_AVAILABLE
-        self.enable_readability = enable_readability and FEATURE_ANALYZERS_AVAILABLE
-        self.enable_entities = enable_entities and FEATURE_ANALYZERS_AVAILABLE
-        self.enable_linguistic = enable_linguistic and FEATURE_ANALYZERS_AVAILABLE
+        self.enable_sentiment = enable_sentiment
+        self.enable_readability = enable_readability
+        self.enable_entities = enable_entities
+        self.enable_linguistic = enable_linguistic
         self.feature_selection_k = feature_selection_k
         self.tfidf_max_features = tfidf_max_features
         self.ngram_range = ngram_range
         self.min_df = min_df
         self.max_df = max_df
 
-        # Initialize feature extractors only if available
-        self.sentiment_analyzer = None
-        self.readability_analyzer = None
-        self.entity_analyzer = None
-        self.linguistic_analyzer = None
-
-        if FEATURE_ANALYZERS_AVAILABLE:
-            try:
-                if self.enable_sentiment:
-                    self.sentiment_analyzer = SentimentAnalyzer()
-                if self.enable_readability:
-                    self.readability_analyzer = ReadabilityAnalyzer()
-                if self.enable_entities:
-                    self.entity_analyzer = EntityAnalyzer()
-                if self.enable_linguistic:
-                    self.linguistic_analyzer = LinguisticAnalyzer()
-            except Exception as e:
-                logger.warning(f"Failed to initialize feature analyzers: {e}")
-                self.sentiment_analyzer = None
-                self.readability_analyzer = None
-                self.entity_analyzer = None
-                self.linguistic_analyzer = None
+        # Initialize feature extractors
+        self.sentiment_analyzer = SentimentAnalyzer() if enable_sentiment else None
+        self.readability_analyzer = ReadabilityAnalyzer() if enable_readability else None
+        self.entity_analyzer = EntityAnalyzer() if enable_entities else None
+        self.linguistic_analyzer = LinguisticAnalyzer() if enable_linguistic else None
 
         # Initialize TF-IDF components
         self.tfidf_vectorizer = None
@@ -98,7 +89,11 @@ class AdvancedFeatureEngineer(BaseEstimator, TransformerMixin):
 
     def fit(self, X, y=None):
         """
-        Fit the feature engineering pipeline with proper error handling.
+        Fit the feature engineering pipeline.
+
+        Args:
+            X: Text data (array-like of strings)
+            y: Target labels (optional, for supervised feature selection)
         """
         logger.info("Fitting advanced feature engineering pipeline...")
 
@@ -112,61 +107,41 @@ class AdvancedFeatureEngineer(BaseEstimator, TransformerMixin):
         if len(X) == 0:
             raise ValueError("Cannot fit on empty data")
 
-        # Initialize TF-IDF vectorizer with safer parameters
-        actual_max_features = min(self.tfidf_max_features, len(X) * 10)
-
+        # Initialize TF-IDF vectorizer
         self.tfidf_vectorizer = TfidfVectorizer(
-            max_features=actual_max_features,
+            max_features=self.tfidf_max_features,
             ngram_range=self.ngram_range,
-            min_df=max(1, min(self.min_df, len(X) // 10)),
+            min_df=self.min_df,
             max_df=self.max_df,
             stop_words='english',
             sublinear_tf=True,
             norm='l2',
-            lowercase=True,
-            token_pattern=r'\b[a-zA-Z][a-zA-Z]+\b'  # Fix regex pattern
+            lowercase=True
         )
 
         # Fit TF-IDF on text data
         logger.info("Fitting TF-IDF vectorizer...")
-        try:
-            tfidf_features = self.tfidf_vectorizer.fit_transform(X)
-            logger.info(f"TF-IDF features shape: {tfidf_features.shape}")
-        except Exception as e:
-            logger.error(f"TF-IDF fitting failed: {e}")
-            # Fallback to very basic TF-IDF
-            self.tfidf_vectorizer = TfidfVectorizer(
-                max_features=min(1000, len(X) * 5),
-                stop_words='english',
-                lowercase=True
-            )
-            tfidf_features = self.tfidf_vectorizer.fit_transform(X)
-            logger.info(f"Fallback TF-IDF features shape: {tfidf_features.shape}")
+        tfidf_features = self.tfidf_vectorizer.fit_transform(X)
 
-        # Extract additional features with error handling
+        # Extract additional features
         additional_features = self._extract_additional_features(X, fit=True)
 
         # Combine all features
         if additional_features.shape[1] > 0:
-            try:
-                all_features = hstack([tfidf_features, additional_features])
-            except Exception as e:
-                logger.warning(f"Failed to combine features, using TF-IDF only: {e}")
-                all_features = tfidf_features
-                additional_features = np.empty((len(X), 0))
+            all_features = hstack([tfidf_features, additional_features])
         else:
             all_features = tfidf_features
 
         logger.info(f"Total features before selection: {all_features.shape[1]}")
 
-        # Feature selection with proper bounds checking
+        # Feature selection
         if y is not None and self.feature_selection_k < all_features.shape[1]:
-            actual_k = min(self.feature_selection_k, all_features.shape[1] - 1)
-            logger.info(f"Performing feature selection (k={actual_k})...")
+            logger.info(f"Performing feature selection (k={self.feature_selection_k})...")
 
+            # Use chi2 for text features and mutual information for numerical features
             self.feature_selector = SelectKBest(
                 score_func=chi2,
-                k=actual_k
+                k=min(self.feature_selection_k, all_features.shape[1])
             )
 
             # Ensure non-negative features for chi2
@@ -178,30 +153,22 @@ class AdvancedFeatureEngineer(BaseEstimator, TransformerMixin):
             # Make features non-negative for chi2
             features_dense = np.maximum(features_dense, 0)
 
-            try:
-                self.feature_selector.fit(features_dense, y)
-                selected_features = self.feature_selector.transform(features_dense)
-                logger.info(f"Selected {selected_features.shape[1]} features")
-            except Exception as e:
-                logger.warning(f"Feature selection failed: {e}, using all features")
-                self.feature_selector = None
-                selected_features = all_features
+            self.feature_selector.fit(features_dense, y)
+            selected_features = self.feature_selector.transform(features_dense)
+
+            logger.info(f"Selected {selected_features.shape[1]} features")
         else:
             selected_features = all_features
 
         # Scale numerical features (additional features only)
         if additional_features.shape[1] > 0:
             self.feature_scaler = StandardScaler()
-            try:
-                # Only scale the additional features part
-                additional_selected = selected_features[:, -additional_features.shape[1]:]
-                self.feature_scaler.fit(additional_selected)
-            except Exception as e:
-                logger.warning(f"Feature scaling failed: {e}")
-                self.feature_scaler = None
+            # Only scale the additional features part
+            additional_selected = selected_features[:, -additional_features.shape[1]:]
+            self.feature_scaler.fit(additional_selected)
 
-        # Generate feature names with proper bounds checking
-        self._generate_feature_names_safe()
+        # Generate feature names
+        self._generate_feature_names()
 
         # Calculate feature importance if possible
         if y is not None and self.feature_selector is not None:
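
For reference, scikit-learn's chi2 scorer raises a ValueError on negative inputs, which is why the fit path above densifies the combined matrix and clamps it with np.maximum(features_dense, 0) before calling SelectKBest. A minimal sketch of that behavior (toy data, not from this repository; note that the clamp discards the sign of genuinely signed features such as sentiment polarity):

    import numpy as np
    from sklearn.feature_selection import SelectKBest, chi2

    X = np.array([[0.5, -0.2, 3.0],
                  [1.0,  0.4, 0.0],
                  [0.0, -1.5, 2.0]])
    y = np.array([0, 1, 0])

    try:
        SelectKBest(chi2, k=2).fit(X, y)  # fails: chi2 requires non-negative X
    except ValueError as err:
        print(err)

    X_clamped = np.maximum(X, 0)          # the same clamp the pipeline applies
    print(SelectKBest(chi2, k=2).fit(X_clamped, y).get_support())
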
@@ -215,6 +182,12 @@ class AdvancedFeatureEngineer(BaseEstimator, TransformerMixin):
     def transform(self, X):
         """
         Transform text data into enhanced feature vectors.
+
+        Args:
+            X: Text data (array-like of strings)
+
+        Returns:
+            Transformed feature matrix
         """
         if not self.is_fitted_:
             raise ValueError("Pipeline must be fitted before transforming")
@@ -226,62 +199,42 @@ class AdvancedFeatureEngineer(BaseEstimator, TransformerMixin):
         X = np.array(X)
 
         # Extract TF-IDF features
-        try:
-            tfidf_features = self.tfidf_vectorizer.transform(X)
-        except Exception as e:
-            logger.error(f"TF-IDF transform failed: {e}")
-            # Return minimal features if transform fails
-            return np.zeros((len(X), len(self.feature_names_) if self.feature_names_ else 100))
+        tfidf_features = self.tfidf_vectorizer.transform(X)
 
         # Extract additional features
         additional_features = self._extract_additional_features(X, fit=False)
 
         # Combine features
         if additional_features.shape[1] > 0:
-            try:
-                all_features = hstack([tfidf_features, additional_features])
-            except Exception as e:
-                logger.warning(f"Failed to combine features in transform: {e}")
-                all_features = tfidf_features
+            all_features = hstack([tfidf_features, additional_features])
         else:
             all_features = tfidf_features
 
         # Apply feature selection
         if self.feature_selector is not None:
-            try:
-                if hasattr(all_features, 'toarray'):
-                    features_dense = all_features.toarray()
-                else:
-                    features_dense = all_features
-
-                # Ensure non-negative for consistency
-                features_dense = np.maximum(features_dense, 0)
-                selected_features = self.feature_selector.transform(features_dense)
-            except Exception as e:
-                logger.warning(f"Feature selection failed in transform: {e}")
-                selected_features = all_features
+            if hasattr(all_features, 'toarray'):
+                features_dense = all_features.toarray()
+            else:
+                features_dense = all_features
+
+            # Ensure non-negative for consistency
+            features_dense = np.maximum(features_dense, 0)
+            selected_features = self.feature_selector.transform(features_dense)
         else:
             selected_features = all_features
 
         # Scale additional features if scaler exists
         if self.feature_scaler is not None and additional_features.shape[1] > 0:
-            try:
-                # Scale only the additional features part
-                tfidf_selected = selected_features[:, :-additional_features.shape[1]]
-                additional_selected = selected_features[:, -additional_features.shape[1]:]
-                additional_scaled = self.feature_scaler.transform(additional_selected)
-
-                # Combine back
-                if hasattr(tfidf_selected, 'toarray'):
-                    tfidf_selected = tfidf_selected.toarray()
-
-                final_features = np.hstack([tfidf_selected, additional_scaled])
-            except Exception as e:
-                logger.warning(f"Feature scaling failed in transform: {e}")
-                if hasattr(selected_features, 'toarray'):
-                    final_features = selected_features.toarray()
-                else:
-                    final_features = selected_features
+            # Scale only the additional features part
+            tfidf_selected = selected_features[:, :-additional_features.shape[1]]
+            additional_selected = selected_features[:, -additional_features.shape[1]:]
+            additional_scaled = self.feature_scaler.transform(additional_selected)
+
+            # Combine back
+            if hasattr(tfidf_selected, 'toarray'):
+                tfidf_selected = tfidf_selected.toarray()
+
+            final_features = np.hstack([tfidf_selected, additional_scaled])
         else:
             if hasattr(selected_features, 'toarray'):
                 final_features = selected_features.toarray()
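
Both fit() and transform() depend on a fixed column layout after hstack: TF-IDF columns first, analyzer columns last, so the scaler touches only a trailing slice of width additional_features.shape[1]. A toy illustration of that layout, with made-up shapes (it also shows that scipy's sparse hstack accepts a dense block and returns a sparse result):

    import numpy as np
    from scipy.sparse import csr_matrix, hstack

    tfidf = csr_matrix(np.random.rand(4, 6))     # sparse TF-IDF block, 6 columns
    extra = np.random.rand(4, 2)                 # dense analyzer block, 2 columns

    combined = hstack([tfidf, extra]).toarray()  # shape (4, 8), analyzer columns last

    n_extra = extra.shape[1]
    tfidf_part = combined[:, :-n_extra]          # leading TF-IDF columns
    extra_part = combined[:, -n_extra:]          # trailing slice that gets scaled

Note that slicing by the original analyzer width assumes feature selection kept every analyzer column; SelectKBest preserves column order but not column count, so this is an invariant the surrounding code trusts rather than checks.
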
@@ -291,64 +244,45 @@ class AdvancedFeatureEngineer(BaseEstimator, TransformerMixin):
         return final_features
 
     def _extract_additional_features(self, X, fit=False):
-        """Extract additional features with comprehensive error handling"""
+        """Extract additional features beyond TF-IDF"""
        feature_arrays = []
 
         try:
-            # Basic text features (always available)
-            basic_features = self._extract_basic_features(X)
-            if basic_features.shape[1] > 0:
-                feature_arrays.append(basic_features)
+            # Sentiment features
+            if self.sentiment_analyzer is not None:
+                logger.info("Extracting sentiment features...")
+                if fit:
+                    sentiment_features = self.sentiment_analyzer.fit_transform(X)
+                else:
+                    sentiment_features = self.sentiment_analyzer.transform(X)
+                feature_arrays.append(sentiment_features)
 
-            # Advanced features (only if analyzers available)
-            if FEATURE_ANALYZERS_AVAILABLE:
-                # Sentiment features
-                if self.sentiment_analyzer is not None:
-                    logger.info("Extracting sentiment features...")
-                    try:
-                        if fit:
-                            sentiment_features = self.sentiment_analyzer.fit_transform(X)
-                        else:
-                            sentiment_features = self.sentiment_analyzer.transform(X)
-                        feature_arrays.append(sentiment_features)
-                    except Exception as e:
-                        logger.warning(f"Sentiment analysis failed: {e}")
-
-                # Readability features
-                if self.readability_analyzer is not None:
-                    logger.info("Extracting readability features...")
-                    try:
-                        if fit:
-                            readability_features = self.readability_analyzer.fit_transform(X)
-                        else:
-                            readability_features = self.readability_analyzer.transform(X)
-                        feature_arrays.append(readability_features)
-                    except Exception as e:
-                        logger.warning(f"Readability analysis failed: {e}")
-
-                # Entity features
-                if self.entity_analyzer is not None:
-                    logger.info("Extracting entity features...")
-                    try:
-                        if fit:
-                            entity_features = self.entity_analyzer.fit_transform(X)
-                        else:
-                            entity_features = self.entity_analyzer.transform(X)
-                        feature_arrays.append(entity_features)
-                    except Exception as e:
-                        logger.warning(f"Entity analysis failed: {e}")
-
-                # Linguistic features
-                if self.linguistic_analyzer is not None:
-                    logger.info("Extracting linguistic features...")
-                    try:
-                        if fit:
-                            linguistic_features = self.linguistic_analyzer.fit_transform(X)
-                        else:
-                            linguistic_features = self.linguistic_analyzer.transform(X)
-                        feature_arrays.append(linguistic_features)
-                    except Exception as e:
-                        logger.warning(f"Linguistic analysis failed: {e}")
+            # Readability features
+            if self.readability_analyzer is not None:
+                logger.info("Extracting readability features...")
+                if fit:
+                    readability_features = self.readability_analyzer.fit_transform(X)
+                else:
+                    readability_features = self.readability_analyzer.transform(X)
+                feature_arrays.append(readability_features)
+
+            # Entity features
+            if self.entity_analyzer is not None:
+                logger.info("Extracting entity features...")
+                if fit:
+                    entity_features = self.entity_analyzer.fit_transform(X)
+                else:
+                    entity_features = self.entity_analyzer.transform(X)
+                feature_arrays.append(entity_features)
+
+            # Linguistic features
+            if self.linguistic_analyzer is not None:
+                logger.info("Extracting linguistic features...")
+                if fit:
+                    linguistic_features = self.linguistic_analyzer.fit_transform(X)
+                else:
+                    linguistic_features = self.linguistic_analyzer.transform(X)
+                feature_arrays.append(linguistic_features)
 
             # Combine all additional features
             if feature_arrays:
@@ -363,130 +297,51 @@ class AdvancedFeatureEngineer(BaseEstimator, TransformerMixin):
 
         return additional_features
 
-    def _extract_basic_features(self, X):
-        """Extract basic text features that don't require external libraries"""
-        features = []
-
-        for text in X:
-            text_str = str(text)
-
-            # Basic text statistics
-            word_count = len(text_str.split())
-            char_count = len(text_str)
-            sentence_count = text_str.count('.') + text_str.count('!') + text_str.count('?')
-            sentence_count = max(1, sentence_count)  # Avoid division by zero
-
-            # Basic ratios
-            avg_word_length = char_count / max(word_count, 1)
-            avg_sentence_length = word_count / sentence_count
-
-            # Punctuation features
-            exclamation_count = text_str.count('!')
-            question_count = text_str.count('?')
-            uppercase_ratio = sum(1 for c in text_str if c.isupper()) / max(len(text_str), 1)
-
-            # Feature vector
-            feature_vector = [
-                word_count,
-                char_count,
-                sentence_count,
-                avg_word_length,
-                avg_sentence_length,
-                exclamation_count,
-                question_count,
-                uppercase_ratio
-            ]
-
-            features.append(feature_vector)
-
-        return np.array(features)
-
-    def _generate_feature_names_safe(self):
-        """Generate feature names with proper bounds checking"""
-        self.feature_names_ = []
-
-        try:
-            # TF-IDF feature names
-            if self.tfidf_vectorizer is not None:
-                tfidf_names = [f"tfidf_{name}" for name in self.tfidf_vectorizer.get_feature_names_out()]
-                self.feature_names_.extend(tfidf_names)
-
-            # Basic feature names
-            basic_feature_names = [
-                'word_count', 'char_count', 'sentence_count',
-                'avg_word_length', 'avg_sentence_length',
-                'exclamation_count', 'question_count', 'uppercase_ratio'
-            ]
-            self.feature_names_.extend([f'basic_{name}' for name in basic_feature_names])
-
-            # Advanced feature names (only if available)
-            if FEATURE_ANALYZERS_AVAILABLE:
-                if self.sentiment_analyzer is not None:
-                    try:
-                        self.feature_names_.extend(self.sentiment_analyzer.get_feature_names())
-                    except:
-                        self.feature_names_.extend(['sentiment_compound', 'sentiment_pos', 'sentiment_neg', 'sentiment_neu'])
-
-                if self.readability_analyzer is not None:
-                    try:
-                        self.feature_names_.extend(self.readability_analyzer.get_feature_names())
-                    except:
-                        self.feature_names_.extend(['readability_score', 'reading_ease'])
-
-                if self.entity_analyzer is not None:
-                    try:
-                        self.feature_names_.extend(self.entity_analyzer.get_feature_names())
-                    except:
-                        self.feature_names_.extend(['entity_person', 'entity_org', 'entity_loc'])
-
-                if self.linguistic_analyzer is not None:
-                    try:
-                        self.feature_names_.extend(self.linguistic_analyzer.get_feature_names())
-                    except:
-                        self.feature_names_.extend(['linguistic_complexity', 'pos_diversity'])
-
-            # Apply feature selection to names if applicable
-            if self.feature_selector is not None:
-                try:
-                    selected_indices = self.feature_selector.get_support()
-                    # FIX: Ensure bounds checking
-                    if len(selected_indices) == len(self.feature_names_):
-                        self.feature_names_ = [name for i, name in enumerate(self.feature_names_) if selected_indices[i]]
-                    else:
-                        logger.warning(f"Feature selection indices mismatch: {len(selected_indices)} vs {len(self.feature_names_)}")
-                        # Keep original names if mismatch
-                except Exception as e:
-                    logger.warning(f"Failed to apply feature selection to names: {e}")
-
-        except Exception as e:
-            logger.warning(f"Failed to generate feature names: {e}")
-            # Generate generic names
-            self.feature_names_ = [f'feature_{i}' for i in range(100)]  # Default fallback
-
-    def _calculate_feature_importance(self):
-        """Calculate feature importance scores with error handling"""
-        try:
-            if self.feature_selector is not None and hasattr(self.feature_selector, 'scores_'):
-                scores = self.feature_selector.scores_
-                selected_indices = self.feature_selector.get_support()
-
-                # Get scores for selected features
-                selected_scores = scores[selected_indices]
-
-                # Create importance dictionary with bounds checking
-                if len(selected_scores) == len(self.feature_names_):
-                    self.feature_importance_ = {
-                        name: float(score) for name, score in zip(self.feature_names_, selected_scores)
-                    }
-
-                    # Sort by importance
-                    self.feature_importance_ = dict(
-                        sorted(self.feature_importance_.items(), key=lambda x: x[1], reverse=True)
-                    )
-                else:
-                    logger.warning("Feature importance calculation failed due to size mismatch")
-        except Exception as e:
-            logger.warning(f"Feature importance calculation failed: {e}")
+    def _generate_feature_names(self):
+        """Generate comprehensive feature names"""
+        self.feature_names_ = []
+
+        # TF-IDF feature names
+        if self.tfidf_vectorizer is not None:
+            tfidf_names = [f"tfidf_{name}" for name in self.tfidf_vectorizer.get_feature_names_out()]
+            self.feature_names_.extend(tfidf_names)
+
+        # Additional feature names
+        if self.sentiment_analyzer is not None:
+            self.feature_names_.extend(self.sentiment_analyzer.get_feature_names())
+
+        if self.readability_analyzer is not None:
+            self.feature_names_.extend(self.readability_analyzer.get_feature_names())
+
+        if self.entity_analyzer is not None:
+            self.feature_names_.extend(self.entity_analyzer.get_feature_names())
+
+        if self.linguistic_analyzer is not None:
+            self.feature_names_.extend(self.linguistic_analyzer.get_feature_names())
+
+        # Apply feature selection to names if applicable
+        if self.feature_selector is not None:
+            selected_indices = self.feature_selector.get_support()
+            self.feature_names_ = [name for i, name in enumerate(self.feature_names_) if selected_indices[i]]
+
+    def _calculate_feature_importance(self):
+        """Calculate feature importance scores"""
+        if self.feature_selector is not None:
+            scores = self.feature_selector.scores_
+            selected_indices = self.feature_selector.get_support()
+
+            # Get scores for selected features
+            selected_scores = scores[selected_indices]
+
+            # Create importance dictionary
+            self.feature_importance_ = {
+                name: float(score) for name, score in zip(self.feature_names_, selected_scores)
+            }
+
+            # Sort by importance
+            self.feature_importance_ = dict(
+                sorted(self.feature_importance_.items(), key=lambda x: x[1], reverse=True)
+            )
 
     def get_feature_names(self):
         """Get names of output features"""
@@ -509,19 +364,15 @@ class AdvancedFeatureEngineer(BaseEstimator, TransformerMixin):
         if not self.is_fitted_:
             raise ValueError("Pipeline must be fitted first")
 
-        # Count feature types safely
-        feature_type_counts = {
-            'tfidf_features': sum(1 for name in self.feature_names_ if name.startswith('tfidf_')),
-            'basic_features': sum(1 for name in self.feature_names_ if name.startswith('basic_')),
-            'sentiment_features': sum(1 for name in self.feature_names_ if 'sentiment' in name),
-            'readability_features': sum(1 for name in self.feature_names_ if 'readability' in name),
-            'entity_features': sum(1 for name in self.feature_names_ if 'entity' in name),
-            'linguistic_features': sum(1 for name in self.feature_names_ if 'linguistic' in name)
-        }
-
         metadata = {
             'total_features': len(self.feature_names_),
-            'feature_types': feature_type_counts,
+            'feature_types': {
+                'tfidf_features': sum(1 for name in self.feature_names_ if name.startswith('tfidf_')),
+                'sentiment_features': sum(1 for name in self.feature_names_ if name.startswith('sentiment_')),
+                'readability_features': sum(1 for name in self.feature_names_ if name.startswith('readability_')),
+                'entity_features': sum(1 for name in self.feature_names_ if name.startswith('entity_')),
+                'linguistic_features': sum(1 for name in self.feature_names_ if name.startswith('linguistic_'))
+            },
             'configuration': {
                 'enable_sentiment': self.enable_sentiment,
                 'enable_readability': self.enable_readability,
@@ -529,17 +380,39 @@ class AdvancedFeatureEngineer(BaseEstimator, TransformerMixin):
                 'enable_linguistic': self.enable_linguistic,
                 'feature_selection_k': self.feature_selection_k,
                 'tfidf_max_features': self.tfidf_max_features,
-                'ngram_range': self.ngram_range,
-                'analyzers_available': FEATURE_ANALYZERS_AVAILABLE
+                'ngram_range': self.ngram_range
             },
             'feature_importance_available': bool(self.feature_importance_),
             'timestamp': datetime.now().isoformat()
         }
 
         return metadata
+
+    def save_pipeline(self, filepath):
+        """Save the fitted pipeline"""
+        if not self.is_fitted_:
+            raise ValueError("Pipeline must be fitted before saving")
+
+        save_data = {
+            'feature_engineer': self,
+            'metadata': self.get_feature_metadata(),
+            'feature_names': self.feature_names_,
+            'feature_importance': self.feature_importance_
+        }
+
+        joblib.dump(save_data, filepath)
+        logger.info(f"Feature engineering pipeline saved to {filepath}")
+
+    @classmethod
+    def load_pipeline(cls, filepath):
+        """Load a fitted pipeline"""
+        save_data = joblib.load(filepath)
+        feature_engineer = save_data['feature_engineer']
+
+        logger.info(f"Feature engineering pipeline loaded from {filepath}")
+        return feature_engineer
 
 
-# Convenience functions remain the same...
 def create_enhanced_pipeline(X_train, y_train,
                              enable_sentiment=True,
                              enable_readability=True,
@@ -548,18 +421,28 @@ def create_enhanced_pipeline(X_train, y_train,
                              feature_selection_k=5000):
     """
     Create and fit an enhanced feature engineering pipeline.
+
+    Args:
+        X_train: Training text data
+        y_train: Training labels
+        enable_sentiment: Enable sentiment analysis features
+        enable_readability: Enable readability features
+        enable_entities: Enable entity features
+        enable_linguistic: Enable linguistic features
+        feature_selection_k: Number of features to select
+
+    Returns:
+        Fitted AdvancedFeatureEngineer instance
     """
     logger.info("Creating enhanced feature engineering pipeline...")
 
-    # Create feature engineer with reduced complexity for stability
+    # Create feature engineer
     feature_engineer = AdvancedFeatureEngineer(
-        enable_sentiment=enable_sentiment and FEATURE_ANALYZERS_AVAILABLE,
-        enable_readability=enable_readability and FEATURE_ANALYZERS_AVAILABLE,
-        enable_entities=enable_entities and FEATURE_ANALYZERS_AVAILABLE,
-        enable_linguistic=enable_linguistic and FEATURE_ANALYZERS_AVAILABLE,
-        feature_selection_k=min(feature_selection_k, len(X_train) * 2),  # Safer default
-        tfidf_max_features=min(10000, len(X_train) * 5),  # Safer default
-        ngram_range=(1, 2)  # Reduced complexity
+        enable_sentiment=enable_sentiment,
+        enable_readability=enable_readability,
+        enable_entities=enable_entities,
+        enable_linguistic=enable_linguistic,
+        feature_selection_k=feature_selection_k
     )
 
     # Fit the pipeline
@@ -570,4 +453,53 @@ def create_enhanced_pipeline(X_train, y_train,
     logger.info(f"Enhanced pipeline created with {metadata['total_features']} features")
     logger.info(f"Feature breakdown: {metadata['feature_types']}")
 
-    return feature_engineer
+    return feature_engineer
+
+
+def analyze_feature_importance(feature_engineer, top_k=20):
+    """
+    Analyze and display feature importance.
+
+    Args:
+        feature_engineer: Fitted AdvancedFeatureEngineer instance
+        top_k: Number of top features to analyze
+
+    Returns:
+        Dictionary with feature analysis results
+    """
+    if not feature_engineer.is_fitted_:
+        raise ValueError("Feature engineer must be fitted first")
+
+    # Get feature importance
+    importance = feature_engineer.get_feature_importance(top_k=top_k)
+    metadata = feature_engineer.get_feature_metadata()
+
+    # Analyze feature types in top features
+    top_features = list(importance.keys())
+    feature_type_counts = {}
+
+    for feature in top_features:
+        if feature.startswith('tfidf_'):
+            feature_type = 'tfidf'
+        elif feature.startswith('sentiment_'):
+            feature_type = 'sentiment'
+        elif feature.startswith('readability_'):
+            feature_type = 'readability'
+        elif feature.startswith('entity_'):
+            feature_type = 'entity'
+        elif feature.startswith('linguistic_'):
+            feature_type = 'linguistic'
+        else:
+            feature_type = 'other'
+
+        feature_type_counts[feature_type] = feature_type_counts.get(feature_type, 0) + 1
+
+    analysis = {
+        'top_features': importance,
+        'feature_type_distribution': feature_type_counts,
+        'total_features': metadata['total_features'],
+        'feature_breakdown': metadata['feature_types'],
+        'analysis_timestamp': datetime.now().isoformat()
+    }
+
+    return analysis
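
Taken together, the post-commit module would be driven roughly as follows. This is a hypothetical usage sketch, not code from the repository: the toy strings and the feature_selection_k value are placeholders, the four analyzer modules must import cleanly now that the ImportError guard is gone, and get_feature_importance(top_k=...) is assumed to exist in an unchanged part of the file, since analyze_feature_importance calls it.

    from features.feature_engineer import (
        AdvancedFeatureEngineer,
        create_enhanced_pipeline,
        analyze_feature_importance,
    )

    texts = [
        "Miracle cure shocks doctors, share before it is deleted!!!",
        "Miracle cure claims spread rapidly on social media.",
        "The health ministry published its annual vaccination report.",
        "The ministry report details vaccination coverage by region.",
    ]
    labels = [1, 1, 0, 0]  # placeholder labels: 1 = fake, 0 = real

    # Fit TF-IDF plus the enabled analyzers, select features, fit the scaler
    engineer = create_enhanced_pipeline(texts, labels, feature_selection_k=20)

    # Transform unseen documents into the combined feature space
    X_new = engineer.transform(["Another unverified miracle cure story."])

    # Summarize which feature families dominate the selected set
    report = analyze_feature_importance(engineer, top_k=10)
    print(report["feature_type_distribution"])

    # Persist and restore the fitted pipeline via joblib
    engineer.save_pipeline("feature_pipeline.joblib")
    restored = AdvancedFeatureEngineer.load_pipeline("feature_pipeline.joblib")

On a corpus this small, TF-IDF pruning or the k < n_features check may simply skip selection; the flow is only meaningful on a real training set.
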