Ahmedik95316 committed
Commit cf24ede · 1 Parent(s): 2c4b5cc

Create features/feature_engineer.py


Adding Enhanced Feature Engineering Pipeline

Files changed (1)
  1. features/feature_engineer.py +505 -0
features/feature_engineer.py ADDED
# File: features/feature_engineer.py
# Enhanced Feature Engineering Pipeline for Priority 6

import joblib
import logging
import numpy as np
import pandas as pd
from datetime import datetime
from scipy.sparse import hstack
from typing import Tuple

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings('ignore')

# Import feature analyzers
from features.sentiment_analyzer import SentimentAnalyzer
from features.readability_analyzer import ReadabilityAnalyzer
from features.entity_analyzer import EntityAnalyzer
from features.linguistic_analyzer import LinguisticAnalyzer

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class AdvancedFeatureEngineer(BaseEstimator, TransformerMixin):
    """
    Advanced feature engineering pipeline combining multiple NLP feature extractors
    for enhanced fake news detection performance.
    """

    def __init__(self,
                 enable_sentiment: bool = True,
                 enable_readability: bool = True,
                 enable_entities: bool = True,
                 enable_linguistic: bool = True,
                 feature_selection_k: int = 5000,
                 tfidf_max_features: int = 10000,
                 ngram_range: Tuple[int, int] = (1, 3),
                 min_df: int = 2,
                 max_df: float = 0.95):
        """
        Initialize the advanced feature engineering pipeline.

        Args:
            enable_sentiment: Enable sentiment analysis features
            enable_readability: Enable readability/complexity features
            enable_entities: Enable named entity recognition features
            enable_linguistic: Enable advanced linguistic features
            feature_selection_k: Number of features to select
            tfidf_max_features: Maximum number of TF-IDF features
            ngram_range: N-gram range for TF-IDF
            min_df: Minimum document frequency for TF-IDF
            max_df: Maximum document frequency for TF-IDF
        """
        self.enable_sentiment = enable_sentiment
        self.enable_readability = enable_readability
        self.enable_entities = enable_entities
        self.enable_linguistic = enable_linguistic
        self.feature_selection_k = feature_selection_k
        self.tfidf_max_features = tfidf_max_features
        self.ngram_range = ngram_range
        self.min_df = min_df
        self.max_df = max_df

        # Initialize feature extractors
        self.sentiment_analyzer = SentimentAnalyzer() if enable_sentiment else None
        self.readability_analyzer = ReadabilityAnalyzer() if enable_readability else None
        self.entity_analyzer = EntityAnalyzer() if enable_entities else None
        self.linguistic_analyzer = LinguisticAnalyzer() if enable_linguistic else None

        # Initialize TF-IDF components
        self.tfidf_vectorizer = None
        self.feature_selector = None
        self.feature_scaler = None

        # Feature metadata
        self.feature_names_ = []
        self.feature_importance_ = {}
        # Boolean mask over the columns that survive selection, marking which
        # of them came from the additional (non-TF-IDF) feature block.
        self.additional_cols_ = None
        self.is_fitted_ = False

    def fit(self, X, y=None):
        """
        Fit the feature engineering pipeline.

        Args:
            X: Text data (array-like of strings)
            y: Target labels (optional, for supervised feature selection)
        """
        logger.info("Fitting advanced feature engineering pipeline...")

        # Convert to array if needed
        if isinstance(X, pd.Series):
            X = X.values
        elif isinstance(X, list):
            X = np.array(X)

        # Validate input
        if len(X) == 0:
            raise ValueError("Cannot fit on empty data")

        # Initialize TF-IDF vectorizer
        self.tfidf_vectorizer = TfidfVectorizer(
            max_features=self.tfidf_max_features,
            ngram_range=self.ngram_range,
            min_df=self.min_df,
            max_df=self.max_df,
            stop_words='english',
            sublinear_tf=True,
            norm='l2',
            lowercase=True
        )

        # Fit TF-IDF on text data
        logger.info("Fitting TF-IDF vectorizer...")
        tfidf_features = self.tfidf_vectorizer.fit_transform(X)
        n_tfidf = tfidf_features.shape[1]

        # Extract additional features
        additional_features = self._extract_additional_features(X, fit=True)

        # Combine all features (CSR format so the result supports column slicing)
        if additional_features.shape[1] > 0:
            all_features = hstack([tfidf_features, additional_features], format='csr')
        else:
            all_features = tfidf_features.tocsr()

        n_total = all_features.shape[1]
        logger.info(f"Total features before selection: {n_total}")

        # chi2 requires non-negative inputs, so the combined matrix is densified
        # and clipped at zero (in transform too, for consistency). Note the clip
        # discards the sign of any negative-valued additional features, e.g.
        # polarity scores.
        features_dense = np.maximum(all_features.toarray(), 0)

        # Feature selection
        if y is not None and self.feature_selection_k < n_total:
            logger.info(f"Performing feature selection (k={self.feature_selection_k})...")
            self.feature_selector = SelectKBest(
                score_func=chi2,
                k=min(self.feature_selection_k, n_total)
            )
            self.feature_selector.fit(features_dense, y)
            selected_features = self.feature_selector.transform(features_dense)
            support = self.feature_selector.get_support()
            logger.info(f"Selected {selected_features.shape[1]} features")
        else:
            selected_features = features_dense
            support = np.ones(n_total, dtype=bool)

        # Record which surviving columns came from the additional block.
        # Selection may drop columns from either block, so the additional
        # features are not simply the trailing columns of the selected matrix.
        self.additional_cols_ = np.where(support)[0] >= n_tfidf

        # Scale only the surviving additional (numerical) features
        if self.additional_cols_.any():
            self.feature_scaler = StandardScaler()
            self.feature_scaler.fit(selected_features[:, self.additional_cols_])

        # Generate feature names
        self._generate_feature_names()

        # Calculate feature importance if possible
        if y is not None and self.feature_selector is not None:
            self._calculate_feature_importance()

        self.is_fitted_ = True
        logger.info("Feature engineering pipeline fitted successfully")

        return self

    def transform(self, X):
        """
        Transform text data into enhanced feature vectors.

        Args:
            X: Text data (array-like of strings)

        Returns:
            Transformed feature matrix (dense ndarray)
        """
        if not self.is_fitted_:
            raise ValueError("Pipeline must be fitted before transforming")

        # Convert to array if needed
        if isinstance(X, pd.Series):
            X = X.values
        elif isinstance(X, list):
            X = np.array(X)

        # Extract TF-IDF features
        tfidf_features = self.tfidf_vectorizer.transform(X)

        # Extract additional features
        additional_features = self._extract_additional_features(X, fit=False)

        # Combine features
        if additional_features.shape[1] > 0:
            all_features = hstack([tfidf_features, additional_features], format='csr')
        else:
            all_features = tfidf_features.tocsr()

        # Densify and clip at zero for consistency with fit (chi2 non-negativity)
        features_dense = np.maximum(all_features.toarray(), 0)

        # Apply feature selection
        if self.feature_selector is not None:
            features_dense = self.feature_selector.transform(features_dense)

        # Scale the additional-feature columns with the scaler fitted in fit();
        # self.additional_cols_ marks their positions after selection.
        if (self.feature_scaler is not None
                and self.additional_cols_ is not None
                and self.additional_cols_.any()):
            features_dense[:, self.additional_cols_] = self.feature_scaler.transform(
                features_dense[:, self.additional_cols_]
            )

        return features_dense

    def _extract_additional_features(self, X, fit=False):
        """Extract additional features beyond TF-IDF"""
        feature_arrays = []

        analyzers = [
            ("sentiment", self.sentiment_analyzer),
            ("readability", self.readability_analyzer),
            ("entity", self.entity_analyzer),
            ("linguistic", self.linguistic_analyzer),
        ]

        try:
            for name, analyzer in analyzers:
                if analyzer is None:
                    continue
                logger.info(f"Extracting {name} features...")
                if fit:
                    features = analyzer.fit_transform(X)
                else:
                    features = analyzer.transform(X)
                feature_arrays.append(features)

            # Combine all additional features
            if feature_arrays:
                additional_features = np.hstack(feature_arrays)
                logger.info(f"Extracted {additional_features.shape[1]} additional features")
            else:
                additional_features = np.empty((len(X), 0))

        except Exception as e:
            # Fall back to "no additional features" rather than failing the
            # pipeline. Caveat: if this happens in transform() but not in fit(),
            # the column count no longer matches the fitted selector and
            # transform() will raise a shape error downstream.
            logger.warning(f"Error extracting additional features: {e}")
            additional_features = np.empty((len(X), 0))

        return additional_features

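    # Each analyzer used above is assumed to expose a small transformer-style
    # contract: fit_transform(X) / transform(X) returning a dense
    # (n_samples, n_features) float array, plus get_feature_names() whose
    # length matches that output and whose names carry the matching prefix
    # ("sentiment_", "readability_", "entity_", "linguistic_"). A hypothetical
    # minimal analyzer satisfying the contract (not part of this commit):
    #
    #     class ExampleAnalyzer:
    #         def fit_transform(self, X):
    #             return self.transform(X)
    #
    #         def transform(self, X):
    #             # one illustrative column per document
    #             return np.array([[float(len(t))] for t in X])
    #
    #         def get_feature_names(self):
    #             return ["example_length"]
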
    def _generate_feature_names(self):
        """Generate comprehensive feature names"""
        self.feature_names_ = []

        # TF-IDF feature names
        if self.tfidf_vectorizer is not None:
            tfidf_names = [f"tfidf_{name}" for name in self.tfidf_vectorizer.get_feature_names_out()]
            self.feature_names_.extend(tfidf_names)

        # Additional feature names (must match the column order used in fit)
        if self.sentiment_analyzer is not None:
            self.feature_names_.extend(self.sentiment_analyzer.get_feature_names())

        if self.readability_analyzer is not None:
            self.feature_names_.extend(self.readability_analyzer.get_feature_names())

        if self.entity_analyzer is not None:
            self.feature_names_.extend(self.entity_analyzer.get_feature_names())

        if self.linguistic_analyzer is not None:
            self.feature_names_.extend(self.linguistic_analyzer.get_feature_names())

        # Keep only the names of columns that survived feature selection
        if self.feature_selector is not None:
            selected_indices = self.feature_selector.get_support()
            self.feature_names_ = [name for i, name in enumerate(self.feature_names_)
                                   if selected_indices[i]]

    def _calculate_feature_importance(self):
        """Calculate feature importance scores"""
        if self.feature_selector is not None:
            scores = self.feature_selector.scores_
            selected_indices = self.feature_selector.get_support()

            # chi2 scores for the selected features (order matches feature_names_)
            selected_scores = scores[selected_indices]

            # Create importance dictionary
            self.feature_importance_ = {
                name: float(score) for name, score in zip(self.feature_names_, selected_scores)
            }

            # Sort by importance
            self.feature_importance_ = dict(
                sorted(self.feature_importance_.items(), key=lambda x: x[1], reverse=True)
            )

    def get_feature_names(self):
        """Get names of output features"""
        if not self.is_fitted_:
            raise ValueError("Pipeline must be fitted first")
        return self.feature_names_

    def get_feature_importance(self, top_k=None):
        """Get feature importance scores, sorted descending"""
        if not self.feature_importance_:
            return {}

        if top_k is not None:
            return dict(list(self.feature_importance_.items())[:top_k])

        return self.feature_importance_

    def get_feature_metadata(self):
        """Get comprehensive feature metadata"""
        if not self.is_fitted_:
            raise ValueError("Pipeline must be fitted first")

        metadata = {
            'total_features': len(self.feature_names_),
            'feature_types': {
                'tfidf_features': sum(1 for name in self.feature_names_ if name.startswith('tfidf_')),
                'sentiment_features': sum(1 for name in self.feature_names_ if name.startswith('sentiment_')),
                'readability_features': sum(1 for name in self.feature_names_ if name.startswith('readability_')),
                'entity_features': sum(1 for name in self.feature_names_ if name.startswith('entity_')),
                'linguistic_features': sum(1 for name in self.feature_names_ if name.startswith('linguistic_'))
            },
            'configuration': {
                'enable_sentiment': self.enable_sentiment,
                'enable_readability': self.enable_readability,
                'enable_entities': self.enable_entities,
                'enable_linguistic': self.enable_linguistic,
                'feature_selection_k': self.feature_selection_k,
                'tfidf_max_features': self.tfidf_max_features,
                'ngram_range': self.ngram_range
            },
            'feature_importance_available': bool(self.feature_importance_),
            'timestamp': datetime.now().isoformat()
        }

        return metadata

    def save_pipeline(self, filepath):
        """Save the fitted pipeline"""
        if not self.is_fitted_:
            raise ValueError("Pipeline must be fitted before saving")

        save_data = {
            'feature_engineer': self,
            'metadata': self.get_feature_metadata(),
            'feature_names': self.feature_names_,
            'feature_importance': self.feature_importance_
        }

        joblib.dump(save_data, filepath)
        logger.info(f"Feature engineering pipeline saved to {filepath}")

    @classmethod
    def load_pipeline(cls, filepath):
        """Load a fitted pipeline"""
        save_data = joblib.load(filepath)
        feature_engineer = save_data['feature_engineer']

        logger.info(f"Feature engineering pipeline loaded from {filepath}")
        return feature_engineer

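# Because AdvancedFeatureEngineer subclasses BaseEstimator/TransformerMixin, it
# can also be dropped straight into a scikit-learn Pipeline. A minimal sketch
# (the LogisticRegression step and the train_texts/train_labels/test_texts
# variables are illustrative, not part of this module):
#
#     from sklearn.linear_model import LogisticRegression
#     from sklearn.pipeline import Pipeline
#
#     clf = Pipeline([
#         ("features", AdvancedFeatureEngineer(feature_selection_k=2000)),
#         ("model", LogisticRegression(max_iter=1000)),
#     ])
#     clf.fit(train_texts, train_labels)   # train_texts: list of document strings
#     preds = clf.predict(test_texts)
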
def create_enhanced_pipeline(X_train, y_train,
                             enable_sentiment=True,
                             enable_readability=True,
                             enable_entities=True,
                             enable_linguistic=True,
                             feature_selection_k=5000):
    """
    Create and fit an enhanced feature engineering pipeline.

    Args:
        X_train: Training text data
        y_train: Training labels
        enable_sentiment: Enable sentiment analysis features
        enable_readability: Enable readability features
        enable_entities: Enable entity features
        enable_linguistic: Enable linguistic features
        feature_selection_k: Number of features to select

    Returns:
        Fitted AdvancedFeatureEngineer instance
    """
    logger.info("Creating enhanced feature engineering pipeline...")

    # Create feature engineer
    feature_engineer = AdvancedFeatureEngineer(
        enable_sentiment=enable_sentiment,
        enable_readability=enable_readability,
        enable_entities=enable_entities,
        enable_linguistic=enable_linguistic,
        feature_selection_k=feature_selection_k
    )

    # Fit the pipeline
    feature_engineer.fit(X_train, y_train)

    # Log feature information
    metadata = feature_engineer.get_feature_metadata()
    logger.info(f"Enhanced pipeline created with {metadata['total_features']} features")
    logger.info(f"Feature breakdown: {metadata['feature_types']}")

    return feature_engineer

def analyze_feature_importance(feature_engineer, top_k=20):
    """
    Analyze and display feature importance.

    Args:
        feature_engineer: Fitted AdvancedFeatureEngineer instance
        top_k: Number of top features to analyze

    Returns:
        Dictionary with feature analysis results
    """
    if not feature_engineer.is_fitted_:
        raise ValueError("Feature engineer must be fitted first")

    # Get feature importance
    importance = feature_engineer.get_feature_importance(top_k=top_k)
    metadata = feature_engineer.get_feature_metadata()

    # Analyze feature types in top features
    top_features = list(importance.keys())
    feature_type_counts = {}

    for feature in top_features:
        if feature.startswith('tfidf_'):
            feature_type = 'tfidf'
        elif feature.startswith('sentiment_'):
            feature_type = 'sentiment'
        elif feature.startswith('readability_'):
            feature_type = 'readability'
        elif feature.startswith('entity_'):
            feature_type = 'entity'
        elif feature.startswith('linguistic_'):
            feature_type = 'linguistic'
        else:
            feature_type = 'other'

        feature_type_counts[feature_type] = feature_type_counts.get(feature_type, 0) + 1

    analysis = {
        'top_features': importance,
        'feature_type_distribution': feature_type_counts,
        'total_features': metadata['total_features'],
        'feature_breakdown': metadata['feature_types'],
        'analysis_timestamp': datetime.now().isoformat()
    }

    return analysis
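

# A quick end-to-end sketch of how the module is meant to be driven, assuming
# the four analyzer modules are importable. The toy texts, labels, and output
# filename below are illustrative, not part of the original commit.
if __name__ == "__main__":
    texts = [
        "Scientists publish peer-reviewed study on climate trends.",
        "SHOCKING: miracle cure the government does not want you to see!",
    ] * 10
    labels = [0, 1] * 10

    # Fit the full pipeline, inspect the engineered features, then persist it.
    engineer = create_enhanced_pipeline(texts, labels, feature_selection_k=100)
    X = engineer.transform(texts)
    print(f"Feature matrix shape: {X.shape}")

    report = analyze_feature_importance(engineer, top_k=10)
    print(report['feature_type_distribution'])

    engineer.save_pipeline("feature_pipeline.joblib")
    restored = AdvancedFeatureEngineer.load_pipeline("feature_pipeline.joblib")
    assert restored.get_feature_names() == engineer.get_feature_names()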