Ahmedik95316 committed
Commit 23da975 · 1 Parent(s): be90b49

Create readability_analyzer.py


Adding Enhanced Feature Engineering Pipeline

Files changed (1)
  1. features/readability_analyzer.py +424 -0
features/readability_analyzer.py ADDED
@@ -0,0 +1,424 @@
# features/readability_analyzer.py
# Readability and Linguistic Complexity Analysis Component

import numpy as np
import pandas as pd
import re
import logging
from typing import List, Dict, Any
from sklearn.base import BaseEstimator, TransformerMixin
import warnings
warnings.filterwarnings('ignore')

logger = logging.getLogger(__name__)


class ReadabilityAnalyzer(BaseEstimator, TransformerMixin):
    """
    Advanced readability and linguistic complexity analyzer.
    Detects patterns in text complexity that may indicate misinformation tactics.
    """

    def __init__(self):
        self.is_fitted_ = False

    def fit(self, X, y=None):
        """Fit the readability analyzer (for API consistency)"""
        self.is_fitted_ = True
        return self

    def transform(self, X):
        """Extract readability and complexity features"""
        if not self.is_fitted_:
            raise ValueError("ReadabilityAnalyzer must be fitted before transform")

        # Convert input to array if needed
        if isinstance(X, pd.Series):
            X = X.values
        elif isinstance(X, list):
            X = np.array(X)

        features = []

        for text in X:
            text_features = self._extract_readability_features(str(text))
            features.append(text_features)

        return np.array(features)

    def fit_transform(self, X, y=None):
        """Fit and transform in one step"""
        return self.fit(X, y).transform(X)
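
    # Illustrative integration sketch (not in the original commit): because the
    # class follows the scikit-learn estimator API, it should drop into a
    # Pipeline; the step names and classifier below are assumptions.
    #
    #   from sklearn.pipeline import Pipeline
    #   from sklearn.linear_model import LogisticRegression
    #
    #   pipe = Pipeline([
    #       ('readability', ReadabilityAnalyzer()),
    #       ('clf', LogisticRegression(max_iter=1000)),
    #   ])
    #   pipe.fit(train_texts, train_labels)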

    def _extract_readability_features(self, text):
        """Extract comprehensive readability features"""
        # Basic text statistics
        sentences = self._split_sentences(text)
        words = self._split_words(text)
        syllables = self._count_syllables_total(words)

        # Handle edge cases
        if len(sentences) == 0 or len(words) == 0:
            return [0.0] * 15

        features = []

        # Basic metrics
        avg_words_per_sentence = len(words) / len(sentences)
        avg_syllables_per_word = syllables / len(words)
        avg_chars_per_word = sum(len(word) for word in words) / len(words)

        features.extend([avg_words_per_sentence, avg_syllables_per_word, avg_chars_per_word])

        # Readability scores
        flesch_reading_ease = self._calculate_flesch_reading_ease(words, sentences, syllables)
        flesch_kincaid_grade = self._calculate_flesch_kincaid_grade(words, sentences, syllables)
        automated_readability_index = self._calculate_ari(words, sentences, text)

        features.extend([flesch_reading_ease, flesch_kincaid_grade, automated_readability_index])

        # Complexity indicators
        complex_words_ratio = self._calculate_complex_words_ratio(words)
        long_words_ratio = self._calculate_long_words_ratio(words)
        technical_terms_ratio = self._calculate_technical_terms_ratio(words)

        features.extend([complex_words_ratio, long_words_ratio, technical_terms_ratio])

        # Sentence structure complexity
        sentence_length_variance = self._calculate_sentence_length_variance(sentences)
        punctuation_density = self._calculate_punctuation_density(text)
        subordinate_clause_ratio = self._calculate_subordinate_clause_ratio(text)

        features.extend([sentence_length_variance, punctuation_density, subordinate_clause_ratio])

        # Vocabulary sophistication
        unique_word_ratio = self._calculate_unique_word_ratio(words)
        rare_word_ratio = self._calculate_rare_word_ratio(words)
        formal_language_ratio = self._calculate_formal_language_ratio(words)

        features.extend([unique_word_ratio, rare_word_ratio, formal_language_ratio])

        return features

    def _split_sentences(self, text):
        """Split text into sentences"""
        # Simple sentence splitting - could be enhanced with NLTK
        sentences = re.split(r'[.!?]+', text)
        sentences = [s.strip() for s in sentences if s.strip()]
        return sentences

    def _split_words(self, text):
        """Split text into words"""
        words = re.findall(r'\b[a-zA-Z]+\b', text.lower())
        return words

    def _count_syllables(self, word):
        """Count syllables in a word (approximation)"""
        word = word.lower()
        vowels = 'aeiouy'
        syllable_count = 0
        previous_was_vowel = False

        for char in word:
            is_vowel = char in vowels
            if is_vowel and not previous_was_vowel:
                syllable_count += 1
            previous_was_vowel = is_vowel

        # Handle silent 'e'
        if word.endswith('e') and syllable_count > 1:
            syllable_count -= 1

        return max(1, syllable_count)  # Every word has at least 1 syllable

    def _count_syllables_total(self, words):
        """Count total syllables in word list"""
        return sum(self._count_syllables(word) for word in words)
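
    # Illustrative behavior of the heuristic (hand-checked examples, not from
    # the original commit): _count_syllables("misinformation") -> 5 (correct),
    # while _count_syllables("create") -> 1, because the silent-'e' rule fires
    # even though "cre-ate" has two syllables. The approximation trades some
    # accuracy for speed and zero external dependencies.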

    def _calculate_flesch_reading_ease(self, words, sentences, syllables):
        """Calculate Flesch Reading Ease score"""
        if len(sentences) == 0 or len(words) == 0:
            return 0

        avg_sentence_length = len(words) / len(sentences)
        avg_syllables_per_word = syllables / len(words)

        score = 206.835 - (1.015 * avg_sentence_length) - (84.6 * avg_syllables_per_word)
        return max(0, min(100, score))  # Clamp between 0-100

    def _calculate_flesch_kincaid_grade(self, words, sentences, syllables):
        """Calculate Flesch-Kincaid Grade Level"""
        if len(sentences) == 0 or len(words) == 0:
            return 0

        avg_sentence_length = len(words) / len(sentences)
        avg_syllables_per_word = syllables / len(words)

        grade = (0.39 * avg_sentence_length) + (11.8 * avg_syllables_per_word) - 15.59
        return max(0, grade)

    def _calculate_ari(self, words, sentences, text):
        """Calculate Automated Readability Index"""
        if len(sentences) == 0 or len(words) == 0:
            return 0

        chars = len(re.sub(r'\s+', '', text))
        avg_chars_per_word = chars / len(words)
        avg_words_per_sentence = len(words) / len(sentences)

        ari = (4.71 * avg_chars_per_word) + (0.5 * avg_words_per_sentence) - 21.43
        return max(0, ari)
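
    # Worked example (illustrative numbers, not from the original commit): for
    # text averaging 15 words per sentence and 1.5 syllables per word:
    #   Flesch Reading Ease  = 206.835 - 1.015*15 - 84.6*1.5 = 64.71 -> 'standard'
    #   Flesch-Kincaid Grade = 0.39*15 + 11.8*1.5 - 15.59    = 7.96  -> ~8th grade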

    def _calculate_complex_words_ratio(self, words):
        """Calculate ratio of complex words (3+ syllables)"""
        if not words:
            return 0

        complex_words = sum(1 for word in words if self._count_syllables(word) >= 3)
        return complex_words / len(words)

    def _calculate_long_words_ratio(self, words):
        """Calculate ratio of long words (7+ characters)"""
        if not words:
            return 0

        long_words = sum(1 for word in words if len(word) >= 7)
        return long_words / len(words)

    def _calculate_technical_terms_ratio(self, words):
        """Calculate ratio of potentially technical terms"""
        if not words:
            return 0

        # Heuristics for technical terms
        technical_indicators = {
            'tion', 'sion', 'ment', 'ness', 'ance', 'ence', 'ism', 'ist',
            'ogy', 'ics', 'phy', 'logical', 'ical', 'ative', 'itive'
        }

        technical_words = 0
        for word in words:
            if (len(word) > 6 and
                    any(word.endswith(suffix) for suffix in technical_indicators)):
                technical_words += 1

        return technical_words / len(words)

    def _calculate_sentence_length_variance(self, sentences):
        """Calculate variance in sentence lengths"""
        if len(sentences) <= 1:
            return 0

        lengths = [len(sentence.split()) for sentence in sentences]
        mean_length = sum(lengths) / len(lengths)
        variance = sum((length - mean_length) ** 2 for length in lengths) / len(lengths)

        return variance

    def _calculate_punctuation_density(self, text):
        """Calculate density of punctuation marks"""
        if not text:
            return 0

        # '-' must be escaped here: left bare between ')' and '"' it would be
        # parsed as an invalid character range and raise re.error at call time
        punctuation_marks = re.findall(r'[.,;:!?()\-"]', text)
        return len(punctuation_marks) / len(text)

    def _calculate_subordinate_clause_ratio(self, text):
        """Calculate ratio of subordinate clauses (approximation)"""
        if not text:
            return 0

        # Look for subordinating conjunctions and relative pronouns
        subordinate_indicators = [
            'although', 'because', 'since', 'while', 'whereas', 'if', 'unless',
            'when', 'whenever', 'where', 'wherever', 'that', 'which', 'who',
            'whom', 'whose', 'after', 'before', 'until', 'as'
        ]

        text_lower = text.lower()
        subordinate_count = sum(text_lower.count(f' {indicator} ') for indicator in subordinate_indicators)
        sentences = self._split_sentences(text)

        return subordinate_count / len(sentences) if sentences else 0

    def _calculate_unique_word_ratio(self, words):
        """Calculate ratio of unique words (lexical diversity)"""
        if not words:
            return 0

        unique_words = len(set(words))
        return unique_words / len(words)

    def _calculate_rare_word_ratio(self, words):
        """Calculate ratio of rare/uncommon words"""
        if not words:
            return 0

        # Compact built-in set of high-frequency English words (roughly 200
        # entries, not a full top-1000 list; a real frequency table would
        # sharpen this estimate)
        common_words = {
            'the', 'of', 'and', 'a', 'to', 'in', 'is', 'you', 'that', 'it',
            'he', 'was', 'for', 'on', 'are', 'as', 'with', 'his', 'they',
            'i', 'at', 'be', 'this', 'have', 'from', 'or', 'one', 'had',
            'by', 'word', 'but', 'not', 'what', 'all', 'were', 'we', 'when',
            'your', 'can', 'said', 'there', 'each', 'which', 'she', 'do',
            'how', 'their', 'if', 'will', 'up', 'other', 'about', 'out',
            'many', 'then', 'them', 'these', 'so', 'some', 'her', 'would',
            'make', 'like', 'into', 'him', 'has', 'two', 'more', 'very',
            'after', 'words', 'first', 'where', 'much', 'through', 'back',
            'years', 'work', 'came', 'right', 'used', 'take', 'three',
            'states', 'himself', 'few', 'house', 'use', 'during', 'without',
            'again', 'place', 'around', 'however', 'small', 'found', 'mrs',
            'thought', 'went', 'say', 'part', 'once', 'general', 'high',
            'upon', 'school', 'every', 'don', 'does', 'got', 'united',
            'left', 'number', 'course', 'war', 'until', 'always', 'away',
            'something', 'fact', 'though', 'water', 'less', 'public', 'put',
            'think', 'almost', 'hand', 'enough', 'far', 'took', 'head',
            'yet', 'government', 'system', 'better', 'set', 'told', 'nothing',
            'night', 'end', 'why', 'called', 'didn', 'eyes', 'find', 'going',
            'look', 'asked', 'later', 'knew', 'point', 'next', 'city', 'did',
            'want', 'way', 'could', 'people', 'may', 'says', 'those',
            'now', 'such', 'here', 'than', 'only', 'well', 'year'
        }

        rare_words = sum(1 for word in words if word not in common_words and len(word) > 4)
        return rare_words / len(words)

    def _calculate_formal_language_ratio(self, words):
        """Calculate ratio of formal/academic language"""
        if not words:
            return 0

        # Formal language indicators
        formal_indicators = {
            'therefore', 'however', 'furthermore', 'moreover', 'nevertheless',
            'consequently', 'subsequently', 'accordingly', 'thus', 'hence',
            'whereas', 'whereby', 'wherein', 'hereafter', 'heretofore',
            'notwithstanding', 'inasmuch', 'insofar', 'albeit', 'vis'
        }

        # Academic/formal suffixes
        formal_suffixes = {
            'tion', 'sion', 'ment', 'ance', 'ence', 'ity', 'ness', 'ism',
            'ize', 'ise', 'ate', 'fy', 'able', 'ible', 'ous', 'eous',
            'ious', 'ive', 'ary', 'ory', 'al', 'ic', 'ical'
        }

        formal_words = 0
        for word in words:
            if (word in formal_indicators or
                    (len(word) > 5 and any(word.endswith(suffix) for suffix in formal_suffixes))):
                formal_words += 1

        return formal_words / len(words)

    def get_feature_names(self):
        """Get names of extracted features"""
        feature_names = [
            'readability_avg_words_per_sentence',
            'readability_avg_syllables_per_word',
            'readability_avg_chars_per_word',
            'readability_flesch_reading_ease',
            'readability_flesch_kincaid_grade',
            'readability_automated_readability_index',
            'readability_complex_words_ratio',
            'readability_long_words_ratio',
            'readability_technical_terms_ratio',
            'readability_sentence_length_variance',
            'readability_punctuation_density',
            'readability_subordinate_clause_ratio',
            'readability_unique_word_ratio',
            'readability_rare_word_ratio',
            'readability_formal_language_ratio'
        ]

        return feature_names
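
    # Illustrative pairing of feature names with transform output (sketch, not
    # in the original commit; 'texts' is a placeholder):
    #   X_feat = analyzer.transform(texts)
    #   df = pd.DataFrame(X_feat, columns=analyzer.get_feature_names())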

    def analyze_text_readability(self, text):
        """Detailed readability analysis of a single text"""
        if not self.is_fitted_:
            raise ValueError("ReadabilityAnalyzer must be fitted before analysis")

        sentences = self._split_sentences(text)
        words = self._split_words(text)
        syllables = self._count_syllables_total(words)

        if len(sentences) == 0 or len(words) == 0:
            return {
                'error': 'Text too short for analysis',
                'text_length': len(text),
                'word_count': len(words),
                'sentence_count': len(sentences)
            }

        analysis = {
            'basic_stats': {
                'text_length': len(text),
                'word_count': len(words),
                'sentence_count': len(sentences),
                'syllable_count': syllables,
                'avg_words_per_sentence': len(words) / len(sentences),
                'avg_syllables_per_word': syllables / len(words),
                'avg_chars_per_word': sum(len(word) for word in words) / len(words)
            },
            'readability_scores': {
                'flesch_reading_ease': self._calculate_flesch_reading_ease(words, sentences, syllables),
                'flesch_kincaid_grade': self._calculate_flesch_kincaid_grade(words, sentences, syllables),
                'automated_readability_index': self._calculate_ari(words, sentences, text)
            },
            'complexity_metrics': {
                'complex_words_ratio': self._calculate_complex_words_ratio(words),
                'long_words_ratio': self._calculate_long_words_ratio(words),
                'technical_terms_ratio': self._calculate_technical_terms_ratio(words),
                'unique_word_ratio': self._calculate_unique_word_ratio(words),
                'rare_word_ratio': self._calculate_rare_word_ratio(words),
                'formal_language_ratio': self._calculate_formal_language_ratio(words)
            },
            'structure_analysis': {
                'sentence_length_variance': self._calculate_sentence_length_variance(sentences),
                'punctuation_density': self._calculate_punctuation_density(text),
                'subordinate_clause_ratio': self._calculate_subordinate_clause_ratio(text)
            }
        }

        # Interpret readability level
        flesch_score = analysis['readability_scores']['flesch_reading_ease']
        if flesch_score >= 90:
            readability_level = 'very_easy'
        elif flesch_score >= 80:
            readability_level = 'easy'
        elif flesch_score >= 70:
            readability_level = 'fairly_easy'
        elif flesch_score >= 60:
            readability_level = 'standard'
        elif flesch_score >= 50:
            readability_level = 'fairly_difficult'
        elif flesch_score >= 30:
            readability_level = 'difficult'
        else:
            readability_level = 'very_difficult'

        analysis['interpretation'] = {
            'readability_level': readability_level,
            'grade_level': analysis['readability_scores']['flesch_kincaid_grade'],
            'complexity_assessment': self._assess_complexity(analysis)
        }

        return analysis

    def _assess_complexity(self, analysis):
        """Assess overall complexity level"""
        complexity_indicators = [
            analysis['complexity_metrics']['complex_words_ratio'],
            analysis['complexity_metrics']['technical_terms_ratio'],
            analysis['complexity_metrics']['formal_language_ratio'],
            min(1.0, analysis['structure_analysis']['subordinate_clause_ratio'])  # Cap at 1.0
        ]

        avg_complexity = sum(complexity_indicators) / len(complexity_indicators)

        if avg_complexity > 0.3:
            return 'high'
        elif avg_complexity > 0.15:
            return 'medium'
        else:
            return 'low'
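

# Minimal usage sketch (an addition for illustration, not part of the original
# commit; the sample texts are placeholders).
if __name__ == "__main__":
    analyzer = ReadabilityAnalyzer()
    sample_texts = [
        "The government announced new policies. Experts disagree about the impact.",
        "SHOCKING!!! You won't believe what they found!!!",
    ]
    analyzer.fit(sample_texts)

    # Feature matrix: one row per text, columns aligned with get_feature_names()
    feature_matrix = analyzer.transform(sample_texts)
    for name, value in zip(analyzer.get_feature_names(), feature_matrix[0]):
        print(f"{name}: {value:.3f}")

    # Detailed single-text report with scores, ratios, and interpretation
    report = analyzer.analyze_text_readability(sample_texts[0])
    print(report['interpretation'])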