Commit
·
23da975
1
Parent(s):
be90b49
Create readability_analyzer.py
Browse files — Adding Enhanced Feature Engineering Pipeline
- features/readability_analyzer.py +424 -0
features/readability_analyzer.py
ADDED
@@ -0,0 +1,424 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# features/readability_analyzer.py
|
2 |
+
# Readability and Linguistic Complexity Analysis Component
|
3 |
+
|
4 |
+
import numpy as np
|
5 |
+
import pandas as pd
|
6 |
+
import re
|
7 |
+
import logging
|
8 |
+
from typing import List, Dict, Any
|
9 |
+
from sklearn.base import BaseEstimator, TransformerMixin
|
10 |
+
import warnings
|
11 |
+
warnings.filterwarnings('ignore')
|
12 |
+
|
13 |
+
logger = logging.getLogger(__name__)
|
14 |
+
|
15 |
+
|
16 |
+
class ReadabilityAnalyzer(BaseEstimator, TransformerMixin):
    """
    Advanced readability and linguistic complexity analyzer.

    Detects patterns in text complexity that may indicate misinformation
    tactics. Each input text is mapped to a fixed vector of 15 numeric
    features (basic statistics, classic readability scores, complexity
    ratios, structural measures, and vocabulary sophistication); see
    ``get_feature_names`` for the exact ordering.

    The transformer learns nothing from the data: ``fit`` only sets the
    fitted flag so the object can participate in sklearn pipelines.
    """

    def __init__(self):
        # Sklearn-convention fitted flag (trailing underscore); set by fit().
        self.is_fitted_ = False

    def fit(self, X, y=None):
        """Fit the readability analyzer (for API consistency).

        No statistics are learned; the method only marks the transformer
        as fitted.

        Returns
        -------
        self
        """
        self.is_fitted_ = True
        return self

    def transform(self, X):
        """Extract readability and complexity features.

        Parameters
        ----------
        X : pandas.Series, list, or array-like of str
            Texts to analyze. Each element is coerced with ``str()``.

        Returns
        -------
        numpy.ndarray of shape (n_samples, 15)
            One feature vector per input text.

        Raises
        ------
        ValueError
            If called before ``fit``.
        """
        if not self.is_fitted_:
            raise ValueError("ReadabilityAnalyzer must be fitted before transform")

        # Normalize the container type so iteration below is uniform.
        if isinstance(X, pd.Series):
            X = X.values
        elif isinstance(X, list):
            X = np.array(X)

        features = []
        for text in X:
            # str() guards against non-string entries (NaN, numbers, None).
            text_features = self._extract_readability_features(str(text))
            features.append(text_features)

        return np.array(features)

    def fit_transform(self, X, y=None):
        """Fit and transform in one step."""
        return self.fit(X, y).transform(X)

    def _extract_readability_features(self, text):
        """Extract the full 15-element readability feature vector for one text.

        Returns a list of floats; all zeros when the text has no detectable
        words or sentences (too short to analyze).
        """
        # Basic text statistics shared by all downstream metrics.
        sentences = self._split_sentences(text)
        words = self._split_words(text)
        syllables = self._count_syllables_total(words)

        # Edge case: empty/degenerate text yields an all-zero vector so the
        # output matrix stays rectangular.
        if len(sentences) == 0 or len(words) == 0:
            return [0.0] * 15

        features = []

        # --- Basic metrics (3) ---
        avg_words_per_sentence = len(words) / len(sentences)
        avg_syllables_per_word = syllables / len(words)
        avg_chars_per_word = sum(len(word) for word in words) / len(words)

        features.extend([avg_words_per_sentence, avg_syllables_per_word, avg_chars_per_word])

        # --- Classic readability scores (3) ---
        flesch_reading_ease = self._calculate_flesch_reading_ease(words, sentences, syllables)
        flesch_kincaid_grade = self._calculate_flesch_kincaid_grade(words, sentences, syllables)
        automated_readability_index = self._calculate_ari(words, sentences, text)

        features.extend([flesch_reading_ease, flesch_kincaid_grade, automated_readability_index])

        # --- Complexity indicators (3) ---
        complex_words_ratio = self._calculate_complex_words_ratio(words)
        long_words_ratio = self._calculate_long_words_ratio(words)
        technical_terms_ratio = self._calculate_technical_terms_ratio(words)

        features.extend([complex_words_ratio, long_words_ratio, technical_terms_ratio])

        # --- Sentence structure complexity (3) ---
        sentence_length_variance = self._calculate_sentence_length_variance(sentences)
        punctuation_density = self._calculate_punctuation_density(text)
        subordinate_clause_ratio = self._calculate_subordinate_clause_ratio(text)

        features.extend([sentence_length_variance, punctuation_density, subordinate_clause_ratio])

        # --- Vocabulary sophistication (3) ---
        unique_word_ratio = self._calculate_unique_word_ratio(words)
        rare_word_ratio = self._calculate_rare_word_ratio(words)
        formal_language_ratio = self._calculate_formal_language_ratio(words)

        features.extend([unique_word_ratio, rare_word_ratio, formal_language_ratio])

        return features

    def _split_sentences(self, text):
        """Split text into sentences on terminal punctuation.

        Simple regex splitting — could be enhanced with NLTK. Empty
        fragments are dropped.
        """
        sentences = re.split(r'[.!?]+', text)
        sentences = [s.strip() for s in sentences if s.strip()]
        return sentences

    def _split_words(self, text):
        """Split text into lowercase alphabetic words (digits/punctuation dropped)."""
        words = re.findall(r'\b[a-zA-Z]+\b', text.lower())
        return words

    def _count_syllables(self, word):
        """Count syllables in a word (vowel-group approximation).

        Counts runs of vowels (including 'y'), subtracts one for a trailing
        silent 'e', and floors the result at 1.
        """
        word = word.lower()
        vowels = 'aeiouy'
        syllable_count = 0
        previous_was_vowel = False

        for char in word:
            is_vowel = char in vowels
            # Only the first vowel of a consecutive vowel run starts a syllable.
            if is_vowel and not previous_was_vowel:
                syllable_count += 1
            previous_was_vowel = is_vowel

        # Handle silent 'e' ("cake" -> 1), but never drop the only syllable.
        if word.endswith('e') and syllable_count > 1:
            syllable_count -= 1

        return max(1, syllable_count)  # Every word has at least 1 syllable

    def _count_syllables_total(self, words):
        """Count total syllables across a word list."""
        return sum(self._count_syllables(word) for word in words)

    def _calculate_flesch_reading_ease(self, words, sentences, syllables):
        """Calculate the Flesch Reading Ease score, clamped to [0, 100].

        Higher = easier to read. Standard formula:
        206.835 - 1.015*(words/sentences) - 84.6*(syllables/words).
        """
        if len(sentences) == 0 or len(words) == 0:
            return 0

        avg_sentence_length = len(words) / len(sentences)
        avg_syllables_per_word = syllables / len(words)

        score = 206.835 - (1.015 * avg_sentence_length) - (84.6 * avg_syllables_per_word)
        return max(0, min(100, score))  # Clamp between 0-100

    def _calculate_flesch_kincaid_grade(self, words, sentences, syllables):
        """Calculate the Flesch-Kincaid Grade Level (floored at 0).

        Standard formula: 0.39*(words/sentences) + 11.8*(syllables/words) - 15.59.
        """
        if len(sentences) == 0 or len(words) == 0:
            return 0

        avg_sentence_length = len(words) / len(sentences)
        avg_syllables_per_word = syllables / len(words)

        grade = (0.39 * avg_sentence_length) + (11.8 * avg_syllables_per_word) - 15.59
        return max(0, grade)

    def _calculate_ari(self, words, sentences, text):
        """Calculate the Automated Readability Index (floored at 0).

        Character count uses all non-whitespace characters of the raw text
        (so punctuation and digits count), per the standard ARI formula:
        4.71*(chars/words) + 0.5*(words/sentences) - 21.43.
        """
        if len(sentences) == 0 or len(words) == 0:
            return 0

        chars = len(re.sub(r'\s+', '', text))
        avg_chars_per_word = chars / len(words)
        avg_words_per_sentence = len(words) / len(sentences)

        ari = (4.71 * avg_chars_per_word) + (0.5 * avg_words_per_sentence) - 21.43
        return max(0, ari)

    def _calculate_complex_words_ratio(self, words):
        """Calculate ratio of complex words (3+ syllables)."""
        if not words:
            return 0

        complex_words = sum(1 for word in words if self._count_syllables(word) >= 3)
        return complex_words / len(words)

    def _calculate_long_words_ratio(self, words):
        """Calculate ratio of long words (7+ characters)."""
        if not words:
            return 0

        long_words = sum(1 for word in words if len(word) >= 7)
        return long_words / len(words)

    def _calculate_technical_terms_ratio(self, words):
        """Calculate ratio of potentially technical terms.

        Heuristic: a word longer than 6 characters ending in a Latinate/
        academic suffix is counted as technical.
        """
        if not words:
            return 0

        # Heuristics for technical terms (suffix set).
        technical_indicators = {
            'tion', 'sion', 'ment', 'ness', 'ance', 'ence', 'ism', 'ist',
            'ogy', 'ics', 'phy', 'logical', 'ical', 'ative', 'itive'
        }

        technical_words = 0
        for word in words:
            if (len(word) > 6 and
                    any(word.endswith(suffix) for suffix in technical_indicators)):
                technical_words += 1

        return technical_words / len(words)

    def _calculate_sentence_length_variance(self, sentences):
        """Calculate the (population) variance of sentence lengths in words."""
        if len(sentences) <= 1:
            return 0

        lengths = [len(sentence.split()) for sentence in sentences]
        mean_length = sum(lengths) / len(lengths)
        variance = sum((length - mean_length) ** 2 for length in lengths) / len(lengths)

        return variance

    def _calculate_punctuation_density(self, text):
        """Calculate density of punctuation marks (count / total characters).

        Counts . , ; : ! ? ( ) - " occurrences.
        """
        if not text:
            return 0

        # BUGFIX: the original pattern r'[.,;:!?()-"]' treats )-" as a
        # character range with ord(')') > ord('"'), so re.compile raised
        # re.error ("bad character range") on every call. Escaping the
        # hyphen makes it match the literal characters as intended.
        punctuation_marks = re.findall(r'[.,;:!?()\-"]', text)
        return len(punctuation_marks) / len(text)

    def _calculate_subordinate_clause_ratio(self, text):
        """Calculate ratio of subordinate clauses per sentence (approximation).

        Counts space-delimited occurrences of subordinating conjunctions and
        relative pronouns, divided by the sentence count.
        """
        if not text:
            return 0

        # Subordinating conjunctions and relative pronouns.
        subordinate_indicators = [
            'although', 'because', 'since', 'while', 'whereas', 'if', 'unless',
            'when', 'whenever', 'where', 'wherever', 'that', 'which', 'who',
            'whom', 'whose', 'after', 'before', 'until', 'as'
        ]

        text_lower = text.lower()
        # NOTE(review): the ' word ' pattern misses indicators at the start/
        # end of the text or adjacent to punctuation — a word-boundary regex
        # would catch more, but would change existing feature values.
        subordinate_count = sum(text_lower.count(f' {indicator} ') for indicator in subordinate_indicators)
        sentences = self._split_sentences(text)

        return subordinate_count / len(sentences) if sentences else 0

    def _calculate_unique_word_ratio(self, words):
        """Calculate ratio of unique words (lexical diversity / type-token ratio)."""
        if not words:
            return 0

        unique_words = len(set(words))
        return unique_words / len(words)

    def _calculate_rare_word_ratio(self, words):
        """Calculate ratio of rare/uncommon words.

        A word counts as rare when it is longer than 4 characters and not in
        the hard-coded common-word set below.
        """
        if not words:
            return 0

        # Common English words (subset of the most frequent words).
        common_words = {
            'the', 'of', 'and', 'a', 'to', 'in', 'is', 'you', 'that', 'it',
            'he', 'was', 'for', 'on', 'are', 'as', 'with', 'his', 'they',
            'i', 'at', 'be', 'this', 'have', 'from', 'or', 'one', 'had',
            'by', 'word', 'but', 'not', 'what', 'all', 'were', 'we', 'when',
            'your', 'can', 'said', 'there', 'each', 'which', 'she', 'do',
            'how', 'their', 'if', 'will', 'up', 'other', 'about', 'out',
            'many', 'then', 'them', 'these', 'so', 'some', 'her', 'would',
            'make', 'like', 'into', 'him', 'has', 'two', 'more', 'very',
            'after', 'words', 'first', 'where', 'much', 'through', 'back',
            'years', 'work', 'came', 'right', 'used', 'take', 'three',
            'states', 'himself', 'few', 'house', 'use', 'during', 'without',
            'again', 'place', 'around', 'however', 'small', 'found', 'mrs',
            'thought', 'went', 'say', 'part', 'once', 'general', 'high',
            'upon', 'school', 'every', 'don', 'does', 'got', 'united',
            'left', 'number', 'course', 'war', 'until', 'always', 'away',
            'something', 'fact', 'though', 'water', 'less', 'public', 'put',
            'think', 'almost', 'hand', 'enough', 'far', 'took', 'head',
            'yet', 'government', 'system', 'better', 'set', 'told', 'nothing',
            'night', 'end', 'why', 'called', 'didn', 'eyes', 'find', 'going',
            'look', 'asked', 'later', 'knew', 'point', 'next', 'city', 'did',
            'want', 'way', 'could', 'people', 'may', 'says', 'each', 'those',
            'now', 'such', 'here', 'take', 'than', 'only', 'well', 'year'
        }

        rare_words = sum(1 for word in words if word not in common_words and len(word) > 4)
        return rare_words / len(words)

    def _calculate_formal_language_ratio(self, words):
        """Calculate ratio of formal/academic language.

        A word counts as formal when it is an explicit formal connective, or
        longer than 5 characters with an academic suffix.
        """
        if not words:
            return 0

        # Formal connectives / discourse markers.
        formal_indicators = {
            'therefore', 'however', 'furthermore', 'moreover', 'nevertheless',
            'consequently', 'subsequently', 'accordingly', 'thus', 'hence',
            'whereas', 'whereby', 'wherein', 'hereafter', 'heretofore',
            'notwithstanding', 'inasmuch', 'insofar', 'albeit', 'vis'
        }

        # Academic/formal suffixes.
        formal_suffixes = {
            'tion', 'sion', 'ment', 'ance', 'ence', 'ity', 'ness', 'ism',
            'ize', 'ise', 'ate', 'fy', 'able', 'ible', 'ous', 'eous',
            'ious', 'ive', 'ary', 'ory', 'al', 'ic', 'ical'
        }

        formal_words = 0
        for word in words:
            if (word in formal_indicators or
                    (len(word) > 5 and any(word.endswith(suffix) for suffix in formal_suffixes))):
                formal_words += 1

        return formal_words / len(words)

    def get_feature_names(self):
        """Get names of the 15 extracted features, in transform() column order."""
        feature_names = [
            'readability_avg_words_per_sentence',
            'readability_avg_syllables_per_word',
            'readability_avg_chars_per_word',
            'readability_flesch_reading_ease',
            'readability_flesch_kincaid_grade',
            'readability_automated_readability_index',
            'readability_complex_words_ratio',
            'readability_long_words_ratio',
            'readability_technical_terms_ratio',
            'readability_sentence_length_variance',
            'readability_punctuation_density',
            'readability_subordinate_clause_ratio',
            'readability_unique_word_ratio',
            'readability_rare_word_ratio',
            'readability_formal_language_ratio'
        ]

        return feature_names

    def analyze_text_readability(self, text):
        """Detailed readability analysis of a single text.

        Returns a nested dict with basic stats, readability scores,
        complexity metrics, structure analysis, and an interpretation
        (readability level, grade level, complexity assessment). Returns an
        error dict when the text has no detectable words or sentences.

        Raises
        ------
        ValueError
            If called before ``fit``.
        """
        if not self.is_fitted_:
            raise ValueError("ReadabilityAnalyzer must be fitted before analysis")

        sentences = self._split_sentences(text)
        words = self._split_words(text)
        syllables = self._count_syllables_total(words)

        if len(sentences) == 0 or len(words) == 0:
            return {
                'error': 'Text too short for analysis',
                'text_length': len(text),
                'word_count': len(words),
                'sentence_count': len(sentences)
            }

        analysis = {
            'basic_stats': {
                'text_length': len(text),
                'word_count': len(words),
                'sentence_count': len(sentences),
                'syllable_count': syllables,
                'avg_words_per_sentence': len(words) / len(sentences),
                'avg_syllables_per_word': syllables / len(words),
                'avg_chars_per_word': sum(len(word) for word in words) / len(words)
            },
            'readability_scores': {
                'flesch_reading_ease': self._calculate_flesch_reading_ease(words, sentences, syllables),
                'flesch_kincaid_grade': self._calculate_flesch_kincaid_grade(words, sentences, syllables),
                'automated_readability_index': self._calculate_ari(words, sentences, text)
            },
            'complexity_metrics': {
                'complex_words_ratio': self._calculate_complex_words_ratio(words),
                'long_words_ratio': self._calculate_long_words_ratio(words),
                'technical_terms_ratio': self._calculate_technical_terms_ratio(words),
                'unique_word_ratio': self._calculate_unique_word_ratio(words),
                'rare_word_ratio': self._calculate_rare_word_ratio(words),
                'formal_language_ratio': self._calculate_formal_language_ratio(words)
            },
            'structure_analysis': {
                'sentence_length_variance': self._calculate_sentence_length_variance(sentences),
                'punctuation_density': self._calculate_punctuation_density(text),
                'subordinate_clause_ratio': self._calculate_subordinate_clause_ratio(text)
            }
        }

        # Interpret readability level from the standard Flesch score bands.
        flesch_score = analysis['readability_scores']['flesch_reading_ease']
        if flesch_score >= 90:
            readability_level = 'very_easy'
        elif flesch_score >= 80:
            readability_level = 'easy'
        elif flesch_score >= 70:
            readability_level = 'fairly_easy'
        elif flesch_score >= 60:
            readability_level = 'standard'
        elif flesch_score >= 50:
            readability_level = 'fairly_difficult'
        elif flesch_score >= 30:
            readability_level = 'difficult'
        else:
            readability_level = 'very_difficult'

        analysis['interpretation'] = {
            'readability_level': readability_level,
            'grade_level': analysis['readability_scores']['flesch_kincaid_grade'],
            'complexity_assessment': self._assess_complexity(analysis)
        }

        return analysis

    def _assess_complexity(self, analysis):
        """Assess overall complexity level ('low'/'medium'/'high') from metrics."""
        complexity_indicators = [
            analysis['complexity_metrics']['complex_words_ratio'],
            analysis['complexity_metrics']['technical_terms_ratio'],
            analysis['complexity_metrics']['formal_language_ratio'],
            min(1.0, analysis['structure_analysis']['subordinate_clause_ratio'])  # Cap at 1.0
        ]

        avg_complexity = sum(complexity_indicators) / len(complexity_indicators)

        if avg_complexity > 0.3:
            return 'high'
        elif avg_complexity > 0.15:
            return 'medium'
        else:
            return 'low'
|