Commit
·
92a45c5
1
Parent(s):
9e0e49b
Update data/validation_schemas.py
Browse files
- data/validation_schemas.py +32 -25
data/validation_schemas.py
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
-
# File: data/validation_schemas.py
|
2 |
# Comprehensive Pydantic validation schemas for data quality assurance
|
3 |
|
4 |
-
from pydantic import BaseModel, Field,
|
5 |
from typing import List, Optional, Dict, Any, Union
|
6 |
from datetime import datetime
|
7 |
import re
|
@@ -53,7 +53,8 @@ class TextContentSchema(BaseModel):
|
|
53 |
description="The news article text content"
|
54 |
)
|
55 |
|
56 |
-
@
|
|
|
57 |
def validate_text_content(cls, v):
|
58 |
"""Comprehensive text content validation"""
|
59 |
if not v or not isinstance(v, str):
|
@@ -155,7 +156,8 @@ class LabelSchema(BaseModel):
|
|
155 |
description="Source reliability score (0-1)"
|
156 |
)
|
157 |
|
158 |
-
@
|
|
|
159 |
def validate_label(cls, v):
|
160 |
"""Validate label value"""
|
161 |
if v not in [0, 1]:
|
@@ -188,7 +190,8 @@ class DataSourceSchema(BaseModel):
|
|
188 |
description="Batch identifier for grouped data"
|
189 |
)
|
190 |
|
191 |
-
@
|
|
|
192 |
def validate_url(cls, v):
|
193 |
"""Validate URL format"""
|
194 |
if v is not None:
|
@@ -244,7 +247,8 @@ class NewsArticleSchema(BaseModel):
|
|
244 |
description="Overall quality score (0-1)"
|
245 |
)
|
246 |
|
247 |
-
@
|
|
|
248 |
def validate_title(cls, v):
|
249 |
"""Validate article title"""
|
250 |
if v is not None:
|
@@ -259,7 +263,8 @@ class NewsArticleSchema(BaseModel):
|
|
259 |
|
260 |
return v
|
261 |
|
262 |
-
@
|
|
|
263 |
def validate_language(cls, v):
|
264 |
"""Validate language code"""
|
265 |
valid_languages = ['en', 'es', 'fr', 'de', 'it', 'pt', 'nl', 'ru', 'zh', 'ja', 'ko']
|
@@ -267,11 +272,11 @@ class NewsArticleSchema(BaseModel):
|
|
267 |
raise ValueError(f"Unsupported language code: {v}")
|
268 |
return v
|
269 |
|
270 |
-
@
|
271 |
-
def validate_article_consistency(
|
272 |
"""Cross-field validation"""
|
273 |
-
text_content =
|
274 |
-
title =
|
275 |
|
276 |
if text_content and title:
|
277 |
# Check if title and content are suspiciously similar
|
@@ -282,7 +287,7 @@ class NewsArticleSchema(BaseModel):
|
|
282 |
# This is fine, just noting high similarity
|
283 |
pass
|
284 |
|
285 |
-
return
|
286 |
|
287 |
@property
|
288 |
def text_quality_level(self) -> TextQualityLevel:
|
@@ -338,8 +343,8 @@ class BatchValidationSchema(BaseModel):
|
|
338 |
|
339 |
articles: List[NewsArticleSchema] = Field(
|
340 |
...,
|
341 |
-
|
342 |
-
|
343 |
description="List of articles to validate"
|
344 |
)
|
345 |
|
@@ -360,7 +365,8 @@ class BatchValidationSchema(BaseModel):
|
|
360 |
description="Minimum quality score threshold"
|
361 |
)
|
362 |
|
363 |
-
@
|
|
|
364 |
def validate_article_list(cls, v):
|
365 |
"""Validate article list"""
|
366 |
if not v:
|
@@ -509,26 +515,27 @@ class BatchValidationResultSchema(BaseModel):
|
|
509 |
description="Validation summary statistics"
|
510 |
)
|
511 |
|
512 |
-
@
|
513 |
-
|
|
|
514 |
"""Validate article count consistency"""
|
515 |
-
if 'total_articles' in
|
516 |
-
total =
|
517 |
if v > total:
|
518 |
raise ValueError("Article count cannot exceed total")
|
519 |
return v
|
520 |
|
521 |
-
@
|
522 |
-
def validate_counts_consistency(
|
523 |
"""Validate count consistency"""
|
524 |
-
total =
|
525 |
-
valid =
|
526 |
-
invalid =
|
527 |
|
528 |
if valid + invalid != total:
|
529 |
raise ValueError("Valid + invalid articles must equal total articles")
|
530 |
|
531 |
-
return
|
532 |
|
533 |
@property
|
534 |
def success_rate(self) -> float:
|
|
|
1 |
+
# File: data/validation_schemas.py
|
2 |
# Comprehensive Pydantic validation schemas for data quality assurance
|
3 |
|
4 |
+
from pydantic import BaseModel, Field, field_validator, model_validator
|
5 |
from typing import List, Optional, Dict, Any, Union
|
6 |
from datetime import datetime
|
7 |
import re
|
|
|
53 |
description="The news article text content"
|
54 |
)
|
55 |
|
56 |
+
@field_validator('text')
|
57 |
+
@classmethod
|
58 |
def validate_text_content(cls, v):
|
59 |
"""Comprehensive text content validation"""
|
60 |
if not v or not isinstance(v, str):
|
|
|
156 |
description="Source reliability score (0-1)"
|
157 |
)
|
158 |
|
159 |
+
@field_validator('label')
|
160 |
+
@classmethod
|
161 |
def validate_label(cls, v):
|
162 |
"""Validate label value"""
|
163 |
if v not in [0, 1]:
|
|
|
190 |
description="Batch identifier for grouped data"
|
191 |
)
|
192 |
|
193 |
+
@field_validator('url')
|
194 |
+
@classmethod
|
195 |
def validate_url(cls, v):
|
196 |
"""Validate URL format"""
|
197 |
if v is not None:
|
|
|
247 |
description="Overall quality score (0-1)"
|
248 |
)
|
249 |
|
250 |
+
@field_validator('title')
|
251 |
+
@classmethod
|
252 |
def validate_title(cls, v):
|
253 |
"""Validate article title"""
|
254 |
if v is not None:
|
|
|
263 |
|
264 |
return v
|
265 |
|
266 |
+
@field_validator('language')
|
267 |
+
@classmethod
|
268 |
def validate_language(cls, v):
|
269 |
"""Validate language code"""
|
270 |
valid_languages = ['en', 'es', 'fr', 'de', 'it', 'pt', 'nl', 'ru', 'zh', 'ja', 'ko']
|
|
|
272 |
raise ValueError(f"Unsupported language code: {v}")
|
273 |
return v
|
274 |
|
275 |
+
@model_validator(mode='after')
|
276 |
+
def validate_article_consistency(self):
|
277 |
"""Cross-field validation"""
|
278 |
+
text_content = self.text_content
|
279 |
+
title = self.title
|
280 |
|
281 |
if text_content and title:
|
282 |
# Check if title and content are suspiciously similar
|
|
|
287 |
# This is fine, just noting high similarity
|
288 |
pass
|
289 |
|
290 |
+
return self
|
291 |
|
292 |
@property
|
293 |
def text_quality_level(self) -> TextQualityLevel:
|
|
|
343 |
|
344 |
articles: List[NewsArticleSchema] = Field(
|
345 |
...,
|
346 |
+
min_length=1,
|
347 |
+
max_length=10000,
|
348 |
description="List of articles to validate"
|
349 |
)
|
350 |
|
|
|
365 |
description="Minimum quality score threshold"
|
366 |
)
|
367 |
|
368 |
+
@field_validator('articles')
|
369 |
+
@classmethod
|
370 |
def validate_article_list(cls, v):
|
371 |
"""Validate article list"""
|
372 |
if not v:
|
|
|
515 |
description="Validation summary statistics"
|
516 |
)
|
517 |
|
518 |
+
@field_validator('valid_articles', 'invalid_articles')
|
519 |
+
@classmethod
|
520 |
+
def validate_article_counts(cls, v, info):
|
521 |
"""Validate article count consistency"""
|
522 |
+
if 'total_articles' in info.data:
|
523 |
+
total = info.data['total_articles']
|
524 |
if v > total:
|
525 |
raise ValueError("Article count cannot exceed total")
|
526 |
return v
|
527 |
|
528 |
+
@model_validator(mode='after')
|
529 |
+
def validate_counts_consistency(self):
|
530 |
"""Validate count consistency"""
|
531 |
+
total = self.total_articles
|
532 |
+
valid = self.valid_articles
|
533 |
+
invalid = self.invalid_articles
|
534 |
|
535 |
if valid + invalid != total:
|
536 |
raise ValueError("Valid + invalid articles must equal total articles")
|
537 |
|
538 |
+
return self
|
539 |
|
540 |
@property
|
541 |
def success_rate(self) -> float:
|