Ahmedik95316 committed on
Commit
92a45c5
·
1 Parent(s): 9e0e49b

Update data/validation_schemas.py

Browse files
Files changed (1) hide show
  1. data/validation_schemas.py +32 -25
data/validation_schemas.py CHANGED
@@ -1,7 +1,7 @@
1
- # File: data/validation_schemas.py (NEW FILE)
2
  # Comprehensive Pydantic validation schemas for data quality assurance
3
 
4
- from pydantic import BaseModel, Field, validator, root_validator
5
  from typing import List, Optional, Dict, Any, Union
6
  from datetime import datetime
7
  import re
@@ -53,7 +53,8 @@ class TextContentSchema(BaseModel):
53
  description="The news article text content"
54
  )
55
 
56
- @validator('text')
 
57
  def validate_text_content(cls, v):
58
  """Comprehensive text content validation"""
59
  if not v or not isinstance(v, str):
@@ -155,7 +156,8 @@ class LabelSchema(BaseModel):
155
  description="Source reliability score (0-1)"
156
  )
157
 
158
- @validator('label')
 
159
  def validate_label(cls, v):
160
  """Validate label value"""
161
  if v not in [0, 1]:
@@ -188,7 +190,8 @@ class DataSourceSchema(BaseModel):
188
  description="Batch identifier for grouped data"
189
  )
190
 
191
- @validator('url')
 
192
  def validate_url(cls, v):
193
  """Validate URL format"""
194
  if v is not None:
@@ -244,7 +247,8 @@ class NewsArticleSchema(BaseModel):
244
  description="Overall quality score (0-1)"
245
  )
246
 
247
- @validator('title')
 
248
  def validate_title(cls, v):
249
  """Validate article title"""
250
  if v is not None:
@@ -259,7 +263,8 @@ class NewsArticleSchema(BaseModel):
259
 
260
  return v
261
 
262
- @validator('language')
 
263
  def validate_language(cls, v):
264
  """Validate language code"""
265
  valid_languages = ['en', 'es', 'fr', 'de', 'it', 'pt', 'nl', 'ru', 'zh', 'ja', 'ko']
@@ -267,11 +272,11 @@ class NewsArticleSchema(BaseModel):
267
  raise ValueError(f"Unsupported language code: {v}")
268
  return v
269
 
270
- @root_validator
271
- def validate_article_consistency(cls, values):
272
  """Cross-field validation"""
273
- text_content = values.get('text_content')
274
- title = values.get('title')
275
 
276
  if text_content and title:
277
  # Check if title and content are suspiciously similar
@@ -282,7 +287,7 @@ class NewsArticleSchema(BaseModel):
282
  # This is fine, just noting high similarity
283
  pass
284
 
285
- return values
286
 
287
  @property
288
  def text_quality_level(self) -> TextQualityLevel:
@@ -338,8 +343,8 @@ class BatchValidationSchema(BaseModel):
338
 
339
  articles: List[NewsArticleSchema] = Field(
340
  ...,
341
- min_items=1,
342
- max_items=10000,
343
  description="List of articles to validate"
344
  )
345
 
@@ -360,7 +365,8 @@ class BatchValidationSchema(BaseModel):
360
  description="Minimum quality score threshold"
361
  )
362
 
363
- @validator('articles')
 
364
  def validate_article_list(cls, v):
365
  """Validate article list"""
366
  if not v:
@@ -509,26 +515,27 @@ class BatchValidationResultSchema(BaseModel):
509
  description="Validation summary statistics"
510
  )
511
 
512
- @validator('valid_articles', 'invalid_articles')
513
- def validate_article_counts(cls, v, values):
 
514
  """Validate article count consistency"""
515
- if 'total_articles' in values:
516
- total = values['total_articles']
517
  if v > total:
518
  raise ValueError("Article count cannot exceed total")
519
  return v
520
 
521
- @root_validator
522
- def validate_counts_consistency(cls, values):
523
  """Validate count consistency"""
524
- total = values.get('total_articles', 0)
525
- valid = values.get('valid_articles', 0)
526
- invalid = values.get('invalid_articles', 0)
527
 
528
  if valid + invalid != total:
529
  raise ValueError("Valid + invalid articles must equal total articles")
530
 
531
- return values
532
 
533
  @property
534
  def success_rate(self) -> float:
 
1
+ # File: data/validation_schemas.py
2
  # Comprehensive Pydantic validation schemas for data quality assurance
3
 
4
+ from pydantic import BaseModel, Field, field_validator, model_validator
5
  from typing import List, Optional, Dict, Any, Union
6
  from datetime import datetime
7
  import re
 
53
  description="The news article text content"
54
  )
55
 
56
+ @field_validator('text')
57
+ @classmethod
58
  def validate_text_content(cls, v):
59
  """Comprehensive text content validation"""
60
  if not v or not isinstance(v, str):
 
156
  description="Source reliability score (0-1)"
157
  )
158
 
159
+ @field_validator('label')
160
+ @classmethod
161
  def validate_label(cls, v):
162
  """Validate label value"""
163
  if v not in [0, 1]:
 
190
  description="Batch identifier for grouped data"
191
  )
192
 
193
+ @field_validator('url')
194
+ @classmethod
195
  def validate_url(cls, v):
196
  """Validate URL format"""
197
  if v is not None:
 
247
  description="Overall quality score (0-1)"
248
  )
249
 
250
+ @field_validator('title')
251
+ @classmethod
252
  def validate_title(cls, v):
253
  """Validate article title"""
254
  if v is not None:
 
263
 
264
  return v
265
 
266
+ @field_validator('language')
267
+ @classmethod
268
  def validate_language(cls, v):
269
  """Validate language code"""
270
  valid_languages = ['en', 'es', 'fr', 'de', 'it', 'pt', 'nl', 'ru', 'zh', 'ja', 'ko']
 
272
  raise ValueError(f"Unsupported language code: {v}")
273
  return v
274
 
275
+ @model_validator(mode='after')
276
+ def validate_article_consistency(self):
277
  """Cross-field validation"""
278
+ text_content = self.text_content
279
+ title = self.title
280
 
281
  if text_content and title:
282
  # Check if title and content are suspiciously similar
 
287
  # This is fine, just noting high similarity
288
  pass
289
 
290
+ return self
291
 
292
  @property
293
  def text_quality_level(self) -> TextQualityLevel:
 
343
 
344
  articles: List[NewsArticleSchema] = Field(
345
  ...,
346
+ min_length=1,
347
+ max_length=10000,
348
  description="List of articles to validate"
349
  )
350
 
 
365
  description="Minimum quality score threshold"
366
  )
367
 
368
+ @field_validator('articles')
369
+ @classmethod
370
  def validate_article_list(cls, v):
371
  """Validate article list"""
372
  if not v:
 
515
  description="Validation summary statistics"
516
  )
517
 
518
+ @field_validator('valid_articles', 'invalid_articles')
519
+ @classmethod
520
+ def validate_article_counts(cls, v, info):
521
  """Validate article count consistency"""
522
+ if 'total_articles' in info.data:
523
+ total = info.data['total_articles']
524
  if v > total:
525
  raise ValueError("Article count cannot exceed total")
526
  return v
527
 
528
+ @model_validator(mode='after')
529
+ def validate_counts_consistency(self):
530
  """Validate count consistency"""
531
+ total = self.total_articles
532
+ valid = self.valid_articles
533
+ invalid = self.invalid_articles
534
 
535
  if valid + invalid != total:
536
  raise ValueError("Valid + invalid articles must equal total articles")
537
 
538
+ return self
539
 
540
  @property
541
  def success_rate(self) -> float: