rivapereira123 committed on
Commit
f2f4366
·
verified ·
1 Parent(s): fed12ea

Update core/fact_checker.py

Browse files
Files changed (1) hide show
  1. core/fact_checker.py +82 -38
core/fact_checker.py CHANGED
@@ -1,87 +1,131 @@
1
-
2
  import re
3
- from typing import Dict, List, Any
4
 
5
  def clean_ocr_artifacts(text: str) -> str:
6
  text = re.sub(r'\s{2,}', ' ', text)
7
- text = re.sub(r'(?<=[\.!?]\s)([eEoO])([A-Z][a-z]+)', r'\2', text)
8
  text = re.sub(r'\b[Aa]love\b', 'aloe', text)
9
  text = re.sub(r'\bRelevanci\b', 'Relevance', text)
10
  text = re.sub(r'\bAlove\b', 'Aloe', text)
11
  text = re.sub(r'\b[aA]dvice\b', 'advice', text)
12
  return text.strip()
13
 
 
14
  class MedicalFactChecker:
 
 
15
  def __init__(self):
 
16
  self.contraindications = self._load_contraindications()
17
  self.dosage_patterns = self._compile_dosage_patterns()
18
- self.definitive_patterns = [re.compile(r, re.IGNORECASE) for r in [
19
- r'always\s+(?:use|take|apply)',
20
- r'never\s+(?:use|take|apply)',
21
- r'will\s+(?:cure|heal|fix)',
22
- r'guaranteed\s+to',
23
- r'completely\s+(?:safe|effective)'
24
- ]]
 
 
25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  def _load_contraindications(self) -> Dict[str, List[str]]:
 
27
  return {
28
  "aspirin": ["children under 16", "bleeding disorders", "stomach ulcers"],
29
  "ibuprofen": ["kidney disease", "heart failure", "stomach bleeding"],
30
  "hydrogen_peroxide": ["deep wounds", "closed wounds", "eyes"],
31
  "tourniquets": ["non-life-threatening bleeding", "without proper training"]
32
  }
33
-
34
  def _compile_dosage_patterns(self) -> List[re.Pattern]:
 
35
  patterns = [
36
- r'\d+\s*mg\b',
37
- r'\d+\s*g\b',
38
- r'\d+\s*ml\b',
39
- r'\d+\s*tablets?\b',
40
- r'\d+\s*times?\s+(?:per\s+)?day\b',
41
- r'every\s+\d+\s+hours?\b'
42
  ]
43
- return [re.compile(p, re.IGNORECASE) for p in patterns]
44
-
45
  def check_medical_accuracy(self, response: str, context: str) -> Dict[str, Any]:
 
 
 
46
  issues = []
47
  warnings = []
48
  accuracy_score = 0.0
49
-
 
50
  response_lower = response.lower()
51
-
52
- for med, contraindications in self.contraindications.items():
53
- if med in response_lower:
54
- for item in contraindications:
55
- if any(word in response_lower for word in item.split()):
56
- issues.append(f"Contraindication: {med} with {item}")
57
  accuracy_score -= 0.3
58
  break
59
-
 
60
  if context:
61
  resp_words = set(response_lower.split())
62
  ctx_words = set(context.lower().split())
63
  context_similarity = len(resp_words & ctx_words) / len(resp_words | ctx_words) if ctx_words else 0.0
64
- if context_similarity < 0.5:
65
- warnings.append(f"Low context match: {context_similarity:.2f}")
66
  accuracy_score -= 0.1
67
-
 
 
 
 
 
 
 
 
 
68
  for pattern in self.definitive_patterns:
69
  if pattern.search(response):
70
- issues.append("Definitive claim detected")
71
  accuracy_score -= 0.4
72
  break
73
-
 
74
  for pattern in self.dosage_patterns:
75
  if pattern.search(response):
76
- warnings.append("Dosage detected verify with expert")
77
  accuracy_score -= 0.1
78
  break
79
-
80
- confidence = max(0.0, min(1.0, 0.8 + accuracy_score))
 
81
  return {
82
- "confidence_score": confidence,
83
  "issues": issues,
84
  "warnings": warnings,
85
- "context_similarity": context_similarity if context else 0.0,
86
- "is_safe": len(issues) == 0 and confidence > 0.5
87
  }
 
 
1
  import re
2
+ from typing import Dict, Any, List
3
 
4
  def clean_ocr_artifacts(text: str) -> str:
5
  text = re.sub(r'\s{2,}', ' ', text)
6
+ text = re.sub(r'(?<=[\.\?!]\s)([eEoO])([A-Z][a-z]+)', r'\2', text) # eFlood → Flood, oSeek → Seek
7
  text = re.sub(r'\b[Aa]love\b', 'aloe', text)
8
  text = re.sub(r'\bRelevanci\b', 'Relevance', text)
9
  text = re.sub(r'\bAlove\b', 'Aloe', text)
10
  text = re.sub(r'\b[aA]dvice\b', 'advice', text)
11
  return text.strip()
12
 
13
+
14
  class MedicalFactChecker:
15
+ """Enhanced medical fact checker with faster validation"""
16
+
17
  def __init__(self):
18
+ self.medical_facts = self._load_medical_facts()
19
  self.contraindications = self._load_contraindications()
20
  self.dosage_patterns = self._compile_dosage_patterns()
21
+ self.definitive_patterns = [
22
+ re.compile(r, re.IGNORECASE) for r in [
23
+ r'always\s+(?:use|take|apply)',
24
+ r'never\s+(?:use|take|apply)',
25
+ r'will\s+(?:cure|heal|fix)',
26
+ r'guaranteed\s+to',
27
+ r'completely\s+(?:safe|effective)'
28
+ ]
29
+ ]
30
 
31
+
32
+ def _load_medical_facts(self) -> Dict[str, Any]:
33
+ """Pre-loaded medical facts for Gaza context"""
34
+ return {
35
+ "burn_treatment": {
36
+ "cool_water": "Use clean, cool (not ice-cold) water for 10-20 minutes",
37
+ "no_ice": "Never apply ice directly to burns",
38
+ "clean_cloth": "Cover with clean, dry cloth if available"
39
+ },
40
+ "wound_care": {
41
+ "pressure": "Apply direct pressure to control bleeding",
42
+ "elevation": "Elevate injured limb if possible",
43
+ "clean_hands": "Clean hands before treating wounds when possible"
44
+ },
45
+ "infection_signs": {
46
+ "redness": "Increasing redness around wound",
47
+ "warmth": "Increased warmth at wound site",
48
+ "pus": "Yellow or green discharge",
49
+ "fever": "Fever may indicate systemic infection"
50
+ }
51
+ }
52
+
53
  def _load_contraindications(self) -> Dict[str, List[str]]:
54
+ """Pre-loaded contraindications for common treatments"""
55
  return {
56
  "aspirin": ["children under 16", "bleeding disorders", "stomach ulcers"],
57
  "ibuprofen": ["kidney disease", "heart failure", "stomach bleeding"],
58
  "hydrogen_peroxide": ["deep wounds", "closed wounds", "eyes"],
59
  "tourniquets": ["non-life-threatening bleeding", "without proper training"]
60
  }
61
+
62
  def _compile_dosage_patterns(self) -> List[re.Pattern]:
63
+ """Pre-compiled dosage patterns"""
64
  patterns = [
65
+ r'\d+\s*mg\b', # milligrams
66
+ r'\d+\s*g\b', # grams
67
+ r'\d+\s*ml\b', # milliliters
68
+ r'\d+\s*tablets?\b', # tablets
69
+ r'\d+\s*times?\s+(?:per\s+)?day\b', # frequency
70
+ r'every\s+\d+\s+hours?\b' # intervals
71
  ]
72
+ return [re.compile(pattern, re.IGNORECASE) for pattern in patterns]
73
+
74
  def check_medical_accuracy(self, response: str, context: str) -> Dict[str, Any]:
75
+ """Enhanced medical accuracy check with Gaza-specific considerations"""
76
+ if response is None:
77
+ response = ""
78
  issues = []
79
  warnings = []
80
  accuracy_score = 0.0
81
+
82
+ # Check for contraindications (faster keyword matching)
83
  response_lower = response.lower()
84
+ for medication, contra_list in self.contraindications.items():
85
+ if medication in response_lower:
86
+ for contra in contra_list:
87
+ if any(word in response_lower for word in contra.split()):
88
+ issues.append(f"Potential contraindication: {medication} with {contra}")
 
89
  accuracy_score -= 0.3
90
  break
91
+
92
+ # Context alignment using Jaccard similarity
93
  if context:
94
  resp_words = set(response_lower.split())
95
  ctx_words = set(context.lower().split())
96
  context_similarity = len(resp_words & ctx_words) / len(resp_words | ctx_words) if ctx_words else 0.0
97
+ if context_similarity < 0.5: # Lowered threshold for Gaza context
98
+ warnings.append(f"Low context similarity: {context_similarity:.2f}")
99
  accuracy_score -= 0.1
100
+ else:
101
+ context_similarity = 0.0
102
+
103
+ # Gaza-specific resource checks
104
+ gaza_resources = ["clean water", "sterile", "hospital", "ambulance", "electricity"]
105
+ if any(resource in response_lower for resource in gaza_resources):
106
+ warnings.append("Consider resource limitations in Gaza context")
107
+ accuracy_score -= 0.05
108
+
109
+ # Unsupported claims check
110
  for pattern in self.definitive_patterns:
111
  if pattern.search(response):
112
+ issues.append(f"Unsupported definitive claim detected")
113
  accuracy_score -= 0.4
114
  break
115
+
116
+ # Dosage validation
117
  for pattern in self.dosage_patterns:
118
  if pattern.search(response):
119
+ warnings.append("Dosage detected - verify with professional")
120
  accuracy_score -= 0.1
121
  break
122
+
123
+ confidence_score = max(0.0, min(1.0, 0.8 + accuracy_score))
124
+
125
  return {
126
+ "confidence_score": confidence_score,
127
  "issues": issues,
128
  "warnings": warnings,
129
+ "context_similarity": context_similarity,
130
+ "is_safe": len(issues) == 0 and confidence_score > 0.5
131
  }