Spaces:

akaafridi
/

TRUTHLENS

Sleeping

App Files Community

akaafridi committed on Aug 16

Commit

9381a8f

verified ·

1 Parent(s): 41038fb

Update src/classifier.py

Browse files

Files changed (1) hide show

src/classifier.py +43 -85

src/classifier.py CHANGED Viewed

@@ -3,26 +3,13 @@ classifier.py
 -------------
 This module defines utilities for classifying the relationship between a
-claim and candidate sentences.  It leverages a cross-encoder model
-pretrained on the Natural Language Inference (NLI) task to assign
-labels indicating whether each candidate sentence supports, contradicts,
-or is neutral with respect to the claim.  When the required
-transformers components cannot be loaded (e.g. due to missing
-dependencies or lack of network access), the module falls back to a
-lightweight heuristic-based classifier.
-The classifier returns one of three string labels for each input pair:
-* ``"support"`` – The sentence entails the claim.
-* ``"contradict"`` – The sentence contradicts the claim.
-* ``"neutral"`` – The sentence neither supports nor contradicts the claim.
-Example:
->>> from classifier import classify
->>> labels = classify("The sky is blue", ["The sky is blue on a clear day.", "Grass is green."])
->>> print(labels)  # ["support", "neutral"]
 """
 from __future__ import annotations
@@ -34,18 +21,15 @@ import numpy as np
 logger = logging.getLogger(__name__)
-_nli_model = None  # type: ignore
-_nli_tokenizer = None  # type: ignore
-_use_transformers = False
 def _load_nli_model(model_name: str = "cross-encoder/nli-roberta-base"):
-    """Lazy-load the NLI cross-encoder model and tokenizer.
-    If loading fails, the fallback heuristic classifier will be used.
-    """
     global _nli_model, _nli_tokenizer, _use_transformers
-    if _nli_model is not None or _use_transformers:
         return
     try:
         from transformers import AutoTokenizer, AutoModelForSequenceClassification  # type: ignore
@@ -56,7 +40,7 @@ def _load_nli_model(model_name: str = "cross-encoder/nli-roberta-base"):
         _use_transformers = True
     except Exception as exc:
         logger.warning(
-            "Failed to load NLI model '%s'. Falling back to heuristic classifier: %s",
             model_name,
             exc,
         )
@@ -72,98 +56,72 @@ def _classify_with_nli(claim: str, sentences: List[str], batch_size: int = 16) -
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     _nli_model.to(device)
-    labels_out: List[str] = []
-    # Map the model's label indices to human-readable labels.
-    # The order for 'cross-encoder/nli-roberta-base' is [contradiction, entailment, neutral].
     id2label = {0: "contradict", 1: "support", 2: "neutral"}
-    # Process in batches to avoid OOM
     for start in range(0, len(sentences), batch_size):
-        batch_sentences = sentences[start:start + batch_size]
-        encoded = _nli_tokenizer(
-            [claim] * len(batch_sentences),
-            batch_sentences,
             return_tensors="pt",
             truncation=True,
             padding=True,
         ).to(device)
         with torch.no_grad():
-            outputs = _nli_model(**encoded)
-            logits = outputs.logits.cpu().numpy()
         preds = logits.argmax(axis=1)
         labels_out.extend([id2label.get(int(p), "neutral") for p in preds])
     return labels_out
 def _heuristic_classify(claim: str, sentences: List[str]) -> List[str]:
-    """Simple heuristic classifier used when transformers are unavailable.
-    The heuristic checks for lexical overlap between the claim and
-    candidate sentences and the presence of negation words.  It aims to
-    approximate entailment/contradiction detection without external
-    dependencies.  The rules are very simple and should not be relied on
-    for production use, but they provide a reasonable fallback.
-    """
     import re
     claim_tokens = set(re.findall(r"\b\w+\b", claim.lower()))
-    negations = {"not", "no", "never", "none", "cannot", "n't"}
-    labels: List[str] = []
-    for sent in sentences:
-        sent_tokens = set(re.findall(r"\b\w+\b", sent.lower()))
-        overlap = claim_tokens & sent_tokens
-        has_neg = any(tok in sent_tokens for tok in negations)
         if overlap and not has_neg:
-            labels.append("support")
         elif overlap and has_neg:
-            labels.append("contradict")
         else:
-            labels.append("neutral")
-    return labels
 def classify(claim: str, sentences: Iterable[str], batch_size: int = 16) -> List[str]:
-    """Classify each sentence in ``sentences`` relative to ``claim``.
-    Parameters
-    ----------
-    claim:
-        The claim or hypothesis to compare against.
-    sentences:
-        An iterable of candidate sentences.
-    batch_size:
-        Batch size used when running inference with the transformer model.
-    Returns
-    -------
-    List[str]
-        A list of labels (``"support"``, ``"contradict"``, or ``"neutral"``)
-        corresponding to each input sentence.  The ordering of the
-        labels matches the ordering of the input sentences.
-    """
-    sentences_list = list(sentences)
-    if not sentences_list:
         return []
-    if _nli_model is None and not _use_transformers:
         _load_nli_model()
     if _use_transformers and _nli_model is not None and _nli_tokenizer is not None:
         try:
-            return _classify_with_nli(claim, sentences_list, batch_size=batch_size)
         except Exception as exc:
             logger.warning(
-                "NLI classification failed. Falling back to heuristic classifier: %s",
                 exc,
             )
-            # Mark transformers as unusable for subsequent calls
-            global _use_transformers
             _use_transformers = False
             _nli_model = None
             _nli_tokenizer = None
-    # Heuristic fallback
-    return _heuristic_classify(claim, sentences_list)

 -------------
 This module defines utilities for classifying the relationship between a
+claim and candidate sentences. It tries to use a transformers NLI
+cross-encoder; if that fails, it falls back to a lightweight heuristic.
+Labels:
+- "support"      (entailment)
+- "contradict"   (contradiction)
+- "neutral"
 """
 from __future__ import annotations
 logger = logging.getLogger(__name__)
+_nli_model = None          # type: ignore
+_nli_tokenizer = None      # type: ignore
+_use_transformers = False  # whether NLI model is successfully loaded
 def _load_nli_model(model_name: str = "cross-encoder/nli-roberta-base"):
+    """Lazy-load the NLI model and tokenizer; set fallback flag on failure."""
     global _nli_model, _nli_tokenizer, _use_transformers
+    if _nli_model is not None and _nli_tokenizer is not None and _use_transformers:
         return
     try:
         from transformers import AutoTokenizer, AutoModelForSequenceClassification  # type: ignore
         _use_transformers = True
     except Exception as exc:
         logger.warning(
+            "Failed to load NLI model '%s'. Falling back to heuristic: %s",
             model_name,
             exc,
         )
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     _nli_model.to(device)
+    # Order for nli-roberta-base: [contradiction, entailment, neutral]
     id2label = {0: "contradict", 1: "support", 2: "neutral"}
+    labels_out: List[str] = []
     for start in range(0, len(sentences), batch_size):
+        batch = sentences[start : start + batch_size]
+        enc = _nli_tokenizer(
+            [claim] * len(batch),
+            batch,
             return_tensors="pt",
             truncation=True,
             padding=True,
         ).to(device)
         with torch.no_grad():
+            logits = _nli_model(**enc).logits.cpu().numpy()
         preds = logits.argmax(axis=1)
         labels_out.extend([id2label.get(int(p), "neutral") for p in preds])
     return labels_out
 def _heuristic_classify(claim: str, sentences: List[str]) -> List[str]:
+    """Very simple heuristic fallback (lexical overlap + negation)."""
     import re
     claim_tokens = set(re.findall(r"\b\w+\b", claim.lower()))
+    neg = {"not", "no", "never", "none", "cannot", "n't"}
+    out: List[str] = []
+    for s in sentences:
+        s_tokens = set(re.findall(r"\b\w+\b", s.lower()))
+        overlap = bool(claim_tokens & s_tokens)
+        has_neg = any(tok in s_tokens for tok in neg)
         if overlap and not has_neg:
+            out.append("support")
         elif overlap and has_neg:
+            out.append("contradict")
         else:
+            out.append("neutral")
+    return out
 def classify(claim: str, sentences: Iterable[str], batch_size: int = 16) -> List[str]:
+    """Return a label for each sentence relative to the claim."""
+    # IMPORTANT: declare globals first since we modify them on failure
+    global _nli_model, _nli_tokenizer, _use_transformers
+    sents = list(sentences)
+    if not sents:
         return []
+    # Try to ensure model is loaded
+    if _nli_model is None or _nli_tokenizer is None:
         _load_nli_model()
     if _use_transformers and _nli_model is not None and _nli_tokenizer is not None:
         try:
+            return _classify_with_nli(claim, sents, batch_size=batch_size)
         except Exception as exc:
             logger.warning(
+                "NLI classification failed; switching to heuristic. Error: %s",
                 exc,
             )
+            # Mark as unusable so subsequent calls go straight to heuristic
             _use_transformers = False
             _nli_model = None
             _nli_tokenizer = None
+    # Fallback
+    return _heuristic_classify(claim, sents)