Spaces:

knowledgator
/

0-shot-NER

Build error

App Files Community

mvy committed on Jan 14, 2024

Commit

8e19b14

1 Parent(s): 6f3f044

add validations checks

Browse files

Files changed (2) hide show

app.py +1 -1
ner.py +46 -5

app.py CHANGED Viewed

@@ -24,7 +24,7 @@ examples = [
 ner = NER('knowledgator/UTC-DeBERTa-small')
 gradio_app = gr.Interface(
-    ner,
     inputs = [
         'text',
         gr.Textbox(placeholder="Enter sentence here..."),

 ner = NER('knowledgator/UTC-DeBERTa-small')
 gradio_app = gr.Interface(
+    ner.process,
     inputs = [
         'text',
         gr.Textbox(placeholder="Enter sentence here..."),

ner.py CHANGED Viewed

@@ -4,6 +4,8 @@ import string
 from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification
 import spacy
 import torch
 class NER:
     prompt: str = """
@@ -13,8 +15,14 @@ Identify entities in the text having the following classes:
 Text:
  """
-    def __init__(self, model_name: str, sents_batch: int=10):
         self.sents_batch = sents_batch
         self.nlp: spacy.Language = spacy.load(
             'en_core_web_sm',
@@ -23,13 +31,13 @@ Text:
         self.nlp.add_pipe('sentencizer')
         device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
-        tokenizer = AutoTokenizer.from_pretrained(model_name)
         model = AutoModelForTokenClassification.from_pretrained(model_name)
         self.pipeline = pipeline(
             "ner",
             model=model,
-            tokenizer=tokenizer,
             aggregation_strategy='first',
             batch_size=12,
             device=device
@@ -115,14 +123,47 @@ Text:
         return outputs
-    def __call__(
         self, labels: str, text: str, threshold: float=0.
     ) -> dict[str, any]:
-        labels_list = [label.strip() for label in labels.split(',')]
         chunks, chunks_starts = self.chunkanize(text)
         inputs, prompts_lens = self.get_inputs(chunks, labels_list)
         outputs = self.predict(
             text, inputs, labels_list, chunks_starts, prompts_lens, threshold
         )

 from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification
 import spacy
 import torch
+import gradio as gr
 class NER:
     prompt: str = """
 Text:
  """
+    def __init__(
+        self,
+        model_name: str,
+        sents_batch: int=10,
+        tokens_limit: int=2048
+    ):
         self.sents_batch = sents_batch
+        self.tokens_limit = tokens_limit
         self.nlp: spacy.Language = spacy.load(
             'en_core_web_sm',
         self.nlp.add_pipe('sentencizer')
         device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
         model = AutoModelForTokenClassification.from_pretrained(model_name)
         self.pipeline = pipeline(
             "ner",
             model=model,
+            tokenizer=self.tokenizer,
             aggregation_strategy='first',
             batch_size=12,
             device=device
         return outputs
+    def check_text(self, text: str) -> None:
+        if not text:
+            raise gr.Error('No text provided. Please provide text.')
+    def check_labels(self, labels: list[str]) -> None:
+        if not labels:
+            raise gr.Error(
+                'No labels provided. Please provide labels.'
+                ' Multiple labels should be divided by commas.'
+                ' See examples below.'
+            )
+    def check_tokens_limit(self, inputs: list[str]) -> None:
+        tokens = 0
+        for input_ in inputs:
+            tokens += len(self.tokenizer.encode(input_))
+            if tokens > self.tokens_limit:
+                raise gr.Error(
+                    'Too many tokens! Please reduce size of text or amount of labels.'
+                    f' Max tokens count is: {self.tokens_limit}.'
+                )
+    def process(
         self, labels: str, text: str, threshold: float=0.
     ) -> dict[str, any]:
+        labels_list = list({
+            l for label in labels.split(',')
+            if (l:=label.strip())
+        })
+        self.check_labels(labels_list)
+        self.check_text(text)
         chunks, chunks_starts = self.chunkanize(text)
         inputs, prompts_lens = self.get_inputs(chunks, labels_list)
+        self.check_tokens_limit(inputs)
         outputs = self.predict(
             text, inputs, labels_list, chunks_starts, prompts_lens, threshold
         )