KoichiYasuoka committed
Commit fb432f9
1 Parent(s): aa2ea32
preprocess improved

ud.py CHANGED
@@ -2,19 +2,18 @@ from transformers import TokenClassificationPipeline
 
 class UniversalDependenciesPipeline(TokenClassificationPipeline):
   def preprocess(self,sentence,offset_mapping=None):
+    import torch
     from tokenizers.pre_tokenizers import Whitespace
-
-
-
-
+    v=Whitespace().pre_tokenize_str(sentence)
+    t=[v[0]]
+    for k,(s,e) in v[1:]:
+      j=t[-1][0]+"_"+k
+      if self.tokenizer.convert_tokens_to_ids(j)!=self.tokenizer.unk_token_id:
+        t[-1]=(j,(t[-1][1][0],e))
       else:
-
-        if self.tokenizer.convert_tokens_to_ids(j)!=self.tokenizer.unk_token_id:
-          t[-1]=(j,(t[-1][1][0],e))
-        else:
-          t.append((k,(s,e)))
-    r=super().preprocess(sentence=" ".join(i for i,j in t))
     m=[(0,0)]+[j for i,j in t]+[(0,0)]
+    r=super().preprocess(sentence=" ".join(i for i,j in t))
     w=self.tokenizer.convert_ids_to_tokens(r["input_ids"][0])
     if len(m)!=len(w):
       for i,j in enumerate(w):
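The loop added to preprocess above joins adjacent whitespace-separated words with "_" whenever the joined form is already a known token (its id is not unk_token_id), so a multi-word vocabulary entry keeps a single character-offset span. Below is a minimal standalone sketch of that merging step, assuming the tokenizers library is installed; the function name merge_known_compounds, the sample sentence, and the plain Python set standing in for the real convert_tokens_to_ids/unk_token_id check are made up for illustration.

from tokenizers.pre_tokenizers import Whitespace

def merge_known_compounds(sentence,vocab):
  # Whitespace().pre_tokenize_str returns [(word,(start,end)),...] pairs
  v=Whitespace().pre_tokenize_str(sentence)
  t=[v[0]]
  for k,(s,e) in v[1:]:
    j=t[-1][0]+"_"+k             # candidate "_"-joined multi-word token
    if j in vocab:               # real code: convert_tokens_to_ids(j)!=unk_token_id
      t[-1]=(j,(t[-1][1][0],e))  # extend the previous span to cover both words
    else:
      t.append((k,(s,e)))        # otherwise start a new word
  return t

print(merge_known_compounds("kia ora koutou",{"kia_ora"}))
# [('kia_ora', (0, 7)), ('koutou', (8, 14))]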
@@ -22,7 +21,7 @@ class UniversalDependenciesPipeline(TokenClassificationPipeline):
           s,e=m[i]
           m.insert(i+1,(s+len(j)-2,e))
           m[i]=(s,s+len(j)-2)
-    r["offset_mapping"]=m
+    r["offset_mapping"]=torch.tensor([m])
     r["sentence"]=sentence
     return r
   def _forward(self,model_inputs):
@@ -49,7 +48,7 @@ class UniversalDependenciesPipeline(TokenClassificationPipeline):
       k,h=z[numpy.nanargmax(m[z,z])],numpy.nanmin(m)-numpy.nanmax(m)
       m[:,z]+=[[0 if j in z and (i!=j or i==k) else h for i in z] for j in range(m.shape[0])]
       h=self.chu_liu_edmonds(m)
-    v=[(s,e) for s,e in model_outputs["offset_mapping"] if s<e]
+    v=[(s,e) for s,e in model_outputs["offset_mapping"][0].tolist() if s<e]
     q=[self.model.config.id2label[p[j,i]].split("|") for i,j in enumerate(h)]
     g="aggregation_strategy" in kwargs and kwargs["aggregation_strategy"]!="none"
     if g:
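The second and third hunks go together: preprocess now stores offset_mapping as a batch-of-one torch tensor instead of a plain list, and the post-processing side unwraps it again with [0].tolist() before dropping the (0,0) placeholder entries at both ends via the s<e filter. A small round-trip illustration with made-up offset values (not taken from the commit):

import torch

m=[(0,0),(0,7),(8,14),(0,0)]        # placeholders around two word spans, as built in preprocess
offset_mapping=torch.tensor([m])    # shape (1,len(m),2), what r["offset_mapping"] now holds
v=[(s,e) for s,e in offset_mapping[0].tolist() if s<e]  # keep only real character spans
print(v)                            # [(0, 7), (8, 14)]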