KoichiYasuoka committed
Commit fb432f9
1 Parent(s): aa2ea32
preprocess improved

ud.py CHANGED
@@ -2,19 +2,18 @@ from transformers import TokenClassificationPipeline
 
 class UniversalDependenciesPipeline(TokenClassificationPipeline):
   def preprocess(self,sentence,offset_mapping=None):
+    import torch
     from tokenizers.pre_tokenizers import Whitespace
-
-
-
-
+    v=Whitespace().pre_tokenize_str(sentence)
+    t=[v[0]]
+    for k,(s,e) in v[1:]:
+      j=t[-1][0]+"_"+k
+      if self.tokenizer.convert_tokens_to_ids(j)!=self.tokenizer.unk_token_id:
+        t[-1]=(j,(t[-1][1][0],e))
       else:
-
-        if self.tokenizer.convert_tokens_to_ids(j)!=self.tokenizer.unk_token_id:
-          t[-1]=(j,(t[-1][1][0],e))
-        else:
-          t.append((k,(s,e)))
-    r=super().preprocess(sentence=" ".join(i for i,j in t))
     m=[(0,0)]+[j for i,j in t]+[(0,0)]
+    r=super().preprocess(sentence=" ".join(i for i,j in t))
     w=self.tokenizer.convert_ids_to_tokens(r["input_ids"][0])
     if len(m)!=len(w):
       for i,j in enumerate(w):
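The loop added to preprocess above joins adjacent whitespace-separated words with "_" whenever the joined form is already a known token (its id is not unk_token_id), so a multi-word vocabulary entry keeps a single character-offset span. Below is a minimal standalone sketch of that merging step, assuming the tokenizers library is installed; the function name merge_known_compounds, the sample sentence, and the plain Python set standing in for the real convert_tokens_to_ids/unk_token_id check are made up for illustration.

from tokenizers.pre_tokenizers import Whitespace

def merge_known_compounds(sentence,vocab):
  # Whitespace().pre_tokenize_str returns [(word,(start,end)),...] pairs
  v=Whitespace().pre_tokenize_str(sentence)
  t=[v[0]]
  for k,(s,e) in v[1:]:
    j=t[-1][0]+"_"+k             # candidate "_"-joined multi-word token
    if j in vocab:               # real code: convert_tokens_to_ids(j)!=unk_token_id
      t[-1]=(j,(t[-1][1][0],e))  # extend the previous span to cover both words
    else:
      t.append((k,(s,e)))        # otherwise start a new word
  return t

print(merge_known_compounds("kia ora koutou",{"kia_ora"}))
# [('kia_ora', (0, 7)), ('koutou', (8, 14))]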
@@ -22,7 +21,7 @@ class UniversalDependenciesPipeline(TokenClassificationPipeline):
           s,e=m[i]
           m.insert(i+1,(s+len(j)-2,e))
           m[i]=(s,s+len(j)-2)
-    r["offset_mapping"]=m
+    r["offset_mapping"]=torch.tensor([m])
     r["sentence"]=sentence
     return r
   def _forward(self,model_inputs):
@@ -49,7 +48,7 @@ class UniversalDependenciesPipeline(TokenClassificationPipeline):
       k,h=z[numpy.nanargmax(m[z,z])],numpy.nanmin(m)-numpy.nanmax(m)
       m[:,z]+=[[0 if j in z and (i!=j or i==k) else h for i in z] for j in range(m.shape[0])]
       h=self.chu_liu_edmonds(m)
-    v=[(s,e) for s,e in model_outputs["offset_mapping"] if s<e]
+    v=[(s,e) for s,e in model_outputs["offset_mapping"][0].tolist() if s<e]
     q=[self.model.config.id2label[p[j,i]].split("|") for i,j in enumerate(h)]
     g="aggregation_strategy" in kwargs and kwargs["aggregation_strategy"]!="none"
     if g:
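The second and third hunks go together: preprocess now stores offset_mapping as a batch-of-one torch tensor instead of a plain list, and the post-processing side unwraps it again with [0].tolist() before dropping the (0,0) placeholder entries at both ends via the s<e filter. A small round-trip illustration with made-up offset values (not taken from the commit):

import torch

m=[(0,0),(0,7),(8,14),(0,0)]        # placeholders around two word spans, as built in preprocess
offset_mapping=torch.tensor([m])    # shape (1,len(m),2), what r["offset_mapping"] now holds
v=[(s,e) for s,e in offset_mapping[0].tolist() if s<e]  # keep only real character spans
print(v)                            # [(0, 7), (8, 14)]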