File size: 3,118 Bytes
adf4ac7
 
cb8149d
adf4ac7
58ef0b0
df89742
 
58ef0b0
3cf2a36
58ef0b0
3cf2a36
 
 
58ef0b0
 
3cf2a36
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83f7a4f
 
3cf2a36
 
83f7a4f
3cf2a36
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83f7a4f
3cf2a36
 
83f7a4f
3cf2a36
adf4ac7
83f7a4f
 
3cf2a36
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import streamlit as st
from transformers import pipeline
from ipymarkup import show_span_box_markup

# Build the two "ner" pipelines used by the app. Each one loads its weights
# from a local directory; both are given the same tokenizer name.
model = pipeline(
    task="ner",
    model="/home/user/app/mendobert/",
    tokenizer="indolem/indobert-base-uncased",
)
basemodel = pipeline(
    task="ner",
    model="/home/user/app/base-model/",
    tokenizer="indolem/indobert-base-uncased",
)

# (label prefix, short tag) pairs used to rename the pipeline's generic
# "LABEL_n" entity names. Matching is by prefix, checked in this order.
_LABEL_TAGS = (("LABEL_0", "O"), ("LABEL_1", "B"), ("LABEL_2", "I"))


def _short_label(entity):
    """Return the short tag for a "LABEL_n" entity name, or *entity* unchanged."""
    for prefix, tag in _LABEL_TAGS:
        if entity.startswith(prefix):
            return tag
    return entity


def _merge_wordpieces(tokens):
    """Merge "##" continuation pieces into the word that precedes them.

    Args:
        tokens: Token dicts produced by the "ner" pipeline. Each must carry
            "word", "start", "entity", "index" and "score"; "end" is used
            when present.

    Returns:
        A list of new dicts, one per merged word. "end" is the end offset of
        the word's last piece; "entity", "index" and "score" are taken from
        the word's first piece.
    """
    words = []
    for token in tokens:
        piece = token["word"]
        # Prefer the end offset reported by the pipeline; fall back to
        # deriving it from the piece length when it is not available.
        end = token.get("end")
        if end is None:
            end = token["start"] + len(piece.replace("##", ""))

        # The `words` check guards against a continuation piece arriving
        # first, which would otherwise index into an empty list.
        if piece.startswith("##") and words:
            words[-1]["end"] = end
            words[-1]["word"] += piece.replace("##", "")
        else:
            words.append({
                "start": token["start"],
                "end": end,
                "entity": token["entity"],
                "index": token["index"],
                "score": token["score"],
                "word": piece,
            })
    return words


def _extract_entities(tokens, source_text):
    """Turn raw pipeline tokens into markup spans and summary strings.

    Words whose (renamed) tag starts with "O" are dropped.

    Args:
        tokens: Token dicts produced by the "ner" pipeline.
        source_text: The text the tokens were computed from; used to slice
            out each entity's surface form.

    Returns:
        A ``(spans, summaries)`` pair: ``spans`` is a list of
        ``(start, end, tag)`` tuples and ``summaries`` a list of
        human-readable one-line descriptions, in the same order.
    """
    spans = []
    summaries = []
    for word in _merge_wordpieces(tokens):
        tag = _short_label(word["entity"])
        if tag.startswith("O"):
            continue
        start, end = word["start"], word["end"]
        spans.append((start, end, tag))
        summaries.append(
            f"Entity: {tag}, Start:{start}, End:{end}, word:{source_text[start:end]}"
        )
    return spans, summaries


def _render_spans(source_text, spans):
    """Render *spans* over *source_text* as box markup in the Streamlit page.

    ``show_span_box_markup`` displays through IPython and returns nothing, so
    passing its result to ``st.text`` would print "None". The HTML is built
    with ``format_span_box_markup`` and handed to Streamlit instead.
    """
    # Local import: ipymarkup is already a dependency of this script, and
    # this keeps the new name next to its only use.
    from ipymarkup import format_span_box_markup

    html = "".join(format_span_box_markup(source_text, spans))
    # NOTE(review): unsafe_allow_html renders the markup as-is; confirm that
    # ipymarkup escapes the user-supplied text before relying on this.
    st.markdown(html, unsafe_allow_html=True)


text = st.text_area('enter some text: ')

if text:
    ner_results = model(text)
    ner_results2 = basemodel(text)

    # MendoBERT
    spanMendo, mendo = _extract_entities(ner_results, text)

    # Base Model
    spanBase, base = _extract_entities(ner_results2, text)

    # `mendo` and `base` hold textual summaries of the entities; they are
    # kept available but are not displayed.
    _render_spans(text, spanMendo)
    _render_spans(text, spanBase)