File size: 4,567 Bytes
adf4ac7
 
062527a
adf4ac7
58ef0b0
df89742
 
58ef0b0
50e8f91
599660b
a445fea
faf86dc
 
 
 
 
 
 
 
 
 
 
a445fea
faf86dc
 
 
 
 
 
a445fea
3cf2a36
 
 
58ef0b0
 
3cf2a36
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83f7a4f
 
3cf2a36
 
83f7a4f
15aafe1
3cf2a36
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83f7a4f
3cf2a36
 
83f7a4f
afff5b8
adf4ac7
96898b2
 
 
 
 
 
dcb64e4
44854e8
08b8ceb
dcb64e4
 
 
b7f70e6
8dd94d9
f185ae2
c22d8af
a445fea
b7f70e6
8dd94d9
dcb64e4
3cf2a36
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import streamlit as st
from transformers import pipeline
from ipymarkup import format_span_box_markup

# Load the pre-trained NER models.
def _load_ner_pipelines():
    """Build the two token-classification pipelines used by the app.

    Returns:
        A ``(mendobert, base)`` tuple of ``transformers`` NER pipelines, both
        using the ``indolem/indobert-base-uncased`` tokenizer.
    """
    tokenizer_name = "indolem/indobert-base-uncased"
    mendobert = pipeline("ner", model="/home/user/app/mendobert/", tokenizer=tokenizer_name)
    base = pipeline("ner", model="/home/user/app/base-model/", tokenizer=tokenizer_name)
    return mendobert, base


# Streamlit re-executes this script on every widget interaction; caching the
# loader keeps the models in memory instead of rebuilding them on each rerun.
# Older Streamlit releases have no ``cache_resource``, so fall back to the
# uncached loader there.
if hasattr(st, "cache_resource"):
    _load_ner_pipelines = st.cache_resource(_load_ner_pipelines)

model, basemodel = _load_ner_pipelines()


# Seed the 'options' entry (the key the text area is bound to) on the first
# run of a session so later reads and callback writes always find it.
if 'options' not in st.session_state:
    st.session_state.options = ""


# Page heading.
st.title(':blue[MendoBERT] - Named Entity Recognition :sunglasses:')

def button1_callback():
    """Store the first example sentence under the 'options' session key."""
    example_sentence = "Aspartylglucosaminuria (AGU) adalah gangguan metabolisme glikoprotein langka."
    st.session_state['options'] = example_sentence
def button2_callback():
    """Store the second example sentence under the 'options' session key."""
    example_sentence = "Mutasi germ - line dari gen BRCA1 membuat wanita cenderung mengalami kanker payudara dini dengan mengorbankan fungsi presumtif gen sebagai penekan tumor."
    st.session_state['options'] = example_sentence

# Reserve a slot at this position in the page; the text area is placed into it
# further down, after the example buttons have been declared.
placeholder = st.empty()

st.caption('_Examples_')

# One full-width button per example sentence; each click runs the callback
# that writes the sentence into the 'options' session key.
for example_label, example_callback in (
    (
        'Aspartylglucosaminuria (AGU) adalah gangguan metabolisme glikoprotein langka.',
        button1_callback,
    ),
    (
        'Mutasi germ - line dari gen BRCA1 membuat wanita cenderung mengalami kanker payudara dini dengan mengorbankan fungsi presumtif gen sebagai penekan tumor.',
        button2_callback,
    ),
):
    st.button(example_label, use_container_width=True, on_click=example_callback)


# Fill the reserved slot with the input box, bound to the 'options' key.
with placeholder:
    text = st.text_area('Enter some text: ', key='options')

# Raw model labels and the short tags shown in the UI. Matching is by prefix.
_LABEL_TAGS = (("LABEL_0", "O"), ("LABEL_1", "B"), ("LABEL_2", "I"))


def _merge_wordpieces(token_results):
    """Merge ``##`` continuation pieces into the token that precedes them.

    Args:
        token_results: Iterable of per-token dicts from the NER pipeline. The
            keys ``word``, ``start``, ``entity``, ``index`` and ``score`` are
            read; ``end`` is used when present.

    Returns:
        A list of new dicts, one per merged word. Each keeps the entity, index
        and score of its first piece, with ``end`` and ``word`` extended by
        every continuation piece.
    """
    merged = []
    for token in token_results:
        piece = token["word"].replace("##", "")
        # Prefer the pipeline's own end offset: the token text can differ in
        # length from the source span it covers. Fall back to the length-based
        # estimate when no offset is provided.
        end = token.get("end")
        if end is None:
            end = token["start"] + len(piece)

        # `and merged` guards against a leading continuation piece, which
        # would otherwise index into an empty list.
        if token["word"].startswith("##") and merged:
            merged[-1]["end"] = end
            merged[-1]["word"] += piece
        else:
            merged.append({
                'start': token["start"],
                'end': end,
                'entity': token["entity"],
                'index': token["index"],
                'score': token["score"],
                'word': token["word"],
            })
    return merged


def _tag_for(label):
    """Map a raw ``LABEL_n`` name to ``O``/``B``/``I``; return other labels unchanged."""
    for prefix, tag in _LABEL_TAGS:
        if label.startswith(prefix):
            return tag
    return label


def _extract_entities(token_results, source_text):
    """Collect highlight spans and description strings for non-``O`` words.

    Args:
        token_results: Per-token dicts from the NER pipeline.
        source_text: The text that was passed to the pipeline; sliced with the
            token offsets to show each entity's surface form.

    Returns:
        A ``(spans, descriptions)`` tuple. ``spans`` is a list of
        ``(start, end, tag)`` tuples and ``descriptions`` a parallel list of
        human-readable strings.
    """
    spans = []
    descriptions = []
    for word in _merge_wordpieces(token_results):
        tag = _tag_for(word["entity"])
        if tag.startswith("O"):
            continue
        start, end = word["start"], word["end"]
        spans.append((start, end, tag))
        descriptions.append(
            f"Entity: {tag}, Start:{start}, End:{end}, "
            f"word:{source_text[start:end]}, score:{word['score']}"
        )
    return spans, descriptions


def _render_model_output(heading, token_results, source_text):
    """Render one model's entities as a JSON list plus highlighted text."""
    spans, descriptions = _extract_entities(token_results, source_text)
    st.subheader(heading)
    st.json(descriptions)
    # NOTE(review): the markup is injected as raw HTML; presumably ipymarkup
    # escapes the user-supplied text — confirm.
    st.markdown(''.join(format_span_box_markup(source_text, spans)), unsafe_allow_html=True)


if text:
    # Run both models before rendering anything.
    ner_results = model(text)
    ner_results2 = basemodel(text)

    _render_model_output('MendoBERT', ner_results, text)
    _render_model_output('IndoLEM', ner_results2, text)

    st.write("\n")
    st.info("'B' means Beginning of an entity, 'I' means Inside of an entity", icon="ℹ️")


st.write("\n\n")