"""Streamlit demo comparing two token-classification (NER) models.

The page takes free text, runs it through the MendoBERT model and a base
model, and shows for each one the detected entity spans both as a JSON list
and as highlighted markup rendered by ipymarkup.
"""
import streamlit as st
from transformers import pipeline
from ipymarkup import format_span_box_markup

# Load the pre-trained NER models.
# NOTE(review): Streamlit re-executes this script on every interaction, so
# both pipelines are rebuilt each time. Consider wrapping the loading in the
# caching decorator offered by the installed Streamlit version.
model = pipeline("ner", model="/home/user/app/mendobert/", tokenizer="indolem/indobert-base-uncased")
basemodel = pipeline("ner", model="/home/user/app/base-model/", tokenizer="indolem/indobert-base-uncased")

# Session-state key backing the single input widget, so that the example
# buttons can fill it in from a callback.
INPUT_KEY = "ner_input_text"
DEFAULT_TEXT = 'Enter your texts here...'

# Raw model label prefix -> short tag shown in the UI. Checked in order.
LABEL_PREFIX_TO_TAG = (
    ("LABEL_0", "O"),
    ("LABEL_1", "B"),
    ("LABEL_2", "I"),
)

EXAMPLE_TEXTS = (
    'Aspartylglucosaminuria (AGU) adalah gangguan metabolisme glikoprotein langka.',
    'Mutasi germ - line dari gen BRCA1 membuat wanita cenderung mengalami kanker payudara dini dengan mengorbankan fungsi presumtif gen sebagai penekan tumor.',
)


def _merge_wordpieces(token_results):
    """Merge ``##`` continuation tokens into the word that precedes them.

    Args:
        token_results: iterable of per-token dicts as returned by the NER
            pipeline; the keys ``start``, ``word``, ``entity``, ``index`` and
            ``score`` are read, and ``end`` is used when present.

    Returns:
        A list of per-word dicts with the keys ``start``, ``end``,
        ``entity``, ``index``, ``score`` and ``word``. A merged word keeps the
        entity, index and score of its first piece.
    """
    words = []
    for token in token_results:
        piece = token["word"].replace("##", "")
        # Prefer the offset reported by the pipeline; fall back to deriving
        # it from the piece length when no ``end`` is supplied.
        # NOTE(review): presumably the reported offset is more reliable when
        # the token text differs from the source text -- confirm.
        end = token.get("end")
        if end is None:
            end = token["start"] + len(piece)

        # A continuation with no preceding word cannot be merged; treating it
        # as a new word avoids indexing into an empty list.
        if token["word"].startswith("##") and words:
            words[-1]["end"] = end
            words[-1]["word"] += piece
        else:
            words.append({
                'start': token["start"],
                'end': end,
                'entity': token["entity"],
                'index': token["index"],
                'score': token["score"],
                'word': token["word"],
            })
    return words


def _short_tag(entity):
    """Map a raw ``LABEL_n`` entity name to its O/B/I tag, if it matches."""
    for prefix, tag in LABEL_PREFIX_TO_TAG:
        if entity.startswith(prefix):
            return tag
    return entity


def _analyze(source_text, ner_pipeline):
    """Run one NER pipeline and collect the non-``O`` entities.

    Args:
        source_text: the text to analyse.
        ner_pipeline: callable returning per-token result dicts for a string.

    Returns:
        A ``(descriptions, spans)`` tuple: ``descriptions`` is a list of
        human-readable strings, ``spans`` a list of ``(start, end, tag)``
        tuples suitable for ``format_span_box_markup``.
    """
    descriptions = []
    spans = []
    for word in _merge_wordpieces(ner_pipeline(source_text)):
        word["entity"] = _short_tag(word["entity"])
        if word["entity"].startswith("O"):
            continue
        spans.append((word["start"], word["end"], word["entity"]))
        descriptions.append(f"""Entity: {word["entity"]}, Start:{word["start"]}, End:{word["end"]}, word:{source_text[word["start"]:word["end"]]}""")
    return descriptions, spans


def _use_example(example_text):
    """Button callback: put an example sentence into the input widget."""
    st.session_state[INPUT_KEY] = example_text


st.title(':blue[MendoBERT] - Named Entity Recognition :sunglasses:')

# Seed the widget value through session state rather than a default argument,
# because the example buttons also write to this key.
if INPUT_KEY not in st.session_state:
    st.session_state[INPUT_KEY] = DEFAULT_TEXT
text = st.text_area('Enter some text: ', key=INPUT_KEY)

if text:
    mendo, spanMendo = _analyze(text, model)
    base, spanBase = _analyze(text, basemodel)

    htmlMendo = ''.join(format_span_box_markup(text, spanMendo))
    htmlBase = ''.join(format_span_box_markup(text, spanBase))

    st.subheader('MendoBERT')
    st.json(mendo)
    st.markdown(htmlMendo, unsafe_allow_html=True)

    st.subheader('IndoLEM')
    st.json(base)
    st.markdown(htmlBase, unsafe_allow_html=True)

with st.container():
    # Each button loads its own label into the input box; the callback runs
    # before the script is re-executed, so the new text is analysed above.
    for example_text in EXAMPLE_TEXTS:
        st.button(
            example_text,
            use_container_width=True,
            on_click=_use_example,
            args=(example_text,),
        )