Initial Prototype
Files changed:
- README.md +3 -3
- app.py +1 -1
- get_documents.ipynb +0 -92
- model.py +18 -50
- requirements.txt +0 -1
- sample.json +0 -0
- utils.py +3 -132
README.md CHANGED
@@ -1,8 +1,8 @@
 ---
-title:
-emoji:
+title: NusaBERT
+emoji: 🇮🇩
 colorFrom: red
-colorTo:
+colorTo: white
 sdk: gradio
 sdk_version: 4.18.0
 app_file: app.py
app.py CHANGED
@@ -11,6 +11,6 @@ if __name__ == "__main__":
 
     # interface = gr.TabbedInterface(interfaces, titles, theme="soft")
     with gr.Blocks(theme="soft") as demo:
-        gr.Markdown("#
+        gr.Markdown("# NusaBERT Demo")
         gr.TabbedInterface(interfaces, titles, theme="soft")
     demo.launch()
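Only the tail of app.py appears in this diff. For orientation, here is a minimal sketch of how the file plausibly builds `interfaces` and `titles` from the `models` registry defined in model.py; the loop and keyword names below are assumptions inferred from the interface signatures in utils.py, not lines from the commit.

```python
# app.py (sketch, not part of the commit): wire the models registry into the tabbed demo.
import gradio as gr

from model import models

# Each registry entry names an interface factory plus the arguments it needs (assumed wiring).
interfaces = [
    cfg["interface"](
        pipe=cfg["pipe"],
        examples=cfg["examples"],
        output_label=cfg.get("output_label", ""),
        title=cfg["title"],
        desc=cfg["desc"],
    )
    for cfg in models.values()
]
titles = list(models.keys())

if __name__ == "__main__":
    # interface = gr.TabbedInterface(interfaces, titles, theme="soft")
    with gr.Blocks(theme="soft") as demo:
        gr.Markdown("# NusaBERT Demo")
        gr.TabbedInterface(interfaces, titles, theme="soft")
    demo.launch()
```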
get_documents.ipynb DELETED
@@ -1,92 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/opt/miniconda3/envs/hf/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
-      " from .autonotebook import tqdm as notebook_tqdm\n"
-     ]
-    }
-   ],
-   "source": [
-    "from datasets import load_dataset\n",
-    "dataset = load_dataset('LazarusNLP/wikipedia_id_20230520')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import pandas as pd\n",
-    "train = pd.DataFrame(dataset['train'])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "first_element = train.groupby('id').first()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "sample = first_element.sample(n=10000).reset_index()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 9,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "sample_dict = sample.to_dict(orient=\"list\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 10,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import json\n",
-    "with open('sample.json', 'w') as f:\n",
-    "    json.dump(sample_dict, f)"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "hf",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.11.7"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
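Collapsed into a single script, the deleted notebook's cells amount to the following; this is the same code joined together, and it is what originally produced the sample.json corpus that this commit also removes.

```python
# Reconstruction of get_documents.ipynb as a plain script (cells joined verbatim).
import json

import pandas as pd
from datasets import load_dataset

# Indonesian Wikipedia dump used as the document-search corpus.
dataset = load_dataset("LazarusNLP/wikipedia_id_20230520")
train = pd.DataFrame(dataset["train"])

# Keep one row per article id, then draw a random 10k-article sample.
first_element = train.groupby("id").first()
sample = first_element.sample(n=10000).reset_index()

# Column-oriented dict ({"id": [...], "url": [...], "title": [...], "text": [...]}) dumped to JSON.
sample_dict = sample.to_dict(orient="list")
with open("sample.json", "w") as f:
    json.dump(sample_dict, f)
```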
model.py CHANGED
@@ -1,15 +1,12 @@
-from utils import (
-    text_analysis_interface,
-    token_classification_interface,
-    search_interface,
-    text_interface,
-    SentenceSimilarity,
-)
+from utils import text_analysis_interface, token_classification_interface, text_interface
 from transformers import pipeline
+import os
+
+auth_token = os.environ.get("TOKEN_FROM_SECRET") or True
 
 models = {
     "Text Analysis": {
-        "title": "
+        "title": "Text Analysis",
         "examples": [
             "Allianz adalah persuhaan asuransi yang di dirikan pada tanggal February 5, 1890 di Berlin, Jerman.",
             "Restaurant ini sangat tidak enak. Enakan Pizza Hut.",
@@ -21,32 +18,15 @@ models = {
             "POS Tagging",
             "NER Tagging",
         ],
-        "desc": "A tool to showcase the full capabilities of text analysis
+        "desc": "A tool to showcase the full capabilities of text analysis NusaBERT fine-tuning has to offer.",
         "interface": text_analysis_interface,
         "pipe": [
-            pipeline(
-
-
-
-            ),
-            pipeline(
-                "text-classification",
-                model="StevenLimcorn/indonesian-roberta-base-emotion-classifier",
-                tokenizer="StevenLimcorn/indonesian-roberta-base-emotion-classifier",
-            ),
-            pipeline(model="w11wo/indonesian-roberta-base-posp-tagger"),
-            pipeline(model="w11wo/indonesian-roberta-base-nerp-tagger"),
+            pipeline(model="LazarusNLP/NusaBERT-base-EmoT", auth_token=auth_token),
+            pipeline(model="LazarusNLP/NusaBERT-base-EmoT", auth_token=auth_token),
+            pipeline(model="LazarusNLP/NusaBERT-base-POSP", auth_token=auth_token),
+            pipeline(model="LazarusNLP/NusaBERT-base-NERP", auth_token=auth_token),
         ],
     },
-    "Document Search": {
-        "title": "# Document Search 🔍",
-        "examples": ["Stadion bola Indonesia.", "Rusia dan Serbia", "Politik."],
-        "output_label": "Top 5 related documents",
-        "desc": "A semantic search tool to get the most related documents 📖 based on user's query.",
-        "interface": search_interface,
-        "pipe": SentenceSimilarity("LazarusNLP/all-indobert-base-v2", "sample.json"),
-        "top_k": 5,
-    },
     "Sentiment Analysis": {
         "title": "Sentiment Analysis",
         "examples": [
@@ -55,13 +35,9 @@ models = {
             "keren sekali aplikasi ini bisa menunjukan data diri secara detail, sangat di rekomendasikan untuk di pakai.",
         ],
         "output_label": "Sentiment Analysis",
-        "desc": "A sentiment-text-classification model based on the
+        "desc": "A sentiment-text-classification model based on the BERT model. The model was originally the pre-trained NusaBERT Base model, which is then fine-tuned on indonlu's SmSA dataset consisting of Indonesian comments and reviews.",
         "interface": text_interface,
-        "pipe": pipeline(
-            "text-classification",
-            model="w11wo/indonesian-roberta-base-sentiment-classifier",
-            tokenizer="w11wo/indonesian-roberta-base-sentiment-classifier",
-        ),
+        "pipe": pipeline(model="LazarusNLP/NusaBERT-base-EmoT", auth_token=auth_token),
     },
     "Emotion Detection": {
         "title": "Emotion Classifier",
@@ -71,18 +47,10 @@ models = {
             "Bahaha.. dia ke kasir after me. Sambil ngangkat keresek belanjaanku, masih sempet liat mas nya nyodorin barang belanjaannya",
         ],
         "output_label": "Emotion Classifier",
-        "desc": "An emotion classifier based on the
+        "desc": "An emotion classifier based on the BERT model. The model was originally the pre-trained NusaBERT Base model, which is then fine-tuned on indonlu's EmoT dataset",
         "interface": text_interface,
-        "pipe": pipeline(
-            "text-classification",
-            model="StevenLimcorn/indonesian-roberta-base-emotion-classifier",
-            tokenizer="StevenLimcorn/indonesian-roberta-base-emotion-classifier",
-        ),
+        "pipe": pipeline(model="LazarusNLP/NusaBERT-base-EmoT", auth_token=auth_token),
     },
-    # "summarization": {
-    #     "examples": [],
-    #     "desc": "This model is a fine-tuned version of LazarusNLP/IndoNanoT5-base on the indonlg dataset.",
-    # },
     "POS Tagging": {
         "title": "POS Tagging",
         "examples": [
@@ -91,9 +59,9 @@ models = {
             "Bahaha.. dia ke kasir after me. Sambil ngangkat keresek belanjaanku, masih sempet liat mas nya nyodorin barang belanjaannya",
         ],
         "output_label": "POS Tagging",
-        "desc": "A part-of-speech token-classification model based on the
+        "desc": "A part-of-speech token-classification model based on the BERT model. The model was originally the pre-trained NusaBERT Base model, which is then fine-tuned on indonlu's POSP dataset consisting of tag-labelled news.",
         "interface": token_classification_interface,
-        "pipe": pipeline(model="
+        "pipe": pipeline(model="LazarusNLP/NusaBERT-base-POSP", auth_token=auth_token),
     },
     "NER Tagging": {
         "title": "NER Tagging",
@@ -103,8 +71,8 @@ models = {
             "Jakarta, Indonesia akan menjadi bagian salah satu tempat yang akan didatangi.",
         ],
         "output_label": "NER Tagging",
-        "desc": "A NER Tagging token-classification model based on the
+        "desc": "A NER Tagging token-classification model based on the BERT model. The model was originally the pre-trained NusaBERT Base model, which is then fine-tuned on indonlu's NERP dataset consisting of tag-labelled news.",
         "interface": token_classification_interface,
-        "pipe": pipeline(model="
+        "pipe": pipeline(model="LazarusNLP/NusaBERT-base-NERP", auth_token=auth_token),
     },
 }
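As a quick usage sketch that is not part of the commit, one of the new fine-tuned checkpoints can be exercised on its own; the example sentence is taken from the registry above, and a Hub token (for example the TOKEN_FROM_SECRET value read in model.py) would only be needed if the LazarusNLP/NusaBERT-base-POSP repository is gated or private.

```python
from transformers import pipeline

# Token-classification pipeline over the NusaBERT POS tagger that backs the "POS Tagging" tab.
pos_tagger = pipeline(
    "token-classification",
    model="LazarusNLP/NusaBERT-base-POSP",
    aggregation_strategy="simple",  # merge sub-word pieces into whole-word tags
)

sentence = "Jakarta, Indonesia akan menjadi bagian salah satu tempat yang akan didatangi."
for pred in pos_tagger(sentence):
    print(pred["word"], pred["entity_group"], round(pred["score"], 3))
```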
requirements.txt CHANGED
@@ -1,4 +1,3 @@
 gradio==4.19.1
 scipy==1.12.0
-sentence_transformers==2.3.1
 transformers==4.37.2
sample.json DELETED
The diff for this file is too large to render. See the raw diff.
utils.py CHANGED
@@ -1,56 +1,11 @@
 import gradio as gr
 from functools import partial
 from transformers import pipeline, pipelines
-from sentence_transformers import SentenceTransformer, util
-import json
 
 
 ######################
 ##### INFERENCE ######
 ######################
-class SentenceSimilarity:
-
-    def __init__(self, model: str, corpus_path: str):
-        f = open(corpus_path)
-        data = json.load(f)
-        self.id, self.url, self.title, self.text = (
-            data["id"],
-            data["url"],
-            data["title"],
-            data["text"],
-        )
-        self.model = SentenceTransformer(model)
-        self.corpus_embeddings = self.model.encode(self.text)
-
-    def __call__(self, query: str, corpus: list[str], top_k: int = 5):
-        query_embedding = self.model.encode(query)
-        output = util.semantic_search(
-            query_embedding, self.corpus_embeddings, top_k=top_k
-        )
-        return output[0]
-
-
-# Sentence Similarity
-def sentence_similarity(
-    query: str,
-    texts: list[str],
-    titles: list[str],
-    urls: list[str],
-    pipe: SentenceSimilarity,
-    top_k: int,
-) -> list[str]:
-    answer = pipe(query=query, corpus=texts, top_k=top_k)
-    output = [
-        f"""
-        Cosine Similarity Score: {round(ans['score'], 3)}
-        ## [{titles[ans['corpus_id']]} 🔗]({urls[ans['corpus_id']]})
-        {texts[ans['corpus_id']]}
-        """
-        for ans in answer
-    ]
-    return output
-
-
 # Text Analysis
 def cls_inference(input: list[str], pipe: pipeline) -> dict:
     results = pipe(input, top_k=None)
@@ -77,77 +32,7 @@ def text_analysis(text, pipes: list[pipeline]):
 ######################
 ##### INTERFACE ######
 ######################
-def text_interface(
-    pipe: pipeline, examples: list[str], output_label: str, title: str, desc: str
-):
-    return gr.Interface(
-        fn=partial(cls_inference, pipe=pipe),
-        inputs=[
-            gr.Textbox(lines=5, label="Input Text"),
-        ],
-        title=title,
-        description=desc,
-        outputs=[gr.Label(label=output_label)],
-        examples=examples,
-        allow_flagging="never",
-    )
-
-
-def search_interface(
-    pipe: SentenceSimilarity,
-    examples: list[str],
-    output_label: str,
-    title: str,
-    desc: str,
-    top_k: int,
-):
-    with gr.Blocks() as sentence_similarity_interface:
-        gr.Markdown(title)
-        gr.Markdown(desc)
-        with gr.Row():
-            # input on the left
-            with gr.Column():
-                input_text = gr.Textbox(lines=5, label="Query")
-                # display documents
-                df = gr.DataFrame(
-                    [
-                        [id, f"<a href='{url}' target='_blank'>{title} 🔗</a>"]
-                        for id, title, url in zip(pipe.id, pipe.title, pipe.url)
-                    ],
-                    headers=["ID", "Title"],
-                    wrap=True,
-                    datatype=["markdown", "html"],
-                    interactive=False,
-                    height=300,
-                )
-                button = gr.Button("Search...")
-            with gr.Column():
-                # outputs top_k results in accordion format
-                outputs = []
-                for i in range(top_k):
-                    # open the first accordion
-                    with gr.Accordion(label=f"Document {i + 1}", open=i == 0) as a:
-                        output = gr.Markdown()
-                        outputs.append(output)
-        gr.Examples(examples, inputs=[input_text], outputs=outputs)
-        button.click(
-            fn=partial(
-                sentence_similarity,
-                pipe=pipe,
-                texts=pipe.text,
-                titles=pipe.title,
-                urls=pipe.url,
-                top_k=top_k,
-            ),
-            inputs=[input_text],
-            outputs=outputs,
-        )
-    return sentence_similarity_interface
-
-
-def token_classification_interface(
-    pipe: pipeline, examples: list[str], output_label: str, title: str, desc: str
-):
+def token_classification_interface(pipe: pipeline, examples: list[str], output_label: str, title: str, desc: str):
     return gr.Interface(
         fn=partial(tagging, pipe=pipe),
         inputs=[
@@ -161,9 +46,7 @@ def token_classification_interface(
     )
 
 
-def text_analysis_interface(
-    pipe: list, examples: list[str], output_label: str, title: str, desc: str
-):
+def text_analysis_interface(pipe: list, examples: list[str], output_label: str, title: str, desc: str):
     with gr.Blocks() as text_analysis_interface:
         gr.Markdown(title)
         gr.Markdown(desc)
@@ -172,9 +55,7 @@ def text_analysis_interface(
         outputs = [
             (
                 gr.HighlightedText(label=label)
-                if isinstance(
-                    p, pipelines.token_classification.TokenClassificationPipeline
-                )
+                if isinstance(p, pipelines.token_classification.TokenClassificationPipeline)
                 else gr.Label(label=label)
             )
             for label, p in zip(output_label, pipe)
@@ -191,13 +72,3 @@ def text_analysis_interface(
             outputs=outputs,
         )
     return text_analysis_interface
-
-
-# Summary
-# summary_interface = gr.Interface.from_pipeline(
-#     pipes["summarization"],
-#     title="Summarization",
-#     examples=details["summarization"]["examples"],
-#     description=details["summarization"]["description"],
-#     allow_flagging="never",
-# )
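The inference helpers that survive this commit are only partially visible in the hunks above (their bodies are cut off after the first line or two). Below is a minimal sketch of the gr.Label-style classification helper under that assumption; the dict-building step and the standalone usage are illustrative, not lines from the commit.

```python
from transformers import pipeline


def cls_inference(text: str, pipe) -> dict:
    """Map a classification pipeline's output to the {label: score} dict gr.Label expects."""
    # top_k=None makes the pipeline return a score for every label, not just the best one.
    results = pipe(text, top_k=None)
    return {r["label"]: r["score"] for r in results}


# Example run against the emotion checkpoint registered in model.py
# (assumes the repo is public; pass token=... to pipeline() if it is gated).
emot = pipeline("text-classification", model="LazarusNLP/NusaBERT-base-EmoT")
print(cls_inference("Bahaha.. dia ke kasir after me. Sambil ngangkat keresek belanjaanku, masih sempet liat mas nya nyodorin barang belanjaannya", emot))
```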