# Hugging Face Space: kiansheik/tupi-verb-annotation (status: Sleeping; file size: 2,390 bytes)

import gradio as gr
import re
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch

# Report at start-up whether PyTorch can see a CUDA device. This is purely
# informational: nothing in the visible code moves the model or its inputs
# to a GPU.
print(f"Is CUDA available: {torch.cuda.is_available()}")

# Load the model and tokenizer once at import time; both are module-level
# globals that anotate() reads on every call.
model_name = "kiansheik/tupi-verb-anotation"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Special characters of the Navarro alphabet (plus the apostrophe).  Each one
# is encoded as the placeholder "[w<index>q]", where <index> is its position
# in this list; anotate() applies the reverse mapping to the model output.
special_chars = "û î ŷ á é í ý ó ú ã ẽ ĩ ỹ õ ũ '".split(" ")


def normalize_tupi(x):
    """Encode a Tupi phrase into the placeholder form used as model input.

    Args:
        x: The phrase to encode.

    Returns:
        The phrase with every character listed in ``special_chars`` replaced
        by ``[w<index>q]``, runs of whitespace collapsed, leading/trailing
        whitespace removed, and each remaining space written as ``[SPACE]``.
    """
    for i, char in enumerate(special_chars):
        x = x.replace(char, f"[w{i}q]")
    # Raw string: "\s" inside a plain literal is an invalid escape sequence
    # and triggers a warning at compile time on current Python versions.
    return re.sub(r'\s+', ' ', x).strip().replace(' ', '[SPACE]')
def replace_outside_brackets(match):
    """Return the matched text unchanged (``re.sub`` replacement callback).

    The previous version checked whether the match was a bracketed token
    (``[...]``) but returned the same value from both branches, so the check
    had no effect.  The dead conditional is removed; the result is identical.

    NOTE(review): presumably text outside brackets was meant to be transformed
    here at some point -- confirm whether a real replacement is still needed.

    Args:
        match: A regex match object (anything exposing ``group()``).

    Returns:
        The full text of the match.
    """
    return match.group()
def navarroize_tupi(x):
    """Pass every segment of *x* through ``replace_outside_brackets``.

    The string is split by the regex into bracketed tokens (``[...]``) and
    runs of bracket-free text; each segment is substituted with the value the
    callback returns for it.
    """
    # First alternative: a bracketed token, matched non-greedily.
    # Second alternative: a run of characters containing no square bracket.
    segment_re = re.compile(r'\[.*?\]|[^[\]]+')
    return segment_re.sub(replace_outside_brackets, x)

def anotate(st, debug=False):
    """Annotate a Tupi phrase with the loaded T5 model.

    Args:
        st: The raw input phrase.
        debug: When true, print the encoded phrase that is handed to the
            tokenizer.

    Returns:
        The decoded model output with tokenizer artefacts removed,
        ``[SPACE]`` and ``[w<index>q]`` placeholders mapped back to their
        characters, and five characters trimmed from each end.
    """
    # Remove punctuation *before* whitespace is collapsed and encoded.
    # Stripping it afterwards turned "a , b" into "a[SPACE][SPACE]b" and left
    # a dangling "[SPACE]" for input such as "a !".
    without_punct = st.lower().translate(str.maketrans('', '', ',.?!-'))
    sentence = normalize_tupi(without_punct)
    inp_sent = navarroize_tupi(sentence).replace(' ', '[SPACE]')
    if debug:
        print("Input Phrase:\t\t", inp_sent)
    input_ids = tokenizer.encode(inp_sent, return_tensors="pt")

    # Generate the annotated output using the model
    output_ids = model.generate(input_ids, max_length=100)

    # Decode the output to get the annotated sentence
    annotated_sentence = tokenizer.decode(output_ids[0])
    # Strip sub-word/padding markers and turn [SPACE] back into real spaces.
    tl = (
        annotated_sentence
        .replace(' ##', ' ')
        .replace('##', '')
        .replace(' \' ', "'")
        .replace("Ġ", " ")
        .replace('[PAD]', '')
        .replace('[SPACE]', ' ')
    )
    # Reverse the placeholder encoding applied by normalize_tupi().
    for i, char in enumerate(special_chars):
        tl = tl.replace(f"[w{i}q]", char)
    # NOTE(review): the fixed 5-character trim on both ends presumably drops
    # the start/end marker text produced by tokenizer.decode() -- confirm
    # against the actual decoded output before changing it.
    out_pred = navarroize_tupi(tl)[5:-5]
    return out_pred

def generate_text(input_text):
    """Gradio callback: return the annotation of *input_text*.

    Delegates to ``anotate`` with its default (non-debug) settings.
    """
    annotated = anotate(input_text)
    return annotated

# Create Gradio interface: one text box in, one text box out, with
# generate_text as the function that turns the input into the output.
iface = gr.Interface(
    fn=generate_text,
    inputs=gr.Textbox(label="Input Text"),
    outputs=gr.Textbox(label="Generated Text"),
    title="Tupi Verb Annotation",
    description="Enter text to generate output using the T5 Small model."
)

# Start the web UI only when this file is run as a script, not on import.
if __name__ == "__main__":
    iface.launch()