Spaces:
Sleeping
Sleeping
File size: 2,390 Bytes
af05291 020ff8e af05291 d8232db 4ab4f80 af05291 724d744 af05291 724d744 4ab4f80 af05291 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 |
import gradio as gr
import re
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
# Report whether PyTorch can see a CUDA device. This is informational only:
# nothing in this file uses the result to move the model to a device.
print(f"Is CUDA available: {torch.cuda.is_available()}")
# Load the model and tokenizer
# Both objects are module-level globals that anotate() reads directly.
model_name = "kiansheik/tupi-verb-anotation"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
# Identify all non-ascii special chars in the navarro alphabet
# (the list also ends with the plain apostrophe).
# The position of each entry is its placeholder index: normalize_tupi()
# encodes entry i as "[w{i}q]" and anotate() decodes it back, so the order
# of this list must not change independently of the model.
special_chars = "û î ŷ á é í ý ó ú ã ẽ ĩ ỹ õ ũ '".split(" ")
# Two-way mapping: the special char at index i is encoded as the placeholder "[w{i}q]" (anotate decodes it back)
def normalize_tupi(x):
    """Encode a Tupi string into the placeholder form given to the tokenizer.

    Every entry of the module-level ``special_chars`` list is replaced by
    the placeholder ``[w<i>q]``, where ``<i>`` is the entry's index in that
    list. Runs of whitespace are then collapsed to a single space, leading
    and trailing whitespace is removed, and each remaining space becomes
    the literal token ``[SPACE]``.

    Args:
        x: Text to encode.

    Returns:
        The encoded string, with spaces written as ``[SPACE]``.
    """
    for i, char in enumerate(special_chars):
        x = x.replace(char, f"[w{i}q]")
    # Raw string on purpose: "\s" in a plain literal is an invalid escape
    # sequence (DeprecationWarning, SyntaxWarning from Python 3.12).
    return re.sub(r"\s+", " ", x).strip().replace(" ", "[SPACE]")
def replace_outside_brackets(match):
    """Return the text matched by *match* unchanged.

    Serves as the replacement callback passed to ``re.sub`` in
    ``navarroize_tupi``. Bracketed segments and plain text are both passed
    through as-is: the earlier ``if`` on the surrounding brackets returned
    the same value on both paths, so that check was dead code and is gone.

    Args:
        match: An ``re.Match`` object.

    Returns:
        ``match.group()``, i.e. the whole matched substring.
    """
    return match.group()
def navarroize_tupi(x):
    """Run ``replace_outside_brackets`` over every segment of *x*.

    The regex splits the string into bracketed segments (``[...]``) and
    runs of characters that are neither ``[`` nor ``]``. Each segment is
    replaced by whatever ``replace_outside_brackets`` returns for it.

    Args:
        x: String to process.

    Returns:
        The string with every matched segment replaced.
    """
    # First alternative: a bracketed group; second: text outside brackets.
    segment_re = re.compile(r'\[.*?\]|[^[\]]+')
    return segment_re.sub(replace_outside_brackets, x)
def anotate(st, debug=False):
    """Annotate a Tupi sentence with the loaded model.

    The input is lower-cased, encoded with ``normalize_tupi``, stripped of
    the punctuation characters ``, . ? ! -``, tokenized and passed to
    ``model.generate``. The decoded output is then cleaned of tokenizer
    artifacts and the ``[w<i>q]`` placeholders are mapped back to the
    matching entries of ``special_chars``.

    Args:
        st: Sentence to annotate.
        debug: When true, print the encoded model input.

    Returns:
        The decoded annotation with its first five and last five
        characters removed.
    """
    # Encode first, then delete the punctuation characters in one pass.
    punctuation = str.maketrans("", "", ",.?!-")
    encoded = normalize_tupi(st.lower()).translate(punctuation).strip()
    model_input = navarroize_tupi(encoded).replace(' ', '[SPACE]')
    if debug:
        print("Input Phrase:\t\t", model_input)

    # Tokenize, generate and decode the first returned sequence.
    input_ids = tokenizer.encode(model_input, return_tensors="pt")
    output_ids = model.generate(input_ids, max_length=100)
    decoded = tokenizer.decode(output_ids[0])

    # Tokenizer artifacts to undo; applied strictly in this order.
    cleanup = (
        (' ##', ' '),
        ('##', ''),
        (" ' ", "'"),
        ("Ġ", " "),
        ('[PAD]', ''),
        ('[SPACE]', ' '),
    )
    text = decoded
    for old, new in cleanup:
        text = text.replace(old, new)

    # Map each placeholder back to its special character.
    for i, char in enumerate(special_chars):
        text = text.replace(f"[w{i}q]", char)

    # NOTE(review): presumably trims 5-character start/end markers left by
    # decode() -- confirm against the tokenizer's special tokens.
    return navarroize_tupi(text)[5:-5]
def generate_text(input_text):
    """Return the annotation of *input_text*.

    This is the function handed to the Gradio interface; it delegates to
    ``anotate`` with debug output turned off.
    """
    annotated = anotate(input_text, debug=False)
    return annotated
# Create Gradio interface: one text box in, one text box out, with
# generate_text as the callback that maps the input to the output.
iface = gr.Interface(
    fn=generate_text,
    inputs=gr.Textbox(label="Input Text"),
    outputs=gr.Textbox(label="Generated Text"),
    title="Tupi Verb Annotation",
    description="Enter text to generate output using the T5 Small model."
)
# Launch the UI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    iface.launch()
|