File size: 2,390 Bytes
af05291
 
 
020ff8e
 
 
af05291
 
 
 
 
 
 
 
 
 
d8232db
 
4ab4f80
af05291
 
 
 
 
 
 
 
 
 
 
 
724d744
 
af05291
 
 
 
 
 
 
 
 
724d744
4ab4f80
 
af05291
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import re
import unicodedata

import gradio as gr
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Report at startup whether PyTorch can see a CUDA device. This is purely
# informational: this block does not move the model to a GPU.
print(f"Is CUDA available: {torch.cuda.is_available()}")

# Load the tokenizer and the T5 conditional-generation model from the same
# checkpoint name. These calls run at import time, so importing this module
# performs the load.
model_name = "kiansheik/tupi-verb-anotation"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Non-ASCII letters of the Navarro alphabet, plus the apostrophe.
# Each entry is encoded as the ASCII placeholder "[w<index>q]", so the ORDER of
# this list is part of the encoding: reordering it changes every placeholder.
special_chars = "û î ŷ á é í ý ó ú ã ẽ ĩ ỹ õ ũ '".split(" ")


def normalize_tupi(x):
    """Encode Tupi text into the ASCII placeholder form used as model input.

    Steps, in order:

    1. Compose the text to Unicode NFC so that decomposed accents (for
       example ``"a"`` followed by U+0301) match the precomposed entries of
       ``special_chars``.
    2. Replace every entry of ``special_chars`` with ``"[w<index>q]"``.
    3. Collapse each run of whitespace to a single space, trim both ends, and
       replace the remaining spaces with the literal token ``"[SPACE]"``.

    Args:
        x: The text to encode.

    Returns:
        The encoded string; it contains no whitespace characters.
    """
    # Without this, "á" typed as two code points would slip through unencoded.
    x = unicodedata.normalize("NFC", x)
    for i, char in enumerate(special_chars):
        # Normalise the table entry too, so matching does not depend on how
        # the literal above happens to be stored in the source file.
        x = x.replace(unicodedata.normalize("NFC", char), f"[w{i}q]")
    # Raw string: "\s" in a plain literal is an invalid escape sequence.
    return re.sub(r"\s+", " ", x).strip().replace(" ", "[SPACE]")
def replace_outside_brackets(match):
    """Return the matched text unchanged; used as an ``re.sub`` callback.

    The previous implementation tested whether the match was a bracketed
    token but returned the same value on both paths, so the test had no
    effect. It is collapsed here; the function is an identity callback for
    both bracketed tokens and plain text.

    Args:
        match: An ``re.Match`` object supplied by ``re.sub``.

    Returns:
        The full text of the match (``match.group()``).
    """
    return match.group()
def navarroize_tupi(x):
    """Pass each piece of *x* through ``replace_outside_brackets``.

    The text is split by a regex into two kinds of pieces: a lazily matched
    ``[...]`` token, or a run of characters that contains no square bracket.
    Every piece is replaced by whatever the callback returns for it; any
    character the pattern does not match is left in place by ``re.sub``.

    Args:
        x: The text to process.

    Returns:
        The text with every matched piece replaced by the callback's result.
    """
    # First alternative: bracketed token. Second: bracket-free run of text.
    piece_pattern = re.compile(r'\[.*?\]|[^[\]]+')
    return piece_pattern.sub(replace_outside_brackets, x)

# Punctuation deleted from the input before it is encoded for the model.
_PUNCTUATION_TABLE = str.maketrans("", "", ",.?!-")


def anotate(st, debug=False, max_length=100):
    """Annotate a sentence with the loaded model and return the decoded text.

    The input is lower-cased, stripped of ``, . ? ! -``, encoded with
    ``normalize_tupi``, tokenized, and passed to ``model.generate``. The
    decoded output is cleaned of marker strings, the ``[w<index>q]``
    placeholders are mapped back to ``special_chars``, and five characters
    are cut from each end of the result.

    Args:
        st: The sentence to annotate. ``None`` is treated as empty.
        debug: When true, print the encoded input before tokenizing.
        max_length: Forwarded to ``model.generate`` as ``max_length``.

    Returns:
        The annotated sentence, or ``""`` when the input holds nothing but
        whitespace and stripped punctuation (the model is not called then).
    """
    # Strip punctuation BEFORE whitespace is collapsed. Doing it afterwards
    # turned "a , b" into two consecutive [SPACE] tokens and "a ." into a
    # trailing one, because the spaces had already been encoded.
    cleaned = (st or "").lower().translate(_PUNCTUATION_TABLE).strip()
    if not cleaned:
        return ""

    sentence = normalize_tupi(cleaned)
    # normalize_tupi already encodes spaces; the replace is kept as a guard in
    # case navarroize_tupi ever emits a plain space.
    inp_sent = navarroize_tupi(sentence).replace(' ', '[SPACE]')
    if debug:
        print("Input Phrase:\t\t", inp_sent)

    # Put the ids on whatever device the model lives on (a no-op on CPU).
    input_ids = tokenizer.encode(inp_sent, return_tensors="pt").to(model.device)

    # Generate the annotated output using the model
    output_ids = model.generate(input_ids, max_length=max_length)

    # Decode the first (only) sequence, special tokens included.
    tl = tokenizer.decode(output_ids[0])

    # Order matters: ' ##' must be handled before the bare '##'.
    # NOTE(review): '##' and 'Ġ' look like sub-word tokenizer markers; confirm
    # which of these the checkpoint's tokenizer can actually emit.
    for marker, replacement in (
        (' ##', ' '),
        ('##', ''),
        (' \' ', "'"),
        ("Ġ", " "),
        ('[PAD]', ''),
        ('[SPACE]', ' '),
    ):
        tl = tl.replace(marker, replacement)

    # Map the ASCII placeholders back to the original characters.
    for i, char in enumerate(special_chars):
        tl = tl.replace(f"[w{i}q]", char)

    # NOTE(review): the fixed-width slice presumably drops a 5-character
    # start marker and a 5-character end marker left by decode(); it will cut
    # real text if either marker is absent (e.g. output truncated at
    # max_length). Confirm against the tokenizer before changing.
    out_pred = navarroize_tupi(tl)[5:-5]
    return out_pred

def generate_text(input_text):
    """Annotate *input_text* with ``anotate`` using its default options.

    Args:
        input_text: The text to annotate.

    Returns:
        Whatever ``anotate`` returns for the text.
    """
    annotated = anotate(input_text)
    return annotated

# Build the Gradio UI: one text box in, one text box out, with generate_text
# as the function that maps the input to the output.
_input_box = gr.Textbox(label="Input Text")
_output_box = gr.Textbox(label="Generated Text")

iface = gr.Interface(
    fn=generate_text,
    inputs=_input_box,
    outputs=_output_box,
    title="Tupi Verb Annotation",
    description="Enter text to generate output using the T5 Small model.",
)

# Launch the interface only when this file is run as a script, not on import.
if __name__ == "__main__":
    iface.launch()