Spaces:

kiansheik
/

tupi-verb-annotation

Sleeping

App Files Files Community

tupi-verb-annotation / app.py

kiansheik

Update app.py

02d00af verified 5 months ago

raw

history blame contribute delete

2.39 kB

	import gradio as gr
	import re
	from transformers import T5Tokenizer, T5ForConditionalGeneration
	import torch

	# Print at import time whether PyTorch reports a usable CUDA device.
	print(f"Is CUDA available: {torch.cuda.is_available()}")

	# Load the model and tokenizer
	# Both are loaded under the same identifier and kept as module-level
	# globals; anotate() reads them directly.
	model_name = "kiansheik/tupi-verb-anotation"
	tokenizer = T5Tokenizer.from_pretrained(model_name)
	model = T5ForConditionalGeneration.from_pretrained(model_name)

	# Special characters of the Navarro alphabet that receive placeholder
	# encoding: the accented letters plus the (ASCII) apostrophe.
	special_chars = "û î ŷ á é í ý ó ú ã ẽ ĩ ỹ õ ũ '".split(" ")
	# The index of each character in this list is its placeholder id:
	# normalize_tupi() rewrites entry i as "[w{i}q]" and anotate() maps the
	# placeholder back to the character, so the order of the list matters.
	def normalize_tupi(x):
	    """Encode a string into the placeholder form used as model input.

	    Every entry of the module-level ``special_chars`` list is replaced by
	    ``[w<i>q]``, where ``<i>`` is the entry's index in that list. Runs of
	    whitespace are then collapsed to a single space, leading and trailing
	    whitespace is stripped, and each remaining space is written as the
	    literal token ``[SPACE]``.

	    Args:
	        x: Text to encode.

	    Returns:
	        The encoded text; it contains no whitespace characters.
	    """
	    for index, special in enumerate(special_chars):
	        x = x.replace(special, f"[w{index}q]")
	    # Raw string: "\s" inside a plain literal is an invalid escape sequence
	    # and triggers a warning on current Python versions.
	    collapsed = re.sub(r"\s+", " ", x).strip()
	    return collapsed.replace(" ", "[SPACE]")
	def replace_outside_brackets(match):
	    """Return the matched text unchanged (replacement callback for ``re.sub``).

	    Bracketed and unbracketed matches are both returned as-is, so a
	    substitution that uses this callback leaves its input text unmodified.

	    NOTE(review): the name suggests that text outside square brackets was
	    meant to be rewritten here; no such rewrite is implemented.

	    Args:
	        match: The ``re.Match`` object handed over by ``re.sub``.

	    Returns:
	        ``match.group()``, i.e. the full matched text.
	    """
	    return match.group()
	def navarroize_tupi(x):
	    """Run ``replace_outside_brackets`` over every segment of ``x``.

	    The text is split by a regular expression into bracketed tokens
	    (``[...]``) and runs of characters that contain no brackets; each
	    segment is passed to ``replace_outside_brackets`` and the results are
	    joined back together by ``re.sub``.

	    Args:
	        x: Text to process.

	    Returns:
	        The text with every matched segment replaced by the callback's
	        return value.
	    """
	    # Alternation: a bracketed token OR a run of non-bracket characters.
	    # The pipe must stay unescaped; "\|" would match a literal "|" and fuse
	    # both halves into a single sequence that ordinary text never contains.
	    pattern = r'\[.*?\]|[^\[\]]+'
	    return re.sub(pattern, replace_outside_brackets, x)

	def anotate(st, debug=False):
	    """Run the loaded model on a sentence and return the cleaned-up output.

	    Args:
	        st: Raw input sentence.
	        debug: When true, print the encoded model input before generation.

	    Returns:
	        The decoded model output with tokenizer markers and placeholders
	        mapped back to text, minus its first and last five characters.
	    """
	    # Lower-case and encode the input, then delete the punctuation marks
	    # , . ? ! and - from the encoded form.
	    # NOTE(review): punctuation is removed after whitespace has already been
	    # turned into [SPACE] tokens, so " - " leaves two adjacent [SPACE]
	    # tokens behind -- confirm this matches how the model was trained.
	    punctuation = str.maketrans('', '', ',.?!-')
	    encoded = normalize_tupi(st.lower()).translate(punctuation).strip()
	    model_input = navarroize_tupi(encoded).replace(' ', '[SPACE]')
	    if debug:
	        print("Input Phrase:\t\t", model_input)

	    # Tokenize, generate (capped at max_length=100) and decode the first
	    # returned sequence.
	    token_ids = tokenizer.encode(model_input, return_tensors="pt")
	    generated = model.generate(token_ids, max_length=100)
	    decoded = tokenizer.decode(generated[0])

	    # Undo tokenizer artefacts; the replacements are applied in this order.
	    cleanup_steps = (
	        (' ##', ' '),
	        ('##', ''),
	        (" ' ", "'"),
	        ("Ġ", " "),
	        ('[PAD]', ''),
	        ('[SPACE]', ' '),
	    )
	    for old, new in cleanup_steps:
	        decoded = decoded.replace(old, new)

	    # Map every "[w<i>q]" placeholder back to its special character.
	    for index, special in enumerate(special_chars):
	        decoded = decoded.replace(f"[w{index}q]", special)

	    # Drops five characters from each end -- presumably the special-token
	    # markers that wrap the decoded sequence; TODO confirm against the
	    # tokenizer's actual start/end tokens.
	    return navarroize_tupi(decoded)[5:-5]

	def generate_text(input_text):
	    """Return ``anotate(input_text)``; used as the Gradio interface callback.

	    Args:
	        input_text: Text submitted through the input textbox.

	    Returns:
	        The annotated text produced by ``anotate`` with its default
	        ``debug=False``.
	    """
	    annotated = anotate(input_text)
	    return annotated

	# Web UI with one input textbox and one output textbox: the submitted text
	# is passed to generate_text() and its return value is displayed.
	iface = gr.Interface(
	    title="Tupi Verb Annotation",
	    description="Enter text to generate output using the T5 Small model.",
	    fn=generate_text,
	    inputs=gr.Textbox(label="Input Text"),
	    outputs=gr.Textbox(label="Generated Text"),
	)

	# Launch the interface only when this file is executed as a script.
	if __name__ == "__main__":
	    iface.launch()