import os
import gradio as gr
from transformers import AutoProcessor, AutoModelForCausalLM
from PIL import Image
import torch
from gtts import gTTS
import spacy
import requests
import nltk.tree
import re

# Download the Portuguese spaCy model
os.system("python -m spacy download pt_core_news_sm")

# Load the Portuguese spaCy model
nlp = spacy.load("pt_core_news_sm")

# Key for the LX-Parser web service
key = "aac7f02d6bd21703b753e25646d6426d"

# Grammar manipulation helpers

def invert_adj_n(doc, tags):
    """Swap adjective-noun pairs ("A N") so the noun comes before the adjective."""
    frase = []
    already = False
    for i in range(len(doc)):
        if already:
            # The next token was already emitted as part of a swapped pair
            already = False
            continue
        if doc[i].tag_ != "PUNCT" and tags[i] == "A" and i + 1 < len(tags) and tags[i + 1] == "N":
            frase.append(doc[i + 1].text)
            frase.append(doc[i].text)
            already = True
        else:
            frase.append(doc[i].text)
    return frase

def adjust_adj(doc, tags):
    """Insert "e" (and) between two consecutive adjectives."""
    frase = []
    for i in range(len(doc)):
        frase.append(doc[i].text)
        if tags[i] == "A" and i + 1 < len(tags) and tags[i + 1] == "A":
            frase.append("e")
    return frase

def adjust_art(doc, tags):
    """Replace the bare article "a" with a form that agrees with the following word."""
    frase = []
    for i in range(len(doc)):
        text = doc[i].text
        if tags[i] == "ART" and text.lower() == "a" and i + 1 < len(doc):
            gender = doc[i + 1].morph.get("Gender")
            number = doc[i + 1].morph.get("Number")
            if gender and number:
                if gender[0] == "Masc" and number[0] == "Sing":
                    frase.append("um")
                elif gender[0] == "Fem" and number[0] == "Sing":
                    frase.append("uma")
                elif gender[0] == "Masc" and number[0] != "Sing":
                    frase.append("os")
                else:
                    frase.append("as")
            else:
                frase.append(text)
        else:
            frase.append(text)
    return frase

def create_sentence(doc, tags, frase):
    """Capitalize sentence-initial tokens and attach punctuation to the preceding token."""
    tmp = frase
    for i in range(len(doc)):
        text = doc[i].text
        if doc[i].is_sent_start:
            tmp[i] = tmp[i].capitalize()
        if doc[i].tag_ == "PUNCT":
            tmp[i - 1] += text
    return tmp

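# Illustrative sketch of what the helpers above are meant to do; the tokens and
# tags are assumed toy examples, not recorded parser output. For "a bonita casa"
# tagged ["ART", "A", "N"], invert_adj_n should move the noun before the
# adjective:
#     invert_adj_n(nlp("a bonita casa"), ["ART", "A", "N"])  # -> ["a", "casa", "bonita"]
# adjust_adj inserts "e" between two consecutive "A" tags, adjust_art replaces a
# bare article "a" with "um"/"uma"/"os"/"as" according to the gender and number
# spaCy reports for the next token, and create_sentence capitalizes
# sentence-initial tokens and glues punctuation onto the preceding word.
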
def get_productions(texto):
    """Parse `texto` with the LX-Parser web service and return its lexical productions."""
    url = "https://portulanclarin.net/workbench/lx-parser/api/"
    request_data = {
        'method': 'parse',
        'jsonrpc': '2.0',
        'id': 0,
        'params': {
            'text': texto,
            'format': 'parentheses',
            'key': key,
        },
    }
    request = requests.post(url, json=request_data)
    response_data = request.json()
    if "error" in response_data:
        print("Error:", response_data["error"])
        return []
    result = response_data["result"]
    productions = []
    tree = nltk.tree.Tree.fromstring(result)
    # Keep only lexical productions (those whose right-hand side is a quoted terminal)
    for production in tree.productions():
        if re.findall(r"'.*'", str(production)):
            productions.append(str(production))
    return productions

def get_tags(productions):
    """Extract the left-hand-side tag (e.g. "N", "A", "ART") of each production."""
    tags = []
    for item in productions:
        if isinstance(item, str):
            tags.append(item[:item.find(' ->')])
        else:
            tags.append(item)
    # Filter into a new list instead of removing while iterating, which skips elements
    return [tag for tag in tags if "'" not in str(tag)]

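# Rough sketch of the data the two functions above pass around; the parse string
# is an assumed example of the LX-Parser "parentheses" format, not a recorded
# response. For "a casa amarela" the service is expected to return a bracketed
# tree along the lines of "(S (NP (ART a) (N casa) (A amarela)))"; after reading
# it with nltk.tree.Tree.fromstring, get_productions() keeps only the lexical
# rules, e.g. ["ART -> 'a'", "N -> 'casa'", "A -> 'amarela'"], and get_tags()
# reduces them to their left-hand sides, ["ART", "N", "A"], which is the tag
# sequence the reordering code below consumes.
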
def reordenar_sentenca(sentenca):
    """Normalize a generated caption: ensure it starts with an article, rewrite
    noun-noun compounds as "N de N", move adjectives after their nouns, join
    consecutive adjectives with "e", fix article agreement, and restore
    capitalization and punctuation. Returns the sentence and its tag sequence."""
    if not sentenca.strip():
        return sentenca, []
    sentenca = sentenca.lower()
    sentence = get_productions(sentenca)
    tags = get_tags(sentence)
    doc = nlp(sentenca)
    if not tags:
        return sentenca.strip(), []
    # Prepend an article if the sentence does not start with one
    if tags[0] != "ART":
        sentenca = "A " + sentenca.strip()
        sentence = get_productions(sentenca)
        tags = get_tags(sentence)
        doc = nlp(sentenca)
    if not sentence:
        return sentenca.strip(), []
    # Rewrite "ART N N ..." as "ART N de N ..."
    if len(tags) > 2 and tags[1] == "N" and tags[2] == "N":
        aux = sentenca.split()
        tmp = aux[1]
        aux[1] = aux[2]
        aux.insert(2, "de")
        aux[3] = tmp
        sentenca = " ".join(aux)
        sentence = get_productions(sentenca)
        tags = get_tags(sentence)
        doc = nlp(sentenca)
    # Repeat adjective-noun inversion until the sentence stops changing
    frase = invert_adj_n(doc, tags)
    nova_sentenca = ' '.join(frase)
    productions = get_productions(nova_sentenca)
    tags = get_tags(productions)
    doc = nlp(nova_sentenca)
    while nova_sentenca != sentenca:
        frase = invert_adj_n(doc, tags)
        sentenca = nova_sentenca
        nova_sentenca = ' '.join(frase)
        productions = get_productions(nova_sentenca)
        tags = get_tags(productions)
        doc = nlp(nova_sentenca)
    # Repeat "e" insertion between consecutive adjectives until stable
    frase = adjust_adj(doc, tags)
    nova_sentenca = ' '.join(frase)
    productions = get_productions(nova_sentenca)
    tags = get_tags(productions)
    doc = nlp(nova_sentenca)
    while nova_sentenca != sentenca:
        frase = adjust_adj(doc, tags)
        sentenca = nova_sentenca
        nova_sentenca = ' '.join(frase)
        productions = get_productions(nova_sentenca)
        tags = get_tags(productions)
        doc = nlp(nova_sentenca)
    # Fix article agreement and final formatting
    frase = adjust_art(doc, tags)
    sentenca = ' '.join(frase)
    productions = get_productions(sentenca)
    tags = get_tags(productions)
    doc = nlp(sentenca)
    frase = create_sentence(doc, tags, frase)
    return " ".join(frase).strip(), tags

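# End-to-end intent of reordenar_sentenca, as a hedged, assumed example (the
# actual result depends on what LX-Parser and spaCy return for the caption): a
# raw caption like "amarela casa" is lower-cased, gets "A " prepended because it
# does not start with an article, has its adjective-noun pair flipped to
# "a casa amarela", and, if spaCy marks "casa" as feminine singular, has the
# article rewritten by adjust_art and the first token capitalized, ending up
# roughly as "Uma casa amarela" together with its tag sequence.
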
# Load the captioning model
processor = AutoProcessor.from_pretrained("histlearn/microsoft-git-portuguese-neuro-simbolic")
model = AutoModelForCausalLM.from_pretrained("histlearn/microsoft-git-portuguese-neuro-simbolic")

# Select the device (GPU or CPU)
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Helper functions
def prepare_image(image_path):
    image = Image.open(image_path).convert("RGB")
    inputs = processor(images=image, return_tensors="pt").to(device)
    return image, inputs.pixel_values

def generate_caption(pixel_values):
    model.eval()
    with torch.no_grad():
        generated_ids = model.generate(
            pixel_values=pixel_values,
            max_length=50,
            num_beams=4,
            early_stopping=True,
            no_repeat_ngram_size=2
        )
    return processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

def text_to_speech_gtts(text, lang='pt'):
    tts = gTTS(text=text, lang=lang)
    tts.save("output.mp3")
    return "output.mp3"

# Main pipeline: caption the image, normalize the caption, and synthesize speech
def process_image(image):
    _, pixel_values = prepare_image(image)
    caption_pt = generate_caption(pixel_values)
    caption_pt, tags = reordenar_sentenca(caption_pt)
    audio_file = text_to_speech_gtts(caption_pt)
    tags_text = " | ".join(tags)
    return caption_pt, tags_text, audio_file

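# Optional local sanity check for the pipeline above, left commented out so
# nothing extra runs when the Space starts; it reuses the example image shipped
# with this Space (see example_image_paths below):
# if __name__ == "__main__":
#     caption, tags_text, audio_path = process_image("./example2.jpeg")
#     print(caption)
#     print(tags_text)
#     print(audio_path)
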
# Paths to the example images
example_image_paths = [
    "./example2.jpeg"
]

# Gradio interface
iface = gr.Interface(
    fn=process_image,
    inputs=gr.Image(type="filepath"),
    outputs=[gr.Textbox(label="Caption"), gr.Textbox(label="Morphological Analysis"), gr.Audio(type="filepath")],
    examples=example_image_paths,
    title="Image to Voice",
    description="Gera uma descrição em português e a converte em voz a partir de uma imagem."
)

# Launch the interface
if __name__ == "__main__":
    iface.launch()