"""Gradio demo app for rule-based text to sign language translation.

Builds a `gradio.Blocks` UI around `slt.models.ConcatenativeSynthesis`:
the user types (or auto-completes) a sentence, picks the text language, sign
language and output format, and gets back a synthesized video file.
Every request is recorded through `request_logger`.
"""

import os
import re
from datetime import datetime
from typing import Dict

import gradio
import sign_language_translator as slt

# Markdown shown under the page title.
DESCRIPTION = """Enter your text and select languages from the dropdowns, then click Translate to generate a video. [`Library Repository`](https://github.com/sign-language-translator/sign-language-translator)

The text is preprocessed, tokenized and rearranged and then each token is mapped to a prerecorded video which are concatenated and returned. [`Model Code`](https://github.com/sign-language-translator/sign-language-translator/blob/main/sign_language_translator/models/text_to_sign/concatenative_synthesis.py)

> **NOTE**
> - This model only supports a fixed vocabulary. See the [`*-dictionary-mapping.json`](https://github.com/sign-language-translator/sign-language-datasets/tree/main/parallel_texts) files for supported words.
> - This version needs to re-encode the generated video so that will take some extra time after translation.
> - Since this is a rule-based model, you will have to add **context** to ambiguous words (e.g. glass(material) vs glass(container)).
> - Some signs correspond to words very specific in a particular language so their mapping in other languages will not make sense (e.g. in pakistan-sign-language, signs were recorded in reference to common Urdu words, hence English words "for" & "to" etc do not map well to their original Urdu words "کے لئے" and "کو" etc).
""".strip()

TITLE = "Concatenative Synthesis: Rule Based Text to Sign Language Translator"

# Script injected into the page <head>. `updateTextareaDir(language)` sets the
# text direction of the source textbox and installs a keypress handler that
# remaps Latin keys to the selected language's characters.
# NOTE: the JS uses `//` line comments, so the line breaks inside this string
# are significant and must be kept.
CUSTOM_JS = """<script>
const rtlLanguages = ["urdu", "arabic"];
const keyMap = {
    "urdu": {
        "1": "۱", "2": "۲", "3": "۳", "4": "۴", "5": "۵", "6": "٦", "7": "۷", "8": "۸", "9": "۹", "0": "۰",
        "q": "ق", "w": "و", "e": "ع", "r": "ر", "t": "ت", "y": "ے", "u": "ء", "i": "ی", "o": "ہ", "p": "پ",
        "a": "ا", "s": "س", "d": "د", "f": "ف", "g": "گ", "h": "ح", "j": "ج", "k": "ک", "l": "ل",
        "z": "ز", "x": "ش", "c": "چ", "v": "ط", "b": "ب", "n": "ن", "m": "م",
        "R": "ڑ", "T": "ٹ", "Y": "َ", "U": "ئ", "I": "ِ", "P": "ُ",
        "A": "آ", "S": "ص", "D": "ڈ", "F": "أ", "G": "غ", "H": "ھ", "J": "ض", "K": "خ",
        "Z": "ذ", "X": "ژ", "C": "ث", "V": "ظ", "N": "ں",
        ",": "،", ".": "۔", "?": "؟", ";": "؛",
    },
    "hindi": {
        "1": "१", "2": "२", "3": "३", "4": "४", "5": "५", "6": "६", "7": "७", "8": "८", "9": "९", "0": "०",
        "=": "ृ", "!": "ऍ", "@": "ॅ", "#": "्र", "$": "र्", "%": "ज्ञ", "^": "त्र", "&": "क्ष", "*": "श्र", "_": "ः", "+": "ऋ",
        "q": "ौ", "w": "ै", "e": "ा", "r": "ी", "t": "ू", "y": "ब", "u": "ह", "i": "ग", "o": "द", "p": "ज", "[": "ड", "]": "़", '\\\\': "ॉ",
        "Q": "औ", "W": "ऐ", "E": "आ", "R": "ई", "T": "ऊ", "Y": "भ", "U": "ङ", "I": "घ", "O": "ध", "P": "झ", "{": "ढ", "}": "ञ", "|": "ऑ",
        "a": "ो", "s": "े", "d": "्", "f": "ि", "g": "ु", "h": "प", "j": "र", "k": "क", "l": "त", ";": "च", "'": "ट",
        "A": "ओ", "S": "ए", "D": "अ", "F": "इ", "G": "उ", "H": "फ", "J": "ऱ", "K": "ख", "L": "थ", ":": "छ", '"': "ठ",
        "z": "ॆ", "x": "ं", "c": "म", "v": "न", "b": "व", "n": "ल", "m": "स", ".": "।", "/": "य",
        "Z": "ऎ", "X": "ँ", "C": "ण", "V": "ऩ", "B": "ऴ", "N": "ळ", "M": "श", "<": "ष", ">": "य़",
        // "?":"य़",
    }
};

function updateTextareaDir(language) {
    const sourceTextarea = document.getElementById("source-textbox").querySelector("textarea");
    if (rtlLanguages.includes(language)) {
        sourceTextarea.setAttribute("dir", "rtl");
    } else {
        sourceTextarea.setAttribute("dir", "ltr");
    }

    function keypressHandler(event) {
        const key = event.key;
        // Languages without a key map (e.g. english) are typed as-is.
        const languageKeys = keyMap[language];
        if (languageKeys && Object.prototype.hasOwnProperty.call(languageKeys, key)) {
            event.preventDefault();
            const mappedValue = languageKeys[key];
            const start = sourceTextarea.selectionStart;
            const end = sourceTextarea.selectionEnd;
            sourceTextarea.value = sourceTextarea.value.slice(0, start) + mappedValue + sourceTextarea.value.slice(end);
            sourceTextarea.selectionStart = sourceTextarea.selectionEnd = start + mappedValue.length;
        }
    }

    sourceTextarea.removeEventListener("keypress", sourceTextarea.keypressHandler);
    sourceTextarea.addEventListener("keypress", keypressHandler);
    // Save the handler function to the textarea element for future removal
    sourceTextarea.keypressHandler = keypressHandler;
}
</script>
"""
# todo: add dropdown keyboard custom component with key mapping
# todo: output full height

# Extra CSS: `.reverse-row` flips the main row so the output column is declared
# first but displayed last.
CUSTOM_CSS = (
    " .reverse-row { flex-direction: row-reverse; }"
    " #auto-complete-button { border-color: var(--button-primary-border-color-hover); } "
)

# Requests are saved to a Hugging Face dataset when a token is configured,
# otherwise to a local CSV file.
HF_TOKEN = os.getenv("HF_TOKEN")
request_logger = (
    gradio.HuggingFaceDatasetSaver(
        HF_TOKEN,
        "sltAI/crowdsourced-text-to-sign-language-rule-based-translation-corpus",
    )
    if HF_TOKEN
    else gradio.CSVLogger()
)

# Single shared translator; `text_to_video` reconfigures it on every call.
translation_model = slt.models.ConcatenativeSynthesis("ur", "pk-sl", "video")
# Cache of text-completion samplers keyed by model code (filled lazily).
language_models: Dict[str, slt.models.BeamSampling] = {}

# UI shows full language names; the translator is given the short codes.
full_to_short = {
    "english": "en",
    "urdu": "ur",
    "hindi": "hi",
}
short_to_full = {s: f for f, s in full_to_short.items()}


def auto_complete_text(model_code: str, text: str) -> str:
    """Extend `text` using the language model identified by `model_code`.

    The model is loaded and wrapped in `slt.models.BeamSampling` on first use
    and cached in `language_models` afterwards.

    Args:
        model_code: Code passed to `slt.get_model` to load the language model.
        text: Text typed so far; may be empty.

    Returns:
        The completed text, with a leading start-of-sequence token and a
        trailing end-of-sequence token removed if the model emitted them.
    """
    if model_code not in language_models:
        base_model = slt.get_model(model_code)
        language_models[model_code] = slt.models.BeamSampling(
            base_model,  # type: ignore
            start_of_sequence_token=getattr(base_model, "start_of_sequence_token", "<"),  # type: ignore
            end_of_sequence_token=getattr(base_model, "end_of_sequence_token", ">"),  # type: ignore
        )

    # todo: better tokenize/detokenize
    tokens = [w for w in re.split(r"\b", text) if w]

    sampler = language_models[model_code]
    # Allow at most 10 tokens beyond what the user has already typed.
    sampler.max_length = len(tokens) + 10
    completion, _ = sampler.complete(tokens or None)

    # Guard against an empty completion before peeking at its ends.
    if completion and completion[0] == sampler.start_of_sequence_token:  # type: ignore
        completion = completion[1:]  # type: ignore
    if completion and completion[-1] == sampler.end_of_sequence_token:  # type: ignore
        completion = completion[:-1]  # type: ignore

    return "".join(completion)


def text_to_video(
    text: str,
    text_language: str,
    sign_language: str,
    sign_format: str = "video",
    output_path: str = "output.mp4",
    codec="h264",  # ToDo: install h264 codec for opencv
) -> None:
    """Translate `text` with the shared model and write the result to disk.

    Args:
        text: Sentence to translate.
        text_language: Text language code assigned to the translator.
        sign_language: Sign language code assigned to the translator.
        sign_format: "video" or "landmarks"; the latter also selects the
            "mediapipe-world" embedding model.
        output_path: File the result is written to (overwritten if present).
        codec: Video codec used when the result is saved as a video.
    """
    translation_model.text_language = text_language
    translation_model.sign_language = sign_language
    translation_model.sign_format = sign_format
    if sign_format == "landmarks":
        translation_model.sign_embedding_model = "mediapipe-world"

    sign = translation_model.translate(text)
    if isinstance(sign, slt.Landmarks):
        # large hands on sides
        # sign.data[:, 33:] *= 2
        # sign.data[:, 33:54, 0] += 0.25
        # sign.data[:, 54:, 0] -= 0.25

        # hands moved to pose wrists
        # NOTE(review): presumably rows 0-32 are pose landmarks (15/16 the
        # wrists) and 33-53 / 54+ the two hands - confirm against slt.
        sign.data[:, 33:54, :3] += -sign.data[:, 33:34, :3] + sign.data[:, 15:16, :3]
        sign.data[:, 54:, :3] += -sign.data[:, 54:55, :3] + sign.data[:, 16:17, :3]

        sign.save_animation(output_path, overwrite=True)
    else:
        sign.save(output_path, overwrite=True, codec=codec)
    # ToDo: video.watermark("Sign Language Translator\nAI Generated Video")


def translate(text: str, text_lang: str, sign_lang: str, sign_format: str) -> str:
    """Gradio handler: translate `text` and return the path of the output file.

    The request (text, languages, error message if any, timestamp) is passed
    to `request_logger.flag` whether or not the translation succeeds.

    Args:
        text: Sentence entered by the user.
        text_lang: Full language name from the dropdown, or a short code.
        sign_lang: Sign language code.
        sign_format: Output format ("video" or "landmarks").

    Returns:
        Path of the generated file ("output.mp4").

    Raises:
        gradio.Error: If anything fails; chained to the original exception.
    """
    text_lang = full_to_short.get(text_lang, text_lang)
    # Field order matches the components given to `request_logger.setup`;
    # index 3 holds the exception message (None on success).
    log = [
        text,
        text_lang,
        sign_lang,
        None,
        datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f"),
    ]

    try:
        if text_lang == "en":
            # Lower-case only the first character of English input.
            text = text[:1].lower() + text[1:]

        path = "output.mp4"
        text_to_video(
            text,
            text_lang,
            sign_lang,
            sign_format=sign_format,
            output_path=path,
            codec="mp4v",
        )
        request_logger.flag(log)
        return path

    except Exception as exc:
        log[3] = str(exc)
        request_logger.flag(log)
        raise gradio.Error(f"Error during translation: {exc}") from exc


# NOTE(review): the nesting below is reconstructed from the call order and the
# inline comments - confirm the layout against the deployed app.
with gradio.Blocks(title=TITLE, head=CUSTOM_JS, css=CUSTOM_CSS) as gradio_app:
    gradio.Markdown(f"# {TITLE}")
    gradio.Markdown(DESCRIPTION)
    with gradio.Row(elem_classes=["reverse-row"]):  # Inputs and Outputs
        with gradio.Column():  # Outputs
            gradio.Markdown("## Output Sign Language")
            output_video = gradio.Video(
                format="mp4",
                label="Synthesized Sign Language Video",
                autoplay=True,
                show_download_button=True,
                include_audio=False,
            )

        with gradio.Column():  # Inputs
            gradio.Markdown("## Select Languages")
            with gradio.Row():
                text_lang_dropdown = gradio.Dropdown(
                    choices=[
                        short_to_full.get(code.value, code.value)
                        for code in slt.TextLanguageCodes
                    ],
                    value=short_to_full.get(
                        slt.TextLanguageCodes.URDU.value,
                        slt.TextLanguageCodes.URDU.value,
                    ),
                    label="Text Language",
                    elem_id="text-lang-dropdown",
                )
                # Re-run the JS hook whenever the text language changes.
                text_lang_dropdown.change(
                    None, inputs=text_lang_dropdown, js="updateTextareaDir"
                )
                sign_lang_dropdown = gradio.Dropdown(
                    choices=[code.value for code in slt.SignLanguageCodes],
                    value=slt.SignLanguageCodes.PAKISTAN_SIGN_LANGUAGE.value,
                    label="Sign Language",
                )
                output_format_dropdown = gradio.Dropdown(
                    choices=[
                        slt.SignFormatCodes.VIDEO.value,
                        slt.SignFormatCodes.LANDMARKS.value,
                    ],
                    value=slt.SignFormatCodes.VIDEO.value,
                    label="Output Format",
                )
                # todo: sign format: video/landmarks (tabs?)

            gradio.Markdown("## Input Text")
            with gradio.Row():  # Source TextArea
                source_textbox = gradio.Textbox(
                    lines=4,
                    placeholder="Enter Text Here...",
                    label="Spoken Language Sentence",
                    show_copy_button=True,
                    elem_id="source-textbox",
                )
            with gradio.Row():  # clear/auto-complete/Language Model
                language_model_dropdown = gradio.Dropdown(
                    choices=[
                        slt.ModelCodes.MIXER_LM_NGRAM_URDU.value,
                        slt.ModelCodes.TRANSFORMER_LM_UR_SUPPORTED.value,
                    ],
                    value=slt.ModelCodes.MIXER_LM_NGRAM_URDU.value,
                    label="Select language model to Generate sample text",
                )

                auto_complete_button = gradio.Button(
                    "Auto-Complete", elem_id="auto-complete-button"
                )
                auto_complete_button.click(
                    auto_complete_text,
                    inputs=[language_model_dropdown, source_textbox],
                    outputs=[source_textbox],
                    api_name=False,
                )
                clear_button = gradio.ClearButton(source_textbox, api_name=False)

            with gradio.Row():  # Translate Button
                translate_button = gradio.Button("Translate", variant="primary")
                translate_button.click(
                    translate,
                    inputs=[
                        source_textbox,
                        text_lang_dropdown,
                        sign_lang_dropdown,
                        output_format_dropdown,
                    ],
                    outputs=[output_video],
                    api_name="translate",
                )

    gradio.Examples(
        [
            ["We are here to use this.", "english", "pakistan-sign-language", "video"],
            ["i(me) admire art.", "english", "pakistan-sign-language", "landmarks"],
            ["یہ بہت اچھا ہے۔", "urdu", "pakistan-sign-language", "video"],
            ["وہ کام آسان تھا۔", "urdu", "pakistan-sign-language", "landmarks"],
            ["कैसे हैं आप?", "hindi", "pakistan-sign-language", "video"],
            ["पाँच घंटे।", "hindi", "pakistan-sign-language", "landmarks"],
        ],
        inputs=[
            source_textbox,
            text_lang_dropdown,
            sign_lang_dropdown,
            output_format_dropdown,
        ],
        outputs=output_video,
    )
    # One component per field of the `log` list built in `translate`.
    request_logger.setup(
        [
            source_textbox,
            text_lang_dropdown,
            sign_lang_dropdown,
            gradio.Markdown(label="Exception"),
            gradio.Markdown(label="Timestamp"),
        ],
        "flagged",
    )

    # Apply the text direction / key map for the default language on page load.
    gradio_app.load(None, inputs=[text_lang_dropdown], js="updateTextareaDir")

if __name__ == "__main__":
    gradio_app.launch()