Spaces:

waleko
/

gradio-transformer-en-ru

Running

waleko committed on Nov 22, 2023

Commit

22b56e7

1 Parent(s): d81b460

Add translation application with Gradio interface

A new application for text translation from English to Russian is introduced. It uses a trained transformer model for translation and exposes the functionality through a clean interface using Gradio. The application accepts a string of English text and returns a JSON structure with the translation result, which includes details such as the tokenized input and output text, the output scores, and the cross-attention matrix.

Files changed (3) hide show

.gitignore +1 -0
app.py +44 -0
translate.py +82 -0

.gitignore ADDED Viewed

	@@ -0,0 +1 @@


1	+ .idea/

app.py ADDED Viewed

	@@ -0,0 +1,44 @@

+import gradio as gr
+from translate import translator_fn
+def predict(text):
+    result = translator_fn(text)
+    return {
+        "input_text": result.input_text,
+        "input_tokens": result.input_tokens,
+        "n_input": result.n_input,
+        "output_text": result.output_text,
+        "output_tokens": result.output_tokens,
+        "n_output": result.n_output,
+        "output_scores": result.output_scores,
+        "cross_attention": result.cross_attention.tolist(),
+    }
+gradio_app = gr.Interface(
+    predict,
+    inputs=gr.Text(placeholder="Enter a sentence to translate...", label="Input text"),
+    outputs=[gr.Json(description="Model output", label="Model output")],
+    title="En2Ru Scientific Translator",
+    description="Translate scientific texts from English to Russian",
+    examples=[
+        [
+            r"There is no closed form to implement the KL divergence by the definition of (REF ) and (REF ) for "
+            r"Gaussian Mixture Models. Instead, we resort to the Monte Carlo simulation method proposed in [1]}. "
+            r"Then, the KL divergence can be caculated by: \(D_{KL_{MC}}(p||q) =\frac{1}{n} \sum _{i=1}^{n} log("
+            r"\frac{p(x_i)}{q(x_i)})\) \(D_{KL_{MC}}(q||p) =\frac{1}{n} \sum _{i=1}^{n} log(\frac{q(y_i)}{p(y_i)})\)"],
+        [
+            r"Almost all currently used classifiers are not intrinsically well-calibrated [1]}, which means their "
+            r"output scores can't be interpreted as probabilities. This is an issue when the model is used for "
+            r"decision making, as a component in a more general probabilistic pipeline, or simply when one needs a "
+            r"quantification of the uncertainty in model's predictions, for example in high risk applications."],
+        [
+            r"First, with the development of the high-torque electric actuators, such as [1]}, [2]} the robots are "
+            r"becoming more dynamical. These actuators allow them not only to move at high speeds, but also to "
+            r"rapidly create forces and torques to perform dynamic actions, such as running, jumping, etc."],
+    ],
+)
+if __name__ == "__main__":
+    gradio_app.launch()

translate.py ADDED Viewed

	@@ -0,0 +1,82 @@

+from dataclasses import dataclass
+from typing import List
+from typing import Tuple
+import numpy as np
+# Load model directly
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+import torch
+tokenizer = AutoTokenizer.from_pretrained("under-tree/transformer-en-ru")
+model = AutoModelForSeq2SeqLM.from_pretrained("under-tree/transformer-en-ru")
+@dataclass
+class TranslationResult:
+    """Bundle of everything ``translator_fn`` reports about a single translation."""
+    # The source text exactly as it was passed to ``translator_fn``.
+    input_text: str
+    # Number of entries in ``input_tokens``.
+    n_input: int
+    # Decoded input tokens with special tokens removed.
+    input_tokens: List[str]
+    # Number of entries in ``output_tokens``.
+    n_output: int
+    # Generated text decoded with ``skip_special_tokens=True``.
+    output_text: str
+    # Decoded generated tokens with special tokens removed.
+    output_tokens: List[str]
+    # One list per kept output token: (candidate token, normalized top-k score) pairs.
+    output_scores: List[List[Tuple[str, float]]]
+    # Averaged cross-attention matrix; ``translator_fn`` asserts shape (n_output, n_input).
+    cross_attention: np.ndarray
+def translator_fn(input_text: str, k=10) -> TranslationResult:
+    # Preprocess input
+    inputs = tokenizer(input_text, return_tensors="pt")
+    input_tokens = tokenizer.batch_decode(inputs.input_ids[0])
+    input_special_mask = torch.tensor([1 if t in tokenizer.all_special_tokens else 0 for t in input_tokens])
+    # Generate output
+    outputs = model.generate(**inputs, return_dict_in_generate=True, output_scores=True, output_attentions=True)
+    output_text = tokenizer.decode(outputs.sequences[0], skip_special_tokens=True)
+    output_tokens = tokenizer.batch_decode(outputs.sequences[0])
+    output_special_mask = torch.tensor([1 if t in tokenizer.all_special_tokens else 0 for t in output_tokens])
+    # Get cross attention matrix
+    cross_attention = torch.stack([torch.stack(t) for t in outputs.cross_attentions])
+    attention_matrix = cross_attention.mean(dim=4).mean(dim=3).mean(dim=2).mean(dim=1).detach().cpu().numpy()
+    # Get top tokens
+    top_scores = []
+    len_input = len(input_tokens)
+    len_output = len(output_tokens)
+    for i in range(len_output - 1):
+        if i + 1 < len_output and output_special_mask[i + 1] == 1:
+            # Skip special tokens (e.g. </s>, <pad>, etc.)
+            continue
+        top_elements, top_indices = outputs.scores[i].mean(dim=0).topk(k)
+        top_elements = top_elements.exp()
+        top_elements /= top_elements.sum()
+        top_indices = tokenizer.batch_decode(top_indices)
+        # filter out special tokens
+        top_pairs = [(m, t.item()) for t, m in zip(top_elements, top_indices) if m not in tokenizer.all_special_tokens]
+        top_scores.append(top_pairs)
+    # Filter out special tokens from all elements
+    clean_output_tokens = [t for t, m in zip(output_tokens, output_special_mask) if m == 0]
+    clean_input_tokens = [t for t, m in zip(input_tokens, input_special_mask) if m == 0]
+    clean_attention_matrix = attention_matrix[:len_output, :len_input]  # for padding
+    clean_attention_matrix = np.delete(clean_attention_matrix, np.where(output_special_mask == 1), axis=0)
+    clean_attention_matrix = np.delete(clean_attention_matrix, np.where(input_special_mask == 1), axis=1)
+    n_input = len(clean_input_tokens)
+    n_output = len(clean_output_tokens)
+    assert clean_attention_matrix.shape == (n_output, n_input)
+    assert len(top_scores) == n_output
+    return TranslationResult(
+        input_text=input_text,
+        n_input=n_input,
+        input_tokens=clean_input_tokens,
+        output_text=output_text,
+        n_output=n_output,
+        output_tokens=clean_output_tokens,
+        output_scores=top_scores,
+        cross_attention=clean_attention_matrix
+    )