Spaces:

msibertman
/

rank_speech

Sleeping

App Files Files Community

Mohammad Sabik Irbaz committed on Apr 2, 2023

Commit

a4a2eb9

1 Parent(s): 8aad394

speech rank

Browse files

Files changed (2) hide show

app.py +154 -3
requirements.txt +11 -0

app.py CHANGED Viewed

@@ -1,7 +1,158 @@
 import gradio as gr
-def greet(name):
-    return "Hello " + name + "!!"
-iface = gr.Interface(fn=greet, inputs="text", outputs="text")
 iface.launch()

+from sentence_transformers import SentenceTransformer, util
+from transformers import AutoTokenizer, AutoModel
+from torch.nn.functional import softmax
+from transformers import pipeline
+import time, librosa, torch, io
+from pydub import AudioSegment
 import gradio as gr
+import numpy as np
+# Single place that decides where every model below runs.
+device = 'cpu'
+# Level labels ordered from lowest to highest (presumably CEFR levels; the UI
+# labels the output "Rank (A1-C2)").
+cols = ['A1','A2','B1','B2','C1','C2']
+# T5 tokenizer and bare T5 model; test_speech feeds them to fluency_scoring.
+tokenizer = AutoTokenizer.from_pretrained('t5-base')
+lm = AutoModel.from_pretrained('t5-base').to(device)
+# Sentence-embedding model used by similarity_scoring.
+model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2').to(device)
+# Speech recogniser used by speech_to_text. It now follows the shared `device`
+# constant instead of repeating the literal, so all models move together.
+pipe = pipeline("automatic-speech-recognition",
+                model="openai/whisper-base.en",
+                chunk_length_s=30, device=device)
+def vocab_scoring(tokens, duration):
+    """Score vocabulary richness as distinct tokens per unit of duration.
+
+    Args:
+        tokens: Iterable of hashable tokens (test_speech passes tokenizer ids).
+        duration: Length of the speech; test_speech passes the value returned
+            by speech_to_text, which is the clip length divided by 60.
+
+    Returns:
+        An integer band from 1 (rate below 40) to 6 (rate of 85 or more).
+
+    Raises:
+        ValueError: If ``duration`` is not positive. An empty clip would
+            otherwise surface as an unexplained ZeroDivisionError.
+    """
+    if duration <= 0:
+        raise ValueError(f"duration must be positive, got {duration!r}")
+    # Only the number of distinct tokens matters, so a set is enough.
+    vocab_rate = len(set(tokens)) / duration
+    # Exclusive upper bounds of bands 1..5; anything at or above the last is 6.
+    for band, upper_bound in enumerate((40, 45, 55, 75, 85), start=1):
+        if vocab_rate < upper_bound:
+            return band
+    return 6
+def word_scoring(tokens, duration):
+    """Score speaking rate as total tokens per unit of duration.
+
+    Args:
+        tokens: Sized collection of tokens (test_speech passes tokenizer ids,
+            so the count is in tokenizer tokens, not whitespace-split words).
+        duration: Length of the speech; test_speech passes the value returned
+            by speech_to_text, which is the clip length divided by 60.
+
+    Returns:
+        An integer band from 1 (rate below 65) to 6 (rate of 175 or more).
+
+    Raises:
+        ValueError: If ``duration`` is not positive. An empty clip would
+            otherwise surface as an unexplained ZeroDivisionError.
+    """
+    if duration <= 0:
+        raise ValueError(f"duration must be positive, got {duration!r}")
+    word_rate = len(tokens) / duration
+    # Exclusive upper bounds of bands 1..5; anything at or above the last is 6.
+    for band, upper_bound in enumerate((65, 90, 117, 142, 175), start=1):
+        if word_rate < upper_bound:
+            return band
+    return 6
+def fluency_scoring(tokenized_sentence, model):
+    """Rate fluency from the entropy of the model's output distribution.
+
+    The "perplexity" used here is ``exp(mean entropy)``, where the entropy is
+    taken over a softmax of ``last_hidden_state`` along its last dimension and
+    averaged over all positions. It is not a likelihood-based perplexity.
+
+    Args:
+        tokenized_sentence: 2-D tensor of token ids; it is sliced as
+            ``[:, :512]`` on retry and passed as both ``input_ids`` and
+            ``decoder_input_ids``.
+        model: Callable accepting ``input_ids`` and ``decoder_input_ids`` and
+            returning an object with a ``last_hidden_state`` tensor.
+
+    Returns:
+        An integer band from 1 (value above 120) to 6 (value of 30 or less).
+        A non-finite NaN value is reported as the lowest band, 1.
+    """
+    def _entropy_perplexity(input_ids):
+        # exp(mean per-position entropy) of the softmaxed hidden states.
+        with torch.no_grad():
+            outputs = model(input_ids=input_ids, decoder_input_ids=input_ids)
+            logits = outputs.last_hidden_state
+            probas = softmax(logits, dim=-1)
+            # log_softmax stays finite where softmax underflows to 0, so the
+            # product is 0 there instead of the NaN that 0 * log(0) yields.
+            log_probas = torch.log_softmax(logits, dim=-1)
+            entropy = torch.sum(-probas * log_probas, dim=-1)
+            return torch.exp(torch.mean(entropy)).item()
+
+    try:
+        perplexity = _entropy_perplexity(tokenized_sentence)
+    except Exception:
+        # Best-effort retry on the first 512 tokens when the full input
+        # cannot be processed; a second failure propagates to the caller.
+        perplexity = _entropy_perplexity(tokenized_sentence[:, :512])
+    # Ascending ladder so that NaN (which fails every comparison) ends in the
+    # lowest band rather than falling through and returning None.
+    if perplexity <= 30: return 6
+    if perplexity <= 50: return 5
+    if perplexity <= 60: return 4
+    if perplexity <= 100: return 3
+    if perplexity <= 120: return 2
+    return 1
+def similarity_scoring(prompt, response):
+    """Score how closely the response matches the prompt.
+
+    Both texts are embedded with the module-level ``model`` and compared with
+    ``util.pytorch_cos_sim``; the resulting similarity is bucketed into bands.
+
+    Args:
+        prompt: Text of the prompt.
+        response: Text of the response (test_speech passes the transcription).
+
+    Returns:
+        An integer band from 1 (similarity below 0.3) to 6 (0.7 or more).
+    """
+    encoded_prompt = model.encode(prompt, convert_to_tensor=True)
+    encoded_response = model.encode(response, convert_to_tensor=True)
+    pairwise = util.pytorch_cos_sim(encoded_prompt, encoded_response)
+    similarity = pairwise[0].item()
+    # Exclusive upper bounds of bands 1..5.
+    for band, ceiling in enumerate((0.3, 0.4, 0.5, 0.6, 0.7), start=1):
+        if similarity < ceiling:
+            return band
+    if similarity >= 0.7:
+        return 6
+def classify(score):
+    """Map a numeric band score to its index and level label.
+
+    Scores at or below 1 map to the lowest label and scores at or above 6 to
+    the highest. Fractional scores in between are rounded to the nearest band
+    with Python's ``round`` (ties go to the even band); previously they fell
+    through every branch and produced ``None``.
+
+    Args:
+        score: Integer or float score.
+
+    Returns:
+        A ``(index, label)`` tuple, with ``index`` in 0..5 and ``label`` one
+        of "A1", "A2", "B1", "B2", "C1", "C2".
+
+    Raises:
+        ValueError: If ``score`` is NaN.
+    """
+    labels = ("A1", "A2", "B1", "B2", "C1", "C2")
+    # Clamp before rounding so infinities are handled like any large score.
+    band = round(min(max(score, 1), 6))
+    index = band - 1
+    return (index, labels[index])
+def speech_to_text(audio):
+    """Transcribe an audio input and report its length divided by 60.
+
+    Args:
+        audio: Audio source accepted by both ``librosa.load`` and the
+            module-level ``pipe`` (the UI supplies a file path).
+
+    Returns:
+        A ``(transcription, duration)`` tuple: the ``"text"`` field of the
+        pipeline output and ``librosa.get_duration(...) / 60.0``.
+    """
+    # Loaded at 16 kHz only to measure the clip; the pipeline is handed the
+    # original ``audio`` argument, not these samples.
+    samples, sample_rate = librosa.load(audio, sr=16000)
+    clip_length = librosa.get_duration(y=samples, sr=sample_rate)
+    text = pipe(audio)["text"]
+    return text, clip_length / 60.0
+def test_speech(prompt, audio):
+    """Rank a spoken response to a prompt by majority vote over sub-scores.
+
+    The audio is transcribed, four band scores (word rate, vocabulary rate,
+    fluency, prompt similarity) are computed, and twelve votes are formed from
+    the individual scores plus rounded averages of selected combinations.
+    Each vote is mapped to a label with ``classify`` and the most frequent
+    label wins; ties go to the label that appears first in the vote list.
+
+    Args:
+        prompt: Prompt text the speaker was responding to.
+        audio: Audio input forwarded to ``speech_to_text``; ``None`` when the
+            user submitted nothing.
+
+    Returns:
+        The winning level label as a string.
+
+    Raises:
+        gr.Error: If no audio was supplied.
+    """
+    # Local import keeps this block self-contained without touching the
+    # file-level import list.
+    from collections import Counter
+
+    if audio is None:
+        # Show a readable message in the UI instead of a failure from deep
+        # inside the audio loader.
+        raise gr.Error("Please record or upload an audio response first.")
+
+    response, duration = speech_to_text(audio)
+    response_tokens = tokenizer.encode(response,
+                                      return_tensors="pt",
+                                      add_special_tokens=True)
+    fluency_score = fluency_scoring(response_tokens, lm)
+    tokens = response_tokens.tolist()[0]
+    vocab_score = vocab_scoring(tokens, duration)
+    word_score = word_scoring(tokens, duration)
+    similarity_score = similarity_scoring(prompt, response)
+    print(f"Fluency Score => {fluency_score}")
+    print(f"Vocab Score => {vocab_score}")
+    print(f"Word Score => {word_score}")
+    print(f"Similarity Score => {similarity_score}")
+    # Votes: the four raw scores, then rounded means of selected pairs,
+    # triples and the full set. The order matters for tie-breaking below.
+    scores = [
+        word_score,
+        vocab_score,
+        fluency_score,
+        similarity_score,
+        round((word_score + vocab_score) / 2),
+        round((word_score + fluency_score) / 2),
+        round((word_score + similarity_score) / 2),
+        round((vocab_score + fluency_score) / 2),
+        round((vocab_score + similarity_score) / 2),
+        round((word_score + vocab_score + fluency_score) / 3),
+        round((word_score + vocab_score + similarity_score) / 3),
+        round((word_score + vocab_score + fluency_score + similarity_score) / 4),
+    ]
+    print(f"Votes =>\t{scores}")
+    # Max Voting
+    preds = [classify(score)[1] for score in scores]
+    # most_common keeps first-encountered order among equal counts, matching
+    # the strict ">" comparison of the previous hand-written loop.
+    return Counter(preds).most_common(1)[0][0]
+# Input components: free-text prompt and the spoken response.
+prompt = gr.Textbox(label="Prompt")
+# type="filepath" is requested so test_speech can hand the value straight to
+# speech_to_text, which passes it to librosa.load and the ASR pipeline.
+audio_response = gr.Audio(type="filepath", label="Audio")
+# Output component showing the label returned by test_speech.
+rank = gr.Textbox(label="Rank (A1-C2)")
+# Wire test_speech(prompt, audio) to the two inputs and the single output.
+iface = gr.Interface(fn=test_speech,
+                     inputs=[prompt, audio_response],
+                     outputs=rank.style(show_copy_button=True),
+                     title="Rank Speech")
 iface.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+gradio==3.23.0
+librosa==0.10.0.post1
+torch==1.13.1
+sentence-transformers==2.2.2
+sentencepiece==0.1.97
+transformers==4.26.1
+tokenizers==0.13.2
+pydub==0.25.1
+ffmpeg==1.4
+numpy==1.23.5
+scipy==1.10.1