# Hugging Face Space: CEFR speech-ranking demo.
# (Web-page scrape residue — Space status, blame hashes, and the line-number
# gutter — removed; it was not part of the program and broke parsing.)
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer, AutoModel
from torch.nn.functional import softmax
from transformers import pipeline
import time, librosa, torch, io
from pydub import AudioSegment
import gradio as gr
import numpy as np
# Run all models on CPU.
device = 'cpu'
# CEFR proficiency bands, least to most proficient.
# NOTE(review): `cols` is not referenced anywhere in this file — confirm it
# is used elsewhere or remove it.
cols = ['A1','A2','B1','B2','C1','C2']
# T5 tokenizer + encoder/decoder used as a pseudo-perplexity language model.
tokenizer = AutoTokenizer.from_pretrained('t5-base')
lm = AutoModel.from_pretrained('t5-base').to(device)
# Sentence embedder for prompt/response similarity scoring.
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2').to(device)
# Whisper ASR pipeline; 30 s chunking handles arbitrarily long recordings.
pipe = pipeline("automatic-speech-recognition",
model="openai/whisper-base.en",
chunk_length_s=30, device="cpu")
def vocab_scoring(tokens, duration):
    """Score vocabulary richness (1-6) from distinct tokens per minute.

    Args:
        tokens: sequence of token ids from the transcription.
        duration: recording length in minutes (must be > 0).

    Returns:
        int in 1..6 (CEFR-style band; higher = richer vocabulary).
    """
    # Only the number of distinct tokens matters, so a set suffices —
    # the original per-token counts were built but never read.
    vocab_rate = len(set(tokens)) / duration
    # Ascending threshold table; anything >= 85 falls through to band 6.
    for upper, band in ((40, 1), (45, 2), (55, 3), (75, 4), (85, 5)):
        if vocab_rate < upper:
            return band
    return 6
def word_scoring(tokens, duration):
    """Score speech rate (1-6) from tokens spoken per minute.

    Args:
        tokens: sequence of token ids from the transcription.
        duration: recording length in minutes (must be > 0).

    Returns:
        int in 1..6 — higher bands mean faster speech.
    """
    rate = len(tokens) / duration
    # Walk the ascending band boundaries; >= 175 falls through to band 6.
    for upper, band in ((65, 1), (90, 2), (117, 3), (142, 4), (175, 5)):
        if rate < upper:
            return band
    return 6
def fluency_scoring(tokenized_sentence, model):
    """Score fluency (1-6) from a pseudo-perplexity of the LM's output.

    Feeds the tokens as both encoder and decoder input, softmaxes the
    last hidden state, and exponentiates its mean entropy as a perplexity
    proxy. Lower perplexity maps to a higher band.

    Args:
        tokenized_sentence: (1, seq_len) LongTensor of token ids.
        model: encoder-decoder model exposing `last_hidden_state`.

    Returns:
        int in 1..6.
    """
    def _perplexity(ids):
        # exp(mean entropy) over the softmaxed hidden states.
        with torch.no_grad():
            outputs = model(input_ids=ids, decoder_input_ids=ids)
        probas = softmax(outputs.last_hidden_state, dim=-1)
        return torch.exp(torch.mean(torch.sum(-probas * torch.log(probas), dim=-1)))

    try:
        perplexity = _perplexity(tokenized_sentence)
    except Exception:
        # Narrowed from a bare `except:` (which also swallowed
        # KeyboardInterrupt/SystemExit). The expected failure is a
        # too-long sequence, so retry truncated to 512 tokens.
        perplexity = _perplexity(tokenized_sentence[:, :512])
    # Descending perplexity thresholds; <= 30 falls through to band 6.
    for lower, band in ((120, 1), (100, 2), (60, 3), (50, 4), (30, 5)):
        if perplexity > lower:
            return band
    return 6
def similarity_scoring(prompt, response):
    """Score prompt/response topical relevance (1-6).

    Embeds both texts with the module-level sentence transformer and
    bands their cosine similarity: < 0.3 -> 1, ..., >= 0.7 -> 6.
    """
    emb_prompt = model.encode(prompt, convert_to_tensor=True)
    emb_response = model.encode(response, convert_to_tensor=True)
    cosine = util.pytorch_cos_sim(emb_prompt, emb_response)[0].item()
    # Band boundaries ascend in 0.1 steps starting at 0.3.
    for band, upper in enumerate((0.3, 0.4, 0.5, 0.6, 0.7), start=1):
        if cosine < upper:
            return band
    return 6
def classify(score):
    """Map an integer score to its (index, CEFR label) pair.

    Scores below 1 clamp to A1 and above 6 clamp to C2.
    """
    labels = ("A1", "A2", "B1", "B2", "C1", "C2")
    if score <= 1:
        return (0, labels[0])
    if score >= 6:
        return (5, labels[5])
    # Scores 2..5 map directly onto the label tuple.
    return (score - 1, labels[score - 1])
def speech_to_text(audio):
    """Transcribe an audio file and return (text, duration in minutes).

    Loads the file at 16 kHz to measure its length, then runs the
    module-level Whisper pipeline on the original path for the text.
    """
    waveform, sample_rate = librosa.load(audio, sr=16000)
    minutes = librosa.get_duration(y=waveform, sr=sample_rate) / 60.0
    text = pipe(audio)["text"]
    return text, minutes
def test_speech(prompt, audio):
    """Rank a spoken response on the CEFR scale (A1-C2).

    Transcribes the audio, computes fluency, vocabulary, speech-rate and
    prompt-similarity scores, then lets the raw scores plus a set of
    rounded pairwise/grouped means vote; the most frequent band wins.

    Args:
        prompt: the question/topic the speaker was asked to address.
        audio: path to the recorded response.

    Returns:
        str: winning CEFR label ("A1".."C2").
    """
    response, duration = speech_to_text(audio)
    response_tokens = tokenizer.encode(response,
                                       return_tensors="pt",
                                       add_special_tokens=True)
    fluency_score = fluency_scoring(response_tokens, lm)
    tokens = response_tokens.tolist()[0]
    vocab_score = vocab_scoring(tokens, duration)
    word_score = word_scoring(tokens, duration)
    similarity_score = similarity_scoring(prompt, response)
    print(f"Fluency Score => {fluency_score}")
    print(f"Vocab Score => {vocab_score}")
    print(f"Word Score => {word_score}")
    print(f"Similarity Score => {similarity_score}")
    # Voters: the four raw scores plus rounded means of score subsets
    # (list literal replaces twelve sequential appends).
    scores = [
        word_score,
        vocab_score,
        fluency_score,
        similarity_score,
        round((word_score + vocab_score) / 2),
        round((word_score + fluency_score) / 2),
        round((word_score + similarity_score) / 2),
        round((vocab_score + fluency_score) / 2),
        round((vocab_score + similarity_score) / 2),
        round((word_score + vocab_score + fluency_score) / 3),
        round((word_score + vocab_score + similarity_score) / 3),
        round((word_score + vocab_score + fluency_score + similarity_score) / 4),
    ]
    print(f"Votes =>\t{scores}")
    # Max voting: most frequent label wins; `max` over insertion-ordered
    # dict keys keeps the original first-seen tie-breaking behavior.
    votes = {}
    for score in scores:
        label = classify(score)[1]
        votes[label] = votes.get(label, 0) + 1
    return max(votes, key=votes.get)
# --- Gradio UI wiring ---
# (Removed a stray " |" scrape artifact after `iface.launch()` that made
# the file a syntax error.)
prompt = gr.Textbox(label="Prompt")
# NOTE(review): `Audio(source=...)` and `.style(...)` are pre-4.x Gradio
# APIs — pin `gradio<4` or migrate if upgrading.
audio_response = gr.Audio(source="microphone", type="filepath", label="Audio")
rank = gr.Textbox(label="Rank (A1-C2)")
iface = gr.Interface(fn=test_speech,
                     inputs=[prompt, audio_response],
                     outputs=rank.style(show_copy_button=True),
                     title="Rank Speech")
iface.launch()