Spaces:

beyoru
/

qew

Running

App Files Community

beyoru committed on Jan 16

Commit

1e88a5e

verified ·

1 Parent(s): 8eb4c94

Update app.py

Browse files

Files changed (1) hide show

app.py +65 -61

app.py CHANGED Viewed

@@ -1,34 +1,43 @@
 import gradio as gr
 from transformers import AutoTokenizer, AutoModelForCausalLM
 import torch
 import numpy as np
-# Load tokenizer and model for EOU detection
 tokenizer = AutoTokenizer.from_pretrained("livekit/turn-detector")
 model = AutoModelForCausalLM.from_pretrained("livekit/turn-detector")
-# Define function to calculate softmax
def _softmax(logits: np.ndarray, axis=None) -> np.ndarray:
    """Return the softmax of ``logits``.

    Args:
        logits: Array-like of raw scores. Lists and integer arrays are
            accepted and converted with ``np.asarray``.
        axis: Axis to normalise over. ``None`` (the default) normalises
            over every element, which is what the single-vector call in
            this file needs; pass ``-1`` to normalise each row of a batch.

    Returns:
        An array with the same shape as ``logits`` whose entries along
        ``axis`` sum to 1. Empty input yields an empty float array.
    """
    values = np.asarray(logits)
    if values.size == 0:
        # np.max raises on an empty array; there is nothing to normalise.
        return values.astype(float)
    # Subtracting the maximum keeps np.exp from overflowing on large logits
    # without changing the result.
    shifted = values - np.max(values, axis=axis, keepdims=True)
    exp_values = np.exp(shifted)
    return exp_values / np.sum(exp_values, axis=axis, keepdims=True)
-# Define the EOU probability calculation
def get_eou_probability(chat_ctx: list) -> float:
    """Calculate the probability of End of Utterance (EOU).

    The ``content`` fields of ``chat_ctx`` are joined with spaces, fed to
    the module-level ``model``, and the probability assigned to the
    ``<|im_end|>`` token at the final position is returned.

    Args:
        chat_ctx: List of message dicts; only the ``"content"`` key is read.

    Returns:
        The probability as a plain Python ``float``. ``0.0`` is returned
        when there is no text to score.
    """
    text = " ".join(msg["content"] for msg in chat_ctx)
    if not text.strip():
        return 0.0

    # Keep the LAST 512 tokens: the end of the utterance is what the score
    # depends on, so overflow must be dropped from the front, not the back.
    encoded = tokenizer(text, return_tensors="pt")
    inputs = {name: tensor[:, -512:] for name, tensor in encoded.items()}

    with torch.no_grad():
        logits = model(**inputs).logits[0, -1, :]  # Logits at the last position
    probs = torch.softmax(logits.float(), dim=-1)

    # Assuming <|im_end|> is the EOU marker; look its id up directly instead
    # of relying on the position of a token inside an encoded sequence.
    eou_token_id = tokenizer.convert_tokens_to_ids("<|im_end|>")
    return float(probs[eou_token_id])
-# Define the main response function for Gradio
 def respond(
     message,
     history: list[tuple[str, str]],
@@ -36,47 +45,46 @@ def respond(
     max_tokens,
     temperature,
     top_p,
-    eou_threshold: float = 0.2  # Probability threshold to stop or transition the conversation
 ):
-    # Keep only the last 4 user inputs and add the current user input
-    user_history = [msg[0] for msg in history if msg[0]]  # Extract user inputs from history
-    user_history = user_history[-4:]  # Keep the last 4 user inputs
-    user_history.append(message)  # Add the current message
-    # Check if the EOU probability is high for the combined history (previous 4 + current input)
-    chat_ctx = [{"role": "user", "content": msg} for msg in user_history]
-    eou_probability = get_eou_probability(chat_ctx)
     print(eou_probability)
-    # If the EOU probability is higher than the threshold, wait for the user to complete their sentence
-    if eou_probability > eou_threshold:
-        return f"EOU probability is high: {eou_probability:.2f}. Please complete your sentence."
-    # Otherwise, generate the model's response
-    inputs = tokenizer(system_message + "\n" + message, return_tensors="pt", max_length=max_tokens, truncation=True)
-    # Set attention_mask to avoid issues with padding and make sure the model uses the correct pad_token_id
-    attention_mask = inputs['attention_mask'] if 'attention_mask' in inputs else None
-    pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id
-    response = ""
-    generated_output = model.generate(
-        inputs['input_ids'],
-        attention_mask=attention_mask,
-        max_length=max_tokens,
-        do_sample=True,  # Enable sampling
-        temperature=temperature,
-        top_p=top_p,
-        pad_token_id=pad_token_id
-    )
-    response = tokenizer.decode(generated_output[0], skip_special_tokens=True)
-    return response
-# Gradio interface setup
 demo = gr.ChatInterface(
     respond,
     additional_inputs=[
-        gr.Textbox(value="You are a assistant call Mei", label="System message"),
         gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
         gr.Slider(
@@ -87,14 +95,10 @@ demo = gr.ChatInterface(
             label="Top-p (nucleus sampling)",
         ),
         gr.Slider(
-            minimum=0.0,
-            maximum=1.0,
-            value=0.9,
-            step=0.01,
-            label="EOU Probability Threshold"
-        ),
     ],
 )
-# Launch Gradio with public link sharing
-demo.launch(share=True)

 import gradio as gr
+from huggingface_hub import InferenceClient
 from transformers import AutoTokenizer, AutoModelForCausalLM
 import torch
 import numpy as np
+# Load Inference Client for the response model
+client = InferenceClient("Qwen/Qwen2.5-3B-Instruct")
+# Load tokenizer and model for the EOU detection
 tokenizer = AutoTokenizer.from_pretrained("livekit/turn-detector")
 model = AutoModelForCausalLM.from_pretrained("livekit/turn-detector")
+# Function to compute EOU probability
def compute_eou_probability(chat_ctx: list[dict[str, str]], max_tokens: int = 512) -> float:
    """Return the probability that the last message in ``chat_ctx`` is complete.

    The conversation is rendered with the tokenizer's chat template, the
    final ``<|im_end|>`` marker is removed, and the module-level ``model``
    is asked how likely that marker is to come next.

    Args:
        chat_ctx: Messages as ``{"role": ..., "content": ...}`` dicts, oldest
            first. A default system message is prepended when the first
            entry is not already a system message.
        max_tokens: Maximum number of (most recent) tokens fed to the model.

    Returns:
        Probability of the ``<|im_end|>`` token at the final position.
    """
    eou_token = "<|im_end|>"

    # Copy so the caller's list is never mutated.
    conversation = list(chat_ctx)
    if not conversation or conversation[0].get("role") != "system":
        conversation.insert(0, {"role": "system", "content": "Assistant ready to help."})

    # A tokenizer call needs text, not a list of message dicts, so render the
    # conversation first.
    text = tokenizer.apply_chat_template(
        conversation, add_generation_prompt=False, tokenize=False
    )
    # Strip the closing marker of the last message: the model must predict
    # it, not read it.
    marker_at = text.rfind(eou_token)
    if marker_at != -1:
        text = text[:marker_at]

    # Keep the newest tokens when the context is too long; the end of the
    # conversation is what determines the score.
    keep = max(1, int(max_tokens))
    encoded = tokenizer(text, add_special_tokens=False, return_tensors="pt")
    inputs = {name: tensor[:, -keep:] for name, tensor in encoded.items()}

    with torch.no_grad():
        outputs = model(**inputs)
    # Distribution over the next token after the last input position.
    logits = outputs.logits[0, -1, :]
    probabilities = torch.nn.functional.softmax(logits, dim=-1)

    # Direct vocabulary lookup; indexing into encode() output depends on
    # which special tokens the tokenizer adds.
    eou_token_id = tokenizer.convert_tokens_to_ids(eou_token)
    return probabilities[eou_token_id].item()
+# Respond function with EOU checking logic
 def respond(
     message,
     history: list[tuple[str, str]],
     max_tokens,
     temperature,
     top_p,
+    eou_threshold: float = 0.2,  # Default EOU threshold
 ):
+    messages = [{"role": "system", "content": system_message}]
+    for val in history:
+        if val[0]:
+            messages.append({"role": "user", "content": val[0]})
+        if val[1]:
+            messages.append({"role": "assistant", "content": val[1]})
+    # Compute EOU probability before responding
+    eou_probability = compute_eou_probability(messages, max_tokens=max_tokens)
     print(eou_probability)
+    # Only respond if EOU probability exceeds threshold
+    if eou_probability >= eou_threshold:
+        # Prepare message for assistant response
+        messages.append({"role": "user", "content": message})
+        response = ""
+        for message in client.chat_completion(
+            messages,
+            max_tokens=max_tokens,
+            stream=True,
+            temperature=temperature,
+            top_p=top_p,
+        ):
+            token = message.choices[0].delta.content
+            response += token
+            yield response
+    else:
+        # Let the user continue typing if the EOU probability is low
+        yield "Waiting for user to finish... Please continue."
+        print("Waiting for user to finish... Please continue.")
+# Gradio UI
 demo = gr.ChatInterface(
     respond,
     additional_inputs=[
+        gr.Textbox(value="Bạn là một trợ lý ảo", label="System message"),
         gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
         gr.Slider(
             label="Top-p (nucleus sampling)",
         ),
         gr.Slider(
+            minimum=0.0, maximum=1.0, value=0.7, step=0.05, label="EOU Threshold"
+        ),  # Add EOU threshold slider
     ],
 )
+if __name__ == "__main__":
+    demo.launch()