beyoru committed on
Commit 80b54e9 · verified · 1 Parent(s): 31391ab

Update app.py

Files changed (1)
  1. app.py +111 -112

app.py CHANGED
@@ -1,119 +1,118 @@
- import re
- import torch
- from threading import Thread
  import gradio as gr
- from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
- 
- MODEL_ID = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
- CONTEXT_LENGTH = 4096
- 
- # Add special tokens for thinking process
- tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
- tokenizer.add_special_tokens({
-     "additional_special_tokens": ["<think>", "</think>"]
- })
- 
- model = AutoModelForCausalLM.from_pretrained(MODEL_ID)
- model.resize_token_embeddings(len(tokenizer))
- 
- def predict(message, history, show_thinking, system_prompt, temperature, max_new_tokens, top_k, repetition_penalty, top_p):
-     stop_tokens = ["<|endoftext|>", "<|im_end|>", "|im_end|", "</think>"]
-     instruction = f'<|im_start|>system\n{system_prompt}\n<|im_end|>\n'
- 
-     # Format chat history
-     for user, assistant in history:
-         instruction += f'<|im_start|>user\n{user}\n<|im_end|>\n<|im_start|>assistant\n{assistant}\n<|im_end|>\n'
-     instruction += f'<|im_start|>user\n{message}\n<|im_end|>\n<|im_start|>assistant\n'
- 
-     streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
-     enc = tokenizer(instruction, return_tensors="pt", truncation=True, max_length=CONTEXT_LENGTH)
-     input_ids, attention_mask = enc.input_ids, enc.attention_mask
- 
-     generate_kwargs = dict(
-         input_ids=input_ids,
-         attention_mask=attention_mask,
-         streamer=streamer,
-         do_sample=True,
-         temperature=temperature,
-         max_new_tokens=max_new_tokens,
-         top_k=top_k,
-         repetition_penalty=repetition_penalty,
-         top_p=top_p
      )
- 
-     t = Thread(target=model.generate, kwargs=generate_kwargs)
-     t.start()
- 
-     outputs = []
-     thinking_buffer = []
-     in_thinking = False
-     current_chunk = ""
- 
-     for new_token in streamer:
-         current_chunk += new_token
- 
-         # Check for thinking tags
-         if "<think>" in current_chunk and not in_thinking:
-             in_thinking = True
-             pre, _, post = current_chunk.partition("<think>")
-             if pre:
-                 outputs.append(pre)
-                 yield _clean_output("".join(outputs), show_thinking)
-             current_chunk = post
- 
-         if "</think>" in current_chunk and in_thinking:
-             in_thinking = False
-             pre, _, post = current_chunk.partition("</think>")
-             thinking_buffer.append(pre)
-             if show_thinking:
-                 outputs.extend(thinking_buffer)
-             thinking_buffer = []
-             current_chunk = post
- 
-         if in_thinking:
-             thinking_buffer.append(current_chunk)
-             if show_thinking:
-                 outputs.append(current_chunk)
-                 yield _clean_output("".join(outputs), show_thinking)
-             current_chunk = ""
-         else:
-             if current_chunk:
-                 outputs.append(current_chunk)
-                 yield _clean_output("".join(outputs), show_thinking)
-                 current_chunk = ""
- 
- def _clean_output(text: str, show_thinking: bool) -> str:
-     # Remove residual tags and format thinking content
-     text = re.sub(r'\s*<think>\s*', '\n\n*Thinking:* ', text)
-     text = re.sub(r'\s*</think>\s*', ' ', text)
-     text = re.sub(r'(\*Thinking:\*)(?! )', r'\1 ', text)
-     return text.strip()
- 
- # Create interface with toggle
- gr.ChatInterface(
-     predict,
-     additional_inputs=[
-         gr.Checkbox(value=True, label="🔍 Show Thinking Process"),
-         gr.Textbox(
-             "You are an AI assistant. First analyze requests using <think> tags, then provide answers. "
-             "Put all reasoning between <think> and </think> tags.",
-             label="System Prompt"
-         ),
-         gr.Slider(0, 1, 0.6, label="🌡️ Temperature"),
-         gr.Slider(0, 4096, 512, label="📏 Max New Tokens"),
-         gr.Slider(1, 80, 40, label="🎛️ Top K"),
-         gr.Slider(0.1, 2.0, 1.1, label="🔄 Repetition Penalty"),
-         gr.Slider(0, 1, 0.95, label="🧮 Top P"),
-     ],
-     css="""
-     .thinking {
-         color: #666;
-         font-style: italic;
-         border-left: 3px solid #ddd;
-         padding-left: 1em;
-         margin: 0.5em 0;
-     }
-     """,
-     title="DeepSeek AI Assistant with Reasoning",
-     description="Toggle the 'Show Thinking Process' checkbox to view/hide the model's internal reasoning"
- ).queue().launch()
  import gradio as gr
+ from huggingface_hub import InferenceClient
+ import string
+ import numpy as np
+ from transformers import AutoTokenizer
+ import onnxruntime as ort
+ import os
+ 
+ # Initialize client and models
+ client = InferenceClient(api_key=os.environ.get('HF_TOKEN'))
+ 
+ # Constants for EOU calculation
+ PUNCS = string.punctuation.replace("'", "")
+ MAX_HISTORY = 4
+ MAX_HISTORY_TOKENS = 512
+ EOU_THRESHOLD = 0.5
+ 
+ # Initialize tokenizer and ONNX session
+ HG_MODEL = "livekit/turn-detector"
+ ONNX_FILENAME = "model_quantized.onnx"
+ tokenizer = AutoTokenizer.from_pretrained(HG_MODEL)
+ onnx_session = ort.InferenceSession(ONNX_FILENAME, providers=["CPUExecutionProvider"])
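+ # (model_quantized.onnx is assumed to sit next to app.py; the session load
+ # fails at startup if the quantized turn-detector export is missing.)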
+ 
+ # Helper functions for EOU
+ def softmax(logits):
+     # Subtract the max logit for numerical stability before exponentiating
+     exp_logits = np.exp(logits - np.max(logits))
+     return exp_logits / np.sum(exp_logits)
+ 
+ def normalize_text(text):
+     # Lowercase, strip punctuation (keeping apostrophes), collapse whitespace
+     def strip_puncs(text):
+         return text.translate(str.maketrans("", "", PUNCS))
+     return " ".join(strip_puncs(text).lower().split())
+ 
+ def format_chat_ctx(chat_ctx):
+     new_chat_ctx = []
+     for msg in chat_ctx:
+         if msg["role"] in ("user", "assistant"):
+             content = normalize_text(msg["content"])
+             if content:
+                 # Copy the message so the global history is not mutated
+                 new_chat_ctx.append({"role": msg["role"], "content": content})
+     convo_text = tokenizer.apply_chat_template(
+         new_chat_ctx, add_generation_prompt=False, add_special_tokens=False, tokenize=False
+     )
+     # Cut the text at the last <|im_end|> so the final turn stays open
+     ix = convo_text.rfind("<|im_end|>")
+     return convo_text[:ix]
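+ 
+ # The detector scores end-of-utterance as the probability that the next token
+ # after the open-ended formatted history is <|im_end|>.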
+ def calculate_eou(chat_ctx, session):
+     # Score only the most recent turns
+     formatted_text = format_chat_ctx(chat_ctx[-MAX_HISTORY:])
+     inputs = tokenizer(
+         formatted_text,
+         return_tensors="np",
+         truncation=True,
+         max_length=MAX_HISTORY_TOKENS,
      )
+     input_ids = np.array(inputs["input_ids"], dtype=np.int64)
+     outputs = session.run(["logits"], {"input_ids": input_ids})
+     logits = outputs[0][0, -1, :]
+     probs = softmax(logits)
+     eou_token_id = tokenizer.encode("<|im_end|>")[-1]
+     return probs[eou_token_id]
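+ 
+ # Intuition: an unfinished fragment like "what is the capital of" would
+ # typically score below EOU_THRESHOLD, while a complete question scores higher.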
+ 
+ # Chatbot logic with EOU
+ messages = []
+ 
+ def chatbot(user_input):
+     global messages
+ 
+     # Exit condition
+     if user_input.lower() == "exit":
+         messages = []  # Reset conversation history
+         # chatbot() is a generator, so the farewell must be yielded, not returned
+         yield "Chat ended. Refresh the page to start again."
+         return
+ 
+     # Add user message to conversation history
+     messages.append({"role": "user", "content": user_input})
+ 
+     # Calculate EOU to determine whether the user has finished typing
+     eou_prob = calculate_eou(messages, onnx_session)
+     if eou_prob < EOU_THRESHOLD:
+         yield "[I'm waiting for you to complete the sentence...]"
+         return
+ 
+     # Stream the chatbot's response
+     stream = client.chat.completions.create(
+         model="deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
+         messages=messages,
+         temperature=0.5,
+         max_tokens=2048,
+         top_p=0.7,
+         stream=True
+     )
+ 
+     bot_response = ""
+     for chunk in stream:
+         # Delta content can be None on some stream chunks; skip those
+         if chunk.choices[0].delta.content:
+             bot_response += chunk.choices[0].delta.content
+         yield bot_response
+ 
+     # Add the final bot response to conversation history
+     messages.append({"role": "assistant", "content": bot_response})
+ 
+ # Create Gradio interface
+ with gr.Blocks(theme='darkdefault') as demo:
+     gr.Markdown("""# Chat with DeepSeek-R1
+     Type your message below to interact with the chatbot. Type "exit" to end the conversation.
+     """)
+ 
+     with gr.Row():
+         with gr.Column():
+             user_input = gr.Textbox(label="Your Message", placeholder="Type your message here...")
+             submit_button = gr.Button("Send")
+         with gr.Column():
+             chat_output = gr.Textbox(label="Chatbot Response", interactive=False)
+ 
+     # Define interactions
+     submit_button.click(chatbot, inputs=[user_input], outputs=[chat_output])
+ 
+ # Launch the app
+ demo.launch()
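A quick way to sanity-check the turn detector outside of Gradio is the sketch below. It reuses the same assumptions as the committed app.py (the livekit/turn-detector tokenizer, a local model_quantized.onnx file, and an ONNX output named "logits"); the helper name eou_probability and the sample sentences are illustrative, not part of the commit.

```python
import numpy as np
import onnxruntime as ort
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("livekit/turn-detector")
session = ort.InferenceSession("model_quantized.onnx", providers=["CPUExecutionProvider"])

def eou_probability(messages):
    # Same recipe as app.py: chat-format the history, cut at the last
    # <|im_end|> so the final turn stays open, then read the probability
    # of <|im_end|> as the next token.
    text = tokenizer.apply_chat_template(
        messages, add_generation_prompt=False, add_special_tokens=False, tokenize=False
    )
    text = text[: text.rfind("<|im_end|>")]
    input_ids = tokenizer(
        text, return_tensors="np", truncation=True, max_length=512
    )["input_ids"].astype(np.int64)
    logits = session.run(["logits"], {"input_ids": input_ids})[0][0, -1, :]
    exp = np.exp(logits - np.max(logits))  # stable softmax
    probs = exp / exp.sum()
    return probs[tokenizer.encode("<|im_end|>")[-1]]

# An unfinished fragment should score lower than a complete question
print(eou_probability([{"role": "user", "content": "what is the capital of"}]))
print(eou_probability([{"role": "user", "content": "what is the capital of france"}]))
```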