Spaces: Running on Zero
Update app.py
app.py
CHANGED
@@ -60,14 +60,16 @@ def generate_response(message, history, enable_reasoning, temperature, top_p, ma
             "top_p": top_p if top_p < 1 else 0.95,
             "do_sample": True,
             "max_new_tokens": max_tokens,
-            "eos_token_id": tokenizer.eos_token_id
+            "eos_token_id": tokenizer.eos_token_id,
+            "use_cache": False  # Disable cache to avoid the error
         }
     else:
         # Greedy search for non-reasoning
         generation_kwargs = {
             "do_sample": False,
             "max_new_tokens": max_tokens,
-            "eos_token_id": tokenizer.eos_token_id
+            "eos_token_id": tokenizer.eos_token_id,
+            "use_cache": False  # Disable cache to avoid the error
         }
 
     # Generate response
@@ -75,15 +77,8 @@ def generate_response(message, history, enable_reasoning, temperature, top_p, ma
     outputs = model.generate(tokenized_chat, **generation_kwargs)
 
     # Decode and extract the assistant's response
-
-
-    # Extract only the new response (after the last user message)
-    # This is a simple approach - you might need to adjust based on the model's output format
-    response_parts = full_response.split(message)
-    if len(response_parts) > 1:
-        response = response_parts[-1].strip()
-    else:
-        response = full_response.strip()
+    generated_tokens = outputs[0][tokenized_chat.shape[-1]:]  # Get only new tokens
+    response = tokenizer.decode(generated_tokens, skip_special_tokens=True)
 
     return response
 
@@ -153,15 +148,19 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
             return history
 
         message = history[-1][0]
-
-
-
-
-
-
-
-
-
+        try:
+            response = generate_response(
+                message,
+                history[:-1],
+                enable_reasoning,
+                temperature,
+                top_p,
+                max_tokens
+            )
+            history[-1][1] = response
+        except Exception as e:
+            history[-1][1] = f"Error generating response: {str(e)}"
+
         return history
 
     msg.submit(
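For context, the substance of this change is twofold: pass use_cache=False (plus a trailing comma after eos_token_id) in the generation kwargs, and decode only the tokens produced after the prompt instead of splitting the decoded string on the user message. Below is a minimal sketch of that generation path. It is illustrative rather than the full app: the function name generate_reply is hypothetical, and the model and tokenizer are assumed to be the transformers objects loaded elsewhere in app.py.

import torch

def generate_reply(model, tokenizer, messages, max_tokens=256, temperature=0.7, top_p=0.95):
    # Hypothetical helper mirroring the pattern in app.py's generate_response.
    # Build the prompt ids from the chat history (list of {"role": ..., "content": ...} dicts).
    tokenized_chat = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors="pt"
    ).to(model.device)

    generation_kwargs = {
        "do_sample": True,
        "temperature": temperature,
        "top_p": top_p if top_p < 1 else 0.95,
        "max_new_tokens": max_tokens,
        "eos_token_id": tokenizer.eos_token_id,
        "use_cache": False,  # workaround carried over from the diff; may not be needed for every model
    }

    with torch.no_grad():
        outputs = model.generate(tokenized_chat, **generation_kwargs)

    # Keep only the newly generated tokens (everything after the prompt), then decode.
    generated_tokens = outputs[0][tokenized_chat.shape[-1]:]
    return tokenizer.decode(generated_tokens, skip_special_tokens=True)

Slicing at tokenized_chat.shape[-1] avoids the failure mode of the removed code, where full_response.split(message) could pick the wrong chunk whenever the user's text also appears inside the model's reply.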