Commit daf973d · verified · committed by akhaliq (HF Staff) · 1 Parent(s): d2a0ec0

Update app.py

Files changed (1):
  1. app.py +19 -20
app.py CHANGED
@@ -60,14 +60,16 @@ def generate_response(message, history, enable_reasoning, temperature, top_p, ma
             "top_p": top_p if top_p < 1 else 0.95,
             "do_sample": True,
             "max_new_tokens": max_tokens,
-            "eos_token_id": tokenizer.eos_token_id
+            "eos_token_id": tokenizer.eos_token_id,
+            "use_cache": False  # Disable cache to avoid the error
         }
     else:
         # Greedy search for non-reasoning
         generation_kwargs = {
             "do_sample": False,
             "max_new_tokens": max_tokens,
-            "eos_token_id": tokenizer.eos_token_id
+            "eos_token_id": tokenizer.eos_token_id,
+            "use_cache": False  # Disable cache to avoid the error
         }
 
     # Generate response
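Note on the hunk above: use_cache=False turns off the key/value cache during decoding, which is slower but sidesteps the cache error mentioned in the inline comment, and keeping eos_token_id lets generation stop at the end-of-sequence token. As a rough, self-contained sketch of how kwargs like these reach model.generate (the model name, sampling values, and message below are placeholders, not taken from this Space):

from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "some-org/some-chat-model"  # placeholder, not the model this Space loads
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto")

generation_kwargs = {
    "do_sample": True,
    "temperature": 0.7,                       # placeholder value
    "top_p": 0.95,                            # placeholder value
    "max_new_tokens": 256,                    # placeholder value
    "eos_token_id": tokenizer.eos_token_id,   # stop at end-of-sequence
    "use_cache": False,                       # disable the KV cache, as in the diff above
}

messages = [{"role": "user", "content": "Hello"}]
tokenized_chat = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
)
outputs = model.generate(tokenized_chat, **generation_kwargs)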
@@ -75,15 +77,8 @@ def generate_response(message, history, enable_reasoning, temperature, top_p, ma
     outputs = model.generate(tokenized_chat, **generation_kwargs)
 
     # Decode and extract the assistant's response
-    full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-
-    # Extract only the new response (after the last user message)
-    # This is a simple approach - you might need to adjust based on the model's output format
-    response_parts = full_response.split(message)
-    if len(response_parts) > 1:
-        response = response_parts[-1].strip()
-    else:
-        response = full_response.strip()
+    generated_tokens = outputs[0][tokenized_chat.shape[-1]:]  # Get only new tokens
+    response = tokenizer.decode(generated_tokens, skip_special_tokens=True)
 
     return response
 
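Note on the hunk above: slicing the output at the prompt length replaces the old approach of splitting the decoded string on the user's message, which could mis-split whenever the message text reappears in the model's output. For decoder-only models, generate() returns the prompt tokens followed by the newly generated ones, so (continuing the sketch after the previous hunk, same tokenized_chat, outputs, and tokenizer):

prompt_len = tokenized_chat.shape[-1]          # number of prompt tokens
generated_tokens = outputs[0][prompt_len:]     # keep only the newly generated part
response = tokenizer.decode(generated_tokens, skip_special_tokens=True)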
 
@@ -153,15 +148,19 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
             return history
 
         message = history[-1][0]
-        response = generate_response(
-            message,
-            history[:-1],
-            enable_reasoning,
-            temperature,
-            top_p,
-            max_tokens
-        )
-        history[-1][1] = response
+        try:
+            response = generate_response(
+                message,
+                history[:-1],
+                enable_reasoning,
+                temperature,
+                top_p,
+                max_tokens
+            )
+            history[-1][1] = response
+        except Exception as e:
+            history[-1][1] = f"Error generating response: {str(e)}"
+
         return history
 
     msg.submit(
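Note on the hunk above: wrapping the call in try/except surfaces a generation failure as a chat message instead of crashing the callback and leaving the turn blank. A minimal, hypothetical sketch of the same pattern with the model call stubbed out (the callback name bot and the forced error are illustrative only, not the Space's real code):

def generate_response(message, history, enable_reasoning, temperature, top_p, max_tokens):
    raise RuntimeError("model not loaded")  # stand-in for a real generation failure

def bot(history, enable_reasoning=False, temperature=0.7, top_p=0.95, max_tokens=256):
    if not history:
        return history
    message = history[-1][0]
    try:
        history[-1][1] = generate_response(
            message, history[:-1], enable_reasoning, temperature, top_p, max_tokens
        )
    except Exception as e:
        # show the failure in the chat instead of raising inside the Gradio callback
        history[-1][1] = f"Error generating response: {str(e)}"
    return history

print(bot([["Hello", None]]))  # -> [['Hello', 'Error generating response: model not loaded']]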
 