beyoru committed on
Commit 9015f33 · verified · 1 Parent(s): 766b6ce

Update app.py

Files changed (1)
  1. app.py +13 -27
app.py CHANGED
@@ -6,34 +6,22 @@ import spaces
 import gradio as gr
 from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 
-#subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
-
 MODEL_ID = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
-CHAT_TEMPLATE = "َAuto"
 MODEL_NAME = MODEL_ID.split("/")[-1]
 CONTEXT_LENGTH = 4096
 
-
 def predict(message, history, system_prompt, temperature, max_new_tokens, top_k, repetition_penalty, top_p):
-    # Format history with a given chat template
-
-
-    stop_tokens = ["<|endoftext|>", "<|im_end|>","|im_end|"]
+    stop_tokens = ["<|endoftext|>", "<|im_end|>", "|im_end|"]
     instruction = '<|im_start|>system\n' + system_prompt + '\n<|im_end|>\n'
+
     for user, assistant in history:
         instruction += f'<|im_start|>user\n{user}\n<|im_end|>\n<|im_start|>assistant\n{assistant}\n<|im_end|>\n'
     instruction += f'<|im_start|>user\n{message}\n<|im_end|>\n<|im_start|>assistant\n'
 
-    print(instruction)
-
     streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
-    enc = tokenizer(instruction, return_tensors="pt", padding=True, truncation=True)
+    enc = tokenizer(instruction, return_tensors="pt", truncation=True, max_length=CONTEXT_LENGTH)
     input_ids, attention_mask = enc.input_ids, enc.attention_mask
 
-    if input_ids.shape[1] > CONTEXT_LENGTH:
-        input_ids = input_ids[:, -CONTEXT_LENGTH:]
-        attention_mask = attention_mask[:, -CONTEXT_LENGTH:]
-
     generate_kwargs = dict(
         input_ids=input_ids,
         attention_mask=attention_mask,
@@ -45,32 +33,30 @@ def predict(message, history, system_prompt, temperature, max_new_tokens, top_k,
         repetition_penalty=repetition_penalty,
         top_p=top_p
     )
+
     t = Thread(target=model.generate, kwargs=generate_kwargs)
     t.start()
+
     outputs = []
     for new_token in streamer:
-        outputs.append(new_token)
         if new_token in stop_tokens:
-
-            break
-        yield "".join(outputs)
-
+            break  # Stop generation but don't add the stop token
+        outputs.append(new_token)
+        yield "".join(outputs).replace("<|im_end|>", "")  # Ensure no leftover stop tokens
 
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
-model = AutoModelForCausalLM.from_pretrained(
-    MODEL_ID)
+model = AutoModelForCausalLM.from_pretrained(MODEL_ID)
 
-# Create Gradio interface
 gr.ChatInterface(
    predict,
-
-    additional_inputs_accordion=gr.Accordion(label="Parameters", open=False),
    additional_inputs=[
-        gr.Textbox("You are a useful assistant. first recognize user request and then reply carfuly and thinking", label="System prompt"),
+        gr.Textbox("You are a helpful assistant. Format responses clearly using natural Markdown formatting where appropriate.",
+                   label="System prompt"),
        gr.Slider(0, 1, 0.6, label="Temperature"),
        gr.Slider(0, 4096, 512, label="Max new tokens"),
        gr.Slider(1, 80, 40, label="Top K sampling"),
        gr.Slider(0, 2, 1.1, label="Repetition penalty"),
        gr.Slider(0, 1, 0.95, label="Top P sampling"),
    ],
-).queue().launch()
+    css=".message { white-space: pre-wrap; }",  # Preserve newlines
+).queue().launch()
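
One behavioral note on the new tokenizer call: truncation=True with max_length truncates from the right by default, so an over-long chat history would lose its most recent turns, whereas the removed manual slice input_ids[:, -CONTEXT_LENGTH:] kept the newest tokens. A minimal sketch of the difference, and of the standard truncation_side switch that restores keep-the-tail behavior (the prompt string and variable names are illustrative):

from transformers import AutoTokenizer

CONTEXT_LENGTH = 4096
tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B")

long_prompt = "word " * 8000  # illustrative prompt, longer than CONTEXT_LENGTH tokens

# New behavior in this commit: default truncation drops tokens from the
# right, i.e. the oldest part of the prompt is kept.
head = tokenizer(long_prompt, return_tensors="pt", truncation=True, max_length=CONTEXT_LENGTH)

# Old behavior (the removed slice): keep the last CONTEXT_LENGTH tokens.
full = tokenizer(long_prompt, return_tensors="pt")
tail_ids = full.input_ids[:, -CONTEXT_LENGTH:]

# Equivalent keep-the-tail behavior via the tokenizer itself:
tokenizer.truncation_side = "left"
tail = tokenizer(long_prompt, return_tensors="pt", truncation=True, max_length=CONTEXT_LENGTH)
assert tail.input_ids.shape[1] <= CONTEXT_LENGTH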
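
For context on the scaffolding these hunks sit in: predict uses the usual TextIteratorStreamer pattern, where model.generate blocks until completion, so it runs on a worker thread while the generator consumes the streamer and yields a growing partial string to Gradio. A stripped-down sketch of that pattern outside Gradio (the prompt and generation settings are illustrative):

from threading import Thread
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

model_id = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

inputs = tokenizer("Hello, how are you?", return_tensors="pt")
# skip_prompt=True means only newly generated text is streamed back.
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# generate() blocks until it finishes, so run it on a worker thread and
# consume the streamer (a blocking iterator over decoded text chunks) here.
thread = Thread(target=model.generate,
                kwargs=dict(**inputs, streamer=streamer, max_new_tokens=64))
thread.start()

pieces = []
for chunk in streamer:
    pieces.append(chunk)
    print("".join(pieces))  # the app yields this partial string to Gradio
thread.join()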