Tonic committed
Commit b0c4a3f · Parent: 1c47659

loads the lora

Files changed (3)
  1. README.md +5 -3
  2. app.py +101 -30
  3. requirements.txt +3 -1
README.md CHANGED
@@ -1,13 +1,15 @@
 ---
-title: Gpt Oss 20b Demo
-emoji: 💬
+title: GPT-OSS-20B Multilingual Reasoner Demo
+emoji: 🌟
 colorFrom: blue
 colorTo: pink
 sdk: gradio
 sdk_version: 5.40.0
 app_file: app.py
 pinned: false
-short_description: Try out OpenAI's gpt-oss-20b model
+short_description: Try out Tonic's GPT-OSS-20B Multilingual Reasoner LoRA adapter
 ---
 
+This demo showcases the GPT-OSS-20B model fine-tuned with LoRA for enhanced multilingual reasoning capabilities. The model is based on OpenAI's GPT-OSS-20B base model with a LoRA adapter from Tonic.
+
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
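As context for the new description: the Space now builds its model in two steps, loading the openai/gpt-oss-20b base weights and then attaching the Tonic/gpt-oss-20b-multilingual-reasoner LoRA adapter with peft. A minimal sketch of that setup outside the Gradio app might look like the following; the prompt text and generation settings here are illustrative assumptions, not part of the commit.

    # Minimal sketch: base model + LoRA adapter, mirroring the loading code in app.py below.
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer
    from peft import PeftModel

    base = AutoModelForCausalLM.from_pretrained("openai/gpt-oss-20b", torch_dtype="auto", device_map="auto")
    tokenizer = AutoTokenizer.from_pretrained("openai/gpt-oss-20b")
    model = PeftModel.from_pretrained(base, "Tonic/gpt-oss-20b-multilingual-reasoner")

    # Illustrative single-turn generation (the prompt format is an assumption, not from the commit).
    inputs = tokenizer("User: Explique la relativité restreinte.\nAssistant: ", return_tensors="pt").to(model.device)
    with torch.no_grad():
        out = model.generate(**inputs, max_new_tokens=128, do_sample=True, pad_token_id=tokenizer.eos_token_id)
    print(tokenizer.decode(out[0], skip_special_tokens=True))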
app.py CHANGED
@@ -1,18 +1,69 @@
-from transformers import pipeline, TextIteratorStreamer
+from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 import torch
 from threading import Thread
 import gradio as gr
 import spaces
 import re
+from peft import PeftModel
 
-model_id = "Tonic/gpt-oss-20b-multilingual-reasoner"
+# Load the base model
+try:
+    base_model = AutoModelForCausalLM.from_pretrained(
+        "openai/gpt-oss-20b",
+        torch_dtype="auto",
+        device_map="auto",
+    )
+    tokenizer = AutoTokenizer.from_pretrained("openai/gpt-oss-20b")
+
+    # Load the LoRA adapter
+    try:
+        model = PeftModel.from_pretrained(base_model, "Tonic/gpt-oss-20b-multilingual-reasoner")
+        print("✅ LoRA model loaded successfully!")
+    except Exception as lora_error:
+        print(f"⚠️ LoRA adapter failed to load: {lora_error}")
+        print("🔄 Falling back to base model...")
+        model = base_model
+
+except Exception as e:
+    print(f"❌ Error loading model: {e}")
+    raise e
 
-pipe = pipeline(
-    "text-generation",
-    model=model_id,
-    torch_dtype="auto",
-    device_map="auto",
-)
+class LoRAPipeline:
+    def __init__(self, model, tokenizer):
+        self.model = model
+        self.tokenizer = tokenizer
+
+    def __call__(self, messages, **kwargs):
+        prompt = self.format_messages(messages)
+        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
+
+        with torch.no_grad():
+            outputs = self.model.generate(
+                **inputs,
+                **kwargs
+            )
+
+        generated_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+        response = generated_text[len(prompt):]
+        return response
+
+    def format_messages(self, messages):
+        """Format messages into a prompt string"""
+        formatted = ""
+        for message in messages:
+            role = message["role"]
+            content = message["content"]
+            if role == "system":
+                formatted += f"System: {content}\n"
+            elif role == "user":
+                formatted += f"User: {content}\n"
+            elif role == "assistant":
+                formatted += f"Assistant: {content}\n"
+        formatted += "Assistant: "
+        return formatted
+
+# Create the pipeline
+pipe = LoRAPipeline(model, tokenizer)
 
 def format_conversation_history(chat_history):
     messages = []
@@ -31,7 +82,7 @@ def generate_response(input_data, chat_history, max_new_tokens, system_prompt, t
     processed_history = format_conversation_history(chat_history)
     messages = system_message + processed_history + [new_message]
 
-    streamer = TextIteratorStreamer(pipe.tokenizer, skip_prompt=True, skip_special_tokens=True)
+    # Generate response using the LoRA pipeline
     generation_kwargs = {
         "max_new_tokens": max_new_tokens,
         "do_sample": True,
@@ -39,28 +90,48 @@ def generate_response(input_data, chat_history, max_new_tokens, system_prompt, t
         "top_p": top_p,
         "top_k": top_k,
         "repetition_penalty": repetition_penalty,
-        "streamer": streamer
+        "pad_token_id": tokenizer.eos_token_id,
     }
-    thread = Thread(target=pipe, args=(messages,), kwargs=generation_kwargs)
-    thread.start()
-    #streaming try #1
-    buffer = ""
+
+    # For streaming, we'll generate token by token
+    prompt = pipe.format_messages(messages)
+    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+
+    # Generate with streaming
     full_response = ""
-    for chunk in streamer:
-        buffer += chunk
-        parts = re.split(r'(\s+)', buffer)
-        if re.match(r'\s+', parts[-1]) is not None:
-            to_append = ''.join(parts)
-            buffer = ""
-        else:
-            to_append = ''.join(parts[:-1])
-            buffer = parts[-1]
-        if to_append:
-            full_response += to_append
-            yield full_response
-    if buffer:
-        full_response += buffer
-        yield full_response
+    current_length = inputs["input_ids"].shape[1]
+
+    with torch.no_grad():
+        for i in range(max_new_tokens):
+            # Generate one token at a time
+            outputs = model.generate(
+                **inputs,
+                max_new_tokens=1,
+                do_sample=True,
+                temperature=temperature,
+                top_p=top_p,
+                top_k=top_k,
+                repetition_penalty=repetition_penalty,
+                pad_token_id=tokenizer.eos_token_id,
+                use_cache=True
+            )
+
+            # Get the new token
+            new_token = outputs[0][-1].unsqueeze(0)
+
+            # Decode the new token
+            new_text = tokenizer.decode(new_token, skip_special_tokens=True)
+
+            if new_text:
+                full_response += new_text
+                yield full_response
+
+            # Update inputs for next iteration
+            inputs = {"input_ids": torch.cat([inputs["input_ids"], new_token], dim=1)}
+
+            # Check for end of generation
+            if new_token.item() == tokenizer.eos_token_id:
+                break
 
 demo = gr.ChatInterface(
     fn=generate_response,
@@ -85,7 +156,7 @@ demo = gr.ChatInterface(
     cache_examples=False,
     type="messages",
     description="""
-    # gpt-oss-20b
+    # 🙋🏻‍♂️Welcome to 🌟Tonic's gpt-oss-20b Multilingual Reasoner Demo !
    Wait couple of seconds initially. You can adjust reasoning level in the system prompt like "Reasoning: high.
     """,
     fill_height=True,
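Note that the new streaming loop calls model.generate once per token and re-feeds the growing input_ids on every iteration. A lighter-weight alternative, closer to the TextIteratorStreamer approach the previous version of app.py used, runs a single generate call in a background thread and consumes decoded chunks from the streamer. The sketch below assumes the model, tokenizer, and pipe objects defined in app.py are in scope; it is not part of the commit.

    # Sketch only: one generate() call streamed via TextIteratorStreamer,
    # assuming `model`, `tokenizer`, and `pipe` from app.py are already defined.
    from threading import Thread
    from transformers import TextIteratorStreamer

    def stream_reply(messages, max_new_tokens=256):
        prompt = pipe.format_messages(messages)
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
        kwargs = dict(**inputs, streamer=streamer, max_new_tokens=max_new_tokens,
                      do_sample=True, pad_token_id=tokenizer.eos_token_id)
        Thread(target=model.generate, kwargs=kwargs).start()
        partial = ""
        for chunk in streamer:
            partial += chunk
            yield partial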
requirements.txt CHANGED
@@ -1,2 +1,4 @@
 git+https://github.com/huggingface/transformers.git
-accelerate
+accelerate
+peft
+torch