Konstantin
commited on
Commit
·
68ee47a
1
Parent(s):
9a2d7d0
copied files
Browse files- .idea/.gitignore +3 -0
- .idea/gpt-oss-20b.iml +8 -0
- .idea/inspectionProfiles/profiles_settings.xml +6 -0
- .idea/misc.xml +4 -0
- .idea/modules.xml +8 -0
- .idea/vcs.xml +6 -0
- README.md +183 -0
- handler.py +126 -0
.idea/.gitignore
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
# Default ignored files
|
2 |
+
/shelf/
|
3 |
+
/workspace.xml
|
.idea/gpt-oss-20b.iml
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<?xml version="1.0" encoding="UTF-8"?>
|
2 |
+
<module type="PYTHON_MODULE" version="4">
|
3 |
+
<component name="NewModuleRootManager">
|
4 |
+
<content url="file://$MODULE_DIR$" />
|
5 |
+
<orderEntry type="inheritedJdk" />
|
6 |
+
<orderEntry type="sourceFolder" forTests="false" />
|
7 |
+
</component>
|
8 |
+
</module>
|
.idea/inspectionProfiles/profiles_settings.xml
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<component name="InspectionProjectProfileManager">
|
2 |
+
<settings>
|
3 |
+
<option name="USE_PROJECT_PROFILE" value="false" />
|
4 |
+
<version value="1.0" />
|
5 |
+
</settings>
|
6 |
+
</component>
|
.idea/misc.xml
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<?xml version="1.0" encoding="UTF-8"?>
|
2 |
+
<project version="4">
|
3 |
+
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.9 (pythonProject)" project-jdk-type="Python SDK" />
|
4 |
+
</project>
|
.idea/modules.xml
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<?xml version="1.0" encoding="UTF-8"?>
|
2 |
+
<project version="4">
|
3 |
+
<component name="ProjectModuleManager">
|
4 |
+
<modules>
|
5 |
+
<module fileurl="file://$PROJECT_DIR$/.idea/gpt-oss-20b.iml" filepath="$PROJECT_DIR$/.idea/gpt-oss-20b.iml" />
|
6 |
+
</modules>
|
7 |
+
</component>
|
8 |
+
</project>
|
.idea/vcs.xml
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<?xml version="1.0" encoding="UTF-8"?>
|
2 |
+
<project version="4">
|
3 |
+
<component name="VcsDirectoryMappings">
|
4 |
+
<mapping directory="" vcs="Git" />
|
5 |
+
</component>
|
6 |
+
</project>
|
README.md
ADDED
@@ -0,0 +1,183 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
license: apache-2.0
|
3 |
+
base_model: openai/gpt-oss-20b
|
4 |
+
pipeline_tag: text-generation
|
5 |
+
library_name: transformers
|
6 |
+
tags:
|
7 |
+
- vllm
|
8 |
+
---
|
9 |
+
|
10 |
+
<p align="center">
|
11 |
+
<img alt="gpt-oss-20b" src="https://raw.githubusercontent.com/openai/gpt-oss/main/docs/gpt-oss-20b.svg">
|
12 |
+
</p>
|
13 |
+
|
14 |
+
<p align="center">
|
15 |
+
<a href="https://gpt-oss.com"><strong>Try gpt-oss</strong></a> ·
|
16 |
+
<a href="https://cookbook.openai.com/topic/gpt-oss"><strong>Guides</strong></a> ·
|
17 |
+
<a href="https://arxiv.org/abs/2508.10925"><strong>Model card</strong></a> ·
|
18 |
+
<a href="https://openai.com/index/introducing-gpt-oss/"><strong>OpenAI blog</strong></a>
|
19 |
+
</p>
|
20 |
+
|
21 |
+
<br>
|
22 |
+
|
23 |
+
Welcome to the gpt-oss series, [OpenAI’s open-weight models](https://openai.com/open-models) designed for powerful reasoning, agentic tasks, and versatile developer use cases.
|
24 |
+
|
25 |
+
We’re releasing two flavors of these open models:
|
26 |
+
- `gpt-oss-120b` — for production, general purpose, high reasoning use cases that fit into a single 80GB GPU (like NVIDIA H100 or AMD MI300X) (117B parameters with 5.1B active parameters)
|
27 |
+
- `gpt-oss-20b` — for lower latency, and local or specialized use cases (21B parameters with 3.6B active parameters)
|
28 |
+
|
29 |
+
Both models were trained on our [harmony response format](https://github.com/openai/harmony) and should only be used with the harmony format as it will not work correctly otherwise.
|
30 |
+
|
31 |
+
|
32 |
+
> [!NOTE]
|
33 |
+
> This model card is dedicated to the smaller `gpt-oss-20b` model. Check out [`gpt-oss-120b`](https://huggingface.co/openai/gpt-oss-120b) for the larger model.
|
34 |
+
|
35 |
+
# Highlights
|
36 |
+
|
37 |
+
* **Permissive Apache 2.0 license:** Build freely without copyleft restrictions or patent risk—ideal for experimentation, customization, and commercial deployment.
|
38 |
+
* **Configurable reasoning effort:** Easily adjust the reasoning effort (low, medium, high) based on your specific use case and latency needs.
|
39 |
+
* **Full chain-of-thought:** Gain complete access to the model’s reasoning process, facilitating easier debugging and increased trust in outputs. It’s not intended to be shown to end users.
|
40 |
+
* **Fine-tunable:** Fully customize models to your specific use case through parameter fine-tuning.
|
41 |
+
* **Agentic capabilities:** Use the models’ native capabilities for function calling, [web browsing](https://github.com/openai/gpt-oss/tree/main?tab=readme-ov-file#browser), [Python code execution](https://github.com/openai/gpt-oss/tree/main?tab=readme-ov-file#python), and Structured Outputs.
|
42 |
+
* **MXFP4 quantization:** The models were post-trained with MXFP4 quantization of the MoE weights, making `gpt-oss-120b` run on a single 80GB GPU (like NVIDIA H100 or AMD MI300X) and the `gpt-oss-20b` model run within 16GB of memory. All evals were performed with the same MXFP4 quantization.
|
43 |
+
|
44 |
+
---
|
45 |
+
|
46 |
+
# Inference examples
|
47 |
+
|
48 |
+
## Transformers
|
49 |
+
|
50 |
+
You can use `gpt-oss-120b` and `gpt-oss-20b` with Transformers. If you use the Transformers chat template, it will automatically apply the [harmony response format](https://github.com/openai/harmony). If you use `model.generate` directly, you need to apply the harmony format manually using the chat template or use our [openai-harmony](https://github.com/openai/harmony) package.
|
51 |
+
|
52 |
+
To get started, install the necessary dependencies to set up your environment:
|
53 |
+
|
54 |
+
```
|
55 |
+
pip install -U transformers kernels torch
|
56 |
+
```
|
57 |
+
|
58 |
+
Once set up, you can run the model with the snippet below:
|
59 |
+
|
60 |
+
```py
|
61 |
+
from transformers import pipeline
|
62 |
+
import torch
|
63 |
+
|
64 |
+
model_id = "openai/gpt-oss-20b"
|
65 |
+
|
66 |
+
pipe = pipeline(
|
67 |
+
"text-generation",
|
68 |
+
model=model_id,
|
69 |
+
torch_dtype="auto",
|
70 |
+
device_map="auto",
|
71 |
+
)
|
72 |
+
|
73 |
+
messages = [
|
74 |
+
{"role": "user", "content": "Explain quantum mechanics clearly and concisely."},
|
75 |
+
]
|
76 |
+
|
77 |
+
outputs = pipe(
|
78 |
+
messages,
|
79 |
+
max_new_tokens=256,
|
80 |
+
)
|
81 |
+
print(outputs[0]["generated_text"][-1])
|
82 |
+
```
|
83 |
+
|
84 |
+
Alternatively, you can run the model via [`Transformers Serve`](https://huggingface.co/docs/transformers/main/serving) to spin up an OpenAI-compatible webserver:
|
85 |
+
|
86 |
+
```
|
87 |
+
transformers serve
|
88 |
+
transformers chat localhost:8000 --model-name-or-path openai/gpt-oss-20b
|
89 |
+
```
|
90 |
+
|
91 |
+
[Learn more about how to use gpt-oss with Transformers.](https://cookbook.openai.com/articles/gpt-oss/run-transformers)
|
92 |
+
|
93 |
+
## vLLM
|
94 |
+
|
95 |
+
vLLM recommends using [uv](https://docs.astral.sh/uv/) for Python dependency management. You can use vLLM to spin up an OpenAI-compatible webserver. The following command will automatically download the model and start the server.
|
96 |
+
|
97 |
+
```bash
|
98 |
+
uv pip install --pre vllm==0.10.1+gptoss \
|
99 |
+
--extra-index-url https://wheels.vllm.ai/gpt-oss/ \
|
100 |
+
--extra-index-url https://download.pytorch.org/whl/nightly/cu128 \
|
101 |
+
--index-strategy unsafe-best-match
|
102 |
+
|
103 |
+
vllm serve openai/gpt-oss-20b
|
104 |
+
```
|
105 |
+
|
106 |
+
[Learn more about how to use gpt-oss with vLLM.](https://cookbook.openai.com/articles/gpt-oss/run-vllm)
|
107 |
+
|
108 |
+
## PyTorch / Triton
|
109 |
+
|
110 |
+
To learn about how to use this model with PyTorch and Triton, check out our [reference implementations in the gpt-oss repository](https://github.com/openai/gpt-oss?tab=readme-ov-file#reference-pytorch-implementation).
|
111 |
+
|
112 |
+
## Ollama
|
113 |
+
|
114 |
+
If you are trying to run gpt-oss on consumer hardware, you can use Ollama by running the following commands after [installing Ollama](https://ollama.com/download).
|
115 |
+
|
116 |
+
```bash
|
117 |
+
# gpt-oss-20b
|
118 |
+
ollama pull gpt-oss:20b
|
119 |
+
ollama run gpt-oss:20b
|
120 |
+
```
|
121 |
+
|
122 |
+
[Learn more about how to use gpt-oss with Ollama.](https://cookbook.openai.com/articles/gpt-oss/run-locally-ollama)
|
123 |
+
|
124 |
+
#### LM Studio
|
125 |
+
|
126 |
+
If you are using [LM Studio](https://lmstudio.ai/), you can use the following command to download the model.
|
127 |
+
|
128 |
+
```bash
|
129 |
+
# gpt-oss-20b
|
130 |
+
lms get openai/gpt-oss-20b
|
131 |
+
```
|
132 |
+
|
133 |
+
Check out our [awesome list](https://github.com/openai/gpt-oss/blob/main/awesome-gpt-oss.md) for a broader collection of gpt-oss resources and inference partners.
|
134 |
+
|
135 |
+
---
|
136 |
+
|
137 |
+
# Download the model
|
138 |
+
|
139 |
+
You can download the model weights from the [Hugging Face Hub](https://huggingface.co/collections/openai/gpt-oss-68911959590a1634ba11c7a4) directly with the Hugging Face CLI:
|
140 |
+
|
141 |
+
```shell
|
142 |
+
# gpt-oss-20b
|
143 |
+
huggingface-cli download openai/gpt-oss-20b --include "original/*" --local-dir gpt-oss-20b/
|
144 |
+
pip install gpt-oss
|
145 |
+
python -m gpt_oss.chat model/
|
146 |
+
```
|
147 |
+
|
148 |
+
# Reasoning levels
|
149 |
+
|
150 |
+
You can adjust the reasoning level that suits your task across three levels:
|
151 |
+
|
152 |
+
* **Low:** Fast responses for general dialogue.
|
153 |
+
* **Medium:** Balanced speed and detail.
|
154 |
+
* **High:** Deep and detailed analysis.
|
155 |
+
|
156 |
+
The reasoning level can be set in the system prompts, e.g., "Reasoning: high".
|
157 |
+
|
158 |
+
# Tool use
|
159 |
+
|
160 |
+
The gpt-oss models are excellent for:
|
161 |
+
* Web browsing (using built-in browsing tools)
|
162 |
+
* Function calling with defined schemas
|
163 |
+
* Agentic operations like browser tasks
|
164 |
+
|
165 |
+
# Fine-tuning
|
166 |
+
|
167 |
+
Both gpt-oss models can be fine-tuned for a variety of specialized use cases.
|
168 |
+
|
169 |
+
This smaller model `gpt-oss-20b` can be fine-tuned on consumer hardware, whereas the larger [`gpt-oss-120b`](https://huggingface.co/openai/gpt-oss-120b) can be fine-tuned on a single H100 node.
|
170 |
+
|
171 |
+
# Citation
|
172 |
+
|
173 |
+
```bibtex
|
174 |
+
@misc{openai2025gptoss120bgptoss20bmodel,
|
175 |
+
title={gpt-oss-120b & gpt-oss-20b Model Card},
|
176 |
+
author={OpenAI},
|
177 |
+
year={2025},
|
178 |
+
eprint={2508.10925},
|
179 |
+
archivePrefix={arXiv},
|
180 |
+
primaryClass={cs.CL},
|
181 |
+
url={https://arxiv.org/abs/2508.10925},
|
182 |
+
}
|
183 |
+
```
|
handler.py
ADDED
@@ -0,0 +1,126 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Dict, List, Any
|
2 |
+
import torch
|
3 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
|
4 |
+
from fastapi.responses import StreamingResponse
|
5 |
+
import uuid
|
6 |
+
import time
|
7 |
+
import json
|
8 |
+
from threading import Thread
|
9 |
+
|
10 |
+
class EndpointHandler:
    """Inference endpoint handler that returns OpenAI-style chat completions.

    The handler loads a causal language model and its tokenizer once, then
    serves requests through ``__call__``.  A request is a dict with the keys
    ``messages`` (chat history), ``model`` (name echoed back in the payload)
    and ``stream`` (truthy to receive a server-sent-event stream).
    """

    # Upper bound on the number of tokens generated per request.
    MAX_NEW_TOKENS = 2048

    def __init__(self, path: str = "openai/gpt-oss-20b"):
        """Load the tokenizer and model from ``path`` and move the model to the device.

        Args:
            path: Model identifier or local directory passed to
                ``from_pretrained``.
        """
        # Load tokenizer and model
        self.tokenizer = AutoTokenizer.from_pretrained(path)
        self.model = AutoModelForCausalLM.from_pretrained(path)
        self.model.eval()

        # Determine the computation device
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)

    @staticmethod
    def openai_id(prefix: str) -> str:
        """Return ``"<prefix>-"`` followed by 24 random hex characters.

        Declared as a static method so that it can be called both on the
        class and on an instance (``self.openai_id(...)``); without the
        decorator the instance call passes ``self`` as ``prefix`` and fails
        with ``TypeError``.
        """
        return f"{prefix}-{uuid.uuid4().hex[:24]}"

    def format_non_stream(self, model: str, text: str, prompt_length: int,
                          completion_length: int, total_tokens: int,
                          finish_reason: str = "stop"):
        """Build a ``chat.completion`` payload for a finished generation.

        Args:
            model: Model name echoed back to the client.
            text: Generated assistant message.
            prompt_length: Number of prompt tokens.
            completion_length: Number of generated tokens.
            total_tokens: Prompt plus completion tokens.
            finish_reason: Value reported in ``choices[0].finish_reason``.

        Returns:
            A JSON-serialisable dict.
        """
        return {
            "id": self.openai_id("chatcmpl"),
            "object": "chat.completion",
            "created": int(time.time()),
            "model": model,
            "choices": [{
                "index": 0,
                "message": {"role": "assistant", "content": text},
                "finish_reason": finish_reason,
            }],
            "usage": {
                "prompt_tokens": prompt_length,
                "completion_tokens": completion_length,
                "total_tokens": total_tokens,
            },
        }

    def format_stream(self, model: str, token: str, usage,
                      finish_reason=None, chunk_id=None) -> bytes:
        """Encode one ``chat.completion.chunk`` as a server-sent event.

        Args:
            model: Model name echoed back to the client.
            token: Text fragment carried in ``delta.content``.
            usage: Usage dict for the final chunk, ``None`` otherwise.
            finish_reason: ``None`` for intermediate chunks, e.g. ``"stop"``
                for the last one.
            chunk_id: Completion id shared by all chunks of one response.
                A fresh id is generated when omitted.

        Returns:
            The UTF-8 encoded ``data: {...}`` line followed by a blank line.
        """
        payload = {
            "id": chunk_id or self.openai_id("chatcmpl"),
            "object": "chat.completion.chunk",
            "created": int(time.time()),
            "model": model,
            "choices": [{
                "index": 0,
                "delta": {
                    "content": token,
                    "function_call": None,
                    "refusal": None,
                    "role": None,
                    "tool_calls": None,
                },
                "finish_reason": finish_reason,
                "logprobs": None,
            }],
            "usage": usage,
        }

        return f"data: {json.dumps(payload)}\n\n".encode('utf-8')

    def _encode_messages(self, messages):
        """Turn the request's ``messages`` into model inputs on ``self.device``.

        A list of ``{"role": ..., "content": ...}`` dicts is rendered through
        the tokenizer's chat template with the generation prompt appended.
        Any other value (for example a plain prompt string) is tokenized
        directly, as before.

        Raises:
            ValueError: If ``messages`` is missing or empty.
        """
        if not messages:
            raise ValueError("'messages' must be a non-empty string or list of chat messages")

        is_chat = isinstance(messages, list) and isinstance(messages[0], dict)
        if is_chat:
            encoded = self.tokenizer.apply_chat_template(
                messages,
                add_generation_prompt=True,
                return_tensors="pt",
                return_dict=True,
            )
        else:
            encoded = self.tokenizer(messages, return_tensors="pt")
        return encoded.to(self.device)

    def generate(self, messages, model: str):
        """Run a blocking generation and return a ``chat.completion`` payload."""
        model_inputs = self._encode_messages(messages)
        input_length = model_inputs.input_ids.shape[1]  # Prompt tokens

        full_output = self.model.generate(**model_inputs, max_new_tokens=self.MAX_NEW_TOKENS)
        output_length = full_output.shape[1]  # Total tokens (prompt + completion)
        completion_tokens = output_length - input_length

        # Drop the echoed prompt so that only newly generated ids are decoded.
        generated_ids = full_output[:, input_length:]
        # NOTE(review): special tokens are kept here but stripped in stream();
        # confirm which of the two behaviours clients should rely on.
        text = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=False)[0]

        return self.format_non_stream(model, text, input_length, completion_tokens, output_length)

    def stream(self, messages, model):
        """Yield server-sent-event chunks while the model generates.

        Generation runs in a background thread that feeds a
        ``TextIteratorStreamer``.  Every text fragment is emitted as one
        chunk; afterwards a final chunk carrying ``finish_reason`` and the
        usage counts is sent, followed by the ``data: [DONE]`` terminator.
        """
        model_inputs = self._encode_messages(messages)
        input_len = model_inputs.input_ids.shape[1]
        streamer = TextIteratorStreamer(
            self.tokenizer,
            skip_prompt=True,
            skip_special_tokens=True,
        )

        generation_kwargs = dict(
            **model_inputs,
            streamer=streamer,
            max_new_tokens=self.MAX_NEW_TOKENS,
        )

        thread = Thread(target=self.model.generate, kwargs=generation_kwargs)
        thread.start()

        # All chunks of one completion share a single id.
        chunk_id = self.openai_id("chatcmpl")

        completion_tokens = 0
        for token in streamer:
            # The streamer yields text, not ids, so the count is obtained by
            # re-encoding each fragment and is therefore an approximation.
            token_ids = self.tokenizer.encode(token, add_special_tokens=False)
            completion_tokens += len(token_ids)

            yield self.format_stream(model, token, None, chunk_id=chunk_id)

        # The streamer is exhausted, so the worker is finishing; reap it.
        thread.join()

        # Final chunk with stop reason and token counts
        yield self.format_stream(
            model,
            "",
            {
                "prompt_tokens": input_len,
                "completion_tokens": completion_tokens,
                "total_tokens": input_len + completion_tokens,
            },
            finish_reason="stop",
            chunk_id=chunk_id,
        )
        # Stream terminator used by OpenAI-style SSE clients.
        yield b"data: [DONE]\n\n"

    def __call__(self, data: Dict[str, Any]):
        """Handle one request.

        Args:
            data: Request body with ``messages``, ``model`` and an optional
                ``stream`` flag.

        Returns:
            A ``chat.completion`` dict, or a ``StreamingResponse`` with media
            type ``text/event-stream`` when ``stream`` is truthy.
        """
        messages = data.get("messages")
        model = data.get("model")
        stream = data.get("stream", False)

        # Treat every falsy value (False, None, 0) as "do not stream".
        if not stream:
            return self.generate(messages, model)

        return StreamingResponse(
            self.stream(messages, model),
            media_type="text/event-stream",
        )
|