Spaces: Running on T4
sparkleman committed · Commit 05b6df6 · 1 Parent(s): cdd6039

CKPT: Space CPU version

Files changed:
- Dockerfile: +1 -1
- app.py: +6 -5
Dockerfile
CHANGED
@@ -26,4 +26,4 @@ COPY --chown=user . $HOME/app

 RUN uv sync --frozen --extra cu124

-CMD ["uv","run","app.py","--strategy","
+CMD ["uv","run","app.py","--strategy","cuda fp16","--model_title","RWKV-x070-World-0.1B-v2.8-20241210-ctx4096","--download_repo_id","BlinkDL/rwkv-7-world","--host","0.0.0.0","--port","7860"]
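The new CMD pins the full launch configuration in exec form: inference strategy, checkpoint title, download repo, and bind address. For reference, a minimal sketch of how app.py could consume these flags; only the flag names and values come from the CMD line above, while the argparse wiring, defaults, and help text are assumptions, not the project's actual CLI code:

import argparse

# Hypothetical CLI parsing; flag names mirror the Dockerfile CMD,
# everything else is assumed for illustration.
parser = argparse.ArgumentParser()
parser.add_argument("--strategy", default="cpu fp32",
                    help='RWKV strategy string, e.g. "cuda fp16"')
parser.add_argument("--model_title",
                    default="RWKV-x070-World-0.1B-v2.8-20241210-ctx4096")
parser.add_argument("--download_repo_id", default="BlinkDL/rwkv-7-world",
                    help="Hugging Face repo the checkpoint is fetched from")
parser.add_argument("--host", default="0.0.0.0")
parser.add_argument("--port", type=int, default=7860)
args = parser.parse_args()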
app.py
CHANGED
@@ -92,7 +92,7 @@ class ChatCompletionRequest(BaseModel):
         description="Add `:thinking` suffix to the model name to enable reasoning. Example: `rwkv-latest:thinking`",
     )
     messages: List[ChatMessage]
-    prompt:
+    prompt: Optional[str] = Field(default=None)
     max_tokens: int = Field(default=512)
     temperature: float = Field(default=1.0)
     top_p: float = Field(default=0.3)
@@ -114,7 +114,7 @@ app.add_middleware(
 )


-def runPrefill(ctx: str, model_tokens: List[int], model_state):
+async def runPrefill(ctx: str, model_tokens: List[int], model_state):
     ctx = ctx.replace("\r\n", "\n")

     tokens = pipeline.encode(ctx)
@@ -124,6 +124,7 @@ def runPrefill(ctx: str, model_tokens: List[int], model_state):
     while len(tokens) > 0:
         out, model_state = model.forward(tokens[: CONFIG.CHUNK_LEN], model_state)
         tokens = tokens[CONFIG.CHUNK_LEN :]
+        await asyncio.sleep(0)

     return out, model_tokens, model_state

@@ -220,7 +221,7 @@ async def chatResponse(
         else request.prompt.strip()
     )

-    out, model_tokens, model_state = runPrefill(prompt, [], model_state)
+    out, model_tokens, model_state = await runPrefill(prompt, [], model_state)

     prefillTime = time.time()
     promptTokenCount = len(model_tokens)
@@ -301,7 +302,7 @@ async def chatResponseStream(
         else request.prompt.strip()
     )

-    out, model_tokens, model_state = runPrefill(prompt, [], model_state)
+    out, model_tokens, model_state = await runPrefill(prompt, [], model_state)

     prefillTime = time.time()
     promptTokenCount = len(model_tokens)
@@ -530,7 +531,7 @@ async def chat_completions(request: ChatCompletionRequest):
     completionId = str(next(CompletionIdGenerator))
     logger.info(f"[REQ] {completionId} - {request.model_dump()}")

-    def chatResponseStreamDisconnect():
+    async def chatResponseStreamDisconnect():
         if "cuda" in CONFIG.STRATEGY:
             gpu_info = nvmlDeviceGetMemoryInfo(gpu_h)
             logger.info(
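Taken together, the app.py changes turn a blocking prefill into a cooperative one: runPrefill feeds the prompt to model.forward one CONFIG.CHUNK_LEN slice at a time, and the new `await asyncio.sleep(0)` after each chunk yields control back to the event loop so FastAPI can service other coroutines (streaming writes, disconnect handling) during a long CPU prefill. Every caller must then `await` the now-async function, which is exactly what the remaining hunks do. A minimal runnable sketch of the pattern, with stand-ins for the real model and config (CHUNK_LEN value and fake_forward are assumptions, not the project's objects):

import asyncio
from typing import List, Optional, Tuple

CHUNK_LEN = 256  # stand-in for CONFIG.CHUNK_LEN; actual value assumed

def fake_forward(chunk: List[int], state: Optional[int]) -> Tuple[List[float], int]:
    # Stand-in for model.forward: returns dummy logits and an updated state.
    return [0.0], (state or 0) + len(chunk)

async def run_prefill(tokens: List[int], state: Optional[int] = None):
    out = None
    while len(tokens) > 0:
        out, state = fake_forward(tokens[:CHUNK_LEN], state)
        tokens = tokens[CHUNK_LEN:]
        # Zero-duration sleep: no actual waiting, but it suspends this
        # coroutine so the asyncio scheduler can run other ready tasks.
        await asyncio.sleep(0)
    return out, state

async def main():
    out, state = await run_prefill(list(range(1000)))
    print(out, state)  # [0.0] 1000

asyncio.run(main())

Note that asyncio.sleep(0) only yields between chunks; each forward call still runs on the event-loop thread, so responsiveness depends on the chunk size staying small. This also explains the mechanical edits: once runPrefill is a coroutine, calling it without `await` would return an un-run coroutine object, hence the `await runPrefill(...)` at both call sites, and chatResponseStreamDisconnect is made async presumably so it can be scheduled on the same event loop. The added `prompt: Optional[str] = Field(default=None)` gives the raw-prompt path used by the `request.prompt.strip()` branches an explicit optional field on the request model.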