Spaces:
Sleeping
Sleeping
import os | |
from fastapi import FastAPI | |
from pydantic import BaseModel | |
import gradio as gr | |
import threading | |
import uvicorn | |
# ======================= | |
# Load Secrets | |
# ======================= | |
SYSTEM_PROMPT = os.environ.get( | |
"prompt", | |
"You are a placeholder Sovereign. No secrets found in environment." | |
) | |
# ======================= | |
# Initialize Unsloth-optimized Falcon-3B | |
# ======================= | |
# Install via: pip install unsloth torch transformers | |
from unsloth import FastLanguageModel | |
from transformers import AutoTokenizer | |
MODEL_NAME = "tiiuae/Falcon3-3B-Instruct" | |
# 1) Load model and tokenizer with 4-bit quantization | |
model, tokenizer = FastLanguageModel.from_pretrained( | |
model_name=MODEL_NAME, | |
max_seq_length=2048, | |
load_in_4bit=True, | |
dtype=None, | |
) | |
# 2) Apply inference optimizations (fused kernels, streaming, etc.) | |
FastLanguageModel.for_inference(model) | |
# ======================= | |
# Core Chat Function | |
# ======================= | |
def chat_fn(user_input: str) -> str: | |
messages = [ | |
{"role": "system", "content": SYSTEM_PROMPT}, | |
{"role": "user", "content": f"User: {user_input}"} | |
] | |
prompt_text = "\n".join(f"{m['role'].capitalize()}: {m['content']}" for m in messages) | |
# Tokenize and run generation | |
inputs = tokenizer(prompt_text, return_tensors="pt").to(model.device) | |
output_ids = model.generate( | |
**inputs, | |
max_new_tokens=256, | |
do_sample=False, | |
eos_token_id=tokenizer.eos_token_id | |
) | |
# Decode only the newly generated tokens | |
gen_tokens = output_ids[0][inputs.input_ids.shape[-1]:] | |
generated_text = tokenizer.decode(gen_tokens, skip_special_tokens=True) | |
return generated_text.strip() | |
# ======================= | |
# Gradio UI | |
# ======================= | |
def gradio_chat(user_input: str) -> str: | |
return chat_fn(user_input) | |
iface = gr.Interface( | |
fn=gradio_chat, | |
inputs=gr.Textbox(lines=5, placeholder="Enter your prompt…"), | |
outputs="text", | |
title="Prompt cracking challenge", | |
description="Does he really think he is the king?" | |
) | |
# Run Gradio in a separate thread so FastAPI can also start | |
def run_gradio(): | |
iface.launch(server_name="0.0.0.0", share=True) | |
# ======================= | |
# FastAPI for API access | |
# ======================= | |
app = FastAPI(title="Prompt cracking challenge API") | |
class Request(BaseModel): | |
prompt: str | |
def generate(req: Request): | |
return {"response": chat_fn(req.prompt)} | |
# ======================= | |
# Launch Both Servers | |
# ======================= | |
if __name__ == "__main__": | |
threading.Thread(target=run_gradio, daemon=True).start() | |
uvicorn.run(app, host="0.0.0.0", port=int(os.getenv("PORT", 8000))) |