"""Prompt-cracking challenge served through a Gradio UI and a FastAPI endpoint.

At import time this module loads a 4-bit quantized Falcon3-3B-Instruct model
through Unsloth. ``chat_fn`` runs one greedy generation for a user prompt under
the system prompt read from the environment; the same function backs both the
Gradio interface and ``POST /generate``.
"""

import os
import threading

import gradio as gr
import uvicorn
from fastapi import FastAPI
from pydantic import BaseModel

# Install via: pip install unsloth torch transformers
# NOTE(review): the original file imports unsloth before transformers; that
# order is preserved here on purpose -- confirm before letting a tool re-sort.
from unsloth import FastLanguageModel
from transformers import AutoTokenizer

# =======================
# Load Secrets
# =======================
# The system prompt comes from the "prompt" environment variable; the fallback
# text is used when that variable is not set.
SYSTEM_PROMPT = os.environ.get(
    "prompt",
    "You are a placeholder Sovereign. No secrets found in environment."
)

# =======================
# Initialize Unsloth-optimized Falcon-3B
# =======================
MODEL_NAME = "tiiuae/Falcon3-3B-Instruct"

# 1) Load model and tokenizer with 4-bit quantization
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_NAME,
    max_seq_length=2048,
    load_in_4bit=True,
    dtype=None,
)

# 2) Apply inference optimizations (fused kernels, streaming, etc.)
FastLanguageModel.for_inference(model)


# =======================
# Core Chat Function
# =======================
def _build_model_inputs(user_input: str):
    """Tokenize the system prompt plus *user_input* into model-ready tensors.

    The tokenizer's own chat template is used when it has one, so the prompt
    carries the role markers the tokenizer defines and ends with the assistant
    generation cue. If the tokenizer exposes no template, a plain
    "Role: content" transcript is used instead.

    Args:
        user_input: Raw text typed by the user, without any role prefix.

    Returns:
        The tokenizer's encoding (``input_ids`` and related tensors) moved to
        ``model.device``.
    """
    # The user text is passed as-is: role labels are added by the template (or
    # by the fallback join below), so prefixing "User: " here would duplicate it.
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": user_input},
    ]

    if getattr(tokenizer, "chat_template", None):
        encoded = tokenizer.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,
            return_tensors="pt",
            return_dict=True,
        )
    else:
        prompt_text = "\n".join(
            f"{m['role'].capitalize()}: {m['content']}" for m in messages
        )
        encoded = tokenizer(prompt_text, return_tensors="pt")

    return encoded.to(model.device)


def chat_fn(user_input: str) -> str:
    """Generate the model's reply to *user_input* under ``SYSTEM_PROMPT``.

    Decoding is greedy (``do_sample=False``) and limited to 256 new tokens.

    Args:
        user_input: The user's prompt text.

    Returns:
        The newly generated text only (the prompt tokens are sliced off), with
        special tokens removed and surrounding whitespace stripped.
    """
    inputs = _build_model_inputs(user_input)
    prompt_length = inputs["input_ids"].shape[-1]

    output_ids = model.generate(
        **inputs,
        max_new_tokens=256,
        do_sample=False,
        eos_token_id=tokenizer.eos_token_id,
    )

    # generate() returns prompt + continuation; keep only the continuation.
    new_tokens = output_ids[0][prompt_length:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()


# =======================
# Gradio UI
# =======================
def gradio_chat(user_input: str) -> str:
    """Gradio callback: forward the textbox content to ``chat_fn``."""
    return chat_fn(user_input)


iface = gr.Interface(
    fn=gradio_chat,
    inputs=gr.Textbox(lines=5, placeholder="Enter your prompt…"),
    outputs="text",
    title="Prompt cracking challenge",
    description="Does he really think he is the king?"
)


# Run Gradio in a separate thread so FastAPI can also start
def run_gradio():
    """Launch the Gradio interface on all network interfaces with sharing on."""
    iface.launch(server_name="0.0.0.0", share=True)


# =======================
# FastAPI for API access
# =======================
app = FastAPI(title="Prompt cracking challenge API")


class Request(BaseModel):
    """Request body for ``POST /generate``."""

    # The user's prompt text, passed unchanged to chat_fn.
    prompt: str


@app.post("/generate")
def generate(req: Request):
    """Return the model's reply to ``req.prompt`` as ``{"response": ...}``."""
    return {"response": chat_fn(req.prompt)}


# =======================
# Launch Both Servers
# =======================
if __name__ == "__main__":
    # Gradio runs in a daemon thread, so it does not keep the process alive
    # once uvicorn (running in the main thread) exits.
    threading.Thread(target=run_gradio, daemon=True).start()
    uvicorn.run(app, host="0.0.0.0", port=int(os.getenv("PORT", 8000)))