Bahaedev committed
Commit 3543359 · verified · 1 Parent(s): d094312

Update app.py

Files changed (1):
  app.py +37 -18
app.py CHANGED
@@ -1,47 +1,61 @@
 import os
-from transformers import pipeline
-import gradio as gr
 from fastapi import FastAPI
 from pydantic import BaseModel
+import gradio as gr
 import threading
 import uvicorn
 
 # =======================
 # Load Secrets
 # =======================
-# SYSTEM_PROMPT (with the flag) must be added in HF Space secrets
 SYSTEM_PROMPT = os.environ.get(
     "prompt",
     "You are a placeholder Sovereign. No secrets found in environment."
 )
 
 # =======================
-# Initialize Falcon-3B
+# Initialize Unsloth-optimized Falcon-3B
 # =======================
-pipe = pipeline(
-    "text-generation",
-    model="tiiuae/Falcon3-3B-Instruct",
-    torch_dtype="auto",
-    device_map="auto",
+# Install via: pip install unsloth torch transformers
+from unsloth import FastLanguageModel
+from transformers import AutoTokenizer
+
+MODEL_NAME = "tiiuae/Falcon3-3B-Instruct"
+
+# 1) Load model and tokenizer with 4-bit quantization
+model, tokenizer = FastLanguageModel.from_pretrained(
+    model_name=MODEL_NAME,
+    max_seq_length=2048,
+    load_in_4bit=True,
+    dtype=None,
 )
 
+# 2) Apply inference optimizations (fused kernels, streaming, etc.)
+FastLanguageModel.for_inference(model)
+
 # =======================
 # Core Chat Function
 # =======================
 def chat_fn(user_input: str) -> str:
-    """
-    Concatenate system and user messages, run the model,
-    and strip the system prompt from the output.
-    """
     messages = [
         {"role": "system", "content": SYSTEM_PROMPT},
         {"role": "user", "content": f"User: {user_input}"}
     ]
-    # Falcon is not chat-native; we just join roles with newlines
    prompt_text = "\n".join(f"{m['role'].capitalize()}: {m['content']}" for m in messages)
-    result = pipe(prompt_text, max_new_tokens=256, do_sample=False)
-    generated_text = result[0]["generated_text"]
-    return generated_text[len(prompt_text):].strip()
+
+    # Tokenize and run generation
+    inputs = tokenizer(prompt_text, return_tensors="pt").to(model.device)
+    output_ids = model.generate(
+        **inputs,
+        max_new_tokens=256,
+        do_sample=False,
+        eos_token_id=tokenizer.eos_token_id
+    )
+
+    # Decode only the newly generated tokens
+    gen_tokens = output_ids[0][inputs.input_ids.shape[-1]:]
+    generated_text = tokenizer.decode(gen_tokens, skip_special_tokens=True)
+    return generated_text.strip()
 
 # =======================
 # Gradio UI
@@ -57,6 +71,10 @@ iface = gr.Interface(
     description="Does he really think he is the king?"
 )
 
+# Run Gradio in a separate thread so FastAPI can also start
+def run_gradio():
+    iface.launch(server_name="0.0.0.0", share=True)
+
 # =======================
 # FastAPI for API access
 # =======================
@@ -73,4 +91,5 @@ def generate(req: Request):
 # Launch Both Servers
 # =======================
 if __name__ == "__main__":
-    iface.launch(server_name="0.0.0.0", share=True)
+    threading.Thread(target=run_gradio, daemon=True).start()
+    uvicorn.run(app, host="0.0.0.0", port=int(os.getenv("PORT", 8000)))
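
Reviewer note: once the Space is running, the API half can be smoke-tested directly. The diff elides the FastAPI route body (only def generate(req: Request): appears in the hunk context), so in the sketch below the /generate path, the POST method, and the prompt field name are assumptions rather than anything this commit shows; the port matches the PORT-or-8000 default in the launch block.

import requests

# Sketch of a manual smoke test against the FastAPI side. The route
# path, HTTP method, and JSON field are assumptions -- the commit does
# not show the body of generate(req: Request) or the Request model.
resp = requests.post(
    "http://localhost:8000/generate",            # assumed route path
    json={"prompt": "Who rules this kingdom?"},  # assumed Request field
    timeout=120,
)
resp.raise_for_status()
print(resp.json())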
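
One aside on the prompt construction this commit keeps: the earlier revision's removed comment claimed Falcon is not chat-native and joined roles with plain newlines, but the Falcon3-3B-Instruct tokenizer appears to define a chat template, so the formatting could instead be delegated to the tokenizer. A sketch of that alternative, as a drop-in replacement for the prompt_text line inside chat_fn (not what the commit does):

# Alternative sketch: build prompt_text via the tokenizer's chat
# template instead of the manual "Role: content" join. This uses the
# standard transformers apply_chat_template API; whether it changes
# output quality for this model is untested here.
prompt_text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,              # return a formatted string, not token ids
    add_generation_prompt=True,  # append the assistant-turn marker
)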