Spaces:

phanerozoic
/

SchoolSpiritAI

Paused

App Files Files Community

phanerozoic committed on Apr 21

Commit

d0ad708

verified ·

1 Parent(s): 0ea4bc5

Update app.py

Browse files

Files changed (1) hide show

app.py +65 -69

app.py CHANGED Viewed

@@ -1,19 +1,10 @@
-"""
-SchoolSpirit AI chatbot Space  –  Granite‑3.3‑2B
-• Streams tokens to Gradio UI
-• 5‑per‑minute rate‑limit per IP
-• Founder + email hand‑off in system prompt
-"""
 import os, re, time, datetime, threading, traceback, torch, gradio as gr
-from transformers import (
-    AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
-)
 from transformers.utils import logging as hf_logging
-# ───────────────────────────────────────────────────────────────────────────────
-# 0.  Logging helper
-# ───────────────────────────────────────────────────────────────────────────────
 os.environ["HF_HOME"] = "/data/.huggingface"
 LOG_FILE = "/data/requests.log"
 def log(msg: str):
@@ -26,21 +17,19 @@ def log(msg: str):
     except FileNotFoundError:
         pass
-# ───────────────────────────────────────────────────────────────────────────────
-# 1.  Config
-# ───────────────────────────────────────────────────────────────────────────────
 MODEL_ID          = "ibm-granite/granite-3.3-2b-instruct"
 CTX_TOKENS        = 1800
 MAX_NEW_TOKENS    = 64
 TEMP              = 0.6
 MAX_INPUT_CH      = 300
-RATE_N, RATE_SEC  = 5, 60        # 5 messages / 60 s
 SYSTEM_MSG = (
     "You are **SchoolSpirit AI**, the friendly digital mascot of "
-    "SchoolSpirit AI LLC, founded by Charles Norton in 2025. The company "
-    "installs on‑prem AI chat mascots, fine‑tunes language models, and ships "
-    "turnkey GPU servers to K‑12 schools.\n\n"
     "RULES:\n"
     "• Reply in ≤ 4 sentences unless asked for detail.\n"
     "• No personal‑data collection; no medical/legal/financial advice.\n"
@@ -48,54 +37,55 @@ SYSTEM_MSG = (
     "• If you can’t answer, politely direct the user to [email protected].\n"
     "• Keep language age‑appropriate; avoid profanity, politics, mature themes."
 )
-WELCOME = "Hi there! I’m SchoolSpirit AI. How can I help?"
 strip = lambda s: re.sub(r"\s+", " ", s.strip())
-# ───────────────────────────────────────────────────────────────────────────────
-# 2.  Load model
-# ───────────────────────────────────────────────────────────────────────────────
 hf_logging.set_verbosity_error()
 try:
-    log("Loading tokenizer / model …")
     tok = AutoTokenizer.from_pretrained(MODEL_ID)
-    model = AutoModelForCausalLM.from_pretrained(
-        MODEL_ID,
-        device_map="auto" if torch.cuda.is_available() else "cpu",
-        torch_dtype=torch.float16 if torch.cuda.is_available() else "auto",
-        low_cpu_mem_usage=True,
-    )
     MODEL_ERR = None
     log("Model loaded ✔")
 except Exception as exc:
     MODEL_ERR = f"Model load error: {exc}"
-    log(MODEL_ERR)
-# ───────────────────────────────────────────────────────────────────────────────
-# 3.  Simple in‑memory rate‑limiter  {ip: [timestamps]}
-# ───────────────────────────────────────────────────────────────────────────────
 VISITS: dict[str, list[float]] = {}
 def allowed(ip: str) -> bool:
     now = time.time()
-    times = [t for t in VISITS.get(ip, []) if now - t < RATE_SEC]
-    VISITS[ip] = times
-    if len(times) >= RATE_N:
         return False
     VISITS[ip].append(now)
     return True
-# ───────────────────────────────────────────────────────────────────────────────
-# 4.  Prompt builder (trims old turns to fit context)
-# ───────────────────────────────────────────────────────────────────────────────
 def build_prompt(raw: list[dict]) -> str:
     def render(m):
-        role = m["role"]
-        if role == "system":
             return m["content"]
-        prefix = "User:" if role == "user" else "AI:"
         return f"{prefix} {m['content']}"
     system, convo = raw[0], raw[1:]
     while True:
         parts = [system["content"]] + [render(m) for m in convo] + ["AI:"]
@@ -103,27 +93,25 @@ def build_prompt(raw: list[dict]) -> str:
             return "\n".join(parts)
         convo = convo[2:]  # drop oldest user+assistant pair
-# ───────────────────────────────────────────────────────────────────────────────
-# 5.  Streaming chat callback
-# ───────────────────────────────────────────────────────────────────────────────
-def chat_fn(user_msg, chat_history, state, request: gr.Request):
     ip = request.client.host if request else "anon"
     if not allowed(ip):
-        chat_history.append((user_msg, "Rate limit exceeded — please wait a minute."))
-        return chat_history, state
     user_msg = strip(user_msg or "")
     if not user_msg:
-        return chat_history, state
     if len(user_msg) > MAX_INPUT_CH:
-        chat_history.append((user_msg, f"Input >{MAX_INPUT_CH} chars."))
-        return chat_history, state
     if MODEL_ERR:
-        chat_history.append((user_msg, MODEL_ERR))
-        return chat_history, state
-    # Append user line and placeholder for assistant
-    chat_history.append((user_msg, ""))
     state["raw"].append({"role": "user", "content": user_msg})
     prompt = build_prompt(state["raw"])
@@ -141,17 +129,25 @@ def chat_fn(user_msg, chat_history, state, request: gr.Request):
     ).start()
     partial = ""
-    for token in streamer:
-        partial += token
-        chat_history[-1] = (user_msg, partial)   # update last message
-        yield chat_history, state                # stream to UI
-    state["raw"].append({"role": "assistant", "content": strip(partial)})
-    yield chat_history, state  # final update
-# ───────────────────────────────────────────────────────────────────────────────
-# 6.  Gradio UI
-# ───────────────────────────────────────────────────────────────────────────────
 with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue")) as demo:
     gr.Markdown("### SchoolSpirit AI Chat")
     bot = gr.Chatbot(value=[("", WELCOME)], height=480, label="SchoolSpirit AI")

+# app.py  •  SchoolSpirit AI chatbot Space
+# Granite‑3.3‑2B‑Instruct  |  Streaming + rate‑limit + hallucination guard
 import os, re, time, datetime, threading, traceback, torch, gradio as gr
+from transformers import (AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer)
 from transformers.utils import logging as hf_logging
+# ───────────────────────────────── Log helper ────────────────────────────────
 os.environ["HF_HOME"] = "/data/.huggingface"
 LOG_FILE = "/data/requests.log"
 def log(msg: str):
     except FileNotFoundError:
         pass
+# ─────────────────────────────── Configuration ───────────────────────────────
 MODEL_ID          = "ibm-granite/granite-3.3-2b-instruct"
 CTX_TOKENS        = 1800
 MAX_NEW_TOKENS    = 64
 TEMP              = 0.6
 MAX_INPUT_CH      = 300
+RATE_N, RATE_SEC  = 5, 60      # 5 msgs / 60 s per IP
 SYSTEM_MSG = (
     "You are **SchoolSpirit AI**, the friendly digital mascot of "
+    "SchoolSpirit AI LLC, founded by Charles Norton in 2025. "
+    "The company installs on‑prem AI chat mascots, fine‑tunes language models, "
+    "and ships turnkey GPU servers to K‑12 schools.\n\n"
     "RULES:\n"
     "• Reply in ≤ 4 sentences unless asked for detail.\n"
     "• No personal‑data collection; no medical/legal/financial advice.\n"
     "• If you can’t answer, politely direct the user to [email protected].\n"
     "• Keep language age‑appropriate; avoid profanity, politics, mature themes."
 )
+WELCOME = "Hi there! I’m SchoolSpirit AI. Ask me about our chat mascots, fine‑tuning, or GPU servers."
 def strip(s: str) -> str:
     """Collapse each run of whitespace in *s* to one space and trim both ends.

     Args:
         s: Text to normalise; must be a ``str`` (callers pass ``""`` for None).

     Returns:
         The text with leading/trailing whitespace removed and every internal
         whitespace run (spaces, tabs, newlines) replaced by a single space.
     """
     return re.sub(r"\s+", " ", s.strip())
+# ─────────────────────── Load tokenizer & model ──────────────────────────────
 hf_logging.set_verbosity_error()
 # Load tokenizer and weights once at import time. A failure is not raised:
 # it is stored in MODEL_ERR (and logged with its traceback) so the chat
 # callback can show the message instead of crashing the app.
 try:
     log("Loading tokenizer …")
     tok = AutoTokenizer.from_pretrained(MODEL_ID)
     if not torch.cuda.is_available():
         # CPU fallback: let the checkpoint choose the dtype.
         log("No GPU → loading model on CPU (this is slower)")
         model = AutoModelForCausalLM.from_pretrained(
             MODEL_ID, device_map="cpu", torch_dtype="auto", low_cpu_mem_usage=True
         )
     else:
         # GPU path: half precision, automatic device placement.
         log("GPU detected → loading model in FP‑16")
         model = AutoModelForCausalLM.from_pretrained(
             MODEL_ID, device_map="auto", torch_dtype=torch.float16
         )
     MODEL_ERR = None
     log("Model loaded ✔")
 except Exception as exc:
     MODEL_ERR = f"Model load error: {exc}"
     log(f"❌ {MODEL_ERR}\n{traceback.format_exc()}")
+# ────────────────────────── Per‑IP rate limiter ──────────────────────────────
 VISITS: dict[str, list[float]] = {}
 def allowed(ip: str) -> bool:
     """Return True if *ip* may send another message, recording the visit if so.

     Each IP may have at most RATE_N accepted messages within the last
     RATE_SEC seconds. Rejected calls are not recorded, so they do not
     extend the wait.

     Args:
         ip: Client address used as the rate-limit key.

     Returns:
         True when the message is accepted (its timestamp is stored in
         VISITS); False when the IP already has RATE_N timestamps inside
         the window.
     """
     now = time.time()
     # Evict IPs whose timestamps have all aged out; otherwise VISITS keeps
     # one key for every address ever seen and grows without bound.
     # list(...) snapshots the items so the dict can be mutated safely.
     for seen_ip, stamps in list(VISITS.items()):
         if all(now - t >= RATE_SEC for t in stamps):
             VISITS.pop(seen_ip, None)
     # Work on a local list and store it back, so the final write never
     # depends on the key still being present in VISITS.
     recent = [t for t in VISITS.get(ip, []) if now - t < RATE_SEC]
     if len(recent) >= RATE_N:
         VISITS[ip] = recent
         return False
     recent.append(now)
     VISITS[ip] = recent
     return True
+# ─────────────────────── Prompt builder (token budget) ───────────────────────
 def build_prompt(raw: list[dict]) -> str:
     def render(m):
+        if m["role"] == "system":
             return m["content"]
+        prefix = "User:" if m["role"] == "user" else "AI:"
         return f"{prefix} {m['content']}"
     system, convo = raw[0], raw[1:]
     while True:
         parts = [system["content"]] + [render(m) for m in convo] + ["AI:"]
             return "\n".join(parts)
         convo = convo[2:]  # drop oldest user+assistant pair
+# ───────────────────────── Streaming chat callback ───────────────────────────
+def chat_fn(user_msg, chat_hist, state, request: gr.Request):
     ip = request.client.host if request else "anon"
     if not allowed(ip):
+        chat_hist.append((user_msg, "Rate limit exceeded — please wait a minute."))
+        return chat_hist, state
     user_msg = strip(user_msg or "")
     if not user_msg:
+        return chat_hist, state
     if len(user_msg) > MAX_INPUT_CH:
+        chat_hist.append((user_msg, f"Input >{MAX_INPUT_CH} chars."))
+        return chat_hist, state
     if MODEL_ERR:
+        chat_hist.append((user_msg, MODEL_ERR))
+        return chat_hist, state
+    # append user turn & empty assistant slot
+    chat_hist.append((user_msg, ""))
     state["raw"].append({"role": "user", "content": user_msg})
     prompt = build_prompt(state["raw"])
     ).start()
     partial = ""
+    try:
+        for token in streamer:
+            partial += token
+            # hallucination guard: stop if model starts new speaker tag
+            if "User:" in partial or "\nAI:" in partial:
+                partial = re.split(r"(?:\n?User:|\n?AI:)", partial)[0].strip()
+                break
+            chat_hist[-1] = (user_msg, partial)
+            yield chat_hist, state
+    except Exception as exc:
+        log("❌ Stream error:\n" + traceback.format_exc())
+        partial = "Apologies—internal error. Please try again."
+    reply = strip(partial)
+    chat_hist[-1] = (user_msg, reply)
+    state["raw"].append({"role": "assistant", "content": reply})
+    yield chat_hist, state  # final
+# ─────────────────────────── Gradio Blocks UI ────────────────────────────────
 with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue")) as demo:
     gr.Markdown("### SchoolSpirit AI Chat")
     bot = gr.Chatbot(value=[("", WELCOME)], height=480, label="SchoolSpirit AI")