import os
from typing import List, Dict
import logging
import dotenv
import torch
from fastapi import FastAPI, HTTPException, Request
from fastapi.responses import JSONResponse, HTMLResponse
from pydantic import BaseModel
from llama_cpp import Llama
from huggingface_hub import hf_hub_download, login
import uvicorn
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AutoModelForCausalLM,
    pipeline
)
from UofTearsBot import UofTearsBot
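
# GGUF build of Mistral-7B-Instruct to pull from the Hugging Face Hub, and the
# matching llama.cpp chat template.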
MODEL_REPO = "bartowski/Mistral-7B-Instruct-v0.3-GGUF"
MODEL_FILE = "Mistral-7B-Instruct-v0.3-Q4_K_M.gguf"
CHAT_FORMAT = "mistral-instruct"
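
# Read HF_TOKEN from the environment/.env file and authenticate with the Hub.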
dotenv.load_dotenv()
login(token=os.getenv("HF_TOKEN"))
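
# Download the quantized model weights to local disk.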
MODEL_PATH = hf_hub_download(
    repo_id=MODEL_REPO,
    filename=MODEL_FILE,
    local_dir="/tmp/models",
    local_dir_use_symlinks=False,
)
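
# Load the model with llama.cpp; the context window and batch size can be tuned
# through the N_CTX and N_BATCH environment variables.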
llm = Llama(
    model_path=MODEL_PATH,
    n_ctx=int(os.getenv("N_CTX", "4096")),
    n_threads=os.cpu_count() or 4,
    n_batch=int(os.getenv("N_BATCH", "256")),
    chat_format=CHAT_FORMAT,
)
# Create the FastAPI app
app = FastAPI()
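
# One in-memory UofTearsBot per user_id, so each user keeps their own history.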
chatbots: Dict[str, UofTearsBot] = {}
class ChatRequest(BaseModel):
    user_id: str
    user_text: str
@app.post("/chat")
async def chat(request: ChatRequest):
if request.user_id not in chatbots:
chatbots[request.user_id] = UofTearsBot(llm)
current_bot = chatbots[request.user_id]
response = current_bot.converse(request.user_text)
return JSONResponse(content={"response": response, "history": current_bot.history})
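
# Example request against the /chat endpoint (assuming the server is running
# locally on the default port):
#   curl -X POST http://localhost:7860/chat \
#        -H "Content-Type: application/json" \
#        -d '{"user_id": "demo", "user_text": "Hello!"}'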
@app.get("/", response_class=HTMLResponse)
async def home():
return "<h1>App is running π</h1>"
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)  # default port for Hugging Face Spaces