import os
from typing import Dict

import dotenv
import uvicorn
from fastapi import FastAPI
from fastapi.responses import HTMLResponse, JSONResponse
from huggingface_hub import hf_hub_download, login
from llama_cpp import Llama
from pydantic import BaseModel

from UofTearsBot import UofTearsBot

# GGUF build of Mistral-7B-Instruct, served through llama.cpp.
MODEL_REPO = "bartowski/Mistral-7B-Instruct-v0.3-GGUF"
MODEL_FILE = "Mistral-7B-Instruct-v0.3-Q4_K_M.gguf"
CHAT_FORMAT = "mistral-instruct"

dotenv.load_dotenv()

# Authenticate with the Hub only when a token is configured;
# login(token=None) raises, and the repo is public anyway.
hf_token = os.getenv("HF_TOKEN")
if hf_token:
    login(token=hf_token)

# Download the quantized weights once at startup.
MODEL_PATH = hf_hub_download(
    repo_id=MODEL_REPO,
    filename=MODEL_FILE,
    local_dir="/tmp/models",
)

# Load the model; context and batch sizes are tunable via env vars.
llm = Llama(
    model_path=MODEL_PATH,
    n_ctx=int(os.getenv("N_CTX", "4096")),
    n_threads=os.cpu_count() or 4,
    n_batch=int(os.getenv("N_BATCH", "256")),
    chat_format=CHAT_FORMAT,
)

# Start the FastAPI app
app = FastAPI()

# One conversation state per user, keyed by user_id.
chatbots: Dict[str, UofTearsBot] = {}


class ChatRequest(BaseModel):
    user_id: str
    user_text: str


@app.post("/chat")
async def chat(request: ChatRequest):
    # Lazily create a bot (and its history) the first time a user_id appears.
    if request.user_id not in chatbots:
        chatbots[request.user_id] = UofTearsBot(llm)
    current_bot = chatbots[request.user_id]
    response = current_bot.converse(request.user_text)
    return JSONResponse(content={"response": response, "history": current_bot.history})


@app.get("/", response_class=HTMLResponse)
async def home():
    return "App is running 🚀"


if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)  # Hugging Face Spaces default port