import os
from typing import Dict

import dotenv
import uvicorn
from fastapi import FastAPI
from fastapi.responses import HTMLResponse, JSONResponse
from huggingface_hub import hf_hub_download, login
from llama_cpp import Llama
from pydantic import BaseModel

from UofTearsBot import UofTearsBot

# GGUF build of Mistral-7B-Instruct, served through llama.cpp.
MODEL_REPO = "bartowski/Mistral-7B-Instruct-v0.3-GGUF"
MODEL_FILE = "Mistral-7B-Instruct-v0.3-Q4_K_M.gguf"
CHAT_FORMAT = "mistral-instruct"

dotenv.load_dotenv()

# Authenticate with the Hub only when a token is configured;
# login(token=None) raises, and the repo is public anyway.
hf_token = os.getenv("HF_TOKEN")
if hf_token:
    login(token=hf_token)

# Download the quantized weights once at startup.
MODEL_PATH = hf_hub_download(
    repo_id=MODEL_REPO,
    filename=MODEL_FILE,
    local_dir="/tmp/models",
)

# Load the model; context and batch sizes are tunable via env vars.
llm = Llama(
    model_path=MODEL_PATH,
    n_ctx=int(os.getenv("N_CTX", "4096")),
    n_threads=os.cpu_count() or 4,
    n_batch=int(os.getenv("N_BATCH", "256")),
    chat_format=CHAT_FORMAT,
)

# Start the FastAPI app
app = FastAPI()

# One conversation state per user, keyed by user_id.
chatbots: Dict[str, UofTearsBot] = {}


class ChatRequest(BaseModel):
    user_id: str
    user_text: str


@app.post("/chat")
async def chat(request: ChatRequest):
    # Lazily create a bot (and its history) the first time a user_id appears.
    if request.user_id not in chatbots:
        chatbots[request.user_id] = UofTearsBot(llm)
    current_bot = chatbots[request.user_id]
    response = current_bot.converse(request.user_text)
    return JSONResponse(content={"response": response, "history": current_bot.history})


@app.get("/", response_class=HTMLResponse)
async def home():
    return "App is running 🚀"


if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)  # Hugging Face Spaces default port