Spaces:
Running
Running
File size: 1,688 Bytes
e3988a9 3afd34e a3d72f2 3afd34e a3d72f2 be43e8b 3afd34e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 |
import os
from fastapi import FastAPI, Query
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import torch
from retriever import retrieve_documents
# Hugging Face downloads are cached under /tmp/huggingface by passing
# cache_dir explicitly to every from_pretrained call below (no HF_HOME /
# TRANSFORMERS_CACHE environment overrides are set in this module).
_CACHE_DIR = "/tmp/huggingface"

# Hub access token read from the environment; None when the variable is unset.
_HF_TOKEN = os.getenv("HUGGING_FACE_HUB_TOKEN")

# Mistral 7B checkpoint shared by the tokenizer and the model.
MODEL_NAME = "mistralai/Mistral-7B-v0.1"

tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    use_auth_token=_HF_TOKEN,
    cache_dir=_CACHE_DIR,
)

# Causal LM weights: 4-bit loading, float16 dtype, automatic device placement.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    use_auth_token=_HF_TOKEN,
    cache_dir=_CACHE_DIR,
    device_map="auto",
    torch_dtype=torch.float16,
    load_in_4bit=True,
)

# Text-generation pipeline used by the /generate/ endpoint.
generator = pipeline(task="text-generation", model=model, tokenizer=tokenizer)

# FastAPI application object that the route decorators attach to.
app = FastAPI()
# GET / — liveness probe: returns a fixed JSON message showing the API is up.
@app.get("/")
def read_root():
    status = {"message": "Mistral 7B RAG API is running!"}
    return status
# GET /generate/?query=... — answer a user query with retrieval-augmented
# generation.
#
# Parameters:
#   query: required query-string parameter containing the user's question.
# Returns:
#   A JSON object {"query": <the query as received>, "response": <model text>}.
@app.get("/generate/")
def generate_response(query: str = Query(..., title="User Query")):
    # Retrieve relevant documents
    retrieved_docs = retrieve_documents(query)

    # Format prompt for RAG. retrieved_docs is interpolated via its str() form.
    prompt = f"Use the following information to answer:\n{retrieved_docs}\n\nUser: {query}\nAI:"

    # Generate response.
    # - max_new_tokens limits only the generated continuation. The previous
    #   max_length=256 also counted the prompt tokens, so a prompt carrying
    #   retrieved documents could use up the whole budget and leave no room
    #   for an answer.
    # - return_full_text=False makes the pipeline return just the completion
    #   instead of echoing the prompt (instructions + retrieved documents)
    #   back in the "response" field.
    outputs = generator(
        prompt,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.7,
        return_full_text=False,
    )
    answer = outputs[0]["generated_text"]

    return {"query": query, "response": answer}
|