from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA

import torch


class Handler:
    def __init__(self):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Using device: {self.device}")
print("Loading model and tokenizer...") |
|
|
self.model = AutoModelForCausalLM.from_pretrained("PranavKeshav/upf_code_generator").to(self.device) |
|
|
self.tokenizer = AutoTokenizer.from_pretrained("PranavKeshav/upf_code_generator").to(self.device) |

        print("Loading FAISS index and embeddings...")
        # The embedding model must match the one used when the local faiss_index was built.
        self.embeddings = HuggingFaceEmbeddings()
        self.vectorstore = FAISS.load_local("faiss_index", self.embeddings, allow_dangerous_deserialization=True)

        print("Creating Hugging Face pipeline...")

        # Keep a handle on the original generate so the wrapper below does not
        # call itself recursively once it is patched onto the model.
        original_generate = self.model.generate

        def run_inference(*args, **kwargs):
            # Apply the tuned sampling parameters unless the caller overrides them.
            kwargs.setdefault("do_sample", True)
            kwargs.setdefault("temperature", 0.7)
            kwargs.setdefault("max_new_tokens", 2048)
            kwargs.setdefault("top_p", 0.95)
            kwargs.setdefault("repetition_penalty", 1.15)
            return original_generate(*args, **kwargs)

        self.hf_pipeline = pipeline(
            "text-generation",
            model=self.model,
            tokenizer=self.tokenizer,
            do_sample=True,
            temperature=0.7,
            max_new_tokens=2048,
            top_p=0.95,
            repetition_penalty=1.15,
        )

        self.hf_pipeline.model.generate = run_inference

        self.llm = HuggingFacePipeline(pipeline=self.hf_pipeline)

        self.retriever = self.vectorstore.as_retriever()
        self.qa_chain = RetrievalQA.from_chain_type(llm=self.llm, retriever=self.retriever)

    def __call__(self, request):
        # Expects a JSON body of the form {"prompt": "..."}.
        prompt = request.json.get("prompt")

        # Retrieve relevant context from the FAISS index and generate an answer.
        response = self.qa_chain.run(prompt)

        return {"response": response}