from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA
import torch


class Handler:
    def __init__(self):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Using device: {self.device}")

        # Load the fine-tuned model and tokenizer
        print("Loading model and tokenizer...")
        self.model = AutoModelForCausalLM.from_pretrained("PranavKeshav/upf_code_generator").to(self.device)
        # Tokenizers are not torch modules, so they are not moved to a device
        self.tokenizer = AutoTokenizer.from_pretrained("PranavKeshav/upf_code_generator")
        # Load the FAISS index and embeddings
        print("Loading FAISS index and embeddings...")
        self.embeddings = HuggingFaceEmbeddings()
        self.vectorstore = FAISS.load_local("faiss_index", self.embeddings, allow_dangerous_deserialization=True)

        # Create the Hugging Face pipeline for text generation
        print("Creating Hugging Face pipeline...")
        # Generation parameters (sampling temperature, output length, nucleus
        # sampling, repetition penalty) are configured directly on the pipeline
        self.hf_pipeline = pipeline(
            "text-generation",
            model=self.model,
            tokenizer=self.tokenizer,
            temperature=0.7,
            max_new_tokens=2048,
            top_p=0.95,
            repetition_penalty=1.15
        )
        # Wrap the pipeline in LangChain
        self.llm = HuggingFacePipeline(pipeline=self.hf_pipeline)

        # Create the retriever and the retrieval QA chain
        self.retriever = self.vectorstore.as_retriever()
        self.qa_chain = RetrievalQA.from_chain_type(llm=self.llm, retriever=self.retriever)
    def __call__(self, request):
        # Get the prompt from the request
        prompt = request.json.get("prompt")

        # Generate UPF code using the QA chain
        response = self.qa_chain.run(prompt)

        # Return the response
        return {"response": response}
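

# Minimal local usage sketch (not part of the deployed handler).
# It assumes a request-like object whose `json` attribute is a dict with a
# "prompt" key, mirroring what Handler.__call__ reads; _FakeRequest and the
# example prompt below are illustrative assumptions only.
if __name__ == "__main__":
    class _FakeRequest:
        json = {"prompt": "Generate UPF power intent for a design with two power domains."}

    handler = Handler()
    result = handler(_FakeRequest())
    print(result["response"])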