from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA
import torch


class Handler:
    def __init__(self):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Using device: {self.device}")

        # Load the fine-tuned model and tokenizer.
        # Only the model is moved to the device; tokenizers are not torch modules
        # and have no .to() method.
        print("Loading model and tokenizer...")
        self.model = AutoModelForCausalLM.from_pretrained(
            "PranavKeshav/upf_code_generator"
        ).to(self.device)
        self.tokenizer = AutoTokenizer.from_pretrained("PranavKeshav/upf_code_generator")

        # Load the FAISS index and embeddings
        print("Loading FAISS index and embeddings...")
        self.embeddings = HuggingFaceEmbeddings()
        self.vectorstore = FAISS.load_local(
            "faiss_index", self.embeddings, allow_dangerous_deserialization=True
        )

        # Create the Hugging Face pipeline for text generation.
        # The sampling parameters (temperature, top_p, repetition_penalty) and the
        # generation budget (max_new_tokens=2048) are configured once here and
        # reused for every request, so no custom generate() override is needed.
        print("Creating Hugging Face pipeline...")
        self.hf_pipeline = pipeline(
            "text-generation",
            model=self.model,
            tokenizer=self.tokenizer,
            temperature=0.7,
            max_new_tokens=2048,
            top_p=0.95,
            repetition_penalty=1.15,
        )

        # Wrap the pipeline in LangChain
        self.llm = HuggingFacePipeline(pipeline=self.hf_pipeline)

        # Create the retriever and the retrieval-augmented QA chain
        self.retriever = self.vectorstore.as_retriever()
        self.qa_chain = RetrievalQA.from_chain_type(llm=self.llm, retriever=self.retriever)

    def __call__(self, request):
        # Get the prompt from the request body
        prompt = request.json.get("prompt")

        # Generate UPF code using the QA chain
        response = self.qa_chain.run(prompt)

        # Return the response
        return {"response": response}
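

# --- Illustrative usage sketch (not part of the handler) ---
# The serving framework that hosts Handler is not shown in this module, so the
# request object below is a hypothetical stand-in: it only assumes that
# `request.json` behaves like a dict, which is all __call__ relies on.
if __name__ == "__main__":
    class _FakeRequest:
        """Minimal stand-in for a framework request carrying a JSON body."""

        def __init__(self, payload):
            self.json = payload

    handler = Handler()
    result = handler(
        _FakeRequest({"prompt": "Generate UPF for a design with two power domains."})
    )
    print(result["response"])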