betterdataai committed 39860ea · verified · 1 parent: d31e35c

Upload 2 files

Files changed (2):
  1. handler.py +86 -0
  2. requirements.txt +4 -3
handler.py ADDED
@@ -0,0 +1,86 @@
+ import os
+ import json
+ import torch
+ from peft import PeftModel
+ from transformers import (
+     AutoTokenizer,
+     GenerationConfig,
+     LlamaForCausalLM,
+ )
+
+ class EndpointHandler:
+     def __init__(self, model_dir: str = ".", **kwargs):
+         """
+         Runs once when the Endpoint first starts.
+         - model_dir is the local directory of *this* repository,
+           which contains your LoRA adapter weights (e.g. adapter_model.safetensors).
+         """
+
+         # 1) Base model from Hugging Face.
+         # Use the EXACT base you trained on, or it won't match your LoRA.
+         self.base_model_id = "unsloth/Llama-3.2-3B-Instruct"
+
+         # If your base model is gated/private, you'll need a token:
+         # hf_token = os.getenv("HF_TOKEN", None)
+
+         # 2) Load the tokenizer (AutoTokenizer: Llama 3.x ships a fast tokenizer, not SentencePiece)
+         self.tokenizer = AutoTokenizer.from_pretrained(
+             self.base_model_id,
+             trust_remote_code=True,
+             # token=hf_token,  # if needed
+         )
+
+         # 3) Load the base model
+         self.base_model = LlamaForCausalLM.from_pretrained(
+             self.base_model_id,
+             device_map="auto",          # or "cuda:0"
+             torch_dtype=torch.float16,  # or torch.bfloat16
+             trust_remote_code=True,
+             # token=hf_token,  # if needed
+         )
+
+         # 4) Attach your LoRA adapter to the base model
+         self.model = PeftModel.from_pretrained(
+             self.base_model,
+             model_dir,  # the local directory of this repo
+             torch_dtype=torch.float16,
+         ).eval()
+
+     def __call__(self, data):
+         """
+         Called for every request to the endpoint.
+         `data` is a dictionary (or JSON string) containing the user inputs.
+         Returns a dictionary (serialized as JSON by the endpoint).
+         """
+         # If data arrives as a JSON string, parse it first:
+         if isinstance(data, str):
+             data = json.loads(data)
+
+         # Extract the user prompt from the request payload
+         prompt = data.get("inputs", "")
+         if not isinstance(prompt, str):
+             raise ValueError("`inputs` must be a string.")
+
+         # Optionally extract generation parameters (max_new_tokens, temperature, ...);
+         # fall back to defaults if none are provided:
+         gen_params = data.get("parameters", {})
+         generation_config = GenerationConfig(
+             max_new_tokens=gen_params.get("max_new_tokens", 128),
+             temperature=gen_params.get("temperature", 0.7),
+             top_p=gen_params.get("top_p", 0.9),
+             do_sample=gen_params.get("do_sample", True),
+             # any other GenerationConfig fields can be forwarded the same way
+         )
+
+         # Tokenize the prompt and move it to the model's device
+         inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
+
+         # Generate text
+         with torch.no_grad():
+             output_ids = self.model.generate(**inputs, generation_config=generation_config)
+
+         # Decode the output (prompt + completion)
+         output_text = self.tokenizer.decode(output_ids[0], skip_special_tokens=True)
+
+         # Return the generated text in a JSON-friendly format
+         return {"generated_text": output_text}
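
For reference, the handler can be exercised locally before deploying. A minimal sketch, assuming this repo (with the adapter weights) is the current working directory and a GPU is available; the prompt and parameters are illustrative:

# Hypothetical local smoke test for handler.py above.
from handler import EndpointHandler

handler = EndpointHandler(model_dir=".")
response = handler({
    "inputs": "Explain LoRA fine-tuning in one sentence.",
    "parameters": {"max_new_tokens": 64, "temperature": 0.2},
})
print(response["generated_text"])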
requirements.txt CHANGED
@@ -1,8 +1,9 @@
  unsloth
- transformers
  pandas
  datasets
  trl
- torch
- accelerate
  scipy
+ transformers>=4.30.0
+ peft>=0.4.0
+ accelerate>=0.20.0
+ torch>=2.0
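
Once deployed, the Endpoint accepts the same {"inputs": ..., "parameters": ...} payload over HTTPS. A sketch using the requests library; the endpoint URL below is a placeholder for your own deployment, and HF_TOKEN is assumed to be set in the environment:

import os
import requests

# Placeholder URL; substitute the URL shown on your Endpoint's overview page.
ENDPOINT_URL = "https://<your-endpoint>.endpoints.huggingface.cloud"
headers = {"Authorization": f"Bearer {os.environ['HF_TOKEN']}"}

payload = {
    "inputs": "Explain LoRA fine-tuning in one sentence.",
    "parameters": {"max_new_tokens": 64, "temperature": 0.2},
}
resp = requests.post(ENDPOINT_URL, headers=headers, json=payload)
resp.raise_for_status()
print(resp.json()["generated_text"])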