CodyBontecou committed
Commit 5b6527d · 1 Parent(s): b55cbe6

initial commit

Files changed (3)
  1. .gitignore +1 -0
  2. README.md +50 -0
  3. handler.py +67 -0
.gitignore ADDED
@@ -0,0 +1 @@
+ .env
README.md ADDED
@@ -0,0 +1,50 @@
+ ---
+ language:
+ - en
+ tags:
+ - text-generation
+ - llama
+ - instruct
+ license: apache-2.0
+ pipeline_tag: text-generation
+ ---
+
+ # LLaDA-8B-Instruct Model
+
+ This is the LLaDA-8B-Instruct model deployed as a Hugging Face inference endpoint.
+
+ ## Model Details
+
+ LLaDA-8B-Instruct is a language model designed for instruction-following tasks.
+
+ ## Usage
+
+ This model can be used for text generation tasks. Here's an example:
+
+ ```python
+ import requests
+
+ API_URL = "https://YOUR-ENDPOINT-URL"
+ headers = {"Authorization": "Bearer YOUR_HF_TOKEN"}
+
+ def query(payload):
+     response = requests.post(API_URL, headers=headers, json=payload)
+     return response.json()
+
+ output = query({
+     "inputs": "Write a short story about a robot learning to paint:",
+     "parameters": {
+         "max_new_tokens": 250,
+         "temperature": 0.7,
+         "top_p": 0.95,
+         "do_sample": True
+     }
+ })
+ ```
+
+ ## API Inference Configuration
+
+ ```yaml
+ api_inference:
+   handler_class: handler.EndpointHandler
+ ```
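One detail worth flagging in the README example above: the handler added in this commit returns `{"generated_text": [...]}`, a dict holding one string per input, rather than the `[{"generated_text": ...}]` list-of-dicts shape many Hugging Face clients expect. A minimal sketch of consuming the response, reusing the `query` helper from the README snippet above (the prompt is illustrative):

```python
# Illustrative only; reuses query() from the README example above.
result = query({
    "inputs": "Write a short story about a robot learning to paint:",
    "parameters": {"max_new_tokens": 250, "temperature": 0.7},
})

# handler.py returns {"generated_text": [<one decoded string per input>]};
# each decoded string includes the original prompt, since the handler
# decodes the full output sequences.
for text in result.get("generated_text", []):
    print(text)
```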
handler.py ADDED
@@ -0,0 +1,67 @@
+ from typing import Dict, List, Any
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+ import torch
+
+
+ class EndpointHandler:
+     def __init__(self, path=""):
+         # Load model with half precision to save memory
+         self.model = AutoModelForCausalLM.from_pretrained(
+             path, torch_dtype=torch.float16, device_map="auto"
+         )
+
+         # Load tokenizer
+         self.tokenizer = AutoTokenizer.from_pretrained(path)
+
+         # Ensure pad token is properly set
+         if self.tokenizer.pad_token_id is None:
+             if (
+                 hasattr(self.tokenizer, "eos_token_id")
+                 and self.tokenizer.eos_token_id is not None
+             ):
+                 self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
+                 self.tokenizer.pad_token = self.tokenizer.eos_token
+             else:
+                 # Fallback to a common pad token
+                 self.tokenizer.pad_token_id = 0
+                 self.tokenizer.pad_token = self.tokenizer.convert_ids_to_tokens(0)
+
+         print(f"Model loaded successfully. Pad token ID: {self.tokenizer.pad_token_id}")
+
+     def __call__(self, data: Dict[str, Any]) -> Dict[str, List[Any]]:
+         """Handle inference requests"""
+         # Extract inputs and parameters from request data
+         inputs = data.pop("inputs", data)
+         parameters = data.pop("parameters", {})
+
+         # Convert single string input to list for consistent handling
+         if isinstance(inputs, str):
+             inputs = [inputs]
+
+         # Extract generation parameters with sensible defaults
+         max_new_tokens = parameters.get("max_new_tokens", 256)
+         temperature = parameters.get("temperature", 0.7)
+         top_p = parameters.get("top_p", 0.95)
+         do_sample = parameters.get("do_sample", True)
+
+         # Tokenize inputs
+         input_tokens = self.tokenizer(inputs, return_tensors="pt", padding=True).to(
+             self.model.device
+         )
+
+         # Generate text
+         with torch.no_grad():
+             outputs = self.model.generate(
+                 **input_tokens,
+                 max_new_tokens=max_new_tokens,
+                 temperature=temperature,
+                 top_p=top_p,
+                 do_sample=do_sample,
+                 pad_token_id=self.tokenizer.pad_token_id,
+             )
+
+         # Decode generated text
+         generated_texts = self.tokenizer.batch_decode(outputs, skip_special_tokens=True)
+
+         # Return results in expected format
+         return {"generated_text": generated_texts}
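Not part of the commit, but the handler can be smoke-tested outside the endpoint runtime before deploying. A minimal sketch, assuming the model weights and tokenizer files live in the repository root (the `path` that Inference Endpoints pass to `EndpointHandler.__init__`); the file name and prompt are hypothetical:

```python
# local_test.py (hypothetical): quick sanity check for handler.py.
from handler import EndpointHandler

# Assumption: model weights and tokenizer files sit next to handler.py,
# mirroring what the Inference Endpoints runtime provides as `path`.
handler = EndpointHandler(path=".")

response = handler({
    "inputs": "Write a haiku about a robot learning to paint.",
    "parameters": {"max_new_tokens": 64, "temperature": 0.7, "do_sample": True},
})
print(response["generated_text"][0])
```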