halme committed
Commit de8a86b · 1 Parent(s): 50cd3df

Added code for inference using our model

Files changed (2)
  1. .gitignore +1 -0
  2. app.py +31 -17
.gitignore ADDED
@@ -0,0 +1 @@
+/venv
app.py CHANGED
@@ -1,21 +1,15 @@
 import gradio as gr
 from huggingface_hub import InferenceClient
+from unsloth import FastLanguageModel
 
 """
 For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
 """
 #client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
-client = InferenceClient("halme/id2223_lora_model")
+#client = InferenceClient("halme/id2223_lora_model")
 
 
-def respond(
-    message,
-    history: list[tuple[str, str]],
-    system_message,
-    max_tokens,
-    temperature,
-    top_p,
-):
+def respond(message, history: list[tuple[str, str]], system_message, max_tokens, temperature, top_p,):
     messages = [{"role": "system", "content": system_message}]
 
     for val in history:
@@ -28,17 +22,37 @@ def respond(
 
     response = ""
 
-    for message in client.chat_completion(
-        messages,
-        max_tokens=max_tokens,
-        stream=True,
-        temperature=temperature,
-        top_p=top_p,
-    ):
+    """ for message in client.chat_completion(messages, max_tokens=max_tokens, stream=True, temperature=temperature, top_p=top_p):
         token = message.choices[0].delta.content
 
         response += token
-        yield response
+        yield response """
+
+    model, tokenizer = FastLanguageModel.from_pretrained(
+        model_name = "halme/id2223_lora_model", # YOUR MODEL YOU USED FOR TRAINING
+        max_seq_length = max_tokens,
+        dtype = None,
+        load_in_4bit = True,
+    )
+
+    FastLanguageModel.for_inference(model) # Enable native 2x faster inference
+
+    """messages = [
+        {"role": "user", "content": "Continue the fibonnaci sequence: 1, 1, 2, 3, 5, 8,"},
+    ] """
+
+    inputs = tokenizer.apply_chat_template(
+        messages,
+        tokenize = True,
+        add_generation_prompt = True, # Must add for generation
+        return_tensors = "pt",
+    ).to("cuda")
+
+    from transformers import TextStreamer
+    text_streamer = TextStreamer(tokenizer, skip_prompt = True)
+
+    yield model.generate(input_ids = inputs, streamer = text_streamer, max_new_tokens = 128,
+        use_cache = True, temperature = 1.5, min_p = 0.1)
 
 
 """