MyNameIsSimon committed on
Commit 5188dae · 1 Parent(s): c86c596

try llama cpp

Files changed (2)
  1. app.py +21 -33
  2. requirements.txt +1 -5
app.py CHANGED
@@ -1,12 +1,7 @@
-import os
-os.environ["CUDA_VISIBLE_DEVICES"] = ""
-
 import gradio as gr

-#from unsloth import FastLanguageModel
-from peft import AutoPeftModelForCausalLM
-from transformers import TextIteratorStreamer, AutoTokenizer
-from threading import Thread
+from llama_cpp import Llama
+from llama_cpp.llama_chat_format import MoondreamChatHandler

 """
 For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
@@ -17,7 +12,6 @@ class MyModel:
     def __init__(self):
         self.client = None
         self.current_model = ""
-        self.tokenizer = None

     def respond(
         self,
@@ -30,21 +24,18 @@ class MyModel:
         min_p,
     ):
         if model != self.current_model or self.current_model is None:
-            # client, tokenizer = FastLanguageModel.from_pretrained(
-            #     model_name = model,
-            #     max_seq_length = 2048,
-            #     dtype = None,
-            #     load_in_4bit = True,
-            # )
-            # FastLanguageModel.for_inference(client) # Enable native 2x faster inference
-            tokenizer = AutoTokenizer.from_pretrained(model)
-            client = AutoPeftModelForCausalLM.from_pretrained(model, load_in_4bit=True)
+            chat_handler = MoondreamChatHandler.from_pretrained(
+                repo_id="lab2-as/lora_model_gguf",
+            )
+            client = Llama.from_pretrained(
+                repo_id="lab2-as/lora_model_gguf",
+                chat_handler=chat_handler,
+                n_ctx=2048, # n_ctx should be increased to accommodate the image embedding
+            )

             self.client = client
-            self.tokenizer = tokenizer
             self.current_model = model

-        text_streamer = TextIteratorStreamer(self.tokenizer, skip_prompt = True)

         messages = [{"role": "system", "content": system_message}]

@@ -56,22 +47,19 @@ class MyModel:

         messages.append({"role": "user", "content": message})

-        inputs = self.tokenizer.apply_chat_template(
-            messages,
-            tokenize = True,
-            add_generation_prompt = True, # Must add for generation
-            return_tensors = "pt",
-        )
-
-        generation_kwargs = dict(input_ids=inputs, streamer=text_streamer, max_new_tokens=max_tokens, use_cache=True, temperature=temperature, min_p=min_p)
-        thread = Thread(target=self.client.generate, kwargs=generation_kwargs)
-        thread.start()
-
         response = ""

-        for new_text in text_streamer:
-            response += new_text
-            yield response.strip("<|eot_id|>")
+        for message in self.client.create_chat_completion(
+            messages,
+            temperature=temperature,
+            top_p=min_p,
+            stream=True,
+            max_tokens=max_tokens
+        ):
+            delta = message["choices"][0]["delta"]
+            if "content" in delta:
+                response += delta["content"]
+            yield response

         # for message in client.chat_completion(
         #     messages,
requirements.txt CHANGED
@@ -1,6 +1,2 @@
 huggingface_hub==0.25.2
-transformers>=4.45.1
-accelerate
-peft
-torch
-#https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_multi-backend-refactor/bitsandbytes-0.44.1.dev0-py3-none-manylinux_2_24_x86_64.whl
+llama-cpp-python
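The new app.py path loads a GGUF checkpoint from the Hub with `Llama.from_pretrained` and streams tokens via `create_chat_completion(stream=True)`. Below is a minimal standalone sketch of that pattern, assuming the `lab2-as/lora_model_gguf` repo contains a GGUF file matching the glob; the `filename` pattern, sampling values, and prompt are illustrative and not part of the commit.

```python
from llama_cpp import Llama

# Sketch only: download a GGUF file from the Hub and stream a chat completion,
# mirroring the loop in MyModel.respond(). The filename glob is an assumption.
llm = Llama.from_pretrained(
    repo_id="lab2-as/lora_model_gguf",
    filename="*.gguf",  # assumed: select the GGUF file shipped in the repo
    n_ctx=2048,
    verbose=False,
)

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello!"},
]

response = ""
# stream=True yields OpenAI-style chunks; text arrives in choices[0]["delta"]["content"]
for chunk in llm.create_chat_completion(messages, stream=True, max_tokens=64):
    delta = chunk["choices"][0]["delta"]
    if "content" in delta:
        response += delta["content"]
print(response)
```

Streaming keeps the Gradio callback responsive: respond() accumulates each delta into `response` and yields the growing string back to the UI instead of blocking until generation finishes.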