MyNameIsSimon committed on
Commit 3981ed2
1 Parent(s): eb4277b
Files changed (2)
  1. app.py +16 -12
  2. requirements.txt +1 -5
app.py CHANGED
@@ -1,9 +1,10 @@
+import os
+os.environ["CUDA_VISIBLE_DEVICES"] = ""
+
 import gradio as gr
-from huggingface_hub import InferenceClient
-from torch.cuda import is_available
 
-from unsloth import FastLanguageModel
-from transformers import TextIteratorStreamer
+#from unsloth import FastLanguageModel
+from transformers import TextIteratorStreamer, AutoModelForCausalLM, AutoTokenizer
 from threading import Thread
 
 """
@@ -28,13 +29,16 @@ class MyModel:
         min_p,
     ):
         if model != self.current_model or self.current_model is None:
-            client, tokenizer = FastLanguageModel.from_pretrained(
-                model_name = model,
-                max_seq_length = 2048,
-                dtype = None,
-                load_in_4bit = True,
-            )
-            FastLanguageModel.for_inference(client) # Enable native 2x faster inference
+            # client, tokenizer = FastLanguageModel.from_pretrained(
+            #     model_name = model,
+            #     max_seq_length = 2048,
+            #     dtype = None,
+            #     load_in_4bit = True,
+            # )
+            # FastLanguageModel.for_inference(client) # Enable native 2x faster inference
+            tokenizer = AutoTokenizer.from_pretrained(model)
+            client = AutoModelForCausalLM.from_pretrained(model)
+
             self.client = client
             self.tokenizer = tokenizer
             self.current_model = model
@@ -56,7 +60,7 @@ class MyModel:
             tokenize = True,
             add_generation_prompt = True, # Must add for generation
             return_tensors = "pt",
-        ).to("cuda" if is_available() else "cpu")
+        )
 
         generation_kwargs = dict(input_ids=inputs, streamer=text_streamer, max_new_tokens=max_tokens, use_cache=True, temperature=temperature, min_p=min_p)
         thread = Thread(target=self.client.generate, kwargs=generation_kwargs)
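
Taken together, the new app.py loads the checkpoint with plain Transformers and streams tokens from a background thread instead of going through Unsloth. Below is a minimal sketch of that CPU-only path, not the Space's exact code: the model id, chat history, and sampling values are placeholders (the Space fills them in from the Gradio UI), and do_sample=True is added here so that temperature/min_p actually take effect.

import os
os.environ["CUDA_VISIBLE_DEVICES"] = ""  # same move as the commit: hide GPUs up front

from threading import Thread
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

model_id = "HuggingFaceTB/SmolLM2-135M-Instruct"  # placeholder; the Space passes the selected model name
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)  # no load_in_4bit, no .to("cuda"): plain CPU weights

messages = [{"role": "user", "content": "Hello!"}]  # placeholder chat history
input_ids = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,  # must add for generation
    return_tensors="pt",
)

streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
generation_kwargs = dict(
    input_ids=input_ids,
    streamer=streamer,
    max_new_tokens=128,
    use_cache=True,
    temperature=0.7,
    min_p=0.1,
    do_sample=True,  # not in the diff, but required for temperature/min_p to have any effect
)

# generate() blocks, so run it in a worker thread and consume the streamer as tokens arrive
thread = Thread(target=model.generate, kwargs=generation_kwargs)
thread.start()
for chunk in streamer:
    print(chunk, end="", flush=True)
thread.join()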
requirements.txt CHANGED
@@ -1,6 +1,2 @@
 huggingface_hub==0.25.2
-unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git
-trl
-peft
-accelerate
-bitsandbytes
+transformers>=4.45.1
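
One detail worth noting about the new app.py: CUDA_VISIBLE_DEVICES is blanked at the very top of the file, before torch or transformers are imported. The order matters, because the CUDA runtime reads that variable when it is first initialized, so the safe convention is to set it before anything CUDA-related is imported or called. A tiny illustration, assuming torch is available in the Space's runtime (Transformers needs it for the model path above):

import os
os.environ["CUDA_VISIBLE_DEVICES"] = ""  # must happen before torch initializes CUDA

import torch
print(torch.cuda.is_available())  # False: generation now runs on CPU only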