karimouda committed
Commit a4624db · verified · 1 Parent(s): 8174431

Update app.py

Files changed (1)
  1. app.py +6 -12
app.py CHANGED
@@ -8,18 +8,16 @@ import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 
 DESCRIPTION = """\
-# Gemma 2 9B IT
+# SILMA Kashif 2B Instruct V1.0 Playground
 
-Gemma 2 is Google's latest iteration of open LLMs.
-This is a demo of [`google/gemma-2-9b-it`](https://huggingface.co/google/gemma-2-9b-it), fine-tuned for instruction following.
-For more details, please check [our post](https://huggingface.co/blog/gemma2).
+This is a demo of [`silma-ai/SILMA-Kashif-2B-Instruct-v1.0`](https://huggingface.co/silma-ai/SILMA-Kashif-2B-Instruct-v1.0).
 
-👉 Looking for a larger and more powerful version? Try the 27B version in [HuggingChat](https://huggingface.co/chat/models/google/gemma-2-27b-it).
+**NOTE:** this is a RAG model; it is only trained to answer questions based on context.
 """
 
 MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
-MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "10096"))
+MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
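The default input cap drops from 10096 to 4096 tokens to suit the smaller model. The guard that consumes this constant sits outside the hunk; a minimal sketch of the usual pattern in these Gradio demos, assuming a hypothetical `chat_history` list plus the `tokenizer`, `model`, and `gr` (Gradio) names defined elsewhere in app.py:

```python
# Assumed pattern, not shown in this diff: trim the prompt so it fits
# within MAX_INPUT_TOKEN_LENGTH before generation.
input_ids = tokenizer.apply_chat_template(chat_history, return_tensors="pt")
if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
    # Keep only the most recent tokens so the prompt fits the cap.
    input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
    gr.Warning(f"Trimmed input as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
input_ids = input_ids.to(model.device)
```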
 
@@ -30,7 +28,7 @@ model = AutoModelForCausalLM.from_pretrained(
     device_map="auto",
     torch_dtype=torch.bfloat16,
 )
-model.config.sliding_window = 12288
+model.config.sliding_window = 4096
 model.eval()
 
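The sliding-window override shrinks from 12288 to 4096 so that windowed attention covers no less than the new input cap. In context, a sketch of the load sequence this line belongs to; the model id is inferred from the description above (an assumption, since the diff does not show the `from_pretrained` id):

```python
# Sketch of the surrounding load code; model_id is an assumption taken
# from the demo's description.
model_id = "silma-ai/SILMA-Kashif-2B-Instruct-v1.0"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)
# Cap sliding-window attention at 4096 tokens, matching MAX_INPUT_TOKEN_LENGTH.
model.config.sliding_window = 4096
model.eval()
```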
 
@@ -56,11 +54,7 @@ def generate(
         streamer=streamer,
         max_new_tokens=max_new_tokens,
         do_sample=True,
-        top_p=top_p,
-        top_k=top_k,
-        temperature=temperature,
-        num_beams=1,
-        repetition_penalty=repetition_penalty,
+        temperature=temperature
     )
     t = Thread(target=model.generate, kwargs=generate_kwargs)
     t.start()
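Sampling is reduced to temperature-only: `top_p`, `top_k`, `num_beams`, and `repetition_penalty` are dropped. The lines around the hunk follow the standard `TextIteratorStreamer` pattern, where `model.generate` runs on a background thread and the UI consumes tokens as they arrive; a sketch of a `generate()` body under that assumption:

```python
from threading import Thread

# Assumed surroundings of the hunk: stream tokens while model.generate
# runs on a background thread.
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
generate_kwargs = dict(
    input_ids=input_ids,
    streamer=streamer,
    max_new_tokens=max_new_tokens,
    do_sample=True,
    temperature=temperature,
)
t = Thread(target=model.generate, kwargs=generate_kwargs)
t.start()

# Yield progressively longer partial outputs as new text arrives.
outputs = []
for text in streamer:
    outputs.append(text)
    yield "".join(outputs)
```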
 