karimouda committed
Commit a4624db · verified · 1 Parent(s): 8174431

Update app.py

Files changed (1)
  1. app.py +6 -12
app.py CHANGED
@@ -8,18 +8,16 @@ import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 
 DESCRIPTION = """\
-# Gemma 2 9B IT
+# SILMA Kashif 2B Instruct V1.0 Playground
 
-Gemma 2 is Google's latest iteration of open LLMs.
-This is a demo of [`google/gemma-2-9b-it`](https://huggingface.co/google/gemma-2-9b-it), fine-tuned for instruction following.
-For more details, please check [our post](https://huggingface.co/blog/gemma2).
+This is a demo of [`silma-ai/SILMA-Kashif-2B-Instruct-v1.0`](https://huggingface.co/silma-ai/SILMA-Kashif-2B-Instruct-v1.0).
 
-👉 Looking for a larger and more powerful version? Try the 27B version in [HuggingChat](https://huggingface.co/chat/models/google/gemma-2-27b-it).
+**NOTE:** this is a RAG model; it is only trained to answer questions based on context.
 """
 
 MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
-MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "10096"))
+MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
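The default input cap drops from 10096 to 4096 tokens to suit the smaller model. The guard that consumes this constant sits outside the hunk; a minimal sketch of the usual pattern in these Gradio demos, assuming a hypothetical `chat_history` list plus the `tokenizer`, `model`, and `gr` (Gradio) names defined elsewhere in app.py:

```python
# Assumed pattern, not shown in this diff: trim the prompt so it fits
# within MAX_INPUT_TOKEN_LENGTH before generation.
input_ids = tokenizer.apply_chat_template(chat_history, return_tensors="pt")
if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
    # Keep only the most recent tokens so the prompt fits the cap.
    input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
    gr.Warning(f"Trimmed input as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
input_ids = input_ids.to(model.device)
```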
 
@@ -30,7 +28,7 @@ model = AutoModelForCausalLM.from_pretrained(
     device_map="auto",
     torch_dtype=torch.bfloat16,
 )
-model.config.sliding_window = 12288
+model.config.sliding_window = 4096
 model.eval()
 
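The sliding-window override shrinks from 12288 to 4096 so that windowed attention covers no less than the new input cap. In context, a sketch of the load sequence this line belongs to; the model id is inferred from the description above (an assumption, since the diff does not show the `from_pretrained` id):

```python
# Sketch of the surrounding load code; model_id is an assumption taken
# from the demo's description.
model_id = "silma-ai/SILMA-Kashif-2B-Instruct-v1.0"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)
# Cap sliding-window attention at 4096 tokens, matching MAX_INPUT_TOKEN_LENGTH.
model.config.sliding_window = 4096
model.eval()
```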
 
@@ -56,11 +54,7 @@ def generate(
         streamer=streamer,
         max_new_tokens=max_new_tokens,
         do_sample=True,
-        top_p=top_p,
-        top_k=top_k,
-        temperature=temperature,
-        num_beams=1,
-        repetition_penalty=repetition_penalty,
+        temperature=temperature
     )
     t = Thread(target=model.generate, kwargs=generate_kwargs)
     t.start()
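Sampling is reduced to temperature-only: `top_p`, `top_k`, `num_beams`, and `repetition_penalty` are dropped. The lines around the hunk follow the standard `TextIteratorStreamer` pattern, where `model.generate` runs on a background thread and the UI consumes tokens as they arrive; a sketch of a `generate()` body under that assumption:

```python
from threading import Thread

# Assumed surroundings of the hunk: stream tokens while model.generate
# runs on a background thread.
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
generate_kwargs = dict(
    input_ids=input_ids,
    streamer=streamer,
    max_new_tokens=max_new_tokens,
    do_sample=True,
    temperature=temperature,
)
t = Thread(target=model.generate, kwargs=generate_kwargs)
t.start()

# Yield progressively longer partial outputs as new text arrives.
outputs = []
for text in streamer:
    outputs.append(text)
    yield "".join(outputs)
```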
 