Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -8,11 +8,11 @@ import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

 DESCRIPTION = """\
-#
-
-This is a demo of [`
-
-
+# Gemma 2 9B IT
+Gemma 2 is Google's latest iteration of open LLMs.
+This is a demo of [`google/gemma-2-9b-it`](https://huggingface.co/google/gemma-2-9b-it), fine-tuned for instruction following.
+For more details, please check [our post](https://huggingface.co/blog/gemma2).
+👉 Looking for a larger and more powerful version? Try the 27B version in [HuggingChat](https://huggingface.co/chat/models/google/gemma-2-27b-it).
 """

 MAX_MAX_NEW_TOKENS = 2048
@@ -21,7 +21,7 @@ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))

 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

-model_id = "
+model_id = "google/gemma-2-9b-it"
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 model = AutoModelForCausalLM.from_pretrained(
     model_id,
@@ -37,7 +37,10 @@ def generate(
     message: str,
     chat_history: list[dict],
     max_new_tokens: int = 1024,
-    temperature: float = 0.
+    temperature: float = 0.6,
+    top_p: float = 0.9,
+    top_k: int = 50,
+    repetition_penalty: float = 1.2,
 ) -> Iterator[str]:
     conversation = chat_history.copy()
     conversation.append({"role": "user", "content": message})
@@ -47,21 +50,24 @@ def generate(
         input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
         gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
     input_ids = input_ids.to(model.device)
-
+
     streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
     generate_kwargs = dict(
         {"input_ids": input_ids},
         streamer=streamer,
         max_new_tokens=max_new_tokens,
         do_sample=True,
-
+        top_p=top_p,
+        top_k=top_k,
+        temperature=temperature,
+        num_beams=1,
+        repetition_penalty=repetition_penalty,
     )
     t = Thread(target=model.generate, kwargs=generate_kwargs)
     t.start()
-
+
     outputs = []
     for text in streamer:
-        print(text)
         outputs.append(text)
         yield "".join(outputs)

@@ -108,6 +114,10 @@ demo = gr.ChatInterface(
     stop_btn=None,
     examples=[
         ["Hello there! How are you doing?"],
+        ["Can you explain briefly to me what is the Python programming language?"],
+        ["Explain the plot of Cinderella in a sentence."],
+        ["How many hours does it take a man to eat a Helicopter?"],
+        ["Write a 100-word article on 'Benefits of Open-Source in AI research'"],
     ],
     cache_examples=False,
     type="messages",
@@ -118,4 +128,4 @@ demo = gr.ChatInterface(


 if __name__ == "__main__":
-    demo.queue(max_size=20).launch()
+    demo.queue(max_size=20).launch()
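Taken together, the change points the Space at google/gemma-2-9b-it and wires four new sampling defaults (temperature 0.6, top_p 0.9, top_k 50, repetition_penalty 1.2) into the threaded TextIteratorStreamer pattern the app already uses. Below is a minimal standalone sketch of that pattern: the model id and sampling values come from the diff, while the prompt, dtype, and device_map settings are illustrative assumptions rather than the Space's exact configuration.

# Standalone sketch of the streaming pattern used above (not the Space's full app.py).
# Assumes a GPU with enough memory and the `accelerate` package for device_map="auto".
from threading import Thread

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

model_id = "google/gemma-2-9b-it"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.bfloat16)

# Build a chat-formatted prompt (illustrative single-turn conversation).
conversation = [{"role": "user", "content": "Hello there! How are you doing?"}]
input_ids = tokenizer.apply_chat_template(
    conversation, add_generation_prompt=True, return_tensors="pt"
).to(model.device)

# Run generate() in a background thread and stream decoded text as it is produced.
streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
generate_kwargs = dict(
    input_ids=input_ids,
    streamer=streamer,
    max_new_tokens=1024,
    do_sample=True,
    temperature=0.6,          # new default from this commit
    top_p=0.9,                # new default from this commit
    top_k=50,                 # new default from this commit
    repetition_penalty=1.2,   # new default from this commit
    num_beams=1,
)
Thread(target=model.generate, kwargs=generate_kwargs).start()

for text in streamer:
    print(text, end="", flush=True)

The background thread is what lets the Gradio callback yield partial outputs while generation is still running; iterating over the streamer in the main thread plays the same role here that the `for text in streamer:` loop plays in generate() above.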