Update app.py
app.py CHANGED
@@ -5,13 +5,12 @@ import gradio as gr
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
-from typing import List, Dict, Optional, Tuple

DESCRIPTION = """
-#
+# LlamaEXP
"""

-css =
+css ='''
h1 {
  text-align: center;
  display: block;
@@ -31,76 +30,37 @@ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

-model_id = "
+model_id = "prithivMLmods/Llama-Express.1"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)
-model.config.sliding_window = 4096
model.eval()

-# Set the pad token ID if it's not already set
-if tokenizer.pad_token_id is None:
-    tokenizer.pad_token_id = tokenizer.eos_token_id
-
-# Define roles for the chat
-class Role:
-    SYSTEM = "system"
-    USER = "user"
-    ASSISTANT = "assistant"
-
-# Default system message
-default_system = "You are a helpful assistant."
-
-def clear_session() -> List:
-    return "", []
-
-def modify_system_session(system: str) -> Tuple[str, str, List]:
-    if system is None or len(system) == 0:
-        system = default_system
-    return system, system, []
-
-def history_to_messages(history: List, system: str) -> List[Dict]:
-    messages = [{'role': Role.SYSTEM, 'content': system}]
-    for h in history:
-        messages.append({'role': Role.USER, 'content': h[0]})
-        messages.append({'role': Role.ASSISTANT, 'content': h[1]})
-    return messages

@spaces.GPU(duration=120)
def generate(
-
-
-    system: str,
+    message: str,
+    chat_history: list[dict],
    max_new_tokens: int = 1024,
    temperature: float = 0.6,
    top_p: float = 0.9,
    top_k: int = 50,
    repetition_penalty: float = 1.2,
) -> Iterator[str]:
-
-    query = ''
-    if history is None:
-        history = []
-
-    # Convert history to messages
-    messages = history_to_messages(history, system)
-    messages.append({'role': Role.USER, 'content': query})
+    conversation = [*chat_history, {"role": "user", "content": message}]

-
-
-
-
-
-    )
-    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
+    input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")
+    if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
+        input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
+        gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
+    input_ids = input_ids.to(model.device)

-    # Set up the streamer for real-time text generation
    streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
-
+        {"input_ids": input_ids},
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=True,
@@ -109,12 +69,10 @@ def generate(
        temperature=temperature,
        num_beams=1,
        repetition_penalty=repetition_penalty,
-        pad_token_id=tokenizer.pad_token_id,
    )
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()

-    # Stream the output tokens
    outputs = []
    for text in streamer:
        outputs.append(text)
@@ -124,7 +82,6 @@ def generate(
demo = gr.ChatInterface(
    fn=generate,
    additional_inputs=[
-        gr.Textbox(label="System Message", value=default_system, lines=2),
        gr.Slider(
            label="Max new tokens",
            minimum=1,
@@ -163,12 +120,14 @@ demo = gr.ChatInterface(
    ],
    stop_btn=None,
    examples=[
-        ["Write a Python function to reverses a string if it's length is a multiple of 4."],
-        ["
-        ["
+        ["Write a Python function to reverses a string if it's length is a multiple of 4. def reverse_string(str1): if len(str1) % 4 == 0: return ''.join(reversed(str1)) return str1 print(reverse_string('abcd')) print(reverse_string('python')) "],
+        ["Rectangle $ABCD$ is the base of pyramid $PABCD$. If $AB = 10$, $BC = 5$, $\overline{PA}\perp \text{plane } ABCD$, and $PA = 8$, then what is the volume of $PABCD$?"],
+        ["Difference between List comprehension and Lambda in Python lst = [x ** 2 for x in range (1, 11) if x % 2 == 1] print(lst)"],
        ["What happens when the sun goes down?"],
    ],
+    cache_examp
    cache_examples=False,
+    type="messages",
    description=DESCRIPTION,
    css=css,
    fill_height=True,
@@ -176,4 +135,4 @@ demo = gr.ChatInterface(


if __name__ == "__main__":
-    demo.queue(max_size=20).launch(
+    demo.queue(max_size=20).launch()
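
A minimal standalone sketch of the streaming-generation pattern the updated generate() uses (messages-format history, tokenizer.apply_chat_template, and a TextIteratorStreamer fed by a background thread), pulled out of the Gradio app so it can be run on its own. The sample history, prompt, and generation settings here are illustrative and not part of the commit.

import torch
from threading import Thread
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

model_id = "prithivMLmods/Llama-Express.1"  # model used by this Space
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.bfloat16)
model.eval()

# Messages-format history, as gr.ChatInterface(type="messages") would pass it in.
chat_history = [
    {"role": "user", "content": "Hi!"},
    {"role": "assistant", "content": "Hello! How can I help?"},
]
message = "What happens when the sun goes down?"
conversation = [*chat_history, {"role": "user", "content": message}]

# Render the chat template, then generate in a background thread while streaming tokens.
input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt").to(model.device)
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
generate_kwargs = dict(
    input_ids=input_ids,
    streamer=streamer,
    max_new_tokens=256,
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
    top_k=50,
    repetition_penalty=1.2,
)
Thread(target=model.generate, kwargs=generate_kwargs).start()

for chunk in streamer:
    print(chunk, end="", flush=True)  # partial text arrives as it is generated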
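
The commit also switches gr.ChatInterface to type="messages", so the history reaches generate() as a list of {"role": ..., "content": ...} dicts and each string the function yields updates the in-progress assistant reply. The sketch below only illustrates that wiring, with a toy echo generator standing in for the model call; it is an illustrative assumption, not code from this Space.

import gradio as gr
from collections.abc import Iterator

def generate(message: str, chat_history: list[dict], max_new_tokens: int = 1024) -> Iterator[str]:
    # chat_history arrives as [{"role": "user", "content": ...}, {"role": "assistant", ...}, ...];
    # each yielded string replaces the pending assistant message, which produces the streaming effect.
    reply = f"You said: {message} (max_new_tokens={max_new_tokens})"
    partial = ""
    for word in reply.split():
        partial += word + " "
        yield partial

demo = gr.ChatInterface(
    fn=generate,
    type="messages",
    additional_inputs=[gr.Slider(label="Max new tokens", minimum=1, maximum=2048, step=1, value=1024)],
    cache_examples=False,
)

if __name__ == "__main__":
    demo.queue(max_size=20).launch()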