Spaces:
Runtime error
xlr8 committed · Commit 672d8c3
1 parent: 81ae02e

initial commit
Browse files
- app.py: +120 -0
- requirements.txt: +2 -0
app.py
ADDED
@@ -0,0 +1,120 @@
import os
import gradio as gr
from openai import OpenAI
import jinja2
from transformers import AutoTokenizer

# Initialize the OpenAI client
client = OpenAI(
    base_url="https://api.hyperbolic.xyz/v1",
    api_key=os.environ["HYPERBOLIC_API_KEY"],
)

# the tokenizer complains later after gradio forks without this setting.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# use unofficial copy of Llama to avoid access restrictions.
tokenizer = AutoTokenizer.from_pretrained("mlabonne/Meta-Llama-3.1-8B-Instruct-abliterated")
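# NOTE: the 8B Instruct tokenizer loaded above is used only to count tokens when
# trimming chat history; all generation happens on the 405B model via the API below.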
# Initial prompt
initial_prompts = {
    "Default": ["405B", """A chat between a person and the Llama 3.1 405B base model.

"""],
}

# ChatML template
chatml_template = """{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"""
chat_template = """{% for message in messages %}{{'<' + message['role'] + '>: ' + message['content'] + '\n'}}{% endfor %}"""

def format_chat(messages, use_chatml=False):
    if use_chatml:
        template = jinja2.Template(chatml_template)
    else:
        template = jinja2.Template(chat_template)
    formatted = template.render(messages=messages)
    return formatted

def count_tokens(text):
    return len(tokenizer.encode(text))

def limit_history(initial_prompt, history, new_message, max_tokens):
    limited_history = []

    token_count = count_tokens(new_message) + count_tokens(initial_prompt)
    if token_count > max_tokens:
        raise ValueError("message too large for context window")

    for user_msg, assistant_msg in reversed(history):
        # TODO add ChatML wrapping here for better counting?
        user_tokens = count_tokens(user_msg)
        assistant_tokens = count_tokens(assistant_msg)
        if token_count + user_tokens + assistant_tokens > max_tokens:
            break
        token_count += user_tokens + assistant_tokens
        limited_history.insert(0, (user_msg, assistant_msg))
    return limited_history


def generate_response(message, history, initial_prompt, user_role, assistant_role, use_chatml):
    context_length = 8192
    response_length = 1000
    slop_length = 300  # slop for ChatML encoding etc -- TODO fix this

    # trim history based on token count
    history_tokens = context_length - response_length - slop_length
    limited_history = limit_history(initial_prompt, history, message, max_tokens=history_tokens)
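    # With the defaults above, 8192 - 1000 - 300 = 6892 tokens are left for the
    # initial prompt, the new message, and as many recent history turns as fit.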
    # Prepare the input
    chat_history = [{"role": user_role if i % 2 == 0 else assistant_role, "content": m}
                    for i, m in enumerate([item for sublist in limited_history for item in sublist] + [message])]
    formatted_input = format_chat(chat_history, use_chatml)

    if use_chatml:
        full_prompt = initial_prompt + "\n\n" + formatted_input + f"<|im_start|>{assistant_role}\n"
    else:
        full_prompt = initial_prompt + "\n\n" + formatted_input + f"<{assistant_role}>:"

    print(full_prompt)

    completion = client.completions.create(
        model="meta-llama/Meta-Llama-3.1-405B-FP8",
        prompt=full_prompt,
        temperature=0.7,
        frequency_penalty=0.1,
        max_tokens=response_length,
        stop=[f'<{user_role}>:', f'<{assistant_role}>:'] if not use_chatml else [f'<|im_end|>']
    )

    assistant_response = completion.choices[0].text.strip()
    return assistant_response

with gr.Blocks(theme=gr.themes.Soft()) as iface:
    with gr.Row():
        initial_prompt = gr.Textbox(
            value="A chat between a person and the Llama 3.1 405B base model.",
            label="Initial Prompt",
            lines=3
        )
        with gr.Column():
            user_role = gr.Textbox(value="User", label="User Role")
            assistant_role = gr.Textbox(value="405B", label="Assistant Role")
            use_chatml = gr.Checkbox(label="Use ChatML", value=True)

    chatbot = gr.ChatInterface(
        generate_response,
        title="Chat with 405B",
        additional_inputs=[initial_prompt, user_role, assistant_role, use_chatml],
        concurrency_limit=10,
        chatbot=gr.Chatbot(height=800)
    )

    gr.Markdown("""
This chat interface is powered by the Llama 3.1 405B base model, served by [Hyperbolic](https://hyperbolic.xyz), The Open Access AI Cloud.

Thank you to Hyperbolic for making this base model available!
""")


# Launch the interface
iface.launch(share=True, max_threads=40)
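As a quick reference, here is a minimal sketch of the two prompt formats that format_chat produces. It reuses the templates defined in app.py above (the ChatML template is shown without its generation-prompt branch), and the example messages are invented for illustration:

import jinja2

# Core message loop of the ChatML template from app.py (generation-prompt handling omitted).
chatml_loop = """{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}"""
chat_template = """{% for message in messages %}{{'<' + message['role'] + '>: ' + message['content'] + '\n'}}{% endfor %}"""

# Hypothetical two-turn exchange using the default role names from the UI.
messages = [
    {"role": "User", "content": "Hello!"},
    {"role": "405B", "content": "Hi there."},
]

print(jinja2.Template(chatml_loop).render(messages=messages))
# <|im_start|>User
# Hello!<|im_end|>
# <|im_start|>405B
# Hi there.<|im_end|>

print(jinja2.Template(chat_template).render(messages=messages))
# <User>: Hello!
# <405B>: Hi there.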
requirements.txt
ADDED
@@ -0,0 +1,2 @@
gradio
openai
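app.py also imports jinja2 and transformers, neither of which is pinned here; the missing transformers package in particular could account for the Space's runtime error. For smoke-testing the Hyperbolic endpoint that app.py calls, a minimal sketch, assuming HYPERBOLIC_API_KEY is exported and using an illustrative prompt in the same <role>: format the app builds:

import os
from openai import OpenAI

client = OpenAI(
    base_url="https://api.hyperbolic.xyz/v1",
    api_key=os.environ["HYPERBOLIC_API_KEY"],
)

completion = client.completions.create(
    model="meta-llama/Meta-Llama-3.1-405B-FP8",
    prompt="A chat between a person and the Llama 3.1 405B base model.\n\n<User>: Hello!\n<405B>:",
    max_tokens=64,
    temperature=0.7,
    stop=["<User>:", "<405B>:"],
)
print(completion.choices[0].text.strip())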