import os
from threading import Thread
from typing import Iterator
import gradio as gr # Importing Gradio for creating UI interfaces.
import spaces # Import for using Hugging Face Spaces functionalities.
import torch # PyTorch library for deep learning applications.
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer # Import necessary components from Hugging Face's Transformers.
# Constants for maximum token lengths and defaults.
MAX_MAX_NEW_TOKENS = 2048
DEFAULT_MAX_NEW_TOKENS = 1024
MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
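# MAX_INPUT_TOKEN_LENGTH can be overridden through the environment before launch,
# for example (assuming this file is run as the Space's app.py):
#   MAX_INPUT_TOKEN_LENGTH=8192 python app.py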
# Initial description for the UI interface, showcasing the AI version and creator.
DESCRIPTION = """\
# Masher AI v6 7B
This Space demonstrates Masher AI v6 7B by Maheswar.
"""
# Check for GPU availability, append a warning to the description if running on CPU.
if not torch.cuda.is_available():
    DESCRIPTION += "\n<p>Running on CPU! This demo does not work on CPU.</p>"
# If a GPU is available, load the model and tokenizer with specific configurations.
if torch.cuda.is_available():
    model_id = "mahiatlinux/MasherAI-7B-v6.1-redo-eval"
    model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", load_in_4bit=True)
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    tokenizer.use_default_system_prompt = False
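# Note: load_in_4bit=True relies on the bitsandbytes package being installed in the
# Space; more recent transformers releases prefer passing a BitsAndBytesConfig via
# quantization_config rather than the bare load_in_4bit flag.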
# Define a function decorated to use GPU and enable queue for processing the generation tasks.
@spaces.GPU(enable_queue=True)
def generate(
    message: str,
    chat_history: list[tuple[str, str]],
    system_prompt: str,
    max_new_tokens: int = 1024,
    temperature: float = 0.1,
    top_p: float = 0.9,
    top_k: int = 50,
    repetition_penalty: float = 1.2,
) -> Iterator[str]:
    # Prepare the conversation history for processing.
    conversation = []
    # Adding system prompt.
    # conversation.append({"from": "human", "value": system_prompt})
    # Extend the conversation history with user and assistant interactions.
    for user, assistant in chat_history:
        conversation.extend([{"from": "human", "value": user}, {"from": "gpt", "value": assistant}])
    # Add the latest message from the user to the conversation.
    conversation.append({"from": "human", "value": message})
    # Tokenize and prepare the input, trimming anything beyond the token limit.
    input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt", add_generation_prompt=True)
    if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
        input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
        gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
    input_ids = input_ids.to(model.device)
    # Setup for asynchronous text generation.
    streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        {"input_ids": input_ids},
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        top_p=top_p,
        top_k=top_k,
        temperature=temperature,
        num_beams=1,
        repetition_penalty=repetition_penalty,
    )
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()
    # Collect and yield generated outputs as they become available.
    outputs = []
    for text in streamer:
        outputs.append(text)
        yield "".join(outputs)
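# Illustrative usage only (not part of the Space): the streaming generator can be
# consumed directly in Python, assuming the GPU branch above has defined `model`:
#
#   for partial in generate("Hello!", chat_history=[], system_prompt=""):
#       print(partial)
#
# Each yielded string is the full response accumulated so far, not just the newest chunk.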
# Set up the Gradio chat interface, including additional controls for the generation parameters.
chat_interface = gr.ChatInterface(
    fn=generate,
    fill_height=True,
    additional_inputs=[
        gr.Textbox(label="System prompt", lines=6),
        gr.Slider(
            label="Max new tokens",
            minimum=1,
            maximum=MAX_MAX_NEW_TOKENS,
            step=1,
            value=DEFAULT_MAX_NEW_TOKENS,
        ),
        gr.Slider(
            label="Temperature",
            minimum=0.1,
            maximum=4.0,
            step=0.1,
            value=0.6,
        ),
        gr.Slider(
            label="Top-p (nucleus sampling)",
            minimum=0.05,
            maximum=1.0,
            step=0.05,
            value=0.9,
        ),
        gr.Slider(
            label="Top-k",
            minimum=1,
            maximum=1000,
            step=1,
            value=50,
        ),
        gr.Slider(
            label="Repetition penalty",
            minimum=1.0,
            maximum=2.0,
            step=0.05,
            value=1.2,
        ),
    ],
    stop_btn=None,
    examples=[
        # Examples to assist users in starting conversations with the AI.
    ],
)
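# The additional_inputs above are passed to generate() positionally after
# (message, chat_history), so their order must match the function signature.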
# Standalone chatbot component; not passed to the ChatInterface above.
chatbot = gr.Chatbot(height=450, label='Gradio ChatInterface')
# Set up and launch the Gradio demo with the Blocks API.
with gr.Blocks(css="style.css", fill_height=True) as demo:
    gr.Markdown(DESCRIPTION)
    chat_interface.render()
# Main entry point to start the web application if this script is run directly.
if __name__ == "__main__":
    demo.queue(max_size=20).launch()