esbatmop committed
Commit d0b32b9 · verified · 1 Parent(s): 4ebb548

Update app.py

Files changed (1)
  1. app.py +173 -259
app.py CHANGED
@@ -1,271 +1,185 @@
- from pathlib import Path
- from shutil import rmtree
- from typing import Union, List, Dict, Tuple, Optional
- from tqdm import tqdm
-
- import requests
- import gradio as gr
  from llama_cpp import Llama


- # ================== ANNOTATIONS ========================
-
- CHAT_HISTORY = List[Optional[Dict[str, Optional[str]]]]
- MODEL_DICT = Dict[str, Llama]
-
-
- # ================== FUNCS =============================
-
- def download_file(file_url: str, file_path: Union[str, Path]) -> None:
-     response = requests.get(file_url, stream=True)
-     if response.status_code != 200:
-         raise Exception(f'File is not available for download at this link: {file_url}')
-     total_size = int(response.headers.get('content-length', 0))
-     progress_tqdm = tqdm(desc='Loading GGUF file', total=total_size, unit='iB', unit_scale=True)
-     progress_gradio = gr.Progress()
-     completed_size = 0
-     with open(file_path, 'wb') as file:
-         for data in response.iter_content(chunk_size=4096):
-             size = file.write(data)
-             progress_tqdm.update(size)
-             completed_size += size
-             desc = f'Loading GGUF file, {completed_size/1024**3:.3f}/{total_size/1024**3:.3f} GB'
-             progress_gradio(completed_size/total_size, desc=desc)
-
-
- def download_gguf_and_init_model(gguf_url: str, model_dict: MODEL_DICT) -> Tuple[MODEL_DICT, bool, str]:
-     log = ''
-     if not gguf_url.endswith('.gguf'):
-         log += f'The link must be a direct link to the GGUF file\n'
-         return model_dict, log
-
-     gguf_filename = gguf_url.rsplit('/')[-1]
-     model_path = MODELS_PATH / gguf_filename
-     progress = gr.Progress()
-
-     if not model_path.is_file():
-         progress(0.3, desc='Step 1/2: Loading GGUF model file')
-         try:
-             download_file(gguf_url, model_path)
-             log += f'Model file {gguf_filename} successfully loaded\n'
-         except Exception as ex:
-             log += f'Error loading model from link {gguf_url}, error code:\n{ex}\n'
-             curr_model = model_dict.get('model')
-             if curr_model is None:
-                 log += f'Model is missing from dictionary "model_dict"\n'
-                 return model_dict, log
-             curr_model_filename = Path(curr_model.model_path).name
-             log += f'Current initialized model: {curr_model_filename}\n'
-             return model_dict, log
-     else:
-         log += f'Model file {gguf_filename} loaded, initializing model...\n'
-
-     progress(0.7, desc='Step 2/2: Model initialization')
-     model = Llama(model_path=str(model_path), n_gpu_layers=-1, verbose=True)
-     model_dict = {'model': model}
-     support_system_role = 'System role not supported' not in model.metadata['tokenizer.chat_template']
-     log += f'Model {gguf_filename} initialized\n'
-     return model_dict, support_system_role, log
-
-
- def user_message_to_chatbot(user_message: str, chatbot: CHAT_HISTORY) -> Tuple[str, CHAT_HISTORY]:
-     if user_message:
-         chatbot.append({'role': 'user', 'metadata': {'title': None}, 'content': user_message})
-     return '', chatbot
-
-
- def bot_response_to_chatbot(
-         chatbot: CHAT_HISTORY,
-         model_dict: MODEL_DICT,
-         system_prompt: str,
-         support_system_role: bool,
-         history_len: int,
-         do_sample: bool,
-         *generate_args,
- ):
-
-     model = model_dict.get('model')
-     if model is None:
-         gr.Info('Model not initialized')
-         yield chatbot
-         return
-
-     if len(chatbot) == 0 or chatbot[-1]['role'] == 'assistant':
-         yield chatbot
-         return
-
-     messages = []
-     if support_system_role and system_prompt:
-         messages.append({'role': 'system', 'metadata': {'title': None}, 'content': system_prompt})
-
-     if history_len != 0:
-         messages.extend(chatbot[:-1][-(history_len*2):])
-
-     messages.append(chatbot[-1])

-     gen_kwargs = dict(zip(GENERATE_KWARGS.keys(), generate_args))
-     gen_kwargs['top_k'] = int(gen_kwargs['top_k'])
-     if not do_sample:
-         gen_kwargs['top_p'] = 0.0
-         gen_kwargs['top_k'] = 1
-         gen_kwargs['repeat_penalty'] = 1.0

-     stream_response = model.create_chat_completion(
-         messages=messages,
-         stream=True,
-         **gen_kwargs,
      )

-     chatbot.append({'role': 'assistant', 'metadata': {'title': None}, 'content': ''})
-     for chunk in stream_response:
-         token = chunk['choices'][0]['delta'].get('content')
-         if token is not None:
-             chatbot[-1]['content'] += token
-             yield chatbot
-

- def get_system_prompt_component(interactive: bool) -> gr.Textbox:
-     value = '' if interactive else 'System prompt is not supported by this model'
-     return gr.Textbox(value=value, label='System prompt', interactive=interactive)
-
-
- def get_generate_args(do_sample: bool) -> List[gr.component]:
-     generate_args = [
-         gr.Slider(minimum=0.1, maximum=3, value=GENERATE_KWARGS['temperature'], step=0.1, label='temperature', visible=do_sample),
-         gr.Slider(minimum=0, maximum=1, value=GENERATE_KWARGS['top_p'], step=0.01, label='top_p', visible=do_sample),
-         gr.Slider(minimum=1, maximum=50, value=GENERATE_KWARGS['top_k'], step=1, label='top_k', visible=do_sample),
-         gr.Slider(minimum=1, maximum=5, value=GENERATE_KWARGS['repeat_penalty'], step=0.1, label='repeat_penalty', visible=do_sample),
-     ]
-     return generate_args
-
-
- # ================== VARIABLES =============================
-
- MODELS_PATH = Path('models')
- MODELS_PATH.mkdir(exist_ok=True)
- DEFAULT_GGUF_URL = 'https://huggingface.co/bartowski/gemma-2-2b-it-GGUF/resolve/main/gemma-2-2b-it-Q8_0.gguf'
-
- start_model_dict, start_support_system_role, start_load_log = download_gguf_and_init_model(
-     gguf_url=DEFAULT_GGUF_URL, model_dict={},
  )
-
- GENERATE_KWARGS = dict(
-     temperature=0.2,
-     top_p=0.95,
-     top_k=40,
-     repeat_penalty=1.0,
  )
-
- theme = gr.themes.Base(primary_hue='green', secondary_hue='yellow', neutral_hue='zinc').set(
-     loader_color='rgb(0, 255, 0)',
-     slider_color='rgb(0, 200, 0)',
-     body_text_color_dark='rgb(0, 200, 0)',
-     button_secondary_background_fill_dark='green',
- )
- css = '''.gradio-container {width: 60% !important}'''
-
-
- # ================== INTERFACE =============================
-
- with gr.Blocks(theme=theme, css=css) as interface:
-     model_dict = gr.State(start_model_dict)
-     support_system_role = gr.State(start_support_system_role)

-     # ================= CHAT BOT PAGE ======================
-     with gr.Tab('Chatbot'):
-         with gr.Row():
-             with gr.Column(scale=3):
-                 chatbot = gr.Chatbot(
-                     type='messages',  # new in gradio 5+
-                     show_copy_button=True,
-                     bubble_full_width=False,
-                     height=480,
-                 )
-                 user_message = gr.Textbox(label='User')
-
-                 with gr.Row():
-                     user_message_btn = gr.Button('Send')
-                     stop_btn = gr.Button('Stop')
-                     clear_btn = gr.Button('Clear')
-
-                 system_prompt = get_system_prompt_component(interactive=support_system_role.value)
-
-             with gr.Column(scale=1, min_width=80):
-                 with gr.Group():
-                     gr.Markdown('Length of message history')
-                     history_len = gr.Slider(
-                         minimum=0,
-                         maximum=10,
-                         value=0,
-                         step=1,
-                         info='Number of previous messages taken into account in history',
-                         label='history_len',
-                         show_label=False,
-                     )
-
-                 with gr.Group():
-                     gr.Markdown('Generation parameters')
-                     do_sample = gr.Checkbox(
-                         value=False,
-                         label='do_sample',
-                         info='Activate random sampling',
-                     )
-                     generate_args = get_generate_args(do_sample.value)
-                     do_sample.change(
-                         fn=get_generate_args,
-                         inputs=do_sample,
-                         outputs=generate_args,
-                         show_progress=False,
-                     )
-
-         generate_event = gr.on(
-             triggers=[user_message.submit, user_message_btn.click],
-             fn=user_message_to_chatbot,
-             inputs=[user_message, chatbot],
-             outputs=[user_message, chatbot],
-         ).then(
-             fn=bot_response_to_chatbot,
-             inputs=[chatbot, model_dict, system_prompt, support_system_role, history_len, do_sample, *generate_args],
-             outputs=[chatbot],
-         )
-         stop_btn.click(
-             fn=None,
-             inputs=None,
-             outputs=None,
-             cancels=generate_event,
-         )
-         clear_btn.click(
-             fn=lambda: None,
-             inputs=None,
-             outputs=[chatbot],
-         )
-
-     # ================= LOAD MODELS PAGE ======================
-     with gr.Tab('Load model'):
-         gguf_url = gr.Textbox(
-             value='',
-             label='Link to GGUF',
-             placeholder='URL link to the model in GGUF format',
-         )
-         load_model_btn = gr.Button('Downloading GGUF and initializing the model')
-         load_log = gr.Textbox(
-             value=start_load_log,
-             label='Model loading status',
-             lines=3,
-         )
-
-         load_model_btn.click(
-             fn=download_gguf_and_init_model,
-             inputs=[gguf_url, model_dict],
-             outputs=[model_dict, support_system_role, load_log],
-         ).success(
-             fn=get_system_prompt_component,
-             inputs=[support_system_role],
-             outputs=[system_prompt],
-         )

-     gr.HTML("""<h3 style='text-align: center'>
-     <a href="https://github.com/sergey21000/gradio-llamacpp-chatbot" target='_blank'>GitHub Repository</a></h3>
-     """)
-
- interface.launch(server_name='0.0.0.0', server_port=7860)
 
+ # c2-standard-8 spot 9ct/h
+ # sudo apt-get install git git-lfs pip cmake podman
+ # git lfs install
+ #conda
+ # wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
+ # bash Miniconda3-latest-Linux-x86_64.sh
+ # conda create --name dev python=3.10
+ # conda activate dev
+ # conda create --name dev4 python=3.10
+
+ ##########
+ # git clone https://huggingface.co/spaces/TobDeBer/Qwen-2-llamacpp
+ # pip install llama-cpp-python --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
+ # pip install huggingface_hub scikit-build-core llama-cpp-agent
+ #
+ import llama_cpp
+ import os
+ import json
+ import subprocess
  from llama_cpp import Llama
+ from llama_cpp_agent import LlamaCppAgent, MessagesFormatterType
+ from llama_cpp_agent.providers import LlamaCppPythonProvider
+ from llama_cpp_agent.chat_history import BasicChatHistory
+ from llama_cpp_agent.chat_history.messages import Roles
+ import gradio as gr
+ from huggingface_hub import hf_hub_download

+ huggingface_token = os.getenv("HUGGINGFACE_TOKEN")

+ hf_hub_download(
+     repo_id="Qwen/Qwen2-0.5B-Instruct-GGUF",
+     filename="qwen2-0_5b-instruct-q4_k_m.gguf",
+     local_dir="./models"
+ )

+ hf_hub_download(
+     repo_id="TobDeBer/gpt2-Q4_K_M-GGUF",
+     filename="gpt2-q4_k_m.gguf",
+     local_dir="./models"
+ )

+ hf_hub_download(
+     repo_id="TobDeBer/Meta-Llama-3.1-8B-Instruct-Q4_K_M-GGUF",
+     filename="meta-llama-3.1-8b-instruct-q4_k_m.gguf",
+     local_dir="./models",
+     token=huggingface_token
+ )
+ # 5GB
+
+
+ # RichardErkhov/ibm-granite_-_granite-7b-base-gguf
+ # granite-7b-base.Q4_K_M.gguf
+ # 4GB
+
+ # TobDeBer/granite-8b-code-instruct-128k-Q4_K_M-GGUF
+ # granite-8b-code-instruct-128k-q4_k_m.gguf
+ # 5GB
+
+ llm = None
+ llm_model = None
+
+ def respond(
+     message,
+     history: list[tuple[str, str]],
+     model,
+     system_message,
+     max_tokens,
+     temperature,
+     top_p,
+     top_k,
+     repeat_penalty,
+ ):
+     chat_template = MessagesFormatterType.GEMMA_2
+
+     global llm
+     global llm_model
+
+     if llm is None or llm_model != model:
+         llm = Llama(
+             model_path=f"models/{model}",
+             flash_attn=True,
+             n_gpu_layers=81,
+             n_batch=1024,
+             n_ctx=8192,
          )
+         llm_model = model

+     provider = LlamaCppPythonProvider(llm)

+     agent = LlamaCppAgent(
+         provider,
+         system_prompt=f"{system_message}",
+         predefined_messages_formatter_type=chat_template,
+         debug_output=True
      )
+
+     settings = provider.get_provider_default_settings()
+     settings.temperature = temperature
+     settings.top_k = top_k
+     settings.top_p = top_p
+     settings.max_tokens = max_tokens
+     settings.repeat_penalty = repeat_penalty
+     settings.stream = True
+
+     messages = BasicChatHistory()
+
+     for msn in history:
+         user = {
+             'role': Roles.user,
+             'content': msn[0]
+         }
+         assistant = {
+             'role': Roles.assistant,
+             'content': msn[1]
+         }
+         messages.add_message(user)
+         messages.add_message(assistant)
+
+     stream = agent.get_chat_response(
+         message,
+         llm_sampling_settings=settings,
+         chat_history=messages,
+         returns_streaming_generator=True,
+         print_output=False
      )

+     outputs = ""
+     for output in stream:
+         outputs += output
+         yield outputs
+
+ description = """<p align="center">Defaults to Qwen 500M<br>
+ More models in Advanced Section <br></p>
+ """
+
+ demo = gr.ChatInterface(
+     respond,
+     additional_inputs=[
+         gr.Dropdown([
+                 'qwen2-0_5b-instruct-q4_k_m.gguf',
+                 'gpt2-q4_k_m.gguf',
+                 'meta-llama-3.1-8b-instruct-q4_k_m.gguf',
+             ],
+             value="qwen2-0_5b-instruct-q4_k_m.gguf",
+             label="Model"
+         ),
+         gr.Textbox(value="You are a helpful assistant.", label="System message"),
+         gr.Slider(minimum=1, maximum=4096, value=2048, step=1, label="Max tokens"),
+         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
+         gr.Slider(
+             minimum=0.1,
+             maximum=1.0,
+             value=0.95,
+             step=0.05,
+             label="Top-p",
+         ),
+         gr.Slider(
+             minimum=0,
+             maximum=100,
+             value=40,
+             step=1,
+             label="Top-k",
+         ),
+         gr.Slider(
+             minimum=0.0,
+             maximum=2.0,
+             value=1.1,
+             step=0.1,
+             label="Repetition penalty",
+         ),
+     ],
+     #retry_btn="Retry",
+     #undo_btn="Undo",
+     #clear_btn="Clear",
+     #submit_btn="Send",
+     title="Chat with Qwen 2 and friends using llama.cpp",
+     description=description,
+     chatbot=gr.Chatbot(
+         scale=1,
+         show_copy_button=True
+     )
+ )

+ if __name__ == "__main__":
+     demo.launch()
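
For quick local testing of the new streaming path, the sketch below exercises the same llama-cpp-agent flow that respond() wraps, but without the Gradio UI. It is only an illustration, not part of the committed file: it assumes the Qwen2 0.5B GGUF has already been downloaded to ./models by the hf_hub_download call above, and the prompt is an arbitrary example.

# Minimal standalone sketch (assumption: models/qwen2-0_5b-instruct-q4_k_m.gguf exists locally).
from llama_cpp import Llama
from llama_cpp_agent import LlamaCppAgent, MessagesFormatterType
from llama_cpp_agent.providers import LlamaCppPythonProvider
from llama_cpp_agent.chat_history import BasicChatHistory

# Load the model once, as respond() does on the first request.
llm = Llama(model_path="models/qwen2-0_5b-instruct-q4_k_m.gguf", n_ctx=8192)
provider = LlamaCppPythonProvider(llm)

agent = LlamaCppAgent(
    provider,
    system_prompt="You are a helpful assistant.",
    predefined_messages_formatter_type=MessagesFormatterType.GEMMA_2,
)

settings = provider.get_provider_default_settings()
settings.max_tokens = 256
settings.stream = True

# An empty history stands in for the Gradio chat history that respond() converts.
history = BasicChatHistory()

# Stream tokens as they arrive, mirroring the generator loop at the end of respond().
for token in agent.get_chat_response(
    "Say hello in one sentence.",  # example prompt
    llm_sampling_settings=settings,
    chat_history=history,
    returns_streaming_generator=True,
    print_output=False,
):
    print(token, end="", flush=True)
print()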