esbatmop committed
Commit d0b32b9 · verified · 1 Parent(s): 4ebb548

Update app.py

Files changed (1)
  1. app.py +173 -259
app.py CHANGED
@@ -1,271 +1,185 @@
- from pathlib import Path
- from shutil import rmtree
- from typing import Union, List, Dict, Tuple, Optional
- from tqdm import tqdm
-
- import requests
- import gradio as gr
  from llama_cpp import Llama


- # ================== ANNOTATIONS ========================
-
- CHAT_HISTORY = List[Optional[Dict[str, Optional[str]]]]
- MODEL_DICT = Dict[str, Llama]
-
-
- # ================== FUNCS =============================
-
- def download_file(file_url: str, file_path: Union[str, Path]) -> None:
-     response = requests.get(file_url, stream=True)
-     if response.status_code != 200:
-         raise Exception(f'File is not available for download at this link: {file_url}')
-     total_size = int(response.headers.get('content-length', 0))
-     progress_tqdm = tqdm(desc='Loading GGUF file', total=total_size, unit='iB', unit_scale=True)
-     progress_gradio = gr.Progress()
-     completed_size = 0
-     with open(file_path, 'wb') as file:
-         for data in response.iter_content(chunk_size=4096):
-             size = file.write(data)
-             progress_tqdm.update(size)
-             completed_size += size
-             desc = f'Loading GGUF file, {completed_size/1024**3:.3f}/{total_size/1024**3:.3f} GB'
-             progress_gradio(completed_size/total_size, desc=desc)
-
-
- def download_gguf_and_init_model(gguf_url: str, model_dict: MODEL_DICT) -> Tuple[MODEL_DICT, bool, str]:
-     log = ''
-     if not gguf_url.endswith('.gguf'):
-         log += f'The link must be a direct link to the GGUF file\n'
-         return model_dict, log
-
-     gguf_filename = gguf_url.rsplit('/')[-1]
-     model_path = MODELS_PATH / gguf_filename
-     progress = gr.Progress()
-
-     if not model_path.is_file():
-         progress(0.3, desc='Step 1/2: Loading GGUF model file')
-         try:
-             download_file(gguf_url, model_path)
-             log += f'Model file {gguf_filename} successfully loaded\n'
-         except Exception as ex:
-             log += f'Error loading model from link {gguf_url}, error code:\n{ex}\n'
-             curr_model = model_dict.get('model')
-             if curr_model is None:
-                 log += f'Model is missing from dictionary "model_dict"\n'
-                 return model_dict, log
-             curr_model_filename = Path(curr_model.model_path).name
-             log += f'Current initialized model: {curr_model_filename}\n'
-             return model_dict, log
-     else:
-         log += f'Model file {gguf_filename} loaded, initializing model...\n'
-
-     progress(0.7, desc='Step 2/2: Model initialization')
-     model = Llama(model_path=str(model_path), n_gpu_layers=-1, verbose=True)
-     model_dict = {'model': model}
-     support_system_role = 'System role not supported' not in model.metadata['tokenizer.chat_template']
-     log += f'Model {gguf_filename} initialized\n'
-     return model_dict, support_system_role, log
-
-
- def user_message_to_chatbot(user_message: str, chatbot: CHAT_HISTORY) -> Tuple[str, CHAT_HISTORY]:
-     if user_message:
-         chatbot.append({'role': 'user', 'metadata': {'title': None}, 'content': user_message})
-     return '', chatbot
-
-
- def bot_response_to_chatbot(
-         chatbot: CHAT_HISTORY,
-         model_dict: MODEL_DICT,
-         system_prompt: str,
-         support_system_role: bool,
-         history_len: int,
-         do_sample: bool,
-         *generate_args,
- ):
-
-     model = model_dict.get('model')
-     if model is None:
-         gr.Info('Model not initialized')
-         yield chatbot
-         return
-
-     if len(chatbot) == 0 or chatbot[-1]['role'] == 'assistant':
-         yield chatbot
-         return
-
-     messages = []
-     if support_system_role and system_prompt:
-         messages.append({'role': 'system', 'metadata': {'title': None}, 'content': system_prompt})
-
-     if history_len != 0:
-         messages.extend(chatbot[:-1][-(history_len*2):])
-
-     messages.append(chatbot[-1])

-     gen_kwargs = dict(zip(GENERATE_KWARGS.keys(), generate_args))
-     gen_kwargs['top_k'] = int(gen_kwargs['top_k'])
-     if not do_sample:
-         gen_kwargs['top_p'] = 0.0
-         gen_kwargs['top_k'] = 1
-         gen_kwargs['repeat_penalty'] = 1.0

-     stream_response = model.create_chat_completion(
-         messages=messages,
-         stream=True,
-         **gen_kwargs,
      )

-     chatbot.append({'role': 'assistant', 'metadata': {'title': None}, 'content': ''})
-     for chunk in stream_response:
-         token = chunk['choices'][0]['delta'].get('content')
-         if token is not None:
-             chatbot[-1]['content'] += token
-             yield chatbot
-

- def get_system_prompt_component(interactive: bool) -> gr.Textbox:
-     value = '' if interactive else 'System prompt is not supported by this model'
-     return gr.Textbox(value=value, label='System prompt', interactive=interactive)
-
-
- def get_generate_args(do_sample: bool) -> List[gr.component]:
-     generate_args = [
-         gr.Slider(minimum=0.1, maximum=3, value=GENERATE_KWARGS['temperature'], step=0.1, label='temperature', visible=do_sample),
-         gr.Slider(minimum=0, maximum=1, value=GENERATE_KWARGS['top_p'], step=0.01, label='top_p', visible=do_sample),
-         gr.Slider(minimum=1, maximum=50, value=GENERATE_KWARGS['top_k'], step=1, label='top_k', visible=do_sample),
-         gr.Slider(minimum=1, maximum=5, value=GENERATE_KWARGS['repeat_penalty'], step=0.1, label='repeat_penalty', visible=do_sample),
-     ]
-     return generate_args
-
-
- # ================== VARIABLES =============================
-
- MODELS_PATH = Path('models')
- MODELS_PATH.mkdir(exist_ok=True)
- DEFAULT_GGUF_URL = 'https://huggingface.co/bartowski/gemma-2-2b-it-GGUF/resolve/main/gemma-2-2b-it-Q8_0.gguf'
-
- start_model_dict, start_support_system_role, start_load_log = download_gguf_and_init_model(
-     gguf_url=DEFAULT_GGUF_URL, model_dict={},
  )
-
- GENERATE_KWARGS = dict(
-     temperature=0.2,
-     top_p=0.95,
-     top_k=40,
-     repeat_penalty=1.0,
  )
-
- theme = gr.themes.Base(primary_hue='green', secondary_hue='yellow', neutral_hue='zinc').set(
-     loader_color='rgb(0, 255, 0)',
-     slider_color='rgb(0, 200, 0)',
-     body_text_color_dark='rgb(0, 200, 0)',
-     button_secondary_background_fill_dark='green',
- )
- css = '''.gradio-container {width: 60% !important}'''
-
-
- # ================== INTERFACE =============================
-
- with gr.Blocks(theme=theme, css=css) as interface:
-     model_dict = gr.State(start_model_dict)
-     support_system_role = gr.State(start_support_system_role)

-     # ================= CHAT BOT PAGE ======================
-     with gr.Tab('Chatbot'):
-         with gr.Row():
-             with gr.Column(scale=3):
-                 chatbot = gr.Chatbot(
-                     type='messages',  # new in gradio 5+
-                     show_copy_button=True,
-                     bubble_full_width=False,
-                     height=480,
-                 )
-                 user_message = gr.Textbox(label='User')
-
-                 with gr.Row():
-                     user_message_btn = gr.Button('Send')
-                     stop_btn = gr.Button('Stop')
-                     clear_btn = gr.Button('Clear')
-
-                 system_prompt = get_system_prompt_component(interactive=support_system_role.value)
-
-             with gr.Column(scale=1, min_width=80):
-                 with gr.Group():
-                     gr.Markdown('Length of message history')
-                     history_len = gr.Slider(
-                         minimum=0,
-                         maximum=10,
-                         value=0,
-                         step=1,
-                         info='Number of previous messages taken into account in history',
-                         label='history_len',
-                         show_label=False,
-                     )
-
-                 with gr.Group():
-                     gr.Markdown('Generation parameters')
-                     do_sample = gr.Checkbox(
-                         value=False,
-                         label='do_sample',
-                         info='Activate random sampling',
-                     )
-                     generate_args = get_generate_args(do_sample.value)
-                     do_sample.change(
-                         fn=get_generate_args,
-                         inputs=do_sample,
-                         outputs=generate_args,
-                         show_progress=False,
-                     )
-
-         generate_event = gr.on(
-             triggers=[user_message.submit, user_message_btn.click],
-             fn=user_message_to_chatbot,
-             inputs=[user_message, chatbot],
-             outputs=[user_message, chatbot],
-         ).then(
-             fn=bot_response_to_chatbot,
-             inputs=[chatbot, model_dict, system_prompt, support_system_role, history_len, do_sample, *generate_args],
-             outputs=[chatbot],
-         )
-         stop_btn.click(
-             fn=None,
-             inputs=None,
-             outputs=None,
-             cancels=generate_event,
-         )
-         clear_btn.click(
-             fn=lambda: None,
-             inputs=None,
-             outputs=[chatbot],
-         )
-
-     # ================= LOAD MODELS PAGE ======================
-     with gr.Tab('Load model'):
-         gguf_url = gr.Textbox(
-             value='',
-             label='Link to GGUF',
-             placeholder='URL link to the model in GGUF format',
-         )
-         load_model_btn = gr.Button('Downloading GGUF and initializing the model')
-         load_log = gr.Textbox(
-             value=start_load_log,
-             label='Model loading status',
-             lines=3,
-         )
-
-         load_model_btn.click(
-             fn=download_gguf_and_init_model,
-             inputs=[gguf_url, model_dict],
-             outputs=[model_dict, support_system_role, load_log],
-         ).success(
-             fn=get_system_prompt_component,
-             inputs=[support_system_role],
-             outputs=[system_prompt],
-         )

-     gr.HTML("""<h3 style='text-align: center'>
-     <a href="https://github.com/sergey21000/gradio-llamacpp-chatbot" target='_blank'>GitHub Repository</a></h3>
-     """)
-
- interface.launch(server_name='0.0.0.0', server_port=7860)
 
+ # c2-standard-8 spot 9ct/h
+ # sudo apt-get install git git-lfs pip cmake podman
+ # git lfs install
+ #conda
+ # wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
+ # bash Miniconda3-latest-Linux-x86_64.sh
+ # conda create --name dev python=3.10
+ # conda activate dev
+ # conda create --name dev4 python=3.10
+
+ ##########
+ # git clone https://huggingface.co/spaces/TobDeBer/Qwen-2-llamacpp
+ # pip install llama-cpp-python --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
+ # pip install huggingface_hub scikit-build-core llama-cpp-agent
+ #
+ import llama_cpp
+ import os
+ import json
+ import subprocess
  from llama_cpp import Llama
+ from llama_cpp_agent import LlamaCppAgent, MessagesFormatterType
+ from llama_cpp_agent.providers import LlamaCppPythonProvider
+ from llama_cpp_agent.chat_history import BasicChatHistory
+ from llama_cpp_agent.chat_history.messages import Roles
+ import gradio as gr
+ from huggingface_hub import hf_hub_download

+ huggingface_token = os.getenv("HUGGINGFACE_TOKEN")

+ hf_hub_download(
+     repo_id="Qwen/Qwen2-0.5B-Instruct-GGUF",
+     filename="qwen2-0_5b-instruct-q4_k_m.gguf",
+     local_dir="./models"
+ )

+ hf_hub_download(
+     repo_id="TobDeBer/gpt2-Q4_K_M-GGUF",
+     filename="gpt2-q4_k_m.gguf",
+     local_dir="./models"
+ )

+ hf_hub_download(
+     repo_id="TobDeBer/Meta-Llama-3.1-8B-Instruct-Q4_K_M-GGUF",
+     filename="meta-llama-3.1-8b-instruct-q4_k_m.gguf",
+     local_dir="./models",
+     token=huggingface_token
+ )
+ # 5GB
+
+
+ # RichardErkhov/ibm-granite_-_granite-7b-base-gguf
+ # granite-7b-base.Q4_K_M.gguf
+ # 4GB
+
+ # TobDeBer/granite-8b-code-instruct-128k-Q4_K_M-GGUF
+ # granite-8b-code-instruct-128k-q4_k_m.gguf
+ # 5GB
+
+ llm = None
+ llm_model = None
+
+ def respond(
+     message,
+     history: list[tuple[str, str]],
+     model,
+     system_message,
+     max_tokens,
+     temperature,
+     top_p,
+     top_k,
+     repeat_penalty,
+ ):
+     chat_template = MessagesFormatterType.GEMMA_2
+
+     global llm
+     global llm_model
+
+     if llm is None or llm_model != model:
+         llm = Llama(
+             model_path=f"models/{model}",
+             flash_attn=True,
+             n_gpu_layers=81,
+             n_batch=1024,
+             n_ctx=8192,
          )
+         llm_model = model

+     provider = LlamaCppPythonProvider(llm)

+     agent = LlamaCppAgent(
+         provider,
+         system_prompt=f"{system_message}",
+         predefined_messages_formatter_type=chat_template,
+         debug_output=True
      )
+
+     settings = provider.get_provider_default_settings()
+     settings.temperature = temperature
+     settings.top_k = top_k
+     settings.top_p = top_p
+     settings.max_tokens = max_tokens
+     settings.repeat_penalty = repeat_penalty
+     settings.stream = True
+
+     messages = BasicChatHistory()
+
+     for msn in history:
+         user = {
+             'role': Roles.user,
+             'content': msn[0]
+         }
+         assistant = {
+             'role': Roles.assistant,
+             'content': msn[1]
+         }
+         messages.add_message(user)
+         messages.add_message(assistant)
+
+     stream = agent.get_chat_response(
+         message,
+         llm_sampling_settings=settings,
+         chat_history=messages,
+         returns_streaming_generator=True,
+         print_output=False
      )

+     outputs = ""
+     for output in stream:
+         outputs += output
+         yield outputs
+
+ description = """<p align="center">Defaults to Qwen 500M<br>
+ More models in Advanced Section <br></p>
+ """
+
+ demo = gr.ChatInterface(
+     respond,
+     additional_inputs=[
+         gr.Dropdown([
+                 'qwen2-0_5b-instruct-q4_k_m.gguf',
+                 'gpt2-q4_k_m.gguf',
+                 'meta-llama-3.1-8b-instruct-q4_k_m.gguf',
+             ],
+             value="qwen2-0_5b-instruct-q4_k_m.gguf",
+             label="Model"
+         ),
+         gr.Textbox(value="You are a helpful assistant.", label="System message"),
+         gr.Slider(minimum=1, maximum=4096, value=2048, step=1, label="Max tokens"),
+         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
+         gr.Slider(
+             minimum=0.1,
+             maximum=1.0,
+             value=0.95,
+             step=0.05,
+             label="Top-p",
+         ),
+         gr.Slider(
+             minimum=0,
+             maximum=100,
+             value=40,
+             step=1,
+             label="Top-k",
+         ),
+         gr.Slider(
+             minimum=0.0,
+             maximum=2.0,
+             value=1.1,
+             step=0.1,
+             label="Repetition penalty",
+         ),
+     ],
+     #retry_btn="Retry",
+     #undo_btn="Undo",
+     #clear_btn="Clear",
+     #submit_btn="Send",
+     title="Chat with Qwen 2 and friends using llama.cpp",
+     description=description,
+     chatbot=gr.Chatbot(
+         scale=1,
+         show_copy_button=True
+     )
+ )

+ if __name__ == "__main__":
+     demo.launch()
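
For quick local testing of the new streaming path, the sketch below exercises the same llama-cpp-agent flow that respond() wraps, but without the Gradio UI. It is only an illustration, not part of the committed file: it assumes the Qwen2 0.5B GGUF has already been downloaded to ./models by the hf_hub_download call above, and the prompt is an arbitrary example.

# Minimal standalone sketch (assumption: models/qwen2-0_5b-instruct-q4_k_m.gguf exists locally).
from llama_cpp import Llama
from llama_cpp_agent import LlamaCppAgent, MessagesFormatterType
from llama_cpp_agent.providers import LlamaCppPythonProvider
from llama_cpp_agent.chat_history import BasicChatHistory

# Load the model once, as respond() does on the first request.
llm = Llama(model_path="models/qwen2-0_5b-instruct-q4_k_m.gguf", n_ctx=8192)
provider = LlamaCppPythonProvider(llm)

agent = LlamaCppAgent(
    provider,
    system_prompt="You are a helpful assistant.",
    predefined_messages_formatter_type=MessagesFormatterType.GEMMA_2,
)

settings = provider.get_provider_default_settings()
settings.max_tokens = 256
settings.stream = True

# An empty history stands in for the Gradio chat history that respond() converts.
history = BasicChatHistory()

# Stream tokens as they arrive, mirroring the generator loop at the end of respond().
for token in agent.get_chat_response(
    "Say hello in one sentence.",  # example prompt
    llm_sampling_settings=settings,
    chat_history=history,
    returns_streaming_generator=True,
    print_output=False,
):
    print(token, end="", flush=True)
print()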