Spaces:

sebdg
/

unsloth

Paused

App Files Files Community

Sebastien De Greef committed on Jul 15, 2024

Commit

909b9b6

1 Parent(s): 6baccb3

handle push_to_hub_gguf and inference

Browse files

Files changed (1) hide show

app.py +57 -21

app.py CHANGED Viewed

@@ -92,7 +92,20 @@ def load_data(dataset_name, data_template_style, data_template):
     dataset = dataset.map(lambda examples: formatting_prompts_func(examples, data_template), batched=True)
     return f"Data loaded {len(dataset)} records loaded.", gr.update(visible=True, interactive=True), gr.update(visible=True, interactive=True)
 async def train_model(model_name: str, lora_r: int, lora_alpha: int, lora_dropout: float, per_device_train_batch_size: int, warmup_steps: int, max_steps: int,
@@ -143,9 +156,35 @@ async def train_model(model_name: str, lora_r: int, lora_alpha: int, lora_dropou
     trainer.train()
     return "Model training",gr.update(visible=True, interactive=False), gr.update(visible=True, interactive=True), gr.update(interactive=True)
-def save_model():
-    return "Model saved", gr.update(visible=True, interactive=True), gr.update(visible=True, interactive=False), gr.update(interactive=False)
 # Create the Gradio interface
 with gr.Blocks() as demo:
@@ -171,7 +210,7 @@ with gr.Blocks() as demo:
             dataset_name = gr.Textbox(label="Dataset Name", value="yahma/alpaca-cleaned")
             data_template_style = gr.Dropdown(label="Template", choices=["alpaca","custom"], value="alpaca",  allow_custom_value=True)
         with gr.Row():
-            data_tempalte =  gr.TextArea(label="Data Template", value="""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
 ### Instruction:
 {}
@@ -184,7 +223,7 @@ with gr.Blocks() as demo:
         gr.Markdown("---")
         output_load_data = gr.Textbox(label="Data Load Status", value="Data not loaded", interactive=False)
         load_data_btn = gr.Button("Load Dataset", interactive=True)
-        load_data_btn.click(load_data, inputs=[dataset_name, data_template_style, data_tempalte], outputs=[output_load_data, load_data_btn])
     with gr.Tab("Fine-Tuning"):
         gr.Markdown("""### Fine-Tuned Model Parameters""")
@@ -238,18 +277,18 @@ with gr.Blocks() as demo:
             with gr.Column():
                 merge_16bit = gr.Checkbox(label="Merge to 16bit", value=False, interactive=True)
                 merge_4bit = gr.Checkbox(label="Merge to 4bit", value=False, interactive=True)
-            just_lora = gr.Checkbox(label="Just LoRA Adapter", value=False, interactive=True)
         gr.Markdown("---")
         with gr.Row():
             gr.Markdown("### GGUF Options")
             with gr.Column():
-                merge_16bit = gr.Checkbox(label="Quantize to f16", value=False, interactive=True)
-                merge_16bit = gr.Checkbox(label="Quantize to 8bit (Q8_0)", value=False, interactive=True)
-                merge_16bit = gr.Checkbox(label="Quantize to 4bit (q4_k_m)", value=False, interactive=True)
             with gr.Column():
-                merge_custom = gr.Checkbox(label="Custom", value=False, interactive=True)
-                merge_custom_value = gr.Textbox(label="", value="Q5_K", interactive=True)
         gr.Markdown("---")
         with gr.Row():
@@ -258,7 +297,6 @@ with gr.Blocks() as demo:
             with gr.Column():
                 hub_model_name = gr.Textbox(label="Hub Model Name", value=f"username/model_name", interactive=True)
                 hub_token = gr.Textbox(label="Hub Token", interactive=True, type="password")
-                ollama_pub_key = gr.Button("HuggingFace Access Token")
         gr.Markdown("---")
         with gr.Row():
@@ -270,23 +308,21 @@ with gr.Blocks() as demo:
                 ollama_model_name = gr.Textbox(label="Ollama Model Name", value="user/model_name")
                 ollama_pub_key = gr.Button("Ollama Pub Key")
         gr.Markdown("---")
     with gr.Tab("Inference"):
         with gr.Row():
-            gr.Textbox(label="Input Text", lines=4, value="""\
 Continue the fibonnaci sequence.
 # instruction
 1, 1, 2, 3, 5, 8
 # input
 """, interactive=True)
-            gr.Textbox(label="Output Text", lines=4, value="""\
-""", interactive=False)
-        inference_button = gr.Button("Inference", visible=False, interactive=False)
-    # Output
-     # Button click events
     load_btn.click(load_model, inputs=[initial_model_name, load_in_4bit, max_sequence_length], outputs=[output, load_btn, train_btn, initial_model_name, load_in_4bit, max_sequence_length])
 demo.launch()

     dataset = dataset.map(lambda examples: formatting_prompts_func(examples, data_template), batched=True)
     return f"Data loaded {len(dataset)} records loaded.", gr.update(visible=True, interactive=True), gr.update(visible=True, interactive=True)
+def inference(prompt, input_text):
+    FastLanguageModel.for_inference(model) # Enable native 2x faster inference
+    inputs = tokenizer(
+    [
+        prompt.format(
+            "Continue the fibonnaci sequence.", # instruction
+            "1, 1, 2, 3, 5, 8", # input
+            "", # output - leave this blank for generation!
+        )
+    ], return_tensors = "pt").to("cuda")
+    outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
+    result = tokenizer.batch_decode(outputs)
+    return result[0], gr.update(visible=True, interactive=True)
 async def train_model(model_name: str, lora_r: int, lora_alpha: int, lora_dropout: float, per_device_train_batch_size: int, warmup_steps: int, max_steps: int,
     trainer.train()
     return "Model training",gr.update(visible=True, interactive=False), gr.update(visible=True, interactive=True), gr.update(interactive=True)
+def save_model(model_name, hub_model_name, hub_token, gguf_16bit, gguf_8bit, gguf_4bit, gguf_custom, gguf_custom_value, merge_16bit, merge_4bit, just_lora, push_to_hub):
+    global model, tokenizer
+    if gguf_custom:
+        gguf_custom_value = gguf_custom_value
+    else:
+        gguf_custom_value = None
+    if gguf_16bit:
+        gguf = "f16"
+    elif gguf_8bit:
+        gguf = "Q8_0"
+    elif gguf_4bit:
+        gguf = "q4_k_m"
+    else:
+        gguf = None
+    if merge_16bit:
+        merge = "16bit"
+    elif merge_4bit:
+        merge = "4bit"
+    elif just_lora:
+        merge = "lora"
+    else:
+        merge = None
+    #model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "f16", token = "")
+    if push_to_hub:
+        model.push_to_hub_gguf(hub_model_name, tokenizer, quantization_method=gguf, token=hub_token)
+    return "Model saved", gr.update(visible=True, interactive=True)
 # Create the Gradio interface
 with gr.Blocks() as demo:
             dataset_name = gr.Textbox(label="Dataset Name", value="yahma/alpaca-cleaned")
             data_template_style = gr.Dropdown(label="Template", choices=["alpaca","custom"], value="alpaca",  allow_custom_value=True)
         with gr.Row():
+            data_template =  gr.TextArea(label="Data Template", value="""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
 ### Instruction:
 {}
         gr.Markdown("---")
         output_load_data = gr.Textbox(label="Data Load Status", value="Data not loaded", interactive=False)
         load_data_btn = gr.Button("Load Dataset", interactive=True)
+        load_data_btn.click(load_data, inputs=[dataset_name, data_template_style, data_template], outputs=[output_load_data, load_data_btn])
     with gr.Tab("Fine-Tuning"):
         gr.Markdown("""### Fine-Tuned Model Parameters""")
             with gr.Column():
                 merge_16bit = gr.Checkbox(label="Merge to 16bit", value=False, interactive=True)
                 merge_4bit = gr.Checkbox(label="Merge to 4bit", value=False, interactive=True)
+                just_lora = gr.Checkbox(label="Just LoRA Adapter", value=False, interactive=True)
         gr.Markdown("---")
         with gr.Row():
             gr.Markdown("### GGUF Options")
             with gr.Column():
+                gguf_16bit = gr.Checkbox(label="Quantize to f16", value=False, interactive=True)
+                gguf_8bit = gr.Checkbox(label="Quantize to 8bit (Q8_0)", value=False, interactive=True)
+                gguf_4bit = gr.Checkbox(label="Quantize to 4bit (q4_k_m)", value=False, interactive=True)
             with gr.Column():
+                gguf_custom = gr.Checkbox(label="Custom", value=False, interactive=True)
+                gguf_custom_value = gr.Textbox(label="", value="Q5_K", interactive=True)
         gr.Markdown("---")
         with gr.Row():
             with gr.Column():
                 hub_model_name = gr.Textbox(label="Hub Model Name", value=f"username/model_name", interactive=True)
                 hub_token = gr.Textbox(label="Hub Token", interactive=True, type="password")
         gr.Markdown("---")
         with gr.Row():
                 ollama_model_name = gr.Textbox(label="Ollama Model Name", value="user/model_name")
                 ollama_pub_key = gr.Button("Ollama Pub Key")
         gr.Markdown("---")
+        save_button = gr.Button("Save Model", visible=True, interactive=True)
+        save_button.click(save_model, inputs=[model_name, hub_model_name, hub_token, gguf_16bit, gguf_8bit, gguf_4bit, gguf_custom, gguf_custom_value, merge_16bit, merge_4bit, just_lora, push_to_hub], outputs=[save_button])
     with gr.Tab("Inference"):
         with gr.Row():
+            input_text = gr.Textbox(label="Input Text", lines=4, value="""\
 Continue the fibonnaci sequence.
 # instruction
 1, 1, 2, 3, 5, 8
 # input
 """, interactive=True)
+            output_text = gr.Textbox(label="Output Text", lines=4, value="", interactive=False)
+        inference_button = gr.Button("Inference", visible=True, interactive=True)
+        inference_button.click(inference, inputs=[data_template, input_text], outputs=[output_text, inference_button])
     load_btn.click(load_model, inputs=[initial_model_name, load_in_4bit, max_sequence_length], outputs=[output, load_btn, train_btn, initial_model_name, load_in_4bit, max_sequence_length])
 demo.launch()