Add fine-tuning process configuration to model
Files changed:
- app.py +12 -33
- spanish_medica_llm.py +157 -3
app.py
CHANGED

@@ -10,7 +10,7 @@ import sys
 import torch


-from spanish_medica_llm import run_training, run_training_process
+from spanish_medica_llm import run_training, run_training_process, run_finnetuning_process

 import gradio as gr
@@ -45,41 +45,18 @@ def train_model(*inputs):
     if "IS_SHARED_UI" in os.environ:
         raise gr.Error("This Space only works in duplicated instances")

-
-    # image_captions_filename = True,
-    # train_text_encoder = True,
-    # #stop_text_encoder_training = stptxt,
-    # save_n_steps = 0,
-    # #pretrained_model_name_or_path = model_to_load,
-    # instance_data_dir="instance_images",
-    # #class_data_dir=class_data_dir,
-    # output_dir="output_model",
-    # instance_prompt="",
-    # seed=42,
-    # resolution=512,
-    # mixed_precision="fp16",
-    # train_batch_size=1,
-    # gradient_accumulation_steps=1,
-    # use_8bit_adam=True,
-    # learning_rate=2e-6,
-    # lr_scheduler="polynomial",
-    # lr_warmup_steps = 0,
-    # #max_train_steps=Training_Steps,
-    # )
-    # run_training(args_general)
-    # torch.cuda.empty_cache()
-    # #convert("output_model", "model.ckpt")
-    # #shutil.rmtree('instance_images')
-    # #shutil.make_archive("diffusers_model", 'zip', "output_model")
-    # #with zipfile.ZipFile('diffusers_model.zip', 'w', zipfile.ZIP_DEFLATED) as zipf:
-    # #    zipdir('output_model/', zipf)
-    # torch.cuda.empty_cache()
-    # return [gr.update(visible=True, value=["diffusers_model.zip"]), gr.update(visible=True), gr.update(visible=True), gr.update(visible=True)]
-    run_training_process()
-
+    run_training_process()

     return f"Train Model Successful!!!"

+def finnetuning_model(*inputs):
+    if "IS_SHARED_UI" in os.environ:
+        raise gr.Error("This Space only works in duplicated instances")
+
+    run_finnetuning_process()
+
+    return f"Fine-tuning Model Successful!!!"
+
 def stop_model(*input):
     return f"Model with Gradio!"
@@ -93,6 +70,8 @@ with gr.Blocks() as demo:
     btn_response.click(fn=generate_model, inputs=inp, outputs=out)
     btn_train = gr.Button("Train Model")
     btn_train.click(fn=train_model, inputs=[], outputs=out)
+    btn_finnetuning = gr.Button("Fine-tuning Model")
+    btn_finnetuning.click(fn=finnetuning_model, inputs=[], outputs=out)
     btn_evaluate = gr.Button("Evaluate Model")
     btn_evaluate.click(fn=evaluate_model, inputs=[], outputs=out)
     btn_stop = gr.Button("Stop Model")
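For readers who want to try the new button path in isolation, here is a minimal, self-contained sketch. The stub stands in for run_finnetuning_process, which the real app imports from spanish_medica_llm (see the diff below); everything else mirrors the lines added above.

import gradio as gr

def run_finnetuning_process():
    # Stub standing in for the real fine-tuning entry point imported
    # from spanish_medica_llm in the actual Space.
    print("fine-tuning started")

def finnetuning_model(*inputs):
    run_finnetuning_process()
    return "Fine-tuning Model Successful!!!"

with gr.Blocks() as demo:
    out = gr.Textbox(label="status")
    btn_finnetuning = gr.Button("Fine-tuning Model")
    btn_finnetuning.click(fn=finnetuning_model, inputs=[], outputs=out)

demo.launch()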
spanish_medica_llm.py
CHANGED

@@ -331,6 +331,45 @@ MAX_TRAINING_STEPS = 2

 TOKEN_NAME = TOKEN_MISTRAL_NAME

+def get_chat_format(element):
+    """
+    Processes a single dataset sample to structure it for chatbot training.
+
+    This function transforms the sample into a format suitable for training,
+    where each message is categorized by its role in the conversation
+    (system, user, assistant). It initializes the conversation with a system
+    message, follows with the user's instruction built from the clinical case,
+    and finally adds the assistant's output (the diagnosis).
+
+    Parameters
+    ----------
+    element : dict
+        A dictionary representing a single sample from the dataset. It must
+        contain the keys used for the input ('raw_text') and output ('topic')
+        components of the conversation.
+
+    Returns
+    -------
+    dict
+        A modified dictionary whose 'raw_text' key contains a list of ordered
+        messages, each annotated with its role in the conversation.
+    """
+
+    prompt_template = """A partir del caso clínico que se expone a continuación, tu tarea es la siguiente.
+Como médico experto, tu tarea es la de diagnosticar al paciente en base al caso clínico. Responde únicamente con el diagnóstico para el paciente de forma concisa.
+Caso clínico: {caso_clinico}
+"""
+    # how to use it with an LLM:
+
+    system_prompt = "Eres un experto en medicina que realiza diagnósticos en base a casos clínicos."
+
+    messages = [
+        {"role": "system", "content": system_prompt},
+        {"role": "user", "content": prompt_template.format(caso_clinico=element["raw_text"])},
+        {"role": "assistant", "content": element["topic"]},
+    ]
+
+    element["raw_text"] = messages
+    return element
+
 def loadSpanishTokenizer():
     """
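To make the transformation concrete, here is what get_chat_format does to one element. The sample values are invented, and the function is assumed to be in scope as defined above.

# Invented sample; real elements come from SPANISH_MEDICA_LLM_DATASET.
element = {
    "raw_text": "Paciente de 45 años con fiebre y tos persistente...",
    "topic": "Neumonía adquirida en la comunidad",
}

formatted = get_chat_format(element)
# element["raw_text"] has been replaced by a three-message conversation:
for message in formatted["raw_text"]:
    print(message["role"])  # system, user, assistant
print(formatted["raw_text"][2]["content"])  # the diagnosis label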
@@ -379,12 +418,32 @@ def splitDatasetInTestValid(dataset):
     return (dataset['train'], eval_dataset, test_dataset)

 def loadSpanishDataset():
+
     spanishMedicaLllmDataset = load_dataset(SPANISH_MEDICA_LLM_DATASET, split="train")
     spanishMedicaLllmDataset = spanishMedicaLllmDataset.filter(lambda example: example["topic_type"] not in FILTER_CRITERIA)
     spanishMedicaLllmDataset = spanishMedicaLllmDataset.train_test_split(0.2, seed=203984)
     return spanishMedicaLllmDataset

+def loadSpanishDatasetFinnetuning():
+
+    spanishMedicaLllmDataset = load_dataset(SPANISH_MEDICA_LLM_DATASET, split="train")
+    spanishMedicaLllmDataset = spanishMedicaLllmDataset.filter(lambda example: example["topic_type"] in FILTER_CRITERIA)
+    return spanishMedicaLllmDataset
+
 ## See the Jupyter Notebook for changing the CONTEXT_LENGTH size
+def applyChatInstructFormat(dataset, filterColumns = ['raw_text', 'topic']):
+    """
+    Apply the instruction chat template to every sample in the dataset.
+    """
+    if dataset is None:
+        return dataset
+    else:
+        dataset = dataset.remove_columns([col for col in dataset.features if col not in filterColumns])
+        return dataset.map(
+            get_chat_format,
+            batched=False,
+            num_proc=4
+        )

 def accelerateConfigModel():
     """
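A quick sketch of the new loading path on a toy dataset. The rows and topic_type codes are invented, and get_chat_format / applyChatInstructFormat are assumed in scope from the diff above.

from datasets import Dataset

toy = Dataset.from_dict({
    "raw_text": ["Caso clínico 1...", "Caso clínico 2...",
                 "Caso clínico 3...", "Caso clínico 4..."],
    "topic": ["Diagnóstico 1", "Diagnóstico 2", "Diagnóstico 3", "Diagnóstico 4"],
    "topic_type": [0, 1, 2, 3],  # invented codes; FILTER_CRITERIA decides which rows are fine-tuning data
})

# applyChatInstructFormat drops every column except raw_text/topic and maps
# get_chat_format over the rows, so each raw_text becomes a message list.
formatted = applyChatInstructFormat(toy)
print(formatted[0]["raw_text"])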
@@ -483,6 +542,26 @@ def modelLoraConfigBioMistral(model):
     model = accelerator.prepare_model(model)
     return (model)

+def getLoraConfiguration():
+    """
+    Return the LoRA configuration used for fine-tuning.
+    """
+    return LoraConfig(
+        r=8,
+        lora_alpha=16,
+        target_modules=[
+            "q_proj",
+            "k_proj",
+            "v_proj",
+            "o_proj",
+            "gate_proj",
+            "up_proj",
+            "down_proj",
+            "lm_head",
+        ],
+        bias="none",
+        lora_dropout=0.05,  # conventional
+        task_type="CAUSAL_LM",
+    )
+
 # A note on training. You can set the max_steps to be high initially, and examine at what step your
 # model's performance starts to degrade. There is where you'll find a sweet spot for how many steps
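To see what this configuration actually trains, here is a sketch that attaches the adapters with peft and prints the trainable-parameter count. The base checkpoint is illustrative (the Space loads its models through loadBaseModel), and getLoraConfiguration is assumed in scope.

from transformers import AutoModelForCausalLM
from peft import get_peft_model

# Illustrative checkpoint; any causal LM with the listed projection modules works.
base = AutoModelForCausalLM.from_pretrained("BioMistral/BioMistral-7B")

lora_model = get_peft_model(base, getLoraConfiguration())
# Only the rank-8 adapters on q/k/v/o, the MLP projections, and lm_head train;
# the printout shows the resulting small trainable fraction.
lora_model.print_trainable_parameters()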
@@ -541,10 +620,85 @@ def configAndRunTraining(basemodel, dataset, eval_dataset, tokenizer):

     trainer.push_to_hub()

+def configAndRunFineTuning(basemodel, dataset, eval_dataset, tokenizer):
+    if basemodel is None or dataset is None or tokenizer is None:
+        return None
+    else:
+        tokenizer.pad_token = tokenizer.eos_token
+
+        training_args = TrainingArguments(
+            output_dir = output_dir,
+            push_to_hub = True,
+            hub_private_repo = False,
+            hub_model_id = HUB_MODEL_ID,
+            warmup_steps = 5,
+            per_device_train_batch_size = MICRO_BATCH_SIZE,
+            per_device_eval_batch_size = 1,
+            #gradient_checkpointing=True,
+            gradient_accumulation_steps = GRADIENT_ACCUMULATION_STEPS,
+            num_train_epochs = 1,
+            learning_rate = 2.5e-5,        # want about 10x smaller than the Mistral learning rate
+            logging_steps = 5,
+            optim = "paged_adamw_8bit",
+            logging_dir = "./logs",        # directory for storing logs
+            save_strategy = "steps",       # save the model checkpoint every logging step
+            save_steps = 50,               # save checkpoints every 50 steps
+            evaluation_strategy = "steps", # evaluate the model every logging step
+            eval_steps = 50,               # evaluate and save checkpoints every 50 steps
+            do_eval = True,                # perform evaluation at the end of training
+            save_total_limit = 2,
+            remove_unused_columns = True,
+            report_to = None,              # comment this out if you don't want to use Weights & Biases
+            run_name = f"{run_name}-{datetime.now().strftime('%Y-%m-%d-%H-%M')}",  # name of the W&B run (optional)
+            fp16 = True,   # set for a T4 GPU; on a more powerful GPU such as an A100, set this to False and use bf16
+            bf16 = False
+        )
+
+        trainer = SFTTrainer(
+            model = basemodel,
+            train_dataset = dataset,
+            eval_dataset = eval_dataset,
+            peft_config = getLoraConfiguration(),
+            dataset_text_field = "raw_text",
+            max_seq_length = 1024, #512
+            tokenizer = tokenizer,
+            args = training_args,
+            dataset_kwargs = {
+                "add_special_tokens": False,  # we template with special tokens
+                "append_concat_token": False, # no need to add an additional separator token
+            },
+            packing = True
+        )
+        basemodel.config.use_cache = False  # silence the warnings. Please re-enable for inference!
+        trainer.train()
+
+        trainer.push_to_hub()
+

 def run_training_process():
+    # Log in to Hugging Face
+    login(token = os.environ.get('HG_FACE_TOKEN'))
+    os.environ['WANDB_DISABLED'] = 'true'
+    tokenizer = loadSpanishTokenizer()
+    medicalSpanishDataset = applyChatInstructFormat( loadSpanishDatasetFinnetuning() )
+    medicalSpanishDataset = medicalSpanishDataset.train_test_split(0.2, seed=203984)
+
+    # train_dataset, eval_dataset, test_dataset = splitDatasetInTestValid(
+    #     getTokenizedDataset( medicalSpanishDataset, tokenizer)
+    # )
+
+    train_dataset, eval_dataset, test_dataset = splitDatasetInTestValid( medicalSpanishDataset )
+
+    base_model = loadBaseModel(MISTRAL_BASE_MODEL_ID)
+    base_model = modelLoraConfigBioMistral(base_model)
+
+    configAndRunTraining(base_model, train_dataset, eval_dataset, tokenizer)
+
+def run_finnetuning_process():
     # Log in to Hugging Face
     login(token = os.environ.get('HG_FACE_TOKEN'))
     os.environ['WANDB_DISABLED'] = 'true'
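The dataset_kwargs above disable add_special_tokens because the chat template already inserts the model's special tokens when the message lists are rendered to text. A small sketch of that rendering step; the tokenizer checkpoint is illustrative (small, with a template that accepts a system role), while the Space gets its tokenizer from loadSpanishTokenizer().

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")

messages = [
    {"role": "system", "content": "Eres un experto en medicina que realiza diagnósticos."},
    {"role": "user", "content": "Caso clínico: ..."},
    {"role": "assistant", "content": "Diagnóstico: ..."},
]

# The rendered string already carries the template's special tokens, which is
# why SFTTrainer is told not to add them again.
print(tok.apply_chat_template(messages, tokenize=False))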
@@ -554,7 +708,7 @@ def run_training_process():
         getTokenizedDataset( medicalSpanishDataset, tokenizer)
     )

-    base_model = loadBaseModel(MISTRAL_BASE_MODEL_ID)
-    base_model = modelLoraConfigBioMistral(base_model)
+    base_model = loadBaseModel(HUB_MODEL_ID)

+    configAndRunFineTuning(base_model, train_dataset, eval_dataset, tokenizer)
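Taken together, the commit leaves run_training_process pre-training from MISTRAL_BASE_MODEL_ID and pushing the result to HUB_MODEL_ID, while run_finnetuning_process reloads that checkpoint and instruction-tunes it through configAndRunFineTuning. A closing note on scale: one optimizer step consumes per_device_train_batch_size × gradient_accumulation_steps × number-of-devices packed sequences. MICRO_BATCH_SIZE and GRADIENT_ACCUMULATION_STEPS are constants defined near the top of spanish_medica_llm.py and are not visible in this diff, so the values below are assumptions for illustration.

# Assumed values; the real constants are defined elsewhere in the module.
MICRO_BATCH_SIZE = 1
GRADIENT_ACCUMULATION_STEPS = 4
NUM_DEVICES = 1  # the Space runs on a single GPU

effective_batch = MICRO_BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS * NUM_DEVICES
# With packing=True, each unit is a sequence of up to max_seq_length=1024 tokens.
print(effective_batch)  # 4 packed sequences per optimizer step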