Add fine-tuning process configuration to model
Files changed:
- app.py +12 -33
- spanish_medica_llm.py +157 -3
app.py
CHANGED

@@ -10,7 +10,7 @@ import sys
 import torch


-from spanish_medica_llm import run_training, run_training_process
+from spanish_medica_llm import run_training, run_training_process, run_finnetuning_process

 import gradio as gr
@@ -45,41 +45,18 @@ def train_model(*inputs):
     if "IS_SHARED_UI" in os.environ:
         raise gr.Error("This Space only works in duplicated instances")

-
-    # image_captions_filename = True,
-    # train_text_encoder = True,
-    # #stop_text_encoder_training = stptxt,
-    # save_n_steps = 0,
-    # #pretrained_model_name_or_path = model_to_load,
-    # instance_data_dir="instance_images",
-    # #class_data_dir=class_data_dir,
-    # output_dir="output_model",
-    # instance_prompt="",
-    # seed=42,
-    # resolution=512,
-    # mixed_precision="fp16",
-    # train_batch_size=1,
-    # gradient_accumulation_steps=1,
-    # use_8bit_adam=True,
-    # learning_rate=2e-6,
-    # lr_scheduler="polynomial",
-    # lr_warmup_steps = 0,
-    # #max_train_steps=Training_Steps,
-    # )
-    # run_training(args_general)
-    # torch.cuda.empty_cache()
-    # #convert("output_model", "model.ckpt")
-    # #shutil.rmtree('instance_images')
-    # #shutil.make_archive("diffusers_model", 'zip', "output_model")
-    # #with zipfile.ZipFile('diffusers_model.zip', 'w', zipfile.ZIP_DEFLATED) as zipf:
-    # #    zipdir('output_model/', zipf)
-    # torch.cuda.empty_cache()
-    # return [gr.update(visible=True, value=["diffusers_model.zip"]), gr.update(visible=True), gr.update(visible=True), gr.update(visible=True)]
-    run_training_process()
-
+    run_training_process()

     return f"Train Model Successful!!!"

+def finnetuning_model(*inputs):
+    if "IS_SHARED_UI" in os.environ:
+        raise gr.Error("This Space only works in duplicated instances")
+
+    run_finnetuning_process()
+
+    return f"Fine-tuning Model Successful!!!"
+
 def stop_model(*input):
     return f"Model with Gradio!"
@@ -93,6 +70,8 @@ with gr.Blocks() as demo:
     btn_response.click(fn=generate_model, inputs=inp, outputs=out)
     btn_train = gr.Button("Train Model")
     btn_train.click(fn=train_model, inputs=[], outputs=out)
+    btn_finnetuning = gr.Button("Fine-tuning Model")
+    btn_finnetuning.click(fn=finnetuning_model, inputs=[], outputs=out)
     btn_evaluate = gr.Button("Evaluate Model")
     btn_evaluate.click(fn=evaluate_model, inputs=[], outputs=out)
     btn_stop = gr.Button("Stop Model")
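For readers who want to try the new button path in isolation, here is a minimal, self-contained sketch. The stub stands in for run_finnetuning_process, which the real app imports from spanish_medica_llm (see the diff below); everything else mirrors the lines added above.

import gradio as gr

def run_finnetuning_process():
    # Stub standing in for the real fine-tuning entry point imported
    # from spanish_medica_llm in the actual Space.
    print("fine-tuning started")

def finnetuning_model(*inputs):
    run_finnetuning_process()
    return "Fine-tuning Model Successful!!!"

with gr.Blocks() as demo:
    out = gr.Textbox(label="status")
    btn_finnetuning = gr.Button("Fine-tuning Model")
    btn_finnetuning.click(fn=finnetuning_model, inputs=[], outputs=out)

demo.launch()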
spanish_medica_llm.py
CHANGED

@@ -331,6 +331,45 @@ MAX_TRAINING_STEPS = 2

 TOKEN_NAME = TOKEN_MISTRAL_NAME

+def get_chat_format(element):
+    """
+    Processes a single dataset sample to structure it for chatbot training.
+
+    This function transforms the sample into a format suitable for training,
+    where each message is categorized by its role in the conversation
+    (system, user, assistant). It initializes the conversation with a system
+    message, follows with the user's instruction built from the clinical case,
+    and finally adds the assistant's output (the diagnosis).
+
+    Parameters
+    ----------
+    element : dict
+        A dictionary representing a single sample from the dataset. It must
+        contain the keys used for the input ('raw_text') and output ('topic')
+        components of the conversation.
+
+    Returns
+    -------
+    dict
+        A modified dictionary whose 'raw_text' key contains a list of ordered
+        messages, each annotated with its role in the conversation.
+    """
+
+    prompt_template = """A partir del caso clínico que se expone a continuación, tu tarea es la siguiente.
+Como médico experto, tu tarea es la de diagnosticar al paciente en base al caso clínico. Responde únicamente con el diagnóstico para el paciente de forma concisa.
+Caso clínico: {caso_clinico}
+"""
+    # how to use it with an LLM:
+
+    system_prompt = "Eres un experto en medicina que realiza diagnósticos en base a casos clínicos."
+
+    messages = [
+        {"role": "system", "content": system_prompt},
+        {"role": "user", "content": prompt_template.format(caso_clinico=element["raw_text"])},
+        {"role": "assistant", "content": element["topic"]},
+    ]
+
+    element["raw_text"] = messages
+    return element
+
 def loadSpanishTokenizer():
     """
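To make the transformation concrete, here is what get_chat_format does to one element. The sample values are invented, and the function is assumed to be in scope as defined above.

# Invented sample; real elements come from SPANISH_MEDICA_LLM_DATASET.
element = {
    "raw_text": "Paciente de 45 años con fiebre y tos persistente...",
    "topic": "Neumonía adquirida en la comunidad",
}

formatted = get_chat_format(element)
# element["raw_text"] has been replaced by a three-message conversation:
for message in formatted["raw_text"]:
    print(message["role"])  # system, user, assistant
print(formatted["raw_text"][2]["content"])  # the diagnosis label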
@@ -379,12 +418,32 @@ def splitDatasetInTestValid(dataset):
     return (dataset['train'], eval_dataset, test_dataset)

 def loadSpanishDataset():
+
     spanishMedicaLllmDataset = load_dataset(SPANISH_MEDICA_LLM_DATASET, split="train")
     spanishMedicaLllmDataset = spanishMedicaLllmDataset.filter(lambda example: example["topic_type"] not in FILTER_CRITERIA)
     spanishMedicaLllmDataset = spanishMedicaLllmDataset.train_test_split(0.2, seed=203984)
     return spanishMedicaLllmDataset

+def loadSpanishDatasetFinnetuning():
+
+    spanishMedicaLllmDataset = load_dataset(SPANISH_MEDICA_LLM_DATASET, split="train")
+    spanishMedicaLllmDataset = spanishMedicaLllmDataset.filter(lambda example: example["topic_type"] in FILTER_CRITERIA)
+    return spanishMedicaLllmDataset
+
 ## See the Jupyter Notebook for changing the CONTEXT_LENGTH size
+def applyChatInstructFormat(dataset, filterColumns = ['raw_text', 'topic']):
+    """
+    Apply the instruction chat template to every sample in the dataset.
+    """
+    if dataset is None:
+        return dataset
+    else:
+        dataset = dataset.remove_columns([col for col in dataset.features if col not in filterColumns])
+        return dataset.map(
+            get_chat_format,
+            batched=False,
+            num_proc=4
+        )

 def accelerateConfigModel():
     """
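A quick sketch of the new loading path on a toy dataset. The rows and topic_type codes are invented, and get_chat_format / applyChatInstructFormat are assumed in scope from the diff above.

from datasets import Dataset

toy = Dataset.from_dict({
    "raw_text": ["Caso clínico 1...", "Caso clínico 2...",
                 "Caso clínico 3...", "Caso clínico 4..."],
    "topic": ["Diagnóstico 1", "Diagnóstico 2", "Diagnóstico 3", "Diagnóstico 4"],
    "topic_type": [0, 1, 2, 3],  # invented codes; FILTER_CRITERIA decides which rows are fine-tuning data
})

# applyChatInstructFormat drops every column except raw_text/topic and maps
# get_chat_format over the rows, so each raw_text becomes a message list.
formatted = applyChatInstructFormat(toy)
print(formatted[0]["raw_text"])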
@@ -483,6 +542,26 @@ def modelLoraConfigBioMistral(model):
     model = accelerator.prepare_model(model)
     return (model)

+def getLoraConfiguration():
+    """
+    Return the LoRA configuration used for fine-tuning.
+    """
+    return LoraConfig(
+        r=8,
+        lora_alpha=16,
+        target_modules=[
+            "q_proj",
+            "k_proj",
+            "v_proj",
+            "o_proj",
+            "gate_proj",
+            "up_proj",
+            "down_proj",
+            "lm_head",
+        ],
+        bias="none",
+        lora_dropout=0.05,  # conventional
+        task_type="CAUSAL_LM",
+    )
+
 # A note on training. You can set the max_steps to be high initially, and examine at what step your
 # model's performance starts to degrade. There is where you'll find a sweet spot for how many steps
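To see what this configuration actually trains, here is a sketch that attaches the adapters with peft and prints the trainable-parameter count. The base checkpoint is illustrative (the Space loads its models through loadBaseModel), and getLoraConfiguration is assumed in scope.

from transformers import AutoModelForCausalLM
from peft import get_peft_model

# Illustrative checkpoint; any causal LM with the listed projection modules works.
base = AutoModelForCausalLM.from_pretrained("BioMistral/BioMistral-7B")

lora_model = get_peft_model(base, getLoraConfiguration())
# Only the rank-8 adapters on q/k/v/o, the MLP projections, and lm_head train;
# the printout shows the resulting small trainable fraction.
lora_model.print_trainable_parameters()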
@@ -541,10 +620,85 @@ def configAndRunTraining(basemodel, dataset, eval_dataset, tokenizer):

     trainer.push_to_hub()

+def configAndRunFineTuning(basemodel, dataset, eval_dataset, tokenizer):
+    if basemodel is None or dataset is None or tokenizer is None:
+        return None
+    else:
+        tokenizer.pad_token = tokenizer.eos_token
+
+        training_args = TrainingArguments(
+            output_dir = output_dir,
+            push_to_hub = True,
+            hub_private_repo = False,
+            hub_model_id = HUB_MODEL_ID,
+            warmup_steps = 5,
+            per_device_train_batch_size = MICRO_BATCH_SIZE,
+            per_device_eval_batch_size = 1,
+            #gradient_checkpointing=True,
+            gradient_accumulation_steps = GRADIENT_ACCUMULATION_STEPS,
+            num_train_epochs = 1,
+            learning_rate = 2.5e-5,        # want about 10x smaller than the Mistral learning rate
+            logging_steps = 5,
+            optim = "paged_adamw_8bit",
+            logging_dir = "./logs",        # directory for storing logs
+            save_strategy = "steps",       # save the model checkpoint every logging step
+            save_steps = 50,               # save checkpoints every 50 steps
+            evaluation_strategy = "steps", # evaluate the model every logging step
+            eval_steps = 50,               # evaluate and save checkpoints every 50 steps
+            do_eval = True,                # perform evaluation at the end of training
+            save_total_limit = 2,
+            remove_unused_columns = True,
+            report_to = None,              # comment this out if you don't want to use Weights & Biases
+            run_name = f"{run_name}-{datetime.now().strftime('%Y-%m-%d-%H-%M')}",  # name of the W&B run (optional)
+            fp16 = True,   # set for a T4 GPU; on a more powerful GPU such as an A100, set this to False and use bf16
+            bf16 = False
+        )
+
+        trainer = SFTTrainer(
+            model = basemodel,
+            train_dataset = dataset,
+            eval_dataset = eval_dataset,
+            peft_config = getLoraConfiguration(),
+            dataset_text_field = "raw_text",
+            max_seq_length = 1024, #512
+            tokenizer = tokenizer,
+            args = training_args,
+            dataset_kwargs = {
+                "add_special_tokens": False,  # we template with special tokens
+                "append_concat_token": False, # no need to add an additional separator token
+            },
+            packing = True
+        )
+        basemodel.config.use_cache = False  # silence the warnings. Please re-enable for inference!
+        trainer.train()
+
+        trainer.push_to_hub()
+

 def run_training_process():
+    # Log in to Hugging Face
+    login(token = os.environ.get('HG_FACE_TOKEN'))
+    os.environ['WANDB_DISABLED'] = 'true'
+    tokenizer = loadSpanishTokenizer()
+    medicalSpanishDataset = applyChatInstructFormat( loadSpanishDatasetFinnetuning() )
+    medicalSpanishDataset = medicalSpanishDataset.train_test_split(0.2, seed=203984)
+
+    # train_dataset, eval_dataset, test_dataset = splitDatasetInTestValid(
+    #     getTokenizedDataset( medicalSpanishDataset, tokenizer)
+    # )
+
+    train_dataset, eval_dataset, test_dataset = splitDatasetInTestValid( medicalSpanishDataset )
+
+    base_model = loadBaseModel(MISTRAL_BASE_MODEL_ID)
+    base_model = modelLoraConfigBioMistral(base_model)
+
+    configAndRunTraining(base_model, train_dataset, eval_dataset, tokenizer)
+
+def run_finnetuning_process():
     # Log in to Hugging Face
     login(token = os.environ.get('HG_FACE_TOKEN'))
     os.environ['WANDB_DISABLED'] = 'true'
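The dataset_kwargs above disable add_special_tokens because the chat template already inserts the model's special tokens when the message lists are rendered to text. A small sketch of that rendering step; the tokenizer checkpoint is illustrative (small, with a template that accepts a system role), while the Space gets its tokenizer from loadSpanishTokenizer().

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")

messages = [
    {"role": "system", "content": "Eres un experto en medicina que realiza diagnósticos."},
    {"role": "user", "content": "Caso clínico: ..."},
    {"role": "assistant", "content": "Diagnóstico: ..."},
]

# The rendered string already carries the template's special tokens, which is
# why SFTTrainer is told not to add them again.
print(tok.apply_chat_template(messages, tokenize=False))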
@@ -554,7 +708,7 @@ def run_training_process():
         getTokenizedDataset( medicalSpanishDataset, tokenizer)
     )

-    base_model = loadBaseModel(MISTRAL_BASE_MODEL_ID)
-    base_model = modelLoraConfigBioMistral(base_model)
+    base_model = loadBaseModel(HUB_MODEL_ID)

+    configAndRunFineTuning(base_model, train_dataset, eval_dataset, tokenizer)
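Taken together, the commit leaves run_training_process pre-training from MISTRAL_BASE_MODEL_ID and pushing the result to HUB_MODEL_ID, while run_finnetuning_process reloads that checkpoint and instruction-tunes it through configAndRunFineTuning. A closing note on scale: one optimizer step consumes per_device_train_batch_size × gradient_accumulation_steps × number-of-devices packed sequences. MICRO_BATCH_SIZE and GRADIENT_ACCUMULATION_STEPS are constants defined near the top of spanish_medica_llm.py and are not visible in this diff, so the values below are assumptions for illustration.

# Assumed values; the real constants are defined elsewhere in the module.
MICRO_BATCH_SIZE = 1
GRADIENT_ACCUMULATION_STEPS = 4
NUM_DEVICES = 1  # the Space runs on a single GPU

effective_batch = MICRO_BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS * NUM_DEVICES
# With packing=True, each unit is a sequence of up to max_seq_length=1024 tokens.
print(effective_batch)  # 4 packed sequences per optimizer step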