import argparse
import itertools
import math
import os
from pathlib import Path
from typing import Optional
import subprocess
import sys
from datetime import datetime
from dataclasses import dataclass, field

import numpy as np
import torch
from datasets import load_dataset, concatenate_datasets
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    Trainer,
    GenerationConfig
)
from accelerate import FullyShardedDataParallelPlugin, Accelerator
from torch.distributed.fsdp.fully_sharded_data_parallel import FullOptimStateDictConfig, FullStateDictConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
#import wandb
from trl import SFTTrainer
from huggingface_hub import login

CHAT_ML_TEMPLATE_Mistral_7B_Instruct = """
{% if messages[0]['role'] == 'system' %}
{% set loop_messages = messages[1:] %}
{% set system_message = messages[0]['content'].strip() + '\n\n' %}
{% else %}
{% set loop_messages = messages %}
{% set system_message = '' %}
{% endif %}
{{ bos_token }}
{% for message in loop_messages %}
{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}
{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}
{% endif %}
{% if loop.index0 == 0 %}
{% set content = system_message + message['content'] %}
{% else %}
{% set content = message['content'] %}
{% endif %}
{% if message['role'] == 'user' %}
{{ '[INST] ' + content.strip() + ' [/INST]' }}
{% elif message['role'] == 'assistant' %}
{{ ' ' + content.strip() + ' ' + eos_token }}
{% endif %}
{% endfor %}
"""

def parse_args():
    parser = argparse.ArgumentParser(description="Simple example of a training script.")
    parser.add_argument(
        "--pretrained_model_name_or_path",
        type=str,
        default=None,
        #required=True,
        help="Path to pretrained model or model identifier from huggingface.co/models.",
    )
    parser.add_argument(
        "--tokenizer_name",
        type=str,
        default=None,
        help="Pretrained tokenizer name or path if not the same as model_name",
    )
    parser.add_argument(
        "--instance_data_dir",
        type=str,
        default=None,
        #required=True,
        help="A folder containing the training data of instance images.",
    )
    parser.add_argument(
        "--class_data_dir",
        type=str,
        default=None,
        required=False,
        help="A folder containing the training data of class images.",
    )
    parser.add_argument(
        "--instance_prompt",
        type=str,
        default=None,
        help="The prompt with identifier specifying the instance",
    )
    parser.add_argument(
        "--class_prompt",
        type=str,
        default="",
        help="The prompt to specify images in the same class as provided instance images.",
    )
    parser.add_argument(
        "--with_prior_preservation",
        default=False,
        action="store_true",
        help="Flag to add prior preservation loss.",
    )
    parser.add_argument("--prior_loss_weight", type=float, default=1.0, help="The weight of prior preservation loss.")
    parser.add_argument(
        "--num_class_images",
        type=int,
        default=100,
        help=(
            "Minimal class images for prior preservation loss. If there are not enough images, additional images"
            " will be sampled with class_prompt."
        ),
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        default="",
        help="The output directory where the model predictions and checkpoints will be written.",
    )
    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
    parser.add_argument(
        "--resolution",
        type=int,
        default=512,
        help=(
            "The resolution for input images; all the images in the train/validation dataset will be resized to this"
            " resolution"
        ),
    )
    parser.add_argument(
        "--center_crop", action="store_true", help="Whether to center crop images before resizing to resolution"
    )
    parser.add_argument("--train_text_encoder", action="store_true", help="Whether to train the text encoder")
    parser.add_argument(
        "--train_batch_size", type=int, default=4, help="Batch size (per device) for the training dataloader."
    )
    parser.add_argument(
        "--sample_batch_size", type=int, default=4, help="Batch size (per device) for sampling images."
    )
    parser.add_argument("--num_train_epochs", type=int, default=1)
    parser.add_argument(
        "--max_train_steps",
        type=int,
        default=None,
        help="Total number of training steps to perform. If provided, overrides num_train_epochs.",
    )
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help="Number of update steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument(
        "--gradient_checkpointing",
        action="store_true",
        help="Whether or not to use gradient checkpointing to save memory at the expense of a slower backward pass.",
    )
    parser.add_argument(
        "--learning_rate",
        type=float,
        default=5e-6,
        help="Initial learning rate (after the potential warmup period) to use.",
    )
    parser.add_argument(
        "--scale_lr",
        action="store_true",
        default=False,
        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
    )
    parser.add_argument(
        "--lr_scheduler",
        type=str,
        default="constant",
        help=(
            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
            ' "constant", "constant_with_warmup"]'
        ),
    )
    parser.add_argument(
        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
    )
    parser.add_argument(
        "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes."
    )
    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
    parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
    parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
    parser.add_argument(
        "--hub_model_id",
        type=str,
        default=None,
        help="The name of the repository to keep in sync with the local `output_dir`.",
    )
    parser.add_argument(
        "--logging_dir",
        type=str,
        default="logs",
        help=(
            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
        ),
    )
    parser.add_argument(
        "--mixed_precision",
        type=str,
        default="no",
        choices=["no", "fp16", "bf16"],
        help=(
            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16)."
            " bf16 requires PyTorch >= 1.10 and an Nvidia Ampere GPU."
        ),
    )
    parser.add_argument(
        "--save_n_steps",
        type=int,
        default=1,
        help="Save the model every n global_steps",
    )
    parser.add_argument(
        "--save_starting_step",
        type=int,
        default=1,
        help="The step from which to start saving intermediary checkpoints",
    )
    parser.add_argument(
        "--stop_text_encoder_training",
        type=int,
        default=1000000,
        help="The step at which the text_encoder is no longer trained",
    )
    parser.add_argument(
        "--image_captions_filename",
        action="store_true",
        help="Get captions from filename",
    )
    parser.add_argument(
        "--dump_only_text_encoder",
        action="store_true",
        default=False,
        help="Dump only the text encoder",
    )
    parser.add_argument(
        "--train_only_unet",
        action="store_true",
        default=False,
        help="Train only the unet",
    )
    parser.add_argument(
        "--Session_dir",
        type=str,
        default="",
        help="Current session directory",
    )
    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")

    args = parser.parse_args()
    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
    if env_local_rank != -1 and env_local_rank != args.local_rank:
        args.local_rank = env_local_rank

    #if args.instance_data_dir is None:
    #    raise ValueError("You must specify a train data directory.")
    #if args.with_prior_preservation:
    #    if args.class_data_dir is None:
    #        raise ValueError("You must specify a data directory for class images.")
    #    if args.class_prompt is None:
    #        raise ValueError("You must specify prompt for class images.")

    return args
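
# Example invocation (hypothetical values, shown only to illustrate the CLI defined above;
# the training entry points further below do not currently consume most of these arguments):
#
#   python train.py \
#       --pretrained_model_name_or_path BioMistral/BioMistral-7B \
#       --output_dir ./biomistral-spanish-medica-llm \
#       --learning_rate 2.5e-5 \
#       --gradient_accumulation_steps 4
#   # (adjust "train.py" to this file's actual name in the Space)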

def run_training(args_imported):
    args_default = parse_args()
    #args = merge_args(args_default, args_imported)
    # NOTE: merge_args is not defined in this script, so we fall back to the parsed defaults.
    return args_default

TOKEN_NAME = "DeepESP/gpt2-spanish-medium"
TOKEN_MISTRAL_NAME = "mistralai/Mistral-7B-Instruct-v0.1"
SPANISH_MEDICA_LLM_DATASET = "somosnlp/spanish_medica_llm"

TOPIC_TYPE_DIAGNOSTIC = 'medical_diagnostic'
TOPIC_TYPE_TRATAMIENT = 'medical_topic'
FILTER_CRITERIA = [TOPIC_TYPE_DIAGNOSTIC, TOPIC_TYPE_TRATAMIENT]
CONTEXT_LENGTH = 256  # Maximum number of tokens per training chunk

MISTRAL_BASE_MODEL_ID = "BioMistral/BioMistral-7B"
MICRO_BATCH_SIZE = 16  # 32 for GPUs bigger than a T4
BATCH_SIZE = 64  # 128 for GPUs bigger than a T4
GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE

PROJECT_NAME = "spanish-medica-llm"
BASE_MODEL_NAME = "biomistral"
run_name = BASE_MODEL_NAME + "-" + PROJECT_NAME
output_dir = "./" + run_name
HUB_MODEL_ID = 'somosnlp/spanish_medica_llm'

MAX_TRAINING_STEPS = int(1500 / 2)
MAX_TRAINING_STEPS = 2  # Overrides the value above; keep only for quick smoke tests
TOKEN_NAME = TOKEN_MISTRAL_NAME  # Overrides the Spanish GPT-2 tokenizer with the Mistral one
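
# Effective batch size sketch: with the values above, each optimizer update sees
# MICRO_BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS = 16 * (64 // 16) = 64 samples per device,
# i.e. GRADIENT_ACCUMULATION_STEPS = 4 forward/backward passes are accumulated before each step.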

def get_chat_format(element):
    """
    Processes a single sample from the dataset to structure it for chatbot training.

    This function transforms the dataset sample into a format suitable for training,
    where each message is categorized by its role in the conversation (system, user, assistant).
    It initializes the conversation with a system message, follows with the user's
    instruction built from the clinical case, and ends with the assistant's output (the topic).

    Parameters
    ----------
    element : dict
        A dictionary representing a single sample from the dataset. It must contain
        the 'raw_text' (clinical case) and 'topic' (diagnosis) keys.

    Returns
    -------
    dict
        The modified sample, with 'raw_text' replaced by a list of ordered messages,
        each annotated with its role in the conversation.
    """
    prompt_template = """A partir del caso clínico que se expone a continuación, tu tarea es la siguiente.
Como médico experto, tu tarea es la de diagnosticar al paciente en base al caso clínico. Responde únicamente con el diagnóstico para el paciente de forma concisa.
Caso clínico: {caso_clinico}
"""
    # How to use it with an LLM:
    system_prompt = "Eres un experto en medicina que realiza diagnósticos en base a casos clínicos."

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt_template.format(caso_clinico=element["raw_text"])},
        {"role": "assistant", "content": element["topic"]},
    ]
    element["raw_text"] = messages
    return element
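
# Rough sketch of the transformation above (field values abbreviated, not real dataset content):
#
#   get_chat_format({"raw_text": "Paciente de 45 años con ...", "topic": "neumonía"})
#   # -> {"raw_text": [
#   #        {"role": "system", "content": "Eres un experto en medicina ..."},
#   #        {"role": "user", "content": "A partir del caso clínico ... Caso clínico: Paciente de 45 años con ..."},
#   #        {"role": "assistant", "content": "neumonía"},
#   #     ],
#   #     "topic": "neumonía"}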

def loadSpanishTokenizer():
    """
    Load a Spanish tokenizer that reuses the Mistral special tokens and chat template.
    """
    # Load first the tokenizer used by Mistral
    tokenizerMistrall = AutoTokenizer.from_pretrained(TOKEN_MISTRAL_NAME)

    # Load second a Spanish-specialized tokenizer
    tokenizer = AutoTokenizer.from_pretrained(
        TOKEN_NAME,
        eos_token = tokenizerMistrall.special_tokens_map['eos_token'],
        bos_token = tokenizerMistrall.special_tokens_map['bos_token'],
        unk_token = tokenizerMistrall.special_tokens_map['unk_token']
    )
    tokenizer.chat_template = CHAT_ML_TEMPLATE_Mistral_7B_Instruct

    return tokenizer
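
# Quick sanity check one can run after loading (illustrative only):
#
#   tok = loadSpanishTokenizer()
#   print(tok.bos_token, tok.eos_token, tok.unk_token)  # expected to mirror Mistral's <s>, </s>, <unk>
#   print(tok.chat_template is CHAT_ML_TEMPLATE_Mistral_7B_Instruct)  # True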

def tokenize(element, tokenizer):
    outputs = tokenizer(
        element["raw_text"],
        truncation = True,
        max_length = CONTEXT_LENGTH,
        return_overflowing_tokens = True,
        return_length = True,
    )
    input_batch = []
    for length, input_ids in zip(outputs["length"], outputs["input_ids"]):
        # Keep only chunks that fill the full context window
        if length == CONTEXT_LENGTH:
            input_batch.append(input_ids)
    return {"input_ids": input_batch}
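
# Behavior sketch (assuming a hypothetical document of roughly 600 tokens): with
# return_overflowing_tokens=True the tokenizer splits it into chunks of 256 + 256 + ~88 tokens,
# and the filter above keeps only the two full 256-token chunks, dropping the short remainder.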

def apply_chat_template(example, tokenizer):
    example['raw_text'] = tokenizer.apply_chat_template(example['raw_text'], tokenize=False)
    return example

def splitDatasetInTestValid(dataset):
    """
    Split the 'test' partition of a train/test DatasetDict into evaluation and test sets,
    returning (train, eval, test).
    """
    if dataset is None or dataset['train'] is None:
        return dataset
    elif dataset['test'] is None:
        return None
    else:
        test_eval = dataset['test'].train_test_split(test_size=0.001)
        eval_dataset = test_eval['train']
        test_dataset = test_eval['test']
        return (dataset['train'], eval_dataset, test_dataset)

def loadSpanishDataset():
    spanishMedicaLllmDataset = load_dataset(SPANISH_MEDICA_LLM_DATASET, split="train")
    spanishMedicaLllmDataset = spanishMedicaLllmDataset.filter(lambda example: example["topic_type"] not in FILTER_CRITERIA)
    spanishMedicaLllmDataset = spanishMedicaLllmDataset.train_test_split(0.2, seed=203984)
    return spanishMedicaLllmDataset

def loadSpanishDatasetFinnetuning():
    spanishMedicaLllmDataset = load_dataset(SPANISH_MEDICA_LLM_DATASET, split="train")
    spanishMedicaLllmDataset = spanishMedicaLllmDataset.filter(lambda example: example["topic_type"] in FILTER_CRITERIA)
    return spanishMedicaLllmDataset

## See the Jupyter notebook for changing the CONTEXT_LENGTH size
def applyChatInstructFormat(dataset, filterColumns = ['raw_text', 'topic']):
    """
    Apply the instruction chat template to every sample, keeping only the needed columns.
    """
    if dataset is None:
        return dataset
    else:
        dataset = dataset.remove_columns([col for col in dataset.features if col not in filterColumns])
        return dataset.map(
            get_chat_format,
            batched=False,
            num_proc=4
        )

def accelerateConfigModel():
    """
    Configure FSDP via Accelerate. Only works with accelerator (GPU) support; on CPU-only hosts it raises:
    RuntimeError: There are currently no available devices found, must be one of 'XPU', 'CUDA', or 'NPU'.
    """
    fsdp_plugin = FullyShardedDataParallelPlugin(
        state_dict_config=FullStateDictConfig(offload_to_cpu=True, rank0_only=False),
        optim_state_dict_config=FullOptimStateDictConfig(offload_to_cpu=True, rank0_only=False),
    )
    return Accelerator(fsdp_plugin=fsdp_plugin)

def getTokenizedDataset(dataset, tokenizer):
    if dataset is None or tokenizer is None:
        return dataset
    return dataset.map(
        lambda element: tokenize(element, tokenizer),
        batched = True,
        remove_columns = dataset["train"].column_names
    )

def loadBaseModel(base_model_id):
    if base_model_id in ["", None]:
        return None
    else:
        # Load the base model in 4-bit NF4 with double quantization, computing in bfloat16
        bnb_config = BitsAndBytesConfig(
            load_in_4bit = True,
            bnb_4bit_quant_type = "nf4",
            bnb_4bit_use_double_quant = True,
            bnb_4bit_compute_dtype = torch.bfloat16
        )
        model = AutoModelForCausalLM.from_pretrained(
            base_model_id,
            quantization_config = bnb_config
        )
        model.gradient_checkpointing_enable()
        model = prepare_model_for_kbit_training(model)
        return model

def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

def modelLoraConfigBioMistral(model):
    """
    r is the rank of the low-rank matrices used in the adapters, which thus controls
    the number of parameters trained. A higher rank allows for more expressivity, but there
    is a compute tradeoff.

    alpha is the scaling factor for the learned weights. The weight matrix is scaled by
    alpha/r, so a higher value for alpha assigns more weight to the LoRA activations.

    The values used in the QLoRA paper were r=64 and lora_alpha=16, and these are said to
    generalize well, but we will use r=8 and lora_alpha=16 so that we put more emphasis on
    the new fine-tuned data while also reducing computational complexity.
    """
    if model is None:
        return model
    else:
        config = LoraConfig(
            r=8,
            lora_alpha=16,
            target_modules=[
                "q_proj",
                "k_proj",
                "v_proj",
                "o_proj",
                "gate_proj",
                "up_proj",
                "down_proj",
                "lm_head",
            ],
            bias="none",
            lora_dropout=0.05,  # Conventional
            task_type="CAUSAL_LM",
        )

        model = get_peft_model(model, config)
        print_trainable_parameters(model)

        accelerator = accelerateConfigModel()
        # Apply the accelerator. You can comment this out to remove the accelerator.
        model = accelerator.prepare_model(model)

        return model
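
# Worked numbers for the configuration above (a sketch; exact counts depend on the model):
# each adapted weight W (d_out x d_in) gains matrices A (r x d_in) and B (d_out x r), i.e.
# r * (d_in + d_out) extra trainable parameters, and the LoRA update is scaled by
# alpha / r = 16 / 8 = 2. For a hypothetical 4096 x 4096 projection with r=8 that is
# 8 * (4096 + 4096) = 65,536 trainable parameters versus ~16.8M frozen ones.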

def getLoraConfiguration():
    """
    Return the LoRA configuration used for supervised fine-tuning (same hyperparameters as above).
    """
    return LoraConfig(
        r=8,
        lora_alpha=16,
        target_modules=[
            "q_proj",
            "k_proj",
            "v_proj",
            "o_proj",
            "gate_proj",
            "up_proj",
            "down_proj",
            "lm_head",
        ],
        bias="none",
        lora_dropout=0.05,  # Conventional
        task_type="CAUSAL_LM",
    )

# A note on training. You can set max_steps high initially and examine at what step your
# model's performance starts to degrade. That is where you'll find the sweet spot for how many
# steps to perform. For example, say you start with 1000 steps and find that at around 500 steps
# the model starts overfitting: the validation loss goes up (bad) while the training
# loss goes down significantly, meaning the model is learning the training set really well
# but is unable to generalize to new datapoints. Therefore, 500 steps would be your sweet spot,
# so you would use the checkpoint-500 model repo in your output dir (biomistral-spanish-medica-llm)
# as your final model in step 6 below.
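
# A minimal sketch of that checkpoint selection (hypothetical step count, assuming the PEFT
# adapter checkpoints saved every `save_steps` by the Trainer below):
#
#   from peft import PeftModel
#   base = loadBaseModel(MISTRAL_BASE_MODEL_ID)
#   model = PeftModel.from_pretrained(base, f"{output_dir}/checkpoint-500")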

def configAndRunTraining(basemodel, dataset, eval_dataset, tokenizer):
    if basemodel is None or dataset is None or tokenizer is None:
        return None
    else:
        tokenizer.pad_token = tokenizer.eos_token
        data_collator_pretrain = DataCollatorForLanguageModeling(tokenizer, mlm = False)

        training_args = TrainingArguments(
            output_dir=output_dir,
            push_to_hub = True,
            hub_private_repo = False,
            hub_model_id = HUB_MODEL_ID,
            warmup_steps = 5,
            per_device_train_batch_size = MICRO_BATCH_SIZE,
            per_device_eval_batch_size = 1,
            #gradient_checkpointing=True,
            gradient_accumulation_steps = GRADIENT_ACCUMULATION_STEPS,
            max_steps = MAX_TRAINING_STEPS,
            learning_rate = 2.5e-5,         # We want this about 10x smaller than the Mistral learning rate
            logging_steps = 50,
            optim = "paged_adamw_8bit",
            logging_dir = "./logs",         # Directory for storing logs
            save_strategy = "steps",        # Save a model checkpoint every save_steps
            save_steps = 50,                # Save checkpoints every 50 steps
            evaluation_strategy = "steps",  # Evaluate the model every eval_steps
            eval_steps = 50,                # Evaluate every 50 steps
            do_eval = True,                 # Perform evaluation during training
            report_to = None,               # Set to "wandb" to log to Weights & Biases
            run_name = f"{run_name}-{datetime.now().strftime('%Y-%m-%d-%H-%M')}",  # Name of the W&B run (optional)
            fp16 = True,                    # Set for a T4 GPU; on more powerful GPUs (e.g. A100) switch to bf16=True, fp16=False
            bf16 = False
        )

        trainer = Trainer(
            model = basemodel,
            train_dataset = dataset,
            eval_dataset = eval_dataset,
            args = training_args,
            data_collator = data_collator_pretrain
        )

        basemodel.config.use_cache = False  # Silence the warnings. Please re-enable for inference!
        trainer.train()
        trainer.push_to_hub()

def configAndRunFineTuning(basemodel, dataset, eval_dataset, tokenizer):
    if basemodel is None or dataset is None or tokenizer is None:
        return None
    else:
        tokenizer.pad_token = tokenizer.eos_token

        training_args = TrainingArguments(
            output_dir=output_dir,
            push_to_hub = True,
            hub_private_repo = False,
            hub_model_id = HUB_MODEL_ID,
            warmup_steps = 5,
            per_device_train_batch_size = MICRO_BATCH_SIZE,
            per_device_eval_batch_size = 1,
            #gradient_checkpointing=True,
            gradient_accumulation_steps = GRADIENT_ACCUMULATION_STEPS,
            num_train_epochs = 1,
            learning_rate = 2.5e-5,         # We want this about 10x smaller than the Mistral learning rate
            logging_steps = 5,
            optim = "paged_adamw_8bit",
            logging_dir = "./logs",         # Directory for storing logs
            save_strategy = "steps",        # Save a model checkpoint every save_steps
            save_steps = 50,                # Save checkpoints every 50 steps
            evaluation_strategy = "steps",  # Evaluate the model every eval_steps
            eval_steps = 50,                # Evaluate every 50 steps
            do_eval = True,                 # Perform evaluation during training
            save_total_limit = 2,
            remove_unused_columns = True,
            report_to = None,               # Set to "wandb" to log to Weights & Biases
            run_name = f"{run_name}-{datetime.now().strftime('%Y-%m-%d-%H-%M')}",  # Name of the W&B run (optional)
            fp16 = True,                    # Set for a T4 GPU; on more powerful GPUs (e.g. A100) switch to bf16=True, fp16=False
            bf16 = False
        )

        trainer = SFTTrainer(
            model = basemodel,
            train_dataset = dataset,
            eval_dataset = eval_dataset,
            peft_config = getLoraConfiguration(),
            dataset_text_field = "raw_text",
            max_seq_length = 1024,  # 512
            tokenizer = tokenizer,
            args = training_args,
            dataset_kwargs = {
                "add_special_tokens": False,   # We template with special tokens
                "append_concat_token": False,  # No need to add an additional separator token
            },
            packing = True
        )

        basemodel.config.use_cache = False  # Silence the warnings. Please re-enable for inference!
        trainer.train()
        trainer.push_to_hub()

def run_training_process():
    # Log in to Hugging Face
    login(token = os.environ.get('HG_FACE_TOKEN'))
    os.environ['WANDB_DISABLED'] = 'true'

    tokenizer = loadSpanishTokenizer()
    medicalSpanishDataset = loadSpanishDataset()

    train_dataset, eval_dataset, test_dataset = splitDatasetInTestValid(
        getTokenizedDataset(medicalSpanishDataset, tokenizer)
    )
    # NOTE: the line below overwrote the tokenized splits with raw text, which the
    # language-modeling collator cannot consume, so it is kept only as a reference.
    # train_dataset, eval_dataset, test_dataset = splitDatasetInTestValid(medicalSpanishDataset)

    base_model = loadBaseModel(MISTRAL_BASE_MODEL_ID)
    base_model = modelLoraConfigBioMistral(base_model)

    configAndRunTraining(base_model, train_dataset, eval_dataset, tokenizer)

def run_finnetuning_process():
    # Log in to Hugging Face
    login(token = os.environ.get('HG_FACE_TOKEN'))
    os.environ['WANDB_DISABLED'] = 'true'

    tokenizer = loadSpanishTokenizer()
    medicalSpanishDataset = applyChatInstructFormat(loadSpanishDatasetFinnetuning())

    print(tokenizer.apply_chat_template(medicalSpanishDataset[5]['raw_text'], tokenize=False))
    print('----------------------------------------------------------')

    medicalSpanishDataset = medicalSpanishDataset.map(
        apply_chat_template,
        num_proc = os.cpu_count(),
        fn_kwargs = {'tokenizer': tokenizer},
        remove_columns = [col for col in medicalSpanishDataset.features if col not in ['raw_text']],
        desc = 'Applying chat template'
    )
    medicalSpanishDataset = medicalSpanishDataset.train_test_split(0.2, seed=203984)
    train_dataset, eval_dataset, test_dataset = splitDatasetInTestValid(medicalSpanishDataset)

    base_model = loadBaseModel(HUB_MODEL_ID)
    print('Dataset in One ')
    print(train_dataset[5])

    configAndRunFineTuning(base_model, train_dataset, eval_dataset, tokenizer)

def generate_response(query):
    max_new_tokens = 256
    temperature = 0.1
    top_p = 0.75
    top_k = 40
    num_beams = 2

    tokenizer = loadSpanishTokenizer()
    model = loadBaseModel(HUB_MODEL_ID)

    system = f"[INST]\nYou are a helpful coding assistant.[/INST]\n"
    prompt = f"{system}\n{query}\n \n"
    print(prompt)

    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"].to("cuda")
    attention_mask = inputs["attention_mask"].to("cuda")

    generation_config = GenerationConfig(
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        num_beams=num_beams,
    )
    with torch.no_grad():
        generation_output = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            generation_config=generation_config,
            return_dict_in_generate=True,
            #output_scores=True,
            max_new_tokens=max_new_tokens,
            early_stopping=True
        )
    s = generation_output.sequences[0]
    output = tokenizer.decode(s, skip_special_tokens=True)
    return output
    # return output.split("<|assistant|>")[1]
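
# Illustrative usage (hypothetical query; requires a CUDA device and access to HUB_MODEL_ID):
#
#   respuesta = generate_response("Paciente de 60 años con dolor torácico opresivo y disnea...")
#   print(respuesta)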