# Spaces: Running
# BS_app.py_07
# Training OK - training file uploads correctly
# Testing bloom1b training
import json
import os
from io import BytesIO

import gradio as gr
import requests
from datasets import Dataset, load_dataset
from huggingface_hub import HfApi, HfFolder
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
# Read the Hugging Face token from the environment variables.
hf_token = os.getenv("MisterAI_bigscience_bloom_560m")

# Register the token for use with Hugging Face; refuse to start without it.
if not hf_token:
    raise ValueError("Le token Hugging Face n'est pas configuré. Assurez-vous qu'il est défini dans les variables d'environnement.")
HfFolder.save_token(hf_token)

# Load the model and its tokenizer once, at import time. Both are module-level
# globals shared by the chat handler and the fine-tuning handler.
model_name = "MisterAI/bigscience_bloom-560m"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Generate a reply from the model.
def generate_response(input_text):
    """Return the model's generated text for ``input_text``.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        input_text: The prompt typed by the user.

    Returns:
        The decoded first generated sequence, with special tokens removed.
    """
    inputs = tokenizer(input_text, return_tensors="pt")
    # Per the transformers generation docs, ``max_length`` bounds the prompt
    # and the newly generated tokens together.
    outputs = model.generate(**inputs, max_length=100)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
## Fine-tuning function (earlier version, commented out)
#def fine_tune_model(dataset_path, dataset_file, split, colonne_input_ids, colonne_attention_mask, colonne_labels, epochs, batch_size):
#    # Fetch the file from the provided URL
#    response = requests.get(f"{dataset_path}/resolve/main/{dataset_file}")
#    dataset_lines = response.text.strip().split('\n')
#
#    # Convert the lines into dictionaries (JSONL)
#    dataset_dict = [json.loads(line) for line in dataset_lines if line]
#
#    # Build a Hugging Face Dataset
#    dataset = Dataset.from_dict({
#        colonne_input_ids: [item[colonne_input_ids] for item in dataset_dict],
#        colonne_attention_mask: [item.get(colonne_attention_mask, "") for item in dataset_dict],
#        colonne_labels: [item.get(colonne_labels, "") for item in dataset_dict]
#    })
#
#    # Prepare the data
#    def preprocess_function(examples):
#        inputs = [p +'' + c for p, c in zip(examples[colonne_input_ids], examples[colonne_attention_mask])]
#        model_inputs = tokenizer(inputs, truncation=True, padding='max_length', max_length=2048)
#        model_inputs["labels"] = model_inputs["input_ids"].copy()
#        return model_inputs
#
#    dataset = dataset.map(preprocess_function, batched=True)
#    dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
#
#    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
def _fetch_jsonl_records(dataset_path, dataset_file):
    """Download ``dataset_file`` from the repository URL and parse it as JSON Lines."""
    response = requests.get(f"{dataset_path}/resolve/main/{dataset_file}", timeout=60)
    # Fail on HTTP errors instead of trying to parse an error page as JSON.
    response.raise_for_status()
    # Split on "\n" only: str.splitlines() also breaks on separators that may
    # appear unescaped inside JSON strings.
    return [json.loads(line) for line in response.text.split("\n") if line.strip()]


def fine_tune_model(dataset_path, dataset_file, split, colonne_input_ids, colonne_attention_mask, colonne_labels, epochs, batch_size):
    """Fine-tune the module-level model on a JSONL dataset and upload the result.

    The dataset file is downloaded directly and parsed here. This function
    deliberately does not call ``load_dataset``: that module-level name is
    rebound by the Gradio handler of the same name, which only returns a
    status string.

    Args:
        dataset_path: Base URL of the dataset repository.
        dataset_file: Name of the JSONL file inside the repository.
        split: Accepted for interface compatibility; not used.
        colonne_input_ids: Key of the first text field of each record
            (required). Despite its name it holds raw text, not token ids.
        colonne_attention_mask: Key of the second text field, concatenated
            after the first one; missing values default to "".
        colonne_labels: Key copied into the dataset but not used when
            building the training text.
        epochs: Number of training epochs.
        batch_size: Per-device training batch size (cast to int because the
            Gradio number field may supply a float).

    Returns:
        A status message for the UI: success text, or an error description
        when the dataset cannot be downloaded, parsed, or is empty.
    """
    try:
        records = _fetch_jsonl_records(dataset_path, dataset_file)
        if not records:
            return "Erreur : le dataset est vide."
        dataset = Dataset.from_dict({
            colonne_input_ids: [record[colonne_input_ids] for record in records],
            colonne_attention_mask: [record.get(colonne_attention_mask, "") for record in records],
            colonne_labels: [record.get(colonne_labels, "") for record in records],
        })
    except (requests.RequestException, ValueError, KeyError) as exc:
        # Report data problems in the status box, like the dataset-check handler.
        return f"Erreur lors du chargement du dataset : {exc}"

    def preprocess_function(examples):
        """Concatenate the two text columns and tokenize them to fixed length."""
        inputs = [p + '' + c for p, c in zip(examples[colonne_input_ids], examples[colonne_attention_mask])]
        model_inputs = tokenizer(inputs, truncation=True, padding='max_length', max_length=2048)
        # Causal LM objective: the labels are the input ids themselves.
        model_inputs["labels"] = model_inputs["input_ids"].copy()
        return model_inputs

    dataset = dataset.map(preprocess_function, batched=True)
    dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    # Training configuration
    training_args = TrainingArguments(
        output_dir="./",
        num_train_epochs=epochs,
        per_device_train_batch_size=int(batch_size),
        save_steps=10_000,
        save_total_limit=2,
        push_to_hub=False,  # Disable automatic repository creation
        hub_token=hf_token,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset,
    )

    # Run the training
    trainer.train()

    # Save the model and tokenizer
    trainer.save_model("./")
    tokenizer.save_pretrained("./")

    # Push to the Hugging Face Hub.
    # NOTE(review): folder_path is the working directory, which is also the
    # training output_dir, so everything in it is uploaded, not just the model.
    api = HfApi()
    api.upload_folder(
        folder_path="./",
        repo_id=model_name,  # Same repo_id as the original model
        repo_type="model",
        token=hf_token,
    )
    return "Fine-tuning terminé et modèle sauvegardé."
# Gradio interface: a chat tab and a fine-tuning tab.
with gr.Blocks() as demo:
    with gr.Tab("Chatbot"):
        chat_interface = gr.Interface(
            fn=generate_response,
            inputs="text",
            outputs="text",
            title="Chat avec le modèle",
            description="Entrez votre message pour obtenir une réponse du modèle"
        )
    with gr.Tab("Fine-Tuning"):
        with gr.Row():
            dataset_path = gr.Textbox(label="Chemin du dataset", placeholder="https://huggingface.co/datasets/MisterAI/SimpleSmallFrenchQA")
            dataset_file = gr.Textbox(label="Nom du fichier du dataset", placeholder="Dataset02_01L_QR_256_Francais.jsonl")
        with gr.Row():
            split = gr.Textbox(label="Split (si applicable)")
            colonne_input_ids = gr.Textbox(label="Colonne input_ids")
            colonne_attention_mask = gr.Textbox(label="Colonne attention_mask")
            colonne_labels = gr.Textbox(label="Colonne labels (si applicable)")
        with gr.Row():
            load_button = gr.Button("Charger Le DataSet")
        with gr.Row():
            load_output = gr.Textbox(label="État du chargement du DataSet")
        with gr.Row():
            epochs = gr.Number(label="Nombre d'époques", value=1)
            # precision=0 makes the component deliver an int; a batch size
            # must be a whole number.
            batch_size = gr.Number(label="Taille du batch", value=2, precision=0)
        with gr.Row():
            fine_tune_button = gr.Button("Lancer Le Fine-Tuning")
        with gr.Row():
            fine_tune_output = gr.Textbox(label="État du Fine-Tuning")

        # NOTE: this definition rebinds the module-level name ``load_dataset``
        # and therefore shadows the function imported from ``datasets``. The
        # name is kept so existing references keep working.
        def load_dataset(dataset_path, dataset_file, split, colonne_input_ids, colonne_attention_mask, colonne_labels):
            """Check that the dataset file can be downloaded and parsed as JSONL.

            Only ``dataset_path`` and ``dataset_file`` are used; the other
            parameters mirror the form fields wired to the button.

            Returns:
                A success message, or the text of the error that occurred.
            """
            try:
                response = requests.get(f"{dataset_path}/resolve/main/{dataset_file}", timeout=60)
                # Surface HTTP errors directly rather than as a JSON parse error.
                response.raise_for_status()
                for line in response.text.strip().split('\n'):
                    # Skip blank lines, including ones holding only "\r".
                    if line.strip():
                        json.loads(line)
                return "DataSet chargé avec succès"
            except Exception as e:  # UI boundary: show any failure in the status box
                return str(e)

        load_button.click(
            load_dataset,
            inputs=[dataset_path, dataset_file, split, colonne_input_ids, colonne_attention_mask, colonne_labels],
            outputs=load_output
        )
        fine_tune_button.click(
            fine_tune_model,
            inputs=[dataset_path, dataset_file, split, colonne_input_ids, colonne_attention_mask, colonne_labels, epochs, batch_size],
            outputs=fine_tune_output
        )
# Launch the demo when the file is run as a script.
if __name__ == "__main__":
    demo.launch()