Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
-
#BS_app.
|
2 |
-
#Training OK BUT REWRITE ALL FILES OF MODEL ON ORIGINAL REPO
|
3 |
|
4 |
#testing bloom1b training
|
5 |
|
@@ -10,6 +10,7 @@ from datasets import load_dataset, Dataset
|
|
10 |
from huggingface_hub import HfApi, HfFolder
|
11 |
import requests
|
12 |
from io import BytesIO
|
|
|
13 |
|
14 |
# Récupérer token depuis les variables d'environnement
|
15 |
hf_token = os.getenv("MisterAI_bigscience_bloom_560m")
|
@@ -33,9 +34,7 @@ def generate_response(input_text):
|
|
33 |
return response
|
34 |
|
35 |
# Fonction pour le fine-tuning
|
36 |
-
def fine_tune_model(dataset_path, dataset_file, epochs, batch_size, prefix):
|
37 |
-
import json # Assurez-vous que json est importé
|
38 |
-
|
39 |
# Récupération du fichier à partir de l'URL fournie
|
40 |
response = requests.get(f"{dataset_path}/resolve/main/{dataset_file}")
|
41 |
dataset_lines = response.text.strip().split('\n')
|
@@ -43,42 +42,20 @@ def fine_tune_model(dataset_path, dataset_file, epochs, batch_size, prefix):
|
|
43 |
# Convertir les lignes en dictionnaires pour JSONL
|
44 |
dataset_dict = [json.loads(line) for line in dataset_lines if line]
|
45 |
|
46 |
-
#MAPPAGE
|
47 |
-
# # Créer un Dataset Hugging Face
|
48 |
-
# dataset = Dataset.from_dict({
|
49 |
-
# 'question': [item['question'] for item in dataset_dict],
|
50 |
-
# 'chosen': [item['chosen'] for item in dataset_dict]
|
51 |
-
# })
|
52 |
-
#
|
53 |
-
# # Préparation des données
|
54 |
-
# def preprocess_function(examples):
|
55 |
-
# inputs = [q + ' ' + c for q, c in zip(examples['question'], examples['chosen'])]
|
56 |
-
# model_inputs = tokenizer(inputs, truncation=True, padding='max_length', max_length=512)
|
57 |
-
# model_inputs["labels"] = model_inputs["input_ids"].copy()
|
58 |
-
# return model_inputs#*
|
59 |
-
#
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
# Créer un Dataset Hugging Face
|
66 |
dataset = Dataset.from_dict({
|
67 |
-
|
68 |
-
|
|
|
69 |
})
|
70 |
|
71 |
# Préparation des données
|
72 |
def preprocess_function(examples):
|
73 |
-
inputs = [p +
|
74 |
model_inputs = tokenizer(inputs, truncation=True, padding='max_length', max_length=2048)
|
75 |
model_inputs["labels"] = model_inputs["input_ids"].copy()
|
76 |
return model_inputs
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
dataset = dataset.map(preprocess_function, batched=True)
|
83 |
dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
|
84 |
|
@@ -122,17 +99,6 @@ def fine_tune_model(dataset_path, dataset_file, epochs, batch_size, prefix):
|
|
122 |
|
123 |
|
124 |
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
# Interface Gradio
|
137 |
with gr.Blocks() as demo:
|
138 |
with gr.Tab("Chatbot"):
|
@@ -148,6 +114,10 @@ with gr.Blocks() as demo:
|
|
148 |
with gr.Row():
|
149 |
dataset_path = gr.Textbox(label="Chemin du dataset")
|
150 |
dataset_file = gr.Textbox(label="Nom du fichier du dataset")
|
|
|
|
|
|
|
|
|
151 |
epochs = gr.Number(label="Nombre d'époques", value=1)
|
152 |
batch_size = gr.Number(label="Taille du batch", value=2)
|
153 |
prefix = gr.Textbox(label="Préfixe pour les fichiers sauvegardés")
|
@@ -158,7 +128,7 @@ with gr.Blocks() as demo:
|
|
158 |
|
159 |
fine_tune_button.click(
|
160 |
fine_tune_model,
|
161 |
-
inputs=[dataset_path, dataset_file, epochs, batch_size, prefix],
|
162 |
outputs=fine_tune_output
|
163 |
)
|
164 |
|
@@ -166,5 +136,3 @@ with gr.Blocks() as demo:
|
|
166 |
# Lancement de la démo
|
167 |
if __name__ == "__main__":
|
168 |
demo.launch()
|
169 |
-
|
170 |
-
|
|
|
1 |
+
#BS_app.py_06
|
2 |
+
#Training OK BUT REWRITE ALL FILES OF MODEL ON ORIGINAL REPO!!
|
3 |
|
4 |
#testing bloom1b training
|
5 |
|
|
|
10 |
from huggingface_hub import HfApi, HfFolder
|
11 |
import requests
|
12 |
from io import BytesIO
|
13 |
+
import json
|
14 |
|
15 |
# Récupérer token depuis les variables d'environnement
|
16 |
hf_token = os.getenv("MisterAI_bigscience_bloom_560m")
|
|
|
34 |
return response
|
35 |
|
36 |
# Fonction pour le fine-tuning
|
37 |
+
def fine_tune_model(dataset_path, dataset_file, split, colonne_input_ids, colonne_attention_mask, colonne_labels, epochs, batch_size, prefix):
|
|
|
|
|
38 |
# Récupération du fichier à partir de l'URL fournie
|
39 |
response = requests.get(f"{dataset_path}/resolve/main/{dataset_file}")
|
40 |
dataset_lines = response.text.strip().split('\n')
|
|
|
42 |
# Convertir les lignes en dictionnaires pour JSONL
|
43 |
dataset_dict = [json.loads(line) for line in dataset_lines if line]
|
44 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
45 |
# Créer un Dataset Hugging Face
|
46 |
dataset = Dataset.from_dict({
|
47 |
+
colonne_input_ids: [item[colonne_input_ids] for item in dataset_dict],
|
48 |
+
colonne_attention_mask: [item.get(colonne_attention_mask, "") for item in dataset_dict],
|
49 |
+
colonne_labels: [item.get(colonne_labels, "") for item in dataset_dict]
|
50 |
})
|
51 |
|
52 |
# Préparation des données
|
53 |
def preprocess_function(examples):
|
54 |
+
inputs = [p +'' + c for p, c in zip(examples[colonne_input_ids], examples[colonne_attention_mask])]
|
55 |
model_inputs = tokenizer(inputs, truncation=True, padding='max_length', max_length=2048)
|
56 |
model_inputs["labels"] = model_inputs["input_ids"].copy()
|
57 |
return model_inputs
|
58 |
+
|
|
|
|
|
|
|
|
|
59 |
dataset = dataset.map(preprocess_function, batched=True)
|
60 |
dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
|
61 |
|
|
|
99 |
|
100 |
|
101 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
102 |
# Interface Gradio
|
103 |
with gr.Blocks() as demo:
|
104 |
with gr.Tab("Chatbot"):
|
|
|
114 |
with gr.Row():
|
115 |
dataset_path = gr.Textbox(label="Chemin du dataset")
|
116 |
dataset_file = gr.Textbox(label="Nom du fichier du dataset")
|
117 |
+
split = gr.Textbox(label="Split (si applicable)")
|
118 |
+
colonne_input_ids = gr.Textbox(label="Colonne input_ids")
|
119 |
+
colonne_attention_mask = gr.Textbox(label="Colonne attention_mask")
|
120 |
+
colonne_labels = gr.Textbox(label="Colonne labels (si applicable)")
|
121 |
epochs = gr.Number(label="Nombre d'époques", value=1)
|
122 |
batch_size = gr.Number(label="Taille du batch", value=2)
|
123 |
prefix = gr.Textbox(label="Préfixe pour les fichiers sauvegardés")
|
|
|
128 |
|
129 |
fine_tune_button.click(
|
130 |
fine_tune_model,
|
131 |
+
inputs=[dataset_path, dataset_file, split, colonne_input_ids, colonne_attention_mask, colonne_labels, epochs, batch_size, prefix],
|
132 |
outputs=fine_tune_output
|
133 |
)
|
134 |
|
|
|
136 |
# Lancement de la démo
|
137 |
if __name__ == "__main__":
|
138 |
demo.launch()
|
|
|
|