Update app.py

app.py CHANGED
@@ -8,6 +8,8 @@ import os
 from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
 from datasets import load_dataset, Dataset
 from huggingface_hub import HfApi, HfFolder
+import requests
+from io import BytesIO
 
 # Retrieve the token from the environment variables
 hf_token = os.getenv("MisterAI_bigscience_bloom_560m")
@@ -31,20 +33,37 @@ def generate_response(input_text):
     return response
 
 # Fine-tuning function
+#def fine_tune_model(dataset_path, dataset_file, epochs, batch_size, prefix):
+#    # Load the dataset
+#    if dataset_path.startswith("https://huggingface.co/datasets/"):
+#        dataset = load_dataset('json', data_files={dataset_file: dataset_path})
+#    else:
+#        dataset = load_dataset('json', data_files={dataset_file: dataset_path})
+#
+#    # Prepare the data
+#    dataset = Dataset.from_dict(dataset[dataset_file])
+#    dataset = dataset.map(lambda x: tokenizer(x['question'] + ' ' + x['chosen'], truncation=True, padding='max_length'), batched=True)
+#    dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
+#
+#    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
+
+
+
+
 def fine_tune_model(dataset_path, dataset_file, epochs, batch_size, prefix):
-    # Load the dataset
-    if dataset_path.startswith("https://huggingface.co/datasets/"):
-        dataset = load_dataset('json', data_files={dataset_file: dataset_path})
-    else:
-        dataset = load_dataset('json', data_files={dataset_file: dataset_path})
+    # Fetch the file from the provided URL
+    response = requests.get(dataset_path)
+    dataset = list(response.iter_lines())
 
     # Prepare the data
-    dataset = Dataset.from_dict(dataset[dataset_file])
+    dataset = Dataset.from_dict({'data': dataset})
     dataset = dataset.map(lambda x: tokenizer(x['question'] + ' ' + x['chosen'], truncation=True, padding='max_length'), batched=True)
     dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
 
     data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
 
+
+
     # Training configuration
     training_args = TrainingArguments(
         output_dir=f"./{prefix}_{model_name.split('/')[-1]}",
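
One caveat with the new download path: requests' iter_lines() yields raw bytes, and Dataset.from_dict({'data': dataset}) creates a single 'data' column, so the map() call that follows, which indexes x['question'] and x['chosen'], would not find those fields; the lambda also concatenates lists rather than strings once batched=True. Below is a minimal sketch of a loader and mapper that keep the expected columns, assuming the file behind dataset_path is JSON Lines with 'question' and 'chosen' fields and that tokenizer is the module-level tokenizer app.py loads elsewhere (load_remote_jsonl and tokenize_batch are hypothetical names):

import json

import requests
from datasets import Dataset

def load_remote_jsonl(dataset_path):
    # iter_lines() yields raw bytes, one JSON record per line; decode and
    # parse them so Dataset gets real columns instead of one 'data' blob.
    response = requests.get(dataset_path)
    response.raise_for_status()
    return Dataset.from_list([json.loads(line) for line in response.iter_lines() if line])

def tokenize_batch(batch):
    # With batched=True the mapper receives lists of strings, so the pairs
    # must be joined element-wise rather than with 'list + str'.
    texts = [q + ' ' + c for q, c in zip(batch['question'], batch['chosen'])]
    return tokenizer(texts, truncation=True, padding='max_length')

fine_tune_model could then call dataset = load_remote_jsonl(dataset_path) followed by dataset = dataset.map(tokenize_batch, batched=True).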
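
The diff also truncates inside the TrainingArguments( call. Here is a hedged sketch of how the rest of the training setup could look, wiring in the epochs, batch_size, and prefix parameters that fine_tune_model already receives; only the output_dir line is confirmed by the diff, and model / model_name are assumed to be the globals loaded earlier in app.py. One further caveat: the tokenizer emits no 'labels' column, so listing it in set_format would fail, while DataCollatorForLanguageModeling(mlm=False) already derives labels from input_ids at batch time:

# Hypothetical continuation of fine_tune_model; Trainer and TrainingArguments
# are already imported at the top of app.py.
training_args = TrainingArguments(
    output_dir=f"./{prefix}_{model_name.split('/')[-1]}",  # confirmed by the diff
    num_train_epochs=epochs,                    # assumption: wire in the epochs argument
    per_device_train_batch_size=batch_size,    # assumption: wire in the batch_size argument
)

# mlm=False makes the collator clone input_ids into labels for the causal-LM
# loss, so the dataset only needs the columns the tokenizer actually produced.
dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])

trainer = Trainer(
    model=model,  # assumption: the AutoModelForCausalLM instance loaded earlier
    args=training_args,
    train_dataset=dataset,
    data_collator=data_collator,
)
trainer.train()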