Space status: Runtime error

Commit 71fd9c5 · Parent(s): fedb936

feat: add token rotation logic
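In short: utils.py now builds an HF_TOKENS pool from the HF_TOKEN and HF_TOKEN_1 through HF_TOKEN_9 environment variables, and sft.py keeps a module-level TOKEN_INDEX counter that get_pipeline() and get_prompt_generation_step() advance on every call, so each run passes the next token in the pool to InferenceEndpointsLLM rather than reusing one fixed key.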
src/distilabel_dataset_generator/pipelines/sft.py  CHANGED

@@ -1,11 +1,11 @@
-import os
-
 import pandas as pd
 from distilabel.llms import InferenceEndpointsLLM
 from distilabel.pipeline import Pipeline
 from distilabel.steps import KeepColumns
 from distilabel.steps.tasks import MagpieGenerator, TextGeneration
 
+from src.distilabel_dataset_generator.utils import HF_TOKENS
+
 INFORMATION_SEEKING_PROMPT = (
     "You are an AI assistant designed to provide accurate and concise information on a wide"
     " range of topics. Your purpose is to assist users in finding specific facts,"
@@ -139,6 +139,7 @@ _STOP_SEQUENCES = [
     " \n\n",
 ]
 DEFAULT_BATCH_SIZE = 1
+TOKEN_INDEX = 0
 
 
 def _get_output_mappings(num_turns):
@@ -189,15 +190,18 @@ if __name__ == "__main__":
 
 
 def get_pipeline(num_turns, num_rows, system_prompt):
+    global TOKEN_INDEX
     input_mappings = _get_output_mappings(num_turns)
     output_mappings = input_mappings
+    api_key = HF_TOKENS[TOKEN_INDEX % len(HF_TOKENS)]
+    TOKEN_INDEX += 1
     if num_turns == 1:
         with Pipeline(name="sft") as pipeline:
             magpie = MagpieGenerator(
                 llm=InferenceEndpointsLLM(
                     model_id=MODEL,
                     tokenizer_id=MODEL,
-                    api_key=
+                    api_key=api_key,
                     magpie_pre_query_template="llama3",
                     generation_kwargs={
                         "temperature": 0.8,  # it's the best value for Llama 3.1 70B Instruct
@@ -218,7 +222,7 @@ def get_pipeline(num_turns, num_rows, system_prompt):
                 llm=InferenceEndpointsLLM(
                     model_id=MODEL,
                     tokenizer_id=MODEL,
-                    api_key=
+                    api_key=api_key,
                     generation_kwargs={"temperature": 0.8, "max_new_tokens": 1024},
                 ),
                 system_prompt=system_prompt,
@@ -239,7 +243,7 @@ def get_pipeline(num_turns, num_rows, system_prompt):
                 llm=InferenceEndpointsLLM(
                     model_id=MODEL,
                     tokenizer_id=MODEL,
-                    api_key=
+                    api_key=api_key,
                     magpie_pre_query_template="llama3",
                     generation_kwargs={
                         "temperature": 0.8,  # it's the best value for Llama 3.1 70B Instruct
@@ -262,9 +266,12 @@ def get_pipeline(num_turns, num_rows, system_prompt):
 
 
 def get_prompt_generation_step():
+    global TOKEN_INDEX
+    api_key = HF_TOKENS[TOKEN_INDEX % len(HF_TOKENS)]
+    TOKEN_INDEX += 1
     generate_description = TextGeneration(
         llm=InferenceEndpointsLLM(
+            api_key=api_key,
             model_id=MODEL,
             tokenizer_id=MODEL,
             generation_kwargs={
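Taken together, the sft.py changes implement a simple round-robin over a shared token pool. Below is a minimal self-contained sketch of that pattern, assuming at least one token is set; the next_api_key helper is my name for illustration, whereas the commit inlines this logic directly in get_pipeline() and get_prompt_generation_step():

import os

# Token pool as utils.py intends it (see the note under that diff below):
# HF_TOKEN plus the optional HF_TOKEN_1 ... HF_TOKEN_9 secrets, unset ones dropped.
HF_TOKENS = [
    token
    for token in [os.getenv("HF_TOKEN")] + [os.getenv(f"HF_TOKEN_{i}") for i in range(1, 10)]
    if token
]

TOKEN_INDEX = 0


def next_api_key():
    """Return the next token in round-robin order (assumes HF_TOKENS is non-empty)."""
    global TOKEN_INDEX
    api_key = HF_TOKENS[TOKEN_INDEX % len(HF_TOKENS)]
    TOKEN_INDEX += 1
    return api_key

Each call hands a different token to InferenceEndpointsLLM, spreading rate limits across accounts. A bare global counter is not synchronized, though, so concurrent Gradio sessions can race on the increment; itertools.cycle guarded by a threading.Lock would make the rotation strict.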
src/distilabel_dataset_generator/utils.py  CHANGED

@@ -1,3 +1,5 @@
+import os
+
 import gradio as gr
 from gradio.oauth import (
     OAUTH_CLIENT_ID,
@@ -8,6 +10,9 @@ from gradio.oauth import (
 )
 from huggingface_hub import whoami
 
+HF_TOKENS = os.getenv("HF_TOKEN") + [os.getenv(f"HF_TOKEN_{i}") for i in range(1, 10)]
+HF_TOKENS = [token for token in HF_TOKENS if token]
+
 _CHECK_IF_SPACE_IS_SET = (
     all(
         [
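One caveat about this hunk as committed: os.getenv("HF_TOKEN") returns a str, or None when the variable is unset, and concatenating either with a list raises TypeError the moment utils.py is imported. That would crash the Space at startup, which is consistent with the Runtime error status shown at the top of this page. The apparent intent is to wrap the first token in a list; a corrected sketch of the two added lines (my fix, not what the commit contains):

HF_TOKENS = [os.getenv("HF_TOKEN")] + [os.getenv(f"HF_TOKEN_{i}") for i in range(1, 10)]
HF_TOKENS = [token for token in HF_TOKENS if token]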