Spaces:
Build error
Build error
| distilabel: | |
| version: 1.0.0 | |
| pipeline: | |
| name: farming | |
| description: null | |
| steps: | |
| - step: | |
| name: load_data | |
| input_mappings: {} | |
| output_mappings: {} | |
| batch_size: 64 | |
| data: | |
| - input: animal welfare from a Family Farming perspective | |
| - input: animal welfare from an Agribusiness perspective | |
| - input: animal welfare from a Permaculture perspective | |
| - input: animal welfare from an Agroforestry perspective | |
| - input: animal welfare from a Conventional Farming perspective | |
| - input: economic growth from a Family Farming perspective | |
| - input: economic growth from an Agribusiness perspective | |
| - input: economic growth from a Permaculture perspective | |
| - input: economic growth from an Agroforestry perspective | |
| - input: economic growth from a Conventional Farming perspective | |
| - input: land from a Family Farming perspective | |
| - input: land from an Agribusiness perspective | |
| - input: land from a Permaculture perspective | |
| - input: land from an Agroforestry perspective | |
| - input: land from a Conventional Farming perspective | |
| - input: resources from a Family Farming perspective | |
| - input: resources from an Agribusiness perspective | |
| - input: resources from a Permaculture perspective | |
| - input: resources from an Agroforestry perspective | |
| - input: resources from a Conventional Farming perspective | |
| - input: efficiency from a Family Farming perspective | |
| - input: efficiency from an Agribusiness perspective | |
| - input: efficiency from a Permaculture perspective | |
| - input: efficiency from an Agroforestry perspective | |
| - input: efficiency from a Conventional Farming perspective | |
| runtime_parameters_info: | |
| - name: batch_size | |
| optional: true | |
| description: The number of rows that will contain the batches generated by | |
| the step. | |
| type_info: | |
| module: distilabel.steps.generators.data | |
| name: LoadDataFromDicts | |
| name: load_data | |
| - step: | |
| name: self-instruct | |
| input_mappings: {} | |
| output_mappings: {} | |
| input_batch_size: 8 | |
| llm: | |
| generation_kwargs: {} | |
| model_id: null | |
| endpoint_name: null | |
| endpoint_namespace: null | |
| base_url: https://hh1rkuymnetmkw9m.eu-west-1.aws.endpoints.huggingface.cloud | |
| tokenizer_id: null | |
| model_display_name: null | |
| use_openai_client: false | |
| type_info: | |
| module: distilabel.llms.huggingface.inference_endpoints | |
| name: InferenceEndpointsLLM | |
| group_generations: false | |
| num_generations: 1 | |
| num_instructions: 5 | |
| criteria_for_query_generation: 'Incorporate a diverse range of verbs, avoiding | |
| repetition. | |
| Ensure queries are compatible with AI model''s text generation functions and | |
| are limited to 1-2 sentences. | |
| Design queries to be self-contained and standalone. | |
| Blend interrogative (e.g., "What is the significance of x?") and imperative | |
| (e.g., "Detail the process of x.") styles.' | |
| application_description: 'You are an AI assistant that generates queries around | |
| the domain of farming. | |
| You should expect profound rather than basic questions from your users. | |
| The queries should reflect a diversity of vision and economic positions and | |
| political positions. | |
| The queries may know about different methods of farming. | |
| The queries can be positioned politically, economically, socially, or practically. | |
| Also take into account the impact of diverse causes on diverse domains.' | |
| runtime_parameters_info: | |
| - name: input_batch_size | |
| optional: true | |
| description: The number of rows that will contain the batches processed by | |
| the step. | |
| - name: llm | |
| runtime_parameters_info: | |
| - name: generation_kwargs | |
| description: The kwargs to be propagated to either `generate` or `agenerate` | |
| methods within each `LLM`. | |
| keys: | |
| - name: max_new_tokens | |
| optional: true | |
| description: the maximum number of new tokens that the model will generate. Defaults | |
| to `128`. | |
| - name: frequency_penalty | |
| optional: true | |
| description: the frequency penalty to use for the generation. Defaults to | |
| `0.0`. Only applies if `use_openai_client=True`. | |
| - name: presence_penalty | |
| optional: true | |
| description: the presence penalty to use for the generation. Defaults | |
| to `0.0`. Only applies if `use_openai_client=True`. | |
| - name: repetition_penalty | |
| optional: true | |
| description: the repetition penalty to use for the generation. Defaults to | |
| `None`. Only applies if `use_openai_client=False`. | |
| - name: temperature | |
| optional: true | |
| description: the temperature to use for the generation. Defaults to `1.0`. | |
| - name: do_sample | |
| optional: true | |
| description: whether to use sampling for the generation. Defaults to `False`. Only | |
| applies if `use_openai_client=False`. | |
| - name: top_k | |
| optional: true | |
| description: the top-k value to use for the generation. Defaults to `0.8`, | |
| since neither `0.0` nor `1.0` are valid values in TGI. | |
| - name: top_p | |
| optional: true | |
| description: the top-p value to use for the generation. Defaults to `1.0`. | |
| - name: typical_p | |
| optional: true | |
| description: the typical-p value to use for the generation. Defaults to | |
| `0.5`. | |
| - name: endpoint_name | |
| optional: true | |
| description: The name of the Inference Endpoint to use for the LLM. | |
| - name: endpoint_namespace | |
| optional: true | |
| description: The namespace of the Inference Endpoint to use for the LLM. | |
| - name: base_url | |
| optional: true | |
| description: The base URL to use for the Inference Endpoints API requests. | |
| - name: api_key | |
| optional: true | |
| description: The API key to authenticate the requests to the Inference Endpoints | |
| API. | |
| - name: num_generations | |
| optional: true | |
| description: The number of generations to be produced per input. | |
| type_info: | |
| module: distilabel.steps.tasks.self_instruct | |
| name: SelfInstruct | |
| name: self-instruct | |
| - step: | |
| name: evol_instruction_complexity | |
| input_mappings: | |
| instruction: question | |
| output_mappings: {} | |
| input_batch_size: 8 | |
| llm: | |
| generation_kwargs: {} | |
| model_id: null | |
| endpoint_name: null | |
| endpoint_namespace: null | |
| base_url: https://hh1rkuymnetmkw9m.eu-west-1.aws.endpoints.huggingface.cloud | |
| tokenizer_id: null | |
| model_display_name: null | |
| use_openai_client: false | |
| type_info: | |
| module: distilabel.llms.huggingface.inference_endpoints | |
| name: InferenceEndpointsLLM | |
| group_generations: false | |
| num_generations: 1 | |
| num_evolutions: 2 | |
| store_evolutions: true | |
| generate_answers: false | |
| include_original_instruction: true | |
| mutation_templates: | |
| CONSTRAINTS: "I want you act as a Prompt Rewriter.\n\nYour objective is to\ | |
| \ rewrite a given prompt into a more complex version to make those famous\ | |
| \ AI systems (e.g., chatgpt and GPT4) a bit harder to handle.\n\nBut the\ | |
| \ rewritten prompt must be reasonable and must be understood and responded\ | |
| \ by humans.\n\nYour rewriting cannot omit the non-text parts such as the\ | |
| \ table and code in #The Given Prompt#:. Also, please do not omit the input\ | |
| \ in #The Given Prompt#.\n\nYou SHOULD complicate the given prompt using\ | |
| \ the following method: \nPlease add one more constraints/requirements into\ | |
| \ '#The Given Prompt#'\n\nYou should try your best not to make the #Rewritten\ | |
| \ Prompt# become verbose, #Rewritten Prompt# can only add 10 to 20 words\ | |
| \ into #The Given Prompt#.\n\n'#The Given Prompt#', '#Rewritten Prompt#',\ | |
| \ 'given prompt' and 'rewritten prompt' are not allowed to appear in #Rewritten\ | |
| \ Prompt#\n\n#The Given Prompt#:\n<PROMPT>\n#Rewritten Prompt#:\n\n" | |
| DEEPENING: "I want you act as a Prompt Rewriter.\n\nYour objective is to rewrite\ | |
| \ a given prompt into a more complex version to make those famous AI systems\ | |
| \ (e.g., chatgpt and GPT4) a bit harder to handle.\n\nBut the rewritten\ | |
| \ prompt must be reasonable and must be understood and responded by humans.\n\ | |
| \nYour rewriting cannot omit the non-text parts such as the table and code\ | |
| \ in #The Given Prompt#:. Also, please do not omit the input in #The Given\ | |
| \ Prompt#.\n\nYou SHOULD complicate the given prompt using the following\ | |
| \ method: \nIf #The Given Prompt# contains inquiries about certain issues,\ | |
| \ the depth and breadth of the inquiry can be increased.\n\nYou should try\ | |
| \ your best not to make the #Rewritten Prompt# become verbose, #Rewritten\ | |
| \ Prompt# can only add 10 to 20 words into #The Given Prompt#.\n\n'#The\ | |
| \ Given Prompt#', '#Rewritten Prompt#', 'given prompt' and 'rewritten prompt'\ | |
| \ are not allowed to appear in #Rewritten Prompt#\n\n#The Given Prompt#:\n\ | |
| <PROMPT>\n#Rewritten Prompt#:\n\n" | |
| CONCRETIZING: "I want you act as a Prompt Rewriter.\n\nYour objective is to\ | |
| \ rewrite a given prompt into a more complex version to make those famous\ | |
| \ AI systems (e.g., chatgpt and GPT4) a bit harder to handle.\n\nBut the\ | |
| \ rewritten prompt must be reasonable and must be understood and responded\ | |
| \ by humans.\n\nYour rewriting cannot omit the non-text parts such as the\ | |
| \ table and code in #The Given Prompt#:. Also, please do not omit the input\ | |
| \ in #The Given Prompt#.\n\nYou SHOULD complicate the given prompt using\ | |
| \ the following method: \nPlease replace general concepts with more specific\ | |
| \ concepts.\n\nYou should try your best not to make the #Rewritten Prompt#\ | |
| \ become verbose, #Rewritten Prompt# can only add 10 to 20 words into #The\ | |
| \ Given Prompt#.\n\n'#The Given Prompt#', '#Rewritten Prompt#', 'given prompt'\ | |
| \ and 'rewritten prompt' are not allowed to appear in #Rewritten Prompt#\n\ | |
| \n#The Given Prompt#:\n<PROMPT>\n#Rewritten Prompt#:\n\n" | |
| INCREASED_REASONING_STEPS: "I want you act as a Prompt Rewriter.\n\nYour objective\ | |
| \ is to rewrite a given prompt into a more complex version to make those\ | |
| \ famous AI systems (e.g., chatgpt and GPT4) a bit harder to handle.\n\n\ | |
| But the rewritten prompt must be reasonable and must be understood and responded\ | |
| \ by humans.\n\nYour rewriting cannot omit the non-text parts such as the\ | |
| \ table and code in #The Given Prompt#:. Also, please do not omit the input\ | |
| \ in #The Given Prompt#.\n\nYou SHOULD complicate the given prompt using\ | |
| \ the following method: \nIf #The Given Prompt# can be solved with just\ | |
| \ a few simple thinking processes, you can rewrite it to explicitly request\ | |
| \ multiple-step reasoning.\n\nYou should try your best not to make the #Rewritten\ | |
| \ Prompt# become verbose, #Rewritten Prompt# can only add 10 to 20 words\ | |
| \ into #The Given Prompt#.\n\n'#The Given Prompt#', '#Rewritten Prompt#',\ | |
| \ 'given prompt' and 'rewritten prompt' are not allowed to appear in #Rewritten\ | |
| \ Prompt#\n\n#The Given Prompt#:\n<PROMPT>\n#Rewritten Prompt#:\n\n" | |
| BREADTH: 'I want you act as a Prompt Creator. | |
| Your goal is to draw inspiration from the #Given Prompt# to create a brand | |
| new prompt. | |
| This new prompt should belong to the same domain as the #Given Prompt# but | |
| be even more rare. | |
| The LENGTH and complexity of the #Created Prompt# should be similar to that | |
| of the #Given Prompt#. | |
| The #Created Prompt# must be reasonable and must be understood and responded | |
| by humans. | |
| ''#Given Prompt#'', ''#Created Prompt#'', ''given prompt'' and ''created | |
| prompt'' are not allowed to appear in #Created Prompt# | |
| #Given Prompt#: | |
| <PROMPT> | |
| #Created Prompt#: | |
| ' | |
| seed: 42 | |
| runtime_parameters_info: | |
| - name: input_batch_size | |
| optional: true | |
| description: The number of rows that will contain the batches processed by | |
| the step. | |
| - name: llm | |
| runtime_parameters_info: | |
| - name: generation_kwargs | |
| description: The kwargs to be propagated to either `generate` or `agenerate` | |
| methods within each `LLM`. | |
| keys: | |
| - name: max_new_tokens | |
| optional: true | |
| description: the maximum number of new tokens that the model will generate. Defaults | |
| to `128`. | |
| - name: frequency_penalty | |
| optional: true | |
| description: the frequency penalty to use for the generation. Defaults to | |
| `0.0`. Only applies if `use_openai_client=True`. | |
| - name: presence_penalty | |
| optional: true | |
| description: the presence penalty to use for the generation. Defaults | |
| to `0.0`. Only applies if `use_openai_client=True`. | |
| - name: repetition_penalty | |
| optional: true | |
| description: the repetition penalty to use for the generation. Defaults to | |
| `None`. Only applies if `use_openai_client=False`. | |
| - name: temperature | |
| optional: true | |
| description: the temperature to use for the generation. Defaults to `1.0`. | |
| - name: do_sample | |
| optional: true | |
| description: whether to use sampling for the generation. Defaults to `False`. Only | |
| applies if `use_openai_client=False`. | |
| - name: top_k | |
| optional: true | |
| description: the top-k value to use for the generation. Defaults to `0.8`, | |
| since neither `0.0` nor `1.0` are valid values in TGI. | |
| - name: top_p | |
| optional: true | |
| description: the top-p value to use for the generation. Defaults to `1.0`. | |
| - name: typical_p | |
| optional: true | |
| description: the typical-p value to use for the generation. Defaults to | |
| `0.5`. | |
| - name: endpoint_name | |
| optional: true | |
| description: The name of the Inference Endpoint to use for the LLM. | |
| - name: endpoint_namespace | |
| optional: true | |
| description: The namespace of the Inference Endpoint to use for the LLM. | |
| - name: base_url | |
| optional: true | |
| description: The base URL to use for the Inference Endpoints API requests. | |
| - name: api_key | |
| optional: true | |
| description: The API key to authenticate the requests to the Inference Endpoints | |
| API. | |
| - name: num_generations | |
| optional: true | |
| description: The number of generations to be produced per input. | |
| - name: seed | |
| optional: true | |
| description: As `numpy` is used to randomly pick a mutation method, it is | |
| useful to set a random seed for reproducibility. | |
| type_info: | |
| module: distilabel.steps.tasks.evol_instruct.base | |
| name: EvolInstruct | |
| name: evol_instruction_complexity | |
| - step: | |
| name: expand_columns | |
| input_mappings: {} | |
| output_mappings: {} | |
| input_batch_size: 50 | |
| columns: | |
| instructions: question | |
| runtime_parameters_info: | |
| - name: input_batch_size | |
| optional: true | |
| description: The number of rows that will contain the batches processed by | |
| the step. | |
| type_info: | |
| module: distilabel.steps.expand | |
| name: ExpandColumns | |
| name: expand_columns | |
| - step: | |
| name: clean_numbered_list | |
| input_mappings: {} | |
| output_mappings: {} | |
| input_batch_size: 50 | |
| runtime_parameters_info: | |
| - name: input_batch_size | |
| optional: true | |
| description: The number of rows that will contain the batches processed by | |
| the step. | |
| type_info: | |
| module: domain | |
| name: CleanNumberedList | |
| name: clean_numbered_list | |
| - step: | |
| name: expand_columns_evolved | |
| input_mappings: {} | |
| output_mappings: {} | |
| input_batch_size: 50 | |
| columns: | |
| evolved_instructions: evolved_questions | |
| runtime_parameters_info: | |
| - name: input_batch_size | |
| optional: true | |
| description: The number of rows that will contain the batches processed by | |
| the step. | |
| type_info: | |
| module: distilabel.steps.expand | |
| name: ExpandColumns | |
| name: expand_columns_evolved | |
| - step: | |
| name: domain_expert | |
| input_mappings: | |
| instruction: evolved_questions | |
| output_mappings: | |
| generation: domain_expert_answer | |
| input_batch_size: 8 | |
| llm: | |
| generation_kwargs: {} | |
| model_id: null | |
| endpoint_name: null | |
| endpoint_namespace: null | |
| base_url: https://hh1rkuymnetmkw9m.eu-west-1.aws.endpoints.huggingface.cloud | |
| tokenizer_id: null | |
| model_display_name: null | |
| use_openai_client: false | |
| type_info: | |
| module: distilabel.llms.huggingface.inference_endpoints | |
| name: InferenceEndpointsLLM | |
| group_generations: false | |
| num_generations: 1 | |
| runtime_parameters_info: | |
| - name: input_batch_size | |
| optional: true | |
| description: The number of rows that will contain the batches processed by | |
| the step. | |
| - name: llm | |
| runtime_parameters_info: | |
| - name: generation_kwargs | |
| description: The kwargs to be propagated to either `generate` or `agenerate` | |
| methods within each `LLM`. | |
| keys: | |
| - name: max_new_tokens | |
| optional: true | |
| description: the maximum number of new tokens that the model will generate. Defaults | |
| to `128`. | |
| - name: frequency_penalty | |
| optional: true | |
| description: the frequency penalty to use for the generation. Defaults to | |
| `0.0`. Only applies if `use_openai_client=True`. | |
| - name: presence_penalty | |
| optional: true | |
| description: the presence penalty to use for the generation. Defaults | |
| to `0.0`. Only applies if `use_openai_client=True`. | |
| - name: repetition_penalty | |
| optional: true | |
| description: the repetition penalty to use for the generation. Defaults to | |
| `None`. Only applies if `use_openai_client=False`. | |
| - name: temperature | |
| optional: true | |
| description: the temperature to use for the generation. Defaults to `1.0`. | |
| - name: do_sample | |
| optional: true | |
| description: whether to use sampling for the generation. Defaults to `False`. Only | |
| applies if `use_openai_client=False`. | |
| - name: top_k | |
| optional: true | |
| description: the top-k value to use for the generation. Defaults to `0.8`, | |
| since neither `0.0` nor `1.0` are valid values in TGI. | |
| - name: top_p | |
| optional: true | |
| description: the top-p value to use for the generation. Defaults to `1.0`. | |
| - name: typical_p | |
| optional: true | |
| description: the typical-p value to use for the generation. Defaults to | |
| `0.5`. | |
| - name: endpoint_name | |
| optional: true | |
| description: The name of the Inference Endpoint to use for the LLM. | |
| - name: endpoint_namespace | |
| optional: true | |
| description: The namespace of the Inference Endpoint to use for the LLM. | |
| - name: base_url | |
| optional: true | |
| description: The base URL to use for the Inference Endpoints API requests. | |
| - name: api_key | |
| optional: true | |
| description: The API key to authenticate the requests to the Inference Endpoints | |
| API. | |
| - name: num_generations | |
| optional: true | |
| description: The number of generations to be produced per input. | |
| type_info: | |
| module: domain | |
| name: DomainExpert | |
| name: domain_expert | |
| - step: | |
| name: keep_columns | |
| input_mappings: {} | |
| output_mappings: {} | |
| input_batch_size: 50 | |
| columns: | |
| - model_name | |
| - evolved_questions | |
| - domain_expert_answer | |
| runtime_parameters_info: | |
| - name: input_batch_size | |
| optional: true | |
| description: The number of rows that will contain the batches processed by | |
| the step. | |
| type_info: | |
| module: distilabel.steps.keep | |
| name: KeepColumns | |
| name: keep_columns | |
| - step: | |
| name: text_generation_to_argilla | |
| input_mappings: | |
| instruction: evolved_questions | |
| generation: domain_expert_answer | |
| output_mappings: {} | |
| input_batch_size: 50 | |
| dataset_name: farming | |
| dataset_workspace: admin | |
| api_url: https://argilla-farming.hf.space | |
| runtime_parameters_info: | |
| - name: input_batch_size | |
| optional: true | |
| description: The number of rows that will contain the batches processed by | |
| the step. | |
| - name: dataset_name | |
| optional: false | |
| description: The name of the dataset in Argilla. | |
| - name: dataset_workspace | |
| optional: true | |
| description: The workspace where the dataset will be created in Argilla. Defaults to | |
| `None` which means it will be created in the default workspace. | |
| - name: api_url | |
| optional: true | |
| description: The base URL to use for the Argilla API requests. | |
| - name: api_key | |
| optional: true | |
| description: The API key to authenticate the requests to the Argilla API. | |
| type_info: | |
| module: distilabel.steps.argilla.text_generation | |
| name: TextGenerationToArgilla | |
| name: text_generation_to_argilla | |
| connections: | |
| - from: load_data | |
| to: | |
| - self-instruct | |
| - from: self-instruct | |
| to: | |
| - expand_columns | |
| - from: evol_instruction_complexity | |
| to: | |
| - expand_columns_evolved | |
| - from: expand_columns | |
| to: | |
| - clean_numbered_list | |
| - from: clean_numbered_list | |
| to: | |
| - evol_instruction_complexity | |
| - from: expand_columns_evolved | |
| to: | |
| - domain_expert | |
| - from: domain_expert | |
| to: | |
| - keep_columns | |
| - from: keep_columns | |
| to: | |
| - text_generation_to_argilla | |
| - from: text_generation_to_argilla | |
| to: [] | |
| type_info: | |
| module: distilabel.pipeline.local | |
| name: Pipeline | |