diff --git "a/gen.py" "b/gen.py"
deleted file mode 100644
--- "a/gen.py"
+++ /dev/null
@@ -1,3821 +0,0 @@
-import ast
-import copy
-import functools
-import inspect
-import queue
-import sys
-import os
-import time
-import traceback
-import typing
-import warnings
-from datetime import datetime
-import requests
-from requests import ConnectTimeout, JSONDecodeError
-from urllib3.exceptions import ConnectTimeoutError, MaxRetryError, ConnectionError
-from requests.exceptions import ConnectionError as ConnectionError2
-from requests.exceptions import ReadTimeout as ReadTimeout2
-
-if os.path.dirname(os.path.abspath(__file__)) not in sys.path:
-    sys.path.append(os.path.dirname(os.path.abspath(__file__)))
-
-os.environ['HF_HUB_DISABLE_TELEMETRY'] = '1'
-os.environ['BITSANDBYTES_NOWELCOME'] = '1'
-warnings.filterwarnings('ignore', category=UserWarning, message='TypedStorage is deprecated')
-
-# more is typically not useful; don't let these go beyond limits and eat up resources
-max_cores = max(1, os.cpu_count() // 2)
-if os.getenv('NUMEXPR_MAX_THREADS') is None:
-    os.environ['NUMEXPR_MAX_THREADS'] = str(min(8, max_cores))
-if os.getenv('NUMEXPR_NUM_THREADS') is None:
-    os.environ['NUMEXPR_NUM_THREADS'] = str(min(8, max_cores))
-if os.getenv('OMP_NUM_THREADS') is None:
-    os.environ['OMP_NUM_THREADS'] = str(min(8, max_cores))
-if os.getenv('OPENBLAS_NUM_THREADS') is None:
-    os.environ['OPENBLAS_NUM_THREADS'] = str(min(8, max_cores))
-if os.getenv('DUCKDB_NUM_THREADS') is None:
-    os.environ['DUCKDB_NUM_THREADS'] = str(min(4, max_cores))
-if os.getenv('RAYON_RS_NUM_CPUS') is None:
-    os.environ['RAYON_RS_NUM_CPUS'] = str(min(8, max_cores))
-if os.getenv('RAYON_NUM_THREADS') is None:
-    os.environ['RAYON_NUM_THREADS'] = str(min(8, max_cores))
-
-import numpy as np
-from evaluate_params import eval_func_param_names, no_default_param_names, input_args_list
-from enums import DocumentSubset, LangChainMode, no_lora_str, model_token_mapping, no_model_str, \
-    LangChainAction, LangChainAgent, DocumentChoice, LangChainTypes, super_source_prefix, \
-    super_source_postfix, t5_type, get_langchain_prompts, gr_to_lg, invalid_key_msg
-from loaders import get_loaders
-from utils import set_seed, clear_torch_cache, NullContext, wrapped_partial, EThread, get_githash, \
-    import_matplotlib, get_device, makedirs, get_kwargs, start_faulthandler, get_hf_server, FakeTokenizer, \
-    have_langchain, set_openai, cuda_vis_check, H2O_Fire, lg_to_gr, str_to_list, str_to_dict, get_token_count
-
-start_faulthandler()
-import_matplotlib()
-
-SEED = 1236
-set_seed(SEED)
-
-from typing import Union
-
-import torch
-from transformers import GenerationConfig, AutoModel, TextIteratorStreamer
-
-from prompter import Prompter, inv_prompt_type_to_model_lower, non_hf_types, PromptType, get_prompt, generate_prompt
-from stopping import get_stopping
-
-langchain_actions = [x.value for x in list(LangChainAction)]
-
-langchain_agents_list = [x.value for x in list(LangChainAgent)]
-
-
-def main(
-        load_8bit: bool = False,
-        load_4bit: bool = False,
-        low_bit_mode: int = 1,
-        load_half: bool = None,
-        load_gptq: str = '',
-        load_exllama: bool = False,
-        use_safetensors: bool = False,
-        revision: str = None,
-        use_gpu_id: bool = True,
-        base_model: str = '',
-        tokenizer_base_model: str = '',
-        lora_weights: str = "",
-        gpu_id: int = 0,
-        compile_model: bool = None,
-        use_cache: bool = None,
-        inference_server: str = "",
-        prompt_type: Union[int, str] = None,
-        prompt_dict: typing.Dict = None,
-        system_prompt: str = '',
-
-        # llama and gpt4all settings
-        llamacpp_dict: typing.Dict = dict(n_gpu_layers=100, use_mlock=True, n_batch=1024, n_gqa=0),
-        model_path_llama: str = 'https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML/resolve/main/llama-2-7b-chat.ggmlv3.q8_0.bin',
-        # 'llama-2-7b-chat.ggmlv3.q8_0.bin',
-        model_name_gptj: str = 'ggml-gpt4all-j-v1.3-groovy.bin',
-        model_name_gpt4all_llama: str = 'ggml-wizardLM-7B.q4_2.bin',
-        model_name_exllama_if_no_config: str = 'TheBloke/Nous-Hermes-Llama2-GPTQ',
-
-        model_lock: typing.List[typing.Dict[str, str]] = None,
-        model_lock_columns: int = None,
-        fail_if_cannot_connect: bool = False,
-
-        # input to generation
-        temperature: float = None,
-        top_p: float = None,
-        top_k: int = None,
-        num_beams: int = None,
-        repetition_penalty: float = None,
-        num_return_sequences: int = None,
-        do_sample: bool = None,
-        max_new_tokens: int = None,
-        min_new_tokens: int = None,
-        early_stopping: Union[bool, str] = None,
-        max_time: float = None,
-
-        memory_restriction_level: int = None,
-        debug: bool = False,
-        save_dir: str = None,
-        share: bool = False,
-        local_files_only: bool = False,
-        resume_download: bool = True,
-        use_auth_token: Union[str, bool] = False,
-        trust_remote_code: Union[str, bool] = True,
-        rope_scaling: dict = None,
-        max_seq_len: int = None,
-        offload_folder: str = "offline_folder",
-
-        src_lang: str = "English",
-        tgt_lang: str = "Russian",
-
-        prepare_offline_level: int = 0,
-        cli: bool = False,
-        cli_loop: bool = True,
-        gradio: bool = True,
-        gradio_offline_level: int = 0,
-        server_name: str = "0.0.0.0",
-        root_path: str = "",
-        chat: bool = True,
-        chat_conversation: typing.List[typing.Tuple[str, str]] = None,
-        text_context_list: typing.List[str] = None,
-        stream_output: bool = True,
-        async_output: bool = True,
-        num_async: int = 3,
-        show_examples: bool = None,
-        verbose: bool = False,
-        h2ocolors: bool = True,
-        dark: bool = False,  # light tends to be best
-        height: int = 600,
-        show_lora: bool = True,
-        show_llama: bool = True,
-        show_gpt4all: bool = False,
-        login_mode_if_model0: bool = False,
-        block_gradio_exit: bool = True,
-        concurrency_count: int = 1,
-        api_open: bool = False,
-        allow_api: bool = True,
-        input_lines: int = 1,
-        gradio_size: str = None,
-        show_copy_button: bool = True,
-        large_file_count_mode: bool = False,
-        pre_load_embedding_model: bool = True,
-
-        auth: Union[typing.List[typing.Tuple[str, str]], str] = None,
-        auth_filename: str = None,
-        auth_access: str = 'open',
-        auth_freeze: bool = False,
-        auth_message: str = None,
-        guest_name: str = "guest",
-        enforce_h2ogpt_api_key: bool = None,
-        h2ogpt_api_keys: Union[list, str] = [],
-        h2ogpt_key: str = None,
-
-        max_max_time=None,
-        max_max_new_tokens=None,
-
-        visible_models: list = None,
-        visible_visible_models: bool = True,
-        visible_submit_buttons: bool = True,
-        visible_side_bar: bool = True,
-        visible_doc_track: bool = True,
-        visible_chat_tab: bool = True,
-        visible_doc_selection_tab: bool = True,
-        visible_doc_view_tab: bool = True,
-        visible_chat_history_tab: bool = True,
-        visible_expert_tab: bool = True,
-        visible_models_tab: bool = True,
-        visible_system_tab: bool = True,
-        visible_tos_tab: bool = False,
-        visible_login_tab: bool = True,
-        visible_hosts_tab: bool = False,
-        chat_tables: bool = False,
-        visible_h2ogpt_header: bool = True,
-        max_raw_chunks: int = None,
-
-        sanitize_user_prompt: bool = False,
-        sanitize_bot_response: bool = False,
-
-        extra_model_options: typing.List[str] = [],
-        extra_lora_options: typing.List[str] = [],
-        extra_server_options: typing.List[str] = [],
-
-        score_model: str = 'auto',
-
-        eval_filename: str = None,
-        eval_prompts_only_num: int = 0,
-        eval_prompts_only_seed: int = 1234,
-        eval_as_output: bool = False,
-
-        langchain_mode: str = None,
-        user_path: str = None,
-        langchain_modes: list = [LangChainMode.USER_DATA.value, LangChainMode.MY_DATA.value, LangChainMode.LLM.value,
-                                 LangChainMode.DISABLED.value],
-        langchain_mode_paths: dict = {LangChainMode.USER_DATA.value: None},
-        langchain_mode_types: dict = {LangChainMode.USER_DATA.value: LangChainTypes.SHARED.value},
-        detect_user_path_changes_every_query: bool = False,
-
-        langchain_action: str = LangChainAction.QUERY.value,
-        langchain_agents: list = [],
-        force_langchain_evaluate: bool = False,
-
-        visible_langchain_actions: list = [LangChainAction.QUERY.value, LangChainAction.SUMMARIZE_MAP.value],
-        visible_langchain_agents: list = langchain_agents_list.copy(),
-
-        document_subset: str = DocumentSubset.Relevant.name,
-        document_choice: list = [DocumentChoice.ALL.value],
-
-        use_llm_if_no_docs: bool = True,
-        load_db_if_exists: bool = True,
-        keep_sources_in_context: bool = False,
-        db_type: str = 'chroma',
-        use_openai_embedding: bool = False,
-        use_openai_model: bool = False,
-        hf_embedding_model: str = None,
-        migrate_embedding_model: str = False,
-        auto_migrate_db: bool = False,
-        cut_distance: float = 1.64,
-        answer_with_sources: bool = True,
-        append_sources_to_answer: bool = True,
-        show_accordions: bool = True,
-        top_k_docs_max_show: int = 10,
-        show_link_in_sources: bool = True,
-        pre_prompt_query: str = None,
-        prompt_query: str = None,
-        pre_prompt_summary: str = None,
-        prompt_summary: str = None,
-        add_chat_history_to_context: bool = True,
-        add_search_to_context: bool = False,
-        context: str = '',
-        iinput: str = '',
-        allow_upload_to_user_data: bool = True,
-        reload_langchain_state: bool = True,
-        allow_upload_to_my_data: bool = True,
-        enable_url_upload: bool = True,
-        enable_text_upload: bool = True,
-        enable_sources_list: bool = True,
-        chunk: bool = True,
-        chunk_size: int = 512,
-        top_k_docs: int = None,
-        docs_ordering_type: str = 'reverse_ucurve_sort',
-        min_max_new_tokens=256,
-        auto_reduce_chunks: bool = True,
-        max_chunks: int = 100,
-        headsize: int = 50,
-        n_jobs: int = -1,
-
-        # urls
-        use_unstructured=True,
-        use_playwright=False,
-        use_selenium=False,
-
-        # pdfs
-        use_pymupdf='auto',
-        use_unstructured_pdf='auto',
-        use_pypdf='auto',
-        enable_pdf_ocr='auto',
-        enable_pdf_doctr='auto',
-        try_pdf_as_html='auto',
-
-        # images
-        enable_ocr=False,
-        enable_doctr=False,
-        enable_pix2struct=False,
-        enable_captions=True,
-
-        pre_load_caption_model: bool = False,
-        caption_gpu: bool = True,
-        captions_model: str = "Salesforce/blip-image-captioning-base",
-        doctr_gpu: bool = True,
-
-        # json
-        jq_schema='.[]',
-
-        max_quality: bool = False,
-
-        enable_heap_analytics: bool = True,
-        heap_app_id: str = "1680123994",
-):
-    """
-
-    :param load_8bit: load model in 8-bit using bitsandbytes
-    :param load_4bit: load model in 4-bit using bitsandbytes
-    :param low_bit_mode: 0: no quantization config 1: change compute 2: nf4 3: double quant 4: 2 and 3
-           See: https://huggingface.co/docs/transformers/main_classes/quantization
-           If using older bitsandbytes or transformers, 0 is required
-    :param load_half: load model in float16 (None means auto, which means True unless t5 based model)
-                      otherwise specify bool
-    :param load_gptq: to load model with GPTQ, put model_basename here, e.g. gptq_model-4bit--1g
-    :param load_exllama: whether to use exllama (only applicable to LLaMa1/2 models with 16-bit or GPTQ)
-    :param use_safetensors: to use safetensors version (assumes file/HF points to safe tensors version)
-    :param revision: Which HF revision to use
-    :param use_gpu_id: whether to control devices with gpu_id.  If False, then spread across GPUs
-    :param base_model: model HF-type name.  If use --base_model to preload model, cannot unload in gradio in models tab
-    :param tokenizer_base_model: tokenizer HF-type name.  Usually not required, inferred from base_model.
-    :param lora_weights: LORA weights path/HF link
-    :param gpu_id: if use_gpu_id, then use gpu_id for cuda device ID, or auto mode if gpu_id == -1
-    :param compile_model: Whether to compile the model
-    :param use_cache: Whether to use caching in model (some models fail when multiple threads use)
-    :param inference_server: Consume base_model as type of model at this address
-                             Address can be text-generation-server hosting that base_model
-                             e.g. python generate.py --inference_server="http://192.168.1.46:6112" --base_model=h2oai/h2ogpt-oasst1-512-12b
-
-                             Or Address can be "openai_chat" or "openai" for OpenAI API
-                             Or Address can be "openai_azure_chat" or "openai_azure" for Azure OpenAI API
-                             e.g. python generate.py --inference_server="openai_chat" --base_model=gpt-3.5-turbo
-                             e.g. python generate.py --inference_server="openai" --base_model=text-davinci-003
-                             e.g. python generate.py --inference_server="openai_azure_chat:<deployment_name>:<baseurl>:<api_version>:<model_version>" --base_model=gpt-3.5-turbo
-                             e.g. python generate.py --inference_server="openai_azure:<deployment_name>:<baseurl>:<api_version>:<model_version>" --base_model=text-davinci-003
-                             Optionals (Replace with None or just leave empty but keep :)
-                                 <deployment_name> of some deployment name
-                                 <baseurl>: e.g. "<endpoint>.openai.azure.com" for some <endpoint> without https://
-                                 <api_version> of some api, e.g. 2023-05-15
-                                 <model_version> e.g. 0613
-
-                             Or Address can be for vLLM:
-                              Use: "vllm:IP:port" for OpenAI-compliant vLLM endpoint
-                              Note: vllm_chat not supported by vLLM project.
-
-                             Or Address can be replicate:
-                             Use:
-                              --inference_server=replicate:<model name string> will use a Replicate server, requiring a Replicate key.
-                              e.g. <model name string> looks like "a16z-infra/llama13b-v2-chat:df7690f1994d94e96ad9d568eac121aecf50684a0b0963b25a41cc40061269e5"
-
-                             Or Address can be for AWS SageMaker:
-                              Use: "sagemaker_chat:<endpoint name>" for chat models that AWS sets up as dialog
-                              Use: "sagemaker:<endpoint name>" for foundation models that AWS only text as inputs
-
-    :param prompt_type: type of prompt, usually matched to fine-tuned model or plain for foundational model
-    :param prompt_dict: If prompt_type=custom, then expects (some) items returned by get_prompt(..., return_dict=True)
-    :param system_prompt: Universal system prompt to use if model supports, like LLaMa2, regardless of prompt_type definition.
-           Useful for langchain case to control behavior, or OpenAI and Replicate.
-           If None, 'None', or 'auto', then for LLaMa or other models that internally have system_prompt, will use default for each model
-           If '', then no system prompt (no empty template given to model either, just no system part added at all)
-           If some string not in ['None', 'auto'], then use that as system prompt
-           Default is '', no system_prompt, because often it hurts performance/accuracy
-
-    :param llamacpp_dict:
-           n_gpu_layers: for llama.cpp based models, number of GPU layers to offload (default is all by using large value)
-           use_mlock: when using `llama.cpp` based CPU models, for computers with low system RAM or slow CPUs, recommended False
-           n_batch: Can make smaller to 128 for slower low-memory CPU systems
-           n_gqa: Required to be 8 for LLaMa 70B
-           ... etc. anything that could be passed to llama.cpp or GPT4All models
-           e.g. python generate.py --base_model='llama' --prompt_type=llama2 --score_model=None --langchain_mode='UserData' --user_path=user_path --llamacpp_dict="{'n_gpu_layers':25,'n_batch':128}"
-    :param model_path_llama: model path or URL (for auto-download)
-    :param model_name_gptj: model path or URL (for auto-download)
-    :param model_name_gpt4all_llama: model path or URL (for auto-download)
-    :param model_name_exllama_if_no_config: exllama model's full path for model, tokenizer, generator for use when no HuggingFace config
-
-    :param model_lock: Lock models to specific combinations, for ease of use and extending to many models
-           Only used if gradio = True
-           List of dicts, each dict has base_model, tokenizer_base_model, lora_weights, inference_server, prompt_type, and prompt_dict
-           If all models have same prompt_type, and prompt_dict, can still specify that once in CLI outside model_lock as default for dict
-           Can specify model_lock instead of those items on CLI
-           As with CLI itself, base_model can infer prompt_type and prompt_dict if in prompter.py.
-             Also, tokenizer_base_model and lora_weights are optional.
-             Also, inference_server is optional if loading model from local system.
-           All models provided will automatically appear in compare model mode
-           Model loading-unloading and related choices will be disabled.  Model/lora/server adding will be disabled
-    :param model_lock_columns: How many columns to show if locking models (and so showing all at once)
-           If None, then defaults to up to 3
-           if -1, then all goes into 1 row
-           Maximum value is 4 due to non-dynamic gradio rendering elements
-    :param fail_if_cannot_connect: if doing model locking (e.g. with many models), fail if True.  Otherwise ignore.
-           Useful when many endpoints and want to just see what works, but still have to wait for timeout.
-
-    :param temperature: generation temperature
-    :param top_p: generation top_p
-    :param top_k: generation top_k
-    :param num_beams: generation number of beams
-    :param repetition_penalty: generation repetition penalty
-    :param num_return_sequences: generation number of sequences (1 forced for chat)
-    :param do_sample: generation sample
-    :param max_new_tokens: generation max new tokens
-    :param min_new_tokens: generation min tokens
-    :param early_stopping: generation early stopping
-    :param max_time: maximum time to allow for generation
-    :param memory_restriction_level: 0 = no restriction on tokens or model, 1 = some restrictions on tokens, 2 = HF-like restriction, 3 = very low memory case
-    :param debug: enable debug mode
-    :param save_dir: directory chat data is saved to
-    :param share: whether to share the gradio app with sharable URL
-    :param local_files_only: whether to only use local files instead of going to HF for models
-    :param resume_download: whether to resume downloads from HF for models
-    :param use_auth_token: whether to use HF auth token (requires CLI did huggingface-cli login before)
-    :param trust_remote_code: whether to trust any code needed for the HF model
-    :param rope_scaling:
-           For HF transformers model: scaling for rope-based models, e.g. --rope_scaling="{'type':'dynamic', 'factor':4}"
-           For exllama model: --rope_scaling="{'alpha_value':4}" .  This automatically scales max_seq_len for exllama
-    :param max_seq_len: Manually set maximum sequence length for the LLM
-    :param offload_folder: path for spilling model onto disk
-    :param src_lang: source languages to include if doing translation (None = all)
-    :param tgt_lang: target languages to include if doing translation (None = all)
-
-    :param prepare_offline_level:
-           Whether to just prepare for offline use, do not go into cli, eval, or gradio run modes
-           0 : no prep
-           1: prepare just h2oGPT with exact same setup as passed to CLI and ensure all artifacts for h2oGPT alone added to ~/.cache/
-           2: prepare h2oGPT + all inference servers so h2oGPT+inference servers can use the ~/.cache/
-    :param cli: whether to use CLI (non-gradio) interface.
-    :param cli_loop: whether to loop for CLI (False usually only for testing)
-    :param gradio: whether to enable gradio, or to enable benchmark mode
-    :param gradio_offline_level: if > 0, then change fonts so fully offline
-           == 1 means backend won't need internet for fonts, but front-end UI might if font not cached
-           == 2 means backend and frontend don't need internet to download any fonts.
-           Note: Some things always disabled include HF telemetry, gradio telemetry, chromadb posthog that involve uploading.
-           This option further disables google fonts for downloading, which is less intrusive than uploading,
-           but still required in air-gapped case.  The fonts don't look as nice as google fonts, but ensure full offline behavior.
-           Also set --share=False to avoid sharing a gradio live link.
-    :param server_name: IP to use.  On Linux 0.0.0.0 is a good choice so the app is exposed to outside hosts, else use 127.0.0.1 for local-only access.
-                        For Windows/Mac 0.0.0.0 or 127.0.0.1 will work, but may need to specify the actual LAN IP address for other LAN clients to see.
-    :param root_path: The root path (or "mount point") of the application,
-           if it's not served from the root ("/") of the domain. Often used when the application is behind a reverse proxy
-           that forwards requests to the application. For example, if the application is served at "https://example.com/myapp",
-           the `root_path` should be set to "/myapp".
-    :param chat: whether to enable chat mode with chat history
-    :param chat_conversation: list of tuples of (human, bot) conversation pre-appended to existing chat when using instruct/chat models
-           Requires also add_chat_history_to_context = True
-           It does *not* require chat=True, so works with nochat_api etc.
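-           An illustrative example (conversation text is a placeholder):
-             e.g. python generate.py --chat_conversation="[('Hello', 'Hi, how can I help you?')]"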
-    :param text_context_list: List of strings to add to context for non-database version of document Q/A for faster handling via API etc.
-           Forces LangChain code path and uses as many entries in list as possible given max_seq_len, with first assumed to be most relevant and to go near prompt.
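-           An illustrative API-style example (strings are placeholders): text_context_list=['First relevant passage.', 'Second relevant passage.']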
-    :param stream_output: whether to stream output
-    :param async_output: Whether to do asyncio handling
-           For summarization
-           Applicable to HF TGI server
-           Only if stream_output=False in CLI, UI, or API
-    :param num_async: Number of simultaneously allowed asyncio calls to make for async_output
-           Too many will overload inference server, too few will be too slow
-    :param show_examples: whether to show clickable examples in gradio
-    :param verbose: whether to show verbose prints
-    :param h2ocolors: whether to use H2O.ai theme
-    :param dark: whether to use dark mode for UI by default (still controlled in UI)
-    :param height: height of chat window
-    :param show_lora: whether to show LORA options in UI (expert so can be hard to understand)
-    :param show_llama: whether to show LLaMa.cpp/GPT4All options in UI (only likely useful if have weak GPUs)
-    :param show_gpt4all: whether to show GPT4All models in UI (not often useful, llama.cpp models best)
-    :param login_mode_if_model0: set to True to load --base_model after client logs in, to be able to free GPU memory when model is swapped
-    :param block_gradio_exit: whether to block gradio exit (used for testing)
-    :param concurrency_count: gradio concurrency count (1 is optimal for LLMs)
-    :param api_open: If False, don't let API calls skip gradio queue
-    :param allow_api: whether to allow API calls at all to gradio server
-    :param input_lines: how many input lines to show for chat box (>1 forces shift-enter for submit, else enter is submit)
-    :param gradio_size: Overall size of text and spaces: "xsmall", "small", "medium", "large".
-           Small useful for many chatbots in model_lock mode
-    :param show_copy_button: Whether to show copy button for chatbots
-    :param large_file_count_mode: Whether to force manual update to UI of drop-downs, good idea if millions of chunks or documents
-    :param pre_load_embedding_model: Whether to preload embedding model for shared use across DBs and users (multi-thread safe only)
-
-    :param auth: gradio auth for launcher in form [(user1, pass1), (user2, pass2), ...]
-                 e.g. --auth=[('jon','password')] with no spaces
-                 e.g. --auth="[('jon', 'password)())(')]" so any special characters can be used
-                 e.g. --auth=auth.json to specify persisted state file with name auth.json (auth_filename then not required)
-                 e.g. --auth='' will use default auth.json as file name for persisted state file (auth_filename then not required)
-                 e.g. --auth=None will use no auth, but still keep track of auth state, just not from logins
-    :param auth_filename:
-         Set auth filename, used only if --auth= was passed a list of user/passwords
-    :param auth_access:
-         'open': Allow new users to be added
-         'closed': Stick to existing users
-    :param auth_freeze: whether to freeze authentication based upon current file, no longer updating the file
-    :param auth_message: Message to show if having users login, fixed if passed, else dynamic internally
-    :param guest_name: guest name if using auth and have open access.
-           If '', then no guest allowed even if open access, then all databases for each user always persisted
-    :param enforce_h2ogpt_api_key: Whether to enforce h2oGPT token usage for API
-    :param h2ogpt_api_keys: list of tokens allowed for API access or file accessed on demand for json of list of keys
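-           Illustrative examples (keys and file name are placeholders): --h2ogpt_api_keys="['<api_key_1>', '<api_key_2>']" or --h2ogpt_api_keys=api_keys.json where the file contains a JSON list of keys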
-    :param h2ogpt_key: E.g. can be set when accessing gradio h2oGPT server from local gradio h2oGPT server that acts as client to that inference server
-
-    :param max_max_time: Maximum max_time for gradio slider
-    :param max_max_new_tokens: Maximum max_new_tokens for gradio slider
-    :param min_max_new_tokens: Minimum of max_new_tokens, when auto-scaling down to handle more docs/prompt, but still let generation have some tokens
-
-    :param visible_models: Which models in model_lock list to show by default
-           Takes integers of position in model_lock (model_states) list or strings of base_model names
-           Ignored if model_lock not used
-           For nochat API, this is single item within a list for model by name or by index in model_lock
-                                If None, then just use first model in model_lock list
-                                If model_lock not set, use model selected by CLI --base_model etc.
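-           Illustrative examples (model name is a placeholder reused from this doc): --visible_models="[0]" to show only the first locked model, or --visible_models="['h2oai/h2ogpt-oasst1-512-12b']" to select by base_model name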
-
-    :param visible_visible_models: Whether visible models drop-down is visible in UI
-    :param visible_submit_buttons: whether submit buttons are visible when UI first comes up
-    :param visible_side_bar: whether left side bar is visible when UI first comes up
-    :param visible_doc_track: whether left side bar's document tracking is visible when UI first comes up
-    :param visible_chat_tab: "" for chat tab
-    :param visible_doc_selection_tab:  "" for doc selection tab
-    :param visible_doc_view_tab: "" for doc view tab
-    :param visible_chat_history_tab: "" for chat history tab
-    :param visible_expert_tab: "" for expert tab
-    :param visible_models_tab: "" for models tab
-    :param visible_system_tab: "" for system tab
-    :param visible_tos_tab: "" for ToS tab
-    :param visible_login_tab: "" for Login tab
-    :param visible_hosts_tab: "" for hosts tab
-    :param chat_tables: Just show Chat as block without tab (useful if want only chat view)
-    :param visible_h2ogpt_header: Whether github stars, URL, logo, and QR code are visible
-    :param max_raw_chunks: Maximum number of chunks to show in UI when asking for raw DB text from documents/collection
-
-    :param sanitize_user_prompt: whether to remove profanity from user input (slows down input processing)
-      Requires optional packages:
-      pip install alt-profanity-check==1.2.2 better-profanity==0.7.0
-    :param sanitize_bot_response: whether to remove profanity and repeat lines from bot output (about 2x slower generation for long streaming cases due to better_profanity being slow)
-    :param extra_model_options: extra models to show in list in gradio
-    :param extra_lora_options: extra LORA to show in list in gradio
-    :param extra_server_options: extra servers to show in list in gradio
-    :param score_model: which model to score responses
-           None: no response scoring
-           'auto': auto mode, '' (no model) for CPU or 1 GPU, 'OpenAssistant/reward-model-deberta-v3-large-v2' for >=2 GPUs,
-            because on CPU takes too much compute just for scoring response
-    :param eval_filename: json file to use for evaluation; if None, the ShareGPT data is used
-    :param eval_prompts_only_num: for no gradio benchmark, if using eval_filename prompts for eval instead of examples
-    :param eval_prompts_only_seed: for no gradio benchmark, seed for eval_filename sampling
-    :param eval_as_output: for no gradio benchmark, whether to test eval_filename output itself
-
-    :param langchain_mode: Data source to include.  Choose "UserData" to only consume files from make_db.py.
-           None: auto mode, check if langchain package exists, at least do LLM if so, else Disabled
-           If not passed, then chosen to be the first of langchain_modes; if langchain_modes is also empty, langchain_mode is set to Disabled
-           WARNING: wiki_full requires extra data processing via read_wiki_full.py and requires really good workstation to generate db, unless already present.
-    :param user_path: user path to glob from to generate db for vector search, for 'UserData' langchain mode.
-           If already have db, any new/changed files are added automatically if path set, does not have to be same path used for prior db sources
-    :param langchain_modes: dbs to generate at launch to be ready for LLM
-           Apart from additional user-defined collections, can include ['wiki', 'wiki_full', 'UserData', 'MyData', 'github h2oGPT', 'DriverlessAI docs']
-             But wiki_full is expensive and requires preparation
-           To allow personal space only live in session, add 'MyData' to list
-           Default: If only want to consume local files, e.g. prepared by make_db.py, only include ['UserData']
-           If have own user modes, need to add these here or add in UI.
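-           An illustrative example using values from the defaults above: python generate.py --langchain_modes="['UserData', 'MyData', 'LLM']"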
-    :param langchain_mode_paths: dict of langchain_mode keys and disk path values to use for source of documents
-           E.g. "{'UserData2': 'userpath2'}"
-           A disk path can be None, e.g. --langchain_mode_paths="{'UserData2': None}" even if existing DB, to avoid new documents being added from that path; source links that are on disk still work.
-           If `--user_path` was passed, that path is used for 'UserData' instead of the value in this dict
-    :param langchain_mode_types: dict of langchain_mode keys and database types
-           E.g. python generate.py --base_model=llama --langchain_modes=['TestData'] --langchain_mode_types="{'TestData':'shared'}"
-           The type is inferred if the directory already exists, in which case this need not be passed
-    :param detect_user_path_changes_every_query: whether to detect if any files changed or added every similarity search (by file hashes).
-           Expensive for large number of files, so not done by default.  By default only detect changes during db loading.
-
-    :param langchain_action: Mode of LangChain operations on documents.
-            Query: Make query of document(s)
-            Summarize or Summarize_map_reduce: Summarize document(s) via map_reduce
-            Summarize_all: Summarize document(s) using entire document at once
-            Summarize_refine: Summarize document(s) using entire document, and try to refine before returning summary
-    :param langchain_agents: Which agents to use
-            'search': Use Web Search as context for LLM response, e.g. SERP if have SERPAPI_API_KEY in env
-    :param force_langchain_evaluate: Whether to force langchain LLM use even if not doing langchain, mostly for testing.
-
-    :param visible_langchain_actions: Which actions to allow
-    :param visible_langchain_agents: Which agents to allow
-
-    :param document_subset: Default document choice when taking subset of collection
-    :param document_choice: Chosen document(s) by internal name, 'All' means use all docs
-
-    :param use_llm_if_no_docs: Whether to use LLM even if no documents, when langchain_mode=UserData or MyData or custom
-    :param load_db_if_exists: Whether to load chroma db if exists or re-generate db
-    :param keep_sources_in_context: Whether to keep url sources in context, not helpful usually
-    :param db_type: 'faiss' for in-memory
-                    'chroma' (for chroma >= 0.4)
-                    'chroma_old' (for chroma < 0.4) -- recommended for large collections
-                    'weaviate' for persisted on disk
-    :param use_openai_embedding: Whether to use OpenAI embeddings for vector db
-    :param use_openai_model: Whether to use OpenAI model for use with vector db
-    :param hf_embedding_model: Which HF embedding model to use for vector db
-           Default is instructor-large with 768 parameters per embedding if have GPUs, else all-MiniLM-L6-v2 if no GPUs
-           Can also choose simpler model with 384 parameters per embedding: "sentence-transformers/all-MiniLM-L6-v2"
-           Can also choose even better embedding with 1024 parameters: 'hkunlp/instructor-xl'
-           We support automatically changing of embeddings for chroma, with a backup of db made if this is done
-    :param migrate_embedding_model: whether to use hf_embedding_model embedding even if database already had an embedding set.
-           used to migrate all embeddings to a new one, but will take time to re-embed.
-           Default (False) is to use the prior embedding for existing databases, and only use hf_embedding_model for new databases
-           If had old database without embedding saved, then hf_embedding_model is also used.
-    :param auto_migrate_db: whether to automatically migrate any chroma<0.4 database from duckdb -> sqlite version
-    :param cut_distance: Distance to cut off references with larger distances when showing references.
-           1.64 is good to avoid dropping references for all-MiniLM-L6-v2, but instructor-large will always show excessive references.
-           For all-MiniLM-L6-v2, a value of 1.5 can push out even more references, or a large value of 100 can avoid any loss of references.
-    :param answer_with_sources: Whether to determine (and return) sources
-    :param append_sources_to_answer: Whether to place source information in chat response (ignored by LLM).  Always disabled for API.
-    :param show_accordions: whether to show accordion for document references in chatbot UI
-    :param top_k_docs_max_show: Max number of docs to show in UI for sources
-           If web search is enabled, then this is modified to be max(top_k_docs_max_show, number of links used in search)
-    :param show_link_in_sources: Whether to show URL link to source document in references
-    :param pre_prompt_query: prompt before documents to query, if None then use internal defaults
-    :param prompt_query: prompt after documents to query, if None then use internal defaults
-    :param pre_prompt_summary: prompt before documents to summarize, if None then use internal defaults
-    :param prompt_summary: prompt after documents to summarize, if None then use internal defaults
-           For summarize, it is normal to have an empty query (nothing entered in "Ask anything" in UI, or an empty string in API)
-           If pass query, template is "Focusing on %s, %s" % (query, prompt_summary)
-           If pass query and iinput, template is "Focusing on %s, %s, %s" % (query, iinput, prompt_summary)
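-           For example (illustrative values), with query="climate" and prompt_summary="Write a short summary.", the effective prompt becomes "Focusing on climate, Write a short summary."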
-    :param add_chat_history_to_context: Include chat context when performing action
-           Not supported yet for openai_chat when using document collection instead of LLM
-           Also not supported when using CLI mode
-    :param add_search_to_context: Include web search in context as augmented prompt
-    :param context: Default context to use (for system pre-context in gradio UI)
-           context comes before chat_conversation and any document Q/A from text_context_list
-    :param iinput: Default input for instruction-based prompts
-    :param allow_upload_to_user_data: Whether to allow file uploads to update shared vector db (UserData or custom user dbs)
-           Ensure user_path is passed so the uploaded files can be moved to this location for linking.
-    :param reload_langchain_state: Whether to reload langchain_modes.pkl file that contains any new user collections.
-    :param allow_upload_to_my_data: Whether to allow file uploads to update personal vector db
-    :param enable_url_upload: Whether to allow upload from URL
-    :param enable_text_upload: Whether to allow upload of text
-    :param enable_sources_list: Whether to allow listing (or download, for non-shared db) of the sources for the chosen db
-    :param chunk: Whether to chunk data (True unless know data is already optimally chunked)
-    :param chunk_size: Size of chunks, with typically top-4 passed to LLM, so needs to be in context length
-    :param top_k_docs: For langchain_action query: number of chunks to give LLM
-                       -1 : auto-fills context up to max_seq_len
-                       For langchain_action summarize: number of document parts, like pages for PDF.
-                       There's no such thing as chunks for summarization.
-                       -1 : auto-fills context up to max_seq_len
-    :param docs_ordering_type:
-        Type of ordering of docs.
-        'best_first': Order by score so the worst match is nearest the prompt
-        'best_near_prompt' or 'reverse_sort' : reverse docs order so most relevant is closest to question.
-           Best choice for a sufficiently smart model; since truncation drops the oldest context first, it is best in that case too.
-           But smaller 6_9 models fail to use newest context and can get stuck on old information.
-        '' or None (i.e. default) or 'reverse_ucurve_sort' : Sort so most relevant is either near start or near end
-           Best to avoid "lost in middle" as well as avoid hallucinating off starting content that LLM focuses on alot.
-    :param auto_reduce_chunks: Whether to automatically reduce top_k_docs to fit context given prompt
-    :param max_chunks: If top_k_docs=-1, maximum number of chunks to allow
-    :param headsize: Maximum number of characters for head of document for UI to show
-    :param n_jobs: Number of processors to use when consuming documents (-1 = all, is default)
-
-    :param use_unstructured: Enable unstructured URL loader
-    :param use_playwright: Enable PlayWright URL loader
-    :param use_selenium: Enable Selenium URL loader
-
-    :param use_pymupdf: enable PyMuPDF; 'auto' means use it first, and use the others (if they are 'auto') when it yields no result
-    :param use_unstructured_pdf: enable Unstructured PDF loader; 'auto' means use it if pymupdf fails to get a doc result
-    :param use_pypdf: enable PyPDF loader; 'auto' means use it if unstructured fails to get a doc result
-    :param enable_pdf_ocr: 'auto' means only use OCR if normal text extraction fails.  Useful for pure image-based PDFs with text.
-                            If enable_pdf_doctr == 'on' then OCR is not done.
-                            'on' means always do OCR as additional parsing of same documents
-                            'off' means don't do OCR (e.g. because it's slow, even though 'auto' would only trigger it if nothing else worked)
-    :param enable_pdf_doctr: Whether to support doctr on pdfs; 'auto' means use it if no doc result was obtained so far
-    :param try_pdf_as_html: Try "PDF" as if HTML file, in case web link has .pdf extension but really is just HTML
-
-    :param enable_ocr: Whether to support OCR on images
-    :param enable_doctr: Whether to support doctr on images (using OCR better than enable_ocr=True)
-    :param enable_pix2struct: Whether to support pix2struct on images for captions
-    :param enable_captions: Whether to support captions using BLIP for image files as documents,
-           then preloads that model if pre_load_caption_model=True
-
-    :param pre_load_caption_model: Whether to preload caption model, or load after forking parallel doc loader
-           parallel loading disabled if preload and have images, to prevent deadlocking on cuda context
-           Recommended if using larger caption model
-    :param captions_model: Which model to use for captions.
-           captions_model: str = "Salesforce/blip-image-captioning-base",  # continue capable
-           captions_model: str = "Salesforce/blip2-flan-t5-xl",   # question/answer capable, 16GB state
-           captions_model: str = "Salesforce/blip2-flan-t5-xxl",  # question/answer capable, 60GB state
-           Note: opt-based blip2 are not permissive license due to opt and Meta license restrictions
-           Disabled for CPU since BLIP requires CUDA
-    :param caption_gpu: If support caption, then use GPU if exists
-
-    :param doctr_gpu: If support doctr, then use GPU if exists
-
-    :param jq_schema: control json loader
-           By default '.[]' ingests everything in brute-force way, but better to match your schema
-           See: https://python.langchain.com/docs/modules/data_connection/document_loaders/json#using-jsonloader
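-           An illustrative example for a hypothetical JSON layout with records under a top-level "messages" key: --jq_schema='.messages[]'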
-
-    :param max_quality: Choose maximum quality ingestion with all available parsers
-           Pro: Catches document when some default parsers would fail
-           Pro: Enables DocTR that has much better OCR than Tesseract
-           Con: Fills DB with results from all parsers, so similarity search gives redundant results
-
-    :param enable_heap_analytics: Toggle telemetry.
-    :param heap_app_id: App ID for Heap, change to your ID.
-    :return:
-    """
-    if base_model is None:
-        base_model = ''
-    if tokenizer_base_model is None:
-        tokenizer_base_model = ''
-    if lora_weights is None:
-        lora_weights = ''
-    if inference_server is None:
-        inference_server = ''
-
-    # listen to env if set
-    model_lock = os.getenv('model_lock', str(model_lock))
-    model_lock = ast.literal_eval(model_lock)
-
-    chat_conversation = str_to_list(chat_conversation)
-    text_context_list = str_to_list(text_context_list)
-
-    llamacpp_dict = str_to_dict(llamacpp_dict)
-    # add others to single dict
-    llamacpp_dict['model_path_llama'] = model_path_llama
-    llamacpp_dict['model_name_gptj'] = model_name_gptj
-    llamacpp_dict['model_name_gpt4all_llama'] = model_name_gpt4all_llama
-    llamacpp_dict['model_name_exllama_if_no_config'] = model_name_exllama_if_no_config
-    # if user overrides but doesn't set these:
-    if 'n_batch' not in llamacpp_dict:
-        llamacpp_dict['n_batch'] = 128
-    if 'n_gpu_layers' not in llamacpp_dict:
-        llamacpp_dict['n_gpu_layers'] = 100
-    if 'n_gqa' not in llamacpp_dict:
-        llamacpp_dict['n_gqa'] = 0
-
-    if os.environ.get('SERPAPI_API_KEY') is None and LangChainAgent.SEARCH.value in visible_langchain_agents:
-        visible_langchain_agents.remove(LangChainAgent.SEARCH.value)
-
-    if model_lock:
-        assert gradio, "model_lock only supported for gradio=True"
-        assert not cli, "model_lock only supported for cli=False"
-        assert not (not cli and not gradio), "model_lock not supported for eval mode (cli=False and gradio=False)"
-        assert not base_model, "Don't specify model_lock and base_model"
-        assert not tokenizer_base_model, "Don't specify model_lock and tokenizer_base_model"
-        assert not lora_weights, "Don't specify model_lock and lora_weights"
-        assert not inference_server, "Don't specify model_lock and inference_server"
-        # assert not prompt_type, "Don't specify model_lock and prompt_type"
-        # assert not prompt_dict, "Don't specify model_lock and prompt_dict"
-
-    n_jobs = int(os.getenv('n_jobs', str(n_jobs)))
-    is_hf = bool(int(os.getenv("HUGGINGFACE_SPACES", '0')))
-    is_gpth2oai = bool(int(os.getenv("GPT_H2O_AI", '0')))
-    is_public = is_hf or is_gpth2oai  # multi-user case with fixed model and disclaimer
-    if is_public:
-        visible_tos_tab = visible_hosts_tab = True
-        if enforce_h2ogpt_api_key is None:
-            enforce_h2ogpt_api_key = True
-    else:
-        if enforce_h2ogpt_api_key is None:
-            enforce_h2ogpt_api_key = False
-    if isinstance(h2ogpt_api_keys, str) and not os.path.isfile(h2ogpt_api_keys):
-        h2ogpt_api_keys = str_to_list(h2ogpt_api_keys)
-    if memory_restriction_level is None:
-        memory_restriction_level = 2 if is_hf else 0  # 2 assumes run on 24GB consumer GPU
-    else:
-        assert 0 <= memory_restriction_level <= 3, "Bad memory_restriction_level=%s" % memory_restriction_level
-    if n_jobs == -1:
-        # if -1, assume hyperthreaded cores and use only physical cores; user must pass n_jobs explicitly for non-standard core setups
-        n_jobs = max(1, os.cpu_count() // 2)
-    if is_public and os.getenv('n_jobs') is None:
-        n_jobs = min(n_jobs, max(1, min(os.cpu_count() // 2, 8)))
-    admin_pass = os.getenv("ADMIN_PASS")
-    # will sometimes appear in UI or sometimes actual generation, but maybe better than empty result
-    # but becomes unrecoverable sometimes if raise, so just be silent for now
-    raise_generate_gpu_exceptions = True
-
-    rope_scaling = str_to_dict(rope_scaling)
-
-    if isinstance(auth, str):
-        if auth.strip().startswith('['):
-            auth = str_to_list(auth)
-    if isinstance(auth, str) and auth:
-        auth_filename = auth
-    if not auth_filename:
-        auth_filename = "auth.json"
-    assert isinstance(auth, (str, list, tuple, type(None))), "Unknown type %s for auth=%s" % (type(auth), auth)
-
-    # allow set token directly
-    use_auth_token = os.environ.get("HUGGING_FACE_HUB_TOKEN", use_auth_token)
-    allow_upload_to_user_data = bool(
-        int(os.environ.get("allow_upload_to_user_data", str(int(allow_upload_to_user_data)))))
-    allow_upload_to_my_data = bool(int(os.environ.get("allow_upload_to_my_data", str(int(allow_upload_to_my_data)))))
-    height = int(os.environ.get("HEIGHT", height))
-    h2ocolors = bool(int(os.getenv('h2ocolors', h2ocolors)))
-
-    # allow enabling langchain via ENV
-    # FIRST PLACE where LangChain referenced, but no imports related to it
-    langchain_modes = ast.literal_eval(os.environ.get("langchain_modes", str(langchain_modes)))
-    if not isinstance(langchain_modes, list):
-        langchain_modes = []
-    # always allow DISABLED
-    if LangChainMode.DISABLED.value not in langchain_modes:
-        langchain_modes.append(LangChainMode.DISABLED.value)
-
-    # update
-    langchain_mode_paths = str_to_dict(langchain_mode_paths)
-    langchain_mode_types = str_to_dict(langchain_mode_types)
-    for lmode in [LangChainMode.GITHUB_H2OGPT.value,
-                  LangChainMode.H2O_DAI_DOCS.value,
-                  LangChainMode.WIKI.value,
-                  LangChainMode.WIKI_FULL.value,
-                  ]:
-        if lmode not in langchain_mode_types:
-            langchain_mode_types[lmode] = 'shared'
-        if lmode not in langchain_mode_paths:
-            langchain_mode_paths[lmode] = ''
-    if user_path:
-        user_path = makedirs(user_path, use_base=True)
-        langchain_mode_paths['UserData'] = user_path
-        langchain_mode_types['UserData'] = LangChainTypes.SHARED.value
-
-    if is_public:
-        allow_upload_to_user_data = False
-        if LangChainMode.USER_DATA.value in langchain_modes:
-            langchain_modes.remove(LangChainMode.USER_DATA.value)
-    if max_raw_chunks is None:
-        max_raw_chunks = 30 if is_public else 1000000
-
-    # in-place, for non-scratch dbs
-    if allow_upload_to_user_data:
-        # always listen to CLI-passed user_path if passed
-        if user_path:
-            langchain_mode_paths['UserData'] = user_path
-
-    assert langchain_action in langchain_actions, "Invalid langchain_action %s not in %s" % (
-        langchain_action, langchain_actions)
-    assert len(
-        set(langchain_agents).difference(langchain_agents_list)) == 0, "Invalid langchain_agents %s" % langchain_agents
-
-    # auto-set langchain_mode
-    langchain_mode = os.environ.get("LANGCHAIN_MODE", langchain_mode)
-    if have_langchain and langchain_mode is None:
-        # start in chat mode, in case just want to chat and don't want to get "No documents to query" by default.
-        if LangChainMode.LLM.value in langchain_modes:
-            langchain_mode = LangChainMode.LLM.value
-        elif len(langchain_modes) >= 1:
-            # infer even if don't pass which langchain_mode, just langchain_modes.
-            langchain_mode = langchain_modes[0]
-        if allow_upload_to_user_data and not is_public and langchain_mode_paths['UserData']:
-            if verbose:
-                print("Auto set langchain_mode=%s.  Could use UserData instead." % langchain_mode, flush=True)
-        elif allow_upload_to_my_data:
-            if verbose:
-                print("Auto set langchain_mode=%s.  Could use MyData instead."
-                      "  To allow UserData to pull files from disk,"
-                      " set user_path or langchain_mode_paths, and ensure allow_upload_to_user_data=True" % langchain_mode,
-                      flush=True)
-        else:
-            raise RuntimeError("Please pass --langchain_mode=<chosen mode> out of %s" % langchain_modes)
-    if not have_langchain and langchain_mode not in [None, LangChainMode.DISABLED.value, LangChainMode.LLM.value]:
-        raise RuntimeError("Asked for LangChain mode but langchain python package cannot be found.")
-    if langchain_mode is None:
-        # if not set yet, disable
-        langchain_mode = LangChainMode.DISABLED.value
-        print("Auto set langchain_mode=%s  Have langchain package: %s" % (langchain_mode, have_langchain), flush=True)
-    # go ahead and add
-    if langchain_mode not in langchain_modes:
-        langchain_modes.append(langchain_mode)
-
-    if is_public:
-        allow_upload_to_user_data = False
-        input_lines = 1  # ensure set, for ease of use
-        temperature = 0.2 if temperature is None else temperature
-        top_p = 0.85 if top_p is None else top_p
-        top_k = 70 if top_k is None else top_k
-        if is_hf:
-            do_sample = True if do_sample is None else do_sample
-            top_k_docs = 3 if top_k_docs is None else top_k_docs
-        else:
-            # by default don't sample, too chatty
-            do_sample = False if do_sample is None else do_sample
-            top_k_docs = 4 if top_k_docs is None else top_k_docs
-
-        if memory_restriction_level == 2:
-            if not base_model and not inference_server and not model_lock:
-                base_model = 'h2oai/h2ogpt-oasst1-512-12b'
-                # don't set load_8bit if passed base_model, doesn't always work so can't just override
-                load_8bit = True
-                load_4bit = False  # FIXME - consider using 4-bit instead of 8-bit
-        elif not inference_server:
-            top_k_docs = 10 if top_k_docs is None else top_k_docs
-    if memory_restriction_level >= 2:
-        load_8bit = True
-        load_4bit = False  # FIXME - consider using 4-bit instead of 8-bit
-        if hf_embedding_model is None:
-            hf_embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
-        top_k_docs = 3 if top_k_docs is None else top_k_docs
-    if top_k_docs is None:
-        top_k_docs = 3
-    if is_public:
-        if not max_time:
-            max_time = 60 * 2
-        if not max_max_time:
-            max_max_time = max_time
-        if not max_new_tokens:
-            max_new_tokens = 256
-        if not max_max_new_tokens:
-            max_max_new_tokens = 512
-    else:
-        if not max_max_time:
-            max_max_time = 60 * 20
-        if not max_max_new_tokens:
-            max_max_new_tokens = 1024
-    if is_hf:
-        # must override share if in spaces
-        share = False
-        if not max_time:
-            max_time = 60 * 1
-        if not max_max_time:
-            max_max_time = max_time
-        # HF accounted for later in get_max_max_new_tokens()
-    save_dir = os.getenv('SAVE_DIR', save_dir)
-    save_dir = makedirs(save_dir, exist_ok=True, tmp_ok=True, use_base=True)
-    score_model = os.getenv('SCORE_MODEL', score_model)
-    if str(score_model) == 'None':
-        score_model = ''
-    concurrency_count = int(os.getenv('CONCURRENCY_COUNT', concurrency_count))
-    api_open = bool(int(os.getenv('API_OPEN', str(int(api_open)))))
-    allow_api = bool(int(os.getenv('ALLOW_API', str(int(allow_api)))))
-
-    n_gpus = torch.cuda.device_count() if torch.cuda.is_available() else 0
-    n_gpus, gpu_ids = cuda_vis_check(n_gpus)
-
-    if load_half is None and t5_type(base_model):
-        load_half = False
-        print("load_half=%s auto-set for %s to avoid bad generation" % (load_half, base_model), flush=True)
-
-    if n_gpus == 0 or get_device() == "mps":
-        # No CUDA GPUs usable
-
-        if get_device() != "mps":
-            print("No GPUs detected", flush=True)
-
-        enable_captions = False
-        gpu_id = None
-        load_8bit = False
-        load_4bit = False
-        low_bit_mode = 1
-        if load_half is None:
-            # would not work if user explicitly specified True, but respect explicit settings; only override None
-            load_half = False
-        load_gptq = ''
-        load_exllama = False
-        use_gpu_id = False
-        if get_device() == "cuda":
-            torch.backends.cudnn.benchmark = True
-            torch.backends.cudnn.enabled = False
-            torch.set_default_dtype(torch.float32)
-        if is_public and not inference_server and not model_lock:
-            # 12B uses ~94GB
-            # 6.9B uses ~47GB
-            base_model = 'h2oai/h2ogpt-oig-oasst1-512-6_9b' if not base_model else base_model
-        if hf_embedding_model is None:
-            # if no GPUs, use simpler embedding model to avoid cost in time
-            hf_embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
-        if score_model == 'auto':
-            score_model = ''
-    else:
-        if load_half is None:
-            load_half = True
-        # CUDA GPUs visible
-        if score_model == 'auto':
-            if n_gpus >= 2:
-                # will by default place scoring model on last GPU
-                score_model = 'OpenAssistant/reward-model-deberta-v3-large-v2'
-            else:
-                score_model = ''
-        if hf_embedding_model is None:
-            # if still None, then set default
-            hf_embedding_model = 'hkunlp/instructor-large'
-
-    # get defaults
-    if base_model:
-        model_lower = base_model.lower()
-    elif model_lock:
-        # have 0th model be thought of as normal model
-        assert len(model_lock) > 0 and model_lock[0]['base_model']
-        model_lower = model_lock[0]['base_model'].lower()
-    else:
-        model_lower = ''
-    if not gradio:
-        # force streaming off, else output is not the single response we want to look at
-        stream_output = False
-        # else prompt removal can mess up output
-        chat = False
-    # hard-coded defaults
-    first_para = False
-    text_limit = None
-
-    if compile_model is None:
-        # to avoid noisy CLI
-        compile_model = not cli
-
-    if offload_folder:
-        offload_folder = makedirs(offload_folder, exist_ok=True, tmp_ok=True, use_base=True)
-
-    # defaults
-    caption_loader = None
-    doctr_loader = None
-    pix2struct_loader = None
-
-    image_loaders_options0, image_loaders_options, \
-        pdf_loaders_options0, pdf_loaders_options, \
-        url_loaders_options0, url_loaders_options = lg_to_gr(**locals())
-    jq_schema0 = jq_schema
-    # transcribe
-    image_loaders = image_loaders_options0
-    pdf_loaders = pdf_loaders_options0
-    url_loaders = url_loaders_options0
-
-    placeholder_instruction, placeholder_input, \
-        stream_output, show_examples, \
-        prompt_type, prompt_dict, \
-        temperature, top_p, top_k, num_beams, \
-        max_new_tokens, min_new_tokens, early_stopping, max_time, \
-        repetition_penalty, num_return_sequences, \
-        do_sample, \
-        src_lang, tgt_lang, \
-        examples, \
-        task_info = \
-        get_generate_params(model_lower,
-                            chat,
-                            stream_output, show_examples,
-                            prompt_type, prompt_dict,
-                            system_prompt,
-                            pre_prompt_query, prompt_query,
-                            pre_prompt_summary, prompt_summary,
-                            temperature, top_p, top_k, num_beams,
-                            max_new_tokens, min_new_tokens, early_stopping, max_time,
-                            repetition_penalty, num_return_sequences,
-                            do_sample,
-                            top_k_docs,
-                            chunk,
-                            chunk_size,
-                            image_loaders,
-                            pdf_loaders,
-                            url_loaders,
-                            jq_schema,
-                            docs_ordering_type,
-                            min_max_new_tokens,
-                            verbose,
-                            )
-
-    git_hash = get_githash() if is_public or os.getenv('GET_GITHASH') else "GET_GITHASH"
-    locals_dict = locals()
-    locals_print = '\n'.join(['%s: %s' % (k, v) for k, v in locals_dict.items()])
-    if verbose:
-        print(f"Generating model with params:\n{locals_print}", flush=True)
-        print("Command: %s\nHash: %s" % (str(' '.join(sys.argv)), git_hash), flush=True)
-
-    if langchain_mode != "Disabled":
-        # SECOND PLACE where LangChain referenced, but all imports are kept local so not required
-        from gpt_langchain import prep_langchain, get_some_dbs_from_hf, get_persist_directory
-        if is_hf:
-            get_some_dbs_from_hf()
-        dbs = {}
-        for langchain_mode1 in langchain_modes:
-            langchain_type = langchain_mode_types.get(langchain_mode1, LangChainTypes.EITHER.value)
-            if langchain_type == LangChainTypes.PERSONAL.value:
-                # shouldn't prepare per-user databases here
-                continue
-            persist_directory1, langchain_type = get_persist_directory(langchain_mode1, langchain_type=langchain_type)
-            langchain_mode_types[langchain_mode1] = langchain_type
-            if langchain_type == LangChainTypes.PERSONAL.value:
-                # shouldn't prepare per-user databases here
-                continue
-            try:
-                db = prep_langchain(persist_directory1,
-                                    load_db_if_exists,
-                                    db_type, use_openai_embedding,
-                                    langchain_mode1, langchain_mode_paths, langchain_mode_types,
-                                    hf_embedding_model,
-                                    migrate_embedding_model,
-                                    auto_migrate_db,
-                                    kwargs_make_db=locals(),
-                                    verbose=verbose)
-            finally:
-                # in case updated embeddings or created new embeddings
-                clear_torch_cache()
-            dbs[langchain_mode1] = db
-        # remove None dbs so one can just rely upon "k in dbs" to check whether a db exists
-        dbs = {k: v for k, v in dbs.items() if v is not None}
-    else:
-        dbs = {}
-        # import control
-        if os.environ.get("TEST_LANGCHAIN_IMPORT"):
-            assert 'gpt_langchain' not in sys.modules, "Dev bug, import of langchain when should not have"
-            assert 'langchain' not in sys.modules, "Dev bug, import of langchain when should not have"
-
-    other_model_state_defaults = dict(load_8bit=load_8bit, load_4bit=load_4bit, low_bit_mode=low_bit_mode,
-                                      load_half=load_half,
-                                      load_gptq=load_gptq, load_exllama=load_exllama, use_safetensors=use_safetensors,
-                                      revision=revision, use_gpu_id=use_gpu_id, gpu_id=gpu_id,
-                                      compile_model=compile_model,
-                                      use_cache=use_cache,
-                                      llamacpp_dict=llamacpp_dict, model_path_llama=model_path_llama,
-                                      model_name_gptj=model_name_gptj,
-                                      model_name_gpt4all_llama=model_name_gpt4all_llama,
-                                      model_name_exllama_if_no_config=model_name_exllama_if_no_config,
-                                      )
-    model_state_none = dict(model=None, tokenizer=None, device=None,
-                            base_model=None, tokenizer_base_model=None, lora_weights=None,
-                            inference_server=None, prompt_type=None, prompt_dict=None,
-                            visible_models=None, h2ogpt_key=None,
-                            )
-    model_state_none.update(other_model_state_defaults)
-    my_db_state0 = {LangChainMode.MY_DATA.value: [None, None, None]}
-    selection_docs_state0 = dict(langchain_modes=langchain_modes,
-                                 langchain_mode_paths=langchain_mode_paths,
-                                 langchain_mode_types=langchain_mode_types)
-    selection_docs_state = copy.deepcopy(selection_docs_state0)
-
-    if cli or not gradio:
-        # initial state for query prompt
-        model_name = base_model
-        pre_prompt_query, prompt_query, pre_prompt_summary, prompt_summary = \
-            get_langchain_prompts(pre_prompt_query, prompt_query,
-                                  pre_prompt_summary, prompt_summary,
-                                  model_name, inference_server,
-                                  model_path_llama)
-
-    if cli:
-        from cli import run_cli
-        return run_cli(**get_kwargs(run_cli, exclude_names=['model_state0'], **locals()))
-    elif not gradio:
-        from eval import run_eval
-        return run_eval(**get_kwargs(run_eval, exclude_names=['model_state0'], **locals()))
-    elif gradio or prepare_offline_level > 0:
-        # imported here so don't require gradio to run generate
-        from gradio_runner import go_gradio
-
-        # get default model
-        model_states = []
-        model_list = [dict(base_model=base_model, tokenizer_base_model=tokenizer_base_model, lora_weights=lora_weights,
-                           inference_server=inference_server, prompt_type=prompt_type, prompt_dict=prompt_dict,
-                           visible_models=None, h2ogpt_key=None)]
-        model_list[0].update(other_model_state_defaults)
-        # FIXME: hyper per model, not about model loading
-        # for k in gen_hyper:
-        #     model_list[k] = locals()[k]
-
-        model_list0 = copy.deepcopy(model_list)  # just strings, safe to deepcopy
-        model_state0 = model_state_none.copy()
-        assert len(model_state_none) == len(model_state0)
-        if model_lock:
-            model_list = model_lock
-        # do reverse, so first is default base_model etc., so some logic works in go_gradio() more easily
-        for model_dict in reversed(model_list):
-            # handle defaults the user didn't have to pass
-            # special defaults: if these are not specifically set, use '' rather than the CLI defaults
-            # (see the illustrative model_lock sketch just after main())
-            model_dict['base_model'] = model_dict.get('base_model', '')
-            model_dict['tokenizer_base_model'] = model_dict.get('tokenizer_base_model', '')
-            model_dict['lora_weights'] = model_dict.get('lora_weights', '')
-            model_dict['inference_server'] = model_dict.get('inference_server', '')
-            if prepare_offline_level >= 2:
-                if 'openai' not in model_dict['inference_server'] and 'replicate' not in model_dict['inference_server']:
-                    # assume want locally, but OpenAI and replicate are never local for model part
-                    model_dict['inference_server'] = ''
-            prompt_type_infer = not model_dict.get('prompt_type')
-            model_dict['prompt_type'] = model_dict.get('prompt_type',
-                                                       model_list0[0]['prompt_type'])  # don't use mutated value
-            # rest of generic defaults
-            for k in model_list0[0]:
-                if k not in model_dict:
-                    model_dict[k] = model_list0[0][k]
-
-            # begin prompt adjustments
-            # get query prompt for (say) last base model if using model lock
-            pre_prompt_query1, prompt_query1, pre_prompt_summary1, prompt_summary1 = (
-                get_langchain_prompts(pre_prompt_query, prompt_query,
-                                      pre_prompt_summary, prompt_summary,
-                                      model_dict['base_model'],
-                                      model_dict['inference_server'],
-                                      model_dict['model_path_llama']))
-            # if mixed setup, choose the non-empty prompts so the best models keep their best prompts
-            # FIXME: Make per model dict passed through to evaluate
-            pre_prompt_query = pre_prompt_query or pre_prompt_query1
-            prompt_query = prompt_query or prompt_query1
-            pre_prompt_summary = pre_prompt_summary or pre_prompt_summary1
-            prompt_summary = prompt_summary or prompt_summary1
-
-            # try to infer, ignore empty initial state leading to get_generate_params -> 'plain'
-            if prompt_type_infer:
-                model_lower1 = model_dict['base_model'].lower()
-                if model_lower1 in inv_prompt_type_to_model_lower:
-                    model_dict['prompt_type'] = inv_prompt_type_to_model_lower[model_lower1]
-                    model_dict['prompt_dict'], error0 = get_prompt(model_dict['prompt_type'], '',
-                                                                   chat=False, context='', reduced=False,
-                                                                   making_context=False,
-                                                                   return_dict=True,
-                                                                   system_prompt=system_prompt)
-                else:
-                    model_dict['prompt_dict'] = prompt_dict
-            else:
-                model_dict['prompt_dict'] = prompt_dict
-            model_dict['prompt_dict'] = model_dict.get('prompt_dict', model_dict['prompt_dict'])
-            # end prompt adjustments
-            all_kwargs = locals().copy()
-            all_kwargs.update(model_dict)
-            if model_dict['base_model'] and not login_mode_if_model0:
-                model0, tokenizer0, device = get_model(reward_type=False,
-                                                       **get_kwargs(get_model, exclude_names=['reward_type'],
-                                                                    **all_kwargs))
-            else:
-                # if empty model, then don't load anything, just get gradio up
-                model0, tokenizer0, device = None, None, None
-            if model0 is None:
-                if fail_if_cannot_connect:
-                    raise RuntimeError("Could not connect, see logs")
-                # skip
-                if isinstance(model_lock, list):
-                    model_lock.remove(model_dict)
-                continue
-            model_state_trial = dict(model=model0, tokenizer=tokenizer0, device=device)
-            model_state_trial.update(model_dict)
-            diff_keys = set(list(model_state_none.keys())).symmetric_difference(model_state_trial.keys())
-            assert len(model_state_none) == len(model_state_trial), diff_keys
-            print("Model %s" % model_dict, flush=True)
-            if model_lock:
-                # last in iteration will be first
-                model_states.insert(0, model_state_trial)
-                # fill model_state0 so go_gradio() easier, manage model_states separately
-                model_state0 = model_state_trial.copy()
-            else:
-                model_state0 = model_state_trial.copy()
-            assert len(model_state_none) == len(model_state0)
-
-        visible_models = str_to_list(visible_models, allow_none=True)  # None means first model
-        all_models = [x.get('base_model', xi) for xi, x in enumerate(model_states)]
-        visible_models_state0 = [x.get('base_model', xi) for xi, x in enumerate(model_states) if
-                                 visible_models is None or
-                                 x.get('base_model', xi) in visible_models or
-                                 xi in visible_models]
-
-        # update to be consistent with what is passed from the CLI and the chosen model
-        # do this after going over all models in the multi-model case, so values don't get contaminated
-        # This is just so the UI shows a reasonable, correct value instead of the 2048 dummy value
-        if len(model_states) >= 1:
-            max_seq_len = model_states[0]['tokenizer'].model_max_length
-
-        # get score model
-        all_kwargs = locals().copy()
-        smodel, stokenizer, sdevice = get_score_model(reward_type=True,
-                                                      **get_kwargs(get_score_model, exclude_names=['reward_type'],
-                                                                   **all_kwargs))
-        score_model_state0 = dict(model=smodel, tokenizer=stokenizer, device=sdevice,
-                                  base_model=score_model, tokenizer_base_model='', lora_weights='',
-                                  inference_server='', prompt_type='', prompt_dict='')
-
-        if enable_captions:
-            if pre_load_caption_model:
-                from image_captions import H2OImageCaptionLoader
-                caption_loader = H2OImageCaptionLoader(caption_gpu=caption_gpu).load_model()
-            else:
-                caption_loader = 'gpu' if n_gpus > 0 and caption_gpu else 'cpu'
-        else:
-            caption_loader = False
-
-        if pre_load_embedding_model and langchain_mode != 'Disabled' and not use_openai_embedding:
-            from src.gpt_langchain import get_embedding
-            hf_embedding_model = dict(name=hf_embedding_model,
-                                      model=get_embedding(use_openai_embedding, hf_embedding_model=hf_embedding_model,
-                                                          preload=True))
-        if enable_doctr or enable_pdf_ocr in [True, 'auto', 'on']:
-            doctr_loader = 'gpu' if n_gpus > 0 and doctr_gpu else 'cpu'
-        else:
-            doctr_loader = False
-
-        # assume gradio needs everything
-        go_gradio(**locals())
-
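-# Illustrative sketch only, not original code: the shape of a --model_lock list as
-# normalized by main() above.  Model names, URL, and prompt_type below are placeholder
-# assumptions, not recommendations.
-#
-#   model_lock = [
-#       dict(base_model='h2oai/h2ogpt-oig-oasst1-512-6_9b'),
-#       dict(base_model='llama', inference_server='http://192.168.1.2:5000',
-#            prompt_type='llama2'),
-#   ]
-#
-# Keys missing from an entry default to '' for base_model/tokenizer_base_model/
-# lora_weights/inference_server and to the 0th (CLI) model's values for everything else.
-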
-
-def get_config(base_model,
-               use_auth_token=False,
-               trust_remote_code=True,
-               offload_folder=None,
-               revision=None,
-               rope_scaling=None,
-               triton_attn=False,
-               long_sequence=True,
-               return_model=False,
-               raise_exception=False,
-               max_seq_len=None,
-               verbose=False,
-               ):
-    from accelerate import init_empty_weights
-    with init_empty_weights():
-        from transformers import AutoConfig
-        try:
-            config = AutoConfig.from_pretrained(base_model, use_auth_token=use_auth_token,
-                                                trust_remote_code=trust_remote_code,
-                                                offload_folder=offload_folder,
-                                                revision=revision,
-                                                rope_scaling=rope_scaling if rope_scaling else None)
-        except OSError as e:
-            if raise_exception:
-                raise
-            if 'not a local folder and is not a valid model identifier listed on' in str(
-                    e) or '404 Client Error' in str(e) or "couldn't connect" in str(e):
-                # e.g. llama, gptj, etc.
-                # e.g. HF TGI but not model on HF or private etc.
-                if max_seq_len is None and base_model.lower() in non_hf_types:
-                    print("Could not determine --max_seq_len, setting to 2048.  Pass if not correct", flush=True)
-                    max_seq_len = 2048
-                # HF TGI server only should really require prompt_type, not HF model state
-                return None, None, max_seq_len
-            else:
-                raise
-        if triton_attn and 'mpt-' in base_model.lower():
-            config.attn_config['attn_impl'] = 'triton'
-        if long_sequence:
-            if 'mpt-7b-storywriter' in base_model.lower():
-                config.update({"max_seq_len": 83968})
-            if 'mosaicml/mpt-7b-chat' in base_model.lower():
-                config.update({"max_seq_len": 4096})
-            if 'mpt-30b' in base_model.lower():
-                config.update({"max_seq_len": 2 * 8192})
-        if return_model and \
-                issubclass(config.__class__, tuple(AutoModel._model_mapping.keys())):
-            model = AutoModel.from_config(
-                config,
-                trust_remote_code=trust_remote_code,
-            )
-        else:
-            # can't infer
-            model = None
-    if 'falcon' in base_model.lower():
-        config.use_cache = False
-
-    # allow override
-    if max_seq_len is not None:
-        print("Overriding max_seq_len -> %d" % max_seq_len, flush=True)
-    else:
-        if hasattr(config, 'max_seq_len'):
-            max_seq_len = int(config.max_seq_len)
-        elif hasattr(config, 'max_position_embeddings') and isinstance(config.max_position_embeddings, int):
-            # help automatically limit inputs to generate
-            max_seq_len = config.max_position_embeddings
-            if verbose:
-                print("Used max_position_embeddings=%s as base model (pre-rope) max_seq_len."
-                      "  If not desired, pass --max_seq_len and set to some integer value." % config.max_position_embeddings,
-                      flush=True)
-        elif hasattr(config, 'n_ctx'):
-            # e.g. gpt2
-            max_seq_len = int(config.n_ctx)
-        else:
-            print("Could not determine --max_seq_len, setting to 2048.  Pass if not correct", flush=True)
-            max_seq_len = 2048
-            # FIXME:
-            # raise RuntimeError("Could not determine max_seq_len,"
-            #                   " please pass --max_seq_len and set to some value, e.g. 2048.")
-
-        if rope_scaling:
-            if rope_scaling.get('factor'):
-                # HF transformers
-                max_seq_len *= rope_scaling.get('factor')
-            elif rope_scaling.get('alpha_value'):
-                # exllama
-                # Note: exllama's own tokenizer has this set correctly in loaders.py, this config will be unused
-                max_seq_len *= rope_scaling.get('alpha_value')
-            print("Automatically setting max_seq_len=%d for RoPE scaling" % max_seq_len, flush=True)
-
-    return config, model, max_seq_len
-
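-# Illustrative sketch only, not original code: probing a model's context length with
-# get_config() above, without materializing weights.  'gpt2' is just a small public
-# placeholder model.
-#
-#   config, _, max_seq_len = get_config('gpt2', return_model=False, raise_exception=False)
-#   # gpt2 advertises a 1024-token context, so max_seq_len becomes 1024.  Models exposing
-#   # none of max_seq_len / max_position_embeddings / n_ctx fall back to 2048, and a
-#   # rope_scaling dict like {'type': 'dynamic', 'factor': 4} multiplies the result by 4
-#   # ('alpha_value' plays the same role for exllama).
-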
-
-def get_non_lora_model(base_model, model_loader, load_half,
-                       load_gptq,
-                       load_exllama,
-                       use_safetensors,
-                       revision,
-                       model_kwargs, reward_type,
-                       config, model,
-                       gpu_id=0,
-                       ):
-    """
-    Ensure model gets on correct device
-    """
-
-    if model is not None:
-        # NOTE: Can specify max_memory={0: max_mem, 1: max_mem}, to shard model
-        # NOTE: Some models require avoiding sharding some layers,
-        # then would pass no_split_module_classes and give list of those layers.
-        from accelerate import infer_auto_device_map
-        device_map = infer_auto_device_map(
-            model,
-            dtype=torch.float16 if load_half else torch.float32,
-        )
-        if hasattr(model, 'model'):
-            device_map_model = infer_auto_device_map(
-                model.model,
-                dtype=torch.float16 if load_half else torch.float32,
-            )
-            device_map.update(device_map_model)
-    else:
-        device_map = "auto"
-
-    n_gpus = torch.cuda.device_count() if torch.cuda.is_available() else 0
-    n_gpus, gpu_ids = cuda_vis_check(n_gpus)
-
-    if n_gpus > 0:
-        if gpu_id >= 0:
-            # FIXME: If really distributes model, tend to get things like: ValueError: gpt_neox.embed_in.weight doesn't have any device set.
-            # So avoid for now, just put on first GPU, unless score_model, put on last
-            if reward_type:
-                device_map = {'': n_gpus - 1}
-            else:
-                device_map = {'': min(n_gpus - 1, gpu_id)}
-        if gpu_id == -1:
-            device_map = {'': 'cuda'}
-    else:
-        device_map = {'': 'cpu'}
-        model_kwargs['load_in_8bit'] = False
-        model_kwargs['load_in_4bit'] = False
-    print('device_map: %s' % device_map, flush=True)
-
-    load_in_8bit = model_kwargs.get('load_in_8bit', False)
-    load_in_4bit = model_kwargs.get('load_in_4bit', False)
-    model_kwargs['device_map'] = device_map
-    model_kwargs['use_safetensors'] = use_safetensors
-    model_kwargs['revision'] = revision
-    pop_unused_model_kwargs(model_kwargs)
-
-    if load_exllama:
-        model = model_loader
-    elif load_gptq:
-        if 'Llama-2-70B-chat-GPTQ' in base_model:
-            model_kwargs.update(dict(inject_fused_attention=False))
-        model_kwargs.pop('torch_dtype', None)
-        model_kwargs.pop('device_map')
-        model = model_loader(
-            model_name_or_path=base_model,
-            model_basename=load_gptq,
-            **model_kwargs,
-        )
-    elif load_in_8bit or load_in_4bit or not load_half:
-        model = model_loader(
-            base_model,
-            config=config,
-            **model_kwargs,
-        )
-    else:
-        model = model_loader(
-            base_model,
-            config=config,
-            **model_kwargs,
-        )
-        if not getattr(model, "is_quantized", False):
-            model = model.half()
-    return model
-
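-# Added commentary, not original code: get_non_lora_model() above collapses the inferred
-# device_map to a single device to avoid cross-device tensor errors, roughly:
-#   {'': gpu_id}     normal model pinned to the requested GPU (capped at n_gpus - 1)
-#   {'': n_gpus - 1} reward/score model parked on the last GPU
-#   {'': 'cuda'}     gpu_id == -1, let CUDA place it
-#   {'': 'cpu'}      no GPUs, with load_in_8bit/load_in_4bit forced off
-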
-
-def get_client_from_inference_server(inference_server, base_model=None, raise_connection_exception=False):
-    inference_server, headers = get_hf_server(inference_server)
-    # preload client since slow for gradio case especially
-    from gradio_utils.grclient import GradioClient
-    gr_client = None
-    hf_client = None
-    if headers is None:
-        try:
-            print("GR Client Begin: %s %s" % (inference_server, base_model), flush=True)
-            # first do sanity check if alive, else gradio client takes too long by default
-            requests.get(inference_server, timeout=int(os.getenv('REQUEST_TIMEOUT', '30')))
-            gr_client = GradioClient(inference_server)
-            print("GR Client End: %s" % inference_server, flush=True)
-        except (OSError, ValueError) as e:
-            # Occurs when wrong endpoint and should have been HF client, so don't hard raise, just move to HF
-            gr_client = None
-            print("GR Client Failed %s %s: %s" % (inference_server, base_model, str(e)), flush=True)
-        except (ConnectTimeoutError, ConnectTimeout, MaxRetryError, ConnectionError, ConnectionError2,
-                JSONDecodeError, ReadTimeout2, KeyError) as e:
-            t, v, tb = sys.exc_info()
-            ex = ''.join(traceback.format_exception(t, v, tb))
-            print("GR Client Failed %s %s: %s" % (inference_server, base_model, str(ex)), flush=True)
-            if raise_connection_exception:
-                raise
-
-    if gr_client is None:
-        res = None
-        from text_generation import Client as HFClient
-        print("HF Client Begin: %s %s" % (inference_server, base_model))
-        try:
-            hf_client = HFClient(inference_server, headers=headers, timeout=int(os.getenv('REQUEST_TIMEOUT', '30')))
-            # quick check valid TGI endpoint
-            res = hf_client.generate('What?', max_new_tokens=1)
-            hf_client = HFClient(inference_server, headers=headers, timeout=300)
-        except (ConnectTimeoutError, ConnectTimeout, MaxRetryError, ConnectionError, ConnectionError2,
-                JSONDecodeError, ReadTimeout2, KeyError) as e:
-            hf_client = None
-            t, v, tb = sys.exc_info()
-            ex = ''.join(traceback.format_exception(t, v, tb))
-            print("HF Client Failed %s %s: %s" % (inference_server, base_model, str(ex)))
-            if raise_connection_exception:
-                raise
-        print("HF Client End: %s %s : %s" % (inference_server, base_model, res))
-    return inference_server, gr_client, hf_client
-
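-# Illustrative sketch only, not original code: how the helper above might be called.
-# The URL and model name are placeholder assumptions.
-#
-#   inference_server, gr_client, hf_client = get_client_from_inference_server(
-#       'http://192.168.1.2:7860', base_model='h2oai/h2ogpt-oig-oasst1-512-6_9b')
-#   # gr_client is set when the endpoint answers as a Gradio app; otherwise the code
-#   # falls back to a text-generation-inference (TGI) client in hf_client, validated
-#   # with a 1-token generate() call.
-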
-
-def get_model(
-        load_8bit: bool = False,
-        load_4bit: bool = False,
-        low_bit_mode: int = 1,
-        load_half: bool = True,
-        load_gptq: str = '',
-        load_exllama: bool = False,
-        use_safetensors: bool = False,
-        revision: str = None,
-        use_gpu_id: bool = True,
-        base_model: str = '',
-        inference_server: str = "",
-        tokenizer_base_model: str = '',
-        lora_weights: str = "",
-        gpu_id: int = 0,
-        n_jobs=None,
-
-        reward_type: bool = None,
-        local_files_only: bool = False,
-        resume_download: bool = True,
-        use_auth_token: Union[str, bool] = False,
-        trust_remote_code: bool = True,
-        offload_folder: str = None,
-        rope_scaling: dict = None,
-        max_seq_len: int = None,
-        compile_model: bool = True,
-        llamacpp_dict=None,
-
-        verbose: bool = False,
-):
-    """
-
-    :param load_8bit: load model in 8-bit, not supported by all models
-    :param load_4bit: load model in 4-bit, not supported by all models
-    :param low_bit_mode: See gen.py
-    :param load_half: load model in 16-bit
-    :param load_gptq: GPTQ model_basename
-    :param load_exllama: whether to use exllama
-    :param use_safetensors: use safetensors file
-    :param revision:
-    :param use_gpu_id: Use torch to infer the optimal placement of layers on devices (for the non-LoRA case).
-           For the non-LoRA case, False will spread shards across multiple GPUs, but this can lead to cuda:x vs. cuda:y mismatches,
-           so it is not the default.
-    :param base_model: name/path of base model
-    :param inference_server: whether base_model is hosted locally ('') or via http (url)
-    :param tokenizer_base_model: name/path of tokenizer
-    :param lora_weights: name/path
-    :param gpu_id: which GPU (0..n_gpus-1) or allow all GPUs if relevant (-1)
-    :param n_jobs: number of cores to use (e.g. for llama CPU model)
-    :param reward_type: reward type model for sequence classification
-    :param local_files_only: use local files instead of from HF
-    :param resume_download: resume downloads from HF
-    :param use_auth_token: assumes user did on CLI `huggingface-cli login` to access private repo
-    :param trust_remote_code: trust code needed by model
-    :param offload_folder: offload folder
-    :param rope_scaling: scaling for rope-based models, e.g. "{'type':'dynamic', 'factor':4}"
-    :param max_seq_len: if set, override the model's maximum sequence length
-    :param compile_model: whether to compile torch model
-    :param llamacpp_dict: dict of llama.cpp and GPT4All model options
-    :param verbose:
-    :return:
-    """
-    print("Starting get_model: %s %s" % (base_model, inference_server), flush=True)
-
-    triton_attn = False
-    long_sequence = True
-    config_kwargs = dict(use_auth_token=use_auth_token,
-                         trust_remote_code=trust_remote_code,
-                         offload_folder=offload_folder,
-                         rope_scaling=rope_scaling,
-                         triton_attn=triton_attn,
-                         long_sequence=long_sequence,
-                         revision=revision,
-                         max_seq_len=max_seq_len,
-                         verbose=verbose)
-    config, _, max_seq_len = get_config(base_model, **config_kwargs, raise_exception=False)
-
-    if base_model in non_hf_types:
-        assert config is None, "Expected config None for %s" % base_model
-
-    llama_type_from_config = 'llama' in str(config).lower()
-    llama_type_from_name = "llama" in base_model.lower()
-    llama_type = llama_type_from_config or llama_type_from_name
-    if "xgen" in base_model.lower() or 'llama2' in base_model.lower() or 'llama-2' in base_model.lower():
-        llama_type = False
-    if llama_type:
-        if verbose:
-            print("Detected as llama type from"
-                  " config (%s) or name (%s)" % (llama_type_from_config, llama_type_from_name), flush=True)
-
-    model_name_exllama_if_no_config = '' if not llamacpp_dict else llamacpp_dict.get('model_name_exllama_if_no_config',
-                                                                                     '')
-    model_loader, tokenizer_loader, conditional_type = (
-        get_loaders(model_name=base_model, reward_type=reward_type, llama_type=llama_type,
-                    load_gptq=load_gptq, load_exllama=load_exllama, config=config,
-                    rope_scaling=rope_scaling, max_seq_len=max_seq_len,
-                    model_name_exllama_if_no_config=model_name_exllama_if_no_config))
-
-    tokenizer_kwargs = dict(local_files_only=local_files_only,
-                            resume_download=resume_download,
-                            use_auth_token=use_auth_token,
-                            trust_remote_code=trust_remote_code,
-                            offload_folder=offload_folder,
-                            revision=revision,
-                            padding_side='left',
-                            config=config,
-                            )
-    if not tokenizer_base_model:
-        tokenizer_base_model = base_model
-
-    if load_exllama:
-        tokenizer = tokenizer_loader
-    elif config is not None and tokenizer_loader is not None and not isinstance(tokenizer_loader, str):
-        # load_exllama is already handled above, so this branch always builds an HF tokenizer
-        tokenizer = tokenizer_loader.from_pretrained(tokenizer_base_model, **tokenizer_kwargs)
-        # sets raw (no cushion) limit
-        # If using RoPE with scaling, then for non-exllama models (e.g. HF models),
-        #  config -> tokenizer will set model_max_length correctly
-        set_model_max_len(max_seq_len, tokenizer, verbose=False)
-        # if using fake tokenizer, not really accurate when lots of numbers, give a bit of buffer, else get:
-        # Generation Failed: Input validation error: `inputs` must have less than 2048 tokens. Given: 2233
-        tokenizer.model_max_length = tokenizer.model_max_length - 50
-    else:
-        tokenizer = None
-
-    if isinstance(inference_server, str) and inference_server.startswith("http"):
-        inference_server, gr_client, hf_client = get_client_from_inference_server(inference_server,
-                                                                                  base_model=base_model)
-        client = gr_client or hf_client
-        # Don't return None, None for model, tokenizer so triggers
-        if tokenizer is None:
-            # FIXME: Could use only tokenizer from llama etc. but hard to detach from model, just use fake for now
-            if os.getenv("HARD_ASSERTS") and base_model not in non_hf_types:
-                raise RuntimeError("Unexpected tokenizer=None")
-            tokenizer = FakeTokenizer()
-        return client, tokenizer, 'http'
-    if isinstance(inference_server, str) and (
-            inference_server.startswith('openai') or
-            inference_server.startswith('vllm') or
-            inference_server.startswith('replicate') or
-            inference_server.startswith('sagemaker')
-    ):
-        if inference_server.startswith('openai'):
-            assert os.getenv('OPENAI_API_KEY'), "Set environment for OPENAI_API_KEY"
-            # Don't return None, None for model, tokenizer so triggers
-            # include small token cushion
-            max_seq_len = model_token_mapping[base_model]
-        if inference_server.startswith('replicate'):
-            assert len(inference_server.split(':')) >= 3, "Expected replicate:model string, got %s" % inference_server
-            assert os.getenv('REPLICATE_API_TOKEN'), "Set environment for REPLICATE_API_TOKEN"
-            assert max_seq_len is not None, "Please pass --max_seq_len=<max_seq_len> for replicate models."
-            try:
-                import replicate as replicate_python
-            except ImportError:
-                raise ImportError(
-                    "Could not import replicate python package. "
-                    "Please install it with `pip install replicate`."
-                )
-        if inference_server.startswith('sagemaker'):
-            assert len(
-                inference_server.split(
-                    ':')) >= 3, "Expected sagemaker_chat:<endpoint name>:<region>, got %s" % inference_server
-            assert os.getenv('AWS_ACCESS_KEY_ID'), "Set environment for AWS_ACCESS_KEY_ID"
-            assert os.getenv('AWS_SECRET_ACCESS_KEY'), "Set environment for AWS_SECRET_ACCESS_KEY"
-        # Don't return None, None for model, tokenizer so triggers
-        # include small token cushion
-        if inference_server.startswith('openai') or tokenizer is None:
-            # don't use fake (tiktoken) tokenizer for vLLM/replicate if the actual model and tokenizer are known
-            tokenizer = FakeTokenizer(model_max_length=max_seq_len - 50)
-        return inference_server, tokenizer, inference_server
-    assert not inference_server, "Malformed inference_server=%s" % inference_server
-    if base_model in non_hf_types:
-        from gpt4all_llm import get_model_tokenizer_gpt4all
-        model, tokenizer, device = get_model_tokenizer_gpt4all(base_model, n_jobs=n_jobs,
-                                                               max_seq_len=max_seq_len,
-                                                               llamacpp_dict=llamacpp_dict)
-        return model, tokenizer, device
-    if load_exllama:
-        return model_loader, tokenizer, 'cuda'
-
-    # get local torch-HF model
-    return get_hf_model(load_8bit=load_8bit,
-                        load_4bit=load_4bit,
-                        low_bit_mode=low_bit_mode,
-                        load_half=load_half,
-                        load_gptq=load_gptq,
-                        use_safetensors=use_safetensors,
-                        revision=revision,
-                        use_gpu_id=use_gpu_id,
-                        base_model=base_model,
-                        tokenizer_base_model=tokenizer_base_model,
-                        lora_weights=lora_weights,
-                        gpu_id=gpu_id,
-
-                        reward_type=reward_type,
-                        local_files_only=local_files_only,
-                        resume_download=resume_download,
-                        use_auth_token=use_auth_token,
-                        trust_remote_code=trust_remote_code,
-                        offload_folder=offload_folder,
-                        rope_scaling=rope_scaling,
-                        compile_model=compile_model,
-
-                        llama_type=llama_type,
-                        config_kwargs=config_kwargs,
-                        tokenizer_kwargs=tokenizer_kwargs,
-
-                        verbose=verbose)
-
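-# Added commentary, not original code: get_model() above always returns a 3-tuple, but
-# its meaning depends on the backend:
-#   local HF / GPT4All / llama.cpp model -> (model, tokenizer, 'cuda'|'cpu'|'mps')
-#   http inference server                -> (client, tokenizer or FakeTokenizer, 'http')
-#   openai/vllm/replicate/sagemaker      -> (inference_server, tokenizer, inference_server)
-# Callers such as get_score_model() and the gradio runner rely on this shape.
-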
-
-def get_hf_model(load_8bit: bool = False,
-                 load_4bit: bool = False,
-                 low_bit_mode: int = 1,
-                 load_half: bool = True,
-                 load_gptq: str = '',
-                 use_safetensors: bool = False,
-                 revision: str = None,
-                 use_gpu_id: bool = True,
-                 base_model: str = '',
-                 tokenizer_base_model: str = '',
-                 lora_weights: str = "",
-                 gpu_id: int = 0,
-
-                 reward_type: bool = None,
-                 local_files_only: bool = False,
-                 resume_download: bool = True,
-                 use_auth_token: Union[str, bool] = False,
-                 trust_remote_code: bool = True,
-                 offload_folder: str = None,
-                 rope_scaling: dict = None,
-                 compile_model: bool = True,
-
-                 llama_type: bool = False,
-                 config_kwargs=None,
-                 tokenizer_kwargs=None,
-
-                 verbose: bool = False,
-                 ):
-    assert config_kwargs is not None
-    assert tokenizer_kwargs is not None
-
-    load_exllama = False  # Never should be in HF code for exllama
-
-    if lora_weights is not None and lora_weights.strip():
-        if verbose:
-            print("Get %s lora weights" % lora_weights, flush=True)
-    device = get_device()
-
-    if 'gpt2' in base_model.lower():
-        # RuntimeError: where expected condition to be a boolean tensor, but got a tensor with dtype Half
-        load_8bit = False
-        load_4bit = False
-
-    assert base_model.strip(), (
-        "Please choose a base model with --base_model (CLI) or load one from Models Tab (gradio)"
-    )
-
-    model_loader, tokenizer_loader, conditional_type = (
-        get_loaders(model_name=base_model, reward_type=reward_type, llama_type=llama_type,
-                    load_gptq=load_gptq, load_exllama=load_exllama))
-
-    config, _, max_seq_len = get_config(base_model, return_model=False, raise_exception=True, **config_kwargs)
-
-    if tokenizer_loader is not None and not isinstance(tokenizer_loader, str):
-        # load_exllama is forced False above, so this is always a plain HF tokenizer load
-        tokenizer = tokenizer_loader.from_pretrained(tokenizer_base_model,
-                                                     **tokenizer_kwargs)
-    else:
-        tokenizer = tokenizer_loader
-
-    if isinstance(tokenizer, str):
-        # already a pipeline, tokenizer_loader is string for task
-        model = model_loader(tokenizer,
-                             model=base_model,
-                             device=0 if device == "cuda" else -1,
-                             torch_dtype=torch.float16 if device == 'cuda' else torch.float32)
-    else:
-        assert device in ["cuda", "cpu", "mps"], "Unsupported device %s" % device
-        model_kwargs = dict(local_files_only=local_files_only,
-                            torch_dtype=torch.float16 if device == 'cuda' else torch.float32,
-                            resume_download=resume_download,
-                            use_auth_token=use_auth_token,
-                            trust_remote_code=trust_remote_code,
-                            offload_folder=offload_folder,
-                            revision=revision,
-                            # rope_scaling=rope_scaling,  # only put into config
-                            )
-        if 'mbart-' not in base_model.lower() and 'mpt-' not in base_model.lower():
-            if use_gpu_id and gpu_id is not None and gpu_id >= 0 and device == 'cuda':
-                device_map = {"": gpu_id}
-            else:
-                device_map = "auto"
-            model_kwargs.update(dict(load_in_8bit=load_8bit,
-                                     load_in_4bit=load_4bit,
-                                     device_map=device_map,
-                                     ))
-        if 'mpt-' in base_model.lower() and gpu_id is not None and gpu_id >= 0:
-            # MPT doesn't support spreading over GPUs
-            model_kwargs.update(dict(device_map={"": gpu_id} if device == 'cuda' else "cpu"))
-
-        if 'OpenAssistant/reward-model'.lower() in base_model.lower():
-            # FIXME: could put on other GPUs
-            model_kwargs['device_map'] = {"": 0} if device == 'cuda' else {"": 'cpu'}
-            model_kwargs.pop('torch_dtype', None)
-        pop_unused_model_kwargs(model_kwargs)
-
-        n_gpus = torch.cuda.device_count() if torch.cuda.is_available() else 0
-        n_gpus, gpu_ids = cuda_vis_check(n_gpus)
-        if low_bit_mode == 1 and n_gpus != 0:
-            from transformers import BitsAndBytesConfig
-            model_kwargs['quantization_config'] = BitsAndBytesConfig(bnb_4bit_compute_dtype=torch.bfloat16,
-                                                                     load_in_4bit=load_4bit,
-                                                                     load_in_8bit=load_8bit,
-                                                                     )
-        elif low_bit_mode == 2 and n_gpus != 0:
-            from transformers import BitsAndBytesConfig
-            model_kwargs['quantization_config'] = BitsAndBytesConfig(bnb_4bit_quant_type="nf4",
-                                                                     load_in_4bit=load_4bit,
-                                                                     load_in_8bit=load_8bit,
-                                                                     )
-        elif low_bit_mode == 3 and n_gpus != 0:
-            from transformers import BitsAndBytesConfig
-            model_kwargs['quantization_config'] = BitsAndBytesConfig(bnb_4bit_use_double_quant=True,
-                                                                     load_in_4bit=load_4bit,
-                                                                     load_in_8bit=load_8bit,
-                                                                     )
-        elif low_bit_mode == 4 and n_gpus != 0:
-            from transformers import BitsAndBytesConfig
-            model_kwargs['quantization_config'] = BitsAndBytesConfig(bnb_4bit_use_double_quant=True,
-                                                                     bnb_4bit_quant_type="nf4",
-                                                                     load_in_4bit=load_4bit,
-                                                                     load_in_8bit=load_8bit,
-                                                                     )
-
-        if not lora_weights:
-            # torch.device context uses twice memory for AutoGPTQ
-            context = NullContext if load_gptq else torch.device
-            with context(device):
-
-                if use_gpu_id:
-                    config, model, max_seq_len = get_config(base_model,
-                                                            return_model=True, raise_exception=True, **config_kwargs)
-                    model = get_non_lora_model(base_model, model_loader, load_half, load_gptq,
-                                               load_exllama,
-                                               use_safetensors,
-                                               revision,
-                                               model_kwargs, reward_type,
-                                               config, model,
-                                               gpu_id=gpu_id,
-                                               )
-                else:
-                    config, _, max_seq_len = get_config(base_model, **config_kwargs)
-                    if load_half and not (load_8bit or load_4bit or load_gptq):
-                        model = model_loader(
-                            base_model,
-                            config=config,
-                            **model_kwargs)
-                        if not getattr(model, "is_quantized", False):
-                            model = model.half()
-                    else:
-                        model = model_loader(
-                            base_model,
-                            config=config,
-                            **model_kwargs)
-        elif load_8bit or load_4bit:
-            config, _, max_seq_len = get_config(base_model, **config_kwargs)
-            model = model_loader(
-                base_model,
-                config=config,
-                **model_kwargs
-            )
-            from peft import PeftModel  # loads cuda, so avoid in global scope
-            model = PeftModel.from_pretrained(
-                model,
-                lora_weights,
-                torch_dtype=torch.float16 if device == 'cuda' else torch.float32,
-                local_files_only=local_files_only,
-                resume_download=resume_download,
-                use_auth_token=use_auth_token,
-                trust_remote_code=trust_remote_code,
-                offload_folder=offload_folder,
-                rope_scaling=rope_scaling,
-                revision=revision,
-                device_map={"": 0} if device == 'cuda' else {"": 'cpu'},  # seems to be required
-            )
-        else:
-            with torch.device(device):
-                config, _, max_seq_len = get_config(base_model, raise_exception=True, **config_kwargs)
-                model = model_loader(
-                    base_model,
-                    config=config,
-                    **model_kwargs
-                )
-                from peft import PeftModel  # loads cuda, so avoid in global scope
-                model = PeftModel.from_pretrained(
-                    model,
-                    lora_weights,
-                    torch_dtype=torch.float16 if device == 'cuda' else torch.float32,
-                    local_files_only=local_files_only,
-                    resume_download=resume_download,
-                    use_auth_token=use_auth_token,
-                    trust_remote_code=trust_remote_code,
-                    offload_folder=offload_folder,
-                    rope_scaling=rope_scaling,
-                    device_map="auto",
-                )
-                if load_half and not load_gptq:
-                    if not getattr(model, "is_quantized", False):
-                        model = model.half()
-
-    # unwind broken decapoda-research config
-    if llama_type:
-        model.config.pad_token_id = tokenizer.pad_token_id = 0  # unk
-        model.config.bos_token_id = 1
-        model.config.eos_token_id = 2
-    if 'gpt2' in base_model.lower():
-        # add special tokens that otherwise all share the same id
-        tokenizer.add_special_tokens({'bos_token': '<bos>',
-                                      'eos_token': '<eos>',
-                                      'pad_token': '<pad>'})
-
-    if not isinstance(tokenizer, str):
-        model.eval()
-        if torch.__version__ >= "2" and sys.platform != "win32" and compile_model:
-            model = torch.compile(model)
-
-    set_model_max_len(max_seq_len, tokenizer, verbose=False, reward_type=reward_type)
-
-    # tell if conditional type
-    model.conditional_type = conditional_type
-    tokenizer.conditional_type = conditional_type
-
-    return model, tokenizer, device
-
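-# Added commentary, not original code: inside get_hf_model() the low_bit_mode values map
-# onto BitsAndBytesConfig as follows (only when CUDA GPUs are visible):
-#   1 -> bnb_4bit_compute_dtype=torch.bfloat16
-#   2 -> bnb_4bit_quant_type='nf4'
-#   3 -> bnb_4bit_use_double_quant=True
-#   4 -> double quant + nf4 together
-# with load_in_4bit / load_in_8bit forwarded from the CLI flags in every case.
-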
-
-def set_model_max_len(max_seq_len, tokenizer, verbose=False, reward_type=False):
-    if reward_type:
-        # limit deberta, else it uses too much memory and isn't worth it for response scoring
-        tokenizer.model_max_length = 512
-        return
-
-    tokenizer.model_max_length = int(max_seq_len)
-    if verbose:
-        print("model_max_length=%s" % tokenizer.model_max_length, flush=True)
-    # for bug in HF transformers
-    if tokenizer.model_max_length > 100000000:
-        tokenizer.model_max_length = 2048
-
-
-def pop_unused_model_kwargs(model_kwargs):
-    """
-    in-place pop of unused kwargs that are not dependency-upgrade friendly
-    no point passing in False since it is the default, and dropping it avoids needing to update requirements for new deps
-    :param model_kwargs:
-    :return:
-    """
-    check_list = ['load_in_8bit', 'load_in_4bit']
-    for k in check_list:
-        if k in model_kwargs and not model_kwargs[k]:
-            model_kwargs.pop(k)
-
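-# Illustrative sketch only, not original code:
-#
-#   kwargs = dict(load_in_8bit=False, load_in_4bit=True, device_map='auto')
-#   pop_unused_model_kwargs(kwargs)
-#   # kwargs -> {'load_in_4bit': True, 'device_map': 'auto'}; falsy quantization flags
-#   # are dropped so the kwargs stay friendly to older dependency versions.
-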
-
-def get_score_model(score_model: str = None,
-                    load_8bit: bool = False,
-                    load_4bit: bool = False,
-                    low_bit_mode=1,
-                    load_half: bool = True,
-                    load_gptq: str = '',
-                    load_exllama: bool = False,
-                    use_gpu_id: bool = True,
-                    base_model: str = '',
-                    inference_server: str = '',
-                    tokenizer_base_model: str = '',
-                    lora_weights: str = "",
-                    gpu_id: int = 0,
-                    n_jobs=None,
-
-                    reward_type: bool = None,
-                    local_files_only: bool = False,
-                    resume_download: bool = True,
-                    use_auth_token: Union[str, bool] = False,
-                    trust_remote_code: bool = True,
-                    offload_folder: str = None,
-                    rope_scaling: dict = None,
-                    compile_model: bool = True,
-                    llamacpp_dict: typing.Dict = None,
-
-                    verbose: bool = False,
-                    ):
-    if score_model is not None and score_model.strip():
-        load_8bit = False
-        load_4bit = False
-        low_bit_mode = 1
-        load_half = False
-        load_gptq = ''
-        load_exllama = False
-        use_safetensors = False
-        revision = None
-        base_model = score_model.strip()
-        tokenizer_base_model = ''
-        lora_weights = ''
-        inference_server = ''
-        llama_type = False
-        max_seq_len = None
-        compile_model = False
-        llamacpp_dict = {}
-        smodel, stokenizer, sdevice = get_model(reward_type=True,
-                                                **get_kwargs(get_model, exclude_names=['reward_type'], **locals()))
-    else:
-        smodel, stokenizer, sdevice = None, None, None
-    return smodel, stokenizer, sdevice
-
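-# Added commentary, not original code: get_score_model() above intentionally resets the
-# quantization/LoRA/inference-server options (8-bit, 4-bit, GPTQ, exllama, lora_weights,
-# inference_server all off) so the reward model is always loaded as a plain local HF
-# model via get_model(reward_type=True).
-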
-
-def evaluate_fake(*args, **kwargs):
-    yield dict(response=invalid_key_msg, sources='')
-    return
-
-
-def evaluate(
-        model_state,
-        my_db_state,
-        selection_docs_state,
-        requests_state,
-        # START NOTE: Examples must have same order of parameters
-        instruction,
-        iinput,
-        context,
-        stream_output,
-        prompt_type,
-        prompt_dict,
-        temperature,
-        top_p,
-        top_k,
-        num_beams,
-        max_new_tokens,
-        min_new_tokens,
-        early_stopping,
-        max_time,
-        repetition_penalty,
-        num_return_sequences,
-        do_sample,
-        chat,
-        instruction_nochat,
-        iinput_nochat,
-        langchain_mode,
-        add_chat_history_to_context,
-        langchain_action,
-        langchain_agents,
-        top_k_docs,
-        chunk,
-        chunk_size,
-        document_subset,
-        document_choice,
-        pre_prompt_query,
-        prompt_query,
-        pre_prompt_summary,
-        prompt_summary,
-        system_prompt,
-
-        image_loaders,
-        pdf_loaders,
-        url_loaders,
-        jq_schema,
-        visible_models,
-        h2ogpt_key,
-        add_search_to_context,
-        chat_conversation,
-        text_context_list,
-        docs_ordering_type,
-        min_max_new_tokens,
-
-        # END NOTE: Examples must have same order of parameters
-        captions_model=None,
-        caption_loader=None,
-        doctr_loader=None,
-        pix2struct_loader=None,
-        async_output=None,
-        num_async=None,
-        src_lang=None,
-        tgt_lang=None,
-        debug=False,
-        concurrency_count=None,
-        save_dir=None,
-        sanitize_bot_response=False,
-        model_state0=None,
-        memory_restriction_level=None,
-        max_max_new_tokens=None,
-        is_public=None,
-        max_max_time=None,
-        raise_generate_gpu_exceptions=None,
-        lora_weights=None,
-        use_llm_if_no_docs=True,
-        load_db_if_exists=True,
-        dbs=None,
-        detect_user_path_changes_every_query=None,
-        use_openai_embedding=None,
-        use_openai_model=None,
-        hf_embedding_model=None,
-        migrate_embedding_model=None,
-        auto_migrate_db=None,
-        cut_distance=None,
-        db_type=None,
-        n_jobs=None,
-        first_para=None,
-        text_limit=None,
-        show_accordions=None,
-        top_k_docs_max_show=None,
-        show_link_in_sources=None,
-        verbose=False,
-        cli=False,
-        use_cache=None,
-        auto_reduce_chunks=None,
-        max_chunks=None,
-        headsize=None,
-        model_lock=None,
-        force_langchain_evaluate=None,
-        model_state_none=None,
-        load_exllama=None,
-        answer_with_sources=None,
-        append_sources_to_answer=None,
-        image_loaders_options0=None,
-        pdf_loaders_options0=None,
-        url_loaders_options0=None,
-        jq_schema0=None,
-        keep_sources_in_context=None,
-):
-    # ensure passed these
-    assert concurrency_count is not None
-    assert memory_restriction_level is not None
-    assert raise_generate_gpu_exceptions is not None
-    assert use_openai_embedding is not None
-    assert use_openai_model is not None
-    assert hf_embedding_model is not None
-    assert migrate_embedding_model is not None
-    assert auto_migrate_db is not None
-    assert db_type is not None
-    assert top_k_docs is not None and isinstance(top_k_docs, int)
-    assert chunk is not None and isinstance(chunk, bool)
-    assert chunk_size is not None and isinstance(chunk_size, int)
-    assert n_jobs is not None
-    assert first_para is not None
-    assert isinstance(add_chat_history_to_context, bool)
-    assert isinstance(add_search_to_context, bool)
-    assert load_exllama is not None
-    # for lazy client (even chat client)
-    if image_loaders is None:
-        image_loaders = image_loaders_options0
-    if pdf_loaders is None:
-        pdf_loaders = pdf_loaders_options0
-    if url_loaders is None:
-        url_loaders = url_loaders_options0
-    if jq_schema is None:
-        jq_schema = jq_schema0
-    if isinstance(langchain_agents, str):
-        if langchain_agents.strip().startswith('['):
-            # already list, but as string
-            langchain_agents = str_to_list(langchain_agents)
-        else:
-            # just 1 item and make list
-            langchain_agents = [langchain_agents]
-    chat_conversation = str_to_list(chat_conversation)
-    text_context_list = str_to_list(text_context_list)
-
-    langchain_modes = selection_docs_state['langchain_modes']
-    langchain_mode_paths = selection_docs_state['langchain_mode_paths']
-    langchain_mode_types = selection_docs_state['langchain_mode_types']
-
-    if debug:
-        locals_dict = locals().copy()
-        locals_dict.pop('model_state', None)
-        locals_dict.pop('model_state0', None)
-        locals_dict.pop('model_states', None)
-        print(locals_dict)
-
-    no_model_msg = "Please choose a base model with --base_model (CLI) or load in Models Tab (gradio).\n" \
-                   "Then start New Conversation"
-
-    if model_state is None:
-        model_state = model_state_none.copy()
-    if model_state0 is None:
-        # e.g. for no gradio case, set dummy value, else should be set
-        model_state0 = model_state_none.copy()
-
-    # model_state['model'] is only the string 'model' if model_state0 should be used
-    # model could also be None
-    have_model_lock = model_lock is not None
-    have_fresh_model = model_state['model'] not in [None, 'model', no_model_str]
-    # for gradio UI control, expect model_state and model_state0 to match, so if have_model_lock=True, then should have_fresh_model=True
-    # but gradio API control will only use nochat api etc. and won't use fresh model, so can't assert in general
-    # if have_model_lock:
-    #    assert have_fresh_model, "Expected model_state and model_state0 to match if have_model_lock"
-    have_cli_model = model_state0['model'] not in [None, 'model', no_model_str]
-
-    if have_fresh_model:
-        # USE FRESH MODEL
-        if not have_model_lock:
-            # model_state0 is just one of model_state if model_lock, so don't nuke
-            # try to free-up original model (i.e. list was passed as reference)
-            if model_state0['model'] and hasattr(model_state0['model'], 'cpu'):
-                model_state0['model'].cpu()
-                model_state0['model'] = None
-            # try to free-up original tokenizer (i.e. list was passed as reference)
-            if model_state0['tokenizer']:
-                model_state0['tokenizer'] = None
-            clear_torch_cache()
-        chosen_model_state = model_state
-    elif have_cli_model:
-        # USE MODEL SETUP AT CLI
-        assert isinstance(model_state['model'], (type(None), str))  # expect no fresh model
-        chosen_model_state = model_state0
-    else:
-        raise AssertionError(no_model_msg)
-    # get variables
-    model = chosen_model_state['model']
-    tokenizer = chosen_model_state['tokenizer']
-    device = chosen_model_state['device']
-    base_model = chosen_model_state['base_model']
-    tokenizer_base_model = chosen_model_state['tokenizer_base_model']
-    lora_weights = chosen_model_state['lora_weights']
-    inference_server = chosen_model_state['inference_server']
-    visible_models = chosen_model_state['visible_models']
-    # use overall key if have, so key for this gradio and any inner gradio
-    if chosen_model_state['h2ogpt_key'] is not None:
-        h2ogpt_key = chosen_model_state['h2ogpt_key']
-    # prefer use input from API over model state
-    prompt_type = prompt_type or chosen_model_state['prompt_type']
-    prompt_dict = prompt_dict or chosen_model_state['prompt_dict']
-
-    if base_model is None:
-        raise AssertionError(no_model_msg)
-
-    assert base_model.strip(), no_model_msg
-    assert model, "Model is missing"
-    assert tokenizer, "Tokenizer is missing"
-
-    # choose chat or non-chat mode
-    if not chat:
-        instruction = instruction_nochat
-        iinput = iinput_nochat
-
-    # in some cases, like lean nochat API, don't want to force sending prompt_type, allow default choice
-    model_lower = base_model.lower()
-    if not prompt_type and model_lower in inv_prompt_type_to_model_lower and prompt_type != 'custom':
-        prompt_type = inv_prompt_type_to_model_lower[model_lower]
-        if verbose:
-            print("Auto-selecting prompt_type=%s for %s" % (prompt_type, model_lower), flush=True)
-    assert prompt_type is not None, "prompt_type was None"
-
-    # Control generation hyperparameters
-    # adjust for bad inputs, e.g. in case also come from API that doesn't get constrained by gradio sliders
-    # below is for TGI server, not required for HF transformers
-    # limits are chosen similar to gradio_runner.py sliders/numbers
-    top_p = min(max(1e-3, top_p), 1.0 - 1e-3)
-    top_k = min(max(1, int(top_k)), 100)
-    temperature = min(max(0.01, temperature), 2.0)
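-    # Illustrative note: extreme API values are clamped, e.g. top_p=0 -> 1e-3, top_k=0 -> 1,
-    # temperature=0 -> 0.01, keeping near-greedy settings within the range the TGI server accepts.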
-    # FIXME: https://github.com/h2oai/h2ogpt/issues/106
-    num_beams = 1 if stream_output else num_beams  # See max_beams in gradio_runner
-    max_max_new_tokens = get_max_max_new_tokens(chosen_model_state,
-                                                memory_restriction_level=memory_restriction_level,
-                                                max_new_tokens=max_new_tokens,
-                                                max_max_new_tokens=max_max_new_tokens)
-    if min_max_new_tokens is None:
-        # default for nochat api
-        min_max_new_tokens = 256
-    if docs_ordering_type is None:
-        docs_ordering_type = 'reverse_ucurve_sort'
-    model_max_length = get_model_max_length(chosen_model_state)
-    max_new_tokens = min(max(1, int(max_new_tokens)), max_max_new_tokens)
-    min_new_tokens = min(max(0, int(min_new_tokens)), max_new_tokens)
-    max_time = min(max(0, max_time), max_max_time)
-    repetition_penalty = min(max(0.01, repetition_penalty), 3.0)
-    num_return_sequences = 1 if chat else min(max(1, int(num_return_sequences)), 10)
-    min_top_k_docs, max_top_k_docs, label_top_k_docs = get_minmax_top_k_docs(is_public)
-    # limit total tokens processed, e.g. for summarization, if public instance
-    if is_public:
-        total_tokens_for_docs = min(2 * model_max_length, 16384)
-    else:
-        total_tokens_for_docs = None
-    top_k_docs = min(max(min_top_k_docs, int(top_k_docs)), max_top_k_docs)
-    chunk_size = min(max(128, int(chunk_size)), 2048)
-    if not context:
-        context = ''
-
-    # get prompter
-    prompter = Prompter(prompt_type, prompt_dict, debug=debug, chat=chat, stream_output=stream_output,
-                        system_prompt=system_prompt)
-
-    # THIRD PLACE where LangChain referenced, but imports only occur if enabled and have db to use
-    assert langchain_mode in langchain_modes, "Invalid langchain_mode %s not in %s" % (langchain_mode, langchain_modes)
-    assert langchain_action in langchain_actions, "Invalid langchain_action %s not in %s" % (
-        langchain_action, langchain_actions)
-    assert len(
-        set(langchain_agents).difference(langchain_agents_list)) == 0, "Invalid langchain_agents %s" % langchain_agents
-
-    # get db, but also fill db state so return already has my_db_state and dbs filled so faster next query
-    from src.gpt_langchain import get_any_db
-    db = get_any_db(my_db_state, langchain_mode, langchain_mode_paths, langchain_mode_types,
-                    dbs=dbs,
-                    load_db_if_exists=load_db_if_exists,
-                    db_type=db_type,
-                    use_openai_embedding=use_openai_embedding,
-                    hf_embedding_model=hf_embedding_model,
-                    migrate_embedding_model=migrate_embedding_model,
-                    auto_migrate_db=auto_migrate_db,
-                    for_sources_list=True,
-                    verbose=verbose,
-                    n_jobs=n_jobs,
-                    )
-
-    t_generate = time.time()
-    langchain_only_model = base_model in non_hf_types or \
-                           load_exllama or \
-                           inference_server.startswith('replicate') or \
-                           inference_server.startswith('sagemaker') or \
-                           inference_server.startswith('openai_azure_chat') or \
-                           inference_server.startswith('openai_azure')
-    do_langchain_path = langchain_mode not in [False, 'Disabled', 'LLM'] or \
-                        langchain_only_model or \
-                        force_langchain_evaluate or \
-                        len(text_context_list) > 0
-
-    if len(langchain_agents) > 0:
-        do_langchain_path = True
-    if add_search_to_context:
-        # easier to manage prompt etc. by doing full langchain path
-        do_langchain_path = True
-
-    if do_langchain_path:
-        text = ''
-        sources = ''
-        response = ''
-        # use smaller cut_distance for wiki_full since so many matches could be obtained, and often irrelevant unless close
-        from gpt_langchain import run_qa_db
-        gen_hyper_langchain = dict(do_sample=do_sample,
-                                   temperature=temperature,
-                                   repetition_penalty=repetition_penalty,
-                                   top_k=top_k,
-                                   top_p=top_p,
-                                   num_beams=num_beams,
-                                   min_new_tokens=min_new_tokens,
-                                   max_new_tokens=max_new_tokens,
-                                   early_stopping=early_stopping,
-                                   max_time=max_time,
-                                   num_return_sequences=num_return_sequences,
-                                   )
-        loaders_dict, captions_model = gr_to_lg(image_loaders,
-                                                pdf_loaders,
-                                                url_loaders,
-                                                captions_model=captions_model,
-                                                )
-        loaders_dict.update(dict(captions_model=captions_model,
-                                 caption_loader=caption_loader,
-                                 doctr_loader=doctr_loader,
-                                 pix2struct_loader=pix2struct_loader,
-                                 jq_schema=jq_schema,
-                                 ))
-        data_point = dict(context=context, instruction=instruction, input=iinput)
-        # no longer stuff chat history directly into context this early
-        prompt_basic = prompter.generate_prompt(data_point, context_from_history=False)
-        prompt = prompt_basic
-        num_prompt_tokens = 0
-        for r in run_qa_db(
-                inference_server=inference_server,
-                model_name=base_model, model=model, tokenizer=tokenizer,
-                langchain_only_model=langchain_only_model,
-                async_output=async_output,
-                num_async=num_async,
-                prompter=prompter,
-                use_llm_if_no_docs=use_llm_if_no_docs,
-                load_db_if_exists=load_db_if_exists,
-                db=db,
-                langchain_mode_paths=langchain_mode_paths,
-                langchain_mode_types=langchain_mode_types,
-                detect_user_path_changes_every_query=detect_user_path_changes_every_query,
-                cut_distance=1.1 if langchain_mode in ['wiki_full'] else cut_distance,
-                answer_with_sources=answer_with_sources,
-                append_sources_to_answer=append_sources_to_answer,
-                add_chat_history_to_context=add_chat_history_to_context,
-                add_search_to_context=add_search_to_context,
-                keep_sources_in_context=keep_sources_in_context,
-                memory_restriction_level=memory_restriction_level,
-                system_prompt=system_prompt,
-                use_openai_embedding=use_openai_embedding,
-                use_openai_model=use_openai_model,
-                hf_embedding_model=hf_embedding_model,
-                migrate_embedding_model=migrate_embedding_model,
-                auto_migrate_db=auto_migrate_db,
-                first_para=first_para,
-                text_limit=text_limit,
-                show_accordions=show_accordions,
-                top_k_docs_max_show=top_k_docs_max_show,
-                show_link_in_sources=show_link_in_sources,
-
-                # evaluate args items
-                query=instruction,
-                iinput=iinput,
-                context=context,
-                stream_output=stream_output,
-                chunk=chunk,
-                chunk_size=chunk_size,
-
-                **loaders_dict,
-
-                langchain_mode=langchain_mode,
-                langchain_action=langchain_action,
-                langchain_agents=langchain_agents,
-                document_subset=document_subset,
-                document_choice=document_choice,
-                top_k_docs=top_k_docs,
-                prompt_type=prompt_type,
-                prompt_dict=prompt_dict,
-                pre_prompt_query=pre_prompt_query,
-                prompt_query=prompt_query,
-                pre_prompt_summary=pre_prompt_summary,
-                prompt_summary=prompt_summary,
-                text_context_list=text_context_list,
-                chat_conversation=chat_conversation,
-                visible_models=visible_models,
-                h2ogpt_key=h2ogpt_key,
-                docs_ordering_type=docs_ordering_type,
-                min_max_new_tokens=min_max_new_tokens,
-
-                **gen_hyper_langchain,
-
-                db_type=db_type,
-                n_jobs=n_jobs,
-                verbose=verbose,
-                cli=cli,
-                sanitize_bot_response=sanitize_bot_response,
-
-                lora_weights=lora_weights,
-
-                auto_reduce_chunks=auto_reduce_chunks,
-                max_chunks=max_chunks,
-                total_tokens_for_docs=total_tokens_for_docs,
-                headsize=headsize,
-        ):
-            # does not accumulate; each yield carries a new full answer, so only save that full answer
-            response = r['response']
-            sources = r['sources']
-            prompt = r['prompt']
-            num_prompt_tokens = r['num_prompt_tokens']
-            yield dict(response=response, sources=sources, save_dict=dict())
-        if save_dir:
-            # estimate using tiktoken
-            extra_dict = gen_hyper_langchain.copy()
-            extra_dict.update(prompt_type=prompt_type,
-                              inference_server=inference_server,
-                              langchain_mode=langchain_mode,
-                              langchain_action=langchain_action,
-                              langchain_agents=langchain_agents,
-                              document_subset=document_subset,
-                              document_choice=document_choice,
-                              chat_conversation=chat_conversation,
-                              add_search_to_context=add_search_to_context,
-                              num_prompt_tokens=num_prompt_tokens,
-                              instruction=instruction,
-                              iinput=iinput,
-                              context=context,
-                              t_generate=time.time() - t_generate,
-                              ntokens=None,
-                              tokens_persecond=None,
-                              )
-            save_dict = dict(prompt=prompt,
-                             output=response, base_model=base_model, save_dir=save_dir,
-                             where_from='run_qa_db',
-                             extra_dict=extra_dict)
-            yield dict(response=response, sources=sources, save_dict=save_dict)
-            if verbose:
-                print(
-                    'Post-Generate Langchain: %s decoded_output: %s' %
-                    (str(datetime.now()), len(response) if response else -1),
-                    flush=True)
-        if response or sources or langchain_only_model:
-            # if got no response (e.g. not showing sources and got no sources,
-            # so nothing to give to LLM), then slip through and ask LLM
-            # Or if llama/gptj, then just return since they had no response and can't go down below code path
-            # don't clear torch cache here, delays multi-generation, and bot(), all_bot(), and evaluate_nochat() do it
-            return
-
-    # NOT LANGCHAIN PATH, raw LLM
-    # restrict instruction etc., typically whatever carries the large input
-    prompt, \
-        instruction, iinput, context, \
-        num_prompt_tokens, max_new_tokens, num_prompt_tokens0, num_prompt_tokens_actual, \
-        chat_index, top_k_docs_trial, one_doc_size = \
-        get_limited_prompt(instruction,
-                           iinput,
-                           tokenizer,
-                           prompter=prompter,
-                           inference_server=inference_server,
-                           # prompt_type=prompt_type,
-                           # prompt_dict=prompt_dict,
-                           # chat=chat,
-                           max_new_tokens=max_new_tokens,
-                           # system_prompt=system_prompt,
-                           context=context,
-                           chat_conversation=chat_conversation,
-                           keep_sources_in_context=keep_sources_in_context,
-                           model_max_length=model_max_length,
-                           memory_restriction_level=memory_restriction_level,
-                           langchain_mode=langchain_mode,
-                           add_chat_history_to_context=add_chat_history_to_context,
-                           min_max_new_tokens=min_max_new_tokens,
-                           )
-
-    if inference_server.startswith('vllm') or \
-            inference_server.startswith('openai') or \
-            inference_server.startswith('http'):
-        if inference_server.startswith('vllm') or inference_server.startswith('openai'):
-            assert not inference_server.startswith('openai_azure_chat'), "Not for Azure, use langchain path"
-            assert not inference_server.startswith('openai_azure'), "Not for Azure, use langchain path"
-            openai, inf_type, deployment_name, base_url, api_version = set_openai(inference_server)
-            where_from = inf_type
-
-            terminate_response = prompter.terminate_response or []
-            stop_sequences = list(set(terminate_response + [prompter.PreResponse]))
-            stop_sequences = [x for x in stop_sequences if x]
-            # OpenAI will complain if asked for too many new tokens; it treats the value as a minimum in some sense, wrongly so.
-            max_new_tokens_openai = min(max_new_tokens, model_max_length - num_prompt_tokens)
-            gen_server_kwargs = dict(temperature=temperature if do_sample else 0,
-                                     max_tokens=max_new_tokens_openai,
-                                     top_p=top_p if do_sample else 1,
-                                     frequency_penalty=0,
-                                     n=num_return_sequences,
-                                     presence_penalty=1.07 - repetition_penalty + 0.6,  # so good default
-                                     )
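-            # Illustrative note: with the default repetition_penalty=1.07, presence_penalty = 1.07 - 1.07 + 0.6 = 0.6,
-            # and max_new_tokens_openai caps new tokens at whatever room is left after the prompt.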
-            if inf_type == 'vllm' or inference_server == 'openai':
-                responses = openai.Completion.create(
-                    model=base_model,
-                    prompt=prompt,
-                    **gen_server_kwargs,
-                    stop=stop_sequences,
-                    stream=stream_output,
-                )
-                text = ''
-                sources = ''
-                response = ''
-                if not stream_output:
-                    text = responses['choices'][0]['text']
-                    response = prompter.get_response(prompt + text, prompt=prompt,
-                                                     sanitize_bot_response=sanitize_bot_response)
-                    yield dict(response=response, sources=sources, save_dict=dict())
-                else:
-                    collected_events = []
-                    for event in responses:
-                        collected_events.append(event)  # save the event response
-                        event_text = event['choices'][0]['text']  # extract the text
-                        text += event_text  # append the text
-                        response = prompter.get_response(prompt + text, prompt=prompt,
-                                                         sanitize_bot_response=sanitize_bot_response)
-                        yield dict(response=response, sources=sources, save_dict=dict())
-            elif inf_type == 'vllm_chat' or inference_server == 'openai_chat':
-                if inf_type == 'vllm_chat':
-                    raise NotImplementedError('%s not supported by vLLM' % inf_type)
-                if system_prompt in [None, 'None', 'auto']:
-                    openai_system_prompt = "You are a helpful assistant."
-                else:
-                    openai_system_prompt = system_prompt
-                messages0 = []
-                if openai_system_prompt:
-                    messages0.append({"role": "system", "content": openai_system_prompt})
-                messages0.append({'role': 'user', 'content': prompt})
-                responses = openai.ChatCompletion.create(
-                    model=base_model,
-                    messages=messages0,
-                    stream=stream_output,
-                    **gen_server_kwargs,
-                )
-                text = ""
-                sources = ''
-                response = ""
-                if not stream_output:
-                    text = responses["choices"][0]["message"]["content"]
-                    response = prompter.get_response(prompt + text, prompt=prompt,
-                                                     sanitize_bot_response=sanitize_bot_response)
-                    yield dict(response=response, sources=sources, save_dict=dict())
-                else:
-                    for chunk in responses:
-                        delta = chunk["choices"][0]["delta"]
-                        if 'content' in delta:
-                            text += delta['content']
-                            response = prompter.get_response(prompt + text, prompt=prompt,
-                                                             sanitize_bot_response=sanitize_bot_response)
-                            yield dict(response=response, sources=sources, save_dict=dict())
-            else:
-                raise RuntimeError("No such OpenAI mode: %s" % inference_server)
-        elif inference_server.startswith('http'):
-            inference_server, headers = get_hf_server(inference_server)
-            from gradio_utils.grclient import GradioClient
-            from text_generation import Client as HFClient
-            if isinstance(model, GradioClient):
-                gr_client = model
-                hf_client = None
-            elif isinstance(model, HFClient):
-                gr_client = None
-                hf_client = model
-            else:
-                inference_server, gr_client, hf_client = get_client_from_inference_server(inference_server,
-                                                                                          base_model=base_model)
-
-            # quick sanity check to avoid long timeouts, just see if can reach server
-            requests.get(inference_server, timeout=int(os.getenv('REQUEST_TIMEOUT_FAST', '10')))
-
-            if gr_client is not None:
-                # Note: the h2oGPT gradio server could handle input token size issues for the prompt,
-                # but it's best to handle them here so we send less data to the server
-
-                chat_client = False
-                where_from = "gr_client"
-                client_langchain_mode = 'Disabled'
-                client_add_chat_history_to_context = True
-                client_add_search_to_context = False
-                client_langchain_action = LangChainAction.QUERY.value
-                client_langchain_agents = []
-                gen_server_kwargs = dict(temperature=temperature,
-                                         top_p=top_p,
-                                         top_k=top_k,
-                                         num_beams=num_beams,
-                                         max_new_tokens=max_new_tokens,
-                                         min_new_tokens=min_new_tokens,
-                                         early_stopping=early_stopping,
-                                         max_time=max_time,
-                                         repetition_penalty=repetition_penalty,
-                                         num_return_sequences=num_return_sequences,
-                                         do_sample=do_sample,
-                                         chat=chat_client,
-                                         )
-                # account for gradio-into-gradio where the server handles prompting; avoid duplicating the prompter's prompt injection
-                if prompt_type in [None, '', PromptType.plain.name, PromptType.plain.value,
-                                   str(PromptType.plain.value)]:
-                    # if our prompt is plain, assume it is either correct or the gradio server knows a different prompt type,
-                    # so pass an empty prompt_type
-                    gr_prompt_type = ''
-                    gr_prompt_dict = ''
-                    gr_prompt = prompt  # already prepared prompt
-                    gr_context = ''
-                    gr_iinput = ''
-                else:
-                    # if prompt_type is already set and not plain, None, or '', then some prompting has already been applied.
-                    #  But assume the server can handle prompting, so avoid doubling up.
-                    #  Also assume the server does a better job of using stopping.py to stop early, so avoid local prompting and let the server handle it.
-                    #  So avoid "prompt" and let the gradio server reconstruct it from the prompt_type we pass.
-                    # Note it's ok that prompter.get_response() gets prompt+text with prompt=prompt passed,
-                    #  because that just means extra processing to remove the prompt; since it contains no human-bot prompting,
-                    #  those tokens won't appear in the output.
-                    gr_context = context
-                    gr_prompt = instruction
-                    gr_iinput = iinput
-                    gr_prompt_type = prompt_type
-                    gr_prompt_dict = prompt_dict
-                client_kwargs = dict(instruction=gr_prompt if chat_client else '',  # only for chat=True
-                                     iinput=gr_iinput,  # only for chat=True
-                                     context=gr_context,
-                                     # streaming output is supported, loops over and outputs each generation in streaming mode
-                                     # but leave stream_output=False for simple input/output mode
-                                     stream_output=stream_output,
-
-                                     **gen_server_kwargs,
-
-                                     prompt_type=gr_prompt_type,
-                                     prompt_dict=gr_prompt_dict,
-
-                                     instruction_nochat=gr_prompt if not chat_client else '',
-                                     iinput_nochat=gr_iinput,  # only for chat=False
-                                     langchain_mode=client_langchain_mode,
-                                     add_chat_history_to_context=client_add_chat_history_to_context,
-                                     langchain_action=client_langchain_action,
-                                     langchain_agents=client_langchain_agents,
-                                     top_k_docs=top_k_docs,
-                                     chunk=chunk,
-                                     chunk_size=chunk_size,
-                                     document_subset=DocumentSubset.Relevant.name,
-                                     document_choice=[DocumentChoice.ALL.value],
-                                     pre_prompt_query=pre_prompt_query,
-                                     prompt_query=prompt_query,
-                                     pre_prompt_summary=pre_prompt_summary,
-                                     prompt_summary=prompt_summary,
-                                     system_prompt=system_prompt,
-                                     image_loaders=image_loaders,
-                                     pdf_loaders=pdf_loaders,
-                                     url_loaders=url_loaders,
-                                     jq_schema=jq_schema,
-                                     visible_models=visible_models,
-                                     h2ogpt_key=h2ogpt_key,
-                                     add_search_to_context=client_add_search_to_context,
-                                     docs_ordering_type=None,
-                                     min_max_new_tokens=min_max_new_tokens,
-                                     )
-                api_name = '/submit_nochat_api'  # NOTE: like submit_nochat but stable API for string dict passing
-                response = ''
-                text = ''
-                sources = ''
-                if not stream_output:
-                    res = gr_client.predict(str(dict(client_kwargs)), api_name=api_name)
-                    res_dict = ast.literal_eval(res)
-                    text = res_dict['response']
-                    sources = res_dict['sources']
-                    response = prompter.get_response(prompt + text, prompt=prompt,
-                                                     sanitize_bot_response=sanitize_bot_response)
-                    yield dict(response=response, sources=sources, save_dict=dict())
-                else:
-                    job = gr_client.submit(str(dict(client_kwargs)), api_name=api_name)
-                    res_dict = dict(response=text, sources=sources, save_dict=dict())
-                    text0 = ''
-                    while not job.done():
-                        if job.communicator.job.latest_status.code.name == 'FINISHED':
-                            break
-                        e = job.future._exception
-                        if e is not None:
-                            break
-                        outputs_list = job.communicator.job.outputs
-                        if outputs_list:
-                            res = job.communicator.job.outputs[-1]
-                            res_dict = ast.literal_eval(res)
-                            text = res_dict['response']
-                            sources = res_dict['sources']
-                            if gr_prompt_type == 'plain':
-                                # then gradio server passes back full prompt + text
-                                prompt_and_text = text
-                            else:
-                                prompt_and_text = prompt + text
-                            response = prompter.get_response(prompt_and_text, prompt=prompt,
-                                                             sanitize_bot_response=sanitize_bot_response)
-                            text_chunk = response[len(text0):]
-                            if not text_chunk:
-                                continue
-                            # save old
-                            text0 = response
-                            yield dict(response=response, sources=sources, save_dict=dict())
-                        time.sleep(0.01)
-                    # ensure get last output to avoid race
-                    res_all = job.outputs()
-                    if len(res_all) > 0:
-                        res = res_all[-1]
-                        res_dict = ast.literal_eval(res)
-                        text = res_dict['response']
-                        sources = res_dict['sources']
-                    else:
-                        # go with old text if last call didn't work
-                        e = job.future._exception
-                        if e is not None:
-                            stre = str(e)
-                            strex = ''.join(traceback.format_tb(e.__traceback__))
-                        else:
-                            stre = ''
-                            strex = ''
-
-                        print("Bad final response: %s %s %s %s %s: %s %s" % (base_model, inference_server,
-                                                                             res_all, prompt, text, stre, strex),
-                              flush=True)
-                    if gr_prompt_type == 'plain':
-                        # then gradio server passes back full prompt + text
-                        prompt_and_text = text
-                    else:
-                        prompt_and_text = prompt + text
-                    response = prompter.get_response(prompt_and_text, prompt=prompt,
-                                                     sanitize_bot_response=sanitize_bot_response)
-                    yield dict(response=response, sources=sources, save_dict=dict())
-            elif hf_client:
-                # HF inference server needs control over input tokens
-                where_from = "hf_client"
-                response = ''
-                extra = ''
-                sources = ''
-
-                # prompt must include all human-bot like tokens, which have already been added to the prompt
-                # https://github.com/huggingface/text-generation-inference/tree/main/clients/python#types
-                terminate_response = prompter.terminate_response or []
-                stop_sequences = list(set(terminate_response + [prompter.PreResponse]))
-                stop_sequences = [x for x in stop_sequences if x]
-                gen_server_kwargs = dict(do_sample=do_sample,
-                                         max_new_tokens=max_new_tokens,
-                                         # best_of=None,
-                                         repetition_penalty=repetition_penalty,
-                                         return_full_text=False,
-                                         seed=SEED,
-                                         stop_sequences=stop_sequences,
-                                         temperature=temperature,
-                                         top_k=top_k,
-                                         top_p=top_p,
-                                         # truncate=False,  # behaves oddly
-                                         # typical_p=top_p,
-                                         # watermark=False,
-                                         # decoder_input_details=False,
-                                         )
-                # work-around for the timeout being fixed at constructor time, which is an issue when multi-threading,
-                # so just use something reasonable, or max_time if larger;
-                # a lower bound is needed because the client is re-used when multi-threading
-                hf_client.timeout = max(300, max_time)
-                if not stream_output:
-                    text = hf_client.generate(prompt, **gen_server_kwargs).generated_text
-                    response = prompter.get_response(prompt + text, prompt=prompt,
-                                                     sanitize_bot_response=sanitize_bot_response)
-                    yield dict(response=response, sources=sources, save_dict=dict())
-                else:
-                    text = ""
-                    for responses in hf_client.generate_stream(prompt, **gen_server_kwargs):
-                        if not responses.token.special:
-                            # stop_sequences
-                            text_chunk = responses.token.text
-                            text += text_chunk
-                            response = prompter.get_response(prompt + text, prompt=prompt,
-                                                             sanitize_bot_response=sanitize_bot_response)
-                            sources = ''
-                            yield dict(response=response, sources=sources, save_dict=dict())
-            else:
-                raise RuntimeError("Failed to get client: %s" % inference_server)
-        else:
-            raise RuntimeError("No such inference_server: %s" % inference_server)
-
-        if save_dir and text:
-            # save prompt + new text
-            extra_dict = gen_server_kwargs.copy()
-            extra_dict.update(dict(inference_server=inference_server, num_prompt_tokens=num_prompt_tokens,
-                                   t_generate=time.time() - t_generate,
-                                   ntokens=None,
-                                   tokens_persecond=None,
-                                   ))
-            save_dict = dict(prompt=prompt, output=text, base_model=base_model, save_dir=save_dir,
-                             where_from=where_from, extra_dict=extra_dict)
-            yield dict(response=response, sources=sources, save_dict=save_dict)
-        return
-    else:
-        assert not inference_server, "inference_server=%s not supported" % inference_server
-
-    if isinstance(tokenizer, str):
-        # pipeline
-        if tokenizer == "summarization":
-            key = 'summary_text'
-        else:
-            raise RuntimeError("No such task type %s" % tokenizer)
-        # NOTE: uses max_length only
-        sources = ''
-        yield dict(response=model(prompt, max_length=max_new_tokens)[0][key], sources=sources, save_dict=dict())
-
-    if 'mbart-' in base_model.lower():
-        assert src_lang is not None
-        tokenizer.src_lang = languages_covered()[src_lang]
-
-    stopping_criteria = get_stopping(prompt_type, prompt_dict, tokenizer, device, base_model,
-                                     model_max_length=model_max_length,
-                                     prompter=prompter)
-
-    inputs = tokenizer(prompt, return_tensors="pt")
-    if debug and len(inputs["input_ids"]) > 0:
-        print('input_ids length', len(inputs["input_ids"][0]), flush=True)
-    input_ids = inputs["input_ids"].to(device)
-    # CRITICAL LIMIT else will fail
-    max_max_tokens = tokenizer.model_max_length
-    max_input_tokens = max(0, int(max_max_tokens - min_new_tokens))
-    # NOTE: Don't limit up front due to max_new_tokens, let go up to max or reach max_max_tokens in stopping.py
-    assert isinstance(max_input_tokens, int), "Bad type for max_input_tokens=%s %s" % (
-        max_input_tokens, type(max_input_tokens))
-    input_ids = input_ids[:, -max_input_tokens:]
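-    # Illustrative arithmetic: for a hypothetical 2048-token model with min_new_tokens=256,
-    # max_input_tokens = 2048 - 256 = 1792, so only the last 1792 prompt tokens are kept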
-    # required for falcon if multiple threads or asyncio accesses to model during generation
-    if use_cache is None:
-        use_cache = False if 'falcon' in base_model else True
-    gen_config_kwargs = dict(num_beams=num_beams,
-                             do_sample=do_sample,
-                             repetition_penalty=float(repetition_penalty),
-                             num_return_sequences=num_return_sequences,
-                             renormalize_logits=True,
-                             remove_invalid_values=True,
-                             use_cache=use_cache,
-                             )
-    if do_sample:
-        gen_config_kwargs.update(dict(temperature=float(temperature),
-                                      top_p=float(top_p),
-                                      top_k=top_k))
-    # unclear impact, some odd things going on inside
-    # leads to:
-    # The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
-    # Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
-    # or leads to:
-    # Using cls_token, but it is not set yet.
-    # Using mask_token, but it is not set yet.
-    # Using pad_token, but it is not set yet.
-    # Using sep_token, but it is not set yet.
-    token_ids = ['eos_token_id', 'pad_token_id', 'bos_token_id', 'cls_token_id', 'sep_token_id']
-    for token_id in token_ids:
-        if hasattr(tokenizer, token_id) and getattr(tokenizer, token_id) is not None:
-            gen_config_kwargs.update({token_id: getattr(tokenizer, token_id)})
-    generation_config = GenerationConfig(**gen_config_kwargs)
-
-    gen_kwargs = dict(input_ids=input_ids,
-                      generation_config=generation_config,
-                      return_dict_in_generate=True,
-                      output_scores=True,
-                      max_new_tokens=max_new_tokens,  # prompt + new
-                      min_new_tokens=min_new_tokens,  # prompt + new
-                      early_stopping=early_stopping,  # False, True, "never"
-                      max_time=max_time,
-                      stopping_criteria=stopping_criteria,
-                      )
-    if 'gpt2' in base_model.lower():
-        gen_kwargs.update(dict(bos_token_id=tokenizer.bos_token_id, pad_token_id=tokenizer.eos_token_id))
-    elif 'mbart-' in base_model.lower():
-        assert tgt_lang is not None
-        tgt_lang = languages_covered()[tgt_lang]
-        gen_kwargs.update(dict(forced_bos_token_id=tokenizer.lang_code_to_id[tgt_lang]))
-    else:
-        token_ids = ['eos_token_id', 'bos_token_id', 'pad_token_id']
-        for token_id in token_ids:
-            if hasattr(tokenizer, token_id) and getattr(tokenizer, token_id) is not None:
-                gen_kwargs.update({token_id: getattr(tokenizer, token_id)})
-
-    decoder_kwargs = dict(skip_special_tokens=True,
-                          clean_up_tokenization_spaces=True)
-
-    decoder = functools.partial(tokenizer.decode,
-                                **decoder_kwargs
-                                )
-    with torch.no_grad():
-        have_lora_weights = lora_weights not in [no_lora_str, '', None]
-        context_class_cast = NullContext if device == 'cpu' or have_lora_weights or device == 'mps' else torch.autocast
-        if t5_type(base_model):
-            # issues when casting to float16, can mess up t5 model, e.g. only when not streaming, or other odd behaviors
-            context_class_cast = NullContext
-        with context_class_cast(device):
-            # protection for gradio not keeping track of closed users,
-            # else hit bitsandbytes lack of thread safety:
-            # https://github.com/h2oai/h2ogpt/issues/104
-            # but only makes sense if concurrency_count == 1
-            context_class = NullContext  # if concurrency_count > 1 else filelock.FileLock
-            if verbose:
-                print('Pre-Generate: %s' % str(datetime.now()), flush=True)
-            decoded_output = None
-            response = ''
-            with context_class("generate.lock"):
-                if verbose:
-                    print('Generate: %s' % str(datetime.now()), flush=True)
-                always_use_streaming_method = True  # to deal with complex parsing of prompt vs. generation due to odd tokenizing
-                if stream_output or always_use_streaming_method:
-                    skip_prompt = True  # True means first output excludes prompt
-                    streamer = H2OTextIteratorStreamer(tokenizer, skip_prompt=skip_prompt, block=False,
-                                                       **decoder_kwargs)
-                    gen_kwargs.update(dict(streamer=streamer))
-                    target = wrapped_partial(generate_with_exceptions, model.generate,
-                                             raise_generate_gpu_exceptions=raise_generate_gpu_exceptions,
-                                             **gen_kwargs)
-                    bucket = queue.Queue()
-                    thread = EThread(target=target, streamer=streamer, bucket=bucket)
-                    thread.start()
-                    ret = dict(response='', sources='', save_dict=dict())
-                    outputs = ""
-                    sources = ''
-                    try:
-                        for new_text in streamer:
-                            if bucket.qsize() > 0 or thread.exc:
-                                thread.join()
-                            outputs += new_text
-                            response = prompter.get_response(outputs, prompt=None,
-                                                             only_new_text=True,
-                                                             sanitize_bot_response=sanitize_bot_response)
-                            ret = dict(response=response, sources=sources, save_dict=dict())
-                            if stream_output:
-                                yield ret
-                        if not stream_output:
-                            yield ret
-                    except BaseException:
-                        # if any exception, raise that exception if was from thread, first
-                        if thread.exc:
-                            raise thread.exc
-                        raise
-                    finally:
-                        # don't clear torch cache here, delays multi-generation, and bot(), all_bot(), and evaluate_nochat() do it
-                        # in case no exception and didn't join with thread yet, then join
-                        if not thread.exc:
-                            thread.join()
-                    # in case raise StopIteration or broke queue loop in streamer, but still have exception
-                    if thread.exc:
-                        raise thread.exc
-                    decoded_output = outputs
-                    ntokens = len(outputs) // 4  # hack for now
-                else:
-                    # the length-based removal below doesn't work in general, because the encoding does not match the model's internal generation
-                    input_ids_len = gen_kwargs['input_ids'][0].shape[0]
-                    try:
-                        outputs = model.generate(**gen_kwargs)
-                    finally:
-                        pass
-                        # don't clear torch cache here, delays multi-generation, and bot(), all_bot(), and evaluate_nochat() do it
-                    # skip first IDs
-                    ntokens = sum([len(s) - input_ids_len for s in outputs.sequences]) if save_dir else -1
-                    outputs = [decoder(s[input_ids_len:]) for s in outputs.sequences]
-                    sources = ''
-                    response = prompter.get_response(outputs, prompt=None,
-                                                     only_new_text=True,
-                                                     sanitize_bot_response=sanitize_bot_response)
-                    yield dict(response=response, sources=sources, save_dict=dict())
-                    if outputs and len(outputs) >= 1:
-                        decoded_output = prompt + outputs[0]
-                if save_dir and decoded_output:
-                    extra_dict = gen_config_kwargs.copy()
-                    extra_dict.update(dict(num_prompt_tokens=num_prompt_tokens,
-                                           t_generate=time.time() - t_generate,
-                                           ntokens=ntokens,
-                                           tokens_persecond=ntokens / (time.time() - t_generate),
-                                           ))
-                    save_dict = dict(prompt=prompt, output=decoded_output, base_model=base_model, save_dir=save_dir,
-                                     where_from="evaluate_%s" % str(stream_output),
-                                     extra_dict=extra_dict)
-                    yield dict(response=response, sources=sources, save_dict=save_dict)
-            if verbose:
-                print('Post-Generate: %s decoded_output: %s' % (
-                    str(datetime.now()), len(decoded_output) if decoded_output else -1), flush=True)
-
-
-inputs_list_names = list(inspect.signature(evaluate).parameters)
-state_names = input_args_list.copy()  # doesn't have to be the same, but state_names must match evaluate() and how it is filled
-inputs_kwargs_list = [x for x in inputs_list_names if x not in eval_func_param_names + state_names]
-
-
-def get_cutoffs(memory_restriction_level, for_context=False, model_max_length=2048):
-    # help to avoid errors like:
-    # RuntimeError: The size of tensor a (2048) must match the size of tensor b (2049) at non-singleton dimension 3
-    # RuntimeError: expected scalar type Half but found Float
-    # with - 256
-    if memory_restriction_level > 0:
-        max_length_tokenize = 768 - 256 if memory_restriction_level <= 2 else 512 - 256
-    else:
-        # at least give room for 1 paragraph output
-        max_length_tokenize = model_max_length - 256
-    cutoff_len = max_length_tokenize * 4  # if reaches limit, then can't generate new tokens
-    output_smallest = 30 * 4
-    max_prompt_length = cutoff_len - output_smallest
-
-    if for_context:
-        # then lower even more to avoid a later chop, since we only estimate tokens in the context for the bot
-        max_prompt_length = max(64, int(max_prompt_length * 0.8))
-
-    return cutoff_len, output_smallest, max_length_tokenize, max_prompt_length
-
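-
-def _example_get_cutoffs():
-    # Hypothetical, uncalled helper (illustration only, not part of the original API):
-    # for memory_restriction_level=0 and a 2048-token model, max_length_tokenize = 2048 - 256 = 1792,
-    # cutoff_len = 1792 * 4 = 7168 characters, output_smallest = 120, max_prompt_length = 7168 - 120 = 7048.
-    return get_cutoffs(0, for_context=False, model_max_length=2048)
-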
-
-class H2OTextIteratorStreamer(TextIteratorStreamer):
-    """
-    Normally a timeout is required to handle exceptions, since get() would otherwise block.
-    With this H2O version of TextIteratorStreamer, we instead loop over a non-blocking get() to handle them.
-    """
-
-    def __init__(self, tokenizer, skip_prompt: bool = False, timeout: typing.Optional[float] = None,
-                 block=True, **decode_kwargs):
-        super().__init__(tokenizer, skip_prompt, **decode_kwargs)
-        self.text_queue = queue.Queue()
-        self.stop_signal = None
-        self.do_stop = False
-        self.timeout = timeout
-        self.block = block
-
-    def on_finalized_text(self, text: str, stream_end: bool = False):
-        """Put the new text in the queue. If the stream is ending, also put a stop signal in the queue."""
-        self.text_queue.put(text, timeout=self.timeout)
-        if stream_end:
-            self.text_queue.put(self.stop_signal, timeout=self.timeout)
-
-    def __iter__(self):
-        return self
-
-    def __next__(self):
-        while True:
-            try:
-                value = self.stop_signal  # value looks unused in PyCharm, but it is used below
-                if self.do_stop:
-                    print("hit stop", flush=True)
-                    # could raise or break; probably best to raise so the parent sees any exception in the thread
-                    self.clear_queue()
-                    self.do_stop = False
-                    raise StopIteration()
-                    # break
-                value = self.text_queue.get(block=self.block, timeout=self.timeout)
-                break
-            except queue.Empty:
-                time.sleep(0.01)
-        if value == self.stop_signal:
-            self.clear_queue()
-            self.do_stop = False
-            raise StopIteration()
-        else:
-            return value
-
-    def clear_queue(self):
-        # make sure streamer is reusable after stop hit
-        with self.text_queue.mutex:
-            self.text_queue.queue.clear()
-
-    def put(self, value):
-        """
-        Receives tokens, decodes them, and forwards entire words to the queue as soon as they are complete.
-        Same as the base class, except it removes the general text.rfind(" ") hack that ruins LLaMa2.
-        """
-        if len(value.shape) > 1 and value.shape[0] > 1:
-            raise ValueError("TextStreamer only supports batch size 1")
-        elif len(value.shape) > 1:
-            value = value[0]
-
-        if self.skip_prompt and self.next_tokens_are_prompt:
-            self.next_tokens_are_prompt = False
-            return
-
-        # Add the new token to the cache and decodes the entire thing.
-        self.token_cache.extend(value.tolist())
-        text = self.tokenizer.decode(self.token_cache, **self.decode_kwargs)
-
-        # After the symbol for a new line, we flush the cache.
-        if text.endswith("\n"):
-            printable_text = text[self.print_len:]
-            self.token_cache = []
-            self.print_len = 0
-        # If the last token is a CJK character, we print the characters.
-        elif len(text) > 0 and self._is_chinese_char(ord(text[-1])):
-            printable_text = text[self.print_len:]
-            self.print_len += len(printable_text)
-        # If the text ends with an incomplete unicode character (�), print only up to the last space
-        # (simple heuristic to avoid emitting a partial token, which may change with the subsequent token).
-        elif len(text) > 0 and text[-1] == '�':
-            printable_text = text[self.print_len: text.rfind(" ") + 1]
-            self.print_len += len(printable_text)
-        else:
-            printable_text = text[self.print_len:]
-            self.print_len += len(printable_text)
-
-        self.on_finalized_text(printable_text)
-
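-
-def _example_streamer_usage(model, tokenizer, input_ids):
-    # Hypothetical, uncalled sketch (arguments are illustrative): run generate() on a thread with the
-    # streamer attached, then iterate the streamer to consume decoded text chunks, similar to the
-    # streaming branch of evaluate() above (which additionally uses EThread and an exception bucket).
-    import threading
-    streamer = H2OTextIteratorStreamer(tokenizer, skip_prompt=True, block=False)
-    thread = threading.Thread(target=model.generate, kwargs=dict(input_ids=input_ids, streamer=streamer))
-    thread.start()
-    text = ''
-    for new_text in streamer:
-        text += new_text
-    thread.join()
-    return text
-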
-
-def generate_with_exceptions(func, *args, raise_generate_gpu_exceptions=True, **kwargs):
-    try:
-        func(*args, **kwargs)
-    except torch.cuda.OutOfMemoryError as e:
-        print("GPU OOM 2: exception: %s" % str(e),
-              flush=True)
-        if 'input_ids' in kwargs:
-            if kwargs['input_ids'] is not None:
-                kwargs['input_ids'].cpu()
-            kwargs['input_ids'] = None
-        traceback.print_exc()
-        clear_torch_cache()
-        return
-    except (Exception, RuntimeError) as e:
-        if 'Expected all tensors to be on the same device' in str(e) or \
-                'expected scalar type Half but found Float' in str(e) or \
-                'probability tensor contains either' in str(e) or \
-                'cublasLt ran into an error!' in str(e) or \
-                'mat1 and mat2 shapes cannot be multiplied' in str(e):
-            print(
-                "GPU Error: exception: %s" % str(e),
-                flush=True)
-            traceback.print_exc()
-            clear_torch_cache()
-            if raise_generate_gpu_exceptions:
-                raise
-            return
-        else:
-            clear_torch_cache()
-            if raise_generate_gpu_exceptions:
-                raise
-
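-
-def _example_generate_with_exceptions(model, gen_kwargs):
-    # Hypothetical, uncalled sketch: wrap model.generate so GPU OOM and related CUDA errors are caught
-    # and the torch cache is cleared, matching how the streaming thread target is built in evaluate().
-    return wrapped_partial(generate_with_exceptions, model.generate,
-                           raise_generate_gpu_exceptions=False, **gen_kwargs)
-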
-
-def get_generate_params(model_lower,
-                        chat,
-                        stream_output, show_examples,
-                        prompt_type, prompt_dict,
-                        system_prompt,
-                        pre_prompt_query, prompt_query,
-                        pre_prompt_summary, prompt_summary,
-                        temperature, top_p, top_k, num_beams,
-                        max_new_tokens, min_new_tokens, early_stopping, max_time,
-                        repetition_penalty, num_return_sequences,
-                        do_sample,
-                        top_k_docs, chunk, chunk_size,
-                        image_loaders,
-                        pdf_loaders,
-                        url_loaders,
-                        jq_schema,
-                        docs_ordering_type,
-                        min_max_new_tokens,
-                        verbose,
-                        ):
-    use_defaults = False
-    use_default_examples = True
-    examples = []
-    task_info = 'LLM'
-    if model_lower:
-        print(f"Using Model {model_lower}", flush=True)
-    else:
-        if verbose:
-            print("No model defined yet", flush=True)
-
-    min_new_tokens = min_new_tokens if min_new_tokens is not None else 0
-    early_stopping = early_stopping if early_stopping is not None else False
-    max_time_defaults = 60 * 3
-    max_time = max_time if max_time is not None else max_time_defaults
-
-    if not prompt_type and model_lower in inv_prompt_type_to_model_lower and prompt_type != 'custom':
-        prompt_type = inv_prompt_type_to_model_lower[model_lower]
-        if verbose:
-            print("Auto-selecting prompt_type=%s for %s" % (prompt_type, model_lower), flush=True)
-
-    # examples at first don't include chat, instruction_nochat, iinput_nochat, added at end
-    if show_examples is None:
-        if chat:
-            show_examples = False
-        else:
-            show_examples = True
-
-    summarize_example1 = """Jeff: Can I train a 🤗 Transformers model on Amazon SageMaker?
-Philipp: Sure you can use the new Hugging Face Deep Learning Container.
-Jeff: ok.
-Jeff: and how can I get started?
-Jeff: where can I find documentation?
-Philipp: ok, ok you can find everything here. https://huggingface.co/blog/the-partnership-amazon-sagemaker-and-hugging-face"""
-
-    use_placeholder_instruction_as_example = False
-    if 'bart-large-cnn-samsum' in model_lower or 'flan-t5-base-samsum' in model_lower:
-        placeholder_instruction = summarize_example1
-        placeholder_input = ""
-        use_defaults = True
-        use_default_examples = False
-        use_placeholder_instruction_as_example = True
-        task_info = "Summarization"
-    elif 't5-' in model_lower or 't5' == model_lower or 'flan-' in model_lower:
-        placeholder_instruction = "The square root of x is the cube root of y. What is y to the power of 2, if x = 4?"
-        placeholder_input = ""
-        use_defaults = True
-        use_default_examples = True
-        task_info = "Multi-Task: Q/A, translation, Chain-of-Thought, Logical Reasoning, Summarization, etc.  Best to use task prefix as trained on, e.g. `translate English to German: ` (space after colon)"
-    elif 'mbart-' in model_lower:
-        placeholder_instruction = "The girl has long hair."
-        placeholder_input = ""
-        use_defaults = True
-        use_default_examples = False
-        use_placeholder_instruction_as_example = True
-    elif 'gpt2' in model_lower:
-        placeholder_instruction = "The sky is"
-        placeholder_input = ""
-        prompt_type = prompt_type or 'plain'
-        use_default_examples = True  # some will be odd "continuations" but can be ok
-        use_placeholder_instruction_as_example = True
-        task_info = "Auto-complete phrase, code, etc."
-        use_defaults = True
-    else:
-        if chat:
-            placeholder_instruction = ""
-        else:
-            placeholder_instruction = "Give detailed answer for whether Einstein or Newton is smarter."
-        placeholder_input = ""
-        if not prompt_type and model_lower in inv_prompt_type_to_model_lower and prompt_type != 'custom':
-            prompt_type = inv_prompt_type_to_model_lower[model_lower]
-        elif model_lower:
-            # default is plain, because might rely upon trust_remote_code to handle prompting
-            prompt_type = prompt_type or 'plain'
-        else:
-            prompt_type = ''
-        task_info = "No task"
-        if prompt_type == 'instruct':
-            task_info = "Answer question or follow imperative as instruction with optionally input."
-        elif prompt_type == 'plain':
-            task_info = "Auto-complete phrase, code, etc."
-        elif prompt_type == 'human_bot':
-            if chat:
-                task_info = "Chat (Shift-Enter to give question/imperative, input concatenated with instruction)"
-            else:
-                task_info = "Ask question/imperative (input concatenated with instruction)"
-
-    # revert to plain if still nothing
-    prompt_type = prompt_type or 'plain'
-    if use_defaults:
-        temperature = 1.0 if temperature is None else temperature
-        top_p = 1.0 if top_p is None else top_p
-        top_k = 40 if top_k is None else top_k
-        num_beams = num_beams or 1
-        max_new_tokens = max_new_tokens or 512
-        repetition_penalty = repetition_penalty or 1.07
-        num_return_sequences = min(num_beams, num_return_sequences or 1)
-        do_sample = False if do_sample is None else do_sample
-    else:
-        temperature = 0.1 if temperature is None else temperature
-        top_p = 0.75 if top_p is None else top_p
-        top_k = 40 if top_k is None else top_k
-        num_beams = num_beams or 1
-        max_new_tokens = max_new_tokens or 1024
-        repetition_penalty = repetition_penalty or 1.07
-        num_return_sequences = min(num_beams, num_return_sequences or 1)
-        do_sample = False if do_sample is None else do_sample
-    # doesn't include chat, instruction_nochat, iinput_nochat, added later
-    params_list = ["",
-                   stream_output,
-                   prompt_type, prompt_dict,
-                   temperature, top_p, top_k, num_beams,
-                   max_new_tokens, min_new_tokens,
-                   early_stopping, max_time, repetition_penalty, num_return_sequences, do_sample]
-
-    if use_placeholder_instruction_as_example:
-        examples += [[placeholder_instruction, ''] + params_list]
-
-    if use_default_examples:
-        examples += [
-            ["Translate English to French", "Good morning"] + params_list,
-            ["Give detailed answer for whether Einstein or Newton is smarter.", ''] + params_list,
-            ["Explain in detailed list, all the best practices for coding in python.", ''] + params_list,
-            [
-                "Create a markdown table with 3 rows for the primary colors, and 2 columns, with color name and hex codes.",
-                ''] + params_list,
-            ['Translate to German:  My name is Arthur', ''] + params_list,
-            ["Please answer to the following question. Who is going to be the next Ballon d'or?", ''] + params_list,
-            ['Can Geoffrey Hinton have a conversation with George Washington? Give the rationale before answering.',
-             ''] + params_list,
-            ['Please answer the following question. What is the boiling point of Nitrogen?', ''] + params_list,
-            ['Answer the following yes/no question. Can you write a whole Haiku in a single tweet?', ''] + params_list,
-            ["Simplify the following expression: (False or False and True). Explain your answer.", ''] + params_list,
-            [
-                "Premise: At my age you will probably have learnt one lesson. Hypothesis:  It's not certain how many lessons you'll learn by your thirties. Does the premise entail the hypothesis?",
-                ''] + params_list,
-            ['The square root of x is the cube root of y. What is y to the power of 2, if x = 4?', ''] + params_list,
-            [
-                'Answer the following question by reasoning step by step.  The cafeteria had 23 apples. If they used 20 for lunch, and bought 6 more, how many apples do they have?',
-                ''] + params_list,
-            ["""def area_of_rectangle(a: float, b: float):
-    \"\"\"Return the area of the rectangle.\"\"\"""", ''] + params_list,
-            ["""# a function in native python:
-def mean(a):
-    return sum(a)/len(a)
-
-# the same function using numpy:
-import numpy as np
-def mean(a):""", ''] + params_list,
-            ["""X = np.random.randn(100, 100)
-y = np.random.randint(0, 1, 100)
-
-# fit random forest classifier with 20 estimators""", ''] + params_list,
-        ]
-    # add summary example
-    examples += [
-        [summarize_example1, 'Summarize' if prompt_type not in ['plain', 'instruct_simple'] else ''] + params_list]
-
-    src_lang = "English"
-    tgt_lang = "Russian"
-
-    # append the remaining eval parameters so each example row matches the eval_func_param_names ordering
-    for example in examples:
-        example += [chat, '', '', LangChainMode.DISABLED.value, True,
-                    LangChainAction.QUERY.value, [],
-                    top_k_docs, chunk, chunk_size, DocumentSubset.Relevant.name, [],
-                    pre_prompt_query, prompt_query,
-                    pre_prompt_summary, prompt_summary,
-                    system_prompt,
-                    image_loaders,
-                    pdf_loaders,
-                    url_loaders,
-                    jq_schema,
-                    None,
-                    None,
-                    False,
-                    None,
-                    None,
-                    docs_ordering_type,
-                    min_max_new_tokens,
-                    ]
-        # adjust examples if non-chat mode
-        if not chat:
-            example[eval_func_param_names.index('instruction_nochat')] = example[
-                eval_func_param_names.index('instruction')]
-            example[eval_func_param_names.index('instruction')] = ''
-
-            example[eval_func_param_names.index('iinput_nochat')] = example[eval_func_param_names.index('iinput')]
-            example[eval_func_param_names.index('iinput')] = ''
-        assert len(example) == len(eval_func_param_names), "Wrong example: %s %s" % (
-            len(example), len(eval_func_param_names))
-
-    if prompt_type == PromptType.custom.name and not prompt_dict:
-        raise ValueError("Unexpected to get non-empty prompt_dict=%s for prompt_type=%s" % (prompt_dict, prompt_type))
-
-    # get prompt_dict from prompt_type, so user can see in UI etc., or for custom do nothing except check format
-    prompt_dict, error0 = get_prompt(prompt_type, prompt_dict,
-                                     chat=False, context='', reduced=False, making_context=False, return_dict=True,
-                                     system_prompt=system_prompt)
-    if error0:
-        raise RuntimeError("Prompt wrong: %s" % error0)
-
-    return placeholder_instruction, placeholder_input, \
-        stream_output, show_examples, \
-        prompt_type, prompt_dict, \
-        temperature, top_p, top_k, num_beams, \
-        max_new_tokens, min_new_tokens, early_stopping, max_time, \
-        repetition_penalty, num_return_sequences, \
-        do_sample, \
-        src_lang, tgt_lang, \
-        examples, \
-        task_info
-
-
-def languages_covered():
-    # https://huggingface.co/facebook/mbart-large-50-many-to-many-mmt#languages-covered
-    covered = """Arabic (ar_AR), Czech (cs_CZ), German (de_DE), English (en_XX), Spanish (es_XX), Estonian (et_EE), Finnish (fi_FI), French (fr_XX), Gujarati (gu_IN), Hindi (hi_IN), Italian (it_IT), Japanese (ja_XX), Kazakh (kk_KZ), Korean (ko_KR), Lithuanian (lt_LT), Latvian (lv_LV), Burmese (my_MM), Nepali (ne_NP), Dutch (nl_XX), Romanian (ro_RO), Russian (ru_RU), Sinhala (si_LK), Turkish (tr_TR), Vietnamese (vi_VN), Chinese (zh_CN), Afrikaans (af_ZA), Azerbaijani (az_AZ), Bengali (bn_IN), Persian (fa_IR), Hebrew (he_IL), Croatian (hr_HR), Indonesian (id_ID), Georgian (ka_GE), Khmer (km_KH), Macedonian (mk_MK), Malayalam (ml_IN), Mongolian (mn_MN), Marathi (mr_IN), Polish (pl_PL), Pashto (ps_AF), Portuguese (pt_XX), Swedish (sv_SE), Swahili (sw_KE), Tamil (ta_IN), Telugu (te_IN), Thai (th_TH), Tagalog (tl_XX), Ukrainian (uk_UA), Urdu (ur_PK), Xhosa (xh_ZA), Galician (gl_ES), Slovene (sl_SI)"""
-    covered = covered.split(', ')
-    covered = {x.split(' ')[0]: x.split(' ')[1].replace(')', '').replace('(', '') for x in covered}
-    return covered
-
-
-def score_qa(smodel, stokenizer, max_length_tokenize, question, answer, cutoff_len):
-    question = question[-cutoff_len:]
-    answer = answer[-cutoff_len:]
-
-    inputs = stokenizer(question, answer,
-                        return_tensors="pt",
-                        truncation=True,
-                        max_length=max_length_tokenize).to(smodel.device)
-    try:
-        score = torch.sigmoid(smodel(**inputs.to(smodel.device)).logits[0].float()).cpu().detach().numpy()[0]
-    except torch.cuda.OutOfMemoryError as e:
-        print("GPU OOM 3: question: %s answer: %s exception: %s" % (question, answer, str(e)), flush=True)
-        del inputs
-        traceback.print_exc()
-        clear_torch_cache()
-        return 'Response Score: GPU OOM'
-    except (Exception, RuntimeError) as e:
-        if 'Expected all tensors to be on the same device' in str(e) or \
-                'expected scalar type Half but found Float' in str(e) or \
-                'probability tensor contains either' in str(e) or \
-                'cublasLt ran into an error!' in str(e) or \
-                'device-side assert triggered' in str(e):
-            print("GPU Error: question: %s answer: %s exception: %s" % (question, answer, str(e)),
-                  flush=True)
-            traceback.print_exc()
-            clear_torch_cache()
-            return 'Response Score: GPU Error'
-        else:
-            raise
-    os.environ['TOKENIZERS_PARALLELISM'] = 'true'
-    return score
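-
-
-# Illustrative sketch only, not part of the original file: intended use of
-# score_qa with a sequence-classification reward model.  The model name below
-# is one plausible choice (an assumption), not something this module requires.
-def _example_score_qa():
-    from transformers import AutoModelForSequenceClassification, AutoTokenizer
-    reward_name = "OpenAssistant/reward-model-deberta-v3-large-v2"  # hypothetical choice
-    smodel = AutoModelForSequenceClassification.from_pretrained(reward_name)
-    stokenizer = AutoTokenizer.from_pretrained(reward_name)
-    # returns a float in (0, 1) on success, or a 'Response Score: ...' string on GPU errors
-    return score_qa(smodel, stokenizer, 512,
-                    question="What is 2 + 2?", answer="2 + 2 = 4.", cutoff_len=512)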
-
-
-def check_locals(**kwargs):
-    # ensure everything in evaluate is here
-    can_skip_because_locally_generated = no_default_param_names + [
-        # get_model:
-        'reward_type'
-    ]
-    for k in eval_func_param_names:
-        if k in can_skip_because_locally_generated:
-            continue
-        assert k in kwargs, "Missing %s" % k
-    for k in inputs_kwargs_list:
-        if k in can_skip_because_locally_generated:
-            continue
-        assert k in kwargs, "Missing %s" % k
-
-    for k in list(inspect.signature(get_model).parameters):
-        if k in can_skip_because_locally_generated:
-            continue
-        assert k in kwargs, "Missing %s" % k
-
-
-def get_model_max_length(model_state):
-    if not isinstance(model_state['tokenizer'], (str, type(None))):
-        return model_state['tokenizer'].model_max_length
-    else:
-        return 2048
-
-
-def get_max_max_new_tokens(model_state, **kwargs):
-    if not isinstance(model_state['tokenizer'], (str, type(None))):
-        max_max_new_tokens = model_state['tokenizer'].model_max_length
-    else:
-        max_max_new_tokens = None
-
-    if kwargs['max_max_new_tokens'] is not None and max_max_new_tokens is not None:
-        return min(max_max_new_tokens, kwargs['max_max_new_tokens'])
-    elif kwargs['max_max_new_tokens'] is not None:
-        return kwargs['max_max_new_tokens']
-    elif kwargs['memory_restriction_level'] == 1:
-        return 768
-    elif kwargs['memory_restriction_level'] == 2:
-        return 512
-    elif kwargs['memory_restriction_level'] >= 3:
-        return 256
-    else:
-        # FIXME: Need to update after new model loaded, so user can control with slider
-        return 2048
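-
-
-# Illustrative note, not part of the original file (hypothetical values): for a
-# tokenizer with model_max_length=4096 and kwargs['max_max_new_tokens']=1024,
-# get_max_max_new_tokens() returns min(4096, 1024) = 1024; with no tokenizer,
-# max_max_new_tokens=None and memory_restriction_level=2, it returns 512.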
-
-
-def get_minmax_top_k_docs(is_public):
-    if is_public:
-        min_top_k_docs = 1
-        max_top_k_docs = 8
-        label_top_k_docs = "Number of document chunks"
-    else:
-        min_top_k_docs = -1
-        max_top_k_docs = 100
-        label_top_k_docs = "Number of document chunks (-1 = auto fill model context)"
-    return min_top_k_docs, max_top_k_docs, label_top_k_docs
-
-
-def merge_chat_conversation_history(chat_conversation1, history):
-    # chat_conversation and history ordered so largest index of list is most recent
-    if chat_conversation1:
-        chat_conversation1 = str_to_list(chat_conversation1)
-        for conv1 in chat_conversation1:
-            assert isinstance(conv1, (list, tuple))
-            assert len(conv1) == 2
-
-    if isinstance(history, list):
-        # make copy so only local change
-        if chat_conversation1:
-            # priority goes to the newest turns, i.e. the actual UI chat history, then chat_conversation
-            history = chat_conversation1 + history.copy()
-    elif chat_conversation1:
-        history = chat_conversation1
-    else:
-        history = []
-    return history
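-
-
-# Illustrative sketch only, not part of the original file: shows the expected
-# ordering from merge_chat_conversation_history when a stringified
-# chat_conversation is combined with UI history.  The values are hypothetical.
-def _example_merge_chat_conversation_history():
-    prior_turns = '[["earlier question", "earlier answer"]]'  # stringified chat_conversation
-    ui_history = [["newer question", "newer answer"]]  # from the UI, most recent last
-    merged = merge_chat_conversation_history(prior_turns, ui_history)
-    # expected: oldest entries first, UI turns (most recent) last
-    return merged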
-
-
-def history_to_context(history, langchain_mode=None,
-                       add_chat_history_to_context=None,
-                       prompt_type=None, prompt_dict=None, chat=None, model_max_length=None,
-                       memory_restriction_level=None, keep_sources_in_context=None,
-                       system_prompt=None, chat_conversation=None):
-    """
-    consumes all history up to (but not including) latest history item that is presumed to be an [instruction, None] pair
-    :param history:
-    :param langchain_mode:
-    :param add_chat_history_to_context:
-    :param prompt_type:
-    :param prompt_dict:
-    :param chat:
-    :param model_max_length:
-    :param memory_restriction_level:
-    :param keep_sources_in_context:
-    :param system_prompt:
-    :param chat_conversation:
-    :return:
-    """
-    history = merge_chat_conversation_history(chat_conversation, history)
-
-    if len(history) >= 1 and len(history[-1]) >= 2 and not history[-1][1]:
-        len_history = len(history) - 1
-    else:
-        # full history
-        len_history = len(history)
-
-    # get model-specific cutoff so the prompt length is tailored to this model
-    _, _, _, max_prompt_length = get_cutoffs(memory_restriction_level,
-                                             for_context=True, model_max_length=model_max_length)
-    context1 = ''
-    if max_prompt_length is not None and add_chat_history_to_context:
-        context1 = ''
-        # the current instruction is already in history from user(), so it was excluded via len_history above
-        for histi in range(0, len_history):
-            data_point = dict(instruction=history[histi][0], input='', output=history[histi][1])
-            prompt, pre_response, terminate_response, chat_sep, chat_turn_sep = \
-                generate_prompt(data_point,
-                                prompt_type,
-                                prompt_dict,
-                                chat,
-                                reduced=True,
-                                making_context=True,
-                                system_prompt=system_prompt,
-                                histi=histi)
-            # convert markdown back to plain text; maybe not important if the model is trained well enough
-            if not keep_sources_in_context and langchain_mode != 'Disabled' and prompt.find(super_source_prefix) >= 0:
-                # FIXME: This is relatively slow even for small amount of text, like 0.3s each history item
-                import re
-                prompt = re.sub(f'{re.escape(super_source_prefix)}.*?{re.escape(super_source_postfix)}', '', prompt,
-                                flags=re.DOTALL)
-                if prompt.endswith('\n<p>'):
-                    prompt = prompt[:-4]
-            prompt = prompt.replace('<br>', chat_turn_sep)
-            if not prompt.endswith(chat_turn_sep):
-                prompt += chat_turn_sep
-            # most recent first, add older if can
-            # only include desired chat history
-            if len(prompt + context1) > max_prompt_length:
-                break
-            context1 += prompt
-
-        _, pre_response, terminate_response, chat_sep, chat_turn_sep = \
-            generate_prompt({}, prompt_type, prompt_dict,
-                            chat, reduced=True,
-                            making_context=True,
-                            system_prompt=system_prompt,
-                            histi=-1)
-        if context1 and not context1.endswith(chat_turn_sep):
-            context1 += chat_turn_sep  # ensure if terminates abruptly, then human continues on next line
-    return context1
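-
-
-# Illustrative sketch only, not part of the original file: a minimal call to
-# history_to_context turning prior turns into a prompt prefix.  The prompt_type
-# and model_max_length chosen here are hypothetical.
-def _example_history_to_context():
-    history = [["What is the capital of France?", "Paris."],
-               ["And of Germany?", None]]  # latest turn not yet answered, so it is excluded
-    return history_to_context(history,
-                              langchain_mode='Disabled',
-                              add_chat_history_to_context=True,
-                              prompt_type='human_bot', prompt_dict=None,
-                              chat=True, model_max_length=2048,
-                              memory_restriction_level=0,
-                              keep_sources_in_context=False,
-                              system_prompt='', chat_conversation=None)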
-
-
-def get_limited_prompt(instruction,
-                       iinput,
-                       tokenizer,
-                       prompter=None,
-                       inference_server=None,
-                       prompt_type=None, prompt_dict=None, chat=False, max_new_tokens=None,
-                       system_prompt='',
-                       context='', chat_conversation=None, text_context_list=None,
-                       keep_sources_in_context=False,
-                       model_max_length=None, memory_restriction_level=0,
-                       langchain_mode=None, add_chat_history_to_context=True,
-                       verbose=False,
-                       doc_importance=0.5,
-                       min_max_new_tokens=256,
-                       ):
-    if prompter:
-        prompt_type = prompter.prompt_type
-        prompt_dict = prompter.prompt_dict
-        chat = prompter.chat
-        stream_output = prompter.stream_output
-        system_prompt = prompter.system_prompt
-    else:
-        # construct prompter up front, since prompter.generate_prompt() is used below for token counting
-        prompter = Prompter(prompt_type, prompt_dict, debug=False, chat=chat, stream_output=False,
-                            system_prompt=system_prompt)
-
-    # merge handles if chat_conversation is None
-    history = []
-    history = merge_chat_conversation_history(chat_conversation, history)
-    history_to_context_func = functools.partial(history_to_context,
-                                                langchain_mode=langchain_mode,
-                                                add_chat_history_to_context=add_chat_history_to_context,
-                                                prompt_type=prompt_type,
-                                                prompt_dict=prompt_dict,
-                                                chat=chat,
-                                                model_max_length=model_max_length,
-                                                memory_restriction_level=memory_restriction_level,
-                                                keep_sources_in_context=keep_sources_in_context,
-                                                system_prompt=system_prompt)
-    context2 = history_to_context_func(history)
-    context1 = context
-    if context1 is None:
-        context1 = ''
-
-    from h2oai_pipeline import H2OTextGenerationPipeline
-    data_point_just_instruction = dict(context='', instruction=instruction, input='')
-    prompt_just_instruction = prompter.generate_prompt(data_point_just_instruction)
-    instruction, num_instruction_tokens = H2OTextGenerationPipeline.limit_prompt(instruction, tokenizer)
-    num_instruction_tokens_real = get_token_count(prompt_just_instruction, tokenizer)
-    num_instruction_tokens += (num_instruction_tokens_real - num_instruction_tokens)
-
-    context1, num_context1_tokens = H2OTextGenerationPipeline.limit_prompt(context1, tokenizer)
-    context2, num_context2_tokens = H2OTextGenerationPipeline.limit_prompt(context2, tokenizer)
-    iinput, num_iinput_tokens = H2OTextGenerationPipeline.limit_prompt(iinput, tokenizer)
-    if text_context_list is None:
-        text_context_list = []
-    num_doc_tokens = sum([get_token_count(x + '\n\n', tokenizer) for x in text_context_list])
-
-    num_prompt_tokens0 = (num_instruction_tokens or 0) + \
-                         (num_context1_tokens or 0) + \
-                         (num_context2_tokens or 0) + \
-                         (num_iinput_tokens or 0) + \
-                         (num_doc_tokens or 0)
-
-    # go down to no less than 256, about 1 paragraph
-    # cap by max_new_tokens before the num_prompt_tokens0 accounting below, else budgets could go negative or ~0
-    min_max_new_tokens = min(min_max_new_tokens, max_new_tokens)
-    # by default assume can handle all chat and docs
-    chat_index = 0
-
-    # allowed residual: docs get whatever the non-doc part leaves over, or at least doc_importance of the window, whichever is larger
-    num_non_doc_tokens = num_prompt_tokens0 - num_doc_tokens
-    # allocate to docs first, then non-docs; shouldn't matter much either way
-    doc_max_length = max(model_max_length - num_non_doc_tokens, doc_importance * model_max_length)
-    top_k_docs, one_doc_size, num_doc_tokens = get_docs_tokens(tokenizer, text_context_list=text_context_list,
-                                                               max_input_tokens=doc_max_length)
-    non_doc_max_length = max(model_max_length - num_doc_tokens, (1.0 - doc_importance) * model_max_length)
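-    # Illustrative worked example (hypothetical numbers, not from the original file):
-    # with model_max_length=2048, doc_importance=0.5, num_non_doc_tokens=600 and raw
-    # docs totaling 3000 tokens:
-    #   doc_max_length     = max(2048 - 600, 0.5 * 2048)  = 1448
-    #   get_docs_tokens keeps as many chunks as fit in 1448 tokens, say 1300, then
-    #   non_doc_max_length = max(2048 - 1300, 0.5 * 2048) = 1024
-    # so each side is guaranteed at least its importance-weighted share of the window.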
-
-    if num_non_doc_tokens > non_doc_max_length:
-        # need to limit in some way, keep portion of history but all of context and instruction
-        # 1) drop iinput (unusual to include anyways)
-        # 2) reduce history
-        # 3) reduce context1
-        # 4) limit instruction so will fit
-        diff1 = non_doc_max_length - (
-                num_instruction_tokens + num_context1_tokens + num_context2_tokens + min_max_new_tokens)
-        diff2 = non_doc_max_length - (num_instruction_tokens + num_context1_tokens + min_max_new_tokens)
-        diff3 = non_doc_max_length - (num_instruction_tokens + min_max_new_tokens)
-        diff4 = non_doc_max_length - min_max_new_tokens
-        if diff1 > 0:
-            # then should be able to do #1
-            iinput = ''
-            num_iinput_tokens = 0
-        elif diff2 > 0 > diff1:
-            # then may be able to do #1 + #2
-            iinput = ''
-            num_iinput_tokens = 0
-            chat_index_final = len(history)
-            for chat_index in range(len(history)):
-                # NOTE: history and chat_conversation have their oldest entries first
-                # FIXME: This is slow for many short conversations
-                context2 = history_to_context_func(history[chat_index:])
-                num_context2_tokens = get_token_count(context2, tokenizer)
-                diff1 = non_doc_max_length - (
-                        num_instruction_tokens + num_context1_tokens + num_context2_tokens + min_max_new_tokens)
-                if diff1 > 0:
-                    chat_index_final = chat_index
-                    if verbose:
-                        print("chat_conversation used %d out of %d" % (chat_index, len(history)), flush=True)
-                    break
-            chat_index = chat_index_final  # i.e. if chat_index == len(history), then nothing can be consumed
-        elif diff3 > 0 > diff2:
-            # then may be able to do #1 + #2 + #3
-            iinput = ''
-            num_iinput_tokens = 0
-            context2 = ''
-            num_context2_tokens = 0
-            context1, num_context1_tokens = H2OTextGenerationPipeline.limit_prompt(context1, tokenizer,
-                                                                                   max_prompt_length=diff3)
-            if num_context1_tokens <= diff3:
-                pass
-            else:
-                print("failed to reduce", flush=True)
-        else:
-            # then must be able to do #1 + #2 + #3 + #4
-            iinput = ''
-            num_iinput_tokens = 0
-            context2 = ''
-            num_context2_tokens = 0
-            context1 = ''
-            num_context1_tokens = 0
-            # diff4 accounts for the real prompt wrapping around the instruction
-            # FIXME: history_to_context could include the instruction; if the system prompt is long, we overcount and could have more free tokens
-            instruction, num_instruction_tokens = H2OTextGenerationPipeline.limit_prompt(instruction, tokenizer,
-                                                                                         max_prompt_length=diff4)
-            # get actual tokens
-            data_point_just_instruction = dict(context='', instruction=instruction, input='')
-            prompt_just_instruction = prompter.generate_prompt(data_point_just_instruction)
-            num_instruction_tokens_real = get_token_count(prompt_just_instruction, tokenizer)
-            num_instruction_tokens += (num_instruction_tokens_real - num_instruction_tokens)
-
-    # update full context
-    context = context1 + context2
-    # update token counts (docs + non-docs, all tokens)
-    num_prompt_tokens = (num_instruction_tokens or 0) + \
-                        (num_context1_tokens or 0) + \
-                        (num_context2_tokens or 0) + \
-                        (num_iinput_tokens or 0) + \
-                        (num_doc_tokens or 0)
-
-    # update max_new_tokens
-    if inference_server and inference_server.startswith('http'):
-        # assume the TGI/Gradio server handles token budgeting and long outputs itself, even if the request exceeds model capacity
-        pass
-    else:
-        # limit so max_new_tokens = prompt + new < max
-        # otherwise the model can fail, e.g. for distilgpt2 asking for 1024 new tokens fails even when the prompt is just 1 token
-        max_new_tokens = min(max_new_tokens, model_max_length - num_prompt_tokens)
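-        # e.g. (hypothetical) with model_max_length=1024 and num_prompt_tokens=900,
-        # at most 124 new tokens remain for generation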
-
-    if prompter is None:
-        # get prompter
-        debug = False
-        stream_output = False  # doesn't matter
-        prompter = Prompter(prompt_type, prompt_dict, debug=debug, chat=chat, stream_output=stream_output,
-                            system_prompt=system_prompt)
-
-    data_point = dict(context=context, instruction=instruction, input=iinput)
-    # handle promptA/promptB addition if context really comes from history.
-    # if not from history, then reduced=False inside is correct
-    # if mixed, there is no single correct choice, so treat like history and promptA/B still come first
-    context_from_history = len(history) > 0 and len(context1) > 0
-    prompt = prompter.generate_prompt(data_point, context_from_history=context_from_history)
-    num_prompt_tokens_actual = get_token_count(prompt, tokenizer)
-
-    return prompt, \
-        instruction, iinput, context, \
-        num_prompt_tokens, max_new_tokens, num_prompt_tokens0, num_prompt_tokens_actual, \
-        chat_index, top_k_docs, one_doc_size
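-
-
-# Illustrative sketch only, not part of the original file: a minimal call to
-# get_limited_prompt.  The tokenizer choice and plain prompt_type are
-# hypothetical assumptions; only the returned prompt is used here.
-def _example_get_limited_prompt():
-    from transformers import AutoTokenizer
-    tokenizer = AutoTokenizer.from_pretrained("distilgpt2")  # hypothetical choice
-    prompter = Prompter('plain', '', debug=False, chat=False, stream_output=False,
-                        system_prompt='')
-    out = get_limited_prompt("Summarize the text below.", '', tokenizer,
-                             prompter=prompter,
-                             max_new_tokens=256, model_max_length=1024,
-                             text_context_list=["some retrieved chunk of text"],
-                             min_max_new_tokens=64)
-    prompt = out[0]  # remaining entries are token accounting and document selection
-    return prompt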
-
-
-def get_docs_tokens(tokenizer, text_context_list=[], max_input_tokens=None):
-    if text_context_list is None or len(text_context_list) == 0:
-        return 0, None, 0
-    if max_input_tokens is None:
-        max_input_tokens = tokenizer.model_max_length
-    tokens = [get_token_count(x + '\n\n', tokenizer) for x in text_context_list]
-    tokens_cumsum = np.cumsum(tokens)
-    where_res = np.where(tokens_cumsum < max_input_tokens)[0]
-    # if the condition below fails, fall back to special handling with a single truncated doc
-    if where_res.shape[0] > 0:
-        top_k_docs = 1 + where_res[-1]
-        one_doc_size = None
-        num_doc_tokens = tokens_cumsum[top_k_docs - 1]  # by index
-    else:
-        # if here, no doc fits within the budget; do the best we can with 1 truncated doc
-        top_k_docs = 1
-        text_context_list = text_context_list[:top_k_docs]
-        # critical protection
-        from src.h2oai_pipeline import H2OTextGenerationPipeline
-        doc_content = text_context_list[0]
-        doc_content, new_tokens0 = H2OTextGenerationPipeline.limit_prompt(doc_content,
-                                                                          tokenizer,
-                                                                          max_prompt_length=max_input_tokens)
-        text_context_list[0] = doc_content
-        one_doc_size = len(doc_content)
-        num_doc_tokens = get_token_count(doc_content + '\n\n', tokenizer)
-        print("Unexpected large chunks and can't add to context, will add 1 anyways.  Tokens %s -> %s" % (
-            tokens[0], new_tokens0), flush=True)
-    return top_k_docs, one_doc_size, num_doc_tokens
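-
-
-# Illustrative worked example for get_docs_tokens (hypothetical token counts, not
-# from the original file): with per-chunk tokens [300, 400, 500] and
-# max_input_tokens=1000, tokens_cumsum=[300, 700, 1200]; the cumulative sum stays
-# below 1000 only for the first two chunks, so top_k_docs=2, one_doc_size=None and
-# num_doc_tokens=700.  If even the first chunk were too large, the fallback above
-# truncates that single chunk to fit and returns top_k_docs=1.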
-
-
-def entrypoint_main():
-    """
-    Examples:
-
-    WORLD_SIZE=4 CUDA_VISIBLE_DEVICES="0,1,2,3" torchrun --nproc_per_node=4 --master_port=1234 generate.py --base_model='EleutherAI/gpt-j-6B' --lora_weights=lora-alpaca_6B
-    python generate.py --base_model='EleutherAI/gpt-j-6B' --lora_weights='lora-alpaca_6B'
-    python generate.py --base_model='EleutherAI/gpt-neox-20b' --lora_weights='lora-alpaca_20B'
-
-    # generate without lora weights, no prompt
-    python generate.py --base_model='EleutherAI/gpt-neox-20b' --prompt_type='plain'
-    python generate.py --base_model='togethercomputer/GPT-NeoXT-Chat-Base-20B' --prompt_type='dai_faq'
-
-    python generate.py --base_model='togethercomputer/GPT-NeoXT-Chat-Base-20B' --prompt_type='dai_faq' --lora_weights='lora_20B_daifaq'
-    # OpenChatKit settings:
-    python generate.py --base_model='togethercomputer/GPT-NeoXT-Chat-Base-20B' --prompt_type='human_bot' --debug=True --num_beams=1 --temperature=0.6 --top_k=40 --top_p=1.0
-
-    python generate.py --base_model='distilgpt2' --prompt_type='plain' --debug=True --num_beams=1 --temperature=0.6 --top_k=40 --top_p=1.0 --share=False
-    python generate.py --base_model='t5-large' --prompt_type='simple_instruct'
-    python generate.py --base_model='philschmid/bart-large-cnn-samsum'
-    python generate.py --base_model='philschmid/flan-t5-base-samsum'
-    python generate.py --base_model='facebook/mbart-large-50-many-to-many-mmt'
-
-    python generate.py --base_model='togethercomputer/GPT-NeoXT-Chat-Base-20B' --prompt_type='human_bot' --lora_weights='GPT-NeoXT-Chat-Base-20B.merged.json.8_epochs.57b2892c53df5b8cefac45f84d019cace803ef26.28'
-
-    requires 4 x 48GB GPUs and running without 8-bit for sharding to work with use_gpu_id=False
-    can also pass --prompt_type='human_bot'; the model can somewhat handle instructions without being instruction-tuned
-    python generate.py --base_model=decapoda-research/llama-65b-hf --load_8bit=False --use_gpu_id=False --prompt_type='human_bot'
-
-    python generate.py --base_model=h2oai/h2ogpt-oig-oasst1-512-6_9b
-    """
-    H2O_Fire(main)
-
-
-if __name__ == "__main__":
-    entrypoint_main()