Spaces:
Paused
Paused
import os | |
import subprocess | |
import time | |
from datetime import datetime | |
import pytest | |
from src.utils import get_ngpus_vis, makedirs | |
from tests.utils import wrap_test_forked, get_inf_port, get_inf_server | |
from tests.test_langchain_units import have_openai_key, have_replicate_key | |
from src.client_test import run_client_many, test_client_basic_api_lean | |
from src.enums import PromptType, LangChainAction | |
def test_gradio_inference_server(base_model, force_langchain_evaluate, do_langchain, enforce_h2ogpt_api_key,
                                 prompt='Who are you?', stream_output=False, max_new_tokens=256,
                                 langchain_mode='Disabled', langchain_action=LangChainAction.QUERY.value,
                                 langchain_agents=[],
                                 user_path=None,
                                 langchain_modes=['UserData', 'MyData', 'LLM', 'Disabled'],
                                 docs_ordering_type='reverse_sort'):
    """Chain two ``main()`` servers and check what a client gets back from the second one.

    The first ``main()`` call starts a gradio server for ``base_model``.  The second call
    receives ``get_inf_server()`` as its ``inference_server`` after ``GRADIO_SERVER_PORT``
    and ``HOST`` have been pointed at ``get_inf_port() + 2``.  Client replies are then
    checked for model-specific phrases.  With ``enforce_h2ogpt_api_key`` set, requests
    without a key or with a wrong key must contain 'Invalid Access Key'.
    """
    reference_model = 'h2oai/h2ogpt-oig-oasst1-512-6_9b'
    falcon_model = 'h2oai/h2ogpt-gm-oasst1-en-2048-falcon-7b-v2'

    # no need for so many cases
    if enforce_h2ogpt_api_key and base_model != reference_model:
        return

    if force_langchain_evaluate:
        langchain_mode = 'MyData'
    if do_langchain:
        langchain_mode = 'UserData'
        from tests.utils import make_user_path_test
        user_path = make_user_path_test()
        # from src.gpt_langchain import get_some_dbs_from_hf
        # get_some_dbs_from_hf()

    if base_model == reference_model or base_model == 'h2oai/h2ogpt-oasst1-512-12b':
        prompt_type = PromptType.human_bot.name
    elif base_model == falcon_model:
        prompt_type = PromptType.prompt_answer.name
    elif base_model == 'llama':
        prompt_type = PromptType.llama2.name
    elif base_model == 'gptj':
        prompt_type = PromptType.gptj.name
    else:
        raise NotImplementedError(base_model)

    main_kwargs = {
        'base_model': base_model,
        'prompt_type': prompt_type,
        'chat': True,
        'stream_output': stream_output,
        'gradio': True,
        'num_beams': 1,
        'block_gradio_exit': False,
        'max_new_tokens': max_new_tokens,
        'langchain_mode': langchain_mode,
        'langchain_action': langchain_action,
        'langchain_agents': langchain_agents,
        'user_path': user_path,
        'langchain_modes': langchain_modes,
        'docs_ordering_type': docs_ordering_type,
        'force_langchain_evaluate': force_langchain_evaluate,
    }

    # inference server
    from src.gen import main
    main(**main_kwargs)
    inference_server = get_inf_server()
    inf_port = get_inf_port()

    # server that consumes inference server has different port
    # assume will not use + 2 in testing, + 1 reserved for non-gradio inference servers
    client_port = inf_port + 2
    # only case when GRADIO_SERVER_PORT and HOST should appear in tests because using 2 gradio instances
    os.environ['GRADIO_SERVER_PORT'] = str(client_port)
    os.environ['HOST'] = "http://127.0.0.1:%s" % client_port

    h2ogpt_key = 'foodoo#'
    consumer_kwargs = main_kwargs.copy()
    if enforce_h2ogpt_api_key:
        consumer_kwargs['enforce_h2ogpt_api_key'] = True
        consumer_kwargs['h2ogpt_api_keys'] = [h2ogpt_key]
    main(**consumer_kwargs, inference_server=inference_server)

    # client test to server that only consumes inference server
    from src.client_test import run_client_chat
    res_dict, client = run_client_chat(prompt=prompt, prompt_type=prompt_type, stream_output=stream_output,
                                       max_new_tokens=max_new_tokens, langchain_mode=langchain_mode,
                                       langchain_action=langchain_action, langchain_agents=langchain_agents)
    assert res_dict['prompt'] == prompt
    assert res_dict['iinput'] == ''

    # will use HOST from above
    if enforce_h2ogpt_api_key:
        # try without key first, then with a wrong key
        for bad_key_kwargs in ({}, {'h2ogpt_key': 'foo'}):
            ret1, ret2, ret3, ret4, ret5, ret6, ret7 = run_client_many(prompt_type=None, **bad_key_kwargs)
            for ret in (ret1, ret2, ret3, ret4, ret5, ret6, ret7):
                assert 'Invalid Access Key' in ret['response']

    # try normal or with key if enforcing
    # client shouldn't have to specify
    ret1, ret2, ret3, ret4, ret5, ret6, ret7 = run_client_many(prompt_type=None, h2ogpt_key=h2ogpt_key)

    # per model: (phrases accepted for the "who are you" calls, phrases accepted for calls 2 and 3)
    expectations = {
        reference_model: (
            ['h2oGPT'],
            ['Birds'],
        ),
        falcon_model: (
            ['I am a language model trained',
             'I am a helpful assistant',
             'I am a chatbot.',
             'a chat-based assistant that can answer questions',
             'I am an AI language model',
             'I am an AI assistant.'],
            ['Once upon a time'],
        ),
        'llama': (
            ['I am a bot.', 'can I assist you today?', 'How can I assist you?', "I'm LLaMA"],
            ['Birds', 'Once upon a time'],
        ),
        'gptj': (
            ['I am a bot.', 'can I assist you today?', 'a student at', 'am a person who', 'I am',
             "I'm a student at"],
            ['Birds', 'Once upon a time'],
        ),
    }
    # models without an entry get no phrase checks
    expected = expectations.get(base_model)
    if expected is not None:
        identity_phrases, story_phrases = expected
        for position, ret in enumerate((ret1, ret2, ret3, ret4, ret5, ret6, ret7), start=1):
            accepted = story_phrases if position in (2, 3) else identity_phrases
            assert any(phrase in ret['response'] for phrase in accepted)
    print("DONE", flush=True)
def _wait_for_tgi_ready(docker_hash, poll_seconds=5):
    """Poll ``docker logs`` until the text 'Connected' shows up in the container output.

    Raises RuntimeError when the container is no longer running before that text
    appears, so a container that crashed during start-up fails the caller instead
    of blocking it forever.
    """
    while True:
        # argv list: the container id is never interpreted by a shell
        logs = subprocess.check_output(['docker', 'logs', docker_hash], timeout=15).decode("utf-8")
        connected = 'Connected' in logs
        if not connected:
            running = subprocess.check_output(
                ['docker', 'inspect', '--format', '{{.State.Running}}', docker_hash],
                timeout=15).decode("utf-8").strip()
            if running != 'true':
                raise RuntimeError("TGI container %s stopped before becoming ready; last logs:\n%s"
                                   % (docker_hash, logs[-2000:]))
        # pause after every poll, the successful one included
        time.sleep(poll_seconds)
        if connected:
            return


def run_docker(inf_port, base_model, low_mem_mode=False, do_shared=True):
    """Start a detached text-generation-inference container and wait until it logs 'Connected'.

    :param inf_port: host port published to the container's port 80
    :param base_model: value passed as ``--model-id``
    :param low_mem_mode: use 1024/2048 instead of 4096/8192 for
        ``--max-input-length`` / ``--max-total-tokens``
    :param do_shared: when more than one GPU is visible, shard the model across them
    :return: container id printed by ``docker run -d``
    :raises KeyError: if ``HUGGING_FACE_HUB_TOKEN`` is not set in the environment
    :raises RuntimeError: if the container stops before it becomes ready
    """
    datetime_str = str(datetime.now()).replace(" ", "_").replace(":", "_")
    print("Starting HF inference %s..." % datetime_str, flush=True)

    home_dir = os.path.expanduser('~')
    makedirs(os.path.join(home_dir, '.cache/huggingface/hub'))
    data_dir = '%s/.cache/huggingface/hub/' % home_dir
    n_gpus = get_ngpus_vis()

    cmd = ["docker", 'run', '-d', '--runtime', 'nvidia'] + gpus_cmd() + [
        '--shm-size', '1g',
        '-e', 'HUGGING_FACE_HUB_TOKEN=%s' % os.environ['HUGGING_FACE_HUB_TOKEN'],
        '-p', '%s:80' % inf_port,
        # single bind mount of the HF hub cache; it used to be listed twice for /data
        '-v', '%s:/data' % data_dir,
        'ghcr.io/huggingface/text-generation-inference:0.9.3',
        '--model-id', base_model,
        '--max-stop-sequences', '6',
        '--sharded', 'false' if n_gpus == 1 or not do_shared else 'true',
    ]
    if n_gpus > 1 and do_shared:
        cmd.extend(['--num-shard', '%s' % n_gpus])
    if low_mem_mode:
        cmd.extend(['--max-input-length', '1024',
                    '--max-total-tokens', '2048',
                    # '--cuda-memory-fraction', '0.3',  # for 0.9.4, but too memory hungry
                    ])
    else:
        cmd.extend(['--max-input-length', '4096',
                    '--max-total-tokens', '8192',
                    # '--cuda-memory-fraction', '0.8',  # for 0.9.4, but too memory hungry
                    ])

    print(cmd, flush=True)
    docker_hash = subprocess.check_output(cmd).decode().strip()
    _wait_for_tgi_ready(docker_hash)
    print("Done starting TGI server: %s" % docker_hash, flush=True)
    return docker_hash
def gpus_cmd(n_gpus=None):
    """Build the ``--gpus`` arguments for ``docker run``.

    :param n_gpus: number of GPUs to expose; when None it is taken from ``get_ngpus_vis()``
    :return: ``['--gpus', <device spec>]``, or an empty list when no GPU is visible,
        so the result can always be concatenated into a command list

    The device ids come from ``CUDA_VISIBLE_DEVICES`` when it is set; otherwise
    ids ``0 .. n_gpus - 1`` are used.
    """
    if n_gpus is None:
        n_gpus = get_ngpus_vis()
    if n_gpus < 1:
        return []
    if n_gpus == 1:
        return ['--gpus', 'device=%d' % int(os.getenv('CUDA_VISIBLE_DEVICES', '0'))]
    # every count above one takes this path (a two-GPU host previously got no result at all)
    default_devices = ','.join(str(gpu_id) for gpu_id in range(n_gpus))
    # note: the embedded double quotes are needed around a comma-separated device list
    return ['--gpus', '"device=%s"' % os.getenv('CUDA_VISIBLE_DEVICES', default_devices)]
def _wait_for_vllm_ready(docker_hash, poll_seconds=5):
    """Poll ``docker logs`` until the vLLM server reports that it is up.

    Readiness is either 'Uvicorn running on' or 'GPU blocks' in the log text.
    stderr is merged into the captured output because ``docker logs`` replays the
    container's stderr on its own stderr.
    NOTE(review): the uvicorn banner is presumably written to stderr, which would
    explain why it was never seen when only stdout was captured - confirm.

    Raises RuntimeError when the container is no longer running before either
    text appears.
    """
    while True:
        logs = subprocess.check_output(['docker', 'logs', docker_hash],
                                       stderr=subprocess.STDOUT, timeout=15).decode("utf-8")
        connected = 'Uvicorn running on' in logs or 'GPU blocks' in logs
        if not connected:
            running = subprocess.check_output(
                ['docker', 'inspect', '--format', '{{.State.Running}}', docker_hash],
                timeout=15).decode("utf-8").strip()
            if running != 'true':
                raise RuntimeError("vLLM container %s stopped before becoming ready; last logs:\n%s"
                                   % (docker_hash, logs[-2000:]))
        # pause after every poll, the successful one included
        time.sleep(poll_seconds)
        if connected:
            return


def run_vllm_docker(inf_port, base_model, tokenizer=None):
    """Start a detached vLLM OpenAI-compatible API server container and wait until it is up.

    :param inf_port: host port published to the container's port 5000
    :param base_model: value passed as ``--model``
    :param tokenizer: optional value passed as ``--tokenizer``
    :return: container id printed by ``docker run -d``
    :raises KeyError: if ``HUGGING_FACE_HUB_TOKEN`` is not set in the environment
    :raises RuntimeError: if the container stops before it becomes ready
    """
    pin_single_gpu = base_model == 'h2oai/h2ogpt-gm-oasst1-en-2048-falcon-7b-v2'
    previous_visible = os.environ.get('CUDA_VISIBLE_DEVICES')
    if pin_single_gpu:
        # 7b has 71 heads, not divisible
        os.environ['CUDA_VISIBLE_DEVICES'] = '0'
    try:
        os.system("docker pull gcr.io/vorvan/h2oai/h2ogpt-runtime:0.1.0")
        datetime_str = str(datetime.now()).replace(" ", "_").replace(":", "_")
        print("Starting vLLM inference %s..." % datetime_str, flush=True)

        home_dir = os.path.expanduser('~')
        makedirs(os.path.join(home_dir, '.cache/huggingface/hub'))
        n_gpus = get_ngpus_vis()
        cmd = ["docker", 'run', '-d', '--runtime', 'nvidia'] + gpus_cmd() + [
            '--shm-size', '10.24g',
            '-e', 'HUGGING_FACE_HUB_TOKEN=%s' % os.environ['HUGGING_FACE_HUB_TOKEN'],
            '-p', '%s:5000' % inf_port,
            '--entrypoint', '/h2ogpt_conda/vllm_env/bin/python3.10',
            '-e', 'NCCL_IGNORE_DISABLED_P2P=1',
            '-v', '/etc/passwd:/etc/passwd:ro',
            '-v', '/etc/group:/etc/group:ro',
            '-u', '%s:%s' % (os.getuid(), os.getgid()),
            '-v', '%s/.cache:/workspace/.cache' % home_dir,
            # '--network', 'host',
            'gcr.io/vorvan/h2oai/h2ogpt-runtime:0.1.0',
            # 'h2ogpt',  # use when built locally with vLLM just freshly added
            # 'docker.io/library/h2ogpt',  # use when built locally with vLLM just freshly added
            '-m', 'vllm.entrypoints.openai.api_server',
            '--port=5000',
            '--host=0.0.0.0',
            '--model=%s' % base_model,
            '--tensor-parallel-size=%s' % n_gpus,
            '--seed', '1234',
            '--trust-remote-code',
            '--download-dir=/workspace/.cache/huggingface/hub',
        ]
    finally:
        # undo only the override made above; a CUDA_VISIBLE_DEVICES value set by the
        # caller is put back instead of being deleted
        if pin_single_gpu:
            if previous_visible is None:
                os.environ.pop('CUDA_VISIBLE_DEVICES', None)
            else:
                os.environ['CUDA_VISIBLE_DEVICES'] = previous_visible

    if tokenizer:
        cmd.append('--tokenizer=%s' % tokenizer)

    print(cmd, flush=True)
    print(' '.join(cmd), flush=True)
    docker_hash = subprocess.check_output(cmd).decode().strip()
    _wait_for_vllm_ready(docker_hash)
    print("Done starting vLLM server: %s" % docker_hash, flush=True)
    return docker_hash
def run_h2ogpt_docker(port, base_model, inference_server=None, max_new_tokens=None):
    """Pull the h2oGPT runtime image and start ``generate.py`` in a detached container.

    :param port: host port used in the ``-p <port>:7860`` mapping
    :param base_model: value passed as ``--base_model``
    :param inference_server: when truthy, passed as ``--inference_server``
    :param max_new_tokens: when truthy, used for both ``--max_max_new_tokens`` and
        ``--max_new_tokens``; otherwise 2048 and 1024 are used
    :return: container id printed by ``docker run -d``

    Unlike the other container launchers here, this one returns right after
    ``docker run`` without polling the logs for readiness.

    NOTE(review): ``-p`` is combined with ``--network host``; Docker documents that
    published ports are discarded in host network mode - confirm ``port`` is honoured.
    """
    image = 'gcr.io/vorvan/h2oai/h2ogpt-runtime:0.1.0'
    os.system("docker pull %s" % image)

    started_at = str(datetime.now()).replace(" ", "_").replace(":", "_")
    print("Starting h2oGPT %s..." % started_at, flush=True)

    home_dir = os.path.expanduser('~')
    for subdir in ('.cache/huggingface/hub', 'save'):
        makedirs(os.path.join(home_dir, subdir))

    # options for docker itself, up to and including the image name
    docker_args = ["docker", 'run', '-d', '--runtime', 'nvidia']
    docker_args = docker_args + gpus_cmd()
    docker_args = docker_args + [
        '--shm-size', '1g',
        '-p', '%s:7860' % port,
        '-v', '%s/.cache:/workspace/.cache/' % home_dir,
        '-v', '%s/save:/workspace/save' % home_dir,
        '-v', '/etc/passwd:/etc/passwd:ro',
        '-v', '/etc/group:/etc/group:ro',
        '-u', '%s:%s' % (os.getuid(), os.getgid()),
        '-e', 'HUGGING_FACE_HUB_TOKEN=%s' % os.environ['HUGGING_FACE_HUB_TOKEN'],
        '--network', 'host',
        image,
        # 'h2ogpt',  # use when built locally with vLLM just freshly added
    ]

    # arguments handed to the program inside the container
    generate_args = [
        '/workspace/generate.py',
        '--base_model=%s' % base_model,
        '--use_safetensors=True',
        '--save_dir=/workspace/save/',
        '--score_model=None',
        '--max_max_new_tokens=%s' % (max_new_tokens or 2048),
        '--max_new_tokens=%s' % (max_new_tokens or 1024),
        '--num_async=10',
        '--num_beams=1',
        '--top_k_docs=-1',
        '--chat=True',
        '--stream_output=True',
        # '--debug=True',
    ]
    if inference_server:
        generate_args.append('--inference_server=%s' % inference_server)

    cmd = docker_args + generate_args
    print(cmd, flush=True)
    docker_hash = subprocess.check_output(cmd).decode().strip()
    print("Done starting h2oGPT server: %s" % docker_hash, flush=True)
    return docker_hash
def test_hf_inference_server(base_model, force_langchain_evaluate, do_langchain, pass_prompt_type, do_model_lock,
                             prompt='Who are you?', stream_output=False, max_new_tokens=256,
                             langchain_mode='Disabled',
                             langchain_action=LangChainAction.QUERY.value,
                             langchain_agents=[],
                             user_path=None,
                             langchain_modes=['UserData', 'MyData', 'LLM', 'Disabled'],
                             docs_ordering_type='reverse_sort'):
    """Serve ``base_model`` from a TGI container and test a gradio server that consumes it.

    The container started by ``run_docker`` listens on ``get_inf_port() + 1``.  ``main()``
    is pointed at it either through ``inference_server`` or, with ``do_model_lock``,
    through a one-entry ``model_lock``.  ``pass_prompt_type`` selects how the prompt type
    is supplied: a string switches to prompt type 'custom' with an explicit
    ``prompt_dict``, a falsy value passes no prompt type at all.  The container is
    stopped once it has been started, whatever happens afterwards.
    """
    # HF inference server
    gradio_port = get_inf_port()
    inf_port = gradio_port + 1
    inference_server = 'http://127.0.0.1:%s' % inf_port
    docker_hash = run_docker(inf_port, base_model, low_mem_mode=True, do_shared=False)

    # everything after the container start is inside try, so a failure in the
    # set-up below cannot leave the container running
    try:
        # the container keeps serving this model even when base_model is cleared
        # for model_lock below; the response checks must be chosen by it
        served_model = base_model

        if force_langchain_evaluate:
            langchain_mode = 'MyData'
        if do_langchain:
            langchain_mode = 'UserData'
            from tests.utils import make_user_path_test
            user_path = make_user_path_test()
            # from src.gpt_langchain import get_some_dbs_from_hf
            # get_some_dbs_from_hf()

        if base_model in ['h2oai/h2ogpt-oig-oasst1-512-6_9b', 'h2oai/h2ogpt-oasst1-512-12b']:
            prompt_type = PromptType.human_bot.name
        else:
            prompt_type = PromptType.prompt_answer.name
        if isinstance(pass_prompt_type, str):
            prompt_type = 'custom'
            prompt_dict = """{'promptA': None, 'promptB': None, 'PreInstruct': None, 'PreInput': None, 'PreResponse': None, 'terminate_response': [], 'chat_sep': '', 'chat_turn_sep': '', 'humanstr': None, 'botstr': None, 'generates_leading_space': False}"""
        else:
            prompt_dict = None
            if not pass_prompt_type:
                prompt_type = None

        if do_model_lock:
            model_lock = [{'inference_server': inference_server, 'base_model': base_model}]
            base_model = None
            inference_server = None
        else:
            model_lock = None

        main_kwargs = dict(base_model=base_model,
                           prompt_type=prompt_type,
                           prompt_dict=prompt_dict,
                           chat=True,
                           stream_output=stream_output, gradio=True, num_beams=1, block_gradio_exit=False,
                           max_new_tokens=max_new_tokens,
                           langchain_mode=langchain_mode,
                           langchain_action=langchain_action,
                           langchain_agents=langchain_agents,
                           user_path=user_path,
                           langchain_modes=langchain_modes,
                           docs_ordering_type=docs_ordering_type,
                           force_langchain_evaluate=force_langchain_evaluate,
                           inference_server=inference_server,
                           model_lock=model_lock)

        # server that consumes inference server
        from src.gen import main
        main(**main_kwargs)

        # client test to server that only consumes inference server
        from src.client_test import run_client_chat
        res_dict, client = run_client_chat(prompt=prompt, prompt_type=prompt_type,
                                           stream_output=stream_output,
                                           max_new_tokens=max_new_tokens, langchain_mode=langchain_mode,
                                           langchain_action=langchain_action,
                                           langchain_agents=langchain_agents,
                                           prompt_dict=prompt_dict)
        assert res_dict['prompt'] == prompt
        assert res_dict['iinput'] == ''

        # will use HOST from above
        # client shouldn't have to specify
        ret1, ret2, ret3, ret4, ret5, ret6, ret7 = run_client_many(prompt_type=None)

        # here docker started with falcon before personalization
        if isinstance(pass_prompt_type, str):
            identity_phrases = ['year old student from the',
                                'I am a person who is asking you a question',
                                'year old']
            story_phrases = ['bird']
        elif served_model == 'h2oai/h2ogpt-oig-oasst1-512-6_9b':
            identity_phrases = ['h2oGPT']
            story_phrases = ['Birds']
        else:
            identity_phrases = ['I am a language model trained',
                                'I am a helpful assistant',
                                'a chat-based assistant',
                                'am a student',
                                'I am an AI language model']
            story_phrases = ['Once upon a time']

        # calls 2 and 3 are checked against the story phrases, all others against the identity phrases
        for position, ret in enumerate((ret1, ret2, ret3, ret4, ret5, ret6, ret7), start=1):
            response = ret['response']
            accepted = story_phrases if position in (2, 3) else identity_phrases
            assert any(phrase in response for phrase in accepted), \
                "response %d %r contains none of %r" % (position, response, accepted)
        print("DONE", flush=True)
    finally:
        os.system("docker stop %s" % docker_hash)
# One prior chat turn as a [question, answer] pair, wrapped in a list of turns.
# NOTE(review): presumably supplied as a `chat_conversation` test value; the
# code that passes it in is not visible in this section.
chat_conversation1 = [
    [
        'Who are you?',
        'I am an AI language model created by OpenAI, designed to assist with various tasks '
        'such as answering questions, generating text, and providing information.',
    ],
]
def test_openai_inference_server(inference_server, force_langchain_evaluate, chat_conversation,
                                 system_prompt,
                                 prompt='Who are you?', stream_output=False, max_new_tokens=256,
                                 base_model='gpt-3.5-turbo',
                                 langchain_mode='Disabled',
                                 langchain_action=LangChainAction.QUERY.value,
                                 langchain_agents=[],
                                 user_path=None,
                                 langchain_modes=['UserData', 'MyData', 'LLM', 'Disabled'],
                                 docs_ordering_type='reverse_sort'):
    """Start a gradio server that forwards to ``inference_server`` and check client replies.

    Which checks run depends on the inputs:
    * both ``chat_conversation`` and ``system_prompt``: only the echoed prompt is checked;
    * exactly one of them: the chat reply and a lean API reply are checked, then the test ends;
    * neither: the seven ``run_client_many`` replies are checked for known phrases.

    For an Azure server, ``OPENAI_API_KEY`` is overwritten with ``OPENAI_AZURE_KEY``.
    """
    if force_langchain_evaluate:
        langchain_mode = 'MyData'
    if inference_server == 'openai_azure_chat':
        # need at least deployment name added:
        inference_server = '%s:%s:%s' % (inference_server, 'h2ogpt', 'h2ogpt.openai.azure.com/')
    if 'azure' in inference_server:
        assert 'OPENAI_AZURE_KEY' in os.environ, "Missing 'OPENAI_AZURE_KEY'"
        os.environ['OPENAI_API_KEY'] = os.environ['OPENAI_AZURE_KEY']

    main_kwargs = {
        'base_model': base_model,
        'chat': True,
        'stream_output': stream_output,
        'gradio': True,
        'num_beams': 1,
        'block_gradio_exit': False,
        'max_new_tokens': max_new_tokens,
        'langchain_mode': langchain_mode,
        'langchain_action': langchain_action,
        'langchain_agents': langchain_agents,
        'user_path': user_path,
        'langchain_modes': langchain_modes,
        'system_prompt': 'auto',
        'docs_ordering_type': docs_ordering_type,
        # chat_conversation=chat_conversation  # not enough if API passes [], API will override
    }

    # server that consumes inference server
    from src.gen import main
    main(**main_kwargs, inference_server=inference_server)

    if chat_conversation:
        prompt = 'What did I ask?'

    # client test to server that only consumes inference server
    from src.client_test import run_client_chat
    res_dict, client = run_client_chat(prompt=prompt, prompt_type='openai_chat', stream_output=stream_output,
                                       max_new_tokens=max_new_tokens, langchain_mode=langchain_mode,
                                       langchain_action=langchain_action, langchain_agents=langchain_agents,
                                       chat_conversation=chat_conversation,
                                       system_prompt=system_prompt)
    assert res_dict['prompt'] == prompt
    assert res_dict['iinput'] == ''

    if chat_conversation and system_prompt:
        # TODO: don't check yet, system_prompt ignored if response from LLM is as if no system prompt
        return

    if chat_conversation or system_prompt:
        lean_ret, _ = test_client_basic_api_lean(prompt=prompt, prompt_type=None,
                                                 chat_conversation=chat_conversation,
                                                 system_prompt=system_prompt)
        if not system_prompt:
            options_response = ['You asked "Who are you?"', 'You asked, "Who are you?"']
            assert res_dict['response'] in options_response
            assert lean_ret['response'] in options_response
        else:
            for ret in (res_dict, lean_ret):
                assert 'baby cat' in ret['response'] and 'meow' in ret['response'].lower()
        # don't test rest, too many cases
        return

    # will use HOST from above
    # client shouldn't have to specify
    ret1, ret2, ret3, ret4, ret5, ret6, ret7 = run_client_many(prompt_type=None)
    identity_phrases = ['I am an AI language model',
                        'I am a helpful assistant designed',
                        'I am an AI assistant designed to help answer questions and provide information']
    story_phrases = ['Once upon a time, in a far-off land,', 'Once upon a time']
    # calls 2 and 3 are checked against the story phrases, all others against the identity phrases
    for position, ret in enumerate((ret1, ret2, ret3, ret4, ret5, ret6, ret7), start=1):
        accepted = story_phrases if position in (2, 3) else identity_phrases
        assert any(phrase in ret['response'] for phrase in accepted)
    print("DONE", flush=True)
def test_gradio_tgi_docker(base_model):
    """Run a TGI container plus an h2oGPT container that consumes it, and test with a client.

    The TGI container listens on ``get_inf_port() + 1`` and the h2oGPT container is
    started with ``get_inf_port()``.  Every container that was started is stopped at
    the end, including when starting the second container or a later step fails.
    """
    # HF inference server
    gradio_port = get_inf_port()
    inf_port = gradio_port + 1
    inference_server = 'http://127.0.0.1:%s' % inf_port

    # ids of containers started so far; the finally block stops exactly these
    started = []
    try:
        docker_hash1 = run_docker(inf_port, base_model, low_mem_mode=True, do_shared=False)
        started.append(docker_hash1)
        os.system('docker logs %s | tail -10' % docker_hash1)

        # h2oGPT server
        docker_hash2 = run_h2ogpt_docker(gradio_port, base_model, inference_server=inference_server)
        started.append(docker_hash2)
        time.sleep(30)  # assumes image already downloaded, else need more time
        os.system('docker logs %s | tail -10' % docker_hash2)

        # test this version for now, until docker updated
        version = 1

        # client test to server that only consumes inference server
        prompt = 'Who are you?'
        print("Starting client tests with prompt: %s using %s" % (prompt, get_inf_server()))
        from src.client_test import run_client_chat
        res_dict, client = run_client_chat(prompt=prompt,
                                           stream_output=True,
                                           max_new_tokens=256,
                                           langchain_mode='Disabled',
                                           langchain_action=LangChainAction.QUERY.value,
                                           langchain_agents=[],
                                           version=version)
        assert res_dict['prompt'] == prompt
        assert res_dict['iinput'] == ''

        # will use HOST from above
        # client shouldn't have to specify
        ret1, ret2, ret3, ret4, ret5, ret6, ret7 = run_client_many(prompt_type=None, version=version)
        if 'llama' in base_model.lower():
            who = "I'm LLaMA, an AI assistant developed by Meta AI"
        else:
            who = 'I am an AI language model'
        # calls 2 and 3 are checked for the story opening, all others for the identity phrase
        for position, ret in enumerate((ret1, ret2, ret3, ret4, ret5, ret6, ret7), start=1):
            response = ret['response']
            expected = 'Once upon a time' if position in (2, 3) else who
            assert expected in response, "response %d %r lacks %r" % (position, response, expected)
        print("DONE", flush=True)
    finally:
        for docker_hash in started:
            os.system("docker stop %s" % docker_hash)
def test_gradio_vllm_docker(base_model):
    """Run a vLLM container plus an h2oGPT container that consumes it, and test with a client.

    The vLLM container listens on ``get_inf_port() + 1`` and is addressed as
    ``vllm:127.0.0.1:<port>``; the h2oGPT container is started with ``get_inf_port()``.
    Every container that was started is stopped at the end, including when starting
    the second container or a later step fails.
    """
    # vLLM inference server
    gradio_port = get_inf_port()
    inf_port = gradio_port + 1
    inference_server = 'vllm:127.0.0.1:%s' % inf_port
    if 'llama' in base_model:
        tokenizer = 'hf-internal-testing/llama-tokenizer'
    else:
        tokenizer = None

    # ids of containers started so far; the finally block stops exactly these
    started = []
    try:
        docker_hash1 = run_vllm_docker(inf_port, base_model, tokenizer)
        started.append(docker_hash1)
        os.system('docker logs %s | tail -10' % docker_hash1)

        # h2oGPT server
        docker_hash2 = run_h2ogpt_docker(gradio_port, base_model, inference_server=inference_server)
        started.append(docker_hash2)
        time.sleep(30)  # assumes image already downloaded, else need more time
        os.system('docker logs %s | tail -10' % docker_hash2)

        # test this version for now, until docker updated
        version = 1

        # client test to server that only consumes inference server
        prompt = 'Who are you?'
        print("Starting client tests with prompt: %s using %s" % (prompt, get_inf_server()))
        from src.client_test import run_client_chat
        res_dict, client = run_client_chat(prompt=prompt,
                                           stream_output=True,
                                           max_new_tokens=256,
                                           langchain_mode='Disabled',
                                           langchain_action=LangChainAction.QUERY.value,
                                           langchain_agents=[],
                                           version=version)
        assert res_dict['prompt'] == prompt
        assert res_dict['iinput'] == ''

        # will use HOST from above
        # client shouldn't have to specify
        ret1, ret2, ret3, ret4, ret5, ret6, ret7 = run_client_many(prompt_type=None, version=version)
        if 'llama' in base_model.lower():
            who = "I'm LLaMA, an AI assistant developed by Meta AI"
        else:
            who = 'I am an AI language model'
        # calls 2 and 3 are checked for the story opening, all others for the identity phrase
        for position, ret in enumerate((ret1, ret2, ret3, ret4, ret5, ret6, ret7), start=1):
            response = ret['response']
            expected = 'Once upon a time' if position in (2, 3) else who
            assert expected in response, "response %d %r lacks %r" % (position, response, expected)
        print("DONE", flush=True)
    finally:
        for docker_hash in started:
            os.system("docker stop %s" % docker_hash)
def test_replicate_inference_server(force_langchain_evaluate,
                                    chat_conversation,
                                    system_prompt,
                                    prompt='Who are you?', stream_output=False,
                                    max_new_tokens=128,  # limit cost
                                    base_model='h2oai/h2ogpt-4096-llama2-7b-chat',
                                    langchain_mode='Disabled',
                                    langchain_action=LangChainAction.QUERY.value,
                                    langchain_agents=[],
                                    user_path=None,
                                    langchain_modes=['UserData', 'MyData', 'LLM', 'Disabled'],
                                    docs_ordering_type='reverse_sort'):
    """Start a gradio server that forwards to a ``replicate:`` model and check client replies.

    Which checks run depends on the inputs:
    * both ``chat_conversation`` and ``system_prompt``: only the echoed prompt is checked;
    * exactly one of them: the chat reply and a lean API reply are checked, then the test ends;
    * neither: the seven ``run_client_many`` replies are checked for known phrases.
    """
    if force_langchain_evaluate:
        langchain_mode = 'MyData'

    main_kwargs = {
        'base_model': base_model,
        'chat': True,
        'stream_output': stream_output,
        'gradio': True,
        'num_beams': 1,
        'block_gradio_exit': False,
        'max_new_tokens': max_new_tokens,
        'langchain_mode': langchain_mode,
        'langchain_action': langchain_action,
        'langchain_agents': langchain_agents,
        'user_path': user_path,
        'langchain_modes': langchain_modes,
        'docs_ordering_type': docs_ordering_type,
    }

    # server that consumes inference server
    from src.gen import main
    # https://replicate.com/lucataco/llama-2-7b-chat
    # model_string = "lucataco/llama-2-7b-chat:6ab580ab4eef2c2b440f2441ec0fc0ace5470edaf2cbea50b8550aec0b3fbd38"
    model_string = "meta/llama-2-7b-chat:8e6975e5ed6174911a6ff3d60540dfd4844201974602551e10e9e87ab143d81e"
    main(**main_kwargs, inference_server='replicate:%s' % model_string)

    if chat_conversation:
        prompt = 'What did I ask?'

    # client test to server that only consumes inference server
    from src.client_test import run_client_chat
    res_dict, client = run_client_chat(prompt=prompt, prompt_type='llama2', stream_output=stream_output,
                                       max_new_tokens=max_new_tokens, langchain_mode=langchain_mode,
                                       langchain_action=langchain_action, langchain_agents=langchain_agents,
                                       chat_conversation=chat_conversation,
                                       system_prompt=system_prompt)
    assert res_dict['prompt'] == prompt
    assert res_dict['iinput'] == ''

    if chat_conversation and system_prompt:
        # TODO: don't check yet, system_prompt ignored if response from LLM is as if no system prompt
        return

    if chat_conversation or system_prompt:
        lean_ret, _ = test_client_basic_api_lean(prompt=prompt, prompt_type=None,
                                                 chat_conversation=chat_conversation,
                                                 system_prompt=system_prompt)
        if not system_prompt:
            options_response = ['You asked "Who are you?"',
                                'You asked, "Who are you?"',
                                'You asked: "Who are you?"',
                                ]
            assert res_dict['response'] in options_response
            assert lean_ret['response'] in options_response
        else:
            for ret in (res_dict, lean_ret):
                assert 'baby cat' in ret['response'] and 'meow' in ret['response'].lower()
        # don't test rest, too many cases
        return

    # will use HOST from above
    # client shouldn't have to specify
    ret1, ret2, ret3, ret4, ret5, ret6, ret7 = run_client_many(prompt_type=None)
    who = 'an AI assistant'
    who2 = 'just an AI'
    first_identity_phrases = [who, who2]
    later_identity_phrases = [who, 'I am a helpful assistant designed', who2]
    story_phrases = ['Once upon a time, in a far-off land,', 'Once upon a time']
    for position, ret in enumerate((ret1, ret2, ret3, ret4, ret5, ret6, ret7), start=1):
        if position == 1:
            accepted = first_identity_phrases
        elif position in (2, 3):
            accepted = story_phrases
        else:
            accepted = later_identity_phrases
        assert any(phrase in ret['response'] for phrase in accepted)
    print("DONE", flush=True)