Luke Stanley committed
Commit · 74d6e52 · 1 Parent(s): a0f49a0

Auto-downloads model if env var is not set
utils.py CHANGED
@@ -1,9 +1,16 @@
 import json
+from os import environ as env
 from typing import Any, Dict, Union
 import requests
 
+from huggingface_hub import hf_hub_download
 from llama_cpp import Llama, LlamaGrammar, json_schema_to_gbnf
 
+# There are two ways to use the LLM model:
+# 1. Use the HTTP server (USE_HTTP_SERVER=True); this is good for development,
+#    when you want to change the logic of the translator without restarting the server.
+# 2. Load the model into memory.
+# When using the HTTP server, it must be run separately. See the README for instructions.
 # The llama_cpp Python HTTP server communicates with the AI model, similar
 # to the OpenAI API but adds a unique "grammar" parameter.
 # The real OpenAI API has other ways to set the output format.
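
The comment block above says the local server speaks an OpenAI-style chat-completions protocol plus a llama.cpp-specific "grammar" field. A minimal sketch of such a request, assuming the standard chat-completions payload shape (the toy GBNF grammar and prompt are invented for illustration):

    import requests

    GBNF = 'root ::= "yes" | "no"'  # toy hand-written GBNF grammar
    payload = {
        "messages": [{"role": "user", "content": "Is the sky blue?"}],
        "grammar": GBNF,  # llama.cpp extension, not part of the OpenAI API
        "max_tokens": 16,
        "temperature": 0.7,
    }
    r = requests.post("http://localhost:5834/v1/chat/completions", json=payload)
    print(r.json()["choices"][0]["message"]["content"])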
@@ -11,8 +18,24 @@ from llama_cpp import Llama, LlamaGrammar, json_schema_to_gbnf
 
 URL = "http://localhost:5834/v1/chat/completions"
 in_memory_llm = None
-
-
+
+
+LLM_MODEL_PATH = env.get("LLM_MODEL_PATH", "")
+USE_HTTP_SERVER = env.get("USE_HTTP_SERVER", "false").lower() == "true"
+
+if len(LLM_MODEL_PATH) > 0:
+    print(f"Using local model from {LLM_MODEL_PATH}")
+else:
+    print("No LLM_MODEL_PATH environment variable set; downloading a model from the Hugging Face Hub")
+    LLM_MODEL_PATH = hf_hub_download(
+        repo_id=env.get("REPO_ID", "TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUF"),
+        filename=env.get("MODEL_FILE", "mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf"),
+    )
+    print(f"Model downloaded to {LLM_MODEL_PATH}")
+
+if in_memory_llm is None and USE_HTTP_SERVER is False:
+    print("Loading model into memory. If you didn't want this, set the USE_HTTP_SERVER environment variable to 'true'.")
+    in_memory_llm = Llama(model_path=LLM_MODEL_PATH)
 
 def llm_streaming(
     prompt: str, pydantic_model_class, return_pydantic_object=False
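
Everything in this hunk runs at module import time, so the mode has to be chosen via environment variables before the module is imported. A hedged usage sketch (the variable names come from the diff; importing the module as utils is an assumption about how the Space consumes it):

    import os

    # Choose the HTTP-server mode so the import below does not load the model
    # into memory. Note that the hf_hub_download fallback above is not gated on
    # USE_HTTP_SERVER, so a missing LLM_MODEL_PATH still triggers a download.
    os.environ["USE_HTTP_SERVER"] = "true"
    os.environ["REPO_ID"] = "TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUF"   # optional override
    os.environ["MODEL_FILE"] = "mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf"  # optional override

    import utils  # module-level setup code runs here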
@@ -83,9 +106,6 @@ def calculate_overall_score(faithfulness, spiciness):
 def llm_stream_sans_network(
     prompt: str, pydantic_model_class, return_pydantic_object=False
 ) -> Union[str, Dict[str, Any]]:
-    global in_memory_llm
-    if in_memory_llm is None:
-        in_memory_llm = Llama(model_path=IN_MEMORY_LLM_PATH)
     schema = pydantic_model_class.model_json_schema()
 
     # Optional example field from schema, is not needed for the grammar generation
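
With the lazy load gone, what remains of llm_stream_sans_network is the schema-to-grammar path. A plausible reconstruction of that step from the names imported at the top of utils.py (the Pydantic model is a made-up stand-in, echoing the faithfulness/spiciness fields visible in the hunk header):

    import json

    from llama_cpp import LlamaGrammar, json_schema_to_gbnf
    from pydantic import BaseModel

    class Review(BaseModel):  # hypothetical stand-in for the repo's real model class
        faithfulness: float
        spiciness: float

    schema = Review.model_json_schema()             # JSON Schema dict from Pydantic
    gbnf = json_schema_to_gbnf(json.dumps(schema))  # JSON Schema -> GBNF rules
    grammar = LlamaGrammar.from_string(gbnf)        # compiled grammar for Llama calls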
@@ -97,6 +117,7 @@ def llm_stream_sans_network(
 
     stream = in_memory_llm(
         prompt,
+        n_ctx=4096,
         max_tokens=1000,
         temperature=0.7,
         grammar=grammar,
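
One caveat worth flagging: in llama-cpp-python, n_ctx (the context window size) is normally a Llama constructor argument, while per-call options are things like max_tokens, temperature, grammar, and stream. If the per-call n_ctx added above is rejected or ignored, the conventional placement looks like this sketch (the model path, prompt, and grammar are placeholders):

    from llama_cpp import Llama, LlamaGrammar

    LLM_MODEL_PATH = "mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf"    # placeholder path
    llm = Llama(model_path=LLM_MODEL_PATH, n_ctx=4096)           # context size set here
    grammar = LlamaGrammar.from_string('root ::= "yes" | "no"')  # toy grammar
    stream = llm(
        "Is water wet? Answer: ",
        max_tokens=1000,
        temperature=0.7,
        grammar=grammar,
        stream=True,  # yield chunks instead of a single completion dict
    )
    for chunk in stream:
        print(chunk["choices"][0]["text"], end="", flush=True)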