Spaces:

lukestanley
/

ChillTranslator

Sleeping

App Files Files Community

Luke Stanley committed on Feb 27, 2024

Commit

233efeb

1 Parent(s): feeb679

RunPod Mixtral JSON output test

Browse files

Files changed (2) hide show

runpod.dockerfile +9 -0
runpod_handler.py +80 -4

runpod.dockerfile CHANGED Viewed

@@ -10,6 +10,15 @@ ENV HF_HOME="/runpod-volume/.cache/huggingface/"
 RUN python3.11 -m pip install --upgrade pip && \
     python3.11 -m pip install runpod==1.6.0
 ADD runpod_handler.py .
 CMD python3.11 -u /runpod_handler.py

 RUN python3.11 -m pip install --upgrade pip && \
     python3.11 -m pip install runpod==1.6.0
# Build prerequisites and runtime dependencies for the handler.
# llama-cpp-python is intentionally not installed in this layer: it is built
# with CUDA flags in the dedicated step below, which force-reinstalls it
# anyway, so a CPU-only build here would only be compiled and thrown away.
# (pydantic-settings was listed twice under two spellings of the same name.)
RUN python3.11 -m pip install pytest cmake \
    scikit-build setuptools \
    huggingface_hub hf_transfer \
    pydantic pydantic-settings
# Install llama-cpp-python (build with cuda)
ENV CMAKE_ARGS="-DLLAMA_CUBLAS=on"
RUN python3.11 -m pip install llama-cpp-python --upgrade --no-cache-dir --force-reinstall
 ADD runpod_handler.py .
 CMD python3.11 -u /runpod_handler.py

runpod_handler.py CHANGED Viewed

@@ -1,9 +1,82 @@
-""" Example handler file. """
 import runpod
 # If your handler runs inference on a model, load the model here.
 # You will want models to be loaded into memory before starting serverless.
 def handler(job):
@@ -12,7 +85,10 @@ def handler(job):
     name = job_input.get('name', 'World')
-    return f"Hello, {name}!"
 runpod.serverless.start({"handler": handler})

+import json
+from os import environ as env
+from typing import Any, Dict, Union
+from llama_cpp import Llama, LlamaGrammar
+from pydantic import BaseModel, Field
 import runpod
 # If your handler runs inference on a model, load the model here.
 # You will want models to be loaded into memory before starting serverless.
+from huggingface_hub import hf_hub_download
# Hugging Face repositories and GGUF file names.
# NOTE(review): the small_* pair is not referenced anywhere else in the
# visible code; only the big model is fetched below.
small_repo = "TheBloke/phi-2-GGUF"
small_model = "phi-2.Q2_K.gguf"
big_repo = "TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUF"
big_model = "mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf"

# Resolve the model file through the Hugging Face Hub at import time; the
# returned value is later passed to Llama as the model path.
LLM_MODEL_PATH = hf_hub_download(
    repo_id=big_repo,
    filename=big_model,
)
print(f"Model downloaded to {LLM_MODEL_PATH}")

# Placeholder for the in-process Llama instance; populated further down.
in_memory_llm = None

# Runtime settings, overridable through environment variables.
# Environment values are always strings, so every numeric setting is
# converted explicitly (N_GPU_LAYERS previously was not, which handed a str
# to Llama whenever the variable was set).
N_GPU_LAYERS = int(env.get("N_GPU_LAYERS", -1))  # Default to -1, which means use all layers if available
CONTEXT_SIZE = int(env.get("CONTEXT_SIZE", 2048))
USE_HTTP_SERVER = env.get("USE_HTTP_SERVER", "false").lower() == "true"
MAX_TOKENS = int(env.get("MAX_TOKENS", 1000))
TEMPERATURE = float(env.get("TEMPERATURE", 0.3))
# Default output schema: its JSON schema is turned into the grammar that
# constrains generation in llm_stream_sans_network.
# Documented with comments instead of a class docstring on purpose, so the
# schema produced by model_json_schema() is left exactly as it was
# (presumably a docstring would be surfaced as a schema description).
class Movie(BaseModel):
    # All five fields are required (Field(...) declares no default).
    title: str = Field(..., title="The title of the movie")
    year: int = Field(..., title="The year the movie was released")
    director: str = Field(..., title="The director of the movie")
    genre: str = Field(..., title="The genre of the movie")
    plot: str = Field(..., title="Plot summary of the movie")
# One-shot example that the handler interpolates into its prompt.
# The closing brace was missing, which made the example an invalid JSON
# object while the model is being asked to emit valid ones.
JSON_EXAMPLE_MOVIE = """
{ "title": "The Matrix", "year": 1999, "director": "The Wachowskis", "genre": "Science Fiction", "plot":"Prgrammer realises he lives in simulation and plays key role." }
"""

# Load the model once at import time so it is already in memory when the
# serverless worker starts taking jobs.
# NOTE(review): the message refers to USE_HTTP_SERVER, but that flag is not
# consulted here; in_memory_llm is initialised to None above, so the model
# is always loaded.
if in_memory_llm is None:
    print("Loading model into memory. If you didn't want this, set the USE_HTTP_SERVER environment variable to 'true'.")
    in_memory_llm = Llama(model_path=LLM_MODEL_PATH, n_ctx=CONTEXT_SIZE, n_gpu_layers=N_GPU_LAYERS, verbose=True)
def llm_stream_sans_network(
    prompt: str, pydantic_model_class=Movie, return_pydantic_object=False
) -> Union[str, BaseModel]:
    """Generate schema-constrained JSON text with the in-process model.

    The JSON schema of ``pydantic_model_class`` is converted to a llama.cpp
    grammar, and the module-level ``in_memory_llm`` is run in streaming mode
    with that grammar. Each streamed piece of text is echoed to stdout as it
    arrives and the pieces are accumulated into the final output.

    Args:
        prompt: Full prompt text passed to the model.
        pydantic_model_class: Pydantic model class whose JSON schema defines
            the grammar (and, optionally, the type of the returned object).
        return_pydantic_object: When true, parse the generated text with
            ``pydantic_model_class.model_validate_json`` and return the
            resulting instance instead of the raw text.

    Returns:
        The generated text, or an instance of ``pydantic_model_class`` when
        ``return_pydantic_object`` is true.

    Raises:
        Presumably pydantic's validation error when ``return_pydantic_object``
        is true and the generated text does not validate (for example if
        generation stops at ``MAX_TOKENS`` before the object is complete) --
        TODO confirm against the pydantic version in use.
    """
    schema = pydantic_model_class.model_json_schema()
    # Optional example field from schema, is not needed for the grammar generation
    schema.pop("example", None)
    grammar = LlamaGrammar.from_json_schema(json.dumps(schema))

    stream = in_memory_llm(
        prompt,
        max_tokens=MAX_TOKENS,
        temperature=TEMPERATURE,
        grammar=grammar,
        stream=True,
    )

    # Collect the pieces and join once instead of growing a string in a loop.
    pieces = []
    for chunk in stream:
        text = chunk["choices"][0]["text"]
        # Echo immediately so generation progress shows up in the worker log.
        print(text, end='', flush=True)
        pieces.append(text)
    print('\n')
    output_text = "".join(pieces)

    if return_pydantic_object:
        return pydantic_model_class.model_validate_json(output_text)
    return output_text
 def handler(job):
     name = job_input.get('name', 'World')
+    #return f"Hello, {name}!"
+    return llm_stream_sans_network(
+        f"""You need to output JSON objects describing movies.
+        For example for the movie called: `The Matrix`: Output: {JSON_EXAMPLE_MOVIE}
+        Instruct: Output the JSON object for the movie: `{name}` Output: """)
 runpod.serverless.start({"handler": handler})