Spaces:

lukestanley
/

ChillTranslator

Sleeping

App Files Community

Luke Stanley committed on Feb 27, 2024

Commit

976ea17

1 Parent(s): 233efeb

Expose json typed LLM interface for RunPod

Browse files

Files changed (4) hide show

docker-compose.yml +11 -0
runpod.dockerfile +12 -2
runpod_handler.py +22 -75
test.sh +28 -0

docker-compose.yml ADDED Viewed

	@@ -0,0 +1,11 @@

+version: '3.8'
+services:
+  runpod:
+    build:
+      context: .
+      dockerfile: runpod.dockerfile
+    volumes:
+      - ./.cache:/runpod-volume/.cache
+      - ./test.sh:/test.sh
+    command: /test.sh
+    entrypoint: /usr/bin/python3

runpod.dockerfile CHANGED Viewed

@@ -15,10 +15,20 @@ RUN python3.11 -m pip install pytest cmake \
     huggingface_hub hf_transfer \
     pydantic pydantic_settings \
     llama-cpp-python
 # Install llama-cpp-python (build with cuda)
 ENV CMAKE_ARGS="-DLLAMA_CUBLAS=on"
-RUN python3.11 -m pip install llama-cpp-python --upgrade --no-cache-dir --force-reinstall
 ADD runpod_handler.py .
 CMD python3.11 -u /runpod_handler.py

     huggingface_hub hf_transfer \
     pydantic pydantic_settings \
     llama-cpp-python
 # Install llama-cpp-python (build with cuda)
 ENV CMAKE_ARGS="-DLLAMA_CUBLAS=on"
+RUN python3.11 -m pip install git+https://github.com/lukestanley/llama-cpp-python.git@expose_json_grammar_convert_function --upgrade --no-cache-dir --force-reinstall
+RUN apt-get update; apt-get install jq -y
 ADD runpod_handler.py .
+ADD chill.py .
+ADD utils.py .
+ADD promptObjects.py .
+#ENV REPO_ID="TheBloke/phi-2-GGUF"
+#ENV MODEL_FILE="phi-2.Q2_K.gguf"
+ENV N_GPU_LAYERS=-1
+ENV CONTEXT_SIZE=2048
 CMD python3.11 -u /runpod_handler.py

runpod_handler.py CHANGED Viewed

@@ -1,34 +1,7 @@
-import json
 from os import environ as env
-from typing import Any, Dict, Union
-from llama_cpp import Llama, LlamaGrammar
 from pydantic import BaseModel, Field
-import runpod
-# If your handler runs inference on a model, load the model here.
-# You will want models to be loaded into memory before starting serverless.
-from huggingface_hub import hf_hub_download
-small_repo = "TheBloke/phi-2-GGUF"
-small_model="phi-2.Q2_K.gguf"
-big_repo = "TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUF"
-big_model = "mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf"
-LLM_MODEL_PATH =hf_hub_download(
-    repo_id=big_repo,
-    filename=big_model,
-)
-print(f"Model downloaded to {LLM_MODEL_PATH}")
-in_memory_llm = None
-N_GPU_LAYERS = env.get("N_GPU_LAYERS", -1) # Default to -1, which means use all layers if available
-CONTEXT_SIZE = int(env.get("CONTEXT_SIZE", 2048))
-USE_HTTP_SERVER = env.get("USE_HTTP_SERVER", "false").lower() == "true"
-MAX_TOKENS = int(env.get("MAX_TOKENS", 1000))
-TEMPERATURE = float(env.get("TEMPERATURE", 0.3))
 class Movie(BaseModel):
     title: str = Field(..., title="The title of the movie")
     year: int = Field(..., title="The year the movie was released")
@@ -36,17 +9,7 @@ class Movie(BaseModel):
     genre: str = Field(..., title="The genre of the movie")
     plot:  str = Field(..., title="Plot summary of the movie")
-JSON_EXAMPLE_MOVIE = """
-{ "title": "The Matrix", "year": 1999, "director": "The Wachowskis", "genre": "Science Fiction", "plot":"Prgrammer realises he lives in simulation and plays key role."
-"""
-if in_memory_llm is None:
-    print("Loading model into memory. If you didn't want this, set the USE_HTTP_SERVER environment variable to 'true'.")
-    in_memory_llm = Llama(model_path=LLM_MODEL_PATH, n_ctx=CONTEXT_SIZE, n_gpu_layers=N_GPU_LAYERS, verbose=True)
-def llm_stream_sans_network(
-    prompt: str, pydantic_model_class=Movie, return_pydantic_object=False
-) -> Union[str, Dict[str, Any]]:
     schema = pydantic_model_class.model_json_schema()
     # Optional example field from schema, is not needed for the grammar generation
@@ -54,41 +17,25 @@ def llm_stream_sans_network(
         del schema["example"]
     json_schema = json.dumps(schema)
-    grammar = LlamaGrammar.from_json_schema(json_schema)
-    stream = in_memory_llm(
-        prompt,
-        max_tokens=MAX_TOKENS,
-        temperature=TEMPERATURE,
-        grammar=grammar,
-        stream=True
-    )
-    output_text = ""
-    for chunk in stream:
-        result = chunk["choices"][0]
-        print(result["text"], end='', flush=True)
-        output_text = output_text + result["text"]
-    print('\n')
-    if return_pydantic_object:
-        model_object = pydantic_model_class.model_validate_json(output_text)
-        return model_object
-    else:
-        return output_text
 def handler(job):
     """ Handler function that will be used to process jobs. """
     job_input = job['input']
-    name = job_input.get('name', 'World')
-    #return f"Hello, {name}!"
-    return llm_stream_sans_network(
-        f"""You need to output JSON objects describing movies.
-        For example for the movie called: `The Matrix`: Output: {JSON_EXAMPLE_MOVIE}
-        Instruct: Output the JSON object for the movie: `{name}` Output: """)
-runpod.serverless.start({"handler": handler})

+import runpod
 from os import environ as env
+import json
 from pydantic import BaseModel, Field
 class Movie(BaseModel):
     title: str = Field(..., title="The title of the movie")
     year: int = Field(..., title="The year the movie was released")
     genre: str = Field(..., title="The genre of the movie")
     plot:  str = Field(..., title="Plot summary of the movie")
+def pydantic_model_to_json_schema(pydantic_model_class):
     schema = pydantic_model_class.model_json_schema()
     # Optional example field from schema, is not needed for the grammar generation
         del schema["example"]
     json_schema = json.dumps(schema)
+    return json_schema
+default_schema_example = """{ "title": ..., "year": ..., "director": ..., "genre": ..., "plot":...}"""
+default_schema = pydantic_model_to_json_schema(Movie)
+default_prompt = f"Instruct: \nOutput a JSON object in this format: {default_schema_example} for the following movie: The Matrix\nOutput:\n"
+from utils import llm_stream_sans_network_simple
 def handler(job):
     """ Handler function that will be used to process jobs. """
     job_input = job['input']
+    filename=env.get("MODEL_FILE", "mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf")
+    prompt = job_input.get('prompt', default_prompt)
+    schema = job_input.get('schema', default_schema)
+    print("got this input", str(job_input))
+    print("prompt", prompt )
+    print("schema", schema )
+    output = llm_stream_sans_network_simple(prompt, schema)
+    #print("got this output", str(output))
+    return f"model:{filename}\n{output}"
+runpod.serverless.start({
+    "handler": handler,
+    #"return_aggregate_stream": True
+})

test.sh ADDED Viewed

	@@ -0,0 +1,28 @@

+#!/usr/bin/env python3
+import os, json
+# Define your JSON and prompt as Python dictionaries and strings
+schema = {
+    "properties": {
+        "title": {"title": "The title of the movie", "type": "string"},
+        "year": {"title": "The year the movie was released", "type": "integer"},
+        "director": {"title": "The director of the movie", "type": "string"},
+        "genre": {"title": "The genre of the movie", "type": "string"},
+        "plot": {"title": "Plot summary of the movie", "type": "string"}
+    },
+    "required": ["title", "year", "director", "genre", "plot"],
+    "title": "Movie",
+    "type": "object"
+}
+movie ="Toy Story"
+prompt = "Instruct: Output a JSON object in this format: { \"title\": ..., \"year\": ..., \"director\": ..., \"genre\": ..., \"plot\":...} for the following movie: "+movie+"\nOutput:\n"
+# Construct the JSON input string
+json_input = json.dumps({"input": {"schema": json.dumps(schema), "prompt": prompt}})
+print(json_input)
+# Define the command to execute your Python script with the JSON string
+command = f'python3.11 runpod_handler.py --test_input \'{json_input}\''
+# Execute the command
+os.system(command)