Yash Sachdeva committed · Commit e48a0c0
1 Parent(s): da45991
llm cpp
- Dockerfile +2 -0
- question_paper.py +14 -16
- requirements.txt +2 -1
Dockerfile
CHANGED
@@ -16,6 +16,8 @@ RUN pip install torch
 
 RUN pip install accelerate
 
+RUN CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS" pip install llama-cpp-python
+
 # Install requirements.txt
 RUN pip install --no-cache-dir --upgrade -r /requirements.txt
 
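The added RUN line builds llama-cpp-python from source with OpenBLAS enabled; CMAKE_ARGS is picked up by the package's CMake-based build during pip install. A minimal sanity check is sketched below (not part of the commit; the image name is an assumption): it simply imports the package inside the container to confirm the wheel built and loads.

# sanity_check.py - a minimal sketch to confirm the llama-cpp-python build in the image imports.
# Could be run with e.g.: docker run <image-name> python sanity_check.py  (image name is hypothetical)
import llama_cpp
from llama_cpp import Llama  # the high-level class used by question_paper.py

# The package exposes __version__; printing it confirms the install is importable.
print("llama-cpp-python version:", llama_cpp.__version__)
print("Llama class loaded:", Llama.__name__)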
question_paper.py
CHANGED
@@ -6,28 +6,26 @@ from fastapi import FastAPI
 
 from transformers import AutoTokenizer
 
+from llama_cpp import Llama
+
 # Load the model
 
 app = FastAPI()
-model = "meta-llama/Llama-2-7b-hf"
-access_token = os.getenv("access_token")
 @app.get("/")
 def llama():
-
-
-
-
-
-        'I liked "Breaking Bad" and "Band of Brothers". Do you have any recommendations of other shows I might like?\n',
-        do_sample=True,
-        top_k=10,
-        num_return_sequences=1,
-        eos_token_id=tokenizer.eos_token_id,
-        max_length=200,
+    llm = Llama(
+        model_path="./models/7B/llama-model.gguf",
+        # n_gpu_layers=-1, # Uncomment to use GPU acceleration
+        # seed=1337, # Uncomment to set a specific seed
+        # n_ctx=2048, # Uncomment to increase the context window
     )
 
-
-
+    output = llm(
+        "Q: Name the planets in the solar system? A: ", # Prompt
+        max_tokens=32, # Generate up to 32 tokens, set to None to generate up to the end of the context window
+        stop=["Q:", "\n"], # Stop generating just before the model would generate a new question
+        echo=True # Echo the prompt back in the output
+    ) # Generate a completion, can also call create_completion
 
-    return {"output":
+    return {"output": output}
 
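With this change the route returns a llama-cpp-python completion instead of output from the transformers pipeline. A quick way to exercise the updated endpoint is sketched below; the base URL is an assumption (wherever uvicorn or the Space serves the app), and it uses the requests package already present in requirements.txt.

# call_endpoint.py - a small sketch (not part of the commit) for hitting the GET "/" route.
import requests

BASE_URL = "http://localhost:8000"  # assumption: adjust to where the FastAPI app is served

resp = requests.get(f"{BASE_URL}/")
resp.raise_for_status()

# The route returns {"output": output}; llama-cpp-python's completion dict keeps the
# generated text under output["choices"][0]["text"].
data = resp.json()
print(data["output"]["choices"][0]["text"])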
requirements.txt
CHANGED
@@ -4,4 +4,5 @@ requests
 pydantic==1.10.12
 langchain
 clarifai
-Pillow
+Pillow
+llama-cpp-python -C cmake.args="-DLLAMA_BLAS=ON;-DLLAMA_BLAS_VENDOR=OpenBLAS"
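The new requirement uses pip's per-requirement -C/--config-settings option to pass the same OpenBLAS flags to the llama-cpp-python build as the CMAKE_ARGS line in the Dockerfile, so both install paths should produce an OpenBLAS-enabled wheel. A rough check that the BLAS backend is active is sketched below; it assumes the GGUF file referenced by question_paper.py exists locally and relies on llama.cpp's verbose system-info log (a BLAS = 1 entry there indicates the accelerated build).

# check_blas.py - a rough sketch (not part of the commit); the model path is copied from
# question_paper.py and must exist locally for the load to succeed.
from llama_cpp import Llama

# verbose=True makes llama.cpp print its system info (including the BLAS flag) on load.
llm = Llama(model_path="./models/7B/llama-model.gguf", verbose=True)

# Reuse the endpoint's prompt and settings to confirm generation works end to end.
out = llm("Q: Name the planets in the solar system? A: ", max_tokens=32, stop=["Q:", "\n"], echo=True)
print(out["choices"][0]["text"])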