Yash Sachdeva committed · Commit e48a0c0
1 Parent(s): da45991
llm cpp
- Dockerfile +2 -0
- question_paper.py +14 -16
- requirements.txt +2 -1
Dockerfile
CHANGED
@@ -16,6 +16,8 @@ RUN pip install torch
 
 RUN pip install accelerate
 
+RUN CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS" pip install llama-cpp-python
+
 # Install requirements.txt
 RUN pip install --no-cache-dir --upgrade -r /requirements.txt
 
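The added RUN line builds llama-cpp-python from source with OpenBLAS enabled; CMAKE_ARGS is picked up by the package's CMake-based build during pip install. A minimal sanity check is sketched below (not part of the commit; the image name is an assumption): it simply imports the package inside the container to confirm the wheel built and loads.

# sanity_check.py - a minimal sketch to confirm the llama-cpp-python build in the image imports.
# Could be run with e.g.: docker run <image-name> python sanity_check.py  (image name is hypothetical)
import llama_cpp
from llama_cpp import Llama  # the high-level class used by question_paper.py

# The package exposes __version__; printing it confirms the install is importable.
print("llama-cpp-python version:", llama_cpp.__version__)
print("Llama class loaded:", Llama.__name__)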
question_paper.py
CHANGED
@@ -6,28 +6,26 @@ from fastapi import FastAPI
 
 from transformers import AutoTokenizer
 
+from llama_cpp import Llama
+
 # Load the model
 
 app = FastAPI()
-model = "meta-llama/Llama-2-7b-hf"
-access_token = os.getenv("access_token")
 @app.get("/")
 def llama():
-
-
-
-
-
-        'I liked "Breaking Bad" and "Band of Brothers". Do you have any recommendations of other shows I might like?\n',
-        do_sample=True,
-        top_k=10,
-        num_return_sequences=1,
-        eos_token_id=tokenizer.eos_token_id,
-        max_length=200,
+    llm = Llama(
+        model_path="./models/7B/llama-model.gguf",
+        # n_gpu_layers=-1, # Uncomment to use GPU acceleration
+        # seed=1337, # Uncomment to set a specific seed
+        # n_ctx=2048, # Uncomment to increase the context window
     )
 
-
-
+    output = llm(
+        "Q: Name the planets in the solar system? A: ", # Prompt
+        max_tokens=32, # Generate up to 32 tokens, set to None to generate up to the end of the context window
+        stop=["Q:", "\n"], # Stop generating just before the model would generate a new question
+        echo=True # Echo the prompt back in the output
+    ) # Generate a completion, can also call create_completion
 
-    return {"output":
+    return {"output": output}
 
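With this change the route returns a llama-cpp-python completion instead of output from the transformers pipeline. A quick way to exercise the updated endpoint is sketched below; the base URL is an assumption (wherever uvicorn or the Space serves the app), and it uses the requests package already present in requirements.txt.

# call_endpoint.py - a small sketch (not part of the commit) for hitting the GET "/" route.
import requests

BASE_URL = "http://localhost:8000"  # assumption: adjust to where the FastAPI app is served

resp = requests.get(f"{BASE_URL}/")
resp.raise_for_status()

# The route returns {"output": output}; llama-cpp-python's completion dict keeps the
# generated text under output["choices"][0]["text"].
data = resp.json()
print(data["output"]["choices"][0]["text"])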
requirements.txt
CHANGED
@@ -4,4 +4,5 @@ requests
 pydantic==1.10.12
 langchain
 clarifai
-Pillow
+Pillow
+llama-cpp-python -C cmake.args="-DLLAMA_BLAS=ON;-DLLAMA_BLAS_VENDOR=OpenBLAS"
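The new requirement uses pip's per-requirement -C/--config-settings option to pass the same OpenBLAS flags to the llama-cpp-python build as the CMAKE_ARGS line in the Dockerfile, so both install paths should produce an OpenBLAS-enabled wheel. A rough check that the BLAS backend is active is sketched below; it assumes the GGUF file referenced by question_paper.py exists locally and relies on llama.cpp's verbose system-info log (a BLAS = 1 entry there indicates the accelerated build).

# check_blas.py - a rough sketch (not part of the commit); the model path is copied from
# question_paper.py and must exist locally for the load to succeed.
from llama_cpp import Llama

# verbose=True makes llama.cpp print its system info (including the BLAS flag) on load.
llm = Llama(model_path="./models/7B/llama-model.gguf", verbose=True)

# Reuse the endpoint's prompt and settings to confirm generation works end to end.
out = llm("Q: Name the planets in the solar system? A: ", max_tokens=32, stop=["Q:", "\n"], echo=True)
print(out["choices"][0]["text"])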