Yash Sachdeva committed
Commit e48a0c0 · 1 Parent(s): da45991

Files changed (3):
  1. Dockerfile +2 -0
  2. question_paper.py +14 -16
  3. requirements.txt +2 -1
Dockerfile CHANGED
@@ -16,6 +16,8 @@ RUN pip install torch
 
 RUN pip install accelerate
 
+RUN CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS" pip install llama-cpp-python
+
 # Install requirements.txt
 RUN pip install --no-cache-dir --upgrade -r /requirements.txt
 
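Setting CMAKE_ARGS this way makes pip compile llama-cpp-python from source with OpenBLAS enabled rather than installing a generic prebuilt wheel. Below is a minimal smoke-test sketch for the resulting image; the script name and the docker run invocation in the comment are assumptions, not part of this commit.

# check_llama_cpp.py -- hypothetical smoke test, e.g. run inside the built
# image with `docker run --rm <image> python check_llama_cpp.py`.
import llama_cpp

print("llama-cpp-python version:", llama_cpp.__version__)

# Low-level binding, assumed to be exposed by the installed version; its
# output lists the backends the library was compiled with (BLAS included).
print(llama_cpp.llama_print_system_info().decode("utf-8", errors="replace"))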
question_paper.py CHANGED
@@ -6,28 +6,26 @@ from fastapi import FastAPI
 
 from transformers import AutoTokenizer
 
+from llama_cpp import Llama
+
 # Load the model
 
 app = FastAPI()
-model = "meta-llama/Llama-2-7b-hf"
-access_token = os.getenv("access_token")
 @app.get("/")
 def llama():
-    tokenizer = AutoTokenizer.from_pretrained(model, token=access_token)
-
-    pipeline = transformers.pipeline("text-generation", model=model, torch_dtype=torch.float16, device_map="auto")
-
-    sequences = pipeline(
-        'I liked "Breaking Bad" and "Band of Brothers". Do you have any recommendations of other shows I might like?\n',
-        do_sample=True,
-        top_k=10,
-        num_return_sequences=1,
-        eos_token_id=tokenizer.eos_token_id,
-        max_length=200,
+    llm = Llama(
+        model_path="./models/7B/llama-model.gguf",
+        # n_gpu_layers=-1,  # Uncomment to use GPU acceleration
+        # seed=1337,  # Uncomment to set a specific seed
+        # n_ctx=2048,  # Uncomment to increase the context window
     )
 
-    for seq in sequences:
-        print("Result: {seq['generated_text']}")
+    output = llm(
+        "Q: Name the planets in the solar system? A: ",  # Prompt
+        max_tokens=32,  # Generate up to 32 tokens, set to None to generate up to the end of the context window
+        stop=["Q:", "\n"],  # Stop generating just before the model would generate a new question
+        echo=True  # Echo the prompt back in the output
+    )  # Generate a completion, can also call create_completion
 
-    return {"output": sequences[0]["generated_text"]}
+    return {"output": output}
 
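As a quick way to exercise the rewritten route, a minimal client sketch follows; the localhost:8000 address (uvicorn's default) is an assumption, not something this commit configures, and requests is already listed in requirements.txt.

# Hypothetical client check for the new "/" endpoint; the host and port are
# assumed (uvicorn's default), not set anywhere in this commit.
import requests

resp = requests.get("http://localhost:8000/")
resp.raise_for_status()
completion = resp.json()["output"]       # the dict returned by llm(...)
print(completion["choices"][0]["text"])  # the generated text itself

Note that Llama(...) is constructed inside the request handler, so the GGUF model is reloaded on every request; building it once at module level would avoid that cost.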
requirements.txt CHANGED
@@ -4,4 +4,5 @@ requests
 pydantic==1.10.12
 langchain
 clarifai
-Pillow
+Pillow
+llama-cpp-python -C cmake.args="-DLLAMA_BLAS=ON;-DLLAMA_BLAS_VENDOR=OpenBLAS"
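The new requirements line relies on pip's per-requirement config-settings mechanism (-C / --config-settings, which needs a reasonably recent pip) to pass the same OpenBLAS CMake flags that the Dockerfile exports through CMAKE_ARGS. For illustration, here is a sketch of the equivalent one-off install driven from Python; the subprocess call is an example, not part of the commit.

# Illustrative only: installs llama-cpp-python with the same OpenBLAS CMake
# flags that the new requirements.txt line passes via pip's config-settings.
import subprocess
import sys

subprocess.check_call([
    sys.executable, "-m", "pip", "install", "llama-cpp-python",
    "--config-settings", "cmake.args=-DLLAMA_BLAS=ON;-DLLAMA_BLAS_VENDOR=OpenBLAS",
])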