Mr-Vicky-01 committed
Commit cc809e5 · 1 Parent(s): 1e04a23

Update README.md

Files changed (1)
  1. README.md +62 -20
README.md CHANGED
@@ -5,33 +5,75 @@ license: apache-2.0

  ## INFERENCE

- ```Python
- import time
  import torch
- from transformers import AutoTokenizer, AutoModelForCausalLM

- finetuned_model = AutoModelForCausalLM.from_pretrained("AquilaX-AI/QnA")
  tokenizer = AutoTokenizer.from_pretrained("AquilaX-AI/QnA")

- alpaca_prompt = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

- ### Instruction:
- what is machine learning?

- ### Response:
- """

- s = time.time()
- prompt = alpaca_prompt
- encodeds = tokenizer(prompt, return_tensors="pt", truncation=True).input_ids

- device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
- finetuned_model.to(device)
- inputs = encodeds.to(device)

- # Increase max_new_tokens if needed
- generated_ids = finetuned_model.generate(inputs, max_new_tokens=256, temperature=0.5, top_p=0.90, do_sample=True, pad_token_id=50259, eos_token_id=50259, num_return_sequences=1)
- print(tokenizer.decode(generated_ids[0]).split('### Response:')[1].split('<eos>')[0].strip())
- e = time.time()
- print(f'time taken:{e-s}')
  ```
 

  ## INFERENCE

+ ```python
+ # Load model directly
+ from transformers import AutoModelForCausalLM, AutoTokenizer
  import torch

  tokenizer = AutoTokenizer.from_pretrained("AquilaX-AI/QnA")
+ model = AutoModelForCausalLM.from_pretrained("AquilaX-AI/QnA")

+ prompt = """
+ <|im_start|>system\nYou are a helpful AI assistant named Securitron<|im_end|>
+ """

+ # Keep a short history of recent conversation turns
+ conversation_history = []

+ while True:
+     user_prompt = input("\nUser Question: ")
+     if user_prompt.lower() == 'break':
+         break
+
+     # Format the user's input with the chat template markers
+     user = f"""<|im_start|>user
+ {user_prompt}<|im_end|>
+ <|im_start|>assistant"""
+
+     # Add the user's question to the conversation history
+     conversation_history.append(user)
+
+     # Keep only the last 5 turns so the history starts with a user message and stays short
+     conversation_history = conversation_history[-5:]
+
+     # Build the full prompt
+     current_prompt = prompt + "\n".join(conversation_history)
+
+     # Tokenize the prompt
+     encodeds = tokenizer(current_prompt, return_tensors="pt", truncation=True).input_ids
+
+     # Move model and inputs to the appropriate device
+     device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+     model.to(device)
+     inputs = encodeds.to(device)
+
+     # Start the generated sequence from the prompt tokens
+     generated_ids = inputs
+
+     # Generate and stream tokens one by one
+     assistant_response = ""
+     for _ in range(512):  # max token limit for streaming
+         next_token = model.generate(
+             generated_ids,
+             max_new_tokens=1,
+             pad_token_id=151644,
+             eos_token_id=151645,
+             num_return_sequences=1,
+             do_sample=False,
+             # top_k=5,
+             # temperature=0.2,
+             # top_p=0.90
+         )
+
+         # Append the newly generated token to the running sequence
+         generated_ids = torch.cat([generated_ids, next_token[:, -1:]], dim=1)
+         token_id = next_token[0, -1].item()
+         token = tokenizer.decode([token_id], skip_special_tokens=True)
+
+         assistant_response += token
+         print(token, end="", flush=True)
+
+         if token_id == 151645:  # <|im_end|> (EOS) token
+             break
+
+     # Store the assistant's reply so later turns keep context
+     conversation_history.append(f"{assistant_response.strip()}<|im_end|>")
  ```
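
Side note (not part of this commit): the loop above calls `generate` once per token, re-encoding the growing prompt on every step. If the installed `transformers` version provides `TextStreamer`, a single `generate` call can stream output instead. A minimal sketch, reusing the model name, chat markers, and special-token IDs from the README code above:

```python
# Sketch only: stream with TextStreamer instead of a per-token generate() loop.
# Assumes the same model, chat markers, and special-token IDs as the README above.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer

tokenizer = AutoTokenizer.from_pretrained("AquilaX-AI/QnA")
model = AutoModelForCausalLM.from_pretrained("AquilaX-AI/QnA")

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)

prompt = (
    "<|im_start|>system\nYou are a helpful AI assistant named Securitron<|im_end|>\n"
    "<|im_start|>user\nwhat is machine learning?<|im_end|>\n"
    "<|im_start|>assistant"
)
inputs = tokenizer(prompt, return_tensors="pt").input_ids.to(device)

# skip_prompt=True prints only newly generated text, not the prompt itself
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

model.generate(
    inputs,
    streamer=streamer,
    max_new_tokens=512,
    do_sample=False,
    pad_token_id=151644,  # IDs taken from the README code above
    eos_token_id=151645,
)
```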