AquilaX-AI
/

QnA

@@ -5,33 +5,75 @@ license: apache-2.0
 ## INFERENCE
-```Python
-import time
 import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM
-finetuned_model = AutoModelForCausalLM.from_pretrained("AquilaX-AI/QnA")
 tokenizer = AutoTokenizer.from_pretrained("AquilaX-AI/QnA")
-alpaca_prompt = """Below is an instruction that describes a task. Write a response that appropriately completes the request.
-### Instruction:
-what is machine learning?
-### Response:
-"""
-s = time.time()
-prompt = alpaca_prompt
-encodeds = tokenizer(prompt, return_tensors="pt",truncation=True).input_ids
-device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-finetuned_model.to(device)
-inputs = encodeds.to(device)
-# Increase max_new_tokens if needed
-generated_ids = finetuned_model.generate(inputs, max_new_tokens=256, temperature=0.5, top_p=0.90, do_sample=True,pad_token_id=50259,eos_token_id=50259,num_return_sequences=1)
-print(tokenizer.decode(generated_ids[0]).split('### Response:')[1].split('<eos>')[0].strip())
-e = time.time()
-print(f'time taken:{e-s}')
 ```

 ## INFERENCE
+```python
+# Load model directly
+from transformers import AutoModelForCausalLM, AutoTokenizer
 import torch
 tokenizer = AutoTokenizer.from_pretrained("AquilaX-AI/QnA")
+model = AutoModelForCausalLM.from_pretrained("AquilaX-AI/QnA")
+prompt = """
+<|im_start|>system\nYou are a helpful AI assistant named Securitron<|im_end|>
+"""
+# Keep a list for the last one conversation exchanges
+conversation_history = []
+while True:
+    user_prompt = input("\nUser Question: ")
+    if user_prompt.lower() == 'break':
+        break
+    # Format the user's input
+    user = f"""<|im_start|>user
+{user_prompt}<|im_end|>
+<|im_start|>assistant"""
+    # Add the user's question to the conversation history
+    conversation_history.append(user)
+    # Ensure conversation starts with a user's input and keep only the last 2 exchanges (4 turns)
+    conversation_history = conversation_history[-5:]
+    # Build the full prompt
+    current_prompt = prompt + "\n".join(conversation_history)
+    # Tokenize the prompt
+    encodeds = tokenizer(current_prompt, return_tensors="pt", truncation=True).input_ids
+    # Move model and inputs to the appropriate device
+    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+    model.to(device)
+    inputs = encodeds.to(device)
+    # Create an empty list to store generated tokens
+    generated_ids = inputs
+    # Start generating tokens one by one
+    assistant_response = ""
+    for _ in range(512):  # Specify a max token limit for streaming
+        next_token = model.generate(
+            generated_ids,
+            max_new_tokens=1,
+            pad_token_id=151644,
+            eos_token_id=151645,
+            num_return_sequences=1,
+            do_sample=False,
+            # top_k=5,
+            # temperature=0.2,
+            # top_p=0.90
+        )
+        generated_ids = torch.cat([generated_ids, next_token[:, -1:]], dim=1)
+        token_id = next_token[0, -1].item()
+        token = tokenizer.decode([token_id], skip_special_tokens=True)
+        assistant_response += token
+        print(token, end="", flush=True)
+        if token_id == 151645:  # EOS token
+            break
+    conversation_history.append(f"{assistant_response.strip()}<|im_end|>")
 ```