Mr-Vicky-01 committed 4eb3214 · verified · 1 parent: 1daf042

Update README.md

Files changed (1): README.md (+27 −40)
README.md CHANGED
@@ -6,20 +6,25 @@ license: apache-2.0
## INFERENCE

```python
- # Load model directly
- from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

tokenizer = AutoTokenizer.from_pretrained("AquilaX-AI/QnA")
model = AutoModelForCausalLM.from_pretrained("AquilaX-AI/QnA")

prompt = """
<|im_start|>system\nYou are a helpful AI assistant named Securitron<|im_end|>
"""

- # Keep a list for the last one conversation exchanges
conversation_history = []

while True:
    user_prompt = input("\nUser Question: ")
    if user_prompt.lower() == 'break':
@@ -33,47 +38,29 @@
    # Add the user's question to the conversation history
    conversation_history.append(user)

-     # Ensure conversation starts with a user's input and keep only the last 2 exchanges (4 turns)
    conversation_history = conversation_history[-5:]

    # Build the full prompt
    current_prompt = prompt + "\n".join(conversation_history)

    # Tokenize the prompt
-     encodeds = tokenizer(current_prompt, return_tensors="pt", truncation=True).input_ids
-
-     # Move model and inputs to the appropriate device
-     device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-     model.to(device)
-     inputs = encodeds.to(device)
-
-     # Create an empty list to store generated tokens
-     generated_ids = inputs
-
-     # Start generating tokens one by one
-     assistant_response = ""
-     for _ in range(512):  # Specify a max token limit for streaming
-         next_token = model.generate(
-             generated_ids,
-             max_new_tokens=1,
-             pad_token_id=151644,
-             eos_token_id=151645,
-             num_return_sequences=1,
-             do_sample=False,
-             # top_k=5,
-             # temperature=0.2,
-             # top_p=0.90
-         )
-
-         generated_ids = torch.cat([generated_ids, next_token[:, -1:]], dim=1)
-         token_id = next_token[0, -1].item()
-         token = tokenizer.decode([token_id], skip_special_tokens=True)
-
-         assistant_response += token
-         print(token, end="", flush=True)
-
-         if token_id == 151645:  # EOS token
-             break
-
-     conversation_history.append(f"{assistant_response.strip()}<|im_end|>")
```
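
Note: both versions of the snippet build the ChatML-style prompt by hand with `<|im_start|>`/`<|im_end|>` markers. If the AquilaX-AI/QnA tokenizer ships a chat template (an assumption, not confirmed by this commit), the same prompt string could also be produced with `apply_chat_template`; a minimal sketch:

```python
# Minimal sketch, assuming the AquilaX-AI/QnA tokenizer defines a ChatML chat
# template; if it does not, the manual string building in the snippets applies.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("AquilaX-AI/QnA")

messages = [
    {"role": "system", "content": "You are a helpful AI assistant named Securitron"},
    {"role": "user", "content": "Example question"},  # hypothetical user turn
]

# Returns the formatted prompt string, ending with the assistant header so the
# model continues the conversation as the assistant.
prompt_text = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
print(prompt_text)
```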
 
## INFERENCE

```python
+ from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer
import torch

+ # Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("AquilaX-AI/QnA")
model = AutoModelForCausalLM.from_pretrained("AquilaX-AI/QnA")

+ # Define the system prompt
prompt = """
<|im_start|>system\nYou are a helpful AI assistant named Securitron<|im_end|>
"""

+ # Initialize conversation history
conversation_history = []

+ # Set up device
+ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+ model.to(device)
+
while True:
    user_prompt = input("\nUser Question: ")
    if user_prompt.lower() == 'break':

    # Add the user's question to the conversation history
    conversation_history.append(user)

+     # Keep only the last 2 exchanges (4 turns)
    conversation_history = conversation_history[-5:]

    # Build the full prompt
    current_prompt = prompt + "\n".join(conversation_history)

    # Tokenize the prompt
+     encodeds = tokenizer(current_prompt, return_tensors="pt", truncation=True).input_ids.to(device)
+
+     # Initialize TextStreamer for real-time token generation
+     text_streamer = TextStreamer(tokenizer, skip_prompt=True)
+
+     # Generate response with TextStreamer
+     response = model.generate(
+         input_ids=encodeds,
+         streamer=text_streamer,
+         max_new_tokens=512,
+         use_cache=True,
+         pad_token_id=151645,
+         eos_token_id=151645,
+         num_return_sequences=1
+     )
+
+     # Finalize conversation history with the assistant's response
+     conversation_history.append(tokenizer.decode(response[0]).split('<|im_start|>assistant')[-1].split('<|im_end|>')[0].strip() + "<|im_end|>")
```
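
For a quick single-turn check of the updated snippet, the TextStreamer call can be exercised outside the interactive loop. A minimal sketch, assuming the same checkpoint and ChatML markers as above (the user question is a placeholder, and the `151645` end-of-turn id is taken from the snippet):

```python
# Minimal single-turn sketch of the TextStreamer flow from the updated README.
# Assumptions: same AquilaX-AI/QnA checkpoint, ChatML markers, and the 151645
# end-of-turn id used above; the user question is a placeholder.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer

tokenizer = AutoTokenizer.from_pretrained("AquilaX-AI/QnA")
model = AutoModelForCausalLM.from_pretrained("AquilaX-AI/QnA")
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)

prompt = (
    "<|im_start|>system\nYou are a helpful AI assistant named Securitron<|im_end|>\n"
    "<|im_start|>user\nWhat kind of questions can you answer?<|im_end|>\n"
    "<|im_start|>assistant\n"
)
inputs = tokenizer(prompt, return_tensors="pt").input_ids.to(device)

# TextStreamer prints tokens to stdout as they are generated.
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
model.generate(
    input_ids=inputs,
    streamer=streamer,
    max_new_tokens=512,
    use_cache=True,
    eos_token_id=151645,  # end-of-turn id, per the snippet above
    pad_token_id=151645,
)
```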