---
library_name: transformers
license: apache-2.0
---
## INFERENCE
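The snippet below runs an interactive chat loop: it wraps each question in ChatML markers, keeps a short rolling conversation history, and streams the reply token by token with `TextStreamer`. Type `break` to exit the loop.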
```python
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer
import torch

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("AquilaX-AI/QnA")
model = AutoModelForCausalLM.from_pretrained("AquilaX-AI/QnA")

# Define the system prompt (ChatML format)
prompt = "<|im_start|>system\nYou are a helpful AI assistant named Securitron<|im_end|>\n"

# Initialize conversation history
conversation_history = []

# Set up device
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)

while True:
    user_prompt = input("\nUser Question: ")
    if user_prompt.lower() == 'break':
        break

    # Format the user's input as a ChatML turn, ending with the assistant header
    user = f"""<|im_start|>user
{user_prompt}<|im_end|>
<|im_start|>assistant"""

    # Add the user's question to the conversation history
    conversation_history.append(user)

    # Keep the current question plus the last two exchanges (5 entries)
    conversation_history = conversation_history[-5:]

    # Build the full prompt
    current_prompt = prompt + "\n".join(conversation_history)

    # Tokenize the prompt, keeping the attention mask for generate
    inputs = tokenizer(current_prompt, return_tensors="pt", truncation=True).to(device)

    # Initialize TextStreamer for real-time token generation
    text_streamer = TextStreamer(tokenizer, skip_prompt=True)

    # Generate the response; 151645 is the <|im_end|> token id
    response = model.generate(
        **inputs,
        streamer=text_streamer,
        max_new_tokens=512,
        use_cache=True,
        pad_token_id=151645,
        eos_token_id=151645,
        num_return_sequences=1,
    )

    # Append the assistant's reply to the conversation history
    assistant_reply = (
        tokenizer.decode(response[0])
        .split('<|im_start|>assistant')[-1]
        .split('<|im_end|>')[0]
        .strip()
    )
    conversation_history.append(assistant_reply + "<|im_end|>")
```
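If the repo's tokenizer ships a chat template (an assumption worth checking via `tokenizer.chat_template`; the `<|im_start|>`/`<|im_end|>` markers above are ChatML-style), the same prompt can be built without hand-writing special tokens. A minimal single-turn sketch, reusing the `tokenizer`, `model`, and `device` from above:

```python
# Minimal sketch, assuming this tokenizer defines a ChatML chat template.
messages = [
    {"role": "system", "content": "You are a helpful AI assistant named Securitron"},
    {"role": "user", "content": "What can you help me with?"},  # example question
]
input_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,  # append the assistant header so generation starts there
    return_tensors="pt",
).to(device)

output = model.generate(input_ids, max_new_tokens=512, eos_token_id=151645)
# Decode only the newly generated tokens
print(tokenizer.decode(output[0][input_ids.shape[-1]:], skip_special_tokens=True))
```

Using the template keeps the prompt format identical to whatever the model was trained on, which is less error-prone than concatenating marker strings by hand.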