---
library_name: transformers
license: apache-2.0
---

## INFERENCE

```python
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer
import torch

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("AquilaX-AI/QnA")
model = AutoModelForCausalLM.from_pretrained("AquilaX-AI/QnA")

# Define the system prompt
prompt = """<|im_start|>system
You are a helpful AI assistant named Securitron<|im_end|>
"""

# Initialize conversation history
conversation_history = []

# Set up device
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)

while True:
    user_prompt = input("\nUser Question: ")
    if user_prompt.lower() == 'break':
        break

    # Format the user's input as a ChatML turn, ending with the opening
    # assistant tag so the model continues with its answer
    user = f"""<|im_start|>user
{user_prompt}<|im_end|>
<|im_start|>assistant"""

    # Add the user's question to the conversation history
    conversation_history.append(user)

    # Keep only the last 2 exchanges (4 entries: 2 user + 2 assistant)
    conversation_history = conversation_history[-4:]

    # Build the full prompt
    current_prompt = prompt + "\n".join(conversation_history)

    # Tokenize the prompt
    encodeds = tokenizer(current_prompt, return_tensors="pt", truncation=True).input_ids.to(device)

    # Initialize TextStreamer for real-time token generation
    text_streamer = TextStreamer(tokenizer, skip_prompt=True)

    # Generate a response, streaming tokens as they are produced
    # (151645 is the <|im_end|> token id, used as both pad and eos)
    response = model.generate(
        input_ids=encodeds,
        streamer=text_streamer,
        max_new_tokens=512,
        use_cache=True,
        pad_token_id=151645,
        eos_token_id=151645,
        num_return_sequences=1,
    )

    # Append the assistant's reply (the text between its tag and <|im_end|>) to the history
    conversation_history.append(
        tokenizer.decode(response[0]).split('<|im_start|>assistant')[-1].split('<|im_end|>')[0].strip() + "<|im_end|>"
    )
```
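
The manual `<|im_start|>`/`<|im_end|>` bookkeeping above can usually be delegated to the tokenizer. Below is a minimal single-turn sketch assuming the `AquilaX-AI/QnA` tokenizer ships a ChatML-style chat template (the hardcoded id 151645 maps to `<|im_end|>` in Qwen-family tokenizers, which suggests it does); verify with `tokenizer.chat_template` before relying on it.

```python
# Build the prompt with the tokenizer's built-in chat template
# (assumes a ChatML-style template is defined for this checkpoint).
messages = [
    {"role": "system", "content": "You are a helpful AI assistant named Securitron"},
    {"role": "user", "content": "What is a SQL injection vulnerability?"},  # example question
]

# add_generation_prompt=True appends the opening assistant tag so the
# model continues with an answer rather than a new user turn.
input_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
).to(device)

# Derive the end-of-turn id from the vocabulary instead of hardcoding it.
im_end_id = tokenizer.convert_tokens_to_ids("<|im_end|>")

output = model.generate(
    input_ids=input_ids,
    max_new_tokens=512,
    pad_token_id=im_end_id,
    eos_token_id=im_end_id,
)

# Decode only the newly generated tokens.
print(tokenizer.decode(output[0][input_ids.shape[-1]:], skip_special_tokens=True))
```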
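
On a GPU, loading the weights in half precision roughly halves memory use. A sketch, assuming a CUDA device with enough VRAM and the `accelerate` package installed for `device_map`:

```python
# Half-precision loading (assumption: CUDA device; use torch.bfloat16
# on Ampere or newer GPUs if preferred).
model = AutoModelForCausalLM.from_pretrained(
    "AquilaX-AI/QnA",
    torch_dtype=torch.float16,
    device_map="auto",  # requires `pip install accelerate`
)
```

With `device_map="auto"` the weights are placed on the available device automatically, so the explicit `model.to(device)` call from the main loop is not needed.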