import os

import torch
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Hugging Face access token (needed for the gated Llama 3.1 weights)
token = os.getenv("HUGGINGFACE_TOKEN")


def load_quantized_model():
    """Load the Llama 3.1 8B Instruct model in 4-bit precision."""
    tokenizer = AutoTokenizer.from_pretrained(
        "meta-llama/Llama-3.1-8B-Instruct", token=token
    )
    config = BitsAndBytesConfig(load_in_4bit=True)
    model = AutoModelForCausalLM.from_pretrained(
        "meta-llama/Llama-3.1-8B-Instruct",
        quantization_config=config,
        device_map="auto",
        token=token,
    )
    return model, tokenizer


model, tokenizer = load_quantized_model()


def generate_response(prompt):
    """Simple prediction function for Gradio."""
    # Place the inputs on the same device as the quantized model
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    # Cap generation length so replies are not cut off at the tiny default limit
    outputs = model.generate(**inputs, max_new_tokens=256)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


iface = gr.Interface(
    fn=generate_response,
    inputs="text",
    outputs="text",
    title="Quantized Model Chatbot",
)
iface.launch()