import spaces
import os
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from huggingface_hub import login

# Get the Hugging Face token from environment variables
huggingface_token = os.getenv("HF_TOKEN")
if huggingface_token is None:
    raise ValueError("Hugging Face token not set. Please set the HF_TOKEN environment variable.")

# Login using the Hugging Face token
login(huggingface_token)

# Load the model and tokenizer
model_name = "meta-llama/Meta-Llama-3.1-8B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Move the model to GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)


# Define the prediction function
@spaces.GPU(duration=120)  # Request GPU resources for up to 120 seconds per call
def predict(input_text, temperature=0.2):
    try:
        inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)
        # Sampling must be enabled for the temperature setting to have an effect
        outputs = model.generate(
            inputs,
            do_sample=True,
            temperature=temperature,
            max_new_tokens=50,
        )
        prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
        return prediction
    except Exception as e:
        return str(e)


# Create the Gradio interface
interface = gr.Interface(
    fn=predict,
    inputs=[
        gr.Textbox(lines=2, placeholder="Enter text here...", label="Input Text"),
        # Temperature must be strictly positive when sampling is enabled
        gr.Slider(minimum=0.1, maximum=1, value=0.2, label="Temperature"),
    ],
    outputs=gr.Textbox(label="Output Text"),
    title="Transformer Model Prediction",
    description="Enter text and adjust the temperature to get predictions from the transformer model.",
)

# Launch the Gradio app
interface.launch(server_name="0.0.0.0", server_port=7860)