# ⚠️ Performance Warning
# Running LLaMA 2 7B on CPU will be very slow. If performance is an issue, consider:
# - Using a smaller or quantized model.
# - Upgrading to a GPU-enabled Hugging Face Space (paid).

import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Load model and tokenizer from local directories
model_path = "./llama-2-7b-chat-w2w"
tokenizer_path = "./llama-2-7b-w2w"

# Force CPU execution
device = "cpu"

# Load model (without bitsandbytes)
model = AutoModelForCausalLM.from_pretrained(model_path, device_map=device)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
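# LLaMA tokenizers typically ship without a pad token; reusing EOS lets the pipeline
# pad inputs if it needs to (defensive addition, not part of the original Space code).
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token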
# Create text-generation pipeline (the model is already placed on CPU via device_map,
# so no device argument is passed here)
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
# Response function
def generate_response(prompt):
    # Wrap the user input in the LLaMA 2 chat instruction format
    result = pipe(f"<s>[INST] waste to wealth idea from {prompt} [/INST] Provide details without repeating the prompt:", max_length=200)
    response = result[0]['generated_text']
    # Keep only the text generated after the instruction suffix
    cleaned_response = response.split('Provide details without repeating the prompt:')[-1].strip()
    return cleaned_response
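# Quick sanity check outside Gradio (hypothetical input; runs only once the model has loaded):
# print(generate_response("plastic bottles"))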
# Create Gradio interface
iface = gr.Interface(
    fn=generate_response,
    inputs=gr.Textbox(label="What waste material would you like to repurpose?", placeholder="e.g., Broken Glass", lines=2),
    outputs=gr.Textbox(label="Generated Response:"),
    title="Waste to Wealth Idea Generator",
    description="Enter a waste item to generate ideas on how to repurpose it. Get creative suggestions!",
)
# Launch the interface
iface.launch()