# ⚠️ Performance Warning
# Running LLaMA 2 7B on CPU will be very slow. If performance is an issue, consider:
#   - using a smaller or quantized model (e.g., a 4-bit quantized variant of this one), or
#   - upgrading to a GPU-enabled Hugging Face Space (paid).
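# A minimal sketch of the quantized-loading alternative mentioned above,
# assuming the optional bitsandbytes package and a CUDA GPU are available
# (which is why it stays commented out in this CPU-only Space; model_path
# is defined below):
#
#   from transformers import BitsAndBytesConfig
#   bnb_config = BitsAndBytesConfig(load_in_4bit=True)
#   model = AutoModelForCausalLM.from_pretrained(
#       model_path, quantization_config=bnb_config, device_map="auto"
#   )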
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
# Paths to the fine-tuned model and its tokenizer (local directories)
model_path = "./llama-2-7b-chat-w2w"
tokenizer_path = "./llama-2-7b-w2w"
# Force CPU execution
device = "cpu"
# Load model (without bitsandbytes quantization)
model = AutoModelForCausalLM.from_pretrained(model_path, device_map=device)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
# Create the text-generation pipeline. The model already carries its device
# placement from device_map, so no device argument is passed here;
# max_new_tokens bounds only the generated text (max_length would also
# count the prompt tokens).
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_new_tokens=200)
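# The pipeline returns one dict per input prompt; only the structure below
# is guaranteed, the text itself is illustrative:
#   [{'generated_text': '<s>[INST] ... [/INST] ...'}]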
# Response function: wraps the user input in the LLaMA 2 [INST] chat template
def generate_response(prompt):
    # The text placed after [/INST] serves as a marker so the model's answer
    # can be split out of the returned string below
    result = pipe(f"<s>[INST] waste to wealth idea from {prompt} [/INST] Provide details without repeating the prompt:")
    response = result[0]['generated_text']
    # Keep only what the model generated after the marker
    cleaned_response = response.split('Provide details without repeating the prompt:')[-1].strip()
    return cleaned_response
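# Illustrative call (the output shown is hypothetical; actual generations vary):
#   generate_response("Broken Glass")
#   # -> "Crushed glass can be used as aggregate in terrazzo tiles..."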
# Create Gradio interface
iface = gr.Interface(
    fn=generate_response,
    inputs=gr.Textbox(label="What waste material would you like to repurpose?", placeholder="e.g., Broken Glass", lines=2),
    outputs=gr.Textbox(label="Generated Response"),
    title="Waste to Wealth Idea Generator",
    description="Enter a waste item to generate ideas on how to repurpose it. Get creative suggestions!",
)
# Launch the interface
iface.launch()