Spaces:

KwabsHug
/

TestCompressedModelzero

Sleeping

File size: 2,334 Bytes

e95ad42
136e821
 
 
e95ad42
 
 
acfdbf2
e95ad42
 
 
 
 
 
136e821
e95ad42
 
 
 
 
 
 
 
1c670bf
acfdbf2
e95ad42
 
 
 
 
136e821
ba8ad86
136e821
e03ccf8
acfdbf2
e03ccf8
136e821
 
e03ccf8
 
 
acfdbf2
 
136e821

import spaces
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import subprocess
import os

def install_cuda_toolkit(): #Swiftly Provided by https://huggingface.co/John6666 to fix OSError: CUDA_HOME environment variable is not set. Please set it to your CUDA install root.
    # CUDA_TOOLKIT_URL = "https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run"
    CUDA_TOOLKIT_URL = "https://developer.download.nvidia.com/compute/cuda/12.2.0/local_installers/cuda_12.2.0_535.54.03_linux.run"
    CUDA_TOOLKIT_FILE = "/tmp/%s" % os.path.basename(CUDA_TOOLKIT_URL)
    subprocess.call(["wget", "-q", CUDA_TOOLKIT_URL, "-O", CUDA_TOOLKIT_FILE])
    subprocess.call(["chmod", "+x", CUDA_TOOLKIT_FILE])
    subprocess.call([CUDA_TOOLKIT_FILE, "--silent", "--toolkit"])

    os.environ["CUDA_HOME"] = "/usr/local/cuda"
    os.environ["PATH"] = "%s/bin:%s" % (os.environ["CUDA_HOME"], os.environ["PATH"])
    os.environ["LD_LIBRARY_PATH"] = "%s/lib:%s" % (
        os.environ["CUDA_HOME"],
        "" if "LD_LIBRARY_PATH" not in os.environ else os.environ["LD_LIBRARY_PATH"],
    )
    # Fix: arch_list[-1] += '+PTX'; IndexError: list index out of range
    os.environ["TORCH_CUDA_ARCH_LIST"] = "8.0;8.6"

install_cuda_toolkit() #Swiftly Provided by https://huggingface.co/John6666 to fix OSError: CUDA_HOME environment variable is not set. Please set it to your CUDA install root.

device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained("ISTA-DASLab/Meta-Llama-3.1-70B-AQLM-PV-2Bit-1x16")
model = AutoModelForCausalLM.from_pretrained("ISTA-DASLab/Meta-Llama-3.1-70B-AQLM-PV-2Bit-1x16", torch_dtype='auto', device_map='auto').to(device)

@spaces.GPU
def generate_text(prompt):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(inputs.input_ids) #, max_length=100)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

interface = gr.Interface(
    fn=generate_text,
    inputs="text",
    outputs="text",
    title="Meta-Llama-3.1-70B-AQLM-PV-2Bit-1x16 Text Generation",
    description="Enter a prompt and generate text using Meta-Llama-3.1-70B-AQLM-PV-2Bit-1x16. Responses are a little bit different Meta-Llama-3.1-70B",
)

interface.launch()