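# Gradio demo for the Phearion/bigbrain-v0.0.1 PEFT adapter: loads the base
# model in 4-bit, applies the LoRA weights, and serves a simple text interface.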
import gradio as gr
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

# Device configuration (prioritize GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_id = "Phearion/bigbrain-v0.0.1"

# 4-bit quantization config (NF4 with double quantization, fp16 compute)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# Load the PEFT config, tokenizer, and 4-bit quantized base model
config = PeftConfig.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    quantization_config=bnb_config,
    device_map="auto",  # dispatch the quantized weights to the available device
)

# Apply the LoRA adapter to the base model
model = PeftModel.from_pretrained(model, model_id)

def greet(text):
    with torch.no_grad():  # disable gradient tracking for inference
        batch = tokenizer(f'### Human: {text}\n### Assistant:', return_tensors='pt').to(device)  # move input tensors to the model's device
        with torch.cuda.amp.autocast(enabled=device.type == "cuda"):  # mixed precision only when a GPU is present
            output_tokens = model.generate(**batch, max_new_tokens=25)
    return tokenizer.decode(output_tokens[0], skip_special_tokens=True)

iface = gr.Interface(fn=greet, inputs="text", outputs="text", title="PEFT Model for Big Brain")
iface.launch()  # Launch the app (served directly on the Gradio Space)