File size: 4,820 Bytes
d11b63f
3a01517
d11b63f
ec9b1de
 
d11b63f
1748050
d11b63f
739e21a
d11b63f
 
739e21a
d11b63f
1748050
739e21a
ec9b1de
d11b63f
05a9ebf
 
 
739e21a
d11b63f
 
 
 
739e21a
 
 
d11b63f
1748050
3a01517
d11b63f
 
1748050
d11b63f
 
 
 
 
1748050
d11b63f
739e21a
 
d11b63f
 
 
 
 
 
 
 
 
1748050
d11b63f
 
 
 
1748050
d11b63f
1748050
 
 
 
 
 
 
 
 
 
 
 
 
df5f30b
 
1748050
 
 
 
 
 
 
 
 
 
 
d11b63f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
739e21a
d11b63f
 
1748050
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import gradio as gr
import spaces
import torch
from transformers import AutoTokenizer
from sarm_llama import LlamaSARM

# --- 1. Load Model and Tokenizer ---

# No manual CUDA check is needed: `device_map="auto"` places the weights on
# the best available device (GPU when present, otherwise CPU).
MODEL_ID = "schrieffer/SARM-4B"

print(f"Loading model: {MODEL_ID} with device_map='auto'...")

# SARM's custom architecture is provided by the local `sarm_llama` module
# (imported above), so `trust_remote_code` is not needed here.
# The SAE hyperparameters (source layer 16, latent size 65536, k=192) must
# match those used when the checkpoint was trained — presumably fixed by the
# published SARM-4B config; verify against the model card if they change.
model = LlamaSARM.from_pretrained(
    MODEL_ID, 
    sae_hidden_state_source_layer=16, 
    sae_latent_size=65536,
    sae_k=192,
    device_map="auto",  # <<< KEY CHANGE HERE
    torch_dtype=torch.bfloat16
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)

# Query the device from the loaded model rather than hard-coding it.
DEVICE = model.device
print(f"Model loaded successfully on device: {DEVICE}")

# --- 2. Define the Inference Function ---
@spaces.GPU
def get_reward_score(prompt: str, response: str) -> float:
    """
    Compute the SARM reward score for a (prompt, response) pair.

    Args:
        prompt: The user prompt / question.
        response: The assistant response to be evaluated.

    Returns:
        The scalar reward score rounded to 4 decimal places. Returns 0.0
        when either input is empty or when scoring fails (best-effort UI
        behavior; note 0.0 is then ambiguous with a genuine zero score).
    """
    if not prompt or not response:
        return 0.0

    try:
        # Use the same chat template as used during model training.
        messages = [
            {"role": "user", "content": prompt},
            {"role": "assistant", "content": response},
        ]
        # BUG FIX: `apply_chat_template` returns CPU tensors and
        # `device_map="auto"` does NOT move inputs automatically — without
        # this `.to(...)` a GPU-resident model raises a device-mismatch
        # error that the except below silently turned into 0.0.
        input_ids = tokenizer.apply_chat_template(
            messages, return_tensors="pt"
        ).to(model.device)

        # Inference only: no gradients needed.
        with torch.no_grad():
            score = model(input_ids).logits.item()

        return round(score, 4)
    except Exception as e:
        # Log and fall back to a neutral score instead of crashing the
        # Gradio callback.
        print(f"Error: {e}")
        return 0.0

# --- 3. Create and Launch the Gradio Interface ---

# Build the demo UI: a markdown header describing SARM, two text inputs
# (prompt + response), a button wired to `get_reward_score`, and cached
# example pairs.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # SARM: Interpretable Reward Model Demo
        
        This is an interactive demo for the **SARM-4B** model (Sparse Autoencoder-enhanced Reward Model).
        
        SARM is a novel reward model architecture that enhances interpretability by integrating a pretrained Sparse Autoencoder (SAE). It maps the internal hidden states of a large language model into a sparse and human-understandable feature space, making the resulting reward scores transparent and conceptually meaningful.
        
        **How to use this Demo:**
        1.  Enter a **Prompt** (e.g., a question) in the left textbox below.
        2.  Enter a corresponding **Response** in the right textbox.
        3.  Click the "Calculate Reward Score" button.
        
        The model will output a scalar score that evaluates the quality of the response. **A higher score indicates that the SARM model considers the response to be of better quality.**

        ---
        
        **SARM Architecture**
        ![framework](assets/framework-v4.png)

        + **Authors** (* indicates equal contribution)

            Shuyi Zhang\*, Wei Shi\*, Sihang Li\*, Jiayi Liao, Tao Liang, Hengxing Cai, Xiang Wang
        + **Paper**: [Interpretable Reward Model via Sparse Autoencoder](https://arxiv.org/abs/2508.08746)

        + **Model**: [schrieffer/SARM-4B](https://huggingface.co/schrieffer/SARM-4B)

            + Finetuned from model: [Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct)

        + **Code Repository:** [https://github.com/schrieffer-z/sarm](https://github.com/schrieffer-z/sarm)
        """
    )
    
    # Side-by-side inputs: the prompt and the candidate response to score.
    with gr.Row():
        prompt_input = gr.Textbox(lines=3, label="Prompt / Question", placeholder="e.g., Can you explain the theory of relativity in simple terms?")
        response_input = gr.Textbox(lines=5, label="Response to be Evaluated", placeholder="e.g., Of course! Albert Einstein's theory of relativity...")

    calculate_btn = gr.Button("Calculate Reward Score", variant="primary")
    score_output = gr.Number(label="Reward Score", info="A higher score is better.")

    # Wire the button to the scoring function defined above.
    calculate_btn.click(
        fn=get_reward_score,
        inputs=[prompt_input, response_input],
        outputs=score_output
    )
    
    # Paired examples: for each prompt, one good and one poor response, so
    # users can see the score differentiate quality.
    # NOTE(review): cache_examples=True runs the model on every example at
    # startup to pre-compute outputs — this adds to launch time.
    gr.Examples(
        examples=[
            ["What is the capital of France?", "The capital of France is Paris."],
            ["What is the capital of France?", "Berlin is a large city in Germany."],
            ["Write a short poem about the moon.", "Silver orb in velvet night, / Casting shadows, soft and light. / Silent watcher, distant, bright, / Guiding dreams till morning's light."],
            ["Write a short poem about the moon.", "The moon is a rock."]
        ],
        inputs=[prompt_input, response_input],
        outputs=score_output,
        fn=get_reward_score,
        cache_examples=True
    )

# Launch the application.
demo.launch()