File size: 3,656 Bytes
d11b63f
 
ec9b1de
 
d11b63f
 
 
 
 
 
 
 
 
 
 
ec9b1de
d11b63f
05a9ebf
 
 
d11b63f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import gradio as gr
import torch
from transformers import AutoTokenizer
from sarm_llama import LlamaSARM

# --- 1. Load the model and tokenizer ---
# The model files are fetched automatically from the Hugging Face Hub,
# so the model repository has to be publicly accessible.

MODEL_ID = "schrieffer/SARM-4B"

# Prefer the GPU when one is visible to torch; otherwise run on the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

print(f"Loading model: {MODEL_ID} on {DEVICE}...")

# Per the original author's note, remote code must be trusted when loading
# because SARM uses a custom architecture.
model = LlamaSARM.from_pretrained(
    MODEL_ID,
    # Sparse-autoencoder settings passed through to the SARM model.
    sae_hidden_state_source_layer=16,
    sae_latent_size=65536,
    sae_k=192,
    # Placement / loading options.
    torch_dtype=torch.bfloat16,
    device_map=DEVICE,
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)

print("Model loaded successfully!")

# --- 2. Define the inference function ---
# This function is called by Gradio.

def get_reward_score(prompt: str, response: str) -> float:
    """Return the SARM reward score for a prompt/response pair.

    The pair is formatted as a user/assistant conversation with the
    tokenizer's chat template, run through the model without gradient
    tracking, and the resulting single logit is rounded to 4 decimals.

    Returns 0.0 when either argument is empty, and also when scoring
    raises any exception (the error is printed rather than propagated).
    """
    # Guard clause: both fields must be non-empty to score anything.
    if not (prompt and response):
        return 0.0

    conversation = [
        {"role": "user", "content": prompt},
        {"role": "assistant", "content": response},
    ]

    try:
        # Per the original author's note, this is the same chat template
        # the model was trained with.
        token_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt")
        token_ids = token_ids.to(DEVICE)

        with torch.no_grad():
            logits = model(token_ids).logits

        # .item() requires the logits to hold exactly one value.
        return round(logits.item(), 4)
    except Exception as e:
        print(f"Error: {e}")
        # Surfacing the error in the UI might be nicer; for now the
        # failure is simply reported as a score of 0.
        return 0.0

# --- 3. Build and launch the Gradio interface ---

# gr.Blocks() is used because it allows a more flexible layout.
# NOTE(review): the Tech Report link in the Markdown below uses an arXiv
# submission URL (abs/submit/...), which presumably is not publicly
# resolvable - confirm and replace it with the published arXiv link.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # Page header and short usage description.
    gr.Markdown(
        """
        # SARM-4B: Interpretable Reward Model Demo
        This is an interactive demo for the SARM-4B model, an interpretable reward model enhanced by a Sparse Autoencoder.
        Enter a prompt (question) and a corresponding response below to get a reward score. A higher score indicates a better quality response according to the model.
        
        For more details, check out our [Tech Report](https://arxiv.org/abs/submit/6699218) and [Model Card](https://huggingface.co/schrieffer/SARM-4B).
        """
    )

    # The two text inputs sit side by side.
    with gr.Row():
        prompt_input = gr.Textbox(
            label="Prompt / Question",
            lines=3,
            placeholder="e.g., Can you explain the theory of relativity in simple terms?",
        )
        response_input = gr.Textbox(
            label="Response to be Evaluated",
            lines=5,
            placeholder="e.g., Of course! Albert Einstein's theory of relativity...",
        )

    calculate_btn = gr.Button("Calculate Reward Score", variant="primary")
    score_output = gr.Number(label="Reward Score", info="A higher score is better.")

    # Clicking the button scores the current contents of both textboxes.
    calculate_btn.click(
        fn=get_reward_score,
        inputs=[prompt_input, response_input],
        outputs=score_output,
    )

    # Clickable sample pairs, each a [prompt, response] row.
    gr.Examples(
        examples=[
            [
                "What is the capital of France?",
                "The capital of France is Paris.",
            ],
            [
                "What is the capital of France?",
                "Berlin is a large city in Germany.",
            ],
            [
                "Write a short poem about the moon.",
                "Silver orb in velvet night, / Casting shadows, soft and light. / Silent watcher, distant, bright, / Guiding dreams till morning's light.",
            ],
            [
                "Write a short poem about the moon.",
                "The moon is a rock.",
            ],
        ],
        fn=get_reward_score,
        inputs=[prompt_input, response_input],
        outputs=score_output,
        # Cache the example results so they load faster.
        cache_examples=True,
    )

# Start the app.
demo.launch()