Update README.md (#1)
- Update README.md (e8ae0079dad6f6a13a84e177ca9c9f3d1eb2df92)

README.md CHANGED

---
license: apache-2.0

---

# CompassJudger-2

<div align="left" style="line-height: 1;">
  <a href="https://github.com/open-compass/CompassJudger" target="_blank" style="margin: 2px;">
    <img alt="Homepage" src="https://img.shields.io/badge/CompassJudger-GitHub-blue?color=1991ff&logo=github&logoColor=white" style="display: inline-block; vertical-align: middle;"/>
  </a>
  <a href="https://arxiv.org/abs/2507.09104" target="_blank" style="margin: 2px;">
    <img
      src="https://img.shields.io/badge/CompassJudger--2-Paper-red?logo=arxiv&logoColor=red"
      alt="CompassJudger-2"
      style="display: inline-block; vertical-align: middle;"
    />
  </a>
  <a href="https://huggingface.co/opencompass" target="_blank" style="margin: 2px;">
    <img alt="Hugging Face" src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-OpenCompass-536af5?color=536af5&logoColor=white" style="display: inline-block; vertical-align: middle;"/>
  </a>
  <a href="https://github.com/open-compass/CompassJudger/blob/main/LICENSE" style="margin: 2px;">
    <img alt="License" src="https://img.shields.io/badge/License-Apache%202.0-f5de53?color=f5de53&logoColor=white" style="display: inline-block; vertical-align: middle;"/>
  </a>
</div>

## Introduction

We introduce **CompassJudger-2**, a novel series of generalist judge models designed to overcome the narrow specialization and limited robustness of existing LLM-as-judge solutions. Current judge models often struggle with comprehensive evaluation, but CompassJudger-2 addresses these limitations with a powerful new training paradigm.

Key contributions of our work include:

- **Advanced Data Strategy:** We employ a task-driven, multi-domain data curation and synthesis strategy to enhance the model's robustness and domain adaptability.
- **Verifiable Reward-Guided Training:** We supervise judgment tasks with verifiable rewards, guiding the model's intrinsic reasoning through chain-of-thought (CoT) and rejection sampling. A refined margin policy gradient loss further enhances performance.
- **Superior Performance:** CompassJudger-2 achieves state-of-the-art results across multiple judge and reward benchmarks. Our 7B model demonstrates competitive accuracy with models that are significantly larger.
- **JudgerBenchV2:** We introduce a new, comprehensive benchmark with 10,000 questions across 10 scenarios, using a Mixture-of-Judgers (MoJ) consensus for more reliable ground truth.

This repository contains the **CompassJudger-2** series of models, fine-tuned from the Qwen2.5-Instruct series.

## Models

| Model Name                         | Size | Base Model           |                           Download                           | Notes                                         |
| :--------------------------------- | :--: | :------------------- | :----------------------------------------------------------: | :-------------------------------------------- |
| 👉 **CompassJudger-2-7B-Instruct**  |  7B  | Qwen2.5-7B-Instruct  | 🤗 [Model](https://huggingface.co/opencompass/CompassJudger-2-7B-Instruct) | Fine-tuned for generalist judge capabilities. |
| 👉 **CompassJudger-2-32B-Instruct** | 32B  | Qwen2.5-32B-Instruct | 🤗 [Model](https://huggingface.co/opencompass/CompassJudger-2-32B-Instruct) | A larger, more powerful judge model.          |

## Quickstart

Here is a simple example demonstrating how to load the model and use it for pairwise evaluation.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "opencompass/CompassJudger-2-7B-Instruct"

# Load the judge model and its tokenizer
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Example: Pair-wise Comparison
prompt = """
Please act as an impartial judge to evaluate the responses provided by two AI assistants to the user question below. Your evaluation should focus on the following criteria: helpfulness, relevance, accuracy, depth, creativity, and level of detail.

- Do not let the order of presentation, response length, or assistant names influence your judgment.
- Base your decision solely on how well each response addresses the user’s question and adheres to the instructions.

Your final reply must be structured in the following format:
{
  "Choice": "[Model A or Model B]"
}

User Question: {question}

Model A's Response: {answerA}

Model B's Response: {answerB}

Now it's your turn. Please provide selection result as required:
"""

messages = [
    {"role": "user", "content": prompt}
]

# Apply the chat template and tokenize
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

# Generate the judgment and strip the prompt tokens from the output
generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=2048
)
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]

response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(response)
```
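
The `{question}`, `{answerA}`, and `{answerB}` placeholders above are left unfilled for brevity. Below is a minimal sketch of one way to fill them and extract the `"Choice"` field from the model's reply; the `fill_prompt` and `parse_choice` helpers are illustrative, not part of the model's API, and the JSON-extraction step assumes the model follows the requested output format.

```python
import json
import re

def fill_prompt(template: str, question: str, answer_a: str, answer_b: str) -> str:
    # str.replace is used instead of str.format because the template also
    # contains literal braces in its JSON output example.
    return (template
            .replace("{question}", question)
            .replace("{answerA}", answer_a)
            .replace("{answerB}", answer_b))

def parse_choice(response: str):
    # Read the "Choice" field from the last {...} block in the reply, if any.
    for candidate in reversed(re.findall(r"\{[^{}]*\}", response)):
        try:
            return json.loads(candidate).get("Choice")
        except json.JSONDecodeError:
            continue
    return None

# Hypothetical usage: build the filled prompt, run the generation code above
# with `filled` in place of `prompt`, then parse the returned `response`.
filled = fill_prompt(prompt, "What is 2 + 2?", "4", "22")
# choice = parse_choice(response)
```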

## Evaluation

CompassJudger-2 sets a new state-of-the-art for judge models, outperforming general models, reward models, and other specialized judge models across a wide range of benchmarks.

| Model                              | JudgerBench V2 | JudgeBench |    RMB    | RewardBench |  Average  |
| :--------------------------------- | :------------: | :--------: | :-------: | :---------: | :-------: |
| **7B Judge Models**                |                |            |           |             |           |
| CompassJudger-1-7B-Instruct        |     57.96      |   46.00    |   38.18   |    80.74    |   55.72   |
| Con-J-7B-Instruct                  |     52.35      |   38.06    |   71.50   |    87.10    |   62.25   |
| RISE-Judge-Qwen2.5-7B              |     46.12      |   40.48    |   72.64   |    88.20    |   61.61   |
| **CompassJudger-2-7B-Instruct**    |   **60.52**    | **63.06**  | **73.90** |  **90.96**  | **72.11** |
| **32B+ Judge Models**              |                |            |           |             |           |
| CompassJudger-1-32B-Instruct       |     60.33      |   62.29    |   77.63   |    86.17    |   71.61   |
| Skywork-Critic-Llama-3.1-70B       |     52.41      |   50.65    |   65.50   |    93.30    |   65.47   |
| RISE-Judge-Qwen2.5-32B             |     56.42      |   63.87    |   73.70   |    92.70    |   71.67   |
| **CompassJudger-2-32B-Instruct**   |   **62.21**    | **65.48**  |   72.98   |  **92.62**  | **73.32** |
| **General Models (for reference)** |                |            |           |             |           |
| Qwen2.5-32B-Instruct               |     62.97      |   59.84    |   74.99   |    85.61    |   70.85   |
| DeepSeek-V3-0324                   |     64.43      |   59.68    |   78.16   |    85.17    |   71.86   |
| Qwen3-235B-A22B                    |     61.40      |   65.97    |   75.59   |    84.68    |   71.91   |

For detailed benchmark performance and methodology, please refer to our [📑 Paper](https://arxiv.org/abs/2507.09104).

## License

This project is licensed under the Apache 2.0 License. See the [LICENSE](https://github.com/open-compass/CompassJudger/blob/main/LICENSE) file for details.

## Citation

If you find our work helpful, please consider citing our paper:

```bibtex
@article{zhang2025compassjudger,
  title={CompassJudger-2: Towards Generalist Judge Model via Verifiable Rewards},
  author={Zhang, Taolin and Cao, Maosong and Lam, Alexander and Zhang, Songyang and Chen, Kai},
  journal={arXiv preprint arXiv:2507.09104},
  year={2025}
}
```