import json

import gradio as gr
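
# Load one JSON dataset per difficulty level. Each file is assumed (inferred
# from the field accesses below) to hold a list of items shaped like:
#   {"question": "... [MASKED_CITATION] ...", "citation": {...},
#    "reference_id": "...", "label_id": "..."}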
datasets = {}
difficulty_files = {
    "Easy": "data/scalar_easy.json",
    "Medium": "data/scalar_medium.json",
    "Hard": "data/scalar_hard.json",
}

for difficulty, filepath in difficulty_files.items():
    with open(filepath, encoding="utf-8") as f:
        datasets[difficulty] = json.load(f)
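
# Viewer state: the active difficulty, its dataset, and the selected item index.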
current_index = 0
current_difficulty = "Easy"
current_data = datasets[current_difficulty]
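

# Show the item at `index`; if the difficulty changed, switch datasets first
# and reset to the first item.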
def update_display(index, difficulty=None):
    global current_index, current_difficulty, current_data

    if difficulty is not None and difficulty != current_difficulty:
        current_difficulty = difficulty
        current_data = datasets[difficulty]
        current_index = 0
        index = 0

    current_index = index
    item = current_data[current_index]
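
    # Escape raw angle brackets so gr.Markdown does not render them as HTML tags.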
    question = item["question"].replace("<", "&lt;").replace(">", "&gt;")
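
    # Highlight the masked-citation placeholder inside the question text.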
    colored_question = question.replace(
        "[MASKED_CITATION]",
        '<span style="background-color: #ffebeb; padding: 2px 6px; '
        'border-radius: 4px; color: blue;">[MASKED_CITATION]</span>'
    )

    formatted_citation = json.dumps(item["citation"], indent=4, ensure_ascii=False)
    return colored_question, formatted_citation, item["reference_id"], item["label_id"], index
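

# Previous/Next wrap around the current dataset using modular arithmetic.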
def prev_item(difficulty):
    global current_index, current_data
    current_data = datasets[difficulty]
    current_index = (current_index - 1) % len(current_data)
    return update_display(current_index, difficulty)


def next_item(difficulty):
    global current_index, current_data
    current_data = datasets[difficulty]
    current_index = (current_index + 1) % len(current_data)
    return update_display(current_index, difficulty)
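

# Build the UI: selection controls on the left, citation details on the right,
# and the question rendered below.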
with gr.Blocks() as demo:
    gr.Markdown("""
    ## Data viewer for [SCALAR](https://arxiv.org/abs/2502.13753/)
    * SCALAR (Scientific Citation-based Live Assessment of Long-context Academic Reasoning) is a benchmark built on academic papers and their citation networks. It generates high-quality ground-truth labels automatically, without human annotation, offers controllable difficulty levels, and updates dynamically to prevent data contamination.
    * The benchmark provides a reliable and sustainable way to track progress in long-context understanding as LLM capabilities evolve.
    """)
    with gr.Column():
        with gr.Row():
            with gr.Column():
                difficulty_dropdown = gr.Dropdown(
                    choices=list(difficulty_files.keys()),
                    value="Easy",
                    label="Difficulty Level"
                )
                index_slider = gr.Slider(
                    minimum=0,
                    maximum=len(datasets["Easy"]) - 1,
                    step=1,
                    label="Select Index",
                    value=0
                )
                with gr.Row():
                    prev_btn = gr.Button("Previous")
                    next_btn = gr.Button("Next")

            with gr.Column():
                citation_text = gr.TextArea(label="Citation", interactive=False)
                reference_id = gr.Textbox(label="Reference ID", interactive=False)
                label_id = gr.Textbox(label="Label ID", interactive=False)

        with gr.Row():
            question_md = gr.Markdown(label="Question")
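
    # Changing difficulty rebuilds the slider for the new dataset size and
    # resets it to 0; the slider's own change event then re-renders the item.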
    difficulty_dropdown.change(
        fn=lambda d: gr.Slider(minimum=0, maximum=len(datasets[d]) - 1, step=1, value=0),
        inputs=[difficulty_dropdown],
        outputs=[index_slider]
    )

    index_slider.change(
        update_display,
        inputs=[index_slider, difficulty_dropdown],
        outputs=[question_md, citation_text, reference_id, label_id, index_slider]
    )

    prev_btn.click(
        prev_item,
        inputs=[difficulty_dropdown],
        outputs=[question_md, citation_text, reference_id, label_id, index_slider]
    )

    next_btn.click(
        next_item,
        inputs=[difficulty_dropdown],
        outputs=[question_md, citation_text, reference_id, label_id, index_slider]
    )
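
    # Render the first item once when the page loads.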
    demo.load(
        update_display,
        inputs=[index_slider, difficulty_dropdown],
        outputs=[question_md, citation_text, reference_id, label_id, index_slider]
    )

demo.launch()