gentleman12138 committed on
Commit
c6ad631
·
1 Parent(s): 4b98cd9

Add large JSON files using Git LFS

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ data/*.json filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import gradio as gr
4
+
5
# Parsed records for each difficulty level, keyed by the label shown in the UI.
datasets = {}
difficulty_files = {
    "Easy": "data/scalar_easy.json",
    "Medium": "data/scalar_medium.json",
    "Hard": "data/scalar_hard.json",
}

# Load every dataset up front. The encoding is pinned to UTF-8 instead of
# being left to the platform default, so non-ASCII text in the JSON files
# decodes the same way on every host.
for difficulty, filepath in difficulty_files.items():
    with open(filepath, encoding="utf-8") as f:
        datasets[difficulty] = json.load(f)

# Viewer state shared by the callbacks defined later in this file.
current_index = 0
current_difficulty = "Easy"  # default difficulty
current_data = datasets[current_difficulty]
22
+
23
+ # Update display function with difficulty parameter
24
def update_display(index, difficulty=None):
    """Return the widget values for the item at *index*.

    Args:
        index: Position of the item to show in the active dataset. It is
            coerced to ``int`` and clamped to the valid range, so a float
            or stale out-of-range value cannot raise or be echoed back.
        difficulty: Optional key of ``datasets``. When it differs from the
            active difficulty, the active dataset is switched and the
            index restarts at 0.

    Returns:
        A 5-tuple ``(question_markdown, citation_json, reference_id,
        label_id, index)``, where ``index`` is the position actually shown.

    Raises:
        KeyError: If *difficulty* is not a key of ``datasets``.
        IndexError: If the active dataset is empty.
    """
    global current_index, current_difficulty, current_data

    # Switching difficulty swaps the active dataset and restarts at item 0.
    if difficulty is not None and difficulty != current_difficulty:
        current_difficulty = difficulty
        current_data = datasets[difficulty]
        index = 0

    # Normalise the requested position before it is used or stored.
    index = max(0, min(int(index), len(current_data) - 1))
    current_index = index
    item = current_data[current_index]

    # Escape HTML-significant characters so the question shows literally.
    # "&" must go first, otherwise the entities produced for "<" and ">"
    # would be escaped a second time.
    question = (
        item["question"]
        .replace("&", "&amp;")
        .replace("<", "&lt;")
        .replace(">", "&gt;")
    )

    # Highlight the masked-citation placeholder. This runs after escaping so
    # the injected <span> markup is kept intact.
    colored_question = question.replace(
        "[MASKED_CITATION]",
        '<span style="background-color: #ffebeb; padding: 2px 6px; border-radius: 4px; color: blue;">[MASKED_CITATION]</span>'
    )

    # Pretty-print the citation dictionary, keeping non-ASCII characters readable.
    formatted_citation = json.dumps(item["citation"], indent=4, ensure_ascii=False)
    return colored_question, formatted_citation, item["reference_id"], item["label_id"], index
49
+
50
def prev_item(difficulty):
    """Step back one item in the dataset for *difficulty*, wrapping past the start.

    Returns whatever ``update_display`` returns for the new position.
    """
    global current_index, current_data
    records = datasets[difficulty]
    current_data = records
    # Modulo arithmetic turns a step back from index 0 into the last item.
    target = (current_index - 1) % len(records)
    current_index = target
    return update_display(target, difficulty)
55
+
56
def next_item(difficulty):
    """Step forward one item in the dataset for *difficulty*, wrapping past the end.

    Returns whatever ``update_display`` returns for the new position.
    """
    global current_index, current_data
    records = datasets[difficulty]
    current_data = records
    # Modulo arithmetic turns a step forward from the last item into index 0.
    target = (current_index + 1) % len(records)
    current_index = target
    return update_display(target, difficulty)
61
+
62
+ # Create Gradio interface
63
# NOTE(review): the nesting below, and the indentation inside the Markdown
# string, are reconstructed from a flattened diff view -- confirm the layout
# against the original file.
with gr.Blocks() as demo:
    gr.Markdown("""
    ## Data viewer for [SCALAR](https://arxiv.org/abs/2502.13753/)
    * SCALAR (Scientific Citation-based Live Assessment of Long-context Academic Reasoning) is a novel benchmark that leverages academic papers and their citation networks, featuring automatic generation of high-quality ground truth labels without human annotation, controllable difficulty levels, and a dynamic updating mechanism that prevents data contamination.
    * The benchmark provides a reliable and sustainable way to track progress in long-context understanding as LLM capabilities evolve.
    """)
    with gr.Column():
        with gr.Row():
            # Left column: difficulty selector, index slider and navigation buttons.
            with gr.Column():
                difficulty_dropdown = gr.Dropdown(
                    choices=list(difficulty_files.keys()),
                    value="Easy",
                    label="Difficulty Level"
                )

                index_slider = gr.Slider(
                    minimum=0,
                    maximum=len(datasets["Easy"]) - 1,
                    step=1,
                    label="Select Index",
                    value=0,
                )
                with gr.Row():
                    prev_btn = gr.Button("Previous")
                    next_btn = gr.Button("Next")

            # Right column: citation details of the current item.
            with gr.Column():
                citation_text = gr.TextArea(label="Citation", interactive=False)
                reference_id = gr.Textbox(label="Reference ID", interactive=False)
                label_id = gr.Textbox(label="Label ID", interactive=False)

        with gr.Row():
            question_md = gr.Markdown(label="Question")

    # Every display refresh reads and writes the same widgets, so the
    # component lists are built once and reused by each event binding.
    display_inputs = [index_slider, difficulty_dropdown]
    display_outputs = [question_md, citation_text, reference_id, label_id, index_slider]

    def _reset_slider(level):
        # Resize the slider to the chosen dataset and move it back to the start.
        return gr.Slider(minimum=0, maximum=len(datasets[level]) - 1, step=1, value=0)

    # Bind events
    difficulty_dropdown.change(
        fn=_reset_slider,
        inputs=[difficulty_dropdown],
        outputs=[index_slider],
    )
    index_slider.change(update_display, inputs=display_inputs, outputs=display_outputs)
    prev_btn.click(prev_item, inputs=[difficulty_dropdown], outputs=display_outputs)
    next_btn.click(next_item, inputs=[difficulty_dropdown], outputs=display_outputs)

    # Populate the widgets once when the page first loads.
    demo.load(update_display, inputs=display_inputs, outputs=display_outputs)

# Launch interface
demo.launch()
data/scalar_easy.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bf1dc202fcc96d058347610c2026171029d0d373a8955d0b30630159ce1aa156
3
+ size 28401358
data/scalar_hard.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4557d57e59b02f1264d9f72e321374d342e4a463f45e727c12b8a1b3c26504cb
3
+ size 26949206
data/scalar_medium.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:273447c78034e05b2632710d9d61891e8bc277b234f1483946e0bb805cbfcc92
3
+ size 31101610