frimelle (HF Staff) committed
Commit cea76ed · 1 Parent(s): 4dc7229

experiment with app.py

Files changed (1)
  1. app.py +236 -24
app.py CHANGED
@@ -1,35 +1,247 @@
 import gradio as gr
-from gradio_leaderboard import Leaderboard, SelectColumns, ColumnFilter
-from pathlib import Path
 import pandas as pd

-abs_path = Path(__file__).parent
-df = pd.read_json(abs_path / "leaderboard_data.json")

-with gr.Blocks() as demo:
-    gr.Markdown("# 🏆 Model Leaderboard")

-    Leaderboard(
-        value=df,
-        # choose what shows by default (must match your column names)
         select_columns=SelectColumns(
-            default_selection=["Model", "Average ⬆️", "ARC", "HellaSwag", "MMLU", "TruthfulQA", "Winogrande", "GSM8K"],
-            cant_deselect=["Model"],
-            label="Select columns to display:",
         ),
-        # enable search across useful fields (primary first)
-        search_columns=["Model", "Type"],
-        # hide helper columns that are only for search/filtering
-        hide_columns=["model_name_for_query"],
-        # add quick filters; auto-picks widget types unless you specify
         filter_columns=[
-            "Type",
-            ColumnFilter("Precision", type="dropdown"),
-            ColumnFilter("#Params (B)", type="slider", min=0, max=300),
         ],
-        # optional: tell the component which columns are numbers/markdown/etc.
-        datatype=["str","number","number","number","number","number","number","number"]
     )

-if __name__ == "__main__":
-    demo.launch()

 import gradio as gr
+from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
 import pandas as pd
+from apscheduler.schedulers.background import BackgroundScheduler
+from huggingface_hub import snapshot_download
+from pathlib import Path  # ⬅️ for local JSON

+from src.about import (
+    CITATION_BUTTON_LABEL,
+    CITATION_BUTTON_TEXT,
+    EVALUATION_QUEUE_TEXT,
+    INTRODUCTION_TEXT,
+    LLM_BENCHMARKS_TEXT,
+    TITLE,
+)
+from src.display.css_html_js import custom_css
+from src.display.utils import (
+    BENCHMARK_COLS,
+    COLS,
+    EVAL_COLS,
+    EVAL_TYPES,
+    AutoEvalColumn,
+    ModelType,
+    fields,
+    WeightType,
+    Precision
+)
+from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
+from src.populate import get_evaluation_queue_df, get_leaderboard_df
+from src.submission.submit import add_new_eval


+def restart_space():
+    API.restart_space(repo_id=REPO_ID)
+
+### Space initialisation (pull queue/results datasets like before)
+try:
+    print(EVAL_REQUESTS_PATH)
+    snapshot_download(
+        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+    )
+except Exception:
+    restart_space()
+try:
+    print(EVAL_RESULTS_PATH)
+    snapshot_download(
+        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+    )
+except Exception:
+    restart_space()
+
+# Original leaderboard (unchanged)
+LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
+
+(
+    finished_eval_queue_df,
+    running_eval_queue_df,
+    pending_eval_queue_df,
+) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
+
+def init_leaderboard(dataframe):
+    if dataframe is None or dataframe.empty:
+        raise ValueError("Leaderboard DataFrame is empty or None.")
+    return Leaderboard(
+        value=dataframe,
+        datatype=[c.type for c in fields(AutoEvalColumn)],
         select_columns=SelectColumns(
+            default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
+            cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
+            label="Select Columns to Display:",
         ),
+        search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
+        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
         filter_columns=[
+            ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
+            ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
+            ColumnFilter(
+                AutoEvalColumn.params.name,
+                type="slider",
+                min=0.01,
+                max=150,
+                label="Select the number of parameters (B)",
+            ),
+            ColumnFilter(
+                AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
+            ),
         ],
+        bool_checkboxgroup_label="Hide models",
+        interactive=False,
+    )
+
+# -----------------------------
+# NEW: Load your local JSON and a simple leaderboard (no AutoEvalColumn coupling)
+# -----------------------------
+USER_JSON = Path(__file__).parent / "leaderboard_data.json"
+try:
+    USER_DF = pd.read_json(USER_JSON)
+except Exception as e:
+    # Create an empty DF with the expected columns if file missing, so Space still builds.
+    USER_DF = pd.DataFrame(columns=["Model", "Average", "Assistant Traits", "Relationship & Intimacy", "Emotional Investment", "User Vulnerabilities"])
+
+# Ensure types (Model=str, others=float) and clean column order
+if "Model" in USER_DF.columns:
+    USER_DF["Model"] = USER_DF["Model"].astype(str)
+for col in USER_DF.columns:
+    if col != "Model":
+        USER_DF[col] = pd.to_numeric(USER_DF[col], errors="coerce")
+
+def init_simple_leaderboard(df: pd.DataFrame):
+    # Show Model + up to first 6 metric columns by default
+    metrics = [c for c in df.columns if c != "Model"]
+    default_cols = ["Model"] + metrics[:6] if "Model" in df.columns else list(df.columns)[:7]
+    cant_hide = ["Model"] if "Model" in df.columns else []
+
+    return Leaderboard(
+        value=df,
+        select_columns=SelectColumns(
+            default_selection=default_cols if default_cols else list(df.columns),
+            cant_deselect=cant_hide,
+            label="Select Columns to Display:",
+        ),
+        search_columns=["Model"] if "Model" in df.columns else [],
+        hide_columns=[],   # keep everything visible
+        filter_columns=[], # no filters for now
+        interactive=False,
     )

+# -----------------------------
+# UI
+# -----------------------------
+demo = gr.Blocks(css=custom_css)
+with demo:
+    gr.HTML(TITLE)
+    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
+
+    with gr.Tabs(elem_classes="tab-buttons") as tabs:
+        with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
+            leaderboard = init_leaderboard(LEADERBOARD_DF)
+
+        # NEW TAB: renders your leaderboard_data.json
+        with gr.TabItem("📊 INTIMA Leaderboard", elem_id="intima-leaderboard-tab", id=1):
+            _ = init_simple_leaderboard(USER_DF)
+
+        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
+            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
+
+        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
+            with gr.Column():
+                with gr.Row():
+                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
+
+                with gr.Column():
+                    with gr.Accordion(
+                        f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
+                        open=False,
+                    ):
+                        with gr.Row():
+                            finished_eval_table = gr.components.Dataframe(
+                                value=finished_eval_queue_df,
+                                headers=EVAL_COLS,
+                                datatype=EVAL_TYPES,
+                                row_count=5,
+                            )
+                    with gr.Accordion(
+                        f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
+                        open=False,
+                    ):
+                        with gr.Row():
+                            running_eval_table = gr.components.Dataframe(
+                                value=running_eval_queue_df,
+                                headers=EVAL_COLS,
+                                datatype=EVAL_TYPES,
+                                row_count=5,
+                            )
+
+                    with gr.Accordion(
+                        f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
+                        open=False,
+                    ):
+                        with gr.Row():
+                            pending_eval_table = gr.components.Dataframe(
+                                value=pending_eval_queue_df,
+                                headers=EVAL_COLS,
+                                datatype=EVAL_TYPES,
+                                row_count=5,
+                            )
+            with gr.Row():
+                gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
+
+            with gr.Row():
+                with gr.Column():
+                    model_name_textbox = gr.Textbox(label="Model name")
+                    revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
+                    model_type = gr.Dropdown(
+                        choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
+                        label="Model type",
+                        multiselect=False,
+                        value=None,
+                        interactive=True,
+                    )
+
+                with gr.Column():
+                    precision = gr.Dropdown(
+                        choices=[i.value.name for i in Precision if i != Precision.Unknown],
+                        label="Precision",
+                        multiselect=False,
+                        value="float16",
+                        interactive=True,
+                    )
+                    weight_type = gr.Dropdown(
+                        choices=[i.value.name for i in WeightType],
+                        label="Weights type",
+                        multiselect=False,
+                        value="Original",
+                        interactive=True,
+                    )
+                    base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
+
+            submit_button = gr.Button("Submit Eval")
+            submission_result = gr.Markdown()
+            submit_button.click(
+                add_new_eval,
+                [
+                    model_name_textbox,
+                    base_model_name_textbox,
+                    revision_name_textbox,
+                    precision,
+                    weight_type,
+                    model_type,
+                ],
+                submission_result,
+            )
+
+    with gr.Row():
+        with gr.Accordion("📙 Citation", open=False):
+            citation_button = gr.Textbox(
+                value=CITATION_BUTTON_TEXT,
+                label=CITATION_BUTTON_LABEL,
+                lines=20,
+                elem_id="citation-button",
+                show_copy_button=True,
+            )
+
+scheduler = BackgroundScheduler()
+scheduler.add_job(restart_space, "interval", seconds=1800)
+scheduler.start()
+demo.queue(default_concurrency_limit=40).launch()
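
The new "INTIMA Leaderboard" tab loads leaderboard_data.json from the Space root with pd.read_json, so the file is expected to parse into a DataFrame with a string "Model" column plus numeric metric columns (the fallback frame in the diff names Average, Assistant Traits, Relationship & Intimacy, Emotional Investment, and User Vulnerabilities). Below is a minimal sketch of a compatible file; the model names and scores are placeholders, not results from this repository.

# Hypothetical example only: writes a leaderboard_data.json in the layout the
# INTIMA tab's pd.read_json call can parse (a JSON array of records -> one row per model).
# All model names and scores below are placeholders.
import json
from pathlib import Path

rows = [
    {
        "Model": "example-org/model-a",
        "Average": 0.71,
        "Assistant Traits": 0.68,
        "Relationship & Intimacy": 0.74,
        "Emotional Investment": 0.70,
        "User Vulnerabilities": 0.72,
    },
    {
        "Model": "example-org/model-b",
        "Average": 0.65,
        "Assistant Traits": 0.61,
        "Relationship & Intimacy": 0.69,
        "Emotional Investment": 0.64,
        "User Vulnerabilities": 0.66,
    },
]

Path("leaderboard_data.json").write_text(json.dumps(rows, indent=2))

With a file like this next to app.py, USER_DF keeps "Model" as strings and coerces every other column to numeric, so the tab renders without touching the AutoEvalColumn machinery.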