Commit a0327b4 · Parent: 79f9e39 · Message: update

Files changed:
- app.py (+119, -111)
- src/about.py (+8, -5)
- src/display/utils.py (+10, -10)
- src/leaderboard/read_evals.py (+34, -34)
app.py CHANGED

@@ -9,7 +9,7 @@ from src.about import (
     CITATION_BUTTON_TEXT,
     EVALUATION_QUEUE_TEXT,
     INTRODUCTION_TEXT,
-    LLM_BENCHMARKS_TEXT,
+    DETECTOR_BENCHMARKS_TEXT,
     TITLE,
 )
 from src.display.css_html_js import custom_css

@@ -63,28 +63,28 @@ def init_leaderboard(dataframe):
     return Leaderboard(
         value=dataframe,
         datatype=[c.type for c in fields(AutoEvalColumn)],
-        select_columns=SelectColumns(
-            …
-        ),
-        search_columns=[AutoEvalColumn.model.name…
-        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
-        filter_columns=[
-            …
-        ],
-        bool_checkboxgroup_label="Hide models",
+        # select_columns=SelectColumns(
+        #     default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
+        #     cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
+        #     label="Select Columns to Display:",
+        # ),
+        search_columns=[AutoEvalColumn.model.name],
+        # hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
+        # filter_columns=[
+        #     ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
+        #     ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
+        #     ColumnFilter(
+        #         AutoEvalColumn.params.name,
+        #         type="slider",
+        #         min=0.01,
+        #         max=150,
+        #         label="Select the number of parameters (B)",
+        #     ),
+        #     ColumnFilter(
+        #         AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
+        #     ),
+        # ],
+        # bool_checkboxgroup_label="Hide models",
         interactive=False,
     )

@@ -95,98 +95,106 @@ with demo:
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

     with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("🏅 …
+        with gr.TabItem("🏅 Detector Leaderboard", elem_id="detector-benchmark-tab-table", id=0):
             leaderboard = init_leaderboard(LEADERBOARD_DF)

-        with gr.TabItem("…
-            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
-
-        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
-            with gr.Column():
-                with gr.Row():
-                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
-
-                with gr.Column():
-                    with gr.Accordion(
-                        f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            finished_eval_table = gr.components.Dataframe(
-                                value=finished_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-                    with gr.Accordion(
-                        f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            running_eval_table = gr.components.Dataframe(
-                                value=running_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-
-                    with gr.Accordion(
-                        f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            pending_eval_table = gr.components.Dataframe(
-                                value=pending_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-            with gr.Row():
-                gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
-
+        with gr.TabItem("🔍 Detector Playground ", elem_id="detector-playground-tab-table", id=1):
             with gr.Row():
                 with gr.Column():
-                    … (submission-form widgets and the submit_button.click(add_new_eval, …) wiring, not shown in the page; the same lines are re-added below, commented out)
+                    # print(LEADERBOARD_DF.keys())
+                    gr.Dropdown(LEADERBOARD_DF['Model'].tolist())
+                    gr.Image()
+                    gr.Button("Submit")
+
+        with gr.TabItem("📝 About", elem_id="detector-benchmark-tab-table", id=2):
+            gr.Markdown(DETECTOR_BENCHMARKS_TEXT, elem_classes="markdown-text")
+
+        # with gr.TabItem("🚀 Submit here! ", elem_id="detector-benchmark-tab-table", id=3):
+        #     with gr.Column():
+        #         with gr.Row():
+        #             gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
+
+        #         with gr.Column():
+        #             with gr.Accordion(
+        #                 f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
+        #                 open=False,
+        #             ):
+        #                 with gr.Row():
+        #                     finished_eval_table = gr.components.Dataframe(
+        #                         value=finished_eval_queue_df,
+        #                         headers=EVAL_COLS,
+        #                         datatype=EVAL_TYPES,
+        #                         row_count=5,
+        #                     )
+        #             with gr.Accordion(
+        #                 f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
+        #                 open=False,
+        #             ):
+        #                 with gr.Row():
+        #                     running_eval_table = gr.components.Dataframe(
+        #                         value=running_eval_queue_df,
+        #                         headers=EVAL_COLS,
+        #                         datatype=EVAL_TYPES,
+        #                         row_count=5,
+        #                     )
+
+        #             with gr.Accordion(
+        #                 f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
+        #                 open=False,
+        #             ):
+        #                 with gr.Row():
+        #                     pending_eval_table = gr.components.Dataframe(
+        #                         value=pending_eval_queue_df,
+        #                         headers=EVAL_COLS,
+        #                         datatype=EVAL_TYPES,
+        #                         row_count=5,
+        #                     )
+        #     with gr.Row():
+        #         gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
+
+        #     with gr.Row():
+        #         with gr.Column():
+        #             model_name_textbox = gr.Textbox(label="Model name")
+        #             revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
+        #             model_type = gr.Dropdown(
+        #                 choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
+        #                 label="Model type",
+        #                 multiselect=False,
+        #                 value=None,
+        #                 interactive=True,
+        #             )
+
+        #         with gr.Column():
+        #             precision = gr.Dropdown(
+        #                 choices=[i.value.name for i in Precision if i != Precision.Unknown],
+        #                 label="Precision",
+        #                 multiselect=False,
+        #                 value="float16",
+        #                 interactive=True,
+        #             )
+        #             weight_type = gr.Dropdown(
+        #                 choices=[i.value.name for i in WeightType],
+        #                 label="Weights type",
+        #                 multiselect=False,
+        #                 value="Original",
+        #                 interactive=True,
+        #             )
+        #             base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
+
+        #     submit_button = gr.Button("Submit Eval")
+        #     submission_result = gr.Markdown()
+        #     submit_button.click(
+        #         add_new_eval,
+        #         [
+        #             model_name_textbox,
+        #             base_model_name_textbox,
+        #             revision_name_textbox,
+        #             precision,
+        #             weight_type,
+        #             model_type,
+        #         ],
+        #         submission_result,
+        #     )

     with gr.Row():
         with gr.Accordion("📙 Citation", open=False):
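Note: the new "🔍 Detector Playground" tab only lays out a gr.Dropdown, a gr.Image, and a gr.Button; no click handler is attached in this commit. A minimal sketch of how those components could be wired once an inference backend exists (the run_detector helper, the detector names, and the returned scores are placeholders, not part of the repository):

import gradio as gr

def run_detector(model_name, image):
    # Hypothetical scoring hook: look up the selected detector and classify the image.
    # A real implementation would load the detector and return its probabilities.
    return {"AI-generated": 0.5, "Real": 0.5}

with gr.Blocks() as playground:
    with gr.Row():
        with gr.Column():
            model_choice = gr.Dropdown(["detector-a", "detector-b"], label="Detector")
            image_input = gr.Image(type="pil", label="Image to check")
            submit_btn = gr.Button("Submit")
        prediction = gr.Label(label="Prediction")
    # Wiring the button is the step the commit leaves out:
    submit_btn.click(run_detector, inputs=[model_choice, image_input], outputs=prediction)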
src/about.py CHANGED

@@ -12,8 +12,11 @@ class Task:
 # ---------------------------------------------------
 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    task0 = Task("…
-    task1 = Task("…
+    task0 = Task("miragenews", "acc", "MiRAGeNews")
+    task1 = Task("genimage", "acc", "GenImage")
+    # task2 = Task("cnn_det", "acc_norm", "CNN Detection")
+    # task3 = Task("forgery_net", "acc_norm", "Forgery Net")
+    # task4 = Task("deepfake_det", "acc_norm", "Deepfake Detection")

 NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------

@@ -21,15 +24,15 @@ NUM_FEWSHOT = 0 # Change with your few shot


 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title"> …
+TITLE = """<h1 align="center" id="space-title"> 🕵️ AI-Generated Image Detector Benchmark</h1>"""

 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-…
+AI-Generated Image Detector Benchmark is a platform for evaluating the performance of existing detectors on various data sources and tasks. We collected X images from Y generators with 2 different tasks: Full Image Generation Detection and Partial Image Manipulation Detection.
 """

 # Which evaluations are you running? how can people reproduce what you have?
-LLM_BENCHMARKS_TEXT = f"""
+DETECTOR_BENCHMARKS_TEXT = f"""
 ## How it works

 ## Reproducibility
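For context on the new Task entries: the first field is the benchmark key looked up in each result JSON, the second is the metric key, and the third is the leaderboard column name. A hedged sketch of the per-model result file this implies, following the stock leaderboard template's layout (the model name and scores are invented):

# Assumed result-file shape for the two active tasks; all values are placeholders.
example_result = {
    "config": {
        "model_name": "some-org/some-detector",  # becomes the Model column
        "model_sha": "",                         # unused here; revision handling is commented out in read_evals.py
    },
    "results": {
        "miragenews": {"acc": 0.87},  # surfaces as the "MiRAGeNews" column
        "genimage": {"acc": 0.91},    # surfaces as the "GenImage" column
    },
}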
src/display/utils.py CHANGED

@@ -23,22 +23,22 @@ class ColumnContent:
 ## Leaderboard columns
 auto_eval_column_dict = []
 # Init
-auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
+# auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
 #Scores
 auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
 for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
 # Model information
-auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
-auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
-auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
-auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
-auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
-auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
-auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
-auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
-auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
+# auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
+# auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
+# auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
+# auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
+# auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
+# auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
+# auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
+# auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
+# auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])

 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
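For readers of the hunk above: each entry appends a field whose metadata is a ColumnContent built from positional arguments. A sketch of the ColumnContent dataclass those calls assume (it is defined just above this hunk and is unchanged by the commit; field names follow the stock template):

from dataclasses import dataclass

@dataclass
class ColumnContent:
    name: str                   # header shown in the leaderboard, e.g. "Model"
    type: str                   # column type: "str", "markdown", "number", "bool"
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False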
src/leaderboard/read_evals.py CHANGED

@@ -20,17 +20,17 @@ class EvalResult:
     full_model: str # org/model (path on hub)
     org: str
     model: str
-    revision: str # commit hash, "" if main
+    # revision: str # commit hash, "" if main
     results: dict
-    precision: Precision = Precision.Unknown
-    model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
-    weight_type: WeightType = WeightType.Original # Original or Adapter
-    architecture: str = "Unknown"
-    license: str = "?"
-    likes: int = 0
-    num_params: int = 0
-    date: str = "" # submission date of request file
-    still_on_hub: bool = False
+    # precision: Precision = Precision.Unknown
+    # model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
+    # weight_type: WeightType = WeightType.Original # Original or Adapter
+    # architecture: str = "Unknown"
+    # license: str = "?"
+    # likes: int = 0
+    # num_params: int = 0
+    # date: str = "" # submission date of request file
+    # still_on_hub: bool = False

     @classmethod
     def init_from_json_file(self, json_filepath):

@@ -41,7 +41,7 @@ class EvalResult:
         config = data.get("config")

         # Precision
-        precision = Precision.from_str(config.get("model_dtype"))
+        # precision = Precision.from_str(config.get("model_dtype"))

         # Get model and org
         org_and_model = config.get("model_name", config.get("model_args", None))

@@ -50,11 +50,11 @@ class EvalResult:
         if len(org_and_model) == 1:
             org = None
             model = org_and_model[0]
-            result_key = f"{model}…
+            result_key = f"{model}"
         else:
             org = org_and_model[0]
             model = org_and_model[1]
-            result_key = f"{org}_{model}…
+            result_key = f"{org}_{model}"
         full_model = "/".join(org_and_model)

         still_on_hub, _, model_config = is_model_on_hub(

@@ -85,15 +85,15 @@ class EvalResult:
             org=org,
             model=model,
             results=results,
-            precision=precision,
-            revision= config.get("model_sha", ""),
-            still_on_hub=still_on_hub,
-            architecture=architecture
+            # precision=precision,
+            # revision= config.get("model_sha", ""),
+            # still_on_hub=still_on_hub,
+            # architecture=architecture
         )

     def update_with_request_file(self, requests_path):
         """Finds the relevant request file for the current model and updates info with it"""
-        request_file = get_request_file_for_model(requests_path, self.full_model…
+        request_file = get_request_file_for_model(requests_path, self.full_model)

         try:
             with open(request_file, "r") as f:

@@ -112,27 +112,25 @@ class EvalResult:
         average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
         data_dict = {
             "eval_name": self.eval_name, # not a column, just a save name,
-            AutoEvalColumn.precision.name: self.precision.value.name,
-            AutoEvalColumn.model_type.name: self.model_type.value.name,
-            AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
-            AutoEvalColumn.weight_type.name: self.weight_type.value.name,
-            AutoEvalColumn.architecture.name: self.architecture,
+            # AutoEvalColumn.precision.name: self.precision.value.name,
+            # AutoEvalColumn.model_type.name: self.model_type.value.name,
+            # AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
+            # AutoEvalColumn.weight_type.name: self.weight_type.value.name,
+            # AutoEvalColumn.architecture.name: self.architecture,
             AutoEvalColumn.model.name: make_clickable_model(self.full_model),
-            AutoEvalColumn.revision.name: self.revision,
+            # AutoEvalColumn.revision.name: self.revision,
             AutoEvalColumn.average.name: average,
-            AutoEvalColumn.license.name: self.license,
-            AutoEvalColumn.likes.name: self.likes,
-            AutoEvalColumn.params.name: self.num_params,
-            AutoEvalColumn.still_on_hub.name: self.still_on_hub,
+            # AutoEvalColumn.license.name: self.license,
+            # AutoEvalColumn.likes.name: self.likes,
+            # AutoEvalColumn.params.name: self.num_params,
+            # AutoEvalColumn.still_on_hub.name: self.still_on_hub,
         }
-
         for task in Tasks:
             data_dict[task.value.col_name] = self.results[task.value.benchmark]
-
         return data_dict


-def get_request_file_for_model(requests_path, model_name, precision):
+def get_request_file_for_model(requests_path, model_name):
     """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
     request_files = os.path.join(
         requests_path,

@@ -148,7 +146,7 @@ def get_request_file_for_model(requests_path, model_name, precision):
             req_content = json.load(f)
         if (
             req_content["status"] in ["FINISHED"]
-            and req_content["precision"] == precision.split(".")[-1]
+            # and req_content["precision"] == precision.split(".")[-1]
         ):
             request_file = tmp_request_file
     return request_file

@@ -176,7 +174,7 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
     for model_result_filepath in model_result_filepaths:
         # Creation of result
         eval_result = EvalResult.init_from_json_file(model_result_filepath)
-        eval_result.update_with_request_file(requests_path)
+        # eval_result.update_with_request_file(requests_path)

         # Store results of same eval together
         eval_name = eval_result.eval_name

@@ -187,10 +185,12 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:

     results = []
     for v in eval_results.values():
+        # print(v)
         try:
             v.to_dict() # we test if the dict version is complete
             results.append(v)
         except KeyError: # not all eval values present
+            # print(v)
             continue
-
+    # print("RES", results)
     return results
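Net effect of the read_evals.py changes: to_dict() now emits only the save name, the clickable model link, the average, and one column per task, and request files are matched on model name alone (precision is ignored). A sketch of a resulting leaderboard row, with invented values and column names taken from src/display/utils.py above:

# Hypothetical EvalResult.to_dict() output after this commit; scores are placeholders.
row = {
    "eval_name": "some-org_some-detector",
    "Model": "[some-org/some-detector](https://huggingface.co/some-org/some-detector)",  # make_clickable_model() output; exact markup may differ
    "Average ⬆️": 0.89,  # sum of the non-None task scores divided by len(Tasks)
    "MiRAGeNews": 0.87,
    "GenImage": 0.91,
}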