Commit b3eff40
Parent(s): 09b313f
[ADD] Submit form, upload requests to requests dataset
Files changed:
- .gitignore +2 -0
- app.py +30 -37
- medic-harness-results/meta-llama/Llama-3.1-8B-Instruct/results_2024-07-24T15:26:36Z.json +1 -1
- src/display/utils.py +19 -16
- src/populate.py +0 -3
- src/submission/submit.py +25 -60
.gitignore
CHANGED
@@ -12,4 +12,6 @@ eval-queue-bk/
 eval-results-bk/
 eval-queue-local/
 eval-results-local/
+medic-harness-requests/
+medic-harness-results/
 logs/
app.py
CHANGED
@@ -361,7 +361,7 @@ with demo:
 gr.Markdown(LLM_BENCHMARKS_TEXT_2, elem_classes="markdown-text")
 # gr.HTML(ENTITY_DISTRIBUTION_IMG, elem_classes="logo")
 gr.Markdown(LLM_BENCHMARKS_TEXT_3, elem_classes="markdown-text")
-
+
 with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=4):
     with gr.Column():
         with gr.Row():

@@ -407,16 +407,8 @@ with demo:
 
 with gr.Row():
     with gr.Column():
-
         model_name_textbox = gr.Textbox(label="Model name")
-
-        revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
+        revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
-
-        model_arch = gr.Radio(
-            choices=[t.to_str(" : ") for t in ModelArch if t != ModelArch.Unknown],
-            label="Model Architecture",
-        )
-
         model_type = gr.Dropdown(
             choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
             label="Model type",

@@ -426,29 +418,32 @@ with demo:
         )
 
     with gr.Column():
-        [six removed widget lines; not preserved in this view]
+        precision = gr.Dropdown(
+            choices=[i.value.name for i in Precision if i != Precision.Unknown],
+            label="Precision",
+            multiselect=False,
+            value="float16",
+            interactive=True,
         )
-
-        choices=[ …
-        label=" …
+        weight_type = gr.Dropdown(
+            choices=[i.value.name for i in WeightType],
+            label="Weights type",
             multiselect=False,
-        value= …
+            value=WeightType.Original.value.name,
             interactive=True,
-        [ten removed widget lines; not preserved in this view]
+        )
+        base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)", interactive=False)
+        with gr.Row():
+            domain_specific_toggle = gr.Checkbox(
+                label="Domain specific",
+                value=False,
+                info="Is your model medically oriented?",
+            )
+            chat_template_toggle = gr.Checkbox(
+                label="Use chat template",
+                value=False,
+                info="Is your model a chat model?",
+            )
 
     submit_button = gr.Button("Submit Eval")
     submission_result = gr.Markdown()

@@ -456,15 +451,13 @@ with demo:
         add_new_eval,
         [
             model_name_textbox,
-
+            base_model_name_textbox,
             revision_name_textbox,
-            model_arch,
-            label_normalization_map,
-            gliner_threshold,
-            gliner_tokenizer_bool,
-            prompt_name,
-            # weight_type,
             model_type,
+            domain_specific_toggle,
+            chat_template_toggle,
+            precision,
+            weight_type
         ],
         submission_result,
     )
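For orientation, here is a minimal, self-contained sketch (not the Space's actual code; component names and choice lists are illustrative) of how the reworked submit form feeds add_new_eval. gr.Button.click passes the listed components' current values to the callback positionally, so the order of the inputs list above must line up with the add_new_eval signature in src/submission/submit.py.

import gradio as gr

def add_new_eval(model, base_model, revision, model_type,
                 domain_specific, chat_template, precision, weight_type):
    # Stub standing in for src/submission/submit.py:add_new_eval.
    return f"Received request for {model} ({precision}, {weight_type})"

with gr.Blocks() as demo:
    model_name = gr.Textbox(label="Model name")
    base_model_name = gr.Textbox(label="Base model (for delta or adapter weights)")
    revision = gr.Textbox(label="Revision commit", placeholder="main")
    model_type = gr.Dropdown(choices=["🟢 : pretrained", "⭕ : instruction-tuned"], label="Model type")
    domain_specific = gr.Checkbox(label="Domain specific", value=False)
    chat_template = gr.Checkbox(label="Use chat template", value=False)
    precision = gr.Dropdown(choices=["float16", "bfloat16"], value="float16", label="Precision")
    weight_type = gr.Dropdown(choices=["Original", "Delta", "Adapter"], value="Original", label="Weights type")
    submit_button = gr.Button("Submit Eval")
    submission_result = gr.Markdown()
    # Positional wiring: the inputs order matches the add_new_eval parameter order.
    submit_button.click(
        add_new_eval,
        [model_name, base_model_name, revision, model_type,
         domain_specific, chat_template, precision, weight_type],
        submission_result,
    )

if __name__ == "__main__":
    demo.launch()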
medic-harness-results/meta-llama/Llama-3.1-8B-Instruct/results_2024-07-24T15:26:36Z.json
CHANGED
@@ -3,7 +3,7 @@
     "model_name": "meta-llama/Llama-3.1-8B-Instruct",
     "revision": "main",
     "submitted_time": "2024-07-24 14:33:56+00:00",
-    "model_type": " …
+    "model_type": "instruction-tuned",
     "num_params": 8000000000,
     "private": false,
     "evaluated_time": "2024-07-24T15:26:36Z"
src/display/utils.py
CHANGED
@@ -58,9 +58,9 @@ class EvalQueueColumn: # Queue column
     model = ColumnContent("model", "markdown", True)
     revision = ColumnContent("revision", "str", True)
     private = ColumnContent("private", "bool", True)
-    [three removed column definitions; not preserved in this view]
+    model_type = ColumnContent("model_type", "str", True)
+    precision = ColumnContent("precision", "str", True)
+    weight_type = ColumnContent("weight_type", "str", "Original")
     status = ColumnContent("status", "str", True)

@@ -73,12 +73,13 @@ class ModelDetails:
 
 
 class ModelType(Enum):
-    ZEROSHOT = ModelDetails(name="zero-shot", symbol="⚫")
-    FINETUNED = ModelDetails(name="fine-tuned", symbol="⚪")
+    # ZEROSHOT = ModelDetails(name="zero-shot", symbol="⚫")
+    # FINETUNED = ModelDetails(name="fine-tuned", symbol="⚪")
     PT = ModelDetails(name="pretrained", symbol="🟢")
-    FT = ModelDetails(name="fine-tuned", symbol="🔶")
-    # …
-    …
+    # FT = ModelDetails(name="fine-tuned", symbol="🔶")
+    # DS = ModelDetails(name="domain-specific", symbol="➕")
+    IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
+    RL = ModelDetails(name="preference-tuned", symbol="🟦")
     Unknown = ModelDetails(name="", symbol="?")
 
     def to_str(self, separator=" "):

@@ -86,18 +87,20 @@ class ModelType(Enum):
 
     @staticmethod
     def from_str(type):
-        if "zero-shot" in type or "⚫" in type:
-            …
-        if "fine-tuned" in type or "⚪" in type:
-            …
+        # if "zero-shot" in type or "⚫" in type:
+        #     return ModelType.ZEROSHOT
+        # if "fine-tuned" in type or "⚪" in type:
+        #     return ModelType.FINETUNED
         # if "fine-tuned" in type or "🔶" in type:
         #     return ModelType.FT
-        …
-        …
+        if "pretrained" in type or "🟢" in type:
+            return ModelType.PT
         # if "RL-tuned" in type or "🟦" in type:
         #     return ModelType.RL
-        …
-        …
+        if "instruction-tuned" in type or "⭕" in type:
+            return ModelType.IFT
+        # if "domain-specific" in type or "➕" in type:
+        #     return ModelType.DS
         return ModelType.Unknown
 
 class ModelArch(Enum):
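The submit form stores the model type as the string produced by to_str(" : ") (symbol, separator, name), and from_str maps that string back onto the enum when requests and results are read. A small round-trip sketch, assuming ModelDetails is a plain dataclass with name and symbol fields (an assumption; the diff only shows the enum members and the from_str checks):

from dataclasses import dataclass
from enum import Enum

@dataclass
class ModelDetails:
    name: str
    symbol: str = ""

class ModelType(Enum):
    PT = ModelDetails(name="pretrained", symbol="🟢")
    IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
    RL = ModelDetails(name="preference-tuned", symbol="🟦")
    Unknown = ModelDetails(name="", symbol="?")

    def to_str(self, separator=" "):
        # Assumed formatting: "<symbol><separator><name>", matching the
        # "<emoji> : <name>" choices built in app.py.
        return f"{self.value.symbol}{separator}{self.value.name}"

    @staticmethod
    def from_str(type):
        if "pretrained" in type or "🟢" in type:
            return ModelType.PT
        if "instruction-tuned" in type or "⭕" in type:
            return ModelType.IFT
        return ModelType.Unknown

choice = ModelType.IFT.to_str(" : ")        # "⭕ : instruction-tuned", as shown in the dropdown
assert ModelType.from_str(choice) is ModelType.IFT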
src/populate.py
CHANGED
@@ -29,16 +29,13 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
     """Creates the different dataframes for the evaluation queues requestes"""
     entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
     all_evals = []
-
     for entry in entries:
         if ".json" in entry:
             file_path = os.path.join(save_path, entry)
             with open(file_path) as fp:
                 data = json.load(fp)
-
             data[EvalQueueColumn.model.name] = make_clickable_model(data["model_name"])
             data[EvalQueueColumn.revision.name] = data.get("revision", "main")
-
             all_evals.append(data)
         elif ".md" not in entry:
             # this is a folder
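populate.py itself only loses blank lines here, but it is the read-back side of the new request files: every *_eval_request.json uploaded to the requests dataset is loaded and turned into rows for the queue tables. A rough sketch of that path under assumed names (EVAL_REQUESTS_PATH pointing at a local copy of the medic-harness-requests dataset), not the Space's exact code:

import glob
import json
import os

import pandas as pd

EVAL_REQUESTS_PATH = "medic-harness-requests"  # assumed local path to the requests dataset

def load_requests(save_path: str, status: str = "PENDING") -> pd.DataFrame:
    # Walk every request file and keep the ones in the wanted state.
    rows = []
    for path in glob.glob(os.path.join(save_path, "**", "*.json"), recursive=True):
        with open(path) as fp:
            data = json.load(fp)
        if data.get("status") == status:
            rows.append({
                "model": data["model_name"],
                "revision": data.get("revision", "main"),
                "precision": data.get("precision", ""),
                "weight_type": data.get("weight_type", "Original"),
                "status": data["status"],
            })
    return pd.DataFrame(rows)

print(load_requests(EVAL_REQUESTS_PATH))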
src/submission/submit.py
CHANGED
@@ -42,16 +42,13 @@ PLACEHOLDER_DATASET_WISE_NORMALIZATION_CONFIG = """{
 
 def add_new_eval(
     model: str,
-
+    base_model: str,
     revision: str,
-    # precision: str,
-    # weight_type: str,
-    model_arch: str,
-    label_normalization_map: str,
-    gliner_threshold:str,
-    gliner_tokenizer_bool:str,
-    prompt_template_name:str,
     model_type: str,
+    domain_specific: bool,
+    chat_template: bool,
+    precision: str,
+    weight_type: str,
 ):
     """
     Saves request if valid else returns the error.

@@ -85,22 +82,16 @@ def add_new_eval(
     if revision == "":
         revision = "main"
 
-    [five removed lines; not preserved in this view]
+    # Is the model on the hub?
+    if weight_type in ["Delta", "Adapter"]:
+        base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=TOKEN, test_tokenizer=True)
+        if not base_model_on_hub:
+            return styled_error(f'Base model "{base_model}" {error}')
 
-    if not …
+    if not weight_type == "Adapter":
         model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, token=TOKEN, test_tokenizer=True)
         if not model_on_hub:
             return styled_error(f'Model "{model}" {error}')
-    else:
-        model_name_matches = list(API.list_models(model_name=model))
-        if len(model_name_matches) < 1:
-            return styled_error(f'Model "{model}" does not exist on the hub!')
-        elif model_name_matches[0].id != model:
-            return styled_error(f'Model "{model}" does not exist on the hub! There might be a typo in the name')
 
 
     # Is the model info correctly filled?

@@ -122,39 +113,15 @@ def add_new_eval(
         return styled_error(error_msg)
 
     # Verify the inference config now
-    try:
-        …
-    except Exception as e:
-        …
-
-    inference_config = {
-        # "model_arch" : model_arch,
-        "label_normalization_map": label_normalization_map,
-    }
+    # try:
+    #     label_normalization_map = ast.literal_eval(label_normalization_map)
+    # except Exception as e:
+    #     return styled_error("Please enter a valid json for the labe; normalization map")
 
-    [four removed lines; not preserved in this view]
-            if not prompt_template_name in [prompt_template.value for prompt_template in PromptTemplateName]:
-                return styled_error("Prompt template name is invalid")
-            inference_config = {
-                **inference_config,
-                "prompt_template_identifier": prompt_template_name,
-            }
-        case "GLiNER Encoder":
-            try:
-                gliner_threshold = float(gliner_threshold)
-                gliner_tokenizer_bool = ast.literal_eval(gliner_tokenizer_bool)
-                inference_config = {
-                    **inference_config,
-                    "gliner_threshold": gliner_threshold,
-                    "gliner_tokenizer_bool" : gliner_tokenizer_bool
-                }
-            except Exception as e:
-                return styled_error("Please enter a valid float for the threshold")
-        case _:
-            return styled_error("Model Architecture is invalid")
+    # inference_config = {
+    #     # "model_arch" : model_arch,
+    #     "label_normalization_map": label_normalization_map,
+    # }
 
     # Seems good, creating the eval
     print("Adding new eval")

@@ -162,11 +129,10 @@ def add_new_eval(
 
     eval_entry = {
         "model_name": model,
-
+        "base_model": base_model,
         "revision": revision,
-
-
-        "model_architecture": model_arch,
+        "precision": precision,
+        "weight_type": weight_type,
         "status": "PENDING",
         "submitted_time": current_time,
         "model_type": model_type,

@@ -174,18 +140,17 @@ def add_new_eval(
         "num_params": model_size,
         "license": license,
         "private": False,
-        "inference_config":inference_config,
     }
 
     # Check for duplicate submission
 
-    if f"{model}_{revision}" in REQUESTED_MODELS:
+    if f"{model}_{revision}_{precision}" in REQUESTED_MODELS:
        return styled_warning("This model has been already submitted. Add the revision if the model has been updated.")
 
     print("Creating eval file")
     OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
     os.makedirs(OUT_DIR, exist_ok=True)
-    out_path = f"{OUT_DIR}/{model_path}_{revision}_eval_request.json"
+    out_path = f"{OUT_DIR}/{model_path}_{revision}_{precision}_{weight_type}_eval_request.json"
 
     with open(out_path, "w") as f:
         f.write(json.dumps(eval_entry))

@@ -193,7 +158,7 @@ def add_new_eval(
     print("Uploading eval file")
     API.upload_file(
         path_or_fileobj=out_path,
-        path_in_repo=out_path.split(" …
+        path_in_repo=out_path.split(f"{EVAL_REQUESTS_PATH}/")[1],
         repo_id=QUEUE_REPO,
         repo_type="dataset",
         commit_message=f"Add {model} to eval queue",
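Taken together, a submission now records base_model, precision and weight_type, is de-duplicated on model + revision + precision, and is written to the requests dataset under a file name that encodes all of them. The sketch below reproduces just that bookkeeping with illustrative values (the medic-harness-requests path, the model_type string and the omitted num_params/license fields are assumptions, not the Space's exact output):

import json
import os
from datetime import datetime, timezone

EVAL_REQUESTS_PATH = "medic-harness-requests"   # assumed local copy of the requests dataset
user_name, model_path = "meta-llama", "Llama-3.1-8B-Instruct"
model = f"{user_name}/{model_path}"
revision, precision, weight_type = "main", "float16", "Original"

eval_entry = {
    "model_name": model,
    "base_model": "",
    "revision": revision,
    "precision": precision,
    "weight_type": weight_type,
    "status": "PENDING",
    "submitted_time": datetime.now(timezone.utc).isoformat(),
    "model_type": "⭕ : instruction-tuned",
    "private": False,
}

# Duplicate submissions are now keyed on model, revision and precision.
dedup_key = f"{model}_{revision}_{precision}"

out_dir = f"{EVAL_REQUESTS_PATH}/{user_name}"
os.makedirs(out_dir, exist_ok=True)
out_path = f"{out_dir}/{model_path}_{revision}_{precision}_{weight_type}_eval_request.json"
with open(out_path, "w") as f:
    f.write(json.dumps(eval_entry))

# path_in_repo strips the local prefix so the dataset mirrors the per-org folder layout:
print(dedup_key)
print(out_path.split(f"{EVAL_REQUESTS_PATH}/")[1])
# -> meta-llama/Llama-3.1-8B-Instruct_main_float16_Original_eval_request.json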