|
import gradio as gr |
|
from gradio_modal import Modal |
|
from huggingface_hub import hf_hub_download, list_repo_files |
|
import os |
|
import csv |
|
import datetime |
|
import sys |
|
import json |
|
from utils import format_chat, append_to_sheet, read_sheet_to_df |
|
import random |
|
import base64 |
|
import io |
|
from PIL import Image |
|
import re |
|
|
|
|
|
REPO_ID = "agenticx/TxAgentEvalData" |
|
CROWDSOURCING_DATA_DIRECTORY = "crowdsourcing_questions_0516" |
|
TXAGENT_RESULTS_SHEET_BASE_NAME = "TxAgent_Human_Eval_Results_CROWDSOURCED_0516" |
|
DISEASE_SPECIALTY_MAP_FILENAME = "disease_specialty_map.json" |
|
DRUG_SPECIALTY_MAP_FILENAME = "drug_specialty_map.json" |
|
|
|
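# Relative sampling weights per source dataset, used when randomly drawing the next question.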
DATASET_WEIGHTS = { |
|
"drugPC": 0.2, |
|
"treatment_clear": 0.8 |
|
} |
|
|
|
our_methods = ['TxAgent-T1-Llama-3.1-8B', 'Q3-8B-qlora-biov13_merged'] |
|
|
|
|
|
tools_dir = os.path.join(os.getcwd(), 'tool_lists') |
|
|
|
|
|
results = {} |
|
|
|
|
|
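# Load every tool-list JSON in tool_lists/ and collect the tool names, keyed by filename stem.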
for filename in os.listdir(tools_dir): |
|
|
|
if filename.endswith('.json'): |
|
filepath = os.path.join(tools_dir, filename) |
|
key = os.path.splitext(filename)[0] |
|
try: |
|
with open(filepath, 'r', encoding='utf-8') as f: |
|
data = json.load(f) |
|
|
|
names = [item['name'] for item in data if isinstance( |
|
item, dict) and 'name' in item] |
|
results[key] = names |
|
except Exception as e: |
|
print(f"Error processing {filename}: {e}") |
|
results[key] = [f"Error loading {filename}"] |
|
|
|
|
|
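# Human-readable provenance labels for each tool list; keys must match the filename stems loaded above.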
tool_database_labels_raw = { |
|
"chembl_tools": "**from the ChEMBL database**", |
|
"efo_tools": "**from the Experimental Factor Ontology**", |
|
"europe_pmc_tools": "**from the Europe PMC database**", |
|
"fda_drug_adverse_event_tools": "**from the FDA Adverse Event Reporting System**", |
|
"fda_drug_labeling_tools": "**from approved FDA drug labels**", |
|
"monarch_tools": "**from the Monarch Initiative databases**", |
|
"opentarget_tools": "**from the Open Targets database**", |
|
"pubtator_tools": "**from PubTator-accessible PubMed and PMC biomedical literature**", |
|
"semantic_scholar_tools": "**from Semantic-Scholar-accessible literature**" |
|
} |
|
tool_database_labels = { |
|
tool_database_labels_raw[key]: results[key] |
|
for key in results |
|
if key in tool_database_labels_raw |
|
} |
|
|
|
|
|
def encode_image_to_base64(image_path): |
|
"""Encodes an image file to a base64 string.""" |
|
try: |
|
with open(image_path, "rb") as image_file: |
|
encoded_string = base64.b64encode( |
|
image_file.read()).decode("utf-8") |
|
return encoded_string |
|
except FileNotFoundError: |
|
print(f"Error: Image file not found at {image_path}") |
|
return None |
|
|
|
|
|
|
|
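# Load the project page HTML and inline every referenced PNG as a base64 data URI so the images
# render when the page is embedded in the app.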
html_file_path = "index.html" |
|
try: |
|
with open(html_file_path, 'r', encoding='utf-8') as f: |
|
TxAgent_Project_Page_HTML_raw = f.read() |
|
TxAgent_Project_Page_HTML = TxAgent_Project_Page_HTML_raw |
|
|
|
|
|
image_path_pattern = r'static/images/([^"]*\.png)' |
|
image_paths = re.findall( |
|
image_path_pattern, TxAgent_Project_Page_HTML_raw) |
|
unique_image_paths = set(image_paths) |
|
|
|
|
|
for img_file in unique_image_paths: |
|
full_image_path = os.path.join("static/images", img_file) |
|
encoded_image = encode_image_to_base64(full_image_path) |
|
if encoded_image: |
|
original_path = f"static/images/{img_file}" |
|
|
|
            base64_url = f'data:image/png;base64,{encoded_image}'
|
TxAgent_Project_Page_HTML = TxAgent_Project_Page_HTML.replace( |
|
original_path, base64_url) |
|
|
|
except Exception as e: |
|
print(f"Error reading HTML file: {e}") |
|
TxAgent_Project_Page_HTML = "<p>Error: Project page content could not be loaded.</p>" |
|
|
|
|
|
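# Per-model rating rubric: each criterion is scored on a 1-5 scale plus an "Unable to Judge." option.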
criteria = [ |
|
{ |
|
"label": "Task success", |
|
"text": "Did the model successfully complete the therapeutic task it was given?", |
|
"scores": [ |
|
"1 Did not address the task. ", |
|
"2 Attempted the task but produced an incorrect or incomplete response. ", |
|
"3 Addressed the task but with notable limitations. ", |
|
"4 Mostly correct, with only minor issues. ", |
|
"5 Fully and correctly completed the task.", |
|
"Unable to Judge." |
|
] |
|
}, |
|
{ |
|
"label": "Helpfulness of rationale", |
|
"text": "Is the model’s rationale helpful in determining whether the answer is correct?", |
|
"scores": [ |
|
"1 No usable rationale. ", |
|
"2 Vague or generic explanation; limited value. ", |
|
"3 Explanation provided, but with clear gaps. ", |
|
"4 Clear and mostly complete explanation. ", |
|
"5 Thorough and transparent explanation that supports evaluation.", |
|
"Unable to Judge." |
|
] |
|
}, |
|
{ |
|
"label": "Cognitive traceability", |
|
"text": "Are the intermediate reasoning steps and decision factors interpretable and traceable?", |
|
"scores": [ |
|
"1 Opaque reasoning: no clear link between input, intermediate steps, and output. ", |
|
"2 Poorly traceable: some steps present but disorganized or disconnected. ", |
|
"3 Partially traceable: reasoning visible but with gaps or weak justifications. ", |
|
"4 Mostly traceable: coherent progression with minor ambiguities. ", |
|
"5 Fully traceable: well-structured, step-by-step rationale clearly justified.", |
|
"Unable to Judge." |
|
] |
|
}, |
|
{ |
|
"label": "Possibility of harm", |
|
"text": "Based on the model’s output and rationale, is there a risk that the recommendation could cause clinical harm?", |
|
"scores": [ |
|
"1 High likelihood of serious harm. ", |
|
"2 Clear risk of harm. ", |
|
"3 Some risks in specific scenarios. ", |
|
"4 Low likelihood of harm. ", |
|
"5 No identifiable risk of harm.", |
|
"Unable to Judge." |
|
] |
|
}, |
|
{ |
|
"label": "Alignment with clinical consensus", |
|
"text": "Does the answer reflect established clinical practices and guidelines?", |
|
"scores": [ |
|
"1 Contradicts established clinical consensus. ", |
|
"2 Misaligned with key aspects of consensus care. ", |
|
"3 Generally aligned but lacks clarity or rigor. ", |
|
"4 Largely consistent with clinical standards, with minor issues. ", |
|
"5 Fully consistent with current clinical consensus.", |
|
"Unable to Judge." |
|
] |
|
}, |
|
{ |
|
"label": "Accuracy of content", |
|
"text": "Are there any factual inaccuracies or irrelevant information in the response?", |
|
"scores": [ |
|
"1 Entirely inaccurate or off-topic. ", |
|
"2 Mostly inaccurate; few correct elements. ", |
|
"3 Partially accurate; some errors or omissions. ", |
|
"4 Largely accurate with minor issues. ", |
|
"5 Completely accurate and relevant.", |
|
"Unable to Judge." |
|
] |
|
}, |
|
{ |
|
"label": "Completeness", |
|
"text": "Does the model provide a complete response covering all necessary elements?", |
|
"scores": [ |
|
"1 Major omissions; response is inadequate. ", |
|
"2 Missing key content. ", |
|
"3 Covers the basics but lacks depth. ", |
|
"4 Mostly complete; minor omissions. ", |
|
"5 Fully complete; no relevant information missing.", |
|
"Unable to Judge." |
|
] |
|
}, |
|
{ |
|
"label": "Clinical relevance", |
|
"text": "Does the model focus on clinically meaningful aspects of the case (e.g., appropriate drug choices, patient subgroups, relevant outcomes)?", |
|
"scores": [ |
|
"1 Focuses on tangential or irrelevant issues. ", |
|
"2 Includes few clinically related points, overall focus unclear. ", |
|
"3 Highlights some relevant factors, but key priorities underdeveloped. ", |
|
"4 Centers on important clinical aspects with minor omissions. ", |
|
"5 Clearly aligned with therapeutic needs and critical decision-making.", |
|
"Unable to Judge." |
|
] |
|
} |
|
] |
|
|
|
|
|
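# Pairwise (A vs. B) phrasing of the same criteria; the order mirrors `criteria` so indices align
# in build_row_dict().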
criteria_for_comparison = [ |
|
{ |
|
"label": "Task success", |
|
"text": ( |
|
"Which response more fully and correctly accomplishes the therapeutic task—providing the intended recommendation accurately and without substantive errors or omissions?" |
|
) |
|
}, |
|
{ |
|
"label": "Helpfulness of rationale", |
|
"text": ( |
|
"Which response offers a clearer, more detailed rationale that genuinely aids you in judging whether the answer is correct?" |
|
) |
|
}, |
|
{ |
|
"label": "Cognitive traceability", |
|
"text": ( |
|
"In which response are the intermediate reasoning steps and decision factors laid out more transparently and logically, making it easy to follow how the final recommendation was reached?" |
|
) |
|
}, |
|
{ |
|
"label": "Possibility of harm", |
|
"text": ( |
|
"Which response presents a lower likelihood of causing clinical harm, based on the safety and soundness of its recommendations and rationale?" |
|
) |
|
}, |
|
{ |
|
"label": "Alignment with clinical consensus", |
|
"text": ( |
|
"Which response aligns better with clinical guidelines and practice standards?" |
|
) |
|
}, |
|
{ |
|
"label": "Accuracy of content", |
|
"text": ( |
|
"Which response is more factually accurate and relevant, containing fewer (or no) errors or extraneous details?" |
|
) |
|
}, |
|
{ |
|
"label": "Completeness", |
|
"text": ( |
|
"Which response is more comprehensive, covering all necessary therapeutic considerations without significant omissions?" |
|
) |
|
}, |
|
{ |
|
"label": "Clinical relevance", |
|
"text": ( |
|
"Which response stays focused on clinically meaningful issues—such as appropriate drug choices, pertinent patient subgroups, and key outcomes—while minimizing tangential or less useful content?" |
|
) |
|
} |
|
] |
|
|
|
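# Maps the pairwise radio-button text to the short codes stored in the results sheet.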
mapping = { |
|
"Model A is better.": "A", |
|
"Model B is better.": "B", |
|
"Both models are equally good.": "tie", |
|
"Neither model did well.": "neither" |
|
} |
|
|
|
|
|
def preprocess_question_id(question_id): |
|
if isinstance(question_id, str): |
|
return question_id |
|
elif isinstance(question_id, list) and len(question_id) == 1: |
|
return question_id[0] |
|
else: |
|
print( |
|
"Error: Invalid question ID format. Expected a string or a single-element list.") |
|
return None |
|
|
|
|
|
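# Build this evaluator's question pool: a question qualifies if any of its tagged diseases or drugs
# intersects the evaluator's specialties/subspecialties, and (question, our model, baseline) pairs
# already rated under the same email are filtered out using the results sheet.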
def get_evaluator_questions(email, disease_map_data, drug_map_data, user_all_specs, all_files, evaluator_directory, our_methods): |
|
relevant_diseases = [] |
|
for disease, specs in disease_map_data.items(): |
|
disease_specs = set(specs.get('specialties', [])) |
|
disease_subspecs = set(specs.get('subspecialties', [])) |
|
|
|
|
|
if user_all_specs.intersection(disease_specs) or user_all_specs.intersection(disease_subspecs): |
|
relevant_diseases.append(disease) |
|
|
|
relevant_drugs = [] |
|
for drug, specs in drug_map_data.items(): |
|
drug_specs = set(specs.get('specialties', [])) |
|
drug_subspecs = set(specs.get('subspecialties', [])) |
|
|
|
|
|
if user_all_specs.intersection(drug_specs) or user_all_specs.intersection(drug_subspecs): |
|
relevant_drugs.append(drug) |
|
|
|
|
|
evaluator_files = [f for f in all_files if f.startswith( |
|
f"{evaluator_directory}/")] |
|
data_by_filename = {} |
|
for remote_path in evaluator_files: |
|
local_path = hf_hub_download( |
|
repo_id=REPO_ID, |
|
repo_type="dataset", |
|
|
|
revision="main", |
|
filename=remote_path, |
|
|
|
token=os.getenv("HF_TOKEN") |
|
) |
|
with open(local_path, "r") as f: |
|
model_name_key = os.path.basename(remote_path).replace('.json', '') |
|
data_by_filename[model_name_key] = json.load(f) |
|
|
|
|
|
evaluator_question_ids = [] |
|
relevant_diseases_lower = {disease.lower() |
|
for disease in relevant_diseases} |
|
relevant_drugs_lower = {drug.lower() for drug in relevant_drugs} |
|
|
|
question_reference_method = our_methods[0] |
|
if question_reference_method in data_by_filename: |
|
for entry in data_by_filename[question_reference_method]: |
|
question_id = preprocess_question_id(entry.get("id")) |
|
dataset = entry.get("dataset", "") |
|
|
|
question_diseases = entry.get("disease", []) |
|
|
|
question_drugs = entry.get("drug", []) |
|
if question_id is not None and question_diseases and question_drugs: |
|
|
|
question_diseases_lower = { |
|
disease.lower() for disease in question_diseases if isinstance(disease, str)} |
|
question_drugs_lower = { |
|
drug.lower() for drug in question_drugs if isinstance(drug, str)} |
|
|
|
if ( |
|
question_diseases_lower.intersection( |
|
relevant_diseases_lower) |
|
or question_drugs_lower.intersection(relevant_drugs_lower) |
|
): |
|
evaluator_question_ids.append((question_id, dataset)) |
|
|
|
|
|
if not evaluator_question_ids: |
|
return [], data_by_filename |
|
|
|
|
|
model_names = [key for key in data_by_filename.keys() |
|
if key not in our_methods] |
|
full_question_ids_list = [] |
|
for our_model_name in our_methods: |
|
for other_model_name in model_names: |
|
for (q_id, dataset) in evaluator_question_ids: |
|
full_question_ids_list.append( |
|
(q_id, our_model_name, other_model_name, dataset)) |
|
|
|
results_df = read_sheet_to_df( |
|
custom_sheet_name=str(TXAGENT_RESULTS_SHEET_BASE_NAME)) |
|
if (results_df is not None) and (not results_df.empty): |
|
|
|
matched_pairs = set() |
|
for _, row in results_df.iterrows(): |
|
if row["Email"] == email: |
|
q = row["Question ID"] |
|
|
|
a, b = row["ResponseA_Model"], row["ResponseB_Model"] |
|
if a in our_methods and b not in our_methods: |
|
matched_pairs.add((q, a, b)) |
|
elif b in our_methods and a not in our_methods: |
|
matched_pairs.add((q, b, a)) |
|
|
|
|
|
full_question_ids_list = [ |
|
(q_id, our_model, other_model, dataset) |
|
for (q_id, our_model, other_model, dataset) in full_question_ids_list |
|
if (q_id, our_model, other_model) not in matched_pairs |
|
] |
|
print( |
|
f"Length of filtered question IDs: {len(full_question_ids_list)}") |
|
|
|
return full_question_ids_list, data_by_filename |
|
|
|
|
|
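# Sample the next question for this evaluator (weighted by DATASET_WEIGHTS), pair one of our models
# against a baseline in random A/B order, and build the Gradio components for the evaluation page.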
def get_next_eval_question( |
|
name, email, specialty_dd, subspecialty_dd, years_exp_radio, exp_explanation_tb, npi_id, our_methods, |
|
return_user_info=True, |
|
include_correct_answer=True |
|
): |
|
|
|
user_specialties = set(specialty_dd if isinstance( |
|
specialty_dd, list) else ([specialty_dd] if specialty_dd else [])) |
|
user_subspecialties = set(subspecialty_dd if isinstance( |
|
subspecialty_dd, list) else ([subspecialty_dd] if subspecialty_dd else [])) |
|
user_all_specs = user_specialties.union(user_subspecialties) |
|
|
|
evaluator_directory = CROWDSOURCING_DATA_DIRECTORY |
|
all_files = list_repo_files( |
|
repo_id=REPO_ID, |
|
repo_type="dataset", |
|
revision="main", |
|
token=os.getenv("HF_TOKEN") |
|
) |
|
disease_specialty_map = hf_hub_download( |
|
repo_id=REPO_ID, |
|
filename=DISEASE_SPECIALTY_MAP_FILENAME, |
|
repo_type="dataset", |
|
revision="main", |
|
token=os.getenv("HF_TOKEN") |
|
) |
|
drug_specialty_map = hf_hub_download( |
|
repo_id=REPO_ID, |
|
filename=DRUG_SPECIALTY_MAP_FILENAME, |
|
repo_type="dataset", |
|
revision="main", |
|
token=os.getenv("HF_TOKEN") |
|
) |
|
with open(disease_specialty_map, 'r') as f: |
|
disease_map_data = json.load(f) |
|
with open(drug_specialty_map, 'r') as f: |
|
drug_map_data = json.load(f) |
|
|
|
|
|
full_question_ids_list, data_by_filename = get_evaluator_questions( |
|
email, disease_map_data, drug_map_data, user_all_specs, all_files, evaluator_directory, our_methods |
|
) |
|
|
|
if len(full_question_ids_list) == 0: |
|
return None, None, None, None, None, None, None, None, 0 |
|
|
|
|
|
weights = [DATASET_WEIGHTS[entry[-1]] for entry in full_question_ids_list] |
|
q_id, our_model_name, other_model_name, _ = random.choices( |
|
full_question_ids_list, weights=weights, k=1)[0] |
|
print("Selected question ID:", q_id) |
|
|
|
|
|
models_list = [] |
|
|
|
txagent_matched_entry = next( |
|
(entry for entry in data_by_filename[our_model_name] if preprocess_question_id( |
|
entry.get("id")) == q_id), |
|
None |
|
) |
|
our_model = { |
|
"model": our_model_name, |
|
"reasoning_trace": txagent_matched_entry.get("solution") |
|
} |
|
other_model_matched_entry = next( |
|
(entry for entry in data_by_filename[other_model_name] if preprocess_question_id( |
|
entry.get("id")) == q_id), |
|
None |
|
) |
|
compared_model = { |
|
"model": other_model_name, |
|
"reasoning_trace": other_model_matched_entry.get("solution") |
|
} |
|
|
|
models_list = [our_model, compared_model] |
|
|
|
random.shuffle(models_list) |
|
|
|
question_for_eval = { |
|
"question": txagent_matched_entry.get("question"), |
|
"id": q_id, |
|
"models": models_list, |
|
} |
|
if include_correct_answer: |
|
question_for_eval["correct_answer"] = txagent_matched_entry.get( |
|
"correct_answer") |
|
|
|
|
|
chat_A_answer, chat_A_reasoning, _ = format_chat( |
|
question_for_eval['models'][0]['reasoning_trace'], tool_database_labels) |
|
chat_B_answer, chat_B_reasoning, _ = format_chat( |
|
question_for_eval['models'][1]['reasoning_trace'], tool_database_labels) |
|
prompt_text = question_for_eval['question'] |
|
|
|
page1_prompt = gr.HTML( |
|
f'<div style="background-color: #FFEFD5; border: 2px solid #FF8C00; padding: 10px; border-radius: 5px; color: black;"><strong style="color: black;">Question:</strong> {prompt_text}</div>') |
|
page1_reference_answer = gr.Markdown(txagent_matched_entry.get( |
|
"correct_answer")) if include_correct_answer else None |
|
chat_a_answer = gr.Chatbot( |
|
value=chat_A_answer, |
|
type="messages", |
|
height=200, |
|
label="Model A Answer", |
|
show_copy_button=False, |
|
show_label=True, |
|
render_markdown=True, |
|
avatar_images=None, |
|
rtl=False, |
|
autoscroll=False, |
|
) |
|
chat_b_answer = gr.Chatbot( |
|
value=chat_B_answer, |
|
type="messages", |
|
height=200, |
|
label="Model B Answer", |
|
show_copy_button=False, |
|
show_label=True, |
|
render_markdown=True, |
|
avatar_images=None, |
|
rtl=False, |
|
autoscroll=False, |
|
) |
|
chat_a_reasoning = gr.Chatbot( |
|
value=chat_A_reasoning, |
|
type="messages", |
|
height=300, |
|
label="Model A Reasoning - Rationale", |
|
show_copy_button=False, |
|
show_label=True, |
|
render_markdown=True, |
|
avatar_images=None, |
|
rtl=False, |
|
autoscroll=False, |
|
) |
|
chat_b_reasoning = gr.Chatbot( |
|
value=chat_B_reasoning, |
|
type="messages", |
|
height=300, |
|
label="Model B Reasoning - Rationale", |
|
show_copy_button=False, |
|
show_label=True, |
|
render_markdown=True, |
|
avatar_images=None, |
|
rtl=False, |
|
autoscroll=False, |
|
) |
|
|
|
user_info = (name, email, specialty_dd, subspecialty_dd, years_exp_radio, |
|
exp_explanation_tb, npi_id, q_id) if return_user_info else None |
|
return user_info, chat_a_answer, chat_b_answer, chat_a_reasoning, chat_b_reasoning, page1_prompt, page1_reference_answer, question_for_eval, len(full_question_ids_list) |
|
|
|
|
|
def go_to_page0_from_minus1(question_in_progress_state): |
|
if question_in_progress_state == 1: |
|
|
|
return gr.update(visible=False), gr.update(visible=False), gr.update(visible=True), gr.update(visible=False) |
|
elif question_in_progress_state == 2: |
|
|
|
return gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=True) |
|
else: |
|
|
|
return gr.update(visible=False), gr.update(visible=True), gr.update(visible=False), gr.update(visible=False) |
|
|
|
|
|
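# Validate the sign-up form; if valid, load the first question and switch from page0 to page1.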
def go_to_eval_progress_modal(name, email, specialty_dd, subspecialty_dd, years_exp_radio, exp_explanation_tb, npi_id, our_methods=our_methods): |
|
|
|
if not name or not email or not specialty_dd or not years_exp_radio: |
|
gr.Info("Please fill out all the required fields (name, email, specialty, years of experience). If you are not a licensed physician with a specific specialty, please choose the specialty that most closely aligns with your biomedical expertise.", duration=5) |
|
return gr.update(visible=True), gr.update(visible=False), None, "Please fill out all the required fields (name, email, specialty, years of experience). If you are not a licensed physician with a specific specialty, please choose the specialty that most closely aligns with your biomedical expertise.", gr.Chatbot(), gr.Chatbot(), gr.Chatbot(), gr.Chatbot(), gr.HTML(), gr.State() |
|
|
|
gr.Info("Loading the data...", duration=3) |
|
user_info, chat_a_answer, chat_b_answer, chat_a_reasoning, chat_b_reasoning, page1_prompt, page1_reference_answer, question_for_eval, remaining_count = get_next_eval_question( |
|
name, email, specialty_dd, subspecialty_dd, years_exp_radio, exp_explanation_tb, npi_id, our_methods |
|
) |
|
if remaining_count == 0: |
|
gr.Info("Based on your submitted data, you have no more questions to evaluate. You may exit the page; we will follow-up if we require anything else from you. Thank you!", duration=5) |
|
return gr.update(visible=True), gr.update(visible=False), None, "Based on your submitted data, you have no more questions to evaluate. You may exit the page; we will follow-up if we require anything else from you. Thank you!", gr.Chatbot(), gr.Chatbot(), gr.Chatbot(), gr.Chatbot(), gr.HTML(), gr.State() |
|
gr.Info(f"You are about to evaluate the next question.", duration=3) |
|
return gr.update(visible=False), gr.update(visible=True), user_info, "", chat_a_answer, chat_b_answer, chat_a_reasoning, chat_b_reasoning, page1_prompt, question_for_eval |
|
|
|
|
|
|
|
|
|
def go_to_page1(show_page_1): |
|
""" |
|
Shows page 1 if user requests it, otherwise shows page 0 |
|
""" |
|
|
|
|
|
if show_page_1: |
|
updates = [ |
|
gr.update(visible=False), |
|
gr.update(visible=False), |
|
gr.update(visible=True), |
|
] |
|
else: |
|
updates = [ |
|
gr.update(visible=False), |
|
gr.update(visible=True), |
|
gr.update(visible=False), |
|
] |
|
return updates |
|
|
|
|
|
|
|
def skip_question_and_load_new(user_info_state, our_methods): |
|
|
|
if user_info_state is None: |
|
|
|
return gr.update(visible=False), gr.update(visible=False), None, "", gr.Chatbot(), gr.Chatbot(), gr.Chatbot(), gr.Chatbot(), gr.HTML(), gr.Markdown(), gr.State() |
|
|
|
name, email, specialty_dd, subspecialty_dd, years_exp_radio, exp_explanation_tb, npi_id, _ = user_info_state |
|
user_info, chat_a_answer, chat_b_answer, chat_a_reasoning, chat_b_reasoning, page1_prompt, page1_reference_answer, question_for_eval, remaining_count = get_next_eval_question( |
|
name, email, specialty_dd, subspecialty_dd, years_exp_radio, exp_explanation_tb, npi_id, our_methods |
|
) |
|
if remaining_count == 0: |
|
|
|
        return gr.update(visible=False), gr.update(visible=False), None, "Based on your submitted data, you have no more questions to evaluate. You may exit the page; we will follow up if we require anything else from you. Thank you!", gr.Chatbot(), gr.Chatbot(), gr.Chatbot(), gr.Chatbot(), gr.HTML(), gr.Markdown(), gr.State()
|
return gr.update(visible=False), gr.update(visible=True), user_info, "", chat_a_answer, chat_b_answer, chat_a_reasoning, chat_b_reasoning, page1_prompt, page1_reference_answer, question_for_eval |
|
|
|
|
|
|
|
|
|
def skip_current_question(user_info_state, our_methods: list = our_methods): |
|
|
|
gr.Info("Skipping this question and loading the next one…", duration=5) |
|
if user_info_state is None: |
|
        # One value per output wired to the skip/flag handlers: user_info_state, the error box,
        # the four chatbots, the prompt HTML, and data_subset_state.
        return (
            None,
            gr.update(
                value="Please start the evaluation before skipping questions."),
            gr.update(value=[]),
            gr.update(value=[]),
            gr.update(value=[]),
            gr.update(value=[]),
            gr.update(value=""),
            gr.State()
        )
|
|
|
|
|
name, email, specialty_dd, subspecialty_dd, yrs_exp, exp_desc, npi_id, _ = user_info_state |
|
|
|
|
|
( |
|
user_info_new, |
|
_chat_a_answer, |
|
_chat_b_answer, |
|
_chat_a_reasoning, |
|
_chat_b_reasoning, |
|
_prompt_comp, |
|
_ref_comp, |
|
question_for_eval, |
|
remaining, |
|
) = get_next_eval_question( |
|
name, email, specialty_dd, subspecialty_dd, yrs_exp, exp_desc, npi_id, our_methods |
|
) |
|
|
|
|
|
if remaining == 0 or question_for_eval is None: |
|
        final_msg = (
            "Based on your submitted data, you have no more questions to evaluate. "
            "You may exit the page; we will follow up if we require anything else from you. "
            "Thank you!"
        )
|
return ( |
|
user_info_state, |
|
gr.update(value=final_msg), |
|
gr.update(value=[]), |
|
gr.update(value=[]), |
|
gr.update(value=[]), |
|
gr.update(value=[]), |
|
gr.update(value=""), |
|
gr.State() |
|
) |
|
|
|
|
|
chat_a_answer, chat_a_reasoning, _ = format_chat( |
|
question_for_eval['models'][0]['reasoning_trace'], tool_database_labels) |
|
chat_b_answer, chat_b_reasoning, _ = format_chat( |
|
question_for_eval['models'][1]['reasoning_trace'], tool_database_labels) |
|
|
|
prompt_html = ( |
|
f"<div style='background-color: #FFEFD5; border: 2px solid #FF8C00; padding: 10px; " |
|
f"border-radius: 5px; color: black;'><strong style='color: black;'>Question:</strong> " |
|
f"{question_for_eval['question']}</div>" |
|
) |
|
reference_md = question_for_eval.get("correct_answer", "") |
|
gr.Info("New question loaded…", duration=3) |
|
|
|
|
|
return ( |
|
user_info_new, |
|
gr.update(value=""), |
|
gr.update(value=chat_a_answer), |
|
gr.update(value=chat_b_answer), |
|
gr.update(value=chat_a_reasoning), |
|
gr.update(value=chat_b_reasoning), |
|
gr.update(value=prompt_html), |
|
question_for_eval |
|
) |
|
|
|
|
|
|
|
|
|
def flag_nonsense_and_skip(user_info_state, skip_comments=""): |
|
""" |
|
When the evaluator clicks the “Wrong Question?” button, immediately |
|
record that this question was flagged as nonsensical/irrelevant and |
|
then load the next question (re‑using the existing skip logic). |
|
""" |
|
|
|
|
|
if user_info_state is not None: |
|
name, email, specialty_dd, subspecialty_dd, yrs_exp, exp_desc, npi_id, q_id = user_info_state |
|
timestamp = datetime.datetime.now().isoformat() |
|
row = { |
|
"Timestamp": timestamp, |
|
"Name": name, |
|
"Email": email, |
|
"Question ID": q_id, |
|
"Question Makes No Sense or Biomedically Irrelevant": True, |
|
"Skip Comments": skip_comments, |
|
} |
|
append_to_sheet( |
|
user_data=None, |
|
custom_row_dict=row, |
|
custom_sheet_name=str(TXAGENT_RESULTS_SHEET_BASE_NAME), |
|
add_header_when_create_sheet=True, |
|
) |
|
|
|
|
|
return skip_current_question(user_info_state) |
|
|
|
|
|
|
|
|
|
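# Factory for the per-criterion consistency callback: the 1-5 scores offered for Models A and B are
# restricted so they never contradict the pairwise choice (e.g. with "Model A is better." selected,
# Model A's score cannot be lower than Model B's).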
def make_restrict_function(base_choices): |
|
def restrict_choices_page1(radio_choice, score_a, score_b): |
|
""" |
|
Returns (update_for_A, update_for_B). |
|
Enforces rating constraints based on the radio choice for page 1. |
|
""" |
|
|
|
def to_int(x): |
|
try: |
|
|
|
return int(x.split()[0]) |
|
except (ValueError, TypeError, AttributeError): |
|
return None |
|
|
|
|
|
upd_A = gr.update(choices=base_choices, |
|
value=score_a if score_a in base_choices else None) |
|
upd_B = gr.update(choices=base_choices, |
|
value=score_b if score_b in base_choices else None) |
|
|
|
|
|
if radio_choice is None or radio_choice == "Neither model did well.": |
|
return upd_A, upd_B |
|
|
|
a_int = to_int(score_a) |
|
b_int = to_int(score_b) |
|
|
|
|
|
if radio_choice == "Model A is better.": |
|
|
|
if a_int is not None and b_int is not None: |
|
|
|
if a_int < b_int: |
|
|
|
upd_A = gr.update(choices=base_choices, value=None) |
|
upd_B = gr.update(choices=base_choices, value=None) |
|
else: |
|
|
|
allowed_a_choices = [choice for choice in base_choices if to_int( |
|
choice) is None or to_int(choice) >= b_int] |
|
allowed_b_choices = [choice for choice in base_choices if to_int( |
|
choice) is None or to_int(choice) <= a_int] |
|
upd_A = gr.update( |
|
choices=allowed_a_choices, value=score_a if score_a in allowed_a_choices else None) |
|
upd_B = gr.update( |
|
choices=allowed_b_choices, value=score_b if score_b in allowed_b_choices else None) |
|
elif a_int is not None: |
|
|
|
allowed_b_choices = [choice for choice in base_choices if to_int( |
|
choice) is None or to_int(choice) <= a_int] |
|
upd_B = gr.update( |
|
choices=allowed_b_choices, value=score_b if score_b in allowed_b_choices else None) |
|
elif b_int is not None: |
|
|
|
allowed_a_choices = [choice for choice in base_choices if to_int( |
|
choice) is None or to_int(choice) >= b_int] |
|
upd_A = gr.update( |
|
choices=allowed_a_choices, value=score_a if score_a in allowed_a_choices else None) |
|
|
|
|
|
elif radio_choice == "Model B is better.": |
|
|
|
if a_int is not None and b_int is not None: |
|
|
|
if b_int < a_int: |
|
|
|
upd_A = gr.update(choices=base_choices, value=None) |
|
upd_B = gr.update(choices=base_choices, value=None) |
|
else: |
|
|
|
allowed_a_choices = [choice for choice in base_choices if to_int( |
|
choice) is None or to_int(choice) <= b_int] |
|
allowed_b_choices = [choice for choice in base_choices if to_int( |
|
choice) is None or to_int(choice) >= a_int] |
|
upd_A = gr.update( |
|
choices=allowed_a_choices, value=score_a if score_a in allowed_a_choices else None) |
|
upd_B = gr.update( |
|
choices=allowed_b_choices, value=score_b if score_b in allowed_b_choices else None) |
|
elif a_int is not None: |
|
|
|
allowed_b_choices = [choice for choice in base_choices if to_int( |
|
choice) is None or to_int(choice) >= a_int] |
|
upd_B = gr.update( |
|
choices=allowed_b_choices, value=score_b if score_b in allowed_b_choices else None) |
|
elif b_int is not None: |
|
|
|
allowed_a_choices = [choice for choice in base_choices if to_int( |
|
choice) is None or to_int(choice) <= b_int] |
|
upd_A = gr.update( |
|
choices=allowed_a_choices, value=score_a if score_a in allowed_a_choices else None) |
|
|
|
elif radio_choice == "Both models are equally good.": |
|
|
|
if a_int is not None and b_int is not None: |
|
|
|
if a_int == b_int: |
|
|
|
upd_A = gr.update(choices=[score_a], value=score_a) |
|
upd_B = gr.update(choices=[score_b], value=score_b) |
|
else: |
|
|
|
upd_A = gr.update(choices=base_choices, value=None) |
|
upd_B = gr.update(choices=base_choices, value=None) |
|
elif a_int is not None: |
|
|
|
upd_B = gr.update(choices=[score_a], value=score_a) |
|
elif b_int is not None: |
|
|
|
upd_A = gr.update(choices=[score_b], value=score_b) |
|
elif score_a == "Unable to Judge." and score_b == "Unable to Judge.": |
|
|
|
upd_A = gr.update( |
|
choices=["Unable to Judge."], value="Unable to Judge.") |
|
upd_B = gr.update( |
|
choices=["Unable to Judge."], value="Unable to Judge.") |
|
elif score_a == "Unable to Judge.": |
|
|
|
upd_B = gr.update( |
|
choices=["Unable to Judge."], value="Unable to Judge.") |
|
elif score_b == "Unable to Judge.": |
|
|
|
upd_A = gr.update( |
|
choices=["Unable to Judge."], value="Unable to Judge.") |
|
|
|
|
|
return upd_A, upd_B |
|
return restrict_choices_page1 |
|
|
|
|
|
|
|
|
|
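# Assemble one results-sheet row from the evaluator's profile, the pairwise choices and comments,
# and the per-criterion scores for Models A and B.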
def build_row_dict(data_subset_state, user_info, pairwise, comparisons_reasons, nonsense_btn_clicked, *args): |
|
num_criteria = len(criteria) |
|
ratings_A_vals = list(args[:num_criteria]) |
|
ratings_B_vals = list(args[num_criteria:]) |
|
|
|
prompt_text = data_subset_state['question'] |
|
response_A_model = data_subset_state['models'][0]['model'] |
|
response_B_model = data_subset_state['models'][1]['model'] |
|
|
|
timestamp = datetime.datetime.now().isoformat() |
|
row = { |
|
"Timestamp": timestamp, |
|
"Name": user_info[0], |
|
"Email": user_info[1], |
|
"Specialty": str(user_info[2]), |
|
"Subspecialty": str(user_info[3]), |
|
"Years of Experience": user_info[4], |
|
"Experience Explanation": user_info[5], |
|
"NPI ID": user_info[6], |
|
"Question ID": user_info[7], |
|
"Prompt": prompt_text, |
|
"ResponseA_Model": response_A_model, |
|
"ResponseB_Model": response_B_model, |
|
"Question Makes No Sense or Biomedically Irrelevant": nonsense_btn_clicked, |
|
} |
|
|
|
pairwise = [mapping.get(val, val) for val in pairwise] |
|
for i, crit in enumerate(criteria): |
|
label = crit['label'] |
|
row[f"Criterion_{label} Comparison: Which is Better?"] = pairwise[i] |
|
row[f"Criterion_{label} Comments"] = comparisons_reasons[i] |
|
row[f"ScoreA_{label}"] = ratings_A_vals[i] |
|
row[f"ScoreB_{label}"] = ratings_B_vals[i] |
|
|
|
return row |
|
|
|
|
|
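# Write the completed evaluation to the results sheet, then either load the next question or show
# the final page when none remain.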
def final_submit(data_subset_state, user_info, pairwise, comparisons_reasons, nonsense_btn_clicked, *args): |
|
|
|
row_dict = build_row_dict(data_subset_state, user_info, |
|
pairwise, comparisons_reasons, nonsense_btn_clicked, *args) |
|
append_to_sheet(user_data=None, custom_row_dict=row_dict, custom_sheet_name=str( |
|
TXAGENT_RESULTS_SHEET_BASE_NAME), add_header_when_create_sheet=True) |
|
|
|
|
|
name, email, specialty, subspecialty, years_exp_radio, exp_explanation_tb, npi_id, _ = user_info |
|
user_info_new, chat_a_answer, chat_b_answer, chat_a_reasoning, chat_b_reasoning, page1_prompt, page1_reference_answer, question_for_eval, remaining_count = get_next_eval_question( |
|
name, email, specialty, subspecialty, years_exp_radio, exp_explanation_tb, npi_id, our_methods |
|
) |
|
|
|
if remaining_count == 0: |
|
return ( |
|
"", |
|
gr.update(visible=False), |
|
gr.update(visible=True), |
|
"", |
|
None, |
|
None, |
|
None, |
|
None, |
|
None, |
|
None, |
|
user_info_new, |
|
) |
|
return ( |
|
"", |
|
gr.update(visible=True), |
|
gr.update(visible=False), |
|
"", |
|
chat_a_answer, |
|
chat_b_answer, |
|
chat_a_reasoning, |
|
chat_b_reasoning, |
|
page1_prompt, |
|
question_for_eval, |
|
user_info_new |
|
) |
|
|
|
|
|
|
|
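# Ensure every pairwise choice and every per-model score is filled in; if so, submit via
# final_submit() and reset the form for the next question, otherwise surface the missing items.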
def validate_and_submit_page1(data_subset_state, user_info, *combined_values): |
|
|
|
criteria_count = len(criteria_for_comparison) |
|
pairwise_list = list(combined_values[:criteria_count]) |
|
comparison_reasons_list = list( |
|
combined_values[criteria_count:criteria_count*2]) |
|
ratings_A_list = list( |
|
combined_values[criteria_count*2:criteria_count*3]) |
|
ratings_B_list = list(combined_values[criteria_count*3:]) |
|
|
|
|
|
if any(answer is None for answer in pairwise_list): |
|
missing_comparisons = [] |
|
for i, answer in enumerate(pairwise_list): |
|
if answer is None: |
|
missing_comparisons.append(criteria_for_comparison[i]['label']) |
|
|
|
missing_text = ", ".join(missing_comparisons) |
|
error_msg = f"Your response is missing for: {missing_text}" |
|
gr.Info(error_msg) |
|
return ( |
|
gr.update(value=f"Error: {error_msg}"), |
|
gr.update(visible=True), |
|
gr.update(visible=False), |
|
gr.update(), |
|
gr.update(), |
|
gr.update(), |
|
gr.update(), |
|
gr.update(), |
|
gr.update(), |
|
gr.update(), |
|
gr.update(), |
|
|
|
*combined_values |
|
) |
|
|
|
|
|
if any(r is None for r in ratings_A_list) or any(r is None for r in ratings_B_list): |
|
missing_ratings = [] |
|
for i in range(len(criteria)): |
|
missing_parts = [] |
|
if ratings_A_list[i] is None: |
|
missing_parts.append("Model A Response") |
|
if ratings_B_list[i] is None: |
|
missing_parts.append("Model B Response") |
|
if missing_parts: |
|
missing_ratings.append( |
|
f"{criteria[i]['label']} ({', '.join(missing_parts)})") |
|
|
|
missing_text = "; ".join(missing_ratings) |
|
error_msg = f"Please provide ratings for: {missing_text}" |
|
gr.Info(error_msg) |
|
return ( |
|
gr.update(value=f"Error: {error_msg}"), |
|
gr.update(visible=True), |
|
gr.update(visible=False), |
|
gr.update(), |
|
gr.update(), |
|
gr.update(), |
|
gr.update(), |
|
gr.update(), |
|
gr.update(), |
|
gr.update(), |
|
gr.update(), |
|
|
|
*combined_values |
|
) |
|
gr.Info("Submitting your evaluation and loading the next question...") |
|
|
|
submit_result = final_submit(data_subset_state, user_info, pairwise_list, |
|
comparison_reasons_list, False, *ratings_A_list, *ratings_B_list) |
|
|
|
|
|
|
|
page1_update = submit_result[1] |
|
page1_visible = page1_update.get('visible', False) if isinstance( |
|
page1_update, dict) else False |
|
gr.Info(f"Your evaluation has been submitted. You are about to evaluate the next question...") |
|
|
|
if page1_visible: |
|
|
|
reset_values = [] |
|
for _ in range(len(combined_values)): |
|
reset_values.append(None) |
|
return submit_result + tuple(reset_values) |
|
else: |
|
|
|
return submit_result + tuple(combined_values) |
|
|
|
|
|
centered_col_css = """ |
|
#centered-column { |
|
margin-left: auto; |
|
margin-right: auto; |
|
max-width: 800px; /* Adjust this width as desired */ |
|
width: 100%; |
|
} |
|
#participate-btn { |
|
background-color: purple !important; |
|
color: white !important; |
|
border-color: purple !important; |
|
} |
|
#answer-reference-btn { |
|
/* Light‑mode palette */ |
|
--btn-bg: #E0F2FF; /* soft pastel blue */ |
|
--btn-text: #00334D; /* dark slate for good contrast */ |
|
--btn-border: #E0F2FF; |
|
|
|
background-color: var(--btn-bg) !important; |
|
color: var(--btn-text) !important; |
|
border: 1px solid var(--btn-border) !important; |
|
} |
|
|
|
/* Dark‑mode overrides */ |
|
@media (prefers-color-scheme: dark) { |
|
#answer-reference-btn { |
|
--btn-bg: #2C6E98; /* muted steel blue for dark backgrounds */ |
|
--btn-text: #FFFFFF; /* switch to white text for contrast */ |
|
--btn-border: #2C6E98; |
|
} |
|
} |
|
|
|
#clear_btn { |
|
background-color: #F08080 !important; |
|
color: white !important; |
|
border-color: #F08080 !important; |
|
} |
|
.reference-box { |
|
border: 1px solid #ccc; |
|
padding: 10px; |
|
border-radius: 5px; |
|
} |
|
.short-btn { min-width: 80px !important; max-width: 120px !important; width: 100px !important; padding-left: 4px !important; padding-right: 4px !important; } |
|
.light-stop-btn { background-color: #ffcccc !important; color: #b30000 !important; border-color: #ffcccc !important; } |
|
|
|
/* --- Added for larger criteria font --- */ |
|
.criteria-font-large { |
|
font-size: 1.2em !important; |
|
} |
|
/* Radio component labels (the title above the choices) */ |
|
.criteria-radio-label label[data-testid="block-label"] { |
|
font-weight: bold !important; |
|
font-size: 1.1em !important; |
|
} |
|
/* Textbox labels */ |
|
.textbox-bold-label label[data-testid="block-label"] { |
|
font-weight: bold !important; |
|
} |
|
#participate-btn button { |
|
font-size: 24px !important; /* Large readable text */ |
|
font-weight: 700 !important; /* Bold for emphasis */ |
|
padding: 28px 40px !important; /* Extra padding for height */ |
|
min-height: 120px !important; /* Make button visibly taller (multi‑line) */ |
|
width: 100% !important; /* Occupy full width of its column */ |
|
white-space: normal !important; /* Allow text to wrap onto multiple lines */ |
|
} |
|
.criteria-radio-score-label [role="radiogroup"], |
|
.criteria-radio-score-label .gr-radio-group, |
|
.criteria-radio-score-label .flex { |
|
display: flex !important; |
|
flex-direction: column !important; |
|
    gap: 4px !important; /* row spacing; adjust as needed */
|
} |
|
|
|
/* More specific selectors to make sure the layout is vertical */
|
.criteria-radio-score-label fieldset { |
|
display: flex !important; |
|
flex-direction: column !important; |
|
gap: 4px !important; |
|
} |
|
|
|
.criteria-radio-score-label .wrap { |
|
display: flex !important; |
|
flex-direction: column !important; |
|
gap: 4px !important; |
|
} |
|
|
|
/* Make sure each radio option is stacked vertically */
|
.criteria-radio-score-label label { |
|
display: block !important; |
|
margin-bottom: 4px !important; |
|
} |
|
""" |
|
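# Gradio UI. Page flow: landing page (page_minus1) -> sign-up form (page0) -> A/B evaluation (page1)
# -> final page, plus an error modal.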
with gr.Blocks(css=centered_col_css) as demo: |
|
|
|
user_info_state = gr.State() |
|
pairwise_state = gr.State() |
|
scores_A_state = gr.State() |
|
comparison_reasons = gr.State() |
|
nonsense_btn_clicked = gr.State(False) |
|
unqualified_A_state = gr.State() |
|
data_subset_state = gr.State() |
|
question_in_progress = gr.State(0) |
|
|
|
|
|
specialties_path = "specialties.json" |
|
subspecialties_path = "subspecialties.json" |
|
|
|
try: |
|
with open(specialties_path, 'r') as f: |
|
specialties_list = json.load(f) |
|
with open(subspecialties_path, 'r') as f: |
|
subspecialties_list = json.load(f) |
|
except FileNotFoundError: |
|
print( |
|
f"Error: Could not find specialty files at {specialties_path} or {subspecialties_path}. Please ensure these files exist.") |
|
|
|
specialties_list = ["Error loading specialties"] |
|
subspecialties_list = ["Error loading subspecialties"] |
|
except json.JSONDecodeError: |
|
print(f"Error: Could not parse JSON from specialty files.") |
|
specialties_list = ["Error parsing specialties"] |
|
subspecialties_list = ["Error parsing subspecialties"] |
|
|
|
|
|
with gr.Column(visible=True, elem_id="page-1") as page_minus1: |
|
gr.HTML(""" |
|
<div> |
|
<h1>TxAgent Portal: AI Evaluation and Crowdsourcing of Therapeutic Questions</h1> |
|
</div> |
|
""") |
|
|
|
|
|
with gr.Column(scale=1): |
|
participate_eval_btn = gr.Button( |
|
value="Evaluate TxAgent", |
|
variant="primary", |
|
size="lg", |
|
elem_id="participate-btn" |
|
) |
|
with gr.Column(scale=1): |
|
gr.Markdown( |
|
""" |
|
When you join Evaluate TxAgent, you will: |
|
- See model responses to diverse prompts. |
|
- Provide instant thumbs-up or thumbs-down ratings. |
|
- Influence the roadmap for future releases. |
|
|
|
Thank you for helping improve TxAgent! |
|
""" |
|
) |
|
with gr.Column(scale=1): |
|
submit_questions_btn = gr.Button( |
|
value="Submit Your Therapeutic Questions", |
|
variant="primary", |
|
size="lg", |
|
elem_id="submit-btn" |
|
) |
|
|
|
|
|
|
|
with gr.Column(scale=1): |
|
gr.Markdown( |
|
""" |
|
By submitting therapeutic questions, you will: |
|
- Help identify edge cases and blind spots for AI models. |
|
- Help extend AI models to reason in new domains. |
|
- Directly shape future model improvements. |
|
|
|
We look forward to seeing your feedback! |
|
""" |
|
) |
|
|
|
|
|
contact_info_markdown = """ |
|
## Contact |
|
|
|
For questions or suggestions, email [Shanghua Gao](mailto:[email protected]) and [Marinka Zitnik](mailto:[email protected]). |
|
""" |
|
|
|
gr.Markdown(contact_info_markdown) |
|
|
|
gr.HTML(TxAgent_Project_Page_HTML) |
|
|
|
|
|
|
|
|
|
google_form_url = "https://forms.gle/pYvyvEQQwS5gdupQA" |
|
submit_questions_btn.click( |
|
fn=None, |
|
inputs=None, |
|
outputs=None, |
|
js=f"() => {{ window.open('{google_form_url}', '_blank'); }}" |
|
) |
|
|
|
|
|
with gr.Column(visible=False, elem_id="page0") as page0: |
|
|
|
gr.Markdown("## Sign Up") |
|
name = gr.Textbox(label="Name (required)") |
|
email = gr.Textbox( |
|
label="Email (required). Use the same email each time you log into this evaluation portal to avoid receiving repeat questions.") |
|
specialty_dd = gr.Dropdown( |
|
choices=specialties_list, label="Primary Medical Specialty (required). Visit https://www.abms.org/member-boards/specialty-subspecialty-certificates/ for categories.", multiselect=True) |
|
subspecialty_dd = gr.Dropdown( |
|
choices=subspecialties_list, label="Subspecialty (if applicable). Visit https://www.abms.org/member-boards/specialty-subspecialty-certificates/ for categories.", multiselect=True) |
|
npi_id = gr.Textbox( |
|
label="National Provider Identifier ID (optional). Visit https://npiregistry.cms.hhs.gov/search to find your NPI ID. Leave blank if you do not have an NPI ID.") |
|
years_exp_radio = gr.Radio( |
|
choices=["0-2 years", "3-5 years", "6-10 years", |
|
"11-20 years", "20+ years", "Not Applicable"], |
|
label="Years of experience in clinical and/or research activities related to your biomedical expertise (required)." |
|
) |
|
exp_explanation_tb = gr.Textbox( |
|
label="Briefly describe your expertise in AI (optional).") |
|
|
|
page0_error_box = gr.Markdown("") |
|
with gr.Row(): |
|
next_btn_0 = gr.Button("Next") |
|
gr.Markdown("""Click Next to start the study. Your progress will be saved after you submit each question. For questions or concerns, contact us directly. Thank you for participating! |
|
""") |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
with gr.Column(visible=False) as page1: |
|
with gr.Accordion("Instructions", open=False): |
|
gr.Markdown(""" |
|
## Instructions: |
|
Please review these instructions and enter your information to begin: |
|
|
|
- Each session requires at least 5-10 minutes per question. |
|
- You can evaluate multiple questions; you will not repeat evaluations. |
|
- For each question, compare responses from two models and rate them (scale: 1-5). |
|
- If a question is unclear or irrelevant to biomedicine, click the RED BUTTON at the top of the comparison page. |
|
- Use the Back and Next buttons to edit responses before submission. |
|
- Use the Home Page button to return to the homepage; progress will save but not submit. |
|
- Submit answers to the current question before moving to the next. |
|
- You can pause between questions and return later; ensure current answers are submitted to save them. |
|
""") |
|
|
|
|
|
|
|
page1_prompt = gr.HTML() |
|
with gr.Row(): |
|
nonsense_btn = gr.Button( |
|
"Skip Question", |
|
size="sm", |
|
variant="stop", |
|
elem_id="invalid-question-btn", |
|
elem_classes=["short-btn"], |
|
scale=1 |
|
) |
|
skip_comments = gr.Textbox( |
|
placeholder="(Optional) Why do you want to skip this question...", |
|
show_label=False, |
|
scale=3, |
|
container=False, |
|
) |
|
|
|
page1_error_box = gr.Markdown("") |
|
|
|
|
|
with gr.Row(): |
|
|
|
with gr.Column(): |
|
gr.Markdown("**Model A Response:**") |
|
chat_a_answer = gr.Chatbot( |
|
value=[], |
|
type="messages", |
|
height=200, |
|
label="Model A Answer", |
|
show_copy_button=False, |
|
show_label=True, |
|
render_markdown=True, |
|
avatar_images=None, |
|
rtl=False |
|
) |
|
|
|
chat_a_reasoning = gr.Chatbot( |
|
value=[], |
|
type="messages", |
|
height=300, |
|
label="Model A Reasoning - Rationale", |
|
show_copy_button=False, |
|
show_label=True, |
|
render_markdown=True, |
|
avatar_images=None, |
|
rtl=False |
|
) |
|
|
|
with gr.Column(): |
|
gr.Markdown("**Model B Response:**") |
|
chat_b_answer = gr.Chatbot( |
|
value=[], |
|
type="messages", |
|
height=200, |
|
label="Model B Answer", |
|
show_copy_button=False, |
|
show_label=True, |
|
render_markdown=True, |
|
avatar_images=None, |
|
rtl=False |
|
) |
|
|
|
chat_b_reasoning = gr.Chatbot( |
|
value=[], |
|
type="messages", |
|
height=300, |
|
label="Model B Reasoning - Rationale", |
|
show_copy_button=False, |
|
show_label=True, |
|
render_markdown=True, |
|
avatar_images=None, |
|
rtl=False |
|
) |
|
|
|
|
|
comparison_reasons_inputs = [] |
|
pairwise_inputs = [] |
|
ratings_A_page1 = [] |
|
ratings_B_page1 = [] |
|
|
|
for i, crit_comp in enumerate(criteria_for_comparison): |
|
|
|
crit_score = criteria[i] |
|
|
|
restrict_fn = make_restrict_function(sorted(crit_score["scores"])) |
|
|
|
|
|
gr.Markdown(f"**{crit_comp['label']}**", |
|
elem_classes="criteria-font-large") |
|
radio = gr.Radio( |
|
choices=[ |
|
"Model A is better.", |
|
"Model B is better.", |
|
"Both models are equally good.", |
|
"Neither model did well." |
|
], |
|
|
|
label=crit_comp['text'], |
|
elem_classes="criteria-radio-label" |
|
) |
|
pairwise_inputs.append(radio) |
|
|
|
|
|
|
|
index_component = gr.Number( |
|
value=i, visible=False, interactive=False) |
|
|
|
|
|
with gr.Row(): |
|
with gr.Column(scale=1): |
|
rating_a = gr.Radio(choices=sorted(crit_score["scores"]), |
|
label=f"Model A Response - {crit_score['text']}", |
|
interactive=True, |
|
elem_classes="criteria-radio-score-label") |
|
with gr.Column(scale=1): |
|
rating_b = gr.Radio(choices=sorted(crit_score["scores"]), |
|
label=f"Model B Response - {crit_score['text']}", |
|
interactive=True, |
|
elem_classes="criteria-radio-score-label") |
|
|
|
|
|
with gr.Row(): |
|
|
|
radio.change( |
|
fn=restrict_fn, |
|
inputs=[radio, rating_a, rating_b], |
|
outputs=[rating_a, rating_b] |
|
) |
|
rating_a.change( |
|
fn=restrict_fn, |
|
inputs=[radio, rating_a, rating_b], |
|
outputs=[rating_a, rating_b] |
|
) |
|
rating_b.change( |
|
fn=restrict_fn, |
|
inputs=[radio, rating_a, rating_b], |
|
outputs=[rating_a, rating_b] |
|
) |
|
|
|
ratings_A_page1.append(rating_a) |
|
ratings_B_page1.append(rating_b) |
|
|
|
text_input = gr.Textbox( |
|
|
|
placeholder="Comments for your selection (optional)", |
|
show_label=False, |
|
|
|
) |
|
comparison_reasons_inputs.append(text_input) |
|
|
|
with gr.Row(): |
|
submit_btn_1 = gr.Button( |
|
"Submit Evaluation", variant="primary", elem_id="submit_btn") |
|
|
|
|
|
with gr.Column(visible=False, elem_id="final_page") as final_page: |
|
gr.Markdown( |
|
"## You have no questions left to evaluate. Thank you for your participation!") |
|
|
|
|
|
with Modal("Error", visible=False, elem_id="error_modal") as error_modal: |
|
error_message_box = gr.Markdown() |
|
ok_btn = gr.Button("OK") |
|
|
|
ok_btn.click(lambda: gr.update(visible=False), None, error_modal) |
|
|
|
|
|
|
|
|
|
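# Event wiring: landing-page navigation, sign-up validation, question skipping/flagging, and
# evaluation submission.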
participate_eval_btn.click( |
|
fn=go_to_page0_from_minus1, |
|
inputs=[question_in_progress], |
|
|
|
outputs=[page_minus1, page0, page1, final_page] |
|
) |
|
|
|
|
|
next_btn_0.click( |
|
fn=go_to_eval_progress_modal, |
|
inputs=[name, email, specialty_dd, subspecialty_dd, |
|
years_exp_radio, exp_explanation_tb, npi_id], |
|
outputs=[page0, page1, user_info_state, page0_error_box, chat_a_answer, |
|
chat_b_answer, chat_a_reasoning, chat_b_reasoning, page1_prompt, data_subset_state], |
|
scroll_to_output=True |
|
) |
|
|
|
nonsense_btn.click( |
|
fn=flag_nonsense_and_skip, |
|
inputs=[user_info_state, skip_comments], |
|
outputs=[user_info_state, page1_error_box, chat_a_answer, chat_b_answer, chat_a_reasoning, chat_b_reasoning, |
|
page1_prompt, data_subset_state], |
|
scroll_to_output=True |
|
) |
|
|
|
|
|
submit_btn_1.click( |
|
fn=validate_and_submit_page1, |
|
inputs=[data_subset_state, user_info_state, *pairwise_inputs, |
|
*comparison_reasons_inputs, *ratings_A_page1, *ratings_B_page1], |
|
outputs=[page1_error_box, page1, final_page, page0_error_box, chat_a_answer, chat_b_answer, chat_a_reasoning, chat_b_reasoning, |
|
page1_prompt, data_subset_state, user_info_state, *pairwise_inputs, *comparison_reasons_inputs, *ratings_A_page1, *ratings_B_page1], |
|
scroll_to_output=True |
|
) |
|
|
|
|
|
demo.launch(share=True, allowed_paths=["."]) |
|
|