Spaces:

NCSOFT
/

VARCO_Arena

Running

File size: 14,577 Bytes

import pandas as pd
import streamlit as st

import analysis_utils as au
from analysis_utils import number_breakdown_from_df
from app import load_and_cache_data

# from app import VA_ROOT
from query_comp import QueryWrapper, get_base_url
from varco_arena.varco_arena_core.prompts import load_prompt
from view_utils import (
    default_page_setting,
    escape_markdown,
    set_nav_bar,
    show_linebreak_in_md,
)

DEFAULT_LAYOUT_DICT = {
    "title": {"font": {"size": 20, "family": "Gothic A1"}},
    "font": {"size": 16, "family": "Gothic A1"},
    "xaxis": {"tickfont": {"size": 12, "family": "Gothic A1"}},
    "yaxis": {"tickfont": {"size": 12, "family": "Gothic A1"}},
    "legend": {"font": {"size": 12, "family": "Gothic A1"}},
}


def navigate(t, source, key, val):
    # print(key, val)
    if source is None:
        return
    target_index = t.index(source) + val
    if 0 <= target_index < len(t):
        st.session_state[key] = t[target_index]
        st.rerun()


def main():
    sidebar_placeholder = default_page_setting(layout="wide")
    set_nav_bar(
        False,
        sidebar_placeholder=sidebar_placeholder,
        toggle_hashstr="see_results_init",
    )

    # load the data
    # print(f"{st.session_state.get('result_file_path', None)=}")
    most_recent_run = st.session_state.get("result_file_path", None)
    most_recent_run = str(most_recent_run) if most_recent_run is not None else None
    (
        st.session_state["all_result_dict"],
        st.session_state["df_dict"],
    ) = load_and_cache_data(result_file_path=most_recent_run)

    # side bar
    st.sidebar.title("Select Result:")
    result_select = QueryWrapper("expname")(
        st.sidebar.selectbox,
        list(st.session_state["all_result_dict"].keys()),
    )

    if result_select is None:
        if st.session_state.korean:
            st.markdown("결과를 확인하려면 먼저 **🔥VARCO Arena를 구동**하셔야 합니다")
        else:
            st.markdown("You should **🔥Run VARCO Arena** first to see results")
        st.stop()
    eval_prompt_name = result_select.split("/")[-1].strip()
    if st.sidebar.button("Clear Cache"):
        st.cache_data.clear()
        st.cache_resource.clear()
        st.rerun()

    if result_select:
        if "alpha2names" in st.session_state:
            del st.session_state["alpha2names"]

    fig_dict_per_task = st.session_state["all_result_dict"][result_select]
    task_list = list(fig_dict_per_task.keys())
    elo_rating_by_task = fig_dict_per_task["Overall"]["elo_rating_by_task"]
    # tabs = st.tabs(task_list)

    df_dict_per_task = st.session_state["df_dict"][result_select]

    default_layout_dict = DEFAULT_LAYOUT_DICT
    task = QueryWrapper("task", "Select Task")(st.selectbox, task_list)

    if task is None:
        st.stop()
    figure_dict = fig_dict_per_task[task]
    judgename = figure_dict["judgename"]
    df = df_dict_per_task[task]
    interpretation, n_models, size_testset = number_breakdown_from_df(df)
    if st.session_state.korean:
        st.markdown(f"## 결과 ({task})")
        st.markdown(f"##### Judge 모델: {judgename} / 평가프롬: {eval_prompt_name}")
        st.markdown(f"##### 테스트셋 사이즈: {int(size_testset)} 행")
    else:
        st.markdown(f"## Results ({task})")
        st.markdown(f"##### Judge Model: {judgename} / prompt: {eval_prompt_name}")
        st.markdown(f"##### Size of Testset: {int(size_testset)} rows")
    col1, col2 = st.columns(2)
    with col1:
        with st.container(border=True):
            st.markdown(f"#### Ratings ({task})")
            st.table(figure_dict["elo_rating"])
            st.write(show_linebreak_in_md(escape_markdown(interpretation)))

    with col2:
        with st.container(border=True):
            st.plotly_chart(
                elo_rating_by_task.update_layout(**default_layout_dict),
                use_container_width=True,
                key=f"{task}_elo_rating_by_task",
            )

    st.divider()

    if st.session_state.korean:
        st.markdown("### 토너먼트 (테스트 시나리오) 별로 보기")
    else:
        st.markdown("### Tournament Results by Test Scenario")

    # with st.expander("볼 토너먼트 고르기"):
    d = list(df.idx_inst_src.unique())
    default_idx = st.session_state.get("selected_tournament", None)
    cols = st.columns((1, 18, 1))
    with cols[0]:
        if st.button("◀", key="prev_tournament"):
            navigate(d, default_idx, "selected_tournament", -1)
    with cols[1]:
        tournament_prm_select = QueryWrapper("tournament", "Select Tournament")(
            st.selectbox,
            d,
            default_idx,
            key=f"{task}_tournament_select",
            on_change=lambda: st.session_state.update(
                selected_tournament=st.session_state.get(f"{task}_tournament_select"),
                selected_match=None,
            ),
            label_visibility="collapsed",
        )
    with cols[2]:
        if st.button("▶", key="next_tournament"):
            navigate(d, default_idx, "selected_tournament", 1)

    # tournament_prm_select = st.selectbox(
    #     "Select Tournament",
    #     df.idx_inst_src.unique(),
    #     index=d.index(st.session_state.get("selected_tournament")),
    #     key=f"{task}_tournament_{result_select}",
    # )

    # print(tournament_prm_select, type(tournament_prm_select))
    st.session_state["selected_tournament"] = tournament_prm_select
    # tournament_prm_select = st.selectbox(
    #     "Select Tournament",
    #     df.idx_inst_src.unique(),
    #     key=f"{task}_tournament_{result_select}",
    # )
    df_now_processed = None
    if tournament_prm_select:
        df_now = df[df.idx_inst_src == tournament_prm_select]
        df_now_processed, _alpha2names = au.init_tournament_dataframe(
            df_now,
            alpha2names=st.session_state["alpha2names"]
            if "alpha2names" in st.session_state.keys()
            else None,
        )
        if "alpha2names" not in st.session_state:
            st.session_state["alpha2names"] = _alpha2names

        try:
            bracket_drawing = au.draw(
                df_now_processed,
                alpha2names=st.session_state["alpha2names"],
            )
            legend = au.make_legend_str(
                df_now_processed, st.session_state["alpha2names"]
            )
            st.code(bracket_drawing + legend)

            m = list(df_now_processed.human_readable_idx)
            default_idx = st.session_state.get("selected_match", None)
            cols = st.columns((1, 18, 1))
            with cols[0]:
                if st.button("◀", key="prev_match"):
                    navigate(m, default_idx, "selected_match", -1)
            with cols[1]:
                match_idx_human = QueryWrapper("match", "Select Match")(
                    st.selectbox,
                    m,
                    default_idx,
                    key=f"{task}_match_select",
                    label_visibility="collapsed",
                )
            with cols[2]:
                if st.button("▶", key="next_match"):
                    navigate(m, default_idx, "selected_match", 1)

            # match_idx_human = st.selectbox(
            #     "Select Match",
            #     df_now_processed.human_readable_idx,
            #     key=f"{task}_match_{result_select}",
            # )
            # print(match_idx_human)
            st.session_state["selected_match"] = match_idx_human
            # match_idx_human = st.selectbox(
            #     "Select Match",
            #     df_now_processed.human_readable_idx,
            #     key=f"{task}_match_{result_select}",
            # )
            if match_idx_human:
                match_idx = int(match_idx_human.split(": ")[0])
                row = df_now_processed.loc[match_idx]

                st.markdown("#### Current Test Scenario:")

                with st.expander(
                    f"### Evaluation Prompt (evalprompt: {eval_prompt_name}--{task})"
                ):
                    prompt = load_prompt(eval_prompt_name, task=task)
                    kwargs = dict(
                        inst="{inst}",
                        src="{src}",
                        out_a="{out_a}",
                        out_b="{out_b}",
                        task=task,
                    )
                    if eval_prompt_name == "translation_pair":
                        kwargs["source_lang"] = "{source_lang}"
                        kwargs["target_lang"] = "{target_lang}"
                    prompt_cmpl = prompt.complete_prompt(**kwargs)
                    for msg in prompt_cmpl:
                        st.markdown(f"**{msg['role']}**")
                        st.info(show_linebreak_in_md(escape_markdown(msg["content"])))

                st.info(show_linebreak_in_md(tournament_prm_select))

                winner = row.winner
                col1, col2 = st.columns(2)

                winnerbox = st.success
                loserbox = st.error
                with col1:
                    iswinner = winner == "model_a"
                    writemsg = winnerbox if iswinner else loserbox
                    st.markdown(f"#### ({row.model_a}) {row.human_readable_model_a}")
                    writemsg(
                        show_linebreak_in_md(row.generated_a),
                        icon="✅" if iswinner else "❌",
                    )
                with col2:
                    iswinner = winner == "model_b"
                    writemsg = winnerbox if iswinner else loserbox
                    st.markdown(f"#### ({row.model_b}) {row.human_readable_model_b}")
                    writemsg(
                        show_linebreak_in_md(row.generated_b),
                        icon="✅" if iswinner else "❌",
                    )
        except Exception as e:
            import traceback

            traceback.print_exc()
            st.markdown(
                "**Bug: 아래 표를 복사해서 이슈로 남겨주시면 개선에 도움이 됩니다. 감사합니다🙏**"
                if st.session_state.korean
                else "Bug: Please open issue and attach the table output below to help me out. Thanks in advance.🙏"
            )
            st.error(e)
            st.info(tournament_prm_select)
            st.table(
                df_now_processed[
                    [
                        "depth",
                        "round",
                        "winner_nodes",
                        "winner_resolved",
                        "winner",
                        "model_a",
                        "model_b",
                    ]
                ]
            )
    st.write("Sharable link")
    st.code(f"{get_base_url()}/see_results?{QueryWrapper.get_sharable_link()}")
    st.divider()

    if st.session_state.korean:
        st.markdown("### 매치 통계")
    else:
        st.markdown("### Match Stats.")
    col1, col2 = st.columns(2)
    col1, col2 = st.columns(2)
    with col1:
        with st.container(border=True):
            st.plotly_chart(
                figure_dict[
                    "fraction_of_model_a_wins_for_all_a_vs_b_matches"
                ].update_layout(autosize=True, **default_layout_dict),
                use_container_width=True,
                key=f"{task}_fraction_of_model_a_wins_for_all_a_vs_b_matches",
            )
    with col2:
        with st.container(border=True):
            st.plotly_chart(
                figure_dict["match_count_of_each_combination_of_models"].update_layout(
                    autosize=True, **default_layout_dict
                ),
                use_container_width=True,
                key=f"{task}_match_count_of_each_combination_of_models",
            )

    with col1:
        with st.container(border=True):
            st.plotly_chart(
                figure_dict["match_count_for_each_model"].update_layout(
                    **default_layout_dict
                ),
                use_container_width=True,
                key=f"{task}_match_count_for_each_model",
            )
    with col2:
        pass

    if st.session_state.korean:
        st.markdown("### 참고용 LLM Judge 편향 정보")
    else:
        st.markdown("### FYI: How biased is your LLM Judge?")

    with st.expander("펼쳐서 보기" if st.session_state.korean else "Expand to show"):
        st.info(
            """
Varco Arena에서는 position bias의 영향을 최소화하기 위해 모든 모델이 A나 B위치에 번갈아 위치하도록 하였습니다. 그러나 LLM Judge 혹은 Prompt의 성능이 부족하다고 느껴진다면, 아래 알려진 LLM Judge bias가 참고가 될겁니다.
* position bias (왼쪽)
* length bias (오른쪽)

결과의 왜곡이 LLM Judge의 부족함 떄문이었다는 점을 규명하려면 사용하신 LLM Judge와 Prompt의 binary classification 정확도를 측정해보시길 바랍니다 (Varco Arena를 활용하여 이를 수행해볼 수 있습니다!).""".strip()
            if st.session_state.korean
            else """
In Varco Arena, to minimize the effect of position bias, all models are alternately positioned in either position A or B. However, if you feel the LLM Judge or Prompt performance is insufficient, the following known LLM Judge biases may be helpful to reference:
* position bias (left)
* length bias (right)

To determine if result distortion was due to LLM Judge limitations, please measure the binary classification accuracy of your LLM Judge and Prompt (You could use Varco Arena for this purpose!).
""".strip()
        )
        st.markdown(f"#### {judgename} + prompt = {eval_prompt_name}")
        col1, col2 = st.columns(2)
        with col1:
            with st.container(border=True):
                st.plotly_chart(
                    figure_dict["counts_of_match_winners"].update_layout(
                        **default_layout_dict
                    ),
                    use_container_width=True,
                    key=f"{task}_counts_of_match_winners",
                )
        with col2:
            with st.container(border=True):
                st.plotly_chart(
                    figure_dict["length_bias"].update_layout(**default_layout_dict),
                    use_container_width=True,
                    key=f"{task}_length_bias",
                )
                st.table(figure_dict["length_bias_df"].groupby("category").describe().T)


if __name__ == "__main__":
    main()