"""Streamlit front page for VARCO Arena: upload, cost estimation, and run."""
import os
import select
import shlex
import subprocess
import sys
import time
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import *

import streamlit as st

sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))

from varco_arena_core.prompts import load_prompt
from view_utils import (
    default_page_setting,
    escape_markdown,
    set_nav_bar,
    show_linebreak_in_md,
)

# Root directory for user submissions; overridable through the environment.
VA_ROOT = Path(os.environ.get("VARCO_ARENA_RESULT_PATH", "./user_submit"))
# Last path component of VA_ROOT; treated as a reserved name below.
USR_SUB = VA_ROOT.parts[-1]

# Placeholder shown instead of the real API key in logs and in the UI.
_REDACTED_KEY = "********"


def _read_options(path: str) -> List[str]:
    """Return the non-blank lines of the UTF-8 text file at *path*, in order."""
    text = Path(path).read_text(encoding="UTF8")
    return [line.strip() for line in text.splitlines() if line.strip()]


def _sanitize_exp_name(exp_name: str) -> str:
    """Neutralise path-like fragments in a user-supplied experiment name.

    The name becomes a sub-directory of the upload directory, so parent
    references, separators, the reserved submission-root name, spaces and
    `~` are replaced.
    """
    exp_name = exp_name.replace("..", "_")
    exp_name = exp_name.replace(USR_SUB, f"-{USR_SUB}-")
    exp_name = exp_name.replace("/", "-")
    exp_name = exp_name.replace(" ", "_")
    exp_name = exp_name.replace("~", "_")
    return exp_name


def _build_command(
    input_dir: str,
    outdir: Path,
    api_key: Optional[str],
    promptname: Optional[str],
    evaluation_model: str,
    price_estimation: bool,
) -> str:
    """Build the shell command line that launches `../varco_arena/main.py`.

    Every argument is shell-quoted, so user-controlled values (experiment
    name, API key, upload path) cannot inject shell syntax.
    """
    args = [
        "python",
        "../varco_arena/main.py",
        "-i",
        str(input_dir),
        "-o",
        str(outdir),
        "-k",
        str(api_key),
        "-p",
        str(promptname),
        "-e",
        str(evaluation_model),
        "-j",
        "64",
    ]
    if price_estimation:
        args.append("-c")
    command = " ".join(shlex.quote(arg) for arg in args)
    if not price_estimation:
        # Feed an endless stream of "y" to the child's stdin. Prepend once
        # instead of str.replace("python", ...), which would also rewrite any
        # "python" substring inside paths or names.
        command = f"yes | {command}"
    return command


def upload_files(uploaded_files) -> Optional[Path]:
    """Save uploaded files into a fresh timestamped directory under VA_ROOT.

    Args:
        uploaded_files: Iterable of uploaded file objects exposing `.name`
            and `.getbuffer()` (as produced by `st.file_uploader`).

    Returns:
        The resolved directory holding the saved files, or `None` when fewer
        than two files were supplied (a warning/error is shown instead).

    Raises:
        ValueError: If VA_ROOT exists but is not a directory.
    """
    # prep directory for user submission
    user_sub_root = VA_ROOT
    if user_sub_root.exists():
        if not user_sub_root.is_dir():
            raise ValueError(
                f"{user_sub_root} file exists and is not a directory. Consider renaming it."
            )
    else:
        user_sub_root.mkdir(parents=True)

    # Validate before creating anything so a failed upload leaves no empty
    # timestamp directory behind.
    uploaded_files = list(uploaded_files or [])
    if not uploaded_files:
        st.warning("❌ No files to upload. Please drag/drop or browse files to upload.")
        return None
    if len(uploaded_files) < 2:
        st.error("❌ You need at least 2 jsonlines files to properly run VA.")
        return None

    KST = timezone(timedelta(hours=9))
    tstr = datetime.now(KST).strftime("%m-%d_%H:%M:%S")
    # Join on the Path directly: prefixing "./" to an absolute VA_ROOT would
    # silently turn it into a path relative to the working directory.
    files_dir = user_sub_root / tstr
    files_dir.mkdir(parents=True, exist_ok=True)

    for file in uploaded_files:
        # Keep only the basename so a crafted file name cannot escape files_dir.
        file_path = files_dir / Path(file.name).name
        with open(file_path, "wb") as f:
            f.write(file.getbuffer())

    jslfiles = list(files_dir.glob("*.jsonl"))
    st.success(f"✅ Successfully uploaded {len(jslfiles)} jsonl files.")
    return files_dir.resolve()


def run_varco_arena(
    price_estimation: bool = False,
    promptname: Optional[str] = None,
    exp_name: Optional[str] = None,
    api_key: Optional[str] = None,
    evaluation_model: str = "gpt-4o-mini",
    update_interval: float = 1.0,
) -> Tuple[Path, Optional[int]]:
    """Run `../varco_arena/main.py` as a subprocess and stream its output.

    The input directory is read from `st.session_state.upfiles_dir`.

    Args:
        price_estimation: When true, pass `-c` and show the full output;
            otherwise pipe `yes` into the process and show only a tail.
        promptname: Value passed to `-p`.
        exp_name: Optional sub-directory of the input directory for outputs.
        api_key: Value passed to `-k`.
        evaluation_model: Value passed to `-e`.
        update_interval: Minimum seconds between display refreshes when
            `price_estimation` is false.

    Returns:
        `(outdir, return_code)`: the output directory and the exit status of
        the subprocess.
    """
    ptn = f"{str(st.session_state.upfiles_dir)}"
    outdir = Path(ptn)
    if exp_name:
        outdir = outdir / exp_name

    command = _build_command(
        ptn, outdir, api_key, promptname, evaluation_model, price_estimation
    )
    # Same command with the key masked: this is the only form that is
    # printed or rendered, so the secret stays out of logs and the page.
    shown_command = _build_command(
        ptn, outdir, _REDACTED_KEY, promptname, evaluation_model, price_estimation
    )
    print(shown_command)
    api_key = None  # clear immediately

    # NOTE(review): the key is still visible in the child's argv (e.g. `ps`);
    # avoiding that needs main.py to accept it another way.
    process = subprocess.Popen(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        stdin=subprocess.PIPE,
        text=True,
        bufsize=1,
        shell=True,
    )
    try:
        # Set stdout to non-blocking mode
        os.set_blocking(process.stdout.fileno(), False)

        last_update_time = time.time()
        terminal_output = st.empty()
        full_output = f"{shown_command}\n"
        # Initialised up front: the process may exit before any refresh.
        to_show = full_output
        while True:
            output = ""
            # Check if we have output to read
            if select.select([process.stdout], [], [], 0)[0]:
                output = process.stdout.readline()

            if output:
                full_output += output
                if price_estimation:
                    to_show = full_output
                    terminal_output.code(to_show, language="bash")
                else:
                    current_time = time.time()
                    if current_time - last_update_time > update_interval:
                        lines = full_output.split("\n")
                        if len(lines) < 5:
                            to_show = full_output
                        else:
                            to_show = "\n".join(["...\n..\n.\n"] + lines[-5:])
                        terminal_output.code(to_show, language="bash")
                        last_update_time = current_time
                print(output)
            else:
                # Sleep only when idle so a chatty child is drained promptly
                # instead of at one line per 100 ms.
                time.sleep(0.1)

            # Check if the process has finished
            if process.poll() is not None:
                # Read any remaining output
                remaining_output = process.stdout.read()
                if remaining_output:
                    lines = remaining_output.split("\n")
                    if len(lines) > 10:
                        to_show += "\n".join(["\n...\n..\n.\n"] + lines[-10:])
                    else:
                        to_show += remaining_output
                    terminal_output.code(to_show, language="bash")
                    print(remaining_output)
                break
    finally:
        # Release the pipe file descriptors whether or not the loop completed.
        for stream in (process.stdin, process.stdout):
            if stream is not None:
                stream.close()

    return_code = process.poll()
    return outdir, return_code


def main():
    """Render the page: upload, cost-estimation form, and run form."""
    # init lang
    st.session_state["korean"] = st.session_state.get("korean", False)

    sidebar_placeholder = default_page_setting()
    set_nav_bar(
        False, sidebar_placeholder=sidebar_placeholder, toggle_hashstr="app_init"
    )

    st.title("⚔️ VARCO ARENA ⚔️")
    if st.session_state.korean:
        st.write(
            "**VARCO Arena는 각 모델의 생성된 결과를 비교 평가하여 모델의 성능 순위를 제공하는 시스템입니다. 커스텀 테스트셋에 유용하게 활용할 수 있으며, 모범답안을 필요로 하지 않습니다.**"
        )
    else:
        st.write(
            "**VARCO Arena is an LLM benchmarking system that compares model responses across customized test scenarios without requiring reference answers.**"
        )

    st.divider()
    # Set up the file uploader
    if st.session_state.korean:
        st.markdown("모델 출력파일 업로드")
    else:
        st.markdown("### 1. Upload LLM responses")
    uploaded_files = st.file_uploader(
        "Drag and Drop jsonlines files (.jsonl)", accept_multiple_files=True
    )

    # upload state
    if "upfiles_dir" not in st.session_state:
        st.session_state.upfiles_dir = None
    if st.button("Upload Files"):
        st.session_state.upfiles_dir = upload_files(uploaded_files)

    if st.session_state.korean:
        with st.expander("❓❔ 무엇을 업로드 하나요❓❔"):
            st.info(Path("guide_mds/input_jsonls_kr.md").read_text(encoding="UTF8"))
    else:
        with st.expander("❓❔ What should I upload ❓❔"):
            st.info(Path("guide_mds/input_jsonls_en.md").read_text(encoding="UTF8"))

    # Form for cost estimation
    with st.form("cost_estimation_form"):
        if st.session_state.korean:
            st.write("### 2. 가격 산정")
        else:
            st.write("### 2. Cost Estimation")
        eval_model = st.selectbox(
            "Select Judge",
            _read_options("eval_models_list.txt"),
        )
        promptname = st.selectbox(
            "Select Evalutaion Prompt",
            _read_options("eval_prompt_list.txt"),
        )
        if promptname == USR_SUB:
            raise ValueError(
                f"{USR_SUB=} is preserved name for the system. Consider another naming for the prompt or consider changing {VA_ROOT=} (USR_SUB == VA_ROOT.parts[-1])."
            )

        estimate_button = st.form_submit_button("Calculate Cost!")
        with st.expander(
            "LLM Judge에 활용되는 프롬프트 (`Calculate Cost!` 클릭시 갱신)"
            if st.session_state.korean
            else "**Evaluation Prompt for LLM Judge (will refresh after `Calculate Cost!` clicked)**"
        ):
            prompt = load_prompt(promptname, task="-")
            # Fill every slot with its own placeholder so the template is
            # displayed unrendered.
            kwargs = dict(
                inst="{inst}",
                src="{src}",
                out_a="{out_a}",
                out_b="{out_b}",
                task="-",
            )
            if promptname == "translation_pair":
                kwargs["source_lang"] = "{source_lang}"
                kwargs["target_lang"] = "{target_lang}"
            prompt_cmpl = prompt.complete_prompt(**kwargs)

            st.markdown(f"### Evaluation Prompt: {promptname}")
            for msg in prompt_cmpl:
                st.markdown(f"**{msg['role']}**")
                st.info(show_linebreak_in_md(escape_markdown(msg["content"])))

        if estimate_button:
            if st.session_state.get("upfiles_dir") is None:
                st.error(
                    "❌ Requirements: You have to upload jsonlines files first to proceed"
                )
            else:
                st.markdown("##### Estimated Cost")
                dummy_api_key = "dummy"
                dummy_exp_name = "dummy"
                result_file_path, return_code = run_varco_arena(
                    promptname=promptname,
                    api_key=dummy_api_key,
                    exp_name=dummy_exp_name,
                    price_estimation=True,
                    evaluation_model=eval_model,
                )
                if return_code:
                    st.error("❌ RuntimeError: An error occurred during cost estimation")
                else:
                    st.success("✅ Cost estimation completed successfully")
                    st.session_state.cost_estimated = True

    # Form for actual run
    with st.form("run_arena_form"):
        if st.session_state.korean:
            st.write("### 3. Varco Arena 구동하기")
        else:
            st.write("### 3. Run Varco Arena")
        api_key = st.text_input("Enter your OpenAI API Key", type="password")
        # Path-like fragments may cause rmtree problems later. Block them.
        exp_name = _sanitize_exp_name(st.text_input("(Optional) Enter Exp. name"))

        if st.session_state.korean:
            st.write("**주의**:`Ctrl+C` 버튼은 구현되지 않았습니다. 구동 전 숙고해주세요.")
        else:
            st.write("**Caution: `Ctrl+C` button hasn't been implemented.**")
        run_button = st.form_submit_button(
            "🔥 Run Arena!",
            disabled=(not st.session_state.get("cost_estimated", False))
            or "result_file_path" in st.session_state,  # run already performed once
        )

        if run_button:
            set_nav_bar(
                True,
                sidebar_placeholder=sidebar_placeholder,
                toggle_hashstr="app_during_run",
            )
            if st.session_state.get("upfiles_dir") is None:
                st.error(
                    "❌ Requirements: You have to upload jsonlines files first to proceed"
                )
            elif not api_key:
                st.error("❌ Requirements: OpenAI key required to run VA.")
            else:
                result_file_path, return_code = run_varco_arena(
                    promptname=promptname,
                    api_key=api_key,
                    exp_name=exp_name,
                    price_estimation=False,
                    evaluation_model=eval_model,
                )
                if return_code:
                    st.error("❌ RuntimeError: An error occurred during Varco Arena run")
                else:
                    st.success("✅ Varco Arena run completed successfully")
                    st.session_state.result_file_path = result_file_path
            set_nav_bar(
                False, sidebar_placeholder=sidebar_placeholder, toggle_hashstr="app_run_done"
            )


if __name__ == "__main__":
    main()