import pandas as pd
from pathlib import Path
from datasets import load_dataset
import numpy as np
import os
import re

UNVERIFIED_MODELS = [

]

CONTAMINATED_MODELS = [

]

# From Open LLM Leaderboard
def model_hyperlink(link, model_name):
    """Render a model name as a styled HTML hyperlink for the leaderboard.

    A few model families are special-cased: closed API models (Cohere,
    openai, Anthropic, google) link to their org page, while "random" and
    "PoLL/..." render as plain text. Everything else links to *link*.
    Appends " *" for unverified models and a warning emoji for models in
    CONTAMINATED_MODELS.
    """
    def anchor(url, text):
        # Shared dotted-underline link style used across the leaderboard.
        return f'<a target="_blank" href="{url}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{text}</a>'

    # Keep table cells readable: truncate names above 50 characters.
    if len(model_name) > 50:
        model_name = model_name[:47] + "..."

    org = model_name.split("/")[0]
    if model_name == "random":
        output = "random"
    elif model_name == "Cohere March 2024":
        output = anchor("https://huggingface.co/Cohere", model_name)
    elif org == "openai":
        output = anchor("https://huggingface.co/openai", model_name)
    elif org == "Anthropic":
        output = anchor("https://huggingface.co/Anthropic", model_name)
    elif org == "google":
        output = anchor("https://huggingface.co/google", model_name)
    elif org == "PoLL":
        output = model_name
    else:
        # BUG FIX: this assignment previously ran unconditionally,
        # clobbering every special case above; it now only applies to
        # regular (Hugging Face hub) models.
        output = anchor(link, model_name)

    if model_name in UNVERIFIED_MODELS:
        output += " *"
    if model_name in CONTAMINATED_MODELS:
        output += " ⚠️"
    return output

def undo_hyperlink(html_string):
    """Extract the inner text of the first HTML tag in *html_string*.

    Returns the text between the first '>' and the following '<', or the
    literal string "No text found" when no such span exists.
    """
    found = re.search(r">([^<]+)<", html_string)
    return found.group(1) if found else "No text found"


# Define a function to fetch and process data
# Define a function to fetch and process data
def load_all_data(data_repo, subdir: str, subsubsets=False):
    """Load every per-model JSON result file under *data_repo*/*subdir*.

    Each ``<model_name>.json`` file is loaded via ``datasets`` and tagged
    with a "model" column; all rows are concatenated into one dataframe.
    Files are loaded one by one so mismatched schemas across files do not
    break the load.

    NOTE(review): *subsubsets* is currently unused; kept for interface
    compatibility.  Each JSON file is assumed to yield exactly one row
    (``add_column`` receives a single-element list) — TODO confirm.
    """
    # Renamed from `dir`, which shadowed the builtin; use pathlib throughout.
    data_dir = Path(data_repo) / subdir

    # One result file per model: "<model_name>.json".
    models_names = [p.name.split(".json")[0] for p in data_dir.iterdir()
                    if p.is_file() and p.name.endswith(".json")]

    # Accumulate all per-model rows into a single dataframe.
    df = pd.DataFrame()
    for model_name in models_names:
        model_data = load_dataset("json", data_files=str(data_dir / (model_name + ".json")), split="train")
        model_data = model_data.add_column("model", [model_name])
        df2 = pd.DataFrame(model_data)
        # Prepend, matching the original accumulation order.
        df = pd.concat([df2, df])

    return df


def prep_df(df):
    """Prepare a raw results dataframe for leaderboard display.

    Expects columns "model", "path", "average" plus per-category score /
    rank / confidence columns. Score columns are scaled from fractions to
    percentages, the model column becomes an HTML hyperlink (via
    ``model_hyperlink``), and known columns get human-readable headers.
    """
    # sort columns alphabetically so per-category columns have a stable order
    df = df.reindex(sorted(df.columns), axis=1)

    # move column "model" to the front
    cols = list(df.columns)
    cols.insert(0, cols.pop(cols.index('model')))
    df = df.loc[:, cols]

    # link each model name to its Hugging Face page; "path" is only needed
    # to build the link, so drop it afterwards
    df["model"] = df.apply(lambda row: model_hyperlink(f"https://huggingface.co/{row['path']}", row['model']), axis=1)
    df = df.drop(columns=["path"])

    # convert every score column — everything except "model" and the
    # per-category rank/confidence columns — from fractions to percentages
    cols = df.columns.tolist()
    cols.remove("model")
    cols = [c for c in cols if "rank" not in c and "confi" not in c]
    df[cols] = (df[cols] * 100)

    # move average column to the second position
    cols = list(df.columns)
    cols.insert(1, cols.pop(cols.index('average')))
    df = df.loc[:, cols]

    # human-readable column headers (rename only touches columns present)
    df = df.rename(columns={
        "model": "Model",
        "average": "Average",
        "brainstorm": "Brainstorm",
        "open_qa": "Open QA",
        "closed_qa": "Closed QA",
        "extract": "Extract",
        "generation": "Generation",
        "rewrite": "Rewrite",
        "summarize": "Summarize",
        "classify": "Classify",
        "reasoning_over_numerical_data": "Reasoning Over Numerical Data",
        "multi-document_synthesis": "Multi-Document Synthesis",
        "fact_checking_or_attributed_qa": "Fact Checking or Attributed QA",
    })

    return df


def sort_by_category(df, category):
    """Return a copy of *df* ordered and formatted for one score category.

    Rows are sorted by the category's rank column (ascending), breaking
    ties on the score itself (descending). The rank column is moved to
    the front and renamed "Rank", the category score to the third slot,
    and its confidence interval to the fourth (renamed "95% CI"). All
    remaining per-category rank/confidence columns are dropped.
    """
    slug = category.lower().replace(" ", "_")
    rank_col = slug + "_rank"
    confi_col = slug + "_confi"

    out = df.copy().sort_values(by=[rank_col, category], ascending=[True, False])

    def _move(frame, col, pos):
        # Relocate *col* to index *pos*, preserving the rest of the order.
        order = list(frame.columns)
        order.insert(pos, order.pop(order.index(col)))
        return frame.loc[:, order]

    out = _move(out, rank_col, 0).rename(columns={rank_col: "Rank"})
    out = _move(out, category, 2)
    out = _move(out, confi_col, 3).rename(columns={confi_col: "95% CI"})

    # Strip every remaining per-category ranking / confidence column.
    leftovers = [c for c in out.columns if c.endswith(("rank", "confi"))]
    return out.drop(columns=leftovers)