File size: 8,499 Bytes
a446b0b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
import ast
import math
import os
import pickle
import random

import pandas as pd

# Maps a speaker role name to its numeric encoding; unknown roles default to 1
# in encode_speaker_role.
# NOTE(review): "SPEAKERA" looks like a typo for "SPEAKER" — renaming would
# require touching every reference, so the name is kept as-is.
SPEAKERA_ROLE_MAP = {"Agent": 0, "Visitor": 1}

# Maps a sentiment label name to its numeric class id (0-14); unknown labels
# are encoded as -1 in encode_sentiment_label.
LABEL_MAP = {
    "Curiosity": 0,
    "Obscene": 1,
    "Informative": 2,
    "Openness": 3,
    "Acceptance": 4,
    "Interest": 5,
    "Greeting": 6,
    "Disapproval": 7,
    "Denial": 8,
    "Anxious": 9,
    "Uninterested": 10,
    "Remorse": 11,
    "Confused": 12,
    "Accusatory": 13,
    "Annoyed": 14,
}


def process_user_input(input: str):
    """Parse the user input and return a list of rows where each row is a list
    with format `[<conversation_id>, <speaker>, <message>]`.

    NOTE(review): the parameter name shadows the builtin `input`; it is kept
    unchanged so keyword callers keep working.

    Args:
        input (str): the input of the user where each line has the format
        `<speaker>:<message>`. Only one message per line.

    Returns:
        dict: a dictionary containing whether the input was successfully
        processed and, if so, the processed data of the input.
    """
    # Covers both None and the empty string.
    if not input:
        return {"success": False, "message": "Input must not be an empty string!"}

    data = []
    for line in input.split("\n"):
        # Skip blank lines so trailing newlines don't produce bogus rows.
        if line == "":
            continue
        try:
            # Split only on the first ":" so messages may contain colons.
            speaker, message = line.split(":", 1)
        except ValueError:
            # Line has no ":" separator; only catch the specific error
            # instead of the original bare except.
            return {"success": False, "message": "Invalid Input"}

        if speaker != "Agent" and speaker != "Visitor":
            return {"success": False, "message": f"Invalid speaker {speaker}"}

        # Assuming there's only one input conversation
        # Give it a dummy conversation id of epik_0
        data.append(["epik_0", speaker, message])

    return {
        "success": True,
        "message": "Success",
        "data": data,
    }


def encode_speaker_role(role):
    """Return the numeric encoding for *role*, defaulting to 1 (Visitor)."""
    try:
        return SPEAKERA_ROLE_MAP[role]
    except KeyError:
        # Unknown roles fall back to the Visitor encoding.
        return 1


def decode_speaker_role(role_numeric):
    """Return the role name whose encoding is *role_numeric*.

    Args:
        role_numeric (int): a value from SPEAKERA_ROLE_MAP.

    Returns:
        str: the matching role name, or "Unknown Speaker" if no entry matches.
    """
    # Reverse lookup over the small role map.
    for role, numeric_val in SPEAKERA_ROLE_MAP.items():
        if role_numeric == numeric_val:
            return role

    # Fixed typo in the fallback string: "Unknow Speaker" -> "Unknown Speaker".
    return "Unknown Speaker"


def encode_sentiment_label(label):
    """Return the numeric id for sentiment *label*, or -1 when unknown."""
    if label in LABEL_MAP:
        return LABEL_MAP[label]
    return -1


def decode_numeric_label(label_numeric):
    """Return the sentiment label whose encoding is *label_numeric*.

    Args:
        label_numeric (int): a value from LABEL_MAP.

    Returns:
        str: the matching label, or "Unknown Label" if no entry matches.
    """
    # Reverse lookup over the label map.
    for label, numeric_val in LABEL_MAP.items():
        if label_numeric == numeric_val:
            return label

    # Fixed typo in the fallback string: "Unknow Label" -> "Unknown Label".
    return "Unknown Label"


def preapre_csv(data: list[list], output_path: str, with_label: bool = False):
    """
    Process and group the speakers, messages, and labels (if any) by
    conversation ids, then write the grouped result to a csv file. This
    function is useful to prepare the necessary csv file before converting
    it into a pickle file.

    NOTE(review): the function name "preapre" is a typo for "prepare"; it is
    kept unchanged so existing callers keep working.

    Args:
        data (list[list]): A list containing the rows of a dataframe. Each row
        contains values representing the conversation id, speaker role,
        message (, and label if any) in this order.
        output_path (str): path to write the csv file.
        with_label (bool, optional): Whether the input data contains labels
        (ie, for training) or not (ie, for making predictions on a new
        sample). Defaults to False.

    Returns:
        pd.DataFrame: the grouped dataframe that was written to output_path.
    """
    columns = ["ConversationId", "ParticipantRole", "Text"]
    if with_label:
        columns.append("Label")

    df = pd.DataFrame(data=data, columns=columns)

    # Encode the participant role (pass the function directly; no lambda
    # wrapper needed).
    df["ParticipantRoleEncoded"] = df["ParticipantRole"].apply(encode_speaker_role)

    if with_label:
        # Encode the labels.
        df["LabelNumeric"] = df["Label"].apply(encode_sentiment_label)
    else:
        # Give the new input dummy labels to match the model input shape
        # (scalar assignment broadcasts -1 to every row).
        df["LabelNumeric"] = -1

    # Group the per-message columns into lists keyed by conversation id.
    agg_params = {"Label": list} if with_label else {}
    agg_params.update(
        {
            "ParticipantRole": list,
            "ParticipantRoleEncoded": list,
            "Text": list,
            "LabelNumeric": list,
        }
    )
    grouped_df = df.groupby("ConversationId").agg(agg_params).reset_index()

    # NOTE(review): "ascii" encoding will raise on non-ascii messages —
    # confirm inputs are ascii-only or consider utf-8.
    grouped_df.to_csv(output_path, index=False, encoding="ascii")

    return grouped_df


def convert_to_pickle(
    source: str,
    dest: str,
    index_col: str = None,
    list_type_columns: list = None,
    order: list = None,
    exclude: list = None,
    single_tuple: bool = False,
):
    """Convert a csv file into a pickle file with format
    col1, col2, ..., coln

    Each column becomes a dict mapping the dataframe index to the column
    value, and the dicts are bundled into a tuple.

    Args:
        source (str): path to csv file.
        dest (str): the location where the pickle file will be stored.
        index_col (str): the column with unique ids that serves as index.
        Default to None.
        list_type_columns (list, optional): columns whose cells hold a string
        representation of a Python list and should be parsed back into lists.
        Defaults to [].
        order (list, optional): specify the order for one or many columns from
        left to right, followed by columns not in order.
        exclude (list, optional): columns to be excluded from the result.
        Defaults to [].
        single_tuple (bool): whether or not to output as tuple if there is
        only one single column. Default to False.
    """
    # Normalize None defaults to fresh lists (avoids the mutable-default-arg
    # pitfall of the original signature).
    list_type_columns = [] if list_type_columns is None else list_type_columns
    order = [] if order is None else order
    exclude = [] if exclude is None else exclude

    df = pd.read_csv(source)
    df = df.drop(columns=exclude)

    # Convert columns from the string representation of a list to a list.
    # ast.literal_eval only parses Python literals, unlike the original
    # eval(), which would execute arbitrary expressions from the csv.
    for col in list_type_columns:
        if col in df.columns:
            df[col] = df[col].fillna("[]").apply(ast.literal_eval)

    if index_col is not None:
        df = df.set_index(index_col)

    # Reorder the columns: `order` first, then the remaining columns.
    if order:
        left = df[order]
        right = df[[col for col in df.columns if col not in order]]
        df = pd.concat([left, right], axis=1)

    # One dict per column, keyed by the dataframe index.
    output = tuple(df[col].to_dict() for col in df.columns)

    # Unwrap a one-column result unless the caller asked for a tuple.
    if not single_tuple and len(output) == 1:
        output = output[0]

    # The with-statement closes the file; the explicit close() in the
    # original was redundant.
    with open(dest, "wb") as f:
        pickle.dump(output, f)

    return


def split_and_save_ids(
    ids, train_ratio=0.8, test_ratio=0.1, valid_ratio=0.1, dir=".", seed=None
):
    """
    Randomly split a list of IDs into training, testing, and validation sets
    and save them to text files (train_set.txt, test_set.txt,
    validation_set.txt) under *dir*.

    Args:
        ids (list): List of IDs to be split. The caller's list is left
        unchanged (the original shuffled it in place).
        train_ratio (float): Ratio of IDs for the training set (default is 0.8).
        test_ratio (float): Ratio of IDs for the testing set (default is 0.1).
        valid_ratio (float): Ratio of IDs for the validation set (default is 0.1).
        dir (str): the path to the directory to save the files for ids
        seed (int): Seed for randomization (default is None).

    Returns:
        train_set (list): List of IDs in the training set.
        test_set (list): List of IDs in the testing set.
        valid_set (list): List of IDs in the validation set.
    """

    # math.isclose avoids spurious failures from float rounding
    # (e.g. 0.7 + 0.2 + 0.1 != 1.0 exactly with exact equality).
    assert math.isclose(
        train_ratio + test_ratio + valid_ratio, 1.0
    ), "Ratios should add up to 1.0"

    # Set random seed for reproducibility
    if seed is not None:
        random.seed(seed)

    # Shuffle a copy so the caller's list is not mutated in place.
    shuffled = list(ids)
    random.shuffle(shuffled)

    # Calculate the split points
    train_split = int(len(shuffled) * train_ratio)
    test_split = train_split + int(len(shuffled) * test_ratio)

    # Split the IDs; the validation set absorbs any rounding remainder.
    train_set = shuffled[:train_split]
    test_set = shuffled[train_split:test_split]
    valid_set = shuffled[test_split:]

    # Save the sets to text files, one id per line.
    def save_to_txt(file_path, id_set):
        with open(file_path, "w") as file:
            file.write("\n".join(str(conv_id) for conv_id in id_set))

    save_to_txt(os.path.join(dir, "train_set.txt"), train_set)
    save_to_txt(os.path.join(dir, "test_set.txt"), test_set)
    save_to_txt(os.path.join(dir, "validation_set.txt"), valid_set)

    return train_set, test_set, valid_set


def merge_pkl_with_ids(pickle_src: str, ids_files: list, dir: str = "."):
    """Merge an existing pickle file with id files, appending one list of ids
    per file in *ids_files* (e.g. train_ids, test_ids, and valid_ids) to the
    pickled tuple and rewriting the file in place.

    The object stored in pickle_src must be a tuple for the concatenation to
    work.

    Args:
        pickle_src (str): the path to the pickle file
        ids_files (list): list of files that contain ids. Example:
        ["train_set.txt", "test_set.txt", "validation_set.txt"]. Each file
        should contain one single unique id on each line.
        dir (str, optional): the directory for ids_files. Defaults to ".".
    """
    # Read each id file into a list of stripped lines.
    ids_set = ()
    for filename in ids_files:
        path = os.path.join(dir, filename)
        with open(path, "r") as file:
            ids = [line.strip() for line in file]
        ids_set += (ids,)

    # The with-statements close the files; the explicit close() calls in the
    # original were redundant.
    with open(pickle_src, "rb") as file:
        data = pickle.load(file)

    with open(pickle_src, "wb") as file:
        pickle.dump(data + ids_set, file)