Epik / app /views /cosmic_view.py
Minh Q. Le
Fixed modular import
efe5117
raw
history blame
7.35 kB
import os
import pickle
import tempfile
import gradio as gr
from tqdm import tqdm
from views.utils import (
create_input_instruction,
format_prediction_ouptut,
remove_temp_dir,
EXAMPLE_CONVERSATIONS,
)
from fairseq.data.data_utils import collate_tokens
import sys
sys.path.insert(0, "../") # necessary to load modules outside of app
from views import roberta, comet, COSMIC_MODEL, cosmic_args
from preprocessing import preprocess
from Model.COSMIC.erc_training.predict_epik import predict, get_valid_dataloader
def cosmic_preprocess(input, dir="."):
    """Convert raw conversation text into the pickle files used for prediction.

    The text is parsed with ``preprocess.process_user_input``; the parsed data
    is written to ``epik.csv`` inside ``dir`` and then converted into two
    pickle files in the same directory.

    Args:
        input: The conversation text handed to ``preprocess.process_user_input``.
        dir: Directory in which every intermediate file is written.

    Returns:
        A ``(pickle_path, sentences_pickle_path)`` tuple pointing at
        ``epik.pkl`` and ``epik_sentences.pkl`` inside ``dir``.

    Raises:
        gr.Error: If the parser reports ``success`` as falsy; the parser's
            ``message`` is used as the error text.
    """
    parsed = preprocess.process_user_input(input)
    if not parsed["success"]:
        raise gr.Error(parsed["message"])

    csv_path = os.path.join(dir, "epik.csv")
    pickle_dest = os.path.join(dir, "epik.pkl")
    sentences_pkl_path = os.path.join(dir, "epik_sentences.pkl")

    # Write the parsed conversation out as a CSV file (no labels at inference).
    conversations = preprocess.preapre_csv(parsed["data"], csv_path, with_label=False)

    # CSV -> pickle holding speakers, labels and sentences (in that order).
    list_columns = ["Text", "ParticipantRoleEncoded", "LabelNumeric"]
    column_order = ["ParticipantRoleEncoded", "LabelNumeric", "Text"]
    preprocess.convert_to_pickle(
        source=csv_path,
        dest=pickle_dest,
        index_col="ConversationId",
        list_type_columns=list_columns,
        order=column_order,
        exclude=["ParticipantRole"],
    )

    # Only prediction is needed, so all conversation ids go to the validation
    # split. NOTE(review): the positional 0, 0, 1 look like train/test/valid
    # proportions -- confirm against preprocess.split_and_save_ids.
    conversation_ids = conversations["ConversationId"].to_list()
    preprocess.split_and_save_ids(conversation_ids, 0, 0, 1, dir=dir)

    # Append the id lists to the pickle written above.
    id_files = ["train_set.txt", "test_set.txt", "validation_set.txt"]
    preprocess.merge_pkl_with_ids(pickle_src=pickle_dest, ids_files=id_files, dir=dir)

    # Second pickle: sentences only, every other column excluded.
    non_text_columns = ["ParticipantRole", "ParticipantRoleEncoded", "LabelNumeric"]
    preprocess.convert_to_pickle(
        source=csv_path,
        dest=sentences_pkl_path,
        index_col="ConversationId",
        list_type_columns=["Text"],
        exclude=non_text_columns,
    )

    return pickle_dest, sentences_pkl_path
def cosmic_roberta_extract(path, dest_dir="."):
    """Extract RoBERTa features for every conversation stored in a pickle file.

    The pickle at ``path`` must unpack into six objects: speakers, labels,
    sentences, train ids, test ids and validation ids. For each conversation id
    the sentences are encoded with the module-level ``roberta`` model, and
    features from the last four entries of the returned hidden-state list are
    stored per conversation.

    Args:
        path: Path of the input pickle file.
        dest_dir: Directory in which ``epik_features_roberta.pkl`` is written.

    Returns:
        Path of the written feature pickle. It contains, in order: speakers,
        labels, the four feature dicts (last hidden state first), sentences,
        train ids, test ids and validation ids.
    """
    # NOTE(security): pickle can execute arbitrary code while loading; only
    # feed this function files produced by this application.
    with open(path, "rb") as src:
        speakers, labels, sentences, train_ids, test_ids, valid_ids = pickle.load(src)

    # layer_features[0] holds features from hidden[-1], [1] from hidden[-2], ...
    layer_features = ({}, {}, {}, {})

    for conv_id in tqdm(train_ids + test_ids + valid_ids):
        # Drop every non-ASCII character before tokenising.
        utterances = [
            s.encode("ascii", errors="ignore").decode("utf-8")
            for s in sentences[conv_id]
        ]
        # NOTE(review): pad_idx=1 is presumably RoBERTa's padding index -- confirm.
        batch = collate_tokens([roberta.encode(u) for u in utterances], pad_idx=1)
        hidden = roberta.extract_features(batch, return_all_hiddens=True)

        for depth, store in enumerate(layer_features, start=1):
            # Index 0 on the second axis: presumably the first token of each
            # sentence -- TODO confirm. One array per sentence is kept.
            store[conv_id] = list(hidden[-depth][:, 0, :].detach().numpy())

    roberta_feature_path = os.path.join(dest_dir, "epik_features_roberta.pkl")
    payload = [
        speakers,
        labels,
        *layer_features,
        sentences,
        train_ids,
        test_ids,
        valid_ids,
    ]
    # The context manager guarantees the file is flushed and closed before the
    # path is handed to the next pipeline stage.
    with open(roberta_feature_path, "wb") as dst:
        pickle.dump(payload, dst)

    return roberta_feature_path
def cosmic_comet_extract(path, dir="."):
    """Extract COMET features for the sentences stored in a pickle file.

    Args:
        path: Path of the pickle file whose content is passed to
            ``comet.extract``.
        dir: Directory in which ``epik_features_comet.pkl`` is written.

    Returns:
        Path of the written feature pickle.
    """
    print("Extracting features in", path)

    # NOTE(security): pickle can execute arbitrary code while loading; only
    # feed this function files produced by this application.
    with open(path, "rb") as src:
        sentences = pickle.load(src)

    features = comet.extract(sentences)

    comet_feature_path = os.path.join(dir, "epik_features_comet.pkl")
    # Close the file deterministically so the data is on disk before the path
    # is returned to the caller.
    with open(comet_feature_path, "wb") as dst:
        pickle.dump(features, dst)

    return comet_feature_path
def cosmic_classifier(input):
    """Run the full COSMIC pipeline on a conversation and format the result.

    A temporary directory is created under the current working directory to
    hold intermediate files. It is removed when the function exits, whether
    the pipeline succeeded or raised (for example ``gr.Error`` on input that
    fails preprocessing).

    Args:
        input: The conversation text, forwarded to ``cosmic_preprocess``.

    Returns:
        The value produced by ``format_prediction_ouptut`` for the first
        conversation id returned by ``get_valid_dataloader``.
    """
    # Create a temporary directory for the input data.
    temp_dir = tempfile.mkdtemp(dir=os.getcwd(), prefix="temp")
    try:
        epik_path, epik_sentences_path = cosmic_preprocess(input, temp_dir)
        roberta_path = cosmic_roberta_extract(epik_path, temp_dir)
        comet_path = cosmic_comet_extract(epik_sentences_path, temp_dir)

        # Use the COSMIC model to make predictions.
        data_loader, ids = get_valid_dataloader(roberta_path, comet_path)
        predictions = predict(COSMIC_MODEL, data_loader, cosmic_args)

        # Only speakers and sentences are needed from the six pickled objects.
        with open(epik_path, "rb") as src:
            speakers, _, sentences, _, _, _ = pickle.load(src)

        # Assuming that there's only one conversation.
        conv_id = ids[0]
        return format_prediction_ouptut(
            speakers[conv_id], sentences[conv_id], predictions[0]
        )
    finally:
        # Runs on success and on failure, so failed requests do not leave
        # temp directories behind.
        print()
        print("======= Removing Temporary Directory =======")
        remove_temp_dir(temp_dir)
def cosmic_ui():
    """Build the Gradio Blocks page for the COSMIC model and return it.

    The page shows an introduction, the shared input instructions, an example
    selector and a text area for the conversation, a submit button, and a
    Markdown component that receives the output of ``cosmic_classifier``.
    """
    not_selected = "-- Not Selected --"

    def on_example_change(selected):
        # Anything that is not a known example key clears the text area.
        return EXAMPLE_CONVERSATIONS.get(selected, "")

    def clear_output(_):
        # Any edit of the conversation invalidates the shown prediction.
        return ""

    with gr.Blocks() as cosmic_model:
        gr.Markdown(
            """
# COSMIC
COSMIC is a popular model for predicting sentiment labels using the entire
context of the conversation. In other words, it analyzes the previous
messages to predict the sentiment label for the current message.<br/>
The model was adopted from this
[repo](https://github.com/declare-lab/conv-emotion.git), implemented based
on this research [paper](https://arxiv.org/pdf/2010.02795.pdf).
```bash COSMIC: COmmonSense knowledge for eMotion Identification in
Conversations. D. Ghosal, N. Majumder, A. Gelbukh, R. Mihalcea, & S. Poria. Findings of EMNLP 2020.
```
"""
        )

        create_input_instruction()

        with gr.Row():
            # Left column: example picker, free-text input and submit button.
            with gr.Column():
                example_dropdown = gr.Dropdown(
                    choices=[not_selected, *EXAMPLE_CONVERSATIONS],
                    value=not_selected,
                    label="Select an example",
                )

                gr.Markdown('<p style="text-align: center;color: gray;">--- OR ---</p>')

                conversation_input = gr.TextArea(
                    value="",
                    label="Input you conversation",
                    placeholder="Plese input your conversation here.\n\n\n\nMaximum number of lines: 200",
                    lines=5,
                    max_lines=200,
                )

                # Picking an example fills the text area with its conversation.
                example_dropdown.input(
                    on_example_change,
                    inputs=example_dropdown,
                    outputs=conversation_input,
                )

                submit_btn = gr.Button(value="Submit")

            # Right column: heading and the prediction output.
            with gr.Column():
                gr.Markdown(
                    '</br></br></br></br><h3 style="text-align: center;">Predicted Sentiment Labels for the Conversation</h3></br>'
                )
                output = gr.Markdown(value="", label="Output")

        submit_btn.click(cosmic_classifier, conversation_input, output)
        conversation_input.change(clear_output, conversation_input, output)

    return cosmic_model