# NDIS Project - OpenAI - PBSP Scoring - Page 4 - Quality of Life Strategies

In [None]:
import openai
import re
import string
from ipywidgets import interact
import ipywidgets as widgets
from IPython.display import display, clear_output, Javascript, HTML, Markdown
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import json
import spacy
from spacy import displacy
from dotenv import load_dotenv
import pandas as pd
import argilla as rg
from argilla.metrics.text_classification import f1
from typing import Dict
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_colwidth', 10000)
pd.set_option('display.width', 10000)

In [None]:
#initializations
# `os` is required for the environment lookups below but is not among the
# notebook's imports, so it is imported here to avoid a NameError.
import os

# Read a local .env file (if one exists) before the environment is queried.
# `load_dotenv` is imported at the top of the notebook but was never called.
load_dotenv()

# OpenAI client configuration, taken entirely from environment variables.
openai.api_key = os.environ['API_KEY']
openai.api_base = os.environ['API_BASE']
openai.api_type = os.environ['API_TYPE']
openai.api_version = os.environ['API_VERSION']
# Deployment identifier passed as `engine` when requesting chat completions.
deployment_name = os.environ['DEPLOYMENT_ID']

#argilla
rg.init(
    api_url=os.environ["ARGILLA_API_URL"],
    api_key=os.environ["ARGILLA_API_KEY"]
)

In [None]:
#sentence extraction
def extract_sentences(paragraph):
    """Split ``paragraph`` into trimmed, non-empty fragments.

    The text is cut at every period, exclamation mark, question mark,
    semicolon, colon, comma, underscore, newline and hyphen. Fragments that
    are empty after stripping surrounding whitespace are discarded.
    """
    delimiters = ['\\.', '!', '\\?', ';', ':', ',', '\\_', '\n', '\\-']
    splitter = re.compile('|'.join(delimiters))
    fragments = []
    for piece in splitter.split(paragraph):
        trimmed = piece.strip()
        if trimmed:
            fragments.append(trimmed)
    return fragments

In [None]:
def filter_dataframe(result_df, paragraph):
    """Keep only the rows whose phrase actually occurs in ``paragraph``.

    A phrase is kept when both of the following hold:

    * its lower-cased, punctuation-free form is a substring of the
      lower-cased, punctuation-free paragraph, and
    * at least 20% of its distinct characters also occur in the paragraph.

    Args:
        result_df: DataFrame with a string column named ``'Phrase'``.
        paragraph: The original text the phrases were extracted from.

    Returns:
        The matching rows with duplicates removed. If no row matches (or
        ``result_df`` is empty), ``result_df`` itself is returned with
        duplicates removed, so the caller never loses every phrase.
    """
    # Curly apostrophes are not part of string.punctuation; strip them too so
    # that "Eddie’s" and "Eddie's" normalise to the same text.
    strip_table = str.maketrans("", "", string.punctuation + "‘’")

    def normalise(text):
        return text.lower().translate(strip_table)

    # Paragraph-side values do not depend on the row: compute them once.
    norm_paragraph = normalise(paragraph)
    paragraph_chars = set(paragraph.lower())

    def is_grounded(phrase):
        phrase_chars = set(phrase.lower())
        if not phrase_chars:
            # An empty phrase is a substring of anything and would divide by
            # zero in the overlap ratio below.
            return False
        if normalise(phrase) not in norm_paragraph:
            return False
        overlap = len(phrase_chars & paragraph_chars) / len(phrase_chars)
        return overlap >= 0.2

    # astype(bool) keeps this a boolean mask even when the frame is empty
    # (an empty object Series would otherwise be read as a column selection).
    keep = result_df['Phrase'].apply(is_grounded).astype(bool)
    filtered_df = result_df[keep]
    if filtered_df.empty:
        filtered_df = result_df
    return filtered_df.drop_duplicates()

In [None]:
def process_response(response, query):
    """Parse the model's answer into a DataFrame of phrases, topics and scores.

    The answer is read line by line. A line containing
    ``"(Confidence Score:"`` is a phrase line: the text before the marker is
    the phrase and the number after it is the score. Any other line is checked
    for one of the known headings (e.g. ``"Health:"``), which sets the topic
    assigned to the phrase lines that follow.

    Args:
        response: Raw text returned by the model.
        query: The practitioner paragraph the model was asked about.

    Returns:
        DataFrame with columns ``Phrase``, ``Topic`` and ``Score``. The parsed
        phrases are cleaned (leading list number and surrounding double quotes
        removed) and passed through ``filter_dataframe``. If nothing could be
        parsed, or cleaning/filtering fails, every fragment returned by
        ``extract_sentences(query)`` is reported with topic ``'NONE'`` and
        score ``0.9``.
    """
    # Heading text emitted by the model -> internal topic label.
    heading_topics = {
        "Likes/Dislikes:": "LIKES/DISLIKES",
        "Interests:": "INTERESTS",
        "Hobbies:": "HOBBIES",
        "Relationships:": "RELATIONSHIPS",
        "Employment:": "EMPLOYMENT",
        "Health:": "HEALTH",
        "Education:": "EDUCATION",
        "None:": "NONE",
    }
    score_marker = "(Confidence Score:"

    sentences = []
    topics = []
    scores = []
    topic = None
    for line in response.strip().split("\n"):
        if score_marker in line:
            # Phrase lines are recognised first so that a phrase which happens
            # to contain heading text (e.g. "Mental Health: ...") is not
            # mistaken for a heading and dropped.
            parts = line.split(score_marker)
            if len(parts) != 2:
                continue
            try:
                score = float(parts[1].strip().replace(")", ""))
            except ValueError:
                # Malformed score: skip this line only.
                continue
            sentences.append(parts[0].strip())
            topics.append(topic)
            scores.append(score)
            continue
        for heading, label in heading_topics.items():
            if heading in line:
                topic = label
                break

    result_df = None
    if sentences:
        try:
            parsed = pd.DataFrame({'Phrase': sentences, 'Topic': topics, 'Score': scores})
            # Remove only the leading list number ("1. "); an unanchored
            # pattern would also eat numbers inside the phrase ("2.5 km").
            parsed['Phrase'] = parsed['Phrase'].str.replace(r'^\s*\d+\.\s*', '', regex=True)
            parsed['Phrase'] = parsed['Phrase'].str.strip('"')
            result_df = filter_dataframe(parsed, query)
        except Exception:
            # Deliberate best-effort: any failure falls back to the plain
            # sentence split below instead of aborting the scoring.
            result_df = None

    if result_df is None:
        sentences = extract_sentences(query)
        topics = ['NONE'] * len(sentences)
        scores = [0.9] * len(sentences)
        result_df = pd.DataFrame({'Phrase': sentences, 'Topic': topics, 'Score': scores})
    return result_df

In [None]:
def get_prompt(query):
    """Build the classification prompt for one practitioner paragraph.

    The returned text embeds ``query`` under "Practitioner Paragraph:" and asks
    the model to list exact phrases from it under the headings
    "Likes/Dislikes", "Interests", "Hobbies", "Relationships", "Employment",
    "Health" and "Education", followed by a final "None:" list. Every phrase is
    requested in a numbered list and followed by "(Confidence Score: x.xx)",
    as shown in the example answer at the end of the prompt.

    Args:
        query: The practitioner's paragraph, inserted verbatim.

    Returns:
        The complete prompt string.
    """
    prompt = f"""
    The paragraph below is written in a positive behaviour support plan by a disability practitioner to outline a set of strategies to implement that aim to enhance the person with disability's quality of life. These strategies are not related to the person with disability's target behaviours; but they are strategies related to their likes/dislikes, interests, hobbies, relationships, employment, health, and education.

    Practitioner Paragraph:
    {query}

    Requirement:
    - Identify the phrases from the practitioner paragraph above that represent each of the following quality of life strategies: "Likes/Dislikes", "Interests", "Hobbies", "Relationships", "Employment", "Health", and "Education". 

    Guidelines:
    - "Likes/Dislikes": One strategy to enhance the quality of life of a person with a disability is to provide him with experiences that he enjoys and avoid experiences that he finds unpleasant. For example, if the person with a disability likes music, then encourage him to attend concerts or music festivals. On the other hand, if he dislikes crowded places, then discourage him from attending such events.

    - "Interests": One strategy to enhance the quality of life of a person with a disability is to provide opportunities for him to pursue his interests. For example, if the person with a disability is interested in animals, then encourage him to visit a zoo or volunteer at an animal shelter.

    - "Hobbies": One strategy to enhance the quality of life of a person with a disability is to facilitate access to materials or equipment needed for practicing his hobbies, or to arrange for him to participate in hobby-related events. For example, if the person with a disability enjoys painting, then provide him with art supplies or arrange for him to attend art classes.

    - "Relationships": One strategy to enhance the quality of life of a person with a disability is to support his existing relationships or facilitate the development of new relationships. For example, if the person with a disability enjoys spending time with a particular friend or family member, then support and encourage that relationship. 

    - "Employment": One strategy to enhance the quality of life of a person with a disability is to identify potential job opportunities, provide job training or coaching, or modify job duties to better suit the person with a disability's abilities. For example, if the person with a disability has an interest in gardening, then find for him employment at a nursery or garden center.

    - "Health": One strategy to enhance the quality of life of a person with a disability is to facilitate access to healthcare services, encourage healthy lifestyle choices, and support the person with a disability's physical and mental well-being. For example, if the person with a disability enjoys outdoor activities, then facilitate opportunities for him to engage in those activities.

    - "Education": One strategy to enhance the quality of life of a person with a disability is to identify educational resources or programs that align with the person with a disability's interests or abilities, or provide support to pursue educational goals. For example, if the person with a disability has an interest in history, then provide him with access to historical books, documentaries, or lectures.

    Specifications of a correct answer:
    - Please provide a response that closely matches the information in the practitioner paragraph and does not deviate significantly from it.
    - Provide your answer in numbered lists. 
    - All the phrases in your answer must be exact substrings in the original practitioner paragraph. without changing any characters.
    - All the upper case and lower case characters in the phrases in your answer must match the upper case and lower case characters in the original practitioner paragraph.
    - Start numbering the phrases under each quality of life strategy from number 1. 
    - Start each list of phrases with these titles: "Likes/Dislikes", "Interests", "Hobbies", "Relationships", "Employment", "Health", "Education".
    - For each phrase that belongs to any of the above quality of life strategies, provide a confidence score that ranges between 0.50 and 1.00, where a score of 0.50 means you are very weakly confident that the phrase belongs to that specific strategy, whereas a score of 1.00 means you are very strongly confident that the phrase belongs to that specific strategy.
    - Never include any phrase in your answer that does not exist in the practitioner paragraph.
    - If none of the phrases in the practitioner paragraph belongs to a quality of life strategy, do not include this quality of life strategy in your answer.
    - Include a final numbered list titled "None:", which include all the remaining phrases from the practitioner paragraph that do not belong to any of the quality of life strategies above. Provide a confidence score for each of these phrases as well.

    Example answer:

    Likes/Dislikes:
    1. I have encouraged Eddie to attend the theatre as he enjoys watching movies. (Confidence Score: 1.00)
    2. I have discouraged Taylor from attending theatres as watching loud movies disturbes him. (Confidence Score: 0.95)
    
    Interests:
    1. I always urge Eddie to visit a butterfly house as I've noticed his big interest in butterflies. (Confidence Score: 0.97)
    
    Hobbies:
    1. I've arranged for Taylor to attend dancing classes as he told me that dancing is his greatest hobby. (Confidence Score: 0.94)

    Relationships:
    1. I always encourage Eddie to gather with his school friends at cafes or restaurants as he enjoys spending time with them. (Confidence Score: 0.99)
    
    Employment:
    1. I have been searching for librarian role at a local library for Taylor to undertake since he has shown a great interest in books. (Confidence Score: 0.94)

    Health:
    1. I have encouraged Eddie to participate in local Marathons to improve his physical well-being. (Confidence Score: 0.99)

    Education:
    1. I have provided Taylor with free access to fiction and drama books as he is a big fan of literature. (Confidence Score: 0.96)

    None:
    1. I support Eddie, who is a 25-year old man with a disability. (Confidence Score: 0.96)
    2. Taylor has a plenty of likes, interests and hobbies. In the following lines, I will expain the strategies I am using these to enhance his quality of life.  (Confidence Score: 0.94)
    """
    return prompt

In [None]:
def get_response_chatgpt(prompt):
    """Send ``prompt`` as the user message of a chat completion and return the reply text.

    The request goes to the deployment named by the module-level
    ``deployment_name`` with a fixed system message and ``temperature=0``.
    The content of the first returned choice is returned.
    """
    conversation = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ]
    completion = openai.ChatCompletion.create(
        engine=deployment_name,
        messages=conversation,
        temperature=0,
    )
    first_choice = completion["choices"][0]
    return first_choice["message"]["content"]

In [None]:
def convert_df(result_df):
    """Reshape scored phrases into ``text`` / ``prediction`` columns.

    Args:
        result_df: DataFrame with ``Phrase``, ``Topic`` and ``Score`` columns.

    Returns:
        A new DataFrame sharing ``result_df``'s index with two columns:
        ``text`` (the phrase) and ``prediction`` (a one-element list holding
        ``[topic, score]`` for that row). An empty input yields an empty frame
        with the same two columns.
    """
    # Built with zip rather than a row-wise DataFrame.apply: on an empty frame
    # apply(axis=1) hands back a DataFrame instead of a Series, which cannot
    # be stored in a single column.
    predictions = [
        [[topic, score]]
        for topic, score in zip(result_df['Topic'], result_df['Score'])
    ]
    return pd.DataFrame(
        {
            'text': result_df['Phrase'],
            'prediction': pd.Series(predictions, index=result_df.index, dtype=object),
        },
        columns=['text', 'prediction'],
    )

In [None]:
def custom_f1(data: Dict[str, float], title: str):
    """Draw a two-row bar figure from a mapping of metric names to values.

    Row 1 ("Overall Model Score") plots every entry whose key contains
    "macro" against the fixed x labels precision / recall / f1, in the order
    the entries appear in ``data``. Row 2 ("Model Score By Category") plots
    every entry whose key contains none of "macro", "micro" or "support";
    each run of three consecutive entries shares one colour drawn at random
    from plotly's qualitative palette.

    Args:
        data: Metric name -> value. The caller passes the ``'data'`` entry of
            the Argilla f1 metric result.
            # NOTE(review): assumes three entries per label — confirm.
        title: Figure title.

    Returns:
        The plotly figure.
    """
    import random

    import plotly.colors
    from plotly.subplots import make_subplots

    figure = make_subplots(
        rows=2,
        cols=1,
        subplot_titles=["Overall Model Score", "Model Score By Category"],
    )

    # Overall (macro-averaged) scores.
    macro_scores = [score for name, score in data.items() if "macro" in name]
    figure.add_bar(x=['precision', 'recall', 'f1'], y=macro_scores, row=1, col=1)

    # Per-label scores: everything that is not an aggregate or a support count.
    excluded_tags = ("macro", "micro", "support")
    label_scores = {}
    for name, score in data.items():
        if not any(tag in name for tag in excluded_tags):
            label_scores[name] = score

    label_count = len(label_scores) // 3
    palette = [str(shade) for shade in plotly.colors.qualitative.Plotly]
    chosen = random.sample(palette, label_count)
    bar_colors = [chosen[position // 3] for position in range(len(label_scores))]

    figure.add_bar(
        x=list(label_scores.keys()),
        y=list(label_scores.values()),
        row=2,
        col=1,
        marker_color=bar_colors,
    )
    figure.update_layout(showlegend=False, title_text=title)
    return figure

In [None]:
# Hex colour used for each topic label. The insertion order is also the order
# in which topics are iterated when results are displayed.
topic_color_dict = {
    'LIKES/DISLIKES': '#FFCCCC',
    'INTERESTS': '#CCFFFF',
    'HOBBIES': '#FF69B4',
    'RELATIONSHIPS': '#FFFF00',
    'EMPLOYMENT': '#CCCCFF',
    'HEALTH': '#FFCC99',
    'EDUCATION': '#CCCCCC',
    'NONE': '#F08080',
}

def color(df, color):
    return df.style.format({'Score': '{:,.2%}'.format}).bar(subset=['Score'], color=color)

def annotate_query(highlights, query, topics):
    """Locate each highlight inside ``query`` and return entity spans.

    Matching is case-insensitive. Ordinary characters of a highlight must
    match literally; every run of characters that ``re.escape`` would escape
    (whitespace and regex metacharacters such as ``.``, ``-``, ``?``) matches
    any run of non-word characters, so differences in spacing or punctuation
    do not prevent a match.

    Args:
        highlights: Phrases to look for.
        query: Text to search.
        topics: Label for each highlight, paired by position.

    Returns:
        A list of ``{"start", "end", "label"}`` dicts, one per non-empty match,
        in highlight order and then match order.
    """
    flexible_gap = r'\W*'
    ents = []
    for h, t in zip(highlights, topics):
        if not h:
            # An empty pattern would match at every position of the query.
            continue
        pieces = []
        for ch in h:
            if re.escape(ch) == ch:
                pieces.append(ch)
            elif not pieces or pieces[-1] != flexible_gap:
                # Built directly instead of rewriting the escaped text into a
                # character class: "^" would negate such a class and "\\"
                # would corrupt it. Consecutive gaps are merged into one.
                pieces.append(flexible_gap)
        pattern = ''.join(pieces)
        for match in re.finditer(pattern, query, re.IGNORECASE):
            if match.start() == match.end():
                # Punctuation-only highlights can match the empty string.
                continue
            ents.append({"start": match.start(), "end": match.end(), "label": t})
    return ents

def path_to_image_html(path):
    """Return an ``<img>`` tag (30x15) whose ``src`` is ``path``."""
    opening = '<img src="'
    closing = '" width="30" height="15" />'
    return ''.join([opening, path, closing])

# Minimum confidence score a phrase needs in order to count towards a strategy.
passing_score = 0.5
# A strategy is shown as used when its Final_Score is strictly greater than this.
final_passing = 0.0
def display_final_df(agg_df):
    """Display a centred table marking each strategy as used or not used.

    Strategies found in ``agg_df``'s index get a thumbs-up image when their
    ``Final_Score`` is greater than ``final_passing`` and a thumbs-down image
    otherwise. Strategies missing from the index are appended with a
    thumbs-down image. The ``display.max_colwidth`` pandas option is set to
    ``None`` before the table is rendered to HTML.
    """
    all_strategies = [
        'LIKES/DISLIKES',
        'INTERESTS',
        'HOBBIES',
        'RELATIONSHIPS',
        'EMPLOYMENT',
        'HEALTH',
        'EDUCATION',
    ]
    known = agg_df.index.tolist()
    scored = [name for name in all_strategies if name in known]

    icons = []
    for name in scored:
        passed = agg_df.loc[name, 'Final_Score'] > final_passing
        icons.append('./thumbs_up.png' if passed else './thumbs_down.png')
    table = pd.DataFrame({'Quality of Life Strategy': scored, 'USED': icons})

    missing = [name for name in all_strategies if name not in scored]
    if missing:
        unused = pd.DataFrame({
            'Quality of Life Strategy': missing,
            'USED': ['./thumbs_down.png'] * len(missing),
        })
        table = pd.concat([table, unused])

    table = table.set_index('Quality of Life Strategy')
    pd.set_option('display.max_colwidth', None)
    markup = table.to_html(
        classes=["align-center"],
        index=True,
        escape=False,
        formatters=dict(USED=path_to_image_html),
    )
    display(HTML('<div style="text-align: center;">' + markup + '</div>'))
    

### Please outline a set of strategies to implement that aim to enhance the focus person’s quality of life. 
#### These strategies are not related to the focus person’s target behaviours; they are strategies related to, for example, their likes/dislikes, interests, hobbies, relationships, employment, health and education.

In [None]:
#demo with Voila

def _make_action_button(caption):
    """Build one of the large green action buttons; the tooltip repeats the caption."""
    return widgets.Button(
        description=caption,
        disabled=False,
        button_style='success', # 'success', 'info', 'warning', 'danger' or ''
        tooltip=caption,
        icon='check',
        layout={'height': '70px', 'width': '250px'}
    )

# Free-text answer area and its label.
bhvr_label = widgets.Label(value='Please type your answer:')
bhvr_text_input = widgets.Textarea(
    value='',
    placeholder='Type your answer',
    description='',
    disabled=False,
    layout={'height': '300px', 'width': '90%'}
)

# The three actions: score, push to Argilla, evaluate.
bhvr_nlp_btn = _make_action_button('Score Answer')
bhvr_agr_btn = _make_action_button('Validate Data')
bhvr_eval_btn = _make_action_button('Evaluate Model')
btn_box = widgets.HBox(
    [bhvr_nlp_btn, bhvr_agr_btn, bhvr_eval_btn],
    layout={'width': '100%', 'height': '160%'}
)

# Output area that every handler renders into.
bhvr_outt = widgets.Output()
bhvr_outt.layout.height = '100%'
bhvr_outt.layout.width = '100%'
bhvr_box = widgets.VBox(
    [bhvr_text_input, btn_box, bhvr_outt],
    layout={'width': '100%', 'height': '160%'}
)

# State shared between the button handlers.
dataset_rg_name = 'pbsp-page4-quality-of-life-strategy-argilla-ds'
agrilla_df = None
annotated = False
def on_bhvr_button_next(b):
    """Score the typed answer and render the results into the output widget.

    The answer is sent to the model, the reply is parsed, and the phrases
    that reach ``passing_score`` under a strategy topic are shown as:
    highlighted text, one score table per topic, a coverage bar chart and the
    final used / not-used table. The scored phrases plus all ``'NONE'``
    phrases are stored in the module-level ``agrilla_df`` for the
    "Validate Data" step. If no phrase qualifies, the answer is printed
    unchanged and ``agrilla_df`` is left as it was.

    Args:
        b: The clicked button (unused).
    """
    global agrilla_df
    with bhvr_outt:
        clear_output()
        query = bhvr_text_input.value
        prompt = get_prompt(query)
        response = get_response_chatgpt(prompt)
        result_df = process_response(response, query)

        # Strategy topics are the configured labels other than NONE. Using
        # isin() also drops rows whose topic is missing or unknown, which
        # would otherwise fail the colour lookup below.
        strategy_topics = [t for t in topic_color_dict if t != 'NONE']
        sub_result_df = result_df[
            (result_df['Score'] >= passing_score)
            & (result_df['Topic'].isin(strategy_topics))
        ]
        sub_2_result_df = result_df[result_df['Topic'] == 'NONE']

        if sub_result_df.empty:
            print(query)
            return

        highlights = sub_result_df['Phrase'].tolist()
        highlight_topics = sub_result_df['Topic'].tolist()
        ents = annotate_query(highlights, query, highlight_topics)
        # Colour each entity by its own label. Pairing `ents` with
        # `highlight_topics` by position is wrong because a phrase may match
        # zero or several times, so the two lists do not line up.
        colors = {ent['label']: topic_color_dict[ent['label']] for ent in ents}

        ex = [{"text": query,
               "ents": ents,
               "title": None}]
        title = "Quality of Life Strategy Highlights"
        display(HTML(f'<center><h1>{title}</h1></center>'))
        # With jupyter=True displaCy displays the markup itself and returns
        # None, so the return value must not be displayed a second time.
        displacy.render(ex, style="ent", manual=True, jupyter=True, options={'colors': colors})

        title = "Quality of Life Strategy Classifications"
        display(HTML(f'<center><h1>{title}</h1></center>'))
        for top in topic_color_dict.keys():
            top_result_df = sub_result_df[sub_result_df['Topic'] == top]
            if top_result_df.empty:
                continue
            top_result_df = top_result_df.sort_values(by='Score', ascending=False).reset_index(drop=True)
            top_result_df = top_result_df.set_index('Phrase')
            top_result_df = top_result_df[['Score']]
            display(HTML(
                f'<left><h2 style="text-decoration: underline; text-decoration-color:{topic_color_dict[top]};">{top}</h2></left>'))
            display(color(top_result_df, topic_color_dict[top]))

        # Share of the total confidence mass per topic, as a percentage.
        agg_df = sub_result_df.groupby('Topic')['Score'].sum().to_frame()
        agg_df.index.name = 'Topic'
        agg_df.columns = ['Total Score']
        agg_df = agg_df.assign(
            Final_Score=lambda x: x['Total Score'] / x['Total Score'].sum() * 100.00
        )
        agg_df = agg_df.sort_values(by='Final_Score', ascending=False)
        title = "Quality of Life Strategy Coverage"
        display(HTML(f'<center><h1>{title}</h1></center>'))
        agg_df['Topic'] = agg_df.index
        # Topics without any qualifying phrase are charted with a zero score.
        present_topics = agg_df.Topic.tolist()
        rem_topics = [x for x in topic_color_dict if x not in present_topics]
        if rem_topics:
            rem_agg_df = pd.DataFrame({'Topic': rem_topics, 'Final_Score': 0.0, 'Total Score': 0.0})
            agg_df = pd.concat([agg_df, rem_agg_df])
        labels = agg_df['Final_Score'].round(1).astype('str') + '%'
        ax = agg_df.plot.bar(x='Topic', y='Final_Score', rot=0, figsize=(20, 5), align='center')
        for container in ax.containers:
            ax.bar_label(container, labels=labels)
        # Axis cosmetics do not depend on the container, so set them once.
        ax.yaxis.set_major_formatter(mtick.PercentFormatter())
        ax.legend(["Final Score (%)"])
        ax.set_xlabel('')
        plt.show()

        title = "Final Scores"
        display(HTML(f'<left><h1>{title}</h1></left>'))
        display_final_df(agg_df)

        # Keep the NONE phrases as well so they can be annotated later.
        if not sub_2_result_df.empty:
            sub_result_df = pd.concat([sub_result_df, sub_2_result_df]).reset_index(drop=True)
        agrilla_df = sub_result_df.copy()
            
def on_agr_button_next(b):
    """Publish the most recently scored phrases to Argilla for annotation.

    When no answer has been scored yet (``agrilla_df`` is ``None``) a red
    reminder is displayed instead. Otherwise the stored frame is converted,
    the dataset named by ``dataset_rg_name`` is deleted and logged again in the
    "admin" workspace, its label schema is set to every key of
    ``topic_color_dict``, and the module-level ``annotated`` flag is set.

    Args:
        b: The clicked button (unused).
    """
    global agrilla_df, annotated
    with bhvr_outt:
        clear_output()
        if agrilla_df is None:
            display(Markdown("<h2 style='color:red; text-align:center;'>Please score the answer first!</h2>"))
            return

        target_workspace = "admin"
        # Reshape to the text/prediction layout, then wrap as an Argilla dataset.
        records = rg.DatasetForTextClassification.from_pandas(convert_df(agrilla_df))
        # Replace any previous upload with the fresh records.
        rg.delete(dataset_rg_name, workspace=target_workspace)
        rg.log(records, name=dataset_rg_name, workspace=target_workspace)
        # Make every topic label available for annotation.
        label_settings = rg.TextClassificationSettings(label_schema=list(topic_color_dict.keys()))
        rg.configure_dataset(name=dataset_rg_name, workspace=target_workspace, settings=label_settings)
        annotated = True
            
def on_eval_button_next(b):
    """Display the f1 metric chart for the Argilla dataset.

    If the data has not been pushed to Argilla yet (``annotated`` is false) a
    red reminder is displayed instead.

    Args:
        b: The clicked button (unused).
    """
    global annotated
    with bhvr_outt:
        clear_output()
        if not annotated:
            display(Markdown("<h2 style='color:red; text-align:center;'>Please score the answer and validate the data first!</h2>"))
            return
        metric_summary = dict(f1(dataset_rg_name))
        display(custom_f1(metric_summary['data'], "Model Evaluation Results"))

# Attach each button to its click handler, then show the form.
for button, handler in (
    (bhvr_nlp_btn, on_bhvr_button_next),
    (bhvr_agr_btn, on_agr_button_next),
    (bhvr_eval_btn, on_eval_button_next),
):
    button.on_click(handler)

display(bhvr_label, bhvr_box)