# NDIS Project - OpenAI - PBSP Scoring - Page 3 - Formulation (Contextual Factors)

In [None]:
import json
import os
import re
import string
import warnings

import argilla as rg
import ipywidgets as widgets
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import openai
import pandas as pd
import spacy
from argilla.metrics.text_classification import f1
from dotenv import load_dotenv
from IPython.display import display, clear_output, Javascript, HTML, Markdown
from ipywidgets import interact
from spacy import displacy
# Suppress every Python warning in the notebook output.
warnings.filterwarnings('ignore')
# Draw matplotlib figures inline in the notebook.
%matplotlib inline
# Raise the pandas display limits so long phrases and wide tables are shown in full.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_colwidth', 10000)
pd.set_option('display.width', 10000)

In [None]:
#initializations
# Read a local .env file (when one exists) into the process environment so the
# settings below can be supplied there. Variables that are already set in the
# environment are not overridden.
load_dotenv()

# OpenAI client configuration. Every lookup raises KeyError when the
# corresponding environment variable is missing.
# NOTE(review): api_type/api_version plus a deployment id suggest an Azure
# OpenAI resource - confirm.
openai.api_key = os.environ['API_KEY']
openai.api_base = os.environ['API_BASE']
openai.api_type = os.environ['API_TYPE']
openai.api_version = os.environ['API_VERSION']
deployment_name = os.environ['DEPLOYMENT_ID']

#argilla
# Connect the Argilla client used by the "Validate Data" / "Evaluate Model" buttons.
rg.init(
    api_url=os.environ["ARGILLA_API_URL"],
    api_key=os.environ["ARGILLA_API_KEY"]
)

In [None]:
#sentence extraction
def extract_sentences(paragraph):
    """Split ``paragraph`` into trimmed, non-empty fragments.

    The text is cut at every full stop, exclamation mark, question mark,
    semicolon, colon, comma, underscore, newline and hyphen. Fragments that
    are empty or whitespace-only after trimming are discarded.

    Args:
        paragraph: The text to split.

    Returns:
        A list of fragments in their original order.
    """
    delimiters = ('\\.', '!', '\\?', ';', ':', ',', '\\_', '\n', '\\-')
    splitter = '|'.join(delimiters)
    fragments = []
    for piece in re.split(splitter, paragraph):
        trimmed = piece.strip()
        if trimmed:
            fragments.append(trimmed)
    return fragments

In [None]:
def filter_dataframe(result_df, paragraph):
    """Keep only the rows whose phrase really occurs in ``paragraph``.

    A phrase is kept when both conditions hold:

    * After lower-casing and removing ASCII punctuation and typographic
      quotes, it is a non-empty substring of the paragraph normalised the
      same way. Straight and curly apostrophes therefore compare equal.
    * At least 20% of its distinct (lower-cased) characters also appear in
      the paragraph.

    When no phrase qualifies, all rows of ``result_df`` are returned instead.
    Exact duplicate rows are removed from the result. ``result_df`` itself is
    never modified.

    Args:
        result_df: DataFrame with a string column ``'Phrase'``.
        paragraph: The text the phrases were extracted from.

    Returns:
        A new DataFrame with the same columns as ``result_df``.
    """
    # Built once: the same table normalises the paragraph and every phrase.
    strip_table = str.maketrans("", "", string.punctuation + "‘’“”")

    def _normalise(text):
        return text.lower().translate(strip_table)

    norm_paragraph = _normalise(paragraph)
    paragraph_chars = set(paragraph.lower())

    def _keep(phrase):
        norm_phrase = _normalise(phrase)
        # A blank phrase is a substring of anything and would also divide by
        # zero below, so it is rejected up front.
        if not norm_phrase.strip():
            return False
        if norm_phrase not in norm_paragraph:
            return False
        phrase_chars = set(phrase.lower())
        return len(phrase_chars & paragraph_chars) / len(phrase_chars) >= 0.2

    # astype(bool) keeps the mask a boolean indexer even for a zero-row frame.
    mask = result_df['Phrase'].map(_keep).astype(bool)
    filtered_df = result_df[mask]
    if filtered_df.empty:
        # Nothing matched: keep every phrase rather than return nothing.
        filtered_df = result_df
    return filtered_df.drop_duplicates()

In [None]:
def _unclassified_frame(query):
    """Return every fragment of ``query`` labelled 'NONE' with a score of 0.9."""
    sentences = extract_sentences(query)
    return pd.DataFrame({
        'Phrase': sentences,
        'Topic': ['NONE'] * len(sentences),
        'Score': [0.9] * len(sentences),
    })


def process_response(response, query):
    """Parse the model's answer into a DataFrame of phrases, topics and scores.

    The answer is read line by line. A line containing
    ``(Confidence Score:`` is a phrase line: the text before the marker is
    the phrase (leading list numbering and surrounding double quotes are
    removed) and the number after it is the score. Any other line that
    contains one of the section titles ("Background factors:",
    "Contributing factors:", "Sustaining factors:", "Strength factors:",
    "None:") switches the topic applied to the phrase lines that follow.
    Phrases that appear before any title are labelled 'NONE'.

    The parsed rows are then passed through ``filter_dataframe``. When
    nothing could be parsed, or filtering fails, the function falls back to
    the fragments returned by ``extract_sentences(query)``, all labelled
    'NONE' with a score of 0.9.

    Args:
        response: Raw text returned by the model.
        query: The paragraph that was sent to the model.

    Returns:
        A DataFrame with the columns ``'Phrase'``, ``'Topic'`` and ``'Score'``.
    """
    section_titles = (
        ("Background factors:", "BACKGROUND"),
        ("Contributing factors:", "CONTRIBUTING"),
        ("Sustaining factors:", "SUSTAINING"),
        ("Strength factors:", "STRENGTH"),
        ("None:", "NONE"),
    )
    score_marker = "(Confidence Score:"
    sentences = []
    topics = []
    scores = []
    topic = "NONE"
    for line in response.strip().split("\n"):
        if score_marker not in line:
            # Only score-less lines may act as section headers; a phrase that
            # merely quotes a title must not switch the topic.
            for title, label in section_titles:
                if title in line:
                    topic = label
                    break
            continue
        raw_phrase, _, raw_score = line.rpartition(score_marker)
        # Take the leading number only, so "0.95)." still parses.
        score_match = re.match(r'\s*(\d*\.?\d+)', raw_score)
        if score_match is None:
            continue
        # Remove list numbering at the start only; digits inside the phrase
        # (e.g. "3.5") must survive so the phrase still matches the paragraph.
        phrase = re.sub(r'^\s*\d+\.\s*', '', raw_phrase).strip().strip('"')
        if not phrase:
            continue
        sentences.append(phrase)
        topics.append(topic)
        scores.append(float(score_match.group(1)))

    if not sentences:
        return _unclassified_frame(query)

    result_df = pd.DataFrame({'Phrase': sentences, 'Topic': topics, 'Score': scores})
    try:
        return filter_dataframe(result_df, query)
    except Exception:
        # Deliberate best effort: a failure while filtering must not break
        # scoring, so every fragment is reported as unclassified instead.
        return _unclassified_frame(query)

In [None]:
def get_prompt(query):
    """Build the classification prompt for one formulation paragraph.

    The prompt embeds ``query`` and asks the model to list the phrases that
    represent background, contributing, sustaining and strength factors,
    plus a final "None:" list, each phrase followed by
    ``(Confidence Score: <number>)``. Every phrase line of the example
    answer carries that suffix, matching the format requested above it.

    Args:
        query: The paragraph to analyse.

    Returns:
        The full prompt text.
    """
    prompt = f"""
    The paragraph below is written in a positive behaviour support plan by a disability practitioner to present a formulation regarding the target behaviours of the person with disability. This formulation summarises the functional behavioural assessment and provides a summary of the contextual factors impacting the target behaviours of the person with disability.

    Paragraph:
    {query}

    Requirement:
    - Identify the phrases from the paragraph above that represent each of the following contextual factors: "Background factors", "Contributing factors", "Sustaining factors", "Strength factors". 

    Guidelines:
    - "Background factors": The background (predisposing) factors refer to the factors that make the person with disability more likely to exhibit the target behaviours. Examples of background factors include: diagnosed disabilities, a diagnosed health conditions, previous history of trauma, previous history of mental health issues, aspects of family and relationships, inappropriate service delivery, and alike. 

    - "Contributing factors": The contributing (precipitating) factors refer to the factors that contribute to why is the person with disability is exhibiting the target behaviours now. Example of a contributing factor: a child with autism who engages in self-injurious behavior may be triggered by a noisy and crowded environment, such as a busy shopping mall, which can lead to the behavior occurring. Another example is a person with sensory processing disorder who engages in repetitive behaviors may do so in response to sensory overload, such as bright lights, loud noises, or certain textures.

    - "Sustaining factors": The sustaining (perpetuating) factors refer to the factors that support the continuation of the target behaviours exhibited by the person with disability. Example of a sustaining factor: a child who throws tantrums to gain attention from caregivers may be inadvertently reinforced by the attention they receive, leading to the behavior persisting. Another example is a student who engages in disruptive behavior in the classroom may do so to avoid a task they find challenging or aversive, and this avoidance may reinforce the behavior, making it more likely to continue.

    - "Strength factors": The strength (protective) factors refer to the factors within the person with disability’s context that set them up to succeed. Examples of strength factors include: skills which the person with disability possesses, having skilled and stable support team, having family supports, having friendships, education, employment, and alike. 

    Specifications of a correct answer:
    - Please provide a response that closely matches the information in the paragraph and does not deviate significantly from it.
    - Provide your answer in numbered lists. 
    - All the phrases in your answer must be exact substrings in the original paragraph. without changing any characters.
    - All the upper case and lower case characters in the phrases in your answer must match the upper case and lower case characters in the original paragraph.
    - Start numbering the phrases under each contextual factor from number 1. 
    - Start each list of phrases with these titles: "Background factors", "Contributing factors", "Sustaining factors", "Strength factors".
    - For each phrase that belongs to any of the above contextual factors, provide a confidence score that ranges between 0.50 and 1.00, where a score of 0.50 means you are very weakly confident that the phrase belongs to that specific contextual factor, whereas a score of 1.00 means you are very strongly confident that the phrase belongs to that specific contextual factor.
    - Never include any phrase in your answer that does not exist in the paragraph above.
    - If none of the phrases in the paragraph belongs to a contextual factor, do not include this contextual factor in your answer.
    - Include a final numbered list titled "None:", which include all the remaining phrases from the paragraph above that do not belong to any of the contextual factors above. Provide a confidence score for each of these phrases as well.

    Example answer:

    Background factors:
    1. Eddie's genetic condition, Down syndrome, which has genetic predispositions that influence Eddie's behavior and communication abilities. (Confidence Score: 1.00)
    2. Taylor has experienced past trauma, abuse, and neglect, which may have underlying emotional factors that have impacted his behavior. (Confidence Score: 0.95)
    
    Contributing factors:
    1. Eddie's self-injurious behavior is triggered by a noisy and crowded environment, such as a busy shopping mall, which can lead to the behavior occurring. (Confidence Score: 1.00)
    2. Taylor's aggression behavior is triggered when feeling overwhelmed or threatened, such as during a confrontational situation with a peer or authority figure. (Confidence Score: 0.92)
    
    Sustaining factors:
    1. Eddie's disruptive behavior in the classroom may occur to avoid a task he finds challenging, and his avoidance may reinforce his behavior, making it more likely to continue. (Confidence Score: 0.94)
    2. When Taylor throws tantrums to gain attention from caregivers, his behaviour is inadvertently reinforced by the attention he receives from them, leading to the behavior persisting. (Confidence Score: 0.90)

    Strength factors:
    1. Eddie has strong and nurturing relationship with his caregiver. This can foster a sense of trust, security, and stability, which may contribute to his positive behavior. (Confidence Score: 1.00)
    2. Taylor has a consistent daily schedule, visual supports and social stories, which can help in better understanding his autism expectations and in navigating his environment more effectively. (Confidence Score: 0.91)

    None:
    1. <remaining phrase from the paragraph goes here>. (Confidence Score: <your score goes here>)
    2. <remaining phrase from the paragraph goes here>. (Confidence Score: <your score goes here>)
    """
    return prompt

In [None]:
def get_response_chatgpt(prompt):
    """Send ``prompt`` to the configured chat deployment and return its reply.

    The request uses a fixed system message, passes ``prompt`` as the single
    user message and sets the temperature to 0.

    Args:
        prompt: The user message to send.

    Returns:
        The content of the first choice in the response.
    """
    conversation = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ]
    completion = openai.ChatCompletion.create(
        engine=deployment_name,
        messages=conversation,
        temperature=0,
    )
    first_choice = completion["choices"][0]
    return first_choice["message"]["content"]

In [None]:
def convert_df(result_df):
    """Reshape scored phrases into ``text`` / ``prediction`` columns.

    ``text`` is the phrase; ``prediction`` is a one-element list holding
    ``[Topic, Score]`` for that row. The row index of ``result_df`` is kept.
    The columns are built column-wise rather than with a row-wise ``apply``
    so that a frame with zero rows converts to an empty frame instead of
    failing.

    Args:
        result_df: DataFrame with the columns ``'Phrase'``, ``'Topic'`` and
            ``'Score'``.

    Returns:
        A new DataFrame with the columns ``'text'`` and ``'prediction'``.
    """
    predictions = [
        [[topic, score]]
        for topic, score in zip(result_df['Topic'], result_df['Score'])
    ]
    new_df = pd.DataFrame({'text': result_df['Phrase']})
    # dtype=object keeps every nested list as a single cell value.
    new_df['prediction'] = pd.Series(predictions, index=result_df.index, dtype=object)
    return new_df

In [None]:
#query = """
#After conducting a thorough functional behavioral assessment of Eddie, it appears that his target behaviors stem from a variety of contextual factors. Eddie is a 25-year-old man with intellectual disability and autism spectrum disorder, who exhibits a range of challenging behaviors including self-injury, aggression, and property destruction. The formulation of Eddie's behaviors includes the following contextual factors:

#Background factors: Eddie's background factors suggest that he has experienced significant adversity and trauma throughout his life. Eddie was adopted at the age of four after experiencing neglect and abuse in his early years. He has a history of being hospitalized for self-injury and aggression, and he struggles with anxiety and sensory processing issues. These background factors may have predisposed Eddie to exhibit the target behaviors he displays.

#Contributing factors: The contributing factors to Eddie's behaviors are multi-faceted. Eddie struggles with communication and has limited expressive language skills, which makes it difficult for him to express his needs and desires effectively. Additionally, Eddie experiences significant sensory sensitivities that can trigger his challenging behaviors, such as when he is exposed to loud noises or crowded environments. Furthermore, Eddie's history of being restrained and secluded in hospital settings has led to trauma and anxiety surrounding certain stimuli, which can contribute to his behaviors.

#Sustaining factors: The sustaining factors that maintain Eddie's challenging behaviors include a lack of appropriate coping strategies and supports. Eddie has limited access to positive reinforcement, socialization opportunities, and leisure activities, which can lead to boredom and frustration. In addition, Eddie's challenging behaviors have inadvertently led to increased attention from staff and caregivers, which can reinforce the behaviors.

#Strength factors: Despite Eddie's challenges, he has several protective factors that can be built upon to support positive behaviors. Eddie is generally cooperative and compliant, and he responds well to structure and routine. Eddie has a strong desire to engage in meaningful activities, and he has a talent for art and music. Furthermore, Eddie has a supportive family who is invested in his success.
#"""
#prompt = get_prompt(query)
#response = get_response_chatgpt(prompt)
#result_df = process_response(response, query)
#result_df

In [None]:
# Highlight colour (hex) for each contextual-factor label. The key order is
# also the order in which the per-topic score tables are rendered.
topic_color_dict = {
    'BACKGROUND': '#FFCCCC',
    'CONTRIBUTING': '#CCFFFF',
    'SUSTAINING': '#FF69B4',
    'STRENGTH': '#FFFF00',
    'NONE': '#CCCCCC',
}

def color(df, color):
    """Style ``df`` so its ``'Score'`` column shows as a percentage with bars.

    Args:
        df: DataFrame containing a numeric ``'Score'`` column.
        color: Bar colour passed to the Styler's ``bar`` method.

    Returns:
        The pandas Styler for ``df``.
    """
    styler = df.style
    styler = styler.format({'Score': '{:,.2%}'.format})
    return styler.bar(subset=['Score'], color=color)

def annotate_query(highlights, query, topics):
    """Locate each highlighted phrase in ``query`` and label the matches.

    Matching is case-insensitive and ignores punctuation and spacing: a
    phrase is reduced to its runs of word characters, and any number of
    non-word characters (including none) may sit between consecutive runs in
    ``query``. A span starts at the first and ends at the last word character
    of the match. Phrases without any word character are skipped, so no
    zero-length spans are produced.

    Args:
        highlights: Phrases to look for.
        query: The text to search.
        topics: One label per phrase, aligned with ``highlights``.

    Returns:
        A list of ``{"start", "end", "label"}`` dicts, one per match, in
        phrase order and then match order.
    """
    ents = []
    for phrase, topic in zip(highlights, topics):
        words = re.findall(r'\w+', phrase)
        if not words:
            # Empty or punctuation-only phrase: nothing meaningful to find.
            continue
        # Built from the word runs directly so that every punctuation mark is
        # optional and no character of the phrase can act as regex syntax.
        pattern = r'\W*'.join(re.escape(word) for word in words)
        for match in re.finditer(pattern, query, re.IGNORECASE):
            ents.append({"start": match.start(), "end": match.end(), "label": topic})
    return ents

def path_to_image_html(path):
    """Return an ``<img>`` tag (30x15) whose source is ``path``."""
    pieces = ('<img src="', path, '" width="30" height="15" />')
    return ''.join(pieces)

# Minimum confidence a classified phrase needs to be highlighted and scored.
passing_score = 0.5
# A contextual factor gets a thumbs-up when its Final_Score is strictly above this.
final_passing = 0.0
def display_final_df(agg_df):
    """Show a thumbs-up / thumbs-down table for the four contextual factors.

    Factors found in the index of ``agg_df`` are listed first; each gets a
    thumbs-up image when its ``'Final_Score'`` is greater than
    ``final_passing`` and a thumbs-down image otherwise. Factors missing from
    the index follow with a thumbs-down image. The table is rendered as
    centred HTML, and the pandas ``display.max_colwidth`` option is set to
    ``None`` along the way.

    Args:
        agg_df: DataFrame holding a ``'Final_Score'`` column, with the scored
            factors as index labels.
    """
    all_factors = [
        'BACKGROUND',
        'CONTRIBUTING',
        'SUSTAINING',
        'STRENGTH',
    ]
    scored_labels = agg_df.index.tolist()
    covered = [factor for factor in all_factors if factor in scored_labels]

    icons = []
    for factor in covered:
        if agg_df.loc[factor, 'Final_Score'] > final_passing:
            icons.append('./thumbs_up.png')
        else:
            icons.append('./thumbs_down.png')
    table = pd.DataFrame({'Contextual Factor': covered, 'USED': icons})

    missing = [factor for factor in all_factors if factor not in covered]
    if missing:
        absent_rows = pd.DataFrame({
            'Contextual Factor': missing,
            'USED': ['./thumbs_down.png'] * len(missing),
        })
        table = pd.concat([table, absent_rows])

    table = table.set_index('Contextual Factor')
    # Without this the <img> markup in the USED column could be truncated.
    pd.set_option('display.max_colwidth', None)
    table_html = table.to_html(
        classes=["align-center"],
        index=True,
        escape=False,
        formatters=dict(USED=path_to_image_html),
    )
    display(HTML('<div style="text-align: center;">' + table_html + '</div>'))
    

### Please provide a formulation regarding the focus person’s target behaviour(s)

In [None]:
#demo with Voila

def _make_action_button(caption):
    """Build one large green action button; ``caption`` is its label and tooltip."""
    return widgets.Button(
        description=caption,
        disabled=False,
        button_style='success', # 'success', 'info', 'warning', 'danger' or ''
        tooltip=caption,
        icon='check',
        layout={'height': '70px', 'width': '250px'}
    )

# Free-text input for the practitioner's formulation.
bhvr_label = widgets.Label(value='Please type your answer:')
bhvr_text_input = widgets.Textarea(
    value='',
    placeholder='Type your answer',
    description='',
    disabled=False,
    layout={'height': '300px', 'width': '90%'}
)

# One button per step: score, push to Argilla, evaluate.
bhvr_nlp_btn = _make_action_button('Score Answer')
bhvr_agr_btn = _make_action_button('Validate Data')
bhvr_eval_btn = _make_action_button('Evaluate Model')
btn_box = widgets.HBox(
    [bhvr_nlp_btn, bhvr_agr_btn, bhvr_eval_btn],
    layout={'width': '100%', 'height': '160%'}
)

# All handler output is rendered into this area below the buttons.
bhvr_outt = widgets.Output()
bhvr_outt.layout.height = '100%'
bhvr_outt.layout.width = '100%'
bhvr_box = widgets.VBox(
    [bhvr_text_input, btn_box, bhvr_outt],
    layout={'width': '100%', 'height': '160%'}
)

# State shared between the button handlers.
dataset_rg_name = 'pbsp-page3-formulation-argilla-ds'
agrilla_df = None   # last scored result; set by the "Score Answer" handler
annotated = False   # set to True once the data has been pushed to Argilla
def on_bhvr_button_next(b):
    """Score the typed formulation and render highlights, tables and a chart.

    Click handler for the "Score Answer" button. It sends the text to the
    model, parses the answer and, when at least one phrase was classified
    with a score of ``passing_score`` or more, displays:

    * the text with the classified phrases highlighted per topic,
    * one score table per topic,
    * a bar chart of each topic's share of the total score,
    * the thumbs-up / thumbs-down summary.

    The classified rows plus the 'NONE' rows are then stored in the global
    ``agrilla_df``. When nothing was classified, the text is printed back
    unchanged. Blank input only shows a reminder; no request is sent.

    Args:
        b: The clicked button (unused).
    """
    global agrilla_df
    with bhvr_outt:
        clear_output()
        query = bhvr_text_input.value
        if not query.strip():
            # Nothing to score, so skip the model call altogether.
            display(Markdown("<h2 style='color:red; text-align:center;'>Please type your answer first!</h2>"))
            return
        prompt = get_prompt(query)
        response = get_response_chatgpt(prompt)
        result_df = process_response(response, query)
        sub_result_df = result_df[(result_df['Score'] >= passing_score) & (result_df['Topic'] != 'NONE')]
        sub_2_result_df = result_df[result_df['Topic'] == 'NONE']
        if len(sub_result_df) == 0:
            print(query)
            return

        highlights = sub_result_df['Phrase'].tolist()
        highlight_topics = sub_result_df['Topic'].tolist()
        ents = annotate_query(highlights, query, highlight_topics)
        # NOTE(review): displaCy's manual entity renderer appears to expect
        # ordered, non-overlapping, non-empty spans - confirm against the
        # spaCy docs. Keep the earliest (and, on ties, longest) span and drop
        # anything that overlaps it.
        visible_ents = []
        cursor = 0
        for ent in sorted(ents, key=lambda e: (e['start'], -e['end'])):
            if ent['end'] > ent['start'] >= cursor:
                visible_ents.append(ent)
                cursor = ent['end']
        # Colour by the entity's own label. One phrase can match zero or many
        # times, so entities do not line up one-to-one with the phrase list.
        colors = {
            ent['label']: topic_color_dict[ent['label']]
            for ent in visible_ents
            if ent['label'] in topic_color_dict
        }

        ex = [{"text": query,
               "ents": visible_ents,
               "title": None}]
        title = "Contextual Factor Highlights"
        display(HTML(f'<center><h1>{title}</h1></center>'))
        # With jupyter=False render() returns the markup, so it is displayed
        # exactly once here.
        html = displacy.render(ex, style="ent", manual=True, jupyter=False, options={'colors': colors})
        # NOTE(review): the tex2jax_ignore wrapper mirrors what displaCy adds
        # in Jupyter mode to keep MathJax off the text - confirm.
        display(HTML(f'<span class="tex2jax_ignore">{html}</span>'))

        title = "Contextual Factor Classifications"
        display(HTML(f'<center><h1>{title}</h1></center>'))
        for top in topic_color_dict.keys():
            top_result_df = sub_result_df[sub_result_df['Topic'] == top]
            if len(top_result_df) > 0:
                top_result_df = top_result_df.sort_values(by='Score', ascending=False).reset_index(drop=True)
                top_result_df = top_result_df.set_index('Phrase')
                top_result_df = top_result_df[['Score']]
                display(HTML(
                    f'<left><h2 style="text-decoration: underline; text-decoration-color:{topic_color_dict[top]};">{top}</h2></left>'))
                display(color(top_result_df, topic_color_dict[top]))

        # Each topic's share (in percent) of the summed confidence scores.
        agg_df = sub_result_df.groupby('Topic')['Score'].sum()
        agg_df = agg_df.to_frame()
        agg_df.index.name = 'Topic'
        agg_df.columns = ['Total Score']
        agg_df = agg_df.assign(
            Final_Score=lambda x: x['Total Score'] / x['Total Score'].sum() * 100.00
        )
        agg_df = agg_df.sort_values(by='Final_Score', ascending=False)
        title = "Contextual Factor Coverage"
        display(HTML(f'<center><h1>{title}</h1></center>'))
        agg_df['Topic'] = agg_df.index
        # Topics without any phrase still get a (zero) bar in the chart.
        rem_topics = [x for x in list(topic_color_dict.keys()) if x not in agg_df.Topic.tolist()]
        if len(rem_topics) > 0:
            rem_agg_df = pd.DataFrame({'Topic': rem_topics, 'Final_Score': 0.0, 'Total Score': 0.0})
            agg_df = pd.concat([agg_df, rem_agg_df])
        labels = agg_df['Final_Score'].round(1).astype('str') + '%'
        ax = agg_df.plot.bar(x='Topic', y='Final_Score', rot=0, figsize=(20, 5), align='center')
        for container in ax.containers:
            ax.bar_label(container, labels=labels)
            ax.yaxis.set_major_formatter(mtick.PercentFormatter())
            ax.legend(["Final Score (%)"])
            ax.set_xlabel('')
        plt.show()
        title = "Final Scores"
        display(HTML(f'<left><h1>{title}</h1></left>'))
        display_final_df(agg_df)

        # Keep the unclassified rows too, so they can be annotated later.
        if len(sub_2_result_df) > 0:
            sub_result_df = pd.concat([sub_result_df, sub_2_result_df]).reset_index(drop=True)
        agrilla_df = sub_result_df.copy()
            
def on_agr_button_next(b):
    """Upload the last scored result to Argilla for annotation.

    Click handler for the "Validate Data" button. When a scored result is
    available in the global ``agrilla_df`` it is converted with
    ``convert_df``, the dataset named ``dataset_rg_name`` is deleted and
    logged again in the "admin" workspace, the dataset is configured with
    every topic label, and the global ``annotated`` flag is set. Otherwise a
    reminder to score first is displayed.

    Args:
        b: The clicked button (unused).
    """
    global agrilla_df, annotated
    with bhvr_outt:
        clear_output()
        if agrilla_df is None:
            display(Markdown("<h2 style='color:red; text-align:center;'>Please score the answer first!</h2>"))
            return
        # Reshape to the text/prediction layout, then wrap it as an Argilla
        # text-classification dataset.
        records_df = convert_df(agrilla_df)
        rg_dataset = rg.DatasetForTextClassification.from_pandas(records_df)
        # Replace whatever was uploaded under this name before.
        rg.delete(dataset_rg_name, workspace="admin")
        rg.log(rg_dataset, name=dataset_rg_name, workspace="admin")
        # Offer every topic label to annotators, not only the predicted ones.
        label_settings = rg.TextClassificationSettings(label_schema=list(topic_color_dict.keys()))
        rg.configure_dataset(name=dataset_rg_name, workspace="admin", settings=label_settings)
        annotated = True
            
def on_eval_button_next(b):
    """Display the F1 metric visualisation for the uploaded dataset.

    Click handler for the "Evaluate Model" button. The metric is only shown
    once the global ``annotated`` flag has been set; otherwise a reminder to
    score and validate first is displayed.

    Args:
        b: The clicked button (unused).
    """
    global annotated
    with bhvr_outt:
        clear_output()
        if not annotated:
            display(Markdown("<h2 style='color:red; text-align:center;'>Please score the answer and validate the data first!</h2>"))
            return
        metric_summary = f1(dataset_rg_name)
        display(metric_summary.visualize())

# Attach each click handler to its button, then show the form.
for button, handler in (
    (bhvr_nlp_btn, on_bhvr_button_next),
    (bhvr_agr_btn, on_agr_button_next),
    (bhvr_eval_btn, on_eval_button_next),
):
    button.on_click(handler)

display(bhvr_label, bhvr_box)