import html

from emoji import demojize, is_emoji
from nltk.tokenize import TweetTokenizer

def clear_reply_mentions(tweet):
    '''Remove the leading user mentions that Twitter prepends to a reply.

    Example: "@user1 @user2 okay @user3" -> "okay @user3"
    '''
    # No need for sophisticated tokenization (e.g. nltk) here; splitting on spaces is enough.
    tokens = tweet.split(" ")
    for index in range(len(tokens)):
        if not tokens[index].startswith("@"):
            return " ".join(tokens[index:])
    return ""


tweet_tokenizer = TweetTokenizer()

def normalizeToken(token, emojis_found=None, replace_user_mentions=True, replace_urls=True, demojize_emojis=True):
    # Avoid the mutable default argument pitfall: a shared default list would keep accumulating emojis across calls.
    if emojis_found is None:
        emojis_found = []
    lowercased_token = token.lower()
    if token.startswith("@") and replace_user_mentions:
        return "@USER"
    elif (lowercased_token.startswith("http") or lowercased_token.startswith("www")) and replace_urls:
        return "HTTPURL"
    elif len(token) == 1 and is_emoji(token):
        emojis_found.append(token)
        if demojize_emojis:
            return demojize(token)
        else:
            return token
    else:
        if token == "’":
            return "'"
        elif token == "…":
            return "..."
        else:
            return token
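
# Illustrative per-token behaviour (expected values; the exact demojize alias depends on the
# installed `emoji` package version):
#   normalizeToken("@someone")         -> "@USER"
#   normalizeToken("https://t.co/abc") -> "HTTPURL"
#   found = []; normalizeToken("😂", emojis_found=found) -> ":face_with_tears_of_joy:"  (and found == ["😂"])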


def normalizeTweet(tweet, tokenizer=tweet_tokenizer, replace_user_mentions=True, replace_urls=True, demojize_emojis=True, bert_tweet_specific_processing=True):
    emojis_found = []
    tokens = tokenizer.tokenize(tweet.replace("’", "'").replace("…", "..."))
    normTweet = " ".join([normalizeToken(token, emojis_found=emojis_found, 
                                         replace_user_mentions=replace_user_mentions, 
                                         replace_urls=replace_urls,
                                         demojize_emojis=demojize_emojis) for token in tokens])

    if bert_tweet_specific_processing:
        # Split the negation clitic "n't" off its verb (e.g. "don't" -> "do n't"),
        # except for "can't" and "ain't", which are restored to a single token.
        normTweet = (
            normTweet.replace("cannot ", "can not ")
            .replace("n't ", " n't ")
            .replace("n 't ", " n't ")
            .replace("ca n't", "can't")
            .replace("ai n't", "ain't")
        )
        # Split off the common clitics ('m, 're, 's, 'll, 'd, 've) as separate tokens.
        normTweet = (
            normTweet.replace("'m ", " 'm ")
            .replace("'re ", " 're ")
            .replace("'s ", " 's ")
            .replace("'ll ", " 'll ")
            .replace("'d ", " 'd ")
            .replace("'ve ", " 've ")
        )
        # Re-join "a.m."/"p.m." if the tokenizer split them into separate characters.
        normTweet = (
            normTweet.replace(" p . m .", " p.m.")
            .replace(" p . m ", " p.m ")
            .replace(" a . m .", " a.m.")
            .replace(" a . m ", " a.m ")
        )

    return " ".join(normTweet.split()), emojis_found


def clean_tweet(tweet, clear_html_chars=True, replace_user_mentions=True, replace_urls=True,
                demojize_emojis=True, bert_tweet_specific_processing=True):
    '''Helper function to clean tweets. Highly customizable to fit different needs.

    Params:
        tweet: the tweet to clean
        clear_html_chars: If true, will unescape any HTML entities found in the tweet
        replace_user_mentions: If true, will replace any user mention with the token @USER
        replace_urls: If true, will replace any URL with the token HTTPURL
        demojize_emojis: If true, will replace emojis with their textual aliases (demojize)
        bert_tweet_specific_processing: If true, will apply the additional normalization expected by the BERTweet model

    Returns:
        The cleaned tweet
    '''
    # First step: clear the mentions Twitter automatically prepends when replying to a tweet.
    # These do not count towards the tweet's character limit, so they can push the raw text length well past it.
    cleaned_tweet = clear_reply_mentions(tweet)
    
    # Second step: replace carriage returns and newlines with spaces (extra whitespace is collapsed later in normalizeTweet)
    cleaned_tweet = cleaned_tweet.replace('\r', ' ').replace('\n', ' ')

    # Third step: if requested, unescape any HTML entities (e.g. "&amp;" -> "&")
    if clear_html_chars:
        cleaned_tweet = html.unescape(cleaned_tweet)

    # Fourth step: normalize the tweet with the remaining preprocessing (emojis, URLs, mentions, etc.)
    normalized_tweet, emojis = normalizeTweet(cleaned_tweet,
                                              replace_user_mentions=replace_user_mentions,
                                              replace_urls=replace_urls,
                                              demojize_emojis=demojize_emojis,
                                              bert_tweet_specific_processing=bert_tweet_specific_processing)
    
    # TODO: process emoticons? e.g. :)

    return normalized_tweet
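

if __name__ == "__main__":
    # Minimal usage sketch (assumes the `nltk` and `emoji` packages are installed; the example
    # tweet is illustrative and the demojized alias depends on the `emoji` package version).
    example = "@user1 @user2 Check this out &amp; don't miss it 😂\nhttps://t.co/abc123"
    print(clean_tweet(example))
    # Expected (approximately): "Check this out & do n't miss it :face_with_tears_of_joy: HTTPURL"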