# File size: 4,425 Bytes

# cc0b62b

# -*- coding: utf-8 -*-
""" Tokenization tests.
"""
from __future__ import absolute_import, print_function, division, unicode_literals

import sys
from nose.tools import nottest
from os.path import dirname, abspath
sys.path.append(dirname(dirname(abspath(__file__))))
from torchmoji.tokenizer import tokenize

# Cases for ordinary mixed text, passed to test_base() by test_normal().
# Each entry is an (input text, expected token list) pair.
TESTS_NORMAL = [
    ('200K words!', ['200', 'K', 'words', '!']),
]

# Cases for emojis, text emoticons and decorative symbols, used by
# test_emojis(). Each entry is an (input text, expected token list) pair.
# The expectations show that an emoji or symbol becomes its own token even
# when it is glued to neighbouring text, while emoticons such as '<3',
# '<333', ':-)' and '>:-(' stay in one piece.
TESTS_EMOJIS = [
    ('i \U0001f496 you to the moon and back',
     ['i', '\U0001f496', 'you', 'to', 'the', 'moon', 'and', 'back']),
    ("i\U0001f496you to the \u2605's and back",
     ['i', '\U0001f496', 'you', 'to', 'the',
      '\u2605', "'", 's', 'and', 'back']),
    ('~<3~', ['~', '<3', '~']),
    ('<333', ['<333']),
    (':-)', [':-)']),
    ('>:-(', ['>:-(']),
    # A run of adjacent symbols is expected to split into one token each.
    ('\u266b\u266a\u2605\u2606\u2665\u2764\u2661',
     ['\u266b', '\u266a', '\u2605', '\u2606',
      '\u2665', '\u2764', '\u2661']),
]

# Cases for URLs, used by test_urls(). Each entry is an
# (input text, expected token list) pair; every URL here is expected to come
# back as a single, unsplit token.
TESTS_URLS = [
    ('www.sample.com', ['www.sample.com']),
    ('http://endless.horse', ['http://endless.horse']),
    ('https://github.mit.ed', ['https://github.mit.ed']),
]

# Cases for hashtags, @-mentions and e-mail addresses, used by test_twitter().
# Each entry is an (input text, expected token list) pair.
TESTS_TWITTER = [
    ('#blacklivesmatter', ['#blacklivesmatter']),
    # Trailing punctuation is expected to be split off the hashtag.
    ('#99_percent.', ['#99_percent', '.']),
    ('the#99%', ['the', '#99', '%']),
    ('@golden_zenith', ['@golden_zenith']),
    ('@99_percent', ['@99_percent']),
    # NOTE(review): this case previously held the placeholder
    # '[email protected]' left behind by an e-mail obfuscation filter, so no
    # real address was being tested. A concrete address is restored here;
    # confirm it against the upstream test-suite.
    ('latte-express@mit.ed', ['latte-express@mit.ed']),
]

# Cases for phone-number-like strings, used by test_phone_nums(). Each entry
# is an (input text, expected token list) pair; the expectations split the
# input into digit runs and individual separator characters.
TESTS_PHONE_NUMS = [
    ('518)528-0252', ['518', ')', '528', '-', '0252']),
    ('1200-0221-0234', ['1200', '-', '0221', '-', '0234']),
    ('1200.0221.0234', ['1200', '.', '0221', '.', '0234']),
]

# Cases for dates and times, used by test_datetime(). Each entry is an
# (input text, expected token list) pair; digits, separators (':' and '/')
# and a trailing 'pm' suffix are each expected as separate tokens.
TESTS_DATETIME = [
    ('15:00', ['15', ':', '00']),
    ('2:00pm', ['2', ':', '00', 'pm']),
    ('9/14/16', ['9', '/', '14', '/', '16']),
]

# Cases for monetary amounts, used by test_currencies(). Each entry is an
# (input text, expected token list) pair; digit groups, decimal/thousands
# separators and currency markers are expected as separate tokens.
TESTS_CURRENCIES = [
    ('517.933\xa3', ['517', '.', '933', '\xa3']),  # '\xa3' is the pound sign
    ('$517.87', ['$', '517', '.', '87']),
    ('1201.6598', ['1201', '.', '6598']),
    ('120,6', ['120', ',', '6']),
    ('10,00\u20ac', ['10', ',', '00', '\u20ac']),  # '\u20ac' is the euro sign
    ('1,000', ['1', ',', '000']),
    ('1200pesos', ['1200', 'pesos']),
]

# Cases mixing numbers with letters and symbols, used by test_num_sym().
# Each entry is an (input text, expected token list) pair; digit runs are
# expected to be separated from adjacent letters and symbols.
TESTS_NUM_SYM = [
    ('5162f', ['5162', 'f']),
    ('f5162', ['f', '5162']),
    ('1203(', ['1203', '(']),
    ('(1203)', ['(', '1203', ')']),
    ('1200/', ['1200', '/']),
    ('1200+', ['1200', '+']),
    # The hyphenated word after the number is expected to stay one token.
    ('1202o-east', ['1202', 'o-east']),
    ('1200r', ['1200', 'r']),
    ('1200-1400', ['1200', '-', '1400']),
    ('120/today', ['120', '/', 'today']),
    ('today/120', ['today', '/', '120']),
    ('120/5', ['120', '/', '5']),
    ("120'/5", ['120', "'", '/', '5']),
    ('120/5pro', ['120', '/', '5', 'pro']),
    ("1200's,)", ['1200', "'", 's', ',', ')']),
    # A dotted quad is expected to be split at every dot.
    ('120.76.218.207', ['120', '.', '76', '.', '218', '.', '207']),
]

# Cases for punctuation and contractions, used by test_punctuation(). Each
# entry is an (input text, expected token list) pair. The expectations show
# that runs of the same punctuation mark are grouped ('??', '!!', '..', '--'),
# alternating marks are split one by one, and 'a.m.' / 'Mr.' / "don'tcha"
# are kept whole.
TESTS_PUNCTUATION = [
    ("don''t", ['don', "''", 't']),
    ("don'tcha", ["don'tcha"]),
    ('no?!?!;', ['no', '?', '!', '?', '!', ';']),
    ('no??!!..', ['no', '??', '!!', '..']),
    ('a.m.', ['a.m.']),
    ('.s.u', ['.', 's', '.', 'u']),
    ('!!i..n__', ['!!', 'i', '..', 'n', '__']),
    # '<3' is kept as one token, but the reversed '3>' is split in two.
    ('lv(<3)w(3>)u Mr.!', ['lv', '(', '<3', ')', 'w', '(', '3',
                            '>', ')', 'u', 'Mr.', '!']),
    ('-->', ['--', '>']),
    ('->', ['-', '>']),
    ('<-', ['<', '-']),
    ('<--', ['<', '--']),
    ('hello (@person)', ['hello', '(', '@person', ')']),
]


def test_normal():
    """ Run the general mixed-text cases (TESTS_NORMAL) through the tokenizer.
    """
    test_base(tests=TESTS_NORMAL)


def test_emojis():
    """ Run the emoji, emoticon and decoration cases (TESTS_EMOJIS).
    """
    test_base(tests=TESTS_EMOJIS)


def test_urls():
    """ Run the URL cases (TESTS_URLS) through the tokenizer.
    """
    test_base(tests=TESTS_URLS)


def test_twitter():
    """ Run the hashtag, mention and e-mail cases (TESTS_TWITTER).
    """
    test_base(tests=TESTS_TWITTER)


def test_phone_nums():
    """ Run the phone-number cases (TESTS_PHONE_NUMS) through the tokenizer.
    """
    test_base(tests=TESTS_PHONE_NUMS)


def test_datetime():
    """ Run the date and time cases (TESTS_DATETIME) through the tokenizer.
    """
    test_base(tests=TESTS_DATETIME)


def test_currencies():
    """ Run the currency cases (TESTS_CURRENCIES) through the tokenizer.
    """
    test_base(tests=TESTS_CURRENCIES)


def test_num_sym():
    """ Run the number-plus-symbol cases (TESTS_NUM_SYM) through the tokenizer.
    """
    test_base(tests=TESTS_NUM_SYM)


def test_punctuation():
    """ Run the punctuation and contraction cases (TESTS_PUNCTUATION).
    """
    test_base(tests=TESTS_PUNCTUATION)


@nottest
def test_base(tests):
    """ Shared driver: tokenize each case and compare with its expectation.

    `tests` is an iterable of (input text, expected token list) pairs. The
    first case whose tokenization differs from its expected list fails the
    assertion, with a message showing the input, expected and actual tokens.
    """
    failure_template = "Tokenization of \'{}\' failed, expected: {}, actual: {}"
    for text, wanted in tests:
        got = tokenize(text)
        # The message is only formatted when the comparison fails.
        assert got == wanted, failure_template.format(text, wanted, got)