Merge pull request #13 from cclauss/flake8-fixes
Browse files
- scripts/calculate_coverages.py +5 -0
- scripts/convert_all_datasets.py +5 -0
- torchmoji/filter_utils.py +7 -4
- torchmoji/finetuning.py +8 -4
- torchmoji/word_generator.py +5 -4
scripts/calculate_coverages.py
CHANGED
@@ -11,6 +11,11 @@ sys.path.insert(0, dirname(dirname(abspath(__file__))))
|
|
11 |
|
12 |
from torchmoji.sentence_tokenizer import SentenceTokenizer, coverage
|
13 |
|
|
|
|
|
|
|
|
|
|
|
14 |
IS_PYTHON2 = int(sys.version[0]) == 2
|
15 |
|
16 |
OUTPUT_PATH = 'coverage.csv'
|
|
|
11 |
|
12 |
from torchmoji.sentence_tokenizer import SentenceTokenizer, coverage
|
13 |
|
14 |
+
try:
|
15 |
+
unicode # Python 2
|
16 |
+
except NameError:
|
17 |
+
unicode = str # Python 3
|
18 |
+
|
19 |
IS_PYTHON2 = int(sys.version[0]) == 2
|
20 |
|
21 |
OUTPUT_PATH = 'coverage.csv'
|
scripts/convert_all_datasets.py
CHANGED
@@ -14,6 +14,11 @@ from torchmoji.create_vocab import VocabBuilder
|
|
14 |
from torchmoji.sentence_tokenizer import SentenceTokenizer, extend_vocab, coverage
|
15 |
from torchmoji.tokenizer import tokenize
|
16 |
|
|
|
|
|
|
|
|
|
|
|
17 |
IS_PYTHON2 = int(sys.version[0]) == 2
|
18 |
|
19 |
DATASETS = [
|
|
|
14 |
from torchmoji.sentence_tokenizer import SentenceTokenizer, extend_vocab, coverage
|
15 |
from torchmoji.tokenizer import tokenize
|
16 |
|
17 |
+
try:
|
18 |
+
unicode # Python 2
|
19 |
+
except NameError:
|
20 |
+
unicode = str # Python 3
|
21 |
+
|
22 |
IS_PYTHON2 = int(sys.version[0]) == 2
|
23 |
|
24 |
DATASETS = [
|
torchmoji/filter_utils.py
CHANGED
@@ -11,8 +11,11 @@ import numpy as np
|
|
11 |
from torchmoji.tokenizer import RE_MENTION, RE_URL
|
12 |
from torchmoji.global_variables import SPECIAL_TOKENS
|
13 |
|
14 |
-
|
15 |
-
|
|
|
|
|
|
|
16 |
|
17 |
AtMentionRegex = re.compile(RE_MENTION)
|
18 |
urlRegex = re.compile(RE_URL)
|
@@ -36,8 +39,8 @@ VARIATION_SELECTORS = [ '\ufe00',
|
|
36 |
'\ufe0f']
|
37 |
|
38 |
# from https://stackoverflow.com/questions/92438/stripping-non-printable-characters-from-a-string-in-python
|
39 |
-
ALL_CHARS = (
|
40 |
-
CONTROL_CHARS = ''.join(map(
|
41 |
CONTROL_CHAR_REGEX = re.compile('[%s]' % re.escape(CONTROL_CHARS))
|
42 |
|
43 |
def is_special_token(word):
|
|
|
11 |
from torchmoji.tokenizer import RE_MENTION, RE_URL
|
12 |
from torchmoji.global_variables import SPECIAL_TOKENS
|
13 |
|
14 |
+
try:
|
15 |
+
unichr # Python 2
|
16 |
+
except NameError:
|
17 |
+
unichr = chr # Python 3
|
18 |
+
|
19 |
|
20 |
AtMentionRegex = re.compile(RE_MENTION)
|
21 |
urlRegex = re.compile(RE_URL)
|
|
|
39 |
'\ufe0f']
|
40 |
|
41 |
# from https://stackoverflow.com/questions/92438/stripping-non-printable-characters-from-a-string-in-python
|
42 |
+
ALL_CHARS = (unichr(i) for i in range(sys.maxunicode))
|
43 |
+
CONTROL_CHARS = ''.join(map(unichr, list(range(0,32)) + list(range(127,160))))
|
44 |
CONTROL_CHAR_REGEX = re.compile('[%s]' % re.escape(CONTROL_CHARS))
|
45 |
|
46 |
def is_special_token(word):
|
torchmoji/finetuning.py
CHANGED
@@ -3,7 +3,6 @@
|
|
3 |
"""
|
4 |
from __future__ import print_function
|
5 |
|
6 |
-
import sys
|
7 |
import uuid
|
8 |
from time import sleep
|
9 |
from io import open
|
@@ -28,8 +27,13 @@ from torchmoji.global_variables import (FINETUNING_METHODS,
|
|
28 |
from torchmoji.tokenizer import tokenize
|
29 |
from torchmoji.sentence_tokenizer import SentenceTokenizer
|
30 |
|
31 |
-
|
32 |
-
|
|
|
|
|
|
|
|
|
|
|
33 |
|
34 |
def load_benchmark(path, vocab, extend_with=0):
|
35 |
""" Loads the given benchmark dataset.
|
@@ -66,7 +70,7 @@ def load_benchmark(path, vocab, extend_with=0):
|
|
66 |
|
67 |
# Decode data
|
68 |
try:
|
69 |
-
texts = [
|
70 |
except UnicodeDecodeError:
|
71 |
texts = [x.decode('utf-8') for x in data['texts']]
|
72 |
|
|
|
3 |
"""
|
4 |
from __future__ import print_function
|
5 |
|
|
|
6 |
import uuid
|
7 |
from time import sleep
|
8 |
from io import open
|
|
|
27 |
from torchmoji.tokenizer import tokenize
|
28 |
from torchmoji.sentence_tokenizer import SentenceTokenizer
|
29 |
|
30 |
+
try:
|
31 |
+
unicode
|
32 |
+
IS_PYTHON2 = True
|
33 |
+
except NameError:
|
34 |
+
unicode = str
|
35 |
+
IS_PYTHON2 = False
|
36 |
+
|
37 |
|
38 |
def load_benchmark(path, vocab, extend_with=0):
|
39 |
""" Loads the given benchmark dataset.
|
|
|
70 |
|
71 |
# Decode data
|
72 |
try:
|
73 |
+
texts = [unicode(x) for x in data['texts']]
|
74 |
except UnicodeDecodeError:
|
75 |
texts = [x.decode('utf-8') for x in data['texts']]
|
76 |
|
torchmoji/word_generator.py
CHANGED
@@ -7,7 +7,6 @@
|
|
7 |
|
8 |
from __future__ import division, print_function, unicode_literals
|
9 |
|
10 |
-
import sys
|
11 |
import re
|
12 |
import unicodedata
|
13 |
import numpy as np
|
@@ -26,8 +25,10 @@ from torchmoji.filter_utils import (convert_linebreaks,
|
|
26 |
remove_variation_selectors,
|
27 |
separate_emojis_and_text)
|
28 |
|
29 |
-
|
30 |
-
|
|
|
|
|
31 |
|
32 |
# Only catch retweets in the beginning of the tweet as those are the
|
33 |
# automatically added ones.
|
@@ -68,7 +69,7 @@ class WordGenerator():
|
|
68 |
that is not allowed.
|
69 |
"""
|
70 |
|
71 |
-
if not isinstance(sentence,
|
72 |
raise ValueError("All sentences should be Unicode-encoded!")
|
73 |
sentence = sentence.strip().lower()
|
74 |
|
|
|
7 |
|
8 |
from __future__ import division, print_function, unicode_literals
|
9 |
|
|
|
10 |
import re
|
11 |
import unicodedata
|
12 |
import numpy as np
|
|
|
25 |
remove_variation_selectors,
|
26 |
separate_emojis_and_text)
|
27 |
|
28 |
+
try:
|
29 |
+
unicode # Python 2
|
30 |
+
except NameError:
|
31 |
+
unicode = str # Python 3
|
32 |
|
33 |
# Only catch retweets in the beginning of the tweet as those are the
|
34 |
# automatically added ones.
|
|
|
69 |
that is not allowed.
|
70 |
"""
|
71 |
|
72 |
+
if not isinstance(sentence, unicode):
|
73 |
raise ValueError("All sentences should be Unicode-encoded!")
|
74 |
sentence = sentence.strip().lower()
|
75 |
|