|
|
|
import jieba |
|
import torch |
|
|
|
|
|
def jieba_tokenize(str):
    """Segment text into words with jieba.

    Args:
        str: The text to segment. The name shadows the built-in ``str``
            inside this function; it is kept as-is because it is part of
            the signature callers see.

    Returns:
        Whatever ``jieba.lcut`` returns for the given text.
    """
    tokens = jieba.lcut(str)
    return tokens
|
|
|
|
|
# Inclusive (first, last) code-point ranges that is_chinese_char() accepts.
# Besides the CJK ideograph blocks, the table also lists CJK punctuation,
# full-width forms and a few symbol blocks.
#
# NOTE: code points above U+FFFF must be written with the 8-digit ``\U``
# escape. ``"\u20000"`` is NOT U+20000: ``\u`` consumes exactly four hex
# digits, so it is the two-character string U+2000 + "0". With such bounds
# the supplementary-plane ranges never matched, and the lexicographic
# comparison accidentally matched every character in U+2001..U+2A6D
# (arrows, currency signs, math operators, ...).
_UCODE_RANGES = (
    ("\u3400", "\u4db5"),  # CJK Unified Ideographs Extension A
    ("\u4e00", "\u9fa5"),  # CJK Unified Ideographs
    ("\u9fa6", "\u9fbb"),  # CJK Unified Ideographs (continued)
    ("\uf900", "\ufa2d"),  # CJK Compatibility Ideographs
    ("\ufa30", "\ufa6a"),  # CJK Compatibility Ideographs (continued)
    ("\ufa70", "\ufad9"),  # CJK Compatibility Ideographs (continued)
    ("\U00020000", "\U0002a6d6"),  # CJK Unified Ideographs Extension B
    ("\U0002f800", "\U0002fa1d"),  # CJK Compatibility Ideographs Supplement
    ("\uff00", "\uffef"),  # Halfwidth and Fullwidth Forms
    ("\u2e80", "\u2eff"),  # CJK Radicals Supplement
    ("\u3000", "\u303f"),  # CJK Symbols and Punctuation
    ("\u31c0", "\u31ef"),  # CJK Strokes
    ("\u2f00", "\u2fdf"),  # Kangxi Radicals
    ("\u2ff0", "\u2fff"),  # Ideographic Description Characters
    ("\u3100", "\u312f"),  # Bopomofo
    ("\u31a0", "\u31bf"),  # Bopomofo Extended
    ("\ufe10", "\ufe1f"),  # Vertical Forms
    ("\ufe30", "\ufe4f"),  # CJK Compatibility Forms
    ("\u2600", "\u26ff"),  # Miscellaneous Symbols
    ("\u2700", "\u27bf"),  # Dingbats
    ("\u3200", "\u32ff"),  # Enclosed CJK Letters and Months
    ("\u3300", "\u33ff"),  # CJK Compatibility
)


def is_chinese_char(uchar):
    """Return True if ``uchar`` falls inside one of ``_UCODE_RANGES``.

    Args:
        uchar: A string, expected to be a single character. The check is a
            plain string comparison against each range's bounds.

    Returns:
        True when ``first <= uchar <= last`` holds for at least one range,
        otherwise False (including for the empty string).
    """
    return any(first <= uchar <= last for first, last in _UCODE_RANGES)
|
|
|
|
|
def chinese_char_tokenize(line):
    """Surround every Chinese character in ``line`` with spaces.

    The input is stripped of leading/trailing whitespace first. Each
    character for which ``is_chinese_char`` is true is emitted as
    ``" " + char + " "``; every other character is copied unchanged. The
    result is not re-stripped or collapsed, so it may begin or end with a
    space and adjacent Chinese characters are separated by two spaces.

    Args:
        line: The text to process.

    Returns:
        The processed text as a single string.
    """
    pieces = [
        " " + ch + " " if is_chinese_char(ch) else ch
        for ch in line.strip()
    ]
    return "".join(pieces)
|
|
|
|
|
|
|
|
|
|
|
def report_memory(name):
    """Print a one-line report of the torch.cuda memory counters.

    The four values returned by ``torch.cuda.memory_allocated``,
    ``max_memory_allocated``, ``memory_reserved`` and
    ``max_memory_reserved`` are each divided by 1024 * 1024 and printed
    on a single line under the "(MB)" heading.

    Args:
        name: Label placed at the start of the printed line.
    """
    bytes_per_mb = 1024.0 * 1024.0
    # (label, query) pairs, in the order they are queried and printed.
    counters = (
        ('allocated', torch.cuda.memory_allocated),
        ('max allocated', torch.cuda.max_memory_allocated),
        ('reserved', torch.cuda.memory_reserved),
        ('max reserved', torch.cuda.max_memory_reserved),
    )
    parts = [name + ' memory (MB)']
    for label, query in counters:
        parts.append(' | {}: {}'.format(label, query() / bytes_per_mb))
    print(''.join(parts))
|
|