Upload with huggingface_hub
Browse files- cctokenizer.py +312 -0
- config.json +9 -3
- config.yaml +28 -6
- pytorch_model.bin +2 -2
- replace.json +0 -0
- special_tokens_map.json +8 -0
- tokenizer_config.json +22 -0
- vocab.txt +2 -2
cctokenizer.py
ADDED
@@ -0,0 +1,312 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Tokenization classes for ChineseCharTokenizer."""
|
2 |
+
|
3 |
+
from typing import Optional, Tuple, Union
|
4 |
+
from transformers import BertTokenizer
|
5 |
+
import numpy as np
|
6 |
+
import os
|
7 |
+
import re
|
8 |
+
import shutil
|
9 |
+
|
10 |
+
# https://www.ling.upenn.edu/courses/Spring_2003/ling538/UnicodeRanges.html
|
11 |
+
# https://www.microfocus.com/documentation/idol/IDOL/Servers/IDOLServer/11.2/Guides/html/English/expert/Content/IDOLExpert/Languages/Script_Ranges.htm
|
12 |
+
# https://www.ssec.wisc.edu/~tomw/java/unicode.html
|
13 |
+
# https://character-table.netlify.app/
|
14 |
+
# https://character-table.netlify.app/french/
|
15 |
+
# https://www.compart.com/en/unicode/U+31CE 重要:看各个unicode的编码语义信息,属于哪个编码段(block/plane)
|
16 |
+
|
17 |
+
# 联合国六个官方语言:阿拉伯文、中文、英文、法文、俄文、西班牙文
|
18 |
+
# [U_LAT] 拉丁语系的文字,包括英语、法语、西班牙语、德语等
|
19 |
+
# [U_RUS] 俄语等
|
20 |
+
# [U_ARA] 表示阿拉伯语
|
21 |
+
# [U_JAP] 日语
|
22 |
+
# [U_KOR] 韩语
|
23 |
+
# [U_LAN] 某种语言的文字,藏语, 泰语等等
|
24 |
+
# [U_CHI] 未知中文
|
25 |
+
|
26 |
+
# [U_PHO] 注音文字,音标等
|
27 |
+
# [U_RAD] 部首,笔画
|
28 |
+
# [U_PUN] 标点
|
29 |
+
|
30 |
+
# [U_GRE] 希腊字母
|
31 |
+
# [U_SYM] 各种各样的符号
|
32 |
+
# [U_COM] 组合符号,包括上下标
|
33 |
+
# [U_NUM] 序号,包括VIII,(1),(一)等,也包含一些数字
|
34 |
+
# [U_MAT] 数学符号
|
35 |
+
# [U_EMO] 表情
|
36 |
+
# ¤ 货币符号
|
37 |
+
|
38 |
+
# Contiguous, ascending partition of the code points 0x0000-0x10FFFD.
# Each row is (token, first code point, last code point (inclusive), meaning).
# get_unicode_ranges() asserts that every row starts right after the previous one ends.
_UNICODE_BLOCKS = (
    ('[U_LAT]', 0x0000, 0x007F, 'Basic Latin'),
    ('[U_LAT]', 0x0080, 0x00FF, 'C1 Controls and Latin-1 Supplement'),  # Latin letters, incl. German and French
    ('[U_LAT]', 0x0100, 0x017F, 'Latin Extended-A'),  # incl. French, e.g. œ
    ('[U_LAT]', 0x0180, 0x024F, 'Latin Extended-B'),
    ('[U_PHO]', 0x0250, 0x02AF, 'IPA Extensions'),  # IPA, e.g. ə
    ('[U_PHO]', 0x02B0, 0x02FF, 'Spacing Modifier Letters'),  # IPA, e.g. ʳ
    ('[U_PHO]', 0x0300, 0x036F, 'Combining Diacritical Marks'),  # diacritics
    ('[U_GRE]', 0x0370, 0x03FF, 'Greek/Coptic'),  # Greek letters, e.g. α
    ('[U_RUS]', 0x0400, 0x04FF, 'Cyrillic'),  # incl. Russian
    ('[U_RUS]', 0x0500, 0x052F, 'Cyrillic Supplement'),
    ('[U_LAN]', 0x0530, 0x058F, 'Armenian'),
    ('[U_LAN]', 0x0590, 0x05FF, 'Hebrew'),
    ('[U_ARA]', 0x0600, 0x06FF, 'Arabic'),
    ('[U_LAN]', 0x0700, 0x074F, 'Syriac'),
    ('[U_ARA]', 0x0750, 0x077F, 'Undefined -> Arabic'),
    ('[U_LAN]', 0x0780, 0x07BF, 'Thaana'),  # used in the Maldives
    ('[U_ARA]', 0x07C0, 0x08FF, 'Undefined -> Arabic'),
    ('[U_LAN]', 0x0900, 0x097F, 'Devanagari'),
    ('[U_LAN]', 0x0980, 0x09FF, 'Bengali/Assamese'),
    ('[U_LAN]', 0x0A00, 0x0A7F, 'Gurmukhi'),
    ('[U_LAN]', 0x0A80, 0x0AFF, 'Gujarati'),
    ('[U_LAN]', 0x0B00, 0x0B7F, 'Oriya'),
    ('[U_LAN]', 0x0B80, 0x0BFF, 'Tamil'),
    ('[U_LAN]', 0x0C00, 0x0C7F, 'Telugu'),
    ('[U_LAN]', 0x0C80, 0x0CFF, 'Kannada'),
    # This row also spans 0x0D80-0x0DFF, for which the original table carried a
    # commented-out separate 'Sinhala' entry.
    ('[U_LAN]', 0x0D00, 0x0DFF, 'Malayalam'),
    ('[U_LAN]', 0x0E00, 0x0E7F, 'Thai'),
    ('[U_LAN]', 0x0E80, 0x0EFF, 'Lao'),
    ('[U_LAN]', 0x0F00, 0x0FFF, 'Tibetan'),  # NOTE: should Tibetan get a token of its own?
    ('[U_LAN]', 0x1000, 0x109F, 'Myanmar'),
    ('[U_LAN]', 0x10A0, 0x10FF, 'Georgian'),
    ('[U_KOR]', 0x1100, 0x11FF, 'Hangul Jamo'),  # incl. old Korean
    ('[U_LAN]', 0x1200, 0x137F, 'Ethiopic'),
    ('[U_LAN]', 0x1380, 0x139F, 'Undefined -> Ethiopic'),
    ('[U_LAN]', 0x13A0, 0x13FF, 'Cherokee'),
    ('[U_LAN]', 0x1400, 0x167F, 'Unified Canadian Aboriginal Syllabics'),
    ('[U_LAN]', 0x1680, 0x169F, 'Ogham'),
    ('[U_LAN]', 0x16A0, 0x16FF, 'Runic'),
    ('[U_LAN]', 0x1700, 0x171F, 'Tagalog'),
    ('[U_LAN]', 0x1720, 0x173F, 'Hanunoo'),
    ('[U_LAN]', 0x1740, 0x175F, 'Buhid'),
    ('[U_LAN]', 0x1760, 0x177F, 'Tagbanwa'),
    ('[U_LAN]', 0x1780, 0x17FF, 'Khmer'),
    ('[U_LAN]', 0x1800, 0x18AF, 'Mongolian'),
    ('[U_LAN]', 0x18B0, 0x18FF, 'Undefined -> Unified Canadian Aboriginal Syllabics'),
    ('[U_LAN]', 0x1900, 0x194F, 'Limbu'),
    ('[U_LAN]', 0x1950, 0x197F, 'Tai Le'),
    ('[U_LAN]', 0x1980, 0x19DF, 'Undefined -> New Tai Lue'),
    ('[U_LAN]', 0x19E0, 0x19FF, 'Khmer Symbols'),
    ('[U_LAN]', 0x1A00, 0x1CFF, 'Undefined -> Ol Chiki'),
    ('[U_PHO]', 0x1D00, 0x1D7F, 'Phonetic Extensions'),  # e.g. ᵈ
    ('[U_PHO]', 0x1D80, 0x1DFF, 'Undefined -> Phonetic Extensions Supplement'),
    ('[U_LAT]', 0x1E00, 0x1EFF, 'Latin Extended Additional'),  # Latin letters with marks, e.g. ṡ
    ('[U_GRE]', 0x1F00, 0x1FFF, 'Greek Extended'),  # Greek letters with marks, e.g. Ᾱ
    ('[U_SYM]', 0x2000, 0x206F, 'General Punctuation'),  # e.g. per mille ‰
    ('[U_COM]', 0x2070, 0x209F, 'Superscripts and Subscripts'),  # e.g. ₂
    ('¤', 0x20A0, 0x20CF, 'Currency Symbols'),  # e.g. euro €
    ('[U_COM]', 0x20D0, 0x20FF, 'Combining Diacritical Marks for Symbols'),
    ('[U_SYM]', 0x2100, 0x214F, 'Letterlike Symbols'),  # e.g. ℎ, ℃
    ('[U_NUM]', 0x2150, 0x218F, 'Number Forms'),  # e.g. ⅓, Ⅷ
    ('[U_SYM]', 0x2190, 0x21FF, 'Arrows'),  # e.g. →
    ('[U_MAT]', 0x2200, 0x22FF, 'Mathematical Operators'),  # e.g. −, ∉
    ('[U_SYM]', 0x2300, 0x23FF, 'Miscellaneous Technical'),  # e.g. ⌀, ⌝
    ('[U_SYM]', 0x2400, 0x243F, 'Control Pictures'),  # e.g. ␀, ␛
    ('[U_SYM]', 0x2440, 0x245F, 'Optical Character Recognition'),  # e.g. ⑀, ⑂
    ('[U_NUM]', 0x2460, 0x24FF, 'Enclosed Alphanumerics'),  # e.g. ①, ⒆
    ('[U_SYM]', 0x2500, 0x257F, 'Box Drawing'),  # e.g. ┋, ┓
    ('[U_SYM]', 0x2580, 0x259F, 'Block Elements'),  # e.g. ▉, ▀
    ('[U_SYM]', 0x25A0, 0x25FF, 'Geometric Shapes'),  # e.g. △
    ('[U_SYM]', 0x2600, 0x26FF, 'Miscellaneous Symbols'),  # e.g. ☁, ♀
    ('[U_SYM]', 0x2700, 0x27BF, 'Dingbats'),  # e.g. ✈, ➓
    ('[U_MAT]', 0x27C0, 0x27EF, 'Miscellaneous Mathematical Symbols-A'),  # e.g. ⟂, ⟘
    ('[U_SYM]', 0x27F0, 0x27FF, 'Supplemental Arrows-A'),  # e.g. ⟹
    ('[U_LAN]', 0x2800, 0x28FF, 'Braille Patterns'),  # e.g. ⠝, ⠟
    ('[U_SYM]', 0x2900, 0x297F, 'Supplemental Arrows-B'),  # e.g. ⥬
    ('[U_MAT]', 0x2980, 0x29FF, 'Miscellaneous Mathematical Symbols-B'),
    ('[U_MAT]', 0x2A00, 0x2AFF, 'Supplemental Mathematical Operators'),  # e.g. ⪆
    ('[U_SYM]', 0x2B00, 0x2BFF, 'Miscellaneous Symbols and Arrows'),  # e.g. ⭬
    ('[U_LAN]', 0x2C00, 0x2E7F, 'Undefined -> Coptic'),
    ('[U_RAD]', 0x2E80, 0x2EFF, 'CJK Radicals Supplement'),  # radicals, e.g. ⺘
    ('[U_RAD]', 0x2F00, 0x2FDF, 'Kangxi Radicals'),
    ('[U_SYM]', 0x2FE0, 0x2FEF, 'Undefined -> Symbol'),
    ('[U_SYM]', 0x2FF0, 0x2FFF, 'Ideographic Description Characters'),  # ideograph layout descriptors
    ('[U_PUN]', 0x3000, 0x303F, 'CJK Symbols and Punctuation'),  # e.g. 。
    ('[U_JAP]', 0x3040, 0x309F, 'Hiragana'),
    ('[U_JAP]', 0x30A0, 0x30FF, 'Katakana'),
    ('[U_PHO]', 0x3100, 0x312F, 'Bopomofo'),  # e.g. ㄠ, ㄎ
    ('[U_KOR]', 0x3130, 0x318F, 'Hangul Compatibility Jamo'),
    ('[U_JAP]', 0x3190, 0x319F, 'Kanbun (Kunten)'),
    ('[U_PHO]', 0x31A0, 0x31BF, 'Bopomofo Extended'),  # e.g. ㆠ
    ('[U_RAD]', 0x31C0, 0x31EF, 'Undefined -> CJK Strokes'),  # strokes of Chinese characters
    ('[U_JAP]', 0x31F0, 0x31FF, 'Katakana Phonetic Extensions'),
    ('[U_NUM]', 0x3200, 0x32FF, 'Enclosed CJK Letters and Months'),  # e.g. ㈠
    ('[U_SYM]', 0x3300, 0x33FF, 'CJK Compatibility'),  # single-character units, e.g. ㎠, ㎎, ㍯
    ('[U_CHI]', 0x3400, 0x4DBF, 'CJK Unified Ideographs Extension A'),  # rare Chinese characters
    ('[U_SYM]', 0x4DC0, 0x4DFF, 'Yijing Hexagram Symbols'),  # e.g. ䷁, ䷖
    ('[U_CHI]', 0x4E00, 0x9FAF, 'CJK Unified Ideographs'),
    ('[U_CHI]', 0x9FB0, 0x9FFF, 'Undefined -> CJK Unified Ideographs'),
    ('[U_LAN]', 0xA000, 0xA48F, 'Yi Syllables'),
    ('[U_LAN]', 0xA490, 0xA4CF, 'Yi Radicals'),
    ('[U_LAN]', 0xA4D0, 0xABFF, 'Undefined -> Cherokee'),
    ('[U_KOR]', 0xAC00, 0xD7AF, 'Hangul Syllables'),
    ('[U_KOR]', 0xD7B0, 0xD7FF, 'Undefined -> Hangul Jamo Extended-B'),
    ('[UNK]', 0xD800, 0xDBFF, 'High Surrogate Area'),
    ('[UNK]', 0xDC00, 0xDFFF, 'Low Surrogate Area'),
    # NOTE: when counting character frequencies, many undisplayable characters fell into this area.
    ('[UNK]', 0xE000, 0xF8FF, 'Private Use Area'),
    ('[U_CHI]', 0xF900, 0xFAFF, 'CJK Compatibility Ideographs'),
    ('[U_LAT]', 0xFB00, 0xFB4F, 'Alphabetic Presentation Forms'),  # Latin / Hebrew presentation forms
    ('[U_ARA]', 0xFB50, 0xFDFF, 'Arabic Presentation Forms-A'),
    ('[U_SYM]', 0xFE00, 0xFE0F, 'Variation Selectors'),
    ('[U_PUN]', 0xFE10, 0xFE1F, 'Undefined -> Vertical Forms'),  # vertical punctuation
    ('[U_COM]', 0xFE20, 0xFE2F, 'Combining Half Marks'),
    ('[U_PUN]', 0xFE30, 0xFE4F, 'CJK Compatibility Forms'),  # e.g. vertical ︻, ︼
    ('[U_PUN]', 0xFE50, 0xFE6F, 'Small Form Variants'),  # e.g. ﹐, ﹖
    ('[U_ARA]', 0xFE70, 0xFEFF, 'Arabic Presentation Forms-B'),
    # TODO: full-width forms are treated as Latin here; they should be mapped
    # back to the half-width Latin characters.
    ('[U_LAT]', 0xFF00, 0xFFEF, 'Halfwidth and Fullwidth Forms'),
    ('[U_SYM]', 0xFFF0, 0xFFFF, 'Specials'),
    ('[U_LAN]', 0x10000, 0x1007F, 'Linear B Syllabary'),
    ('[U_LAN]', 0x10080, 0x100FF, 'Linear B Ideograms'),
    ('[U_LAN]', 0x10100, 0x1013F, 'Aegean Numbers'),
    ('[U_LAN]', 0x10140, 0x102FF, 'Undefined -> Carian'),
    ('[U_LAN]', 0x10300, 0x1032F, 'Old Italic'),
    ('[U_LAN]', 0x10330, 0x1034F, 'Gothic'),
    ('[UNK]', 0x10350, 0x1037F, 'Undefined'),
    ('[U_LAN]', 0x10380, 0x1039F, 'Ugaritic'),
    ('[U_LAN]', 0x103A0, 0x103FF, 'Undefined -> Old Persian'),
    ('[U_LAN]', 0x10400, 0x1044F, 'Deseret'),
    ('[U_PHO]', 0x10450, 0x1047F, 'Shavian'),
    ('[U_LAN]', 0x10480, 0x104AF, 'Osmanya'),
    ('[U_LAN]', 0x104B0, 0x107FF, 'Undefined -> Osage'),
    ('[U_LAN]', 0x10800, 0x1083F, 'Cypriot Syllabary'),
    # Many scripts, including cuneiform, plus some symbols such as U+1319D and U+1319F.
    ('[U_LAN]', 0x10840, 0x1CFFF, 'Undefined -> Cuneiform, Chakma, Kharoshthi...'),
    ('[U_LAN]', 0x1D000, 0x1D0FF, 'Byzantine Musical Symbols'),
    ('[U_SYM]', 0x1D100, 0x1D1FF, 'Musical Symbols'),
    ('[UNK]', 0x1D200, 0x1D2FF, 'Undefined'),
    ('[U_SYM]', 0x1D300, 0x1D35F, 'Tai Xuan Jing Symbols'),
    ('[UNK]', 0x1D360, 0x1D3FF, 'Undefined'),
    ('[U_MAT]', 0x1D400, 0x1D7FF, 'Mathematical Alphanumeric Symbols'),  # e.g. italic math letters
    ('[U_LAN]', 0x1D800, 0x1F003, 'Undefined -> Adlam'),
    ('[U_EMO]', 0x1F004, 0x1FAF8, 'Undefined -> Emoji'),
    ('[U_SYM]', 0x1FAF9, 0x1FFFF, 'Undefined -> Symbols for Legacy Computing'),
    ('[U_CHI]', 0x20000, 0x2A6DF, 'CJK Unified Ideographs Extension B'),  # rare Chinese characters
    ('[U_CHI]', 0x2A6E0, 0x2F7FF, 'Undefined -> CJK Unified Ideographs Extension F...'),
    ('[U_CHI]', 0x2F800, 0x2FA1F, 'CJK Compatibility Ideographs Supplement'),
    ('[UNK]', 0x2FA20, 0x2FAAF, 'Undefined'),
    ('[UNK]', 0x2FAB0, 0x2FFFF, 'Unused'),
    ('[U_CHI]', 0x30000, 0x3134F, 'Unused -> CJK Unified Ideographs Extension G (unassigned)'),
    ('[UNK]', 0x31350, 0xDFFFF, 'Unused'),
    ('[U_SYM]', 0xE0000, 0xE007F, 'Tags'),
    ('[UNK]', 0xE0080, 0xE00FF, 'Unused'),
    ('[U_SYM]', 0xE0100, 0xE01EF, 'Variation Selectors Supplement'),
    ('[UNK]', 0xE01F0, 0xEFFFF, 'Unused'),
    ('[UNK]', 0xF0000, 0xFFFFD, 'Supplementary Private Use Area-A'),
    ('[UNK]', 0xFFFFE, 0xFFFFF, 'Unused'),
    ('[UNK]', 0x100000, 0x10FFFD, 'Supplementary Private Use Area-B'),
)

# Public table: one dict per block with the keys 'token', 'range' and 'meaning'.
unicode_map = [
    {'token': token, 'range': (first, last), 'meaning': meaning}
    for token, first, last, meaning in _UNICODE_BLOCKS
]
|
196 |
+
|
197 |
+
|
198 |
+
def get_unicode_ranges():
    """Return the inclusive upper bound of every ``unicode_map`` entry as a NumPy array.

    Before returning, asserts that the table is contiguous: each entry must
    start exactly one code point after the previous entry ends.
    """
    spans = [entry['range'] for entry in unicode_map]
    for idx in range(1, len(spans)):
        previous_end = spans[idx - 1][1]
        current_start = spans[idx][0]
        assert previous_end + 1 == current_start
    return np.array([span[1] for span in spans])
|
205 |
+
|
206 |
+
|
207 |
+
def _is_chinese_char(cp):
|
208 |
+
# copied from transformers.models.bert.tokenization_bert.BasicTokenizer._is_chinese_char
|
209 |
+
"""Checks whether CP is the codepoint of a CJK character."""
|
210 |
+
# This defines a "chinese character" as anything in the CJK Unicode block:
|
211 |
+
# https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
|
212 |
+
#
|
213 |
+
# Note that the CJK Unicode block is NOT all Japanese and Korean characters,
|
214 |
+
# despite its name. The modern Korean Hangul alphabet is a different block,
|
215 |
+
# as is Japanese Hiragana and Katakana. Those alphabets are used to write
|
216 |
+
# space-separated words, so they are not treated specially and handled
|
217 |
+
# like the all of the other languages.
|
218 |
+
if (
|
219 |
+
(cp >= 0x4E00 and cp <= 0x9FFF)
|
220 |
+
or (cp >= 0x3400 and cp <= 0x4DBF) #
|
221 |
+
or (cp >= 0x20000 and cp <= 0x2A6DF) #
|
222 |
+
or (cp >= 0x2A700 and cp <= 0x2B73F) #
|
223 |
+
or (cp >= 0x2B740 and cp <= 0x2B81F) #
|
224 |
+
or (cp >= 0x2B820 and cp <= 0x2CEAF) #
|
225 |
+
or (cp >= 0xF900 and cp <= 0xFAFF)
|
226 |
+
or (cp >= 0x2F800 and cp <= 0x2FA1F) #
|
227 |
+
): #
|
228 |
+
return True
|
229 |
+
|
230 |
+
return False
|
231 |
+
|
232 |
+
|
233 |
+
def show_unicode(start=0x1F004, end=0x1FAF8):
    """Print the characters for code points ``start`` .. ``end - 1`` on one line.

    Every character is followed by a single space and the line ends with a
    newline. ``end`` itself is excluded (half-open range).

    Emoji references:
      https://www.unicode.org/Public/emoji/latest/emoji-sequences.txt
      https://github.com/hidehalo/emoji/issues/3
      https://apps.timwhitlock.info/emoji/tables/unicode
    """
    rendered = ''.join(f'{chr(code_point)} ' for code_point in range(start, end))
    print(rendered)
|
240 |
+
|
241 |
+
|
242 |
+
def load_json(file):
    """Read the UTF-8 encoded JSON document at ``file`` and return the decoded object."""
    import json

    with open(file, 'r', encoding='utf-8') as handle:
        raw_text = handle.read()
    return json.loads(raw_text)
|
247 |
+
|
248 |
+
|
249 |
+
class ChineseCharTokenizer(BertTokenizer):
    """Character-level tokenizer layered on top of ``BertTokenizer``.

    Text is split one character at a time, except that bracketed vocabulary
    entries such as ``[MASK]`` or ``[U_LAT]`` are kept whole. A character that
    is not in the vocabulary is first normalised through ``replace.json`` and,
    if still unknown, replaced by the token of the Unicode range it belongs to
    (see ``unicode_map``).
    """

    vocab_files_names = {"vocab_file": "vocab.txt", 'mapping_file': "replace.json"}

    def __init__(self, vocab_file, *args, **kwargs):
        """Load the vocabulary, the Unicode range table and ``replace.json``.

        ``replace.json`` is read from the directory that contains ``vocab_file``.
        """
        super(ChineseCharTokenizer, self).__init__(vocab_file, *args, **kwargs)
        self.unicoder_ranges = get_unicode_ranges()
        # Bracketed vocabulary entries ("[CLS]", "[U_CHI]", ...), skipping the
        # "[unusedN]" placeholders. startswith/endswith (rather than indexing)
        # keeps an empty vocabulary entry from raising IndexError.
        self.enclosed_tokens = {
            token for token in self.vocab
            if token.startswith('[') and token.endswith(']') and 'unused' not in token
        }
        # Buckets of enclosed tokens with 5, 6 and 7 characters, in that order.
        self.enclosed_tokens_by_len = [
            [token for token in self.enclosed_tokens if len(token) == length]
            for length in (5, 6, 7)
        ]
        self.dir = os.path.dirname(vocab_file)
        self.replace_map = load_json(os.path.join(self.dir, 'replace.json'))

    # [EOS] acts like a comma or a line break, so it need not be treated as a special token.
    def convert_token_to_representative(self, token: str) -> str:
        """Map ``token`` to the vocabulary entry that represents it.

        The token is first passed through ``replace_map`` (variant-character,
        traditional/simplified, full-/half-width, case conversion, ...). If the
        result is in the vocabulary it is returned as is. Otherwise:

        * anything that is not exactly one character yields ``self.unk_token``
          (this used to be an ``assert``, which crashed id lookup for
          multi-character out-of-vocabulary tokens and vanished under ``-O``);
        * a whitespace character (e.g. ``\\u2003``, ``\\t``) yields ``' '``;
        * a CJK ideograph yields ``'[U_CHI]'``;
        * any other code point up to 0x10FFFD yields the token of its
          ``unicode_map`` range, and anything above yields ``'[UNK]'``.
        """
        token = self.replace_map.get(token, token)
        if token in self.vocab:
            return token
        if len(token) != 1:
            return self.unk_token
        if token.isspace():
            return ' '
        code_point = ord(token)
        if _is_chinese_char(code_point):
            return '[U_CHI]'
        if code_point <= 0x10FFFD:
            # unicoder_ranges holds inclusive upper bounds, so the insertion
            # point satisfies ranges[i-1] < code_point <= ranges[i].
            index = np.searchsorted(self.unicoder_ranges, code_point)
            return unicode_map[index]['token']
        return '[UNK]'

    # Open question carried over from the original author: does BERT's tokenize add [CLS]?
    def _tokenize(self, text):
        """Split ``text`` into enclosed tokens and representative single characters.

        At each position the 5-, 6- and 7-character windows are tried in that
        order against the enclosed vocabulary tokens; ``'[MASK]'`` is always
        recognised. Any other character goes through
        ``convert_token_to_representative``.
        """
        split_tokens = []
        position = 0
        text_length = len(text)
        while position < text_length:
            matched = None
            # Every enclosed token starts with '[', so other characters can
            # skip the window comparisons entirely.
            if text[position] == '[':
                for bucket, width in zip(self.enclosed_tokens_by_len, (5, 6, 7)):
                    window = text[position:position + width]
                    # The 6-character bucket was previously built but never
                    # consulted; '[MASK]' stays recognised even when it is not
                    # in the vocabulary, as before.
                    if window in bucket or window == '[MASK]':
                        matched = window
                        break
            if matched is None:
                split_tokens.append(self.convert_token_to_representative(text[position]))
                position += 1
            else:
                split_tokens.append(matched)
                position += len(matched)
        return split_tokens

    def _convert_token_to_id(self, token):
        """Return the vocabulary id of ``token``'s representative, or the unk id.

        ``convert_token_to_representative`` returns a token, not an id, hence
        the vocabulary lookup here.
        """
        representative = self.convert_token_to_representative(token)
        return self.vocab.get(representative, self.vocab.get(self.unk_token))

    def convert_tokens_to_string(self, tokens):
        """Concatenate ``tokens`` without any separator."""
        return ''.join(tokens)

    def save_pretrained(self, save_directory: Union[str, os.PathLike], legacy_format: Optional[bool] = None, filename_prefix: Optional[str] = None, push_to_hub: bool = False, **kwargs) -> Tuple[str]:
        """Save the tokenizer, then copy ``replace.json`` and ``cctokenizer.py`` next to it.

        Both extra files are taken from the directory the vocabulary was loaded
        from (``self.dir``). Returns whatever the parent ``save_pretrained``
        returns.
        """
        # Forward by keyword so the call does not depend on the parent's
        # positional parameter order.
        ret = super().save_pretrained(
            save_directory,
            legacy_format=legacy_format,
            filename_prefix=filename_prefix,
            push_to_hub=push_to_hub,
            **kwargs,
        )
        # NOTE(review): the copies happen after the parent call, so with
        # push_to_hub=True they are presumably not part of the upload - confirm.
        for file_name in ('replace.json', 'cctokenizer.py'):
            source = os.path.join(self.dir, file_name)
            target = os.path.join(save_directory, file_name)
            try:
                shutil.copyfile(source, target)
            except shutil.SameFileError:
                # Saving into the directory the tokenizer was loaded from:
                # the file is already in place.
                pass
        return ret
|
config.json
CHANGED
@@ -1,9 +1,10 @@
|
|
1 |
{
|
2 |
"architectures": [
|
3 |
-
"
|
4 |
],
|
5 |
"attention_probs_dropout_prob": 0.1,
|
6 |
"classifier_dropout": null,
|
|
|
7 |
"hidden_act": "gelu",
|
8 |
"hidden_dropout_prob": 0.1,
|
9 |
"hidden_size": 768,
|
@@ -15,10 +16,15 @@
|
|
15 |
"num_attention_heads": 12,
|
16 |
"num_hidden_layers": 12,
|
17 |
"pad_token_id": 0,
|
|
|
|
|
|
|
|
|
|
|
18 |
"position_embedding_type": "absolute",
|
19 |
"torch_dtype": "float32",
|
20 |
-
"transformers_version": "4.
|
21 |
"type_vocab_size": 2,
|
22 |
"use_cache": true,
|
23 |
-
"vocab_size":
|
24 |
}
|
|
|
1 |
{
|
2 |
"architectures": [
|
3 |
+
"BertForMaskedLM"
|
4 |
],
|
5 |
"attention_probs_dropout_prob": 0.1,
|
6 |
"classifier_dropout": null,
|
7 |
+
"directionality": "bidi",
|
8 |
"hidden_act": "gelu",
|
9 |
"hidden_dropout_prob": 0.1,
|
10 |
"hidden_size": 768,
|
|
|
16 |
"num_attention_heads": 12,
|
17 |
"num_hidden_layers": 12,
|
18 |
"pad_token_id": 0,
|
19 |
+
"pooler_fc_size": 768,
|
20 |
+
"pooler_num_attention_heads": 12,
|
21 |
+
"pooler_num_fc_layers": 3,
|
22 |
+
"pooler_size_per_head": 128,
|
23 |
+
"pooler_type": "first_token_transform",
|
24 |
"position_embedding_type": "absolute",
|
25 |
"torch_dtype": "float32",
|
26 |
+
"transformers_version": "4.26.1",
|
27 |
"type_vocab_size": 2,
|
28 |
"use_cache": true,
|
29 |
+
"vocab_size": 12288
|
30 |
}
|
config.yaml
CHANGED
@@ -1,12 +1,34 @@
|
|
1 |
-
batch_size:
|
2 |
clip_grad: 5
|
3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4 |
- B
|
5 |
- M
|
6 |
- E
|
7 |
- S
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
|
|
|
|
|
|
|
|
12 |
weight_decay: 0.01
|
|
|
1 |
+
batch_size: 32
|
2 |
clip_grad: 5
|
3 |
+
data_strategy: shuffle_batches
|
4 |
+
dev_datasets:
|
5 |
+
seg_ours:
|
6 |
+
dir: data
|
7 |
+
name: seg_ours
|
8 |
+
samples_num: 120
|
9 |
+
split: test
|
10 |
+
epoch_num: 3
|
11 |
+
head_config:
|
12 |
+
dropout: 0.1
|
13 |
+
layers_num: 1
|
14 |
+
use_crf: false
|
15 |
+
heads:
|
16 |
+
- seg
|
17 |
+
learning_rate: 1.5e-05
|
18 |
+
part_data: false
|
19 |
+
pretrained_bert_model: /data03/private/chengzhili/pretrain/output/2023-04-09_04-22-50/save/step_1195000
|
20 |
+
saved_path: output/seg
|
21 |
+
seg_labels:
|
22 |
- B
|
23 |
- M
|
24 |
- E
|
25 |
- S
|
26 |
+
train_datasets:
|
27 |
+
seg_ours:
|
28 |
+
dir: data
|
29 |
+
name: seg_ours
|
30 |
+
repeat_times: 1
|
31 |
+
samples_num: 0
|
32 |
+
split: train
|
33 |
+
warmup_steps: 2.0
|
34 |
weight_decay: 0.01
|
pytorch_model.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:588d7e98dee348c01df94d9c9cd315243a83defcae7f60a24b3ba94594cdf01b
|
3 |
+
size 382011629
|
replace.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
special_tokens_map.json
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cls_token": "[CLS]",
|
3 |
+
"eos_token": "[EOS]",
|
4 |
+
"mask_token": "[MASK]",
|
5 |
+
"pad_token": "[PAD]",
|
6 |
+
"sep_token": "[SEP]",
|
7 |
+
"unk_token": "[UNK]"
|
8 |
+
}
|
tokenizer_config.json
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"auto_map": {
|
3 |
+
"AutoTokenizer": [
|
4 |
+
"cctokenizer.ChineseCharTokenizer",
|
5 |
+
null
|
6 |
+
]
|
7 |
+
},
|
8 |
+
"cls_token": "[CLS]",
|
9 |
+
"do_basic_tokenize": true,
|
10 |
+
"do_lower_case": true,
|
11 |
+
"mask_token": "[MASK]",
|
12 |
+
"model_max_length": 1000000000000000019884624838656,
|
13 |
+
"name_or_path": "chengzl18/deepthulac-seg",
|
14 |
+
"never_split": null,
|
15 |
+
"pad_token": "[PAD]",
|
16 |
+
"sep_token": "[SEP]",
|
17 |
+
"special_tokens_map_file": null,
|
18 |
+
"strip_accents": null,
|
19 |
+
"tokenize_chinese_chars": true,
|
20 |
+
"tokenizer_class": "ChineseCharTokenizer",
|
21 |
+
"unk_token": "[UNK]"
|
22 |
+
}
|
vocab.txt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:078a9cfc23740546300d9781885054a46396f4e52f7ee4c783481be2dc7ffc46
|
3 |
+
size 49848
|