alibayram committed
Commit f8c9370 · 1 Parent(s): f6f7bf5

Implement Gradio interface for Turkish Tokenizer, replacing Streamlit; update requirements to include Gradio.

Files changed (7)
  1. app.py +166 -247
  2. bpe_tokenler.json +0 -0
  3. ekler.json +363 -0
  4. kokler.json +0 -0
  5. requirements.txt +1 -5
  6. tr_decoder.py +232 -0
  7. tr_tokenizer.py +137 -0
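
The new interface is self-contained: app.py instantiates TRTokenizer, hands its (token, type) pairs to a gr.HighlightedText component, and shows the encode/decode round trip; with gradio installed (now the only entry in requirements.txt), `python app.py` launches the demo via `demo.launch()`. A minimal sketch of that flow, assuming the three JSON vocabularies added here (kokler.json, ekler.json, bpe_tokenler.json) sit next to the scripts:

```python
# Sketch only: mirrors what app.py's tokenize_and_display() does with TRTokenizer.
from tr_tokenizer import TRTokenizer

tokenizer = TRTokenizer()                  # loads kokler.json, ekler.json, bpe_tokenler.json
text = "Merhaba Dünya!"
tokens, _ = tokenizer.tokenize_text(text)  # list of {"token", "id", "type"} dicts
pairs = [(t["token"], t["type"].name) for t in tokens]  # (text, label) pairs for gr.HighlightedText
ids = tokenizer.encode(text)               # token IDs shown in the "Encoded Token IDs" box
print(pairs)
print(tokenizer.decode(ids))               # text shown in the "Decoded Text" box
```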
app.py CHANGED
@@ -1,255 +1,174 @@
1
- import atexit
2
- import base64
3
- import colorsys
4
- import importlib.util
5
- import shutil
6
- import sys
7
- from datetime import datetime
8
- from pathlib import Path
9
-
10
- import requests
11
- import streamlit as st
12
-
13
- # Set page config - MUST BE FIRST STREAMLIT COMMAND
14
- st.set_page_config(
15
- page_title="Turkish Tiktokenizer",
16
- page_icon="🇹🇷",
17
- layout="wide"
18
- )
19
-
20
- # Initialize session state
21
- if 'text' not in st.session_state:
22
- st.session_state.text = "Akademisyenler ve aileleri birlikte çalışıyorlar."
23
- if 'token_results' not in st.session_state:
24
- st.session_state.token_results = None
25
-
26
- # Constants
27
- GITHUB_REPO = "malibayram/tokenizer"
28
- GITHUB_BRANCH = "main"
29
-
30
- # Special tokens and their IDs
31
- SPECIAL_TOKENS = {
32
- "<uppercase>": 0, # Uppercase letter marker
33
- "<space>": 1, # Space character
34
- "<newline>": 2, # Newline character
35
- "<tab>": 3, # Tab character
36
- "<unknown>": 4 # Unknown token
37
  }
38
 
39
- # Special token display symbols
40
- SPECIAL_TOKEN_SYMBOLS = {
41
- "<uppercase>": "[uppercase]", # Up arrow for uppercase
42
- "<space>": "[space]", # Space symbol
43
- "<newline>": "[newline]", # Return symbol
44
- "<tab>": "[tab]", # Tab symbol
45
- "<unknown>": "[unknown]" # Question mark for unknown
46
- }
47
 
48
- # Colors for special tokens
49
- SPECIAL_COLORS = {
50
- "<uppercase>": "#FF9999", # Light red for uppercase markers
51
- "<space>": "#CCCCCC", # Gray for spaces
52
- "<newline>": "#CCCCCC", # Gray for newlines
53
- "<tab>": "#CCCCCC", # Gray for tabs
54
- "<unknown>": "#FF0000" # Red for unknown tokens
55
- }
56
 
57
- # Required files mapping
58
- REQUIRED_FILES = {
59
- 'tokenizer.py': 'turkish_tokenizer/turkish_tokenizer.py',
60
- 'kokler_v08.json': 'turkish_tokenizer/kokler_v08.json',
61
- 'ekler_v05.json': 'turkish_tokenizer/ekler_v05.json',
62
- 'bpe_v06.json': 'turkish_tokenizer/bpe_v06.json'
63
- }
64
 
65
- # Token ID ranges
66
- TOKEN_RANGES = {
67
- 'special': (0, 4), # Special tokens
68
- 'root_words': (5, 20000), # Root words
69
- 'suffixes': (22268, 22767), # Suffixes
70
- 'bpe': (20000, None) # BPE tokens (20000+)
71
- }
72
 
73
- def generate_colors(n):
74
- """Generate n visually distinct colors."""
75
- colors = []
76
- for i in range(n):
77
- hue = i / n
78
- saturation = 0.3 + (i % 3) * 0.1 # Vary saturation between 0.3-0.5
79
- value = 0.95 - (i % 2) * 0.1 # Vary value between 0.85-0.95
80
- rgb = colorsys.hsv_to_rgb(hue, saturation, value)
81
- hex_color = "#{:02x}{:02x}{:02x}".format(
82
- int(rgb[0] * 255),
83
- int(rgb[1] * 255),
84
- int(rgb[2] * 255)
85
- )
86
- colors.append(hex_color)
87
- return colors
88
-
89
- def fetch_github_file(path, ref=GITHUB_BRANCH):
90
- """Fetch file content from GitHub repository."""
91
- url = f"https://api.github.com/repos/{GITHUB_REPO}/contents/{path}?ref={ref}"
92
- response = requests.get(url)
93
- if response.status_code == 200:
94
- content = base64.b64decode(response.json()['content']).decode('utf-8')
95
- return content
96
- else:
97
- st.error(f"Could not fetch {path} from GitHub: {response.status_code}")
98
- return None
99
-
100
- @st.cache_resource
101
- def load_tokenizer():
102
- """Load and initialize the tokenizer from GitHub."""
103
- temp_dir = Path("temp_tokenizer")
104
- temp_dir.mkdir(exist_ok=True)
105
-
106
- # Fetch required files
107
- for local_name, github_path in REQUIRED_FILES.items():
108
- content = fetch_github_file(github_path)
109
- if content is None:
110
- return None
111
-
112
- with open(temp_dir / local_name, 'w', encoding='utf-8') as f:
113
- f.write(content)
114
-
115
- # Modify tokenizer to use correct paths
116
- tokenizer_path = temp_dir / "tokenizer.py"
117
- with open(tokenizer_path, 'r', encoding='utf-8') as f:
118
- tokenizer_code = f.read()
119
-
120
- modified_code = tokenizer_code.replace(
121
- 'def load_json(filename):',
122
- f'''def load_json(filename):
123
- full_path = os.path.join("{temp_dir.absolute()}", filename)
124
- with open(full_path, 'r', encoding='utf-8') as file:
125
- return json.load(file)'''
126
  )
127
-
128
- with open(tokenizer_path, 'w', encoding='utf-8') as f:
129
- f.write(modified_code)
130
-
131
- # Load module
132
- spec = importlib.util.spec_from_file_location("tokenizer", str(temp_dir / "tokenizer.py"))
133
- module = importlib.util.module_from_spec(spec)
134
- sys.modules["tokenizer"] = module
135
- spec.loader.exec_module(module)
136
-
137
- return module.tokenize
138
-
139
- @st.cache_data(ttl=3600)
140
- def get_commit_history():
141
- """Fetch commit history from GitHub."""
142
- url = f"https://api.github.com/repos/{GITHUB_REPO}/commits"
143
- try:
144
- response = requests.get(url)
145
- if response.status_code == 200:
146
- commits = response.json()
147
- versions = []
148
- for commit in commits[:10]:
149
- date = datetime.strptime(commit['commit']['author']['date'], '%Y-%m-%dT%H:%M:%SZ').strftime('%Y-%m-%d')
150
- sha = commit['sha'][:7]
151
- message = commit['commit']['message'].split('\n')[0][:50]
152
- versions.append(f"{date} - {sha} - {message}")
153
- return versions
154
- return ["latest"]
155
- except Exception as e:
156
- st.warning(f"Could not fetch commit history: {str(e)}")
157
- return ["latest"]
158
-
159
- def render_tokens(tokens, token_colors):
160
- """Render colored token visualization."""
161
- html_tokens = []
162
- for token in tokens:
163
- color = token_colors[token]
164
- display_text = SPECIAL_TOKEN_SYMBOLS.get(token, token) # Use symbol for special tokens
165
- html_tokens.append(
166
- f'<span style="background-color: {color}; padding: 2px 4px; margin: 2px; border-radius: 3px;" title="{token}">{display_text}</span>'
167
- )
168
- return " ".join(html_tokens)
169
-
170
- # Load tokenizer
171
- tokenize = load_tokenizer()
172
- if tokenize is None:
173
- st.error("Failed to load tokenizer from GitHub")
174
- st.stop()
175
-
176
- # Tokenize example text on startup if no results exist
177
- if st.session_state.token_results is None and st.session_state.text:
178
- try:
179
- st.session_state.token_results = tokenize(st.session_state.text)
180
- except Exception as e:
181
- st.error(f"Error tokenizing text: {str(e)}")
182
-
183
- # UI Layout
184
- st.title("🇹🇷 Turkish Tiktokenizer")
185
-
186
- # Model selection
187
- versions = get_commit_history()
188
- model = st.selectbox("", versions, key="model_selection", label_visibility="collapsed")
189
-
190
- # Main layout
191
- col1, col2 = st.columns([0.4, 0.6])
192
-
193
- # Input column
194
- with col1:
195
- text = st.text_area(
196
- "Enter Turkish text to tokenize",
197
- value=st.session_state.text,
198
- height=200,
199
- key="text_input",
200
- label_visibility="collapsed",
201
- placeholder="Enter Turkish text to tokenize"
202
  )
203
-
204
- if st.button("Tokenize", type="primary"):
205
- st.session_state.text = text
206
- if text.strip():
207
- try:
208
- st.session_state.token_results = tokenize(text)
209
- except Exception as e:
210
- st.session_state.token_results = None
211
- st.error(f"Error tokenizing text: {str(e)}")
212
- else:
213
- st.session_state.token_results = None
214
-
215
- # Results column
216
- with col2:
217
- st.markdown("Token count")
218
- if st.session_state.token_results is not None:
219
- result = st.session_state.token_results
220
- token_count = len(result["tokens"])
221
- st.markdown(f"### {token_count}")
222
-
223
- st.markdown("Tokenized text")
224
-
225
- # Generate token colors
226
- regular_tokens = [t for t in result["tokens"] if t not in SPECIAL_COLORS]
227
- regular_token_colors = dict(zip(regular_tokens, generate_colors(len(regular_tokens))))
228
- token_colors = {**SPECIAL_COLORS, **regular_token_colors}
229
-
230
- # Render tokens
231
- with st.container():
232
- st.markdown(render_tokens(result["tokens"], token_colors), unsafe_allow_html=True)
233
-
234
- st.markdown("Token IDs")
235
- st.code(", ".join(map(str, result["ids"])), language=None)
236
- else:
237
- st.markdown("### 0")
238
- st.markdown("Tokenized text")
239
- st.markdown("")
240
- st.markdown("Token IDs")
241
- st.text("")
242
-
243
- # Footer
244
- st.markdown("""
245
- <div style="position: fixed; bottom: 0; width: 100%; text-align: center; padding: 10px; background-color: white;">
246
- <a href="https://github.com/malibayram/tokenizer" target="_blank">View on GitHub</a>
247
- </div>
248
- """, unsafe_allow_html=True)
249
-
250
- # Cleanup
251
- def cleanup():
252
- if Path("temp_tokenizer").exists():
253
- shutil.rmtree("temp_tokenizer")
254
-
255
- atexit.register(cleanup)
 
1
+ import gradio as gr
2
+
3
+ # Assuming tr_tokenizer.py contains both TRTokenizer and TokenType
4
+ # and that it correctly imports TRDecoder from tr_decoder.py.
5
+ # Make sure tr_tokenizer.py, tr_decoder.py, and your .json files
6
+ # are in the same directory as this app.py file.
7
+ from tr_tokenizer import TokenType, TRTokenizer
8
+
9
+ # --- Gradio App ---
10
+
11
+ # Instantiate the tokenizer
12
+ # This will now load directly from your existing .json files
13
+ # as defined in your TRTokenizer class.
14
+ tokenizer = TRTokenizer()
15
+
16
+ # Define colors for each token type (dark theme)
17
+ dark_color_map = {
18
+ TokenType.ROOT.name: "#FF6B6B", # Darker Red
19
+ TokenType.SUFFIX.name: "#4ECDC4", # Teal
20
+ TokenType.BPE.name: "#FFE66D", # Darker Yellow
21
  }
22
 
23
+ def tokenize_and_display(text, theme="light"):
24
+ """
25
+ Tokenizes the input text and prepares it for display in Gradio's HighlightedText component.
26
+ """
27
+ if not text:
28
+ # Return a structure that matches all outputs to avoid errors
29
+ return [], "", "", "", theme
30
+
31
+ tokens, _ = tokenizer.tokenize_text(text)
32
+
33
+ # Create the list of (token, label) for HighlightedText
34
+ highlighted_tokens = []
35
+ token_stats = {"ROOT": 0, "SUFFIX": 0, "BPE": 0}
36
+
37
+ for t in tokens:
38
+ token_text = t["token"]
39
+ token_type = t["type"].name
40
+
41
+ # Count token types for statistics
42
+ token_stats[token_type] = token_stats.get(token_type, 0) + 1
43
+
44
+ highlighted_tokens.append((token_text, token_type))
45
+
46
+ encoded_ids = tokenizer.encode(text)
47
+ decoded_text = tokenizer.decode(encoded_ids)
48
+
49
+ # Calculate statistics
50
+ total_tokens = len(tokens)
51
+ total_chars = len(text)
52
+ compression_ratio = (1 - total_tokens / total_chars) * 100 if total_chars > 0 else 0
53
+
54
+ # Define theme-specific colors for the stats block
55
+ bg_col, text_col, card_col, border_col = ('#2d3748', '#f7fafc', '#4a5568', '#718096')
56
+
57
+ # Create statistics HTML
58
+ stats_html = f"""
59
+ <div style="background:{bg_col};padding:20px;border-radius:12px;margin:20px 0;">
60
+ <h4 style="color:{text_col};margin-bottom:15px;">📊 Tokenization Statistics</h4>
61
+ <div style="display:grid;grid-template-columns:repeat(auto-fit,minmax(150px,1fr));gap:15px;margin-bottom:20px;">
62
+ <div style="background:{card_col};padding:15px;border-radius:8px;text-align:center;border:1px solid {border_col};"><div style="font-size:24px;font-weight:bold;color:#3b82f6;">{total_chars}</div><div style="color:{'#64748b' if theme == 'light' else '#a0aec0'};font-size:14px;">Characters</div></div>
63
+ <div style="background:{card_col};padding:15px;border-radius:8px;text-align:center;border:1px solid {border_col};"><div style="font-size:24px;font-weight:bold;color:#10b981;">{total_tokens}</div><div style="color:{'#64748b' if theme == 'light' else '#a0aec0'};font-size:14px;">Tokens</div></div>
64
+ <div style="background:{card_col};padding:15px;border-radius:8px;text-align:center;border:1px solid {border_col};"><div style="font-size:24px;font-weight:bold;color:#f59e0b;">{compression_ratio:.1f}%</div><div style="color:{'#64748b' if theme == 'light' else '#a0aec0'};font-size:14px;">Compression</div></div>
65
+ </div>
66
+ <div>
67
+ <h5 style="color:{text_col};margin-bottom:10px;">Token Type Distribution:</h5>
68
+ <div style="display:flex;gap:15px;flex-wrap:wrap;">
69
+ <div style="background:#FFADAD;color:#2d3748;padding:8px 12px;border-radius:6px;font-size:14px;font-weight:600;">🔴 Roots: {token_stats['ROOT']}</div>
70
+ <div style="background:#A0C4FF;color:#2d3748;padding:8px 12px;border-radius:6px;font-size:14px;font-weight:600;">🔵 Suffixes: {token_stats['SUFFIX']}</div>
71
+ <div style="background:#FDFFB6;color:#2d3748;padding:8px 12px;border-radius:6px;font-size:14px;font-weight:600;">🟡 BPE: {token_stats['BPE']}</div>
72
+ </div>
73
+ </div>
74
+ </div>"""
75
+ return highlighted_tokens, str(encoded_ids), decoded_text, stats_html, theme
76
+
77
+ # Custom CSS for better styling
78
+ custom_css = """
79
+ .gradio-container{font-family:'Inter',-apple-system,BlinkMacSystemFont,sans-serif;}
80
+ .custom-button{background:linear-gradient(135deg,#667eea 0%,#764ba2 100%);border:none;border-radius:8px;padding:12px 24px;color:white;font-weight:600;transition:all .3s ease;}
81
+ .custom-button:hover{transform:translateY(-2px);box-shadow:0 8px 25px rgba(0,0,0,.15);}
82
+ .theme-toggle{background:linear-gradient(135deg,#f093fb 0%,#f5576c 100%);border:none;border-radius:50px;padding:10px 20px;color:white;font-weight:600;transition:all .3s ease;}
83
+ .theme-toggle:hover{transform:scale(1.05);box-shadow:0 4px 15px rgba(0,0,0,.2);}
84
+ .input-textbox{border-radius:12px!important;border:2px solid #e2e8f0!important;transition:all .3s ease;}
85
+ .input-textbox:focus{border-color:#667eea!important;box-shadow:0 0 0 3px rgba(102,126,234,.1)!important;}
86
+ .dark .gradio-container{background:#1a202c!important;}
87
+ .dark .input-textbox{background:#2d3748!important;border-color:#4a5568!important;color:#f7fafc!important;}
88
+ """
89
+
90
+ # Create the Gradio Interface
91
+ with gr.Blocks(theme=gr.themes.Soft(), title="Turkish Tokenizer", css=custom_css) as demo:
92
+ with gr.Row():
93
+ with gr.Column(scale=3):
94
+ gr.Markdown("""
95
+ # Turkish Tokenizer
96
+ ### Advanced Turkish Text Tokenization with Visual Analysis
97
+ Enter text to see how it's tokenized. Tokens are color-coded by type.
98
+ """)
99
+
100
+ theme_state = gr.State("light")
101
+
102
+ input_text = gr.Textbox(
103
+ label="📝 Input Text",
104
+ placeholder="Merhaba Dünya, kitapları okumak güzeldir.",
105
+ lines=4,
106
+ elem_classes=["input-textbox"]
107
+ )
108
 
109
+ with gr.Row():
110
+ process_button = gr.Button("🚀 Tokenize", variant="primary", elem_classes=["custom-button"], size="lg")
111
+ clear_button = gr.Button("🗑️ Clear", variant="secondary", size="lg")
112
+
113
+ gr.Markdown("---")
114
+ gr.Markdown("### 🔄 Encoded & Decoded Output")
115
+ with gr.Row():
116
+ encoded_output = gr.Textbox(label="🔢 Encoded Token IDs", interactive=False, lines=2)
117
+ decoded_output = gr.Textbox(label="📝 Decoded Text", interactive=False, lines=2)
118
+
119
+ gr.Markdown("### 💡 Example Texts")
120
+ gr.Examples(
121
+ examples=[
122
+ ["Merhaba Dünya! Bu bir gelişmiş Türkçe tokenizer testidir."],
123
+ ["İstanbul'da yaşıyorum ve Türkçe dilini öğreniyorum."],
124
+ ["KitapOkumak çok güzeldir ve bilgi verir."],
125
+ ["Türkiye Cumhuriyeti'nin başkenti Ankara'dır."],
126
+ ["Yapay zeka ve makine öğrenmesi teknolojileri gelişiyor."],
127
+ ],
128
+ inputs=input_text,
129
+ label="Try these examples:"
130
+ )
131
 
132
+ gr.Markdown("---")
133
+ gr.Markdown("### 🎨 Tokenization Output")
134
+ highlighted_output = gr.HighlightedText(
135
+ label="Colorized Tokens",
136
+ color_map=dark_color_map, # This will be updated dynamically if needed
137
+ show_legend=True
138
+ )
139
+
140
+ gr.Markdown("---")
141
+ gr.Markdown("### 📊 Statistics")
142
+ stats_output = gr.HTML(label="")
143

144
 
145
+ gr.Markdown("--- \n **Turkish Tokenizer Pro** - Advanced tokenization for Turkish text.")
146
+
147
+ # --- Event Handlers ---
148
+ def process_with_theme(text, theme):
149
+ return tokenize_and_display(text, theme)
150
+
151
+ def clear_all():
152
+ return "", [], "", "", ""
153
+
154
+ # Connect the buttons to the functions
155
+ process_button.click(
156
+ fn=process_with_theme,
157
+ inputs=[input_text, theme_state],
158
+ outputs=[highlighted_output, encoded_output, decoded_output, stats_output, theme_state]
159
  )
160
+
161
+ clear_button.click(
162
+ fn=clear_all,
163
+ outputs=[input_text, highlighted_output, encoded_output, decoded_output, stats_output]
164
  )
165
+
166
+ # Auto-process on load with a default example
167
+ demo.load(
168
+ fn=lambda theme: tokenize_and_display("Merhaba Dünya!", theme),
169
+ inputs=[theme_state],
170
+ outputs=[highlighted_output, encoded_output, decoded_output, stats_output, theme_state]
171
+ )
172
+
173
+ if __name__ == "__main__":
174
+ demo.launch(show_error=True)

bpe_tokenler.json ADDED
The diff for this file is too large to render. See raw diff
 
ekler.json ADDED
@@ -0,0 +1,363 @@
1
+ {
2
+ "lar": 20000,
3
+ "ler": 20000,
4
+ "ya": 20001,
5
+ "ye": 20001,
6
+ "ma": 20002,
7
+ "me": 20002,
8
+ "malı": 20003,
9
+ "meli": 20003,
10
+ "laş": 20004,
11
+ "leş": 20004,
12
+ "ça": 20005,
13
+ "çe": 20005,
14
+ "şar": 20006,
15
+ "şer": 20006,
16
+ "kan": 20007,
17
+ "ken": 20007,
18
+ "lak": 20008,
19
+ "lek": 20008,
20
+ "layın": 20009,
21
+ "leyin": 20009,
22
+ "sak": 20010,
23
+ "sek": 20010,
24
+ "arak": 20011,
25
+ "erek": 20011,
26
+ "an": 20012,
27
+ "en": 20012,
28
+ "ım": 20013,
29
+ "im": 20013,
30
+ "um": 20013,
31
+ "üm": 20013,
32
+ "ız": 20014,
33
+ "iz": 20014,
34
+ "uz": 20014,
35
+ "üz": 20014,
36
+ "sı": 20015,
37
+ "si": 20015,
38
+ "su": 20015,
39
+ "sü": 20015,
40
+ "mış": 20016,
41
+ "miş": 20016,
42
+ "muş": 20016,
43
+ "müş": 20016,
44
+ "yı": 20017,
45
+ "yi": 20017,
46
+ "yu": 20017,
47
+ "yü": 20017,
48
+ "lı": 20018,
49
+ "li": 20018,
50
+ "lu": 20018,
51
+ "lü": 20018,
52
+ "sız": 20019,
53
+ "siz": 20019,
54
+ "suz": 20019,
55
+ "süz": 20019,
56
+ "ncı": 20020,
57
+ "nci": 20020,
58
+ "ncu": 20020,
59
+ "ncü": 20020,
60
+ "ın": 20021,
61
+ "in": 20021,
62
+ "un": 20021,
63
+ "ün": 20021,
64
+ "nın": 20022,
65
+ "nin": 20022,
66
+ "nun": 20022,
67
+ "nün": 20022,
68
+ "la": 20023,
69
+ "le": 20023,
70
+ "yla": 20023,
71
+ "yle": 20023,
72
+ "da": 20024,
73
+ "de": 20024,
74
+ "ta": 20024,
75
+ "te": 20024,
76
+ "dan": 20025,
77
+ "den": 20025,
78
+ "tan": 20025,
79
+ "ten": 20025,
80
+ "dı": 20026,
81
+ "di": 20026,
82
+ "du": 20026,
83
+ "dü": 20026,
84
+ "tı": 20026,
85
+ "ti": 20026,
86
+ "tu": 20026,
87
+ "tü": 20026,
88
+ "cı": 20027,
89
+ "ci": 20027,
90
+ "cu": 20027,
91
+ "cü": 20027,
92
+ "çı": 20027,
93
+ "çi": 20027,
94
+ "çu": 20027,
95
+ "çü": 20027,
96
+ "dır": 20028,
97
+ "dir": 20028,
98
+ "dur": 20028,
99
+ "dür": 20028,
100
+ "tır": 20028,
101
+ "tir": 20028,
102
+ "tur": 20028,
103
+ "tür": 20028,
104
+ "lık": 20029,
105
+ "lik": 20029,
106
+ "luk": 20029,
107
+ "lük": 20029,
108
+ "lığ": 20029,
109
+ "liğ": 20029,
110
+ "luğ": 20029,
111
+ "lüğ": 20029,
112
+ "cık": 20030,
113
+ "cik": 20030,
114
+ "cuk": 20030,
115
+ "cük": 20030,
116
+ "çık": 20030,
117
+ "çik": 20030,
118
+ "çuk": 20030,
119
+ "çük": 20030,
120
+ "cığ": 20030,
121
+ "ciğ": 20030,
122
+ "cuğ": 20030,
123
+ "cüğ": 20030,
124
+ "çığ": 20030,
125
+ "çiğ": 20030,
126
+ "çuğ": 20030,
127
+ "çüğ": 20030,
128
+ "mak": 20031,
129
+ "mek": 20031,
130
+ "may": 20031,
131
+ "mey": 20031,
132
+ "acak": 20032,
133
+ "ecek": 20032,
134
+ "acağ": 20032,
135
+ "eceğ": 20032,
136
+ "yacak": 20032,
137
+ "yecek": 20032,
138
+ "yacağ": 20032,
139
+ "yeceğ": 20032,
140
+ "i": 20033,
141
+ "ı": 20034,
142
+ "u": 20035,
143
+ "ü": 20036,
144
+ "a": 20037,
145
+ "e": 20038,
146
+ "m": 20039,
147
+ "n": 20040,
148
+ "yor": 20041,
149
+ "ar": 20042,
150
+ "er": 20043,
151
+ "sa": 20044,
152
+ "se": 20045,
153
+ "r": 20046,
154
+ "ce": 20047,
155
+ "daş": 20048,
156
+ "deş": 20049,
157
+ "msı": 20050,
158
+ "msi": 20051,
159
+ "msu": 20052,
160
+ "gil": 20053,
161
+ "ımsa": 20054,
162
+ "ıcık": 20055,
163
+ "nç": 20056,
164
+ "sal": 20057,
165
+ "sel": 20058,
166
+ "ki": 20059,
167
+ "y": 20060,
168
+ "idi": 20061,
169
+ "imiş": 20062,
170
+ "ise": 20063,
171
+ "s": 20064,
172
+ "gül": 20065,
173
+ "kıl": 20066,
174
+ "kil": 20067,
175
+ "ka": 20068,
176
+ "ge": 20069,
177
+ "z": 20070,
178
+ "ek_temp_20071": 20071,
179
+ "ek_temp_20072": 20072,
180
+ "ek_temp_20073": 20073,
181
+ "ek_temp_20074": 20074,
182
+ "ek_temp_20075": 20075,
183
+ "ek_temp_20076": 20076,
184
+ "ek_temp_20077": 20077,
185
+ "ek_temp_20078": 20078,
186
+ "ek_temp_20079": 20079,
187
+ "ek_temp_20080": 20080,
188
+ "ek_temp_20081": 20081,
189
+ "ek_temp_20082": 20082,
190
+ "ek_temp_20083": 20083,
191
+ "ek_temp_20084": 20084,
192
+ "ek_temp_20085": 20085,
193
+ "ek_temp_20086": 20086,
194
+ "ek_temp_20087": 20087,
195
+ "ek_temp_20088": 20088,
196
+ "ek_temp_20089": 20089,
197
+ "ek_temp_20090": 20090,
198
+ "ek_temp_20091": 20091,
199
+ "ek_temp_20092": 20092,
200
+ "ek_temp_20093": 20093,
201
+ "ek_temp_20094": 20094,
202
+ "ek_temp_20095": 20095,
203
+ "ek_temp_20096": 20096,
204
+ "ek_temp_20097": 20097,
205
+ "ek_temp_20098": 20098,
206
+ "ek_temp_20099": 20099,
207
+ "ek_temp_20100": 20100,
208
+ "ek_temp_20101": 20101,
209
+ "ek_temp_20102": 20102,
210
+ "ek_temp_20103": 20103,
211
+ "ek_temp_20104": 20104,
212
+ "ek_temp_20105": 20105,
213
+ "ek_temp_20106": 20106,
214
+ "ek_temp_20107": 20107,
215
+ "ek_temp_20108": 20108,
216
+ "ek_temp_20109": 20109,
217
+ "ek_temp_20110": 20110,
218
+ "ek_temp_20111": 20111,
219
+ "ek_temp_20112": 20112,
220
+ "ek_temp_20113": 20113,
221
+ "ek_temp_20114": 20114,
222
+ "ek_temp_20115": 20115,
223
+ "ek_temp_20116": 20116,
224
+ "ek_temp_20117": 20117,
225
+ "ek_temp_20118": 20118,
226
+ "ek_temp_20119": 20119,
227
+ "ek_temp_20120": 20120,
228
+ "ek_temp_20121": 20121,
229
+ "ek_temp_20122": 20122,
230
+ "ek_temp_20123": 20123,
231
+ "ek_temp_20124": 20124,
232
+ "ek_temp_20125": 20125,
233
+ "ek_temp_20126": 20126,
234
+ "ek_temp_20127": 20127,
235
+ "ek_temp_20128": 20128,
236
+ "ek_temp_20129": 20129,
237
+ "ek_temp_20130": 20130,
238
+ "ek_temp_20131": 20131,
239
+ "ek_temp_20132": 20132,
240
+ "ek_temp_20133": 20133,
241
+ "ek_temp_20134": 20134,
242
+ "ek_temp_20135": 20135,
243
+ "ek_temp_20136": 20136,
244
+ "ek_temp_20137": 20137,
245
+ "ek_temp_20138": 20138,
246
+ "ek_temp_20139": 20139,
247
+ "ek_temp_20140": 20140,
248
+ "ek_temp_20141": 20141,
249
+ "ek_temp_20142": 20142,
250
+ "ek_temp_20143": 20143,
251
+ "ek_temp_20144": 20144,
252
+ "ek_temp_20145": 20145,
253
+ "ek_temp_20146": 20146,
254
+ "ek_temp_20147": 20147,
255
+ "ek_temp_20148": 20148,
256
+ "ek_temp_20149": 20149,
257
+ "ek_temp_20150": 20150,
258
+ "ek_temp_20151": 20151,
259
+ "ek_temp_20152": 20152,
260
+ "ek_temp_20153": 20153,
261
+ "ek_temp_20154": 20154,
262
+ "ek_temp_20155": 20155,
263
+ "ek_temp_20156": 20156,
264
+ "ek_temp_20157": 20157,
265
+ "ek_temp_20158": 20158,
266
+ "ek_temp_20159": 20159,
267
+ "ek_temp_20160": 20160,
268
+ "ek_temp_20161": 20161,
269
+ "ek_temp_20162": 20162,
270
+ "ek_temp_20163": 20163,
271
+ "ek_temp_20164": 20164,
272
+ "ek_temp_20165": 20165,
273
+ "ek_temp_20166": 20166,
274
+ "ek_temp_20167": 20167,
275
+ "ek_temp_20168": 20168,
276
+ "ek_temp_20169": 20169,
277
+ "ek_temp_20170": 20170,
278
+ "ek_temp_20171": 20171,
279
+ "ek_temp_20172": 20172,
280
+ "ek_temp_20173": 20173,
281
+ "ek_temp_20174": 20174,
282
+ "ek_temp_20175": 20175,
283
+ "ek_temp_20176": 20176,
284
+ "ek_temp_20177": 20177,
285
+ "ek_temp_20178": 20178,
286
+ "ek_temp_20179": 20179,
287
+ "ek_temp_20180": 20180,
288
+ "ek_temp_20181": 20181,
289
+ "ek_temp_20182": 20182,
290
+ "ek_temp_20183": 20183,
291
+ "ek_temp_20184": 20184,
292
+ "ek_temp_20185": 20185,
293
+ "ek_temp_20186": 20186,
294
+ "ek_temp_20187": 20187,
295
+ "ek_temp_20188": 20188,
296
+ "ek_temp_20189": 20189,
297
+ "ek_temp_20190": 20190,
298
+ "ek_temp_20191": 20191,
299
+ "ek_temp_20192": 20192,
300
+ "ek_temp_20193": 20193,
301
+ "ek_temp_20194": 20194,
302
+ "ek_temp_20195": 20195,
303
+ "ek_temp_20196": 20196,
304
+ "ek_temp_20197": 20197,
305
+ "ek_temp_20198": 20198,
306
+ "ek_temp_20199": 20199,
307
+ "ek_temp_20200": 20200,
308
+ "ek_temp_20201": 20201,
309
+ "ek_temp_20202": 20202,
310
+ "ek_temp_20203": 20203,
311
+ "ek_temp_20204": 20204,
312
+ "ek_temp_20205": 20205,
313
+ "ek_temp_20206": 20206,
314
+ "ek_temp_20207": 20207,
315
+ "ek_temp_20208": 20208,
316
+ "ek_temp_20209": 20209,
317
+ "ek_temp_20210": 20210,
318
+ "ek_temp_20211": 20211,
319
+ "ek_temp_20212": 20212,
320
+ "ek_temp_20213": 20213,
321
+ "ek_temp_20214": 20214,
322
+ "ek_temp_20215": 20215,
323
+ "ek_temp_20216": 20216,
324
+ "ek_temp_20217": 20217,
325
+ "ek_temp_20218": 20218,
326
+ "ek_temp_20219": 20219,
327
+ "ek_temp_20220": 20220,
328
+ "ek_temp_20221": 20221,
329
+ "ek_temp_20222": 20222,
330
+ "ek_temp_20223": 20223,
331
+ "ek_temp_20224": 20224,
332
+ "ek_temp_20225": 20225,
333
+ "ek_temp_20226": 20226,
334
+ "ek_temp_20227": 20227,
335
+ "ek_temp_20228": 20228,
336
+ "ek_temp_20229": 20229,
337
+ "ek_temp_20230": 20230,
338
+ "ek_temp_20231": 20231,
339
+ "ek_temp_20232": 20232,
340
+ "ek_temp_20233": 20233,
341
+ "ek_temp_20234": 20234,
342
+ "ek_temp_20235": 20235,
343
+ "ek_temp_20236": 20236,
344
+ "ek_temp_20237": 20237,
345
+ "ek_temp_20238": 20238,
346
+ "ek_temp_20239": 20239,
347
+ "ek_temp_20240": 20240,
348
+ "ek_temp_20241": 20241,
349
+ "ek_temp_20242": 20242,
350
+ "ek_temp_20243": 20243,
351
+ "ek_temp_20244": 20244,
352
+ "ek_temp_20245": 20245,
353
+ "ek_temp_20246": 20246,
354
+ "ek_temp_20247": 20247,
355
+ "ek_temp_20248": 20248,
356
+ "ek_temp_20249": 20249,
357
+ "ek_temp_20250": 20250,
358
+ "ek_temp_20251": 20251,
359
+ "ek_temp_20252": 20252,
360
+ "ek_temp_20253": 20253,
361
+ "ek_temp_20254": 20254,
362
+ "ek_temp_20255": 20255
363
+ }
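
Several allomorphs of the same suffix share a single ID above ("lar" and "ler" are both 20000; "da"/"de"/"ta"/"te" are all 20024), and the trailing `ek_temp_*` entries look like placeholders for IDs that are not yet assigned. TRTokenizer.__init__ inverts this table into a reverse_dict of ID → surface forms, and TRDecoder later picks the right form by vowel harmony. A minimal sketch of that inversion over a hypothetical two-suffix slice of the file:

```python
import json

# Hypothetical slice of ekler.json: allomorph -> shared suffix ID.
suffixes = json.loads('{"lar": 20000, "ler": 20000, "da": 20024, "de": 20024}')

# The same grouping TRTokenizer.__init__ builds for its reverse_dict.
reverse_dict = {}
for form, token_id in suffixes.items():
    reverse_dict.setdefault(token_id, []).append(form)

print(reverse_dict)  # {20000: ['lar', 'ler'], 20024: ['da', 'de']}
```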
kokler.json ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt CHANGED
@@ -1,5 +1 @@
1
- streamlit>=1.24.0
2
- numpy>=1.21.0
3
- json5>=0.9.0
4
- requests>=2.31.0
5
- pathlib>=1.0.1
 
1
+ gradio

tr_decoder.py ADDED
@@ -0,0 +1,232 @@
1
+ from typing import List
2
+
3
+
4
+ class TRDecoder:
5
+ # Define vowel sets as class constants for better performance
6
+ ALL_VOWELS = "aeıioöuü"
7
+ INCE_VOWELS = "eiöü" # Front vowels
8
+ AI_VOWELS = "aı" # Back unrounded
9
+ EI_VOWELS = "ei" # Front unrounded
10
+ OU_VOWELS = "ou" # Back rounded
11
+ HARD_CONSONANTS = "fstkçşhp" # Sert ünsüzler
12
+ WHITESPACE = " \n\t"
13
+
14
+ def __init__(self, reverse_dict):
15
+ self.reverse_dict = reverse_dict
16
+
17
+ def _starts_with_vowel(self, word: str) -> bool:
18
+ """Check if word starts with a vowel."""
19
+ return word and word[0] in self.ALL_VOWELS
20
+
21
+ def _ends_with_vowel(self, word: str) -> bool:
22
+ """Check if word ends with a vowel."""
23
+ return word and word[-1] in self.ALL_VOWELS
24
+
25
+ def _ends_with_any(self, word: str, charset: str) -> bool:
26
+ # recursively check until first vowel starts from the end
27
+ i = len(word) - 1
28
+ while i >= 0:
29
+ if word[i] in charset:
30
+ return True
31
+ if word[i] in self.ALL_VOWELS:
32
+ return False
33
+ i -= 1
34
+ return False
35
+
36
+ def _ends_with_ince(self, word: str) -> bool:
37
+ """Check if word ends with front vowels (ince ünlü)."""
38
+ if word in ("saat", "kilovatsaat", "ziraat", "itaat"):
39
+ return True
40
+ # check until first vowel recursively
41
+ return self._ends_with_any(word, self.INCE_VOWELS)
42
+
43
+ def _ends_with_sert_unsuz(self, word: str) -> bool:
44
+ """Check if word ends with a hard consonant."""
45
+ return word and word[-1] in self.HARD_CONSONANTS
46
+
47
+ def _get_vowel_suffix_index(self, prev_token: str) -> int:
48
+ """Get suffix index based on vowel harmony rules."""
49
+ if self._ends_with_any(prev_token, self.AI_VOWELS):
50
+ return 0
51
+ elif self._ends_with_any(prev_token, self.EI_VOWELS):
52
+ return 1
53
+ elif self._ends_with_any(prev_token, self.OU_VOWELS):
54
+ return 2
55
+ return 3
56
+
57
+ def _select_correct_suffix(self, i: int, ids: List[int], prev_token: str) -> str:
58
+ """Select the correct suffix based on morphological rules."""
59
+ suffixes = self.reverse_dict[ids[i]]
60
+ token_id = ids[i]
61
+ # Handle different suffix types with cleaner logic
62
+ if token_id < 20013:
63
+ # Basic suffix selection based on vowel harmony
64
+ return suffixes[1] if self._ends_with_ince(prev_token) else suffixes[0]
65
+
66
+ elif token_id < 20023: # nın, nin, nun, nün
67
+ return suffixes[self._get_vowel_suffix_index(prev_token)]
68
+
69
+ elif token_id == 20023: # la, le, yla, yle
70
+ end_of_word = True
71
+ if i < len(ids) - 1:
72
+ next_token = self.reverse_dict[ids[i + 1]][0]
73
+ if next_token not in self.WHITESPACE:
74
+ end_of_word = False
75
+ return self._handle_la_le_suffix(prev_token, suffixes, end_of_word)
76
+
77
+ elif token_id <= 20025: # da, de, ta, te, dan, den, tan, ten
78
+ return self._handle_da_de_suffix(prev_token, suffixes)
79
+
80
+ elif 20025 < token_id < 20029: # dı, di, du, dü, tı, ti, tu, tü, etc.
81
+ return self._handle_di_du_suffix(prev_token, suffixes)
82
+
83
+ elif token_id == 20029: # lık, lik, luk, lük, etc.
84
+ return self._handle_lik_suffix(i, ids, prev_token, suffixes)
85
+
86
+ elif token_id == 20030: # cık, cik, cuk, cük, etc.
87
+ return self._handle_cik_suffix(i, ids, prev_token, suffixes)
88
+
89
+ elif token_id == 20031: # mak, mek, may, mey
90
+ return self._handle_mak_suffix(i, ids, prev_token, suffixes)
91
+
92
+ elif token_id == 20032: # acak, ecek, etc.
93
+ return self._handle_acak_suffix(i, ids, prev_token, suffixes)
94
+
95
+ return suffixes[0]
96
+
97
+ def _handle_la_le_suffix(self, prev_token: str, suffixes: List[str], end_of_word: bool) -> str:
98
+ """Handle la/le/yla/yle suffix selection."""
99
+ if self._ends_with_vowel(prev_token) and end_of_word:
100
+ return suffixes[3] if self._ends_with_ince(prev_token) else suffixes[2]
101
+ else:
102
+ return suffixes[1] if self._ends_with_ince(prev_token) else suffixes[0]
103
+
104
+ def _handle_da_de_suffix(self, prev_token: str, suffixes: List[str]) -> str:
105
+ """Handle da/de/ta/te suffix selection."""
106
+ if self._ends_with_sert_unsuz(prev_token):
107
+ return suffixes[3] if self._ends_with_ince(prev_token) else suffixes[2]
108
+ return suffixes[1] if self._ends_with_ince(prev_token) else suffixes[0]
109
+
110
+ def _handle_di_du_suffix(self, prev_token: str, suffixes: List[str]) -> str:
111
+ """Handle dı/di/du/dü suffix selection."""
112
+ base_index = self._get_vowel_suffix_index(prev_token)
113
+ return suffixes[base_index + 4] if self._ends_with_sert_unsuz(prev_token) else suffixes[base_index]
114
+
115
+ def _handle_lik_suffix(self, i: int, ids: List[int], prev_token: str, suffixes: List[str]) -> str:
116
+ """Handle lık/lik/luk/lük suffix selection."""
117
+ if i >= len(ids) - 1:
118
+ return suffixes[0]
119
+
120
+ next_token = self.reverse_dict[ids[i + 1]][0]
121
+ base_index = self._get_vowel_suffix_index(prev_token)
122
+ return suffixes[base_index + 4] if self._starts_with_vowel(next_token) else suffixes[base_index]
123
+
124
+ def _handle_cik_suffix(self, i: int, ids: List[int], prev_token: str, suffixes: List[str]) -> str:
125
+ """Handle cık/cik/cuk/cük suffix selection."""
126
+ if i >= len(ids) - 1:
127
+ return suffixes[0]
128
+
129
+ next_token = self.reverse_dict[ids[i + 1]][0]
130
+ base_index = self._get_vowel_suffix_index(prev_token)
131
+
132
+ if self._starts_with_vowel(next_token):
133
+ offset = 12 if self._ends_with_sert_unsuz(prev_token) else 8
134
+ else:
135
+ offset = 4 if self._ends_with_sert_unsuz(prev_token) else 0
136
+
137
+ return suffixes[base_index + offset]
138
+
139
+ def _handle_mak_suffix(self, i: int, ids: List[int], prev_token: str, suffixes: List[str]) -> str:
140
+ """Handle mak/mek/may/mey suffix selection."""
141
+ if i >= len(ids) - 1:
142
+ return suffixes[0]
143
+
144
+ next_token = self.reverse_dict[ids[i + 1]][0]
145
+ base_index = 1 if self._ends_with_ince(prev_token) else 0
146
+ return suffixes[base_index + 2] if self._starts_with_vowel(next_token) else suffixes[base_index]
147
+
148
+ def _handle_acak_suffix(self, i: int, ids: List[int], prev_token: str, suffixes: List[str]) -> str:
149
+ """Handle acak/ecek/yacak/yecek suffix selection."""
150
+ is_vowel_ending = self._ends_with_vowel(prev_token)
151
+ is_ince = self._ends_with_ince(prev_token)
152
+
153
+ is_vowel_starting = False
154
+ if i < len(ids) - 1:
155
+ next_token = self.reverse_dict[ids[i + 1]][0]
156
+ is_vowel_starting = self._starts_with_vowel(next_token)
157
+
158
+ if is_vowel_starting:
159
+ if is_vowel_ending:
160
+ return suffixes[7] if is_ince else suffixes[6]
161
+ else:
162
+ return suffixes[3] if is_ince else suffixes[2]
163
+ else:
164
+ if is_vowel_ending:
165
+ return suffixes[5] if is_ince else suffixes[4]
166
+ else:
167
+ return suffixes[1] if is_ince else suffixes[0]
168
+
169
+ def _select_correct_root(self, i: int, ids: List[int]) -> str:
170
+ """Select the correct root form based on morphological context."""
171
+ token_id = ids[i]
172
+
173
+ if i >= len(ids) - 2:
174
+ return self.reverse_dict[token_id][0]
175
+
176
+ next_token = self.reverse_dict[ids[i + 1]][0]
177
+
178
+ if 100 <= token_id < 2080:
179
+ if self._starts_with_vowel(next_token):
180
+ return self.reverse_dict[token_id][1]
181
+ elif token_id <= 110 and ids[i + 1] == 20034:
182
+ return self.reverse_dict[token_id][2]
183
+ else:
184
+ return self.reverse_dict[token_id][0]
185
+
186
+ elif 2080 <= token_id < 2315:
187
+ if ids[i + 1] == 20021: # yor
188
+ return self.reverse_dict[token_id][1]
189
+ else:
190
+ return self.reverse_dict[token_id][0]
191
+
192
+ return self.reverse_dict[token_id][0]
193
+
194
+ def decode(self, ids: List[int]) -> str:
195
+ """Decode a list of token IDs to text."""
196
+ if not ids:
197
+ return ""
198
+
199
+ text_parts = []
200
+ i = 0
201
+
202
+ while i < len(ids):
203
+ token_id = ids[i]
204
+ # Handle special tokens
205
+ if token_id == 0 and i < len(ids) - 1: # uppercase
206
+ next_token = self.reverse_dict[ids[i + 1]][0]
207
+ text_parts.append(next_token.capitalize())
208
+ i += 2
209
+ continue
210
+ elif token_id == 1: # unknown
211
+ text_parts.append("▁u▁")
212
+ elif token_id in self.reverse_dict:
213
+ tokens = self.reverse_dict[token_id]
214
+ if len(tokens) > 1 and i > 0:
215
+ if token_id < 20000: # root token
216
+ text_parts.append(self._select_correct_root(i, ids))
217
+ else: # suffix token
218
+ j = -1
219
+ prev_token = text_parts[j]
220
+ # while prev_token is not a word, get the previous token
221
+ while not prev_token.isalpha() and j > -len(text_parts):
222
+ prev_token = text_parts[j]
223
+ j -= 1
224
+ text_parts.append(self._select_correct_suffix(i, ids, prev_token))
225
+ else:
226
+ text_parts.append(tokens[0])
227
+ else:
228
+ text_parts.append("▁")
229
+
230
+ i += 1
231
+
232
+ return "".join(text_parts)
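
The suffix-selection logic above keys off two questions about the token to its left: does it end in a front (ince) vowel, and which of the four vowel classes does its last vowel belong to (`_get_vowel_suffix_index` returns 0-3). A small illustration of those helpers, assuming the module is importable as tr_decoder; it only exercises the private harmony checks, so an empty reverse_dict is enough:

```python
from tr_decoder import TRDecoder

dec = TRDecoder(reverse_dict={})  # the helpers below never touch reverse_dict

for word in ("kitap", "ev", "okul", "gönül"):
    print(word,
          dec._ends_with_ince(word),          # ends in a front vowel?
          dec._get_vowel_suffix_index(word))  # 0: a/ı, 1: e/i, 2: o/u, 3: otherwise (ö/ü)

# kitap False 0   (back unrounded harmony)
# ev    True  1   (front unrounded harmony)
# okul  False 2   (back rounded harmony)
# gönül True  3   (front rounded harmony)
```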
tr_tokenizer.py ADDED
@@ -0,0 +1,137 @@
1
+ import json
2
+ from enum import Enum
3
+ from typing import Dict, List, Optional, Tuple
4
+
5
+ from tr_decoder import TRDecoder
6
+
7
+
8
+ class TokenType(Enum):
9
+ ROOT = "ROOT"
10
+ SUFFIX = "SUFFIX"
11
+ BPE = "BPE"
12
+
13
+ class TRTokenizer:
14
+ def __init__(self):
15
+ with open("kokler.json", "r") as f:
16
+ roots = json.load(f)
17
+ with open("ekler.json", "r") as f:
18
+ suffixes = json.load(f)
19
+ with open("bpe_tokenler.json", "r") as f:
20
+ bpe_tokens = json.load(f)
21
+ self.reverse_dict = {}
22
+
23
+ for key, value in roots.items():
24
+ if value not in self.reverse_dict:
25
+ self.reverse_dict[value] = []
26
+ self.reverse_dict[value].append(key)
27
+ for key, value in suffixes.items():
28
+ if value not in self.reverse_dict:
29
+ self.reverse_dict[value] = []
30
+ self.reverse_dict[value].append(key)
31
+ for key, value in bpe_tokens.items():
32
+ if value not in self.reverse_dict:
33
+ self.reverse_dict[value] = []
34
+ self.reverse_dict[value].append(key)
35
+
36
+ self.decoder = TRDecoder(self.reverse_dict)
37
+
38
+ self.roots = roots
39
+ self.suffixes = suffixes
40
+ self.bpe_tokens = bpe_tokens
41
+ self.max_root_len = max(len(k) for k in roots) if roots else 0
42
+ self.max_suffix_len = max(len(k) for k in suffixes) if suffixes else 0
43
+ self.max_bpe_len = max(len(k) for k in bpe_tokens) if bpe_tokens else 0
44
+
45
+ self.uppercase_marker = {"token": "<uppercase>", "id": 0, "type": TokenType.ROOT}
46
+ self.unknown_marker = {"token": "<unknown>", "id": 1, "type": TokenType.ROOT}
47
+ self.space_marker = {"token": " ", "id": 2, "type": TokenType.ROOT}
48
+
49
+ def _tokenize_word(self, word: str) -> Tuple[List[dict], List[int]]:
50
+ uppercase_indices = [i for i, c in enumerate(word) if c.isupper()]
51
+ result = []
52
+
53
+ segments = self._camel_split_with_positions(word)
54
+
55
+ for seg, orig_pos in segments:
56
+ if orig_pos < len(word) and word[orig_pos].isupper():
57
+ result.append(self.uppercase_marker)
58
+
59
+ s = seg
60
+ pos = 0
61
+
62
+ while pos < len(s):
63
+ substr = s[pos:]
64
+
65
+ rid, rtok = self._longest_prefix_lookup(substr, self.roots, self.max_root_len)
66
+ if rid is not None:
67
+ result.append({"token": rtok, "id": rid, "type": TokenType.ROOT})
68
+ pos += len(rtok)
69
+ continue
70
+
71
+ sid, stok = self._longest_prefix_lookup(substr, self.suffixes, self.max_suffix_len)
72
+ if sid is not None:
73
+ result.append({"token": stok, "id": sid, "type": TokenType.SUFFIX})
74
+ pos += len(stok)
75
+ continue
76
+
77
+ bid, btok = self._longest_prefix_lookup(substr, self.bpe_tokens, self.max_bpe_len)
78
+ if bid is not None:
79
+ result.append({"token": btok, "id": bid, "type": TokenType.BPE})
80
+ pos += len(btok)
81
+ continue
82
+
83
+ result.append(self.unknown_marker)
84
+ pos += 1
85
+
86
+ return result, uppercase_indices
87
+
88
+ def tokenize_text(self, text: str) -> Tuple[List[dict], List[int]]:
89
+ final_tokens = []
90
+ uppercase_indices = [i for i, c in enumerate(text) if c.isupper()]
91
+
92
+ parts = text.split(" ")
93
+ for idx, part in enumerate(parts):
94
+ if part.strip():
95
+ tokens, _ = self._tokenize_word(part)
96
+ final_tokens.extend(tokens)
97
+ if idx < len(parts) - 1:
98
+ final_tokens.append(self.space_marker)
99
+
100
+ return final_tokens, uppercase_indices
101
+
102
+ def encode(self, text: str) -> List[int]:
103
+ tokens, _ = self.tokenize_text(text)
104
+ return [t["id"] for t in tokens]
105
+
106
+ def tokenize(self, text: str) -> List[str]:
107
+ tokens, _ = self.tokenize_text(text)
108
+ return [t["token"] for t in tokens]
109
+
110
+ def _longest_prefix_lookup(self, s: str, table: Dict[str, int], max_len: int = None) -> Tuple[Optional[int], str]:
111
+ end = min(len(s), max_len) if max_len else len(s)
112
+ for i in range(end, 0, -1):
113
+ cand = s[:i]
114
+ if cand in table:
115
+ return table[cand], cand
116
+ return None, ""
117
+
118
+ def _camel_split_with_positions(self, word: str) -> List[Tuple[str, int]]:
119
+ if not word:
120
+ return []
121
+
122
+ parts = []
123
+ start = 0
124
+
125
+ for i in range(1, len(word)):
126
+ if word[i].isupper():
127
+ if start < i:
128
+ parts.append((word[start:i].lower(), start))
129
+ start = i
130
+
131
+ if start < len(word):
132
+ parts.append((word[start:].lower(), start))
133
+
134
+ return parts if parts else [(word.lower(), 0)]
135
+
136
+ def decode(self, ids: List[int]) -> str:
137
+ return TRDecoder(self.reverse_dict).decode(ids)
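
Casing is handled structurally: `_camel_split_with_positions` lowercases each segment, and the tokenizer emits an `<uppercase>` marker (ID 0) in front of every segment that was originally capitalized, which `TRDecoder.decode` turns back into a `capitalize()` call on the following token. A short usage sketch, assuming the three JSON vocabularies from this commit are in the working directory (the exact segmentation depends on those files):

```python
from tr_tokenizer import TRTokenizer

tok = TRTokenizer()

# Camel-cased input: each capitalized segment is preceded by "<uppercase>".
print(tok.tokenize("KitapOkumak güzeldir"))

ids = tok.encode("KitapOkumak güzeldir")
print(ids)              # the <uppercase> marker shows up as ID 0
print(tok.decode(ids))  # the decoder re-capitalizes the marked segments
```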