Implement Gradio interface for Turkish Tokenizer, replacing Streamlit; update requirements to include Gradio.

Files changed:
- app.py +166 -247
- bpe_tokenler.json +0 -0
- ekler.json +363 -0
- kokler.json +0 -0
- requirements.txt +1 -5
- tr_decoder.py +232 -0
- tr_tokenizer.py +137 -0
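For orientation before the app.py diff below, here is a minimal usage sketch of the new module layout. The TRTokenizer calls (tokenize_text, encode, decode) and the assumption that the class reads kokler.json, ekler.json and bpe_tokenler.json from the working directory are taken from the new app.py in this commit, not verified against tr_tokenizer.py itself.

# Minimal sketch of how app.py exercises the new tokenizer modules (see the diff below).
# Assumes tr_tokenizer.py, tr_decoder.py and the .json vocabularies sit in the working directory.
from tr_tokenizer import TokenType, TRTokenizer

tokenizer = TRTokenizer()  # presumably loads kokler.json, ekler.json and bpe_tokenler.json

tokens, _ = tokenizer.tokenize_text("Akademisyenler ve aileleri birlikte çalışıyorlar.")
for t in tokens:
    print(t["token"], t["type"].name)  # TokenType.ROOT / SUFFIX / BPE

ids = tokenizer.encode("Merhaba Dünya!")
print(ids)
print(tokenizer.decode(ids))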
    	
app.py CHANGED
@@ -1,255 +1,174 @@
-if 'text' not in st.session_state:
-    st.session_state.text = "Akademisyenler ve aileleri birlikte çalışıyorlar."
-if 'token_results' not in st.session_state:
-    st.session_state.token_results = None
-
-# Constants
-GITHUB_REPO = "malibayram/tokenizer"
-GITHUB_BRANCH = "main"
-
-# Special tokens and their IDs
-SPECIAL_TOKENS = {
-    "<uppercase>": 0,    # Uppercase letter marker
-    "<space>": 1,       # Space character
-    "<newline>": 2,     # Newline character
-    "<tab>": 3,         # Tab character
-    "<unknown>": 4      # Unknown token
-}
-
-# Token ID ranges
-TOKEN_RANGES = {
-    'special': (0, 4),          # Special tokens
-    'root_words': (5, 20000),   # Root words
-    'suffixes': (22268, 22767), # Suffixes
-    'bpe': (20000, None)        # BPE tokens (20000+)
-}
-
-    return colors
-
-def fetch_github_file(path, ref=GITHUB_BRANCH):
-    """Fetch file content from GitHub repository."""
-    url = f"https://api.github.com/repos/{GITHUB_REPO}/contents/{path}?ref={ref}"
-    response = requests.get(url)
-    if response.status_code == 200:
-        content = base64.b64decode(response.json()['content']).decode('utf-8')
-        return content
-    else:
-        st.error(f"Could not fetch {path} from GitHub: {response.status_code}")
-        return None
-
-@st.cache_resource
-def load_tokenizer():
-    """Load and initialize the tokenizer from GitHub."""
-    temp_dir = Path("temp_tokenizer")
-    temp_dir.mkdir(exist_ok=True)
-
-    # Fetch required files
-    for local_name, github_path in REQUIRED_FILES.items():
-        content = fetch_github_file(github_path)
-        if content is None:
-            return None
-
-        with open(temp_dir / local_name, 'w', encoding='utf-8') as f:
-            f.write(content)
-
-    # Modify tokenizer to use correct paths
-    tokenizer_path = temp_dir / "tokenizer.py"
-    with open(tokenizer_path, 'r', encoding='utf-8') as f:
-        tokenizer_code = f.read()
-
-    modified_code = tokenizer_code.replace(
-        'def load_json(filename):',
-        f'''def load_json(filename):
-    full_path = os.path.join("{temp_dir.absolute()}", filename)
-    with open(full_path, 'r', encoding='utf-8') as file:
-        return json.load(file)'''
-    )
-
-    # Load module
-    spec = importlib.util.spec_from_file_location("tokenizer", str(temp_dir / "tokenizer.py"))
-    module = importlib.util.module_from_spec(spec)
-    sys.modules["tokenizer"] = module
-    spec.loader.exec_module(module)
-
-    return module.tokenize
-
-@st.cache_data(ttl=3600)
-def get_commit_history():
-    """Fetch commit history from GitHub."""
-    url = f"https://api.github.com/repos/{GITHUB_REPO}/commits"
-    try:
-        response = requests.get(url)
-        if response.status_code == 200:
-            commits = response.json()
-            versions = []
-            for commit in commits[:10]:
-                date = datetime.strptime(commit['commit']['author']['date'], '%Y-%m-%dT%H:%M:%SZ').strftime('%Y-%m-%d')
-                sha = commit['sha'][:7]
-                message = commit['commit']['message'].split('\n')[0][:50]
-                versions.append(f"{date} - {sha} - {message}")
-            return versions
-        return ["latest"]
-    except Exception as e:
-        st.warning(f"Could not fetch commit history: {str(e)}")
-        return ["latest"]
-
-def render_tokens(tokens, token_colors):
-    """Render colored token visualization."""
-    html_tokens = []
-    for token in tokens:
-        color = token_colors[token]
-        display_text = SPECIAL_TOKEN_SYMBOLS.get(token, token)  # Use symbol for special tokens
-        html_tokens.append(
-            f'<span style="background-color: {color}; padding: 2px 4px; margin: 2px; border-radius: 3px;" title="{token}">{display_text}</span>'
-        )
-    return " ".join(html_tokens)
-
-# Load tokenizer
-tokenize = load_tokenizer()
-if tokenize is None:
-    st.error("Failed to load tokenizer from GitHub")
-    st.stop()
-
-# Tokenize example text on startup if no results exist
-if st.session_state.token_results is None and st.session_state.text:
-    try:
-        st.session_state.token_results = tokenize(st.session_state.text)
-    except Exception as e:
-        st.error(f"Error tokenizing text: {str(e)}")
-
-# UI Layout
-st.title("🇹🇷 Turkish Tiktokenizer")
-
-# Model selection
-versions = get_commit_history()
-model = st.selectbox("", versions, key="model_selection", label_visibility="collapsed")
-
-# Main layout
-col1, col2 = st.columns([0.4, 0.6])
-
-# Input column
-with col1:
-    text = st.text_area(
-        "Enter Turkish text to tokenize",
-        value=st.session_state.text,
-        height=200,
-        key="text_input",
-        label_visibility="collapsed",
-        placeholder="Enter Turkish text to tokenize"
-    )
-
-            st.session_state.token_results = None
-
-# Results column
-with col2:
-    st.markdown("Token count")
-    if st.session_state.token_results is not None:
-        result = st.session_state.token_results
-        token_count = len(result["tokens"])
-        st.markdown(f"### {token_count}")
-
-        st.markdown("Tokenized text")
-
-        # Generate token colors
-        regular_tokens = [t for t in result["tokens"] if t not in SPECIAL_COLORS]
-        regular_token_colors = dict(zip(regular_tokens, generate_colors(len(regular_tokens))))
-        token_colors = {**SPECIAL_COLORS, **regular_token_colors}
-
-        # Render tokens
-        with st.container():
-            st.markdown(render_tokens(result["tokens"], token_colors), unsafe_allow_html=True)
-
-        st.markdown("Token IDs")
-        st.code(", ".join(map(str, result["ids"])), language=None)
-    else:
-        st.markdown("### 0")
-        st.markdown("Tokenized text")
-        st.markdown("")
-        st.markdown("Token IDs")
-        st.text("")
-
-# Footer
-st.markdown("""
-<div style="position: fixed; bottom: 0; width: 100%; text-align: center; padding: 10px; background-color: white;">
-    <a href="https://github.com/malibayram/tokenizer" target="_blank">View on GitHub</a>
-</div>
-""", unsafe_allow_html=True)
-
-# Cleanup
-def cleanup():
-    if Path("temp_tokenizer").exists():
-        shutil.rmtree("temp_tokenizer")
-
-atexit.register(cleanup)
+import gradio as gr
+
+# Assuming tr_tokenizer.py contains both TRTokenizer and TokenType
+# and that it correctly imports TRDecoder from tr_decoder.py.
+# Make sure tr_tokenizer.py, tr_decoder.py, and your .json files
+# are in the same directory as this app.py file.
+from tr_tokenizer import TokenType, TRTokenizer
+
+# --- Gradio App ---
+
+# Instantiate the tokenizer
+# This will now load directly from your existing .json files
+# as defined in your TRTokenizer class.
+tokenizer = TRTokenizer()
+
+# Define colors for each token type (dark theme)
+dark_color_map = {
+    TokenType.ROOT.name: "#FF6B6B",      # Darker Red
+    TokenType.SUFFIX.name: "#4ECDC4",    # Teal
+    TokenType.BPE.name: "#FFE66D",       # Darker Yellow
+}
+
+def tokenize_and_display(text, theme="light"):
+    """
+    Tokenizes the input text and prepares it for display in Gradio's HighlightedText component.
+    """
+    if not text:
+        # Return a structure that matches all outputs to avoid errors
+        return [], "", "", "", theme
+
+    tokens, _ = tokenizer.tokenize_text(text)
+
+    # Create the list of (token, label) for HighlightedText
+    highlighted_tokens = []
+    token_stats = {"ROOT": 0, "SUFFIX": 0, "BPE": 0}
+
+    for t in tokens:
+        token_text = t["token"]
+        token_type = t["type"].name
+
+        # Count token types for statistics
+        token_stats[token_type] = token_stats.get(token_type, 0) + 1
+
+        highlighted_tokens.append((token_text, token_type))
+
+    encoded_ids = tokenizer.encode(text)
+    decoded_text = tokenizer.decode(encoded_ids)
+
+    # Calculate statistics
+    total_tokens = len(tokens)
+    total_chars = len(text)
+    compression_ratio = (1 - total_tokens / total_chars) * 100 if total_chars > 0 else 0
+
+    # Define theme-specific colors for the stats block
+    bg_col, text_col, card_col, border_col = ('#2d3748', '#f7fafc', '#4a5568', '#718096')
+
+    # Create statistics HTML
+    stats_html = f"""
+    <div style="background:{bg_col};padding:20px;border-radius:12px;margin:20px 0;">
+        <h4 style="color:{text_col};margin-bottom:15px;">📊 Tokenization Statistics</h4>
+        <div style="display:grid;grid-template-columns:repeat(auto-fit,minmax(150px,1fr));gap:15px;margin-bottom:20px;">
+            <div style="background:{card_col};padding:15px;border-radius:8px;text-align:center;border:1px solid {border_col};"><div style="font-size:24px;font-weight:bold;color:#3b82f6;">{total_chars}</div><div style="color:{'#64748b' if theme == 'light' else '#a0aec0'};font-size:14px;">Characters</div></div>
+            <div style="background:{card_col};padding:15px;border-radius:8px;text-align:center;border:1px solid {border_col};"><div style="font-size:24px;font-weight:bold;color:#10b981;">{total_tokens}</div><div style="color:{'#64748b' if theme == 'light' else '#a0aec0'};font-size:14px;">Tokens</div></div>
+            <div style="background:{card_col};padding:15px;border-radius:8px;text-align:center;border:1px solid {border_col};"><div style="font-size:24px;font-weight:bold;color:#f59e0b;">{compression_ratio:.1f}%</div><div style="color:{'#64748b' if theme == 'light' else '#a0aec0'};font-size:14px;">Compression</div></div>
+        </div>
+        <div>
+            <h5 style="color:{text_col};margin-bottom:10px;">Token Type Distribution:</h5>
+            <div style="display:flex;gap:15px;flex-wrap:wrap;">
+                <div style="background:#FFADAD;color:#2d3748;padding:8px 12px;border-radius:6px;font-size:14px;font-weight:600;">🔴 Roots: {token_stats['ROOT']}</div>
+                <div style="background:#A0C4FF;color:#2d3748;padding:8px 12px;border-radius:6px;font-size:14px;font-weight:600;">🔵 Suffixes: {token_stats['SUFFIX']}</div>
+                <div style="background:#FDFFB6;color:#2d3748;padding:8px 12px;border-radius:6px;font-size:14px;font-weight:600;">🟡 BPE: {token_stats['BPE']}</div>
+            </div>
+        </div>
+    </div>"""
+    return highlighted_tokens, str(encoded_ids), decoded_text, stats_html, theme
+
+# Custom CSS for better styling
+custom_css = """
+.gradio-container{font-family:'Inter',-apple-system,BlinkMacSystemFont,sans-serif;}
+.custom-button{background:linear-gradient(135deg,#667eea 0%,#764ba2 100%);border:none;border-radius:8px;padding:12px 24px;color:white;font-weight:600;transition:all .3s ease;}
+.custom-button:hover{transform:translateY(-2px);box-shadow:0 8px 25px rgba(0,0,0,.15);}
+.theme-toggle{background:linear-gradient(135deg,#f093fb 0%,#f5576c 100%);border:none;border-radius:50px;padding:10px 20px;color:white;font-weight:600;transition:all .3s ease;}
+.theme-toggle:hover{transform:scale(1.05);box-shadow:0 4px 15px rgba(0,0,0,.2);}
+.input-textbox{border-radius:12px!important;border:2px solid #e2e8f0!important;transition:all .3s ease;}
+.input-textbox:focus{border-color:#667eea!important;box-shadow:0 0 0 3px rgba(102,126,234,.1)!important;}
+.dark .gradio-container{background:#1a202c!important;}
+.dark .input-textbox{background:#2d3748!important;border-color:#4a5568!important;color:#f7fafc!important;}
+"""
+
+# Create the Gradio Interface
+with gr.Blocks(theme=gr.themes.Soft(), title="Turkish Tokenizer", css=custom_css) as demo:
+    with gr.Row():
+        with gr.Column(scale=3):
+            gr.Markdown("""
+                # Turkish Tokenizer
+                ### Advanced Turkish Text Tokenization with Visual Analysis
+                Enter text to see how it's tokenized. Tokens are color-coded by type.
+            """)
+
+    theme_state = gr.State("light")
+
+    input_text = gr.Textbox(
+        label="📝 Input Text",
+        placeholder="Merhaba Dünya, kitapları okumak güzeldir.",
+        lines=4,
+        elem_classes=["input-textbox"]
+    )
+
+    with gr.Row():
+        process_button = gr.Button("🚀 Tokenize", variant="primary", elem_classes=["custom-button"], size="lg")
+        clear_button = gr.Button("🗑️ Clear", variant="secondary", size="lg")
+
+    gr.Markdown("---")
+    gr.Markdown("### 🔄 Encoded & Decoded Output")
+    with gr.Row():
+        encoded_output = gr.Textbox(label="🔢 Encoded Token IDs", interactive=False, lines=2)
+        decoded_output = gr.Textbox(label="📝 Decoded Text", interactive=False, lines=2)
+
+    gr.Markdown("### 💡 Example Texts")
+    gr.Examples(
+        examples=[
+            ["Merhaba Dünya! Bu bir gelişmiş Türkçe tokenizer testidir."],
+            ["İstanbul'da yaşıyorum ve Türkçe dilini öğreniyorum."],
+            ["KitapOkumak çok güzeldir ve bilgi verir."],
+            ["Türkiye Cumhuriyeti'nin başkenti Ankara'dır."],
+            ["Yapay zeka ve makine öğrenmesi teknolojileri gelişiyor."],
+        ],
+        inputs=input_text,
+        label="Try these examples:"
+    )
+
+    gr.Markdown("---")
+    gr.Markdown("### 🎨 Tokenization Output")
+    highlighted_output = gr.HighlightedText(
+        label="Colorized Tokens",
+        color_map=dark_color_map, # This will be updated dynamically if needed
+        show_legend=True
+    )
+
+    gr.Markdown("---")
+    gr.Markdown("### 📊 Statistics")
+    stats_output = gr.HTML(label="")
+
+    gr.Markdown("--- \n **Turkish Tokenizer Pro** - Advanced tokenization for Turkish text.")
+
+    # --- Event Handlers ---
+    def process_with_theme(text, theme):
+        return tokenize_and_display(text, theme)
+
+    def clear_all():
+        return "", [], "", "", ""
+
+    # Connect the buttons to the functions
+    process_button.click(
+        fn=process_with_theme,
+        inputs=[input_text, theme_state],
+        outputs=[highlighted_output, encoded_output, decoded_output, stats_output, theme_state]
+    )
+
+    clear_button.click(
+        fn=clear_all,
+        outputs=[input_text, highlighted_output, encoded_output, decoded_output, stats_output]
+    )
+
+    # Auto-process on load with a default example
+    demo.load(
+        fn=lambda theme: tokenize_and_display("Merhaba Dünya!", theme),
+        inputs=[theme_state],
+        outputs=[highlighted_output, encoded_output, decoded_output, stats_output, theme_state]
+    )
+
+if __name__ == "__main__":
+    demo.launch(show_error=True)
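The handler above can also be exercised outside the UI for a quick sanity check. A small sketch follows; the five return values mirror the outputs wired to the Gradio components, and the actual values depend on the vocabulary files shipped with the Space.

# Call the handler directly with the same arguments the Gradio UI passes.
highlighted, ids_str, decoded, stats_html, theme = tokenize_and_display("Merhaba Dünya!", theme="light")
print(len(highlighted), "tokens")  # list of (token_text, token_type_name) pairs
print(ids_str)                     # stringified list of token IDs
print(decoded)                     # round-tripped text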
    	
bpe_tokenler.json ADDED

The diff for this file is too large to render. See raw diff.
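The vocabulary files added in this commit are plain JSON maps from token string to integer ID; ekler.json is rendered below, while bpe_tokenler.json is too large to show. A minimal inspection sketch, assuming the same string-to-ID layout in each file:

import json

# Load the suffix vocabulary (ekler.json) added in this commit and look at a few entries.
with open("ekler.json", encoding="utf-8") as f:
    suffixes = json.load(f)

print(suffixes["lar"], suffixes["ler"])  # vowel-harmony variants share an ID (both 20000)
print(len(suffixes), "suffix entries")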
    	
ekler.json ADDED
@@ -0,0 +1,363 @@
{
  "lar": 20000,
  "ler": 20000,
  "ya": 20001,
  "ye": 20001,
  "ma": 20002,
  "me": 20002,
  "malı": 20003,
  "meli": 20003,
  "laş": 20004,
  "leş": 20004,
  "ça": 20005,
  "çe": 20005,
  "şar": 20006,
  "şer": 20006,
  "kan": 20007,
  "ken": 20007,
  "lak": 20008,
  "lek": 20008,
  "layın": 20009,
  "leyin": 20009,
  "sak": 20010,
  "sek": 20010,
  "arak": 20011,
  "erek": 20011,
  "an": 20012,
  "en": 20012,
  "ım": 20013,
  "im": 20013,
  "um": 20013,
  "üm": 20013,
  "ız": 20014,
  "iz": 20014,
  "uz": 20014,
  "üz": 20014,
  "sı": 20015,
  "si": 20015,
  "su": 20015,
  "sü": 20015,
  "mış": 20016,
  "miş": 20016,
  "muş": 20016,
  "müş": 20016,
  "yı": 20017,
  "yi": 20017,
  "yu": 20017,
  "yü": 20017,
  "lı": 20018,
  "li": 20018,
  "lu": 20018,
  "lü": 20018,
  "sız": 20019,
  "siz": 20019,
  "suz": 20019,
  "süz": 20019,
  "ncı": 20020,
  "nci": 20020,
  "ncu": 20020,
  "ncü": 20020,
  "ın": 20021,
  "in": 20021,
  "un": 20021,
  "ün": 20021,
  "nın": 20022,
  "nin": 20022,
  "nun": 20022,
  "nün": 20022,
  "la": 20023,
  "le": 20023,
  "yla": 20023,
  "yle": 20023,
  "da": 20024,
  "de": 20024,
  "ta": 20024,
  "te": 20024,
  "dan": 20025,
  "den": 20025,
  "tan": 20025,
  "ten": 20025,
  "dı": 20026,
  "di": 20026,
  "du": 20026,
  "dü": 20026,
  "tı": 20026,
  "ti": 20026,
  "tu": 20026,
  "tü": 20026,
  "cı": 20027,
  "ci": 20027,
  "cu": 20027,
  "cü": 20027,
  "çı": 20027,
  "çi": 20027,
  "çu": 20027,
  "çü": 20027,
  "dır": 20028,
  "dir": 20028,
  "dur": 20028,
  "dür": 20028,
  "tır": 20028,
  "tir": 20028,
  "tur": 20028,
  "tür": 20028,
  "lık": 20029,
  "lik": 20029,
  "luk": 20029,
  "lük": 20029,
  "lığ": 20029,
  "liğ": 20029,
  "luğ": 20029,
  "lüğ": 20029,
  "cık": 20030,
  "cik": 20030,
  "cuk": 20030,
  "cük": 20030,
  "çık": 20030,
  "çik": 20030,
  "çuk": 20030,
  "çük": 20030,
  "cığ": 20030,
  "ciğ": 20030,
  "cuğ": 20030,
  "cüğ": 20030,
  "çığ": 20030,
  "çiğ": 20030,
  "çuğ": 20030,
  "çüğ": 20030,
  "mak": 20031,
  "mek": 20031,
  "may": 20031,
  "mey": 20031,
  "acak": 20032,
  "ecek": 20032,
  "acağ": 20032,
  "eceğ": 20032,
  "yacak": 20032,
  "yecek": 20032,
  "yacağ": 20032,
  "yeceğ": 20032,
  "i": 20033,
  "ı": 20034,
  "u": 20035,
  "ü": 20036,
  "a": 20037,
  "e": 20038,
  "m": 20039,
  "n": 20040,
  "yor": 20041,
  "ar": 20042,
  "er": 20043,
  "sa": 20044,
  "se": 20045,
  "r": 20046,
  "ce": 20047,
  "daş": 20048,
  "deş": 20049,
  "msı": 20050,
  "msi": 20051,
  "msu": 20052,
  "gil": 20053,
  "ımsa": 20054,
  "ıcık": 20055,
  "nç": 20056,
  "sal": 20057,
  "sel": 20058,
  "ki": 20059,
  "y": 20060,
  "idi": 20061,
  "imiş": 20062,
  "ise": 20063,
  "s": 20064,
  "gül": 20065,
  "kıl": 20066,
  "kil": 20067,
  "ka": 20068,
  "ge": 20069,
  "z": 20070,
  "ek_temp_20071": 20071,
  "ek_temp_20072": 20072,
  "ek_temp_20073": 20073,
  "ek_temp_20074": 20074,
  "ek_temp_20075": 20075,
  "ek_temp_20076": 20076,
  "ek_temp_20077": 20077,
  "ek_temp_20078": 20078,
  "ek_temp_20079": 20079,
  "ek_temp_20080": 20080,
  "ek_temp_20081": 20081,
  "ek_temp_20082": 20082,
  "ek_temp_20083": 20083,
  "ek_temp_20084": 20084,
  "ek_temp_20085": 20085,
  "ek_temp_20086": 20086,
  "ek_temp_20087": 20087,
  "ek_temp_20088": 20088,
  "ek_temp_20089": 20089,
  "ek_temp_20090": 20090,
  "ek_temp_20091": 20091,
  "ek_temp_20092": 20092,
  "ek_temp_20093": 20093,
  "ek_temp_20094": 20094,
  "ek_temp_20095": 20095,
  "ek_temp_20096": 20096,
  "ek_temp_20097": 20097,
  "ek_temp_20098": 20098,
  "ek_temp_20099": 20099,
  "ek_temp_20100": 20100,
  "ek_temp_20101": 20101,
  "ek_temp_20102": 20102,
  "ek_temp_20103": 20103,
  "ek_temp_20104": 20104,
  "ek_temp_20105": 20105,
  "ek_temp_20106": 20106,
  "ek_temp_20107": 20107,
  "ek_temp_20108": 20108,
  "ek_temp_20109": 20109,
  "ek_temp_20110": 20110,
  "ek_temp_20111": 20111,
  "ek_temp_20112": 20112,
  "ek_temp_20113": 20113,
  "ek_temp_20114": 20114,
  "ek_temp_20115": 20115,
  "ek_temp_20116": 20116,
  "ek_temp_20117": 20117,
  "ek_temp_20118": 20118,
  "ek_temp_20119": 20119,
  "ek_temp_20120": 20120,
  "ek_temp_20121": 20121,
  "ek_temp_20122": 20122,
  "ek_temp_20123": 20123,
  "ek_temp_20124": 20124,
  "ek_temp_20125": 20125,
  "ek_temp_20126": 20126,
  "ek_temp_20127": 20127,
  "ek_temp_20128": 20128,
  "ek_temp_20129": 20129,
  "ek_temp_20130": 20130,
  "ek_temp_20131": 20131,
  "ek_temp_20132": 20132,
  "ek_temp_20133": 20133,
  "ek_temp_20134": 20134,
  "ek_temp_20135": 20135,
  "ek_temp_20136": 20136,
  "ek_temp_20137": 20137,
  "ek_temp_20138": 20138,
  "ek_temp_20139": 20139,
  "ek_temp_20140": 20140,
  "ek_temp_20141": 20141,
  "ek_temp_20142": 20142,
  "ek_temp_20143": 20143,
  "ek_temp_20144": 20144,
  "ek_temp_20145": 20145,
  "ek_temp_20146": 20146,
  "ek_temp_20147": 20147,
  "ek_temp_20148": 20148,
  "ek_temp_20149": 20149,
  "ek_temp_20150": 20150,
  "ek_temp_20151": 20151,
  "ek_temp_20152": 20152,
  "ek_temp_20153": 20153,
  "ek_temp_20154": 20154,
  "ek_temp_20155": 20155,
  "ek_temp_20156": 20156,
  "ek_temp_20157": 20157,
  "ek_temp_20158": 20158,
  "ek_temp_20159": 20159,
  "ek_temp_20160": 20160,
  "ek_temp_20161": 20161,
  "ek_temp_20162": 20162,
  "ek_temp_20163": 20163,
  "ek_temp_20164": 20164,
  "ek_temp_20165": 20165,
  "ek_temp_20166": 20166,
  "ek_temp_20167": 20167,
  "ek_temp_20168": 20168,
  "ek_temp_20169": 20169,
  "ek_temp_20170": 20170,
  "ek_temp_20171": 20171,
  "ek_temp_20172": 20172,
  "ek_temp_20173": 20173,
  "ek_temp_20174": 20174,
  "ek_temp_20175": 20175,
  "ek_temp_20176": 20176,
  "ek_temp_20177": 20177,
  "ek_temp_20178": 20178,
  "ek_temp_20179": 20179,
  "ek_temp_20180": 20180,
  "ek_temp_20181": 20181,
  "ek_temp_20182": 20182,
  "ek_temp_20183": 20183,
  "ek_temp_20184": 20184,
  "ek_temp_20185": 20185,
  "ek_temp_20186": 20186,
  "ek_temp_20187": 20187,
  "ek_temp_20188": 20188,
  "ek_temp_20189": 20189,
  "ek_temp_20190": 20190,
  "ek_temp_20191": 20191,
  "ek_temp_20192": 20192,
  "ek_temp_20193": 20193,
  "ek_temp_20194": 20194,
  "ek_temp_20195": 20195,
  "ek_temp_20196": 20196,
  "ek_temp_20197": 20197,
  "ek_temp_20198": 20198,
  "ek_temp_20199": 20199,
  "ek_temp_20200": 20200,
  "ek_temp_20201": 20201,
  "ek_temp_20202": 20202,
  "ek_temp_20203": 20203,
  "ek_temp_20204": 20204,
  "ek_temp_20205": 20205,
  "ek_temp_20206": 20206,
  "ek_temp_20207": 20207,
  "ek_temp_20208": 20208,
  "ek_temp_20209": 20209,
  "ek_temp_20210": 20210,
  "ek_temp_20211": 20211,
  "ek_temp_20212": 20212,
  "ek_temp_20213": 20213,
  "ek_temp_20214": 20214,
  "ek_temp_20215": 20215,
  "ek_temp_20216": 20216,
  "ek_temp_20217": 20217,
  "ek_temp_20218": 20218,
  "ek_temp_20219": 20219,
  "ek_temp_20220": 20220,
  "ek_temp_20221": 20221,
  "ek_temp_20222": 20222,
  "ek_temp_20223": 20223,
  "ek_temp_20224": 20224,
  "ek_temp_20225": 20225,
  "ek_temp_20226": 20226,
  "ek_temp_20227": 20227,
  "ek_temp_20228": 20228,
         | 
| 336 | 
            +
              "ek_temp_20229": 20229,
         | 
| 337 | 
            +
              "ek_temp_20230": 20230,
         | 
| 338 | 
            +
              "ek_temp_20231": 20231,
         | 
| 339 | 
            +
              "ek_temp_20232": 20232,
         | 
| 340 | 
            +
              "ek_temp_20233": 20233,
         | 
| 341 | 
            +
              "ek_temp_20234": 20234,
         | 
| 342 | 
            +
              "ek_temp_20235": 20235,
         | 
| 343 | 
            +
              "ek_temp_20236": 20236,
         | 
| 344 | 
            +
              "ek_temp_20237": 20237,
         | 
| 345 | 
            +
              "ek_temp_20238": 20238,
         | 
| 346 | 
            +
              "ek_temp_20239": 20239,
         | 
| 347 | 
            +
              "ek_temp_20240": 20240,
         | 
| 348 | 
            +
              "ek_temp_20241": 20241,
         | 
| 349 | 
            +
              "ek_temp_20242": 20242,
         | 
| 350 | 
            +
              "ek_temp_20243": 20243,
         | 
| 351 | 
            +
              "ek_temp_20244": 20244,
         | 
| 352 | 
            +
              "ek_temp_20245": 20245,
         | 
| 353 | 
            +
              "ek_temp_20246": 20246,
         | 
| 354 | 
            +
              "ek_temp_20247": 20247,
         | 
| 355 | 
            +
              "ek_temp_20248": 20248,
         | 
| 356 | 
            +
              "ek_temp_20249": 20249,
         | 
| 357 | 
            +
              "ek_temp_20250": 20250,
         | 
| 358 | 
            +
              "ek_temp_20251": 20251,
         | 
| 359 | 
            +
              "ek_temp_20252": 20252,
         | 
| 360 | 
            +
              "ek_temp_20253": 20253,
         | 
| 361 | 
            +
              "ek_temp_20254": 20254,
         | 
| 362 | 
            +
              "ek_temp_20255": 20255
         | 
| 363 | 
            +
            }
         | 
    	
kokler.json
ADDED
The diff for this file is too large to render. See raw diff
    	
requirements.txt
CHANGED
@@ -1,5 +1 @@
-
-numpy>=1.21.0
-json5>=0.9.0
-requests>=2.31.0
-pathlib>=1.0.1
+gradio
    	
tr_decoder.py
ADDED
@@ -0,0 +1,232 @@
from typing import List


class TRDecoder:
    # Define vowel sets as class constants for better performance
    ALL_VOWELS = "aeıioöuü"
    INCE_VOWELS = "eiöü"  # Front vowels
    AI_VOWELS = "aı"      # Back unrounded
    EI_VOWELS = "ei"      # Front unrounded
    OU_VOWELS = "ou"      # Back rounded
    HARD_CONSONANTS = "fstkçşhp"  # Sert ünsüzler (voiceless consonants)
    WHITESPACE = " \n\t"

    def __init__(self, reverse_dict):
        self.reverse_dict = reverse_dict

    def _starts_with_vowel(self, word: str) -> bool:
        """Check if word starts with a vowel."""
        return word and word[0] in self.ALL_VOWELS

    def _ends_with_vowel(self, word: str) -> bool:
        """Check if word ends with a vowel."""
        return word and word[-1] in self.ALL_VOWELS

    def _ends_with_any(self, word: str, charset: str) -> bool:
        # Walk back from the end until the first vowel; report whether a
        # charset character appears before (or as) that vowel.
        i = len(word) - 1
        while i >= 0:
            if word[i] in charset:
                return True
            if word[i] in self.ALL_VOWELS:
                return False
            i -= 1
        return False

    def _ends_with_ince(self, word: str) -> bool:
        """Check if word ends with front vowels (ince ünlü)."""
        if word in ("saat", "kilovatsaat", "ziraat", "itaat"):
            return True
        # check back from the end until the first vowel
        return self._ends_with_any(word, self.INCE_VOWELS)

    def _ends_with_sert_unsuz(self, word: str) -> bool:
        """Check if word ends with a hard consonant."""
        return word and word[-1] in self.HARD_CONSONANTS

    def _get_vowel_suffix_index(self, prev_token: str) -> int:
        """Get suffix index based on vowel harmony rules."""
        if self._ends_with_any(prev_token, self.AI_VOWELS):
            return 0
        elif self._ends_with_any(prev_token, self.EI_VOWELS):
            return 1
        elif self._ends_with_any(prev_token, self.OU_VOWELS):
            return 2
        return 3

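    # Illustrative behaviour of the harmony helpers above (example words are
    # arbitrary; the values follow directly from the code):
    #   _ends_with_ince("ev")   -> True    (last vowel is front "e")
    #   _ends_with_ince("okul") -> False   (last vowel is back "u")
    #   _get_vowel_suffix_index("kapı") -> 0   (a/ı)
    #   _get_vowel_suffix_index("ev")   -> 1   (e/i)
    #   _get_vowel_suffix_index("okul") -> 2   (o/u)
    #   _get_vowel_suffix_index("gün")  -> 3   (ö/ü)
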
    def _select_correct_suffix(self, i: int, ids: List[int], prev_token: str) -> str:
        """Select the correct suffix based on morphological rules."""
        suffixes = self.reverse_dict[ids[i]]
        token_id = ids[i]
        # Handle different suffix types with cleaner logic
        if token_id < 20013:
            # Basic suffix selection based on vowel harmony
            return suffixes[1] if self._ends_with_ince(prev_token) else suffixes[0]

        elif token_id < 20023:  # nın, nin, nun, nün
            return suffixes[self._get_vowel_suffix_index(prev_token)]

        elif token_id == 20023:  # la, le, yla, yle
            end_of_word = True
            if i < len(ids) - 1:
                next_token = self.reverse_dict[ids[i + 1]][0]
                if next_token not in self.WHITESPACE:
                    end_of_word = False
            return self._handle_la_le_suffix(prev_token, suffixes, end_of_word)

        elif token_id <= 20025:  # da, de, ta, te, dan, den, tan, ten
            return self._handle_da_de_suffix(prev_token, suffixes)

        elif 20025 < token_id < 20029:  # dı, di, du, dü, tı, ti, tu, tü, etc.
            return self._handle_di_du_suffix(prev_token, suffixes)

        elif token_id == 20029:  # lık, lik, luk, lük, etc.
            return self._handle_lik_suffix(i, ids, prev_token, suffixes)

        elif token_id == 20030:  # cık, cik, cuk, cük, etc.
            return self._handle_cik_suffix(i, ids, prev_token, suffixes)

        elif token_id == 20031:  # mak, mek, may, mey
            return self._handle_mak_suffix(i, ids, prev_token, suffixes)

        elif token_id == 20032:  # acak, ecek, etc.
            return self._handle_acak_suffix(i, ids, prev_token, suffixes)

        return suffixes[0]

    def _handle_la_le_suffix(self, prev_token: str, suffixes: List[str], end_of_word: bool) -> str:
        """Handle la/le/yla/yle suffix selection."""
        if self._ends_with_vowel(prev_token) and end_of_word:
            return suffixes[3] if self._ends_with_ince(prev_token) else suffixes[2]
        else:
            return suffixes[1] if self._ends_with_ince(prev_token) else suffixes[0]

    def _handle_da_de_suffix(self, prev_token: str, suffixes: List[str]) -> str:
        """Handle da/de/ta/te suffix selection."""
        if self._ends_with_sert_unsuz(prev_token):
            return suffixes[3] if self._ends_with_ince(prev_token) else suffixes[2]
        return suffixes[1] if self._ends_with_ince(prev_token) else suffixes[0]

    def _handle_di_du_suffix(self, prev_token: str, suffixes: List[str]) -> str:
        """Handle dı/di/du/dü suffix selection."""
        base_index = self._get_vowel_suffix_index(prev_token)
        return suffixes[base_index + 4] if self._ends_with_sert_unsuz(prev_token) else suffixes[base_index]

    def _handle_lik_suffix(self, i: int, ids: List[int], prev_token: str, suffixes: List[str]) -> str:
        """Handle lık/lik/luk/lük suffix selection."""
        if i >= len(ids) - 1:
            return suffixes[0]

        next_token = self.reverse_dict[ids[i + 1]][0]
        base_index = self._get_vowel_suffix_index(prev_token)
        return suffixes[base_index + 4] if self._starts_with_vowel(next_token) else suffixes[base_index]

    def _handle_cik_suffix(self, i: int, ids: List[int], prev_token: str, suffixes: List[str]) -> str:
        """Handle cık/cik/cuk/cük suffix selection."""
        if i >= len(ids) - 1:
            return suffixes[0]

        next_token = self.reverse_dict[ids[i + 1]][0]
        base_index = self._get_vowel_suffix_index(prev_token)

        if self._starts_with_vowel(next_token):
            offset = 12 if self._ends_with_sert_unsuz(prev_token) else 8
        else:
            offset = 4 if self._ends_with_sert_unsuz(prev_token) else 0

        return suffixes[base_index + offset]

    def _handle_mak_suffix(self, i: int, ids: List[int], prev_token: str, suffixes: List[str]) -> str:
        """Handle mak/mek/may/mey suffix selection."""
        if i >= len(ids) - 1:
            return suffixes[0]

        next_token = self.reverse_dict[ids[i + 1]][0]
        base_index = 1 if self._ends_with_ince(prev_token) else 0
        return suffixes[base_index + 2] if self._starts_with_vowel(next_token) else suffixes[base_index]

    def _handle_acak_suffix(self, i: int, ids: List[int], prev_token: str, suffixes: List[str]) -> str:
        """Handle acak/ecek/yacak/yecek suffix selection."""
        is_vowel_ending = self._ends_with_vowel(prev_token)
        is_ince = self._ends_with_ince(prev_token)

        is_vowel_starting = False
        if i < len(ids) - 1:
            next_token = self.reverse_dict[ids[i + 1]][0]
            is_vowel_starting = self._starts_with_vowel(next_token)

        if is_vowel_starting:
            if is_vowel_ending:
                return suffixes[7] if is_ince else suffixes[6]
            else:
                return suffixes[3] if is_ince else suffixes[2]
        else:
            if is_vowel_ending:
                return suffixes[5] if is_ince else suffixes[4]
            else:
                return suffixes[1] if is_ince else suffixes[0]

    def _select_correct_root(self, i: int, ids: List[int]) -> str:
        """Select the correct root form based on morphological context."""
        token_id = ids[i]

        if i >= len(ids) - 2:
            return self.reverse_dict[token_id][0]

        next_token = self.reverse_dict[ids[i + 1]][0]

        if 100 <= token_id < 2080:
            if self._starts_with_vowel(next_token):
                return self.reverse_dict[token_id][1]
            elif token_id <= 110 and ids[i + 1] == 20034:
                return self.reverse_dict[token_id][2]
            else:
                return self.reverse_dict[token_id][0]

        elif 2080 <= token_id < 2315:
            if ids[i + 1] == 20021:  # yor
                return self.reverse_dict[token_id][1]
            else:
                return self.reverse_dict[token_id][0]

        return self.reverse_dict[token_id][0]

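    # Illustrative trace of the suffix handlers above, using a hypothetical
    # reverse_dict entry (actual surface forms and their order come from
    # ekler.json): with suffixes == ["da", "de", "ta", "te"],
    #   _handle_da_de_suffix("kitap", suffixes) -> "ta"  (hard "p", back vowel)
    #   _handle_da_de_suffix("ev", suffixes)    -> "de"  (soft "v", front vowel)
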
    def decode(self, ids: List[int]) -> str:
        """Decode a list of token IDs to text."""
        if not ids:
            return ""

        text_parts = []
        i = 0

        while i < len(ids):
            token_id = ids[i]
            # Handle special tokens
            if token_id == 0 and i < len(ids) - 1:  # uppercase
                next_token = self.reverse_dict[ids[i + 1]][0]
                text_parts.append(next_token.capitalize())
                i += 2
                continue
            elif token_id == 1:  # unknown
                text_parts.append("▁u▁")
            elif token_id in self.reverse_dict:
                tokens = self.reverse_dict[token_id]
                if len(tokens) > 1 and i > 0:
                    if token_id < 20000:  # root token
                        text_parts.append(self._select_correct_root(i, ids))
                    else:  # suffix token
                        # Walk back through text_parts until an alphabetic
                        # token (an actual word) is found.
                        j = -1
                        prev_token = text_parts[j]
                        while not prev_token.isalpha() and j > -len(text_parts):
                            j -= 1
                            prev_token = text_parts[j]
                        text_parts.append(self._select_correct_suffix(i, ids, prev_token))
                else:
                    text_parts.append(tokens[0])
            else:
                text_parts.append("▁")

            i += 1

        return "".join(text_parts)
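
A minimal usage sketch for TRDecoder, assuming a hand-made reverse_dict (ids 5000 and 6000 are made up for illustration; in the Space the mapping is built by TRTokenizer from kokler.json, ekler.json and bpe_tokenler.json):

    from tr_decoder import TRDecoder

    toy_reverse_dict = {
        0: ["<uppercase>"],   # uppercase marker
        2: [" "],             # space marker
        5000: ["merhaba"],    # hypothetical root id
        6000: ["dünya"],      # hypothetical root id
    }
    decoder = TRDecoder(toy_reverse_dict)
    print(decoder.decode([0, 5000, 2, 6000]))  # -> "Merhaba dünya"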
    	
tr_tokenizer.py
ADDED
@@ -0,0 +1,137 @@
import json
from enum import Enum
from typing import Dict, List, Optional, Tuple

from tr_decoder import TRDecoder


class TokenType(Enum):
    ROOT = "ROOT"
    SUFFIX = "SUFFIX"
    BPE = "BPE"

class TRTokenizer:
    def __init__(self):
        with open("kokler.json", "r", encoding="utf-8") as f:
            roots = json.load(f)
        with open("ekler.json", "r", encoding="utf-8") as f:
            suffixes = json.load(f)
        with open("bpe_tokenler.json", "r", encoding="utf-8") as f:
            bpe_tokens = json.load(f)
        self.reverse_dict = {}

        for key, value in roots.items():
            if value not in self.reverse_dict:
                self.reverse_dict[value] = []
            self.reverse_dict[value].append(key)
        for key, value in suffixes.items():
            if value not in self.reverse_dict:
                self.reverse_dict[value] = []
            self.reverse_dict[value].append(key)
        for key, value in bpe_tokens.items():
            if value not in self.reverse_dict:
                self.reverse_dict[value] = []
            self.reverse_dict[value].append(key)

        self.decoder = TRDecoder(self.reverse_dict)

        self.roots = roots
        self.suffixes = suffixes
        self.bpe_tokens = bpe_tokens
        self.max_root_len = max(len(k) for k in roots) if roots else 0
        self.max_suffix_len = max(len(k) for k in suffixes) if suffixes else 0
        self.max_bpe_len = max(len(k) for k in bpe_tokens) if bpe_tokens else 0

        self.uppercase_marker = {"token": "<uppercase>", "id": 0, "type": TokenType.ROOT}
        self.unknown_marker = {"token": "<unknown>", "id": 1, "type": TokenType.ROOT}
        self.space_marker = {"token": " ", "id": 2, "type": TokenType.ROOT}

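    # After __init__, reverse_dict maps each id to the list of surface forms
    # that share it, in vocabulary-file order, e.g. (hypothetical entry):
    #   reverse_dict[20023] -> ["la", "le", "yla", "yle"]
    # TRDecoder indexes into these lists to pick the harmonically correct
    # variant, so the ordering inside the JSON files is significant.
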
    def _tokenize_word(self, word: str) -> Tuple[List[dict], List[int]]:
        uppercase_indices = [i for i, c in enumerate(word) if c.isupper()]
        result = []

        segments = self._camel_split_with_positions(word)

        for seg, orig_pos in segments:
            if orig_pos < len(word) and word[orig_pos].isupper():
                result.append(self.uppercase_marker)

            s = seg
            pos = 0

            while pos < len(s):
                substr = s[pos:]

                rid, rtok = self._longest_prefix_lookup(substr, self.roots, self.max_root_len)
                if rid is not None:
                    result.append({"token": rtok, "id": rid, "type": TokenType.ROOT})
                    pos += len(rtok)
                    continue

                sid, stok = self._longest_prefix_lookup(substr, self.suffixes, self.max_suffix_len)
                if sid is not None:
                    result.append({"token": stok, "id": sid, "type": TokenType.SUFFIX})
                    pos += len(stok)
                    continue

                bid, btok = self._longest_prefix_lookup(substr, self.bpe_tokens, self.max_bpe_len)
                if bid is not None:
                    result.append({"token": btok, "id": bid, "type": TokenType.BPE})
                    pos += len(btok)
                    continue

                result.append(self.unknown_marker)
                pos += 1

        return result, uppercase_indices

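    # Matching order in _tokenize_word: longest root prefix, then longest
    # suffix, then longest BPE piece, otherwise a single <unknown> character.
    # E.g. with hypothetical entries {"kitap": 100} in kokler.json and
    # {"lar": 20005} in ekler.json, "kitaplar" tokenizes as
    # [kitap (ROOT), lar (SUFFIX)].
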
    def tokenize_text(self, text: str) -> Tuple[List[dict], List[int]]:
        final_tokens = []
        uppercase_indices = [i for i, c in enumerate(text) if c.isupper()]

        parts = text.split(" ")
        for idx, part in enumerate(parts):
            if part.strip():
                tokens, _ = self._tokenize_word(part)
                final_tokens.extend(tokens)
            if idx < len(parts) - 1:
                final_tokens.append(self.space_marker)

        return final_tokens, uppercase_indices

    def encode(self, text: str) -> List[int]:
        tokens, _ = self.tokenize_text(text)
        return [t["id"] for t in tokens]

    def tokenize(self, text: str) -> List[str]:
        tokens, _ = self.tokenize_text(text)
        return [t["token"] for t in tokens]

    def _longest_prefix_lookup(self, s: str, table: Dict[str, int], max_len: Optional[int] = None) -> Tuple[Optional[int], str]:
        end = min(len(s), max_len) if max_len else len(s)
        for i in range(end, 0, -1):
            cand = s[:i]
            if cand in table:
                return table[cand], cand
        return None, ""

    def _camel_split_with_positions(self, word: str) -> List[Tuple[str, int]]:
        if not word:
            return []

        parts = []
        start = 0

        for i in range(1, len(word)):
            if word[i].isupper():
                if start < i:
                    parts.append((word[start:i].lower(), start))
                start = i

        if start < len(word):
            parts.append((word[start:].lower(), start))

        return parts if parts else [(word.lower(), 0)]

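    # Example: _camel_split_with_positions("GeziParkı") returns
    # [("gezi", 0), ("parkı", 4)]; the recorded positions let _tokenize_word
    # re-insert <uppercase> markers in front of the right segments.
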
    def decode(self, ids: List[int]) -> str:
        # Reuse the decoder built in __init__ instead of constructing a new one per call.
        return self.decoder.decode(ids)