import streamlit as st
import sys
from datetime import datetime
from pathlib import Path
import base64
import colorsys
import shutil
import atexit
import requests
import importlib.util

# Set page config - MUST BE FIRST STREAMLIT COMMAND
st.set_page_config(
    page_title="Turkish Tiktokenizer",
    page_icon="🇹🇷",
    layout="wide"
)
# Initialize session state
if 'text' not in st.session_state:
    st.session_state.text = "Akademisyenler ve aileleri birlikte çalışıyorlar."
if 'token_results' not in st.session_state:
    st.session_state.token_results = None
# Constants
GITHUB_REPO = "malibayram/tokenizer"
GITHUB_BRANCH = "main"

# Special tokens and their IDs
SPECIAL_TOKENS = {
    "<uppercase>": 0,  # Uppercase letter marker
    "<space>": 1,      # Space character
    "<newline>": 2,    # Newline character
    "<tab>": 3,        # Tab character
    "<unknown>": 4     # Unknown token
}
# Special token display symbols
SPECIAL_TOKEN_SYMBOLS = {
    "<uppercase>": "[uppercase]",  # Uppercase marker
    "<space>": "[space]",          # Space character
    "<newline>": "[newline]",      # Newline character
    "<tab>": "[tab]",              # Tab character
    "<unknown>": "[unknown]"       # Unknown token
}
# Colors for special tokens
SPECIAL_COLORS = {
    "<uppercase>": "#FF9999",  # Light red for uppercase markers
    "<space>": "#CCCCCC",      # Gray for spaces
    "<newline>": "#CCCCCC",    # Gray for newlines
    "<tab>": "#CCCCCC",        # Gray for tabs
    "<unknown>": "#FF0000"     # Red for unknown tokens
}
# Required files mapping
REQUIRED_FILES = {
    'tokenizer.py': 'turkish_tokenizer/turkish_tokenizer.py',
    'kokler_v07.json': 'turkish_tokenizer/kokler_v07.json',
    'ekler_v05.json': 'turkish_tokenizer/ekler_v05.json',
    'bpe_v05.json': 'turkish_tokenizer/bpe_v05.json'
}
# Token ID ranges
TOKEN_RANGES = {
    'special': (0, 4),           # Special tokens
    'root_words': (5, 20000),    # Root words
    'suffixes': (22268, 22767),  # Suffixes
    'bpe': (20000, None)         # BPE tokens (20000+)
}
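
# Illustrative sketch (not used by the app): one way the TOKEN_RANGES above could
# be used to classify a token ID. The 'suffixes' range sits inside the open-ended
# 'bpe' range, so it is checked before 'bpe' here; the precise semantics of the
# ranges are an assumption based on the constants, not confirmed by the tokenizer.
def classify_token_id(token_id):
    """Return the TOKEN_RANGES category a token ID falls into (illustrative only)."""
    for category in ('special', 'root_words', 'suffixes', 'bpe'):
        low, high = TOKEN_RANGES[category]
        if token_id >= low and (high is None or token_id <= high):
            return category
    return 'unknown'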
def generate_colors(n):
    """Generate n visually distinct colors."""
    colors = []
    for i in range(n):
        hue = i / n
        saturation = 0.3 + (i % 3) * 0.1  # Vary saturation between 0.3-0.5
        value = 0.95 - (i % 2) * 0.1      # Vary value between 0.85-0.95
        rgb = colorsys.hsv_to_rgb(hue, saturation, value)
        hex_color = "#{:02x}{:02x}{:02x}".format(
            int(rgb[0] * 255),
            int(rgb[1] * 255),
            int(rgb[2] * 255)
        )
        colors.append(hex_color)
    return colors
def fetch_github_file(path, ref=GITHUB_BRANCH):
    """Fetch file content from the GitHub repository."""
    url = f"https://api.github.com/repos/{GITHUB_REPO}/contents/{path}?ref={ref}"
    response = requests.get(url)
    if response.status_code == 200:
        content = base64.b64decode(response.json()['content']).decode('utf-8')
        return content
    else:
        st.error(f"Could not fetch {path} from GitHub: {response.status_code}")
        return None
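
# Note: these are unauthenticated GitHub API calls, which are rate-limited
# (currently 60 requests per hour per IP), so frequent reloads of the app may
# start receiving 403 responses from the endpoints above.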
def load_tokenizer():
    """Load and initialize the tokenizer from GitHub."""
    temp_dir = Path("temp_tokenizer")
    temp_dir.mkdir(exist_ok=True)

    # Fetch required files
    for local_name, github_path in REQUIRED_FILES.items():
        content = fetch_github_file(github_path)
        if content is None:
            return None
        with open(temp_dir / local_name, 'w', encoding='utf-8') as f:
            f.write(content)

    # Modify tokenizer to load its data files from temp_dir
    # (the injected body assumes tokenizer.py already imports os and json)
    tokenizer_path = temp_dir / "tokenizer.py"
    with open(tokenizer_path, 'r', encoding='utf-8') as f:
        tokenizer_code = f.read()
    modified_code = tokenizer_code.replace(
        'def load_json(filename):',
        f'''def load_json(filename):
    full_path = os.path.join("{temp_dir.absolute()}", filename)
    with open(full_path, 'r', encoding='utf-8') as file:
        return json.load(file)'''
    )
    with open(tokenizer_path, 'w', encoding='utf-8') as f:
        f.write(modified_code)

    # Load module
    spec = importlib.util.spec_from_file_location("tokenizer", str(temp_dir / "tokenizer.py"))
    module = importlib.util.module_from_spec(spec)
    sys.modules["tokenizer"] = module
    spec.loader.exec_module(module)
    return module.tokenize
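
# The rest of the app assumes tokenize(text) returns a dict with "tokens" (a list
# of token strings, including special tokens such as "<space>") and "ids" (a
# parallel list of integer token IDs). This shape is inferred from how
# result["tokens"] and result["ids"] are used below, not from the tokenizer itself.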
def get_commit_history():
    """Fetch commit history from GitHub."""
    url = f"https://api.github.com/repos/{GITHUB_REPO}/commits"
    try:
        response = requests.get(url)
        if response.status_code == 200:
            commits = response.json()
            versions = []
            for commit in commits[:10]:
                date = datetime.strptime(commit['commit']['author']['date'], '%Y-%m-%dT%H:%M:%SZ').strftime('%Y-%m-%d')
                sha = commit['sha'][:7]
                message = commit['commit']['message'].split('\n')[0][:50]
                versions.append(f"{date} - {sha} - {message}")
            return versions
        return ["latest"]
    except Exception as e:
        st.warning(f"Could not fetch commit history: {str(e)}")
        return ["latest"]
def render_tokens(tokens, token_colors):
    """Render colored token visualization."""
    html_tokens = []
    for token in tokens:
        color = token_colors[token]
        display_text = SPECIAL_TOKEN_SYMBOLS.get(token, token)  # Use symbol for special tokens
        html_tokens.append(
            f'<span style="background-color: {color}; padding: 2px 4px; margin: 2px; border-radius: 3px;" title="{token}">{display_text}</span>'
        )
    return " ".join(html_tokens)
# Load tokenizer
tokenize = load_tokenizer()
if tokenize is None:
    st.error("Failed to load tokenizer from GitHub")
    st.stop()

# Tokenize example text on startup if no results exist
if st.session_state.token_results is None and st.session_state.text:
    try:
        st.session_state.token_results = tokenize(st.session_state.text)
    except Exception as e:
        st.error(f"Error tokenizing text: {str(e)}")
# UI Layout
st.title("🇹🇷 Turkish Tiktokenizer")

# Model selection
versions = get_commit_history()
model = st.selectbox("Model version", versions, key="model_selection", label_visibility="collapsed")

# Main layout
col1, col2 = st.columns([0.4, 0.6])
# Input column
with col1:
    text = st.text_area(
        "Enter Turkish text to tokenize",
        value=st.session_state.text,
        height=200,
        key="text_input",
        label_visibility="collapsed",
        placeholder="Enter Turkish text to tokenize"
    )

    if st.button("Tokenize", type="primary"):
        st.session_state.text = text
        if text.strip():
            try:
                st.session_state.token_results = tokenize(text)
            except Exception as e:
                st.session_state.token_results = None
                st.error(f"Error tokenizing text: {str(e)}")
        else:
            st.session_state.token_results = None
# Results column
with col2:
    st.markdown("Token count")
    if st.session_state.token_results is not None:
        result = st.session_state.token_results
        token_count = len(result["tokens"])
        st.markdown(f"### {token_count}")

        st.markdown("Tokenized text")
        # Generate token colors
        regular_tokens = [t for t in result["tokens"] if t not in SPECIAL_COLORS]
        regular_token_colors = dict(zip(regular_tokens, generate_colors(len(regular_tokens))))
        token_colors = {**SPECIAL_COLORS, **regular_token_colors}

        # Render tokens
        with st.container():
            st.markdown(render_tokens(result["tokens"], token_colors), unsafe_allow_html=True)

        st.markdown("Token IDs")
        st.code(", ".join(map(str, result["ids"])), language=None)
    else:
        st.markdown("### 0")
        st.markdown("Tokenized text")
        st.markdown("")
        st.markdown("Token IDs")
        st.text("")
# Footer
st.markdown("""
<div style="position: fixed; bottom: 0; width: 100%; text-align: center; padding: 10px; background-color: white;">
    <a href="https://github.com/malibayram/tokenizer" target="_blank">View on GitHub</a>
</div>
""", unsafe_allow_html=True)
# Cleanup
def cleanup():
    if Path("temp_tokenizer").exists():
        shutil.rmtree("temp_tokenizer")

atexit.register(cleanup)
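
# Note: atexit runs cleanup() only when the Python process exits, not on each
# Streamlit rerun, so temp_tokenizer/ persists for the lifetime of the server.
# Also note that load_tokenizer() above runs on every rerun, so the GitHub files
# are re-fetched on each interaction; wrapping it in st.cache_resource would
# avoid that, but is not done here.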