import re from typing import List, Dict import os def parse_generation_history(file_path: str) -> Dict[int, List[str]]: """Improved parser that handles math symbols and spaces correctly""" history = {} token_pattern = re.compile(r"\*([^&]*)&?") with open(file_path, 'r', encoding='utf-8') as f: for line in f: line = line.strip() if not line: continue try: step_part, content_part = line.split(',', 1) step = int(step_part.strip()) except ValueError: continue tokens = [] for match in token_pattern.finditer(content_part): raw_token = match.group(1).strip() if raw_token == "": tokens.append(" ") elif raw_token == "*": tokens.append("*") else: tokens.append(raw_token) while len(tokens) < 64: tokens.append(" ") if len(tokens) > 64: print(f"Truncating extra tokens: Step {step} ({len(tokens)} tokens)") tokens = tokens[:64] elif len(tokens) < 64: print(f"Padding missing tokens: Step {step} ({len(tokens)} tokens)") tokens += [" "] * (64 - len(tokens)) tokens = tokens[:62] history[step] = tokens return history def track_token_positions(history: Dict[int, List[str]]) -> List[int]: """Track the first generation step for each token""" num_positions = 64 steps_to_unmask = [-1] * num_positions for step in sorted(history.keys()): tokens = history[step] for idx in range(num_positions): if idx >= len(tokens): continue token = tokens[idx] if steps_to_unmask[idx] == -1 and token != '<|mdm_mask|>': steps_to_unmask[idx] = step return steps_to_unmask def generate_background_color(step: int, max_step: int) -> str: """Generate gradient color for text (darker version)""" color_stops = [ (176, 202, 224), (143, 179, 207), (110, 156, 191), (80, 130, 240), (40, 90, 200), (20, 70, 180), (0, 50, 160), ] color_index = min(int(step ** 0.7 / max_step ** 0.7 * 6), 6) ratio = (step % 2) / 2 start = color_stops[color_index] end = color_stops[min(color_index + 1, 6)] r = int(start[0] + (end[0] - start[0]) * ratio) g = int(start[1] + (end[1] - start[1]) * ratio) b = int(start[2] + (end[2] - start[2]) * ratio) return f"#{r:02x}{g:02x}{b:02x}" def generate_step_visualization(current_step: int, current_tokens: List[str], token_steps: List[int], max_step: int) -> str: """Final visualization version (completely borderless)""" html = [] for idx, token in enumerate(current_tokens): style = [ "padding: 6px 8px", "margin: 2px", "border-radius: 6px", "display: inline-block", "font-weight: 600", "font-size: 16px", "font-family: 'Segoe UI', sans-serif", "transition: all 0.2s ease", "width: 120px", "min-width: 120px", "text-align: center", "white-space: nowrap", "overflow: hidden", "text-overflow: ellipsis", "box-sizing: border-box", "vertical-align: middle", "border: 0 !important", "outline: 0 !important", "box-shadow: none !important", "position: relative", "z-index: 1" ] if token == '<|mdm_mask|>': style.extend([ "color: transparent", "background: #f8fafc", "text-shadow: none" ]) display_text = "​" else: text_color = generate_background_color(token_steps[idx], max_step) style.append(f"color: {text_color}") display_text = token if token != " " else "␣" html.append(f'''
{display_text}
''') return '\n'.join(html) def main(target_step: int = 64): """Main function supporting target step specification""" file_path = "sample_process.txt" final_step = 64 history = parse_generation_history(file_path) if target_step not in history: raise ValueError(f"Invalid target step: {target_step}") token_steps = track_token_positions(history) current_tokens = history[target_step] html_content = generate_step_visualization( target_step, current_tokens, token_steps, final_step ) example_steps = [0, 16, 32, 48, 64] example_colors = [generate_background_color(s, final_step) for s in example_steps] legend_html = ''.join( f'
Step {s}
' for s, color in zip(example_steps, example_colors) ) target_dir = "html/sample_process_zhihu" if not os.path.exists(target_dir): os.makedirs(target_dir) with open(f"{target_dir}/visualization_step_{target_step}.html", "w", encoding="utf-8") as f: f.write(f""" Step {target_step} Visualization
{html_content}
""") if __name__ == "__main__": for step in range(1, 65): main(target_step=step)