import html
import os
import re
from typing import Dict, List, Optional

# Number of token positions kept for every generation step.
SEQUENCE_LENGTH = 64
# Last generation step; used as the upper bound of the color gradient.
FINAL_STEP = 64
# Token text marking a position that is still masked.
MASK_TOKEN = "<|mdm_mask|>"

DEFAULT_HISTORY_FILE = "sample_process.txt"
DEFAULT_OUTPUT_DIR = "html/sample_process_paper"

# A token starts with '*' and runs up to the next '&' (or the end of the line).
_TOKEN_PATTERN = re.compile(r"\*([^&]*)&?")

# Light-to-dark blue palette used to encode the step at which a token appeared.
_COLOR_STOPS = (
    (240, 248, 255),
    (209, 226, 241),
    (176, 202, 224),
    (143, 179, 207),
    (110, 156, 191),
    (77, 133, 175),
    (44, 110, 159),
    (12, 55, 112),
)
# ``max_step`` is mapped onto stop index 6; the final stop is only approached
# by steps larger than ``max_step``.
_GRADIENT_SPAN = 6

# Inline CSS shared by every token box.
_BASE_TOKEN_STYLE = (
    "color: #000000",
    "padding: 6px 8px",
    "margin: 3px",
    "border-radius: 6px",
    "display: inline-block",
    "font-weight: 600",
    "font-size: 16px",
    "font-family: 'Segoe UI', sans-serif",
    "box-shadow: 0 3px 6px rgba(12,55,112,0.15)",
    "transition: all 0.2s ease",
    "position: relative",
    "width: 120px",
    "min-width: 120px",
    "text-align: center",
    "white-space: nowrap",
    "overflow: hidden",
    "text-overflow: ellipsis",
    "box-sizing: border-box",
)
# Extra declarations for masked positions; later declarations override the
# base ones (font-weight, padding).
_MASK_TOKEN_STYLE = (
    "background: #f8fafc",
    "border: 2px solid #ffffff",
    "font-weight: 800",
    "text-transform: uppercase",
    "padding: 4px 6px",
)


def parse_generation_history(file_path: str) -> Dict[int, List[str]]:
    """Parse a generation log into per-step token lists.

    Every useful line has the form ``<step>,<content>``. Inside the content
    each token starts with ``*`` and extends to the next ``&`` (or the end of
    the line). Blank lines and lines without an integer step before the first
    comma are skipped silently.

    Token text is stripped of surrounding whitespace; a token that is empty
    after stripping is stored as a single space. Each step's list is forced to
    exactly ``SEQUENCE_LENGTH`` entries: longer lists are truncated, shorter
    ones are right-padded with ``" "``, and a notice is printed in both cases.

    Args:
        file_path: Path of the UTF-8 encoded log file.

    Returns:
        Mapping from step number to its token list. If a step occurs more
        than once, the last occurrence wins.
    """
    history: Dict[int, List[str]] = {}
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                step_part, content_part = line.split(",", 1)
                step = int(step_part.strip())
            except ValueError:
                # No comma on the line, or a non-integer step: not a record.
                continue

            tokens = [
                match.group(1).strip() or " "
                for match in _TOKEN_PATTERN.finditer(content_part)
            ]

            # Report the length as parsed, before it is normalized.
            if len(tokens) > SEQUENCE_LENGTH:
                print(f"Truncating extra tokens: Step {step} ({len(tokens)} tokens)")
                tokens = tokens[:SEQUENCE_LENGTH]
            elif len(tokens) < SEQUENCE_LENGTH:
                print(f"Padding missing tokens: Step {step} ({len(tokens)} tokens)")
                tokens += [" "] * (SEQUENCE_LENGTH - len(tokens))

            history[step] = tokens
    return history


def track_token_positions(history: Dict[int, List[str]]) -> List[int]:
    """Return, for each position, the first step at which it is not masked.

    Steps are visited in ascending order. A position that holds
    ``MASK_TOKEN`` in every step (or is missing from every token list) keeps
    the sentinel ``-1``.

    Args:
        history: Mapping from step number to that step's token list.

    Returns:
        A list of ``SEQUENCE_LENGTH`` step numbers (or ``-1``).
    """
    steps_to_unmask = [-1] * SEQUENCE_LENGTH
    for step in sorted(history):
        # Slicing tolerates token lists that are shorter or longer than expected.
        for idx, token in enumerate(history[step][:SEQUENCE_LENGTH]):
            if steps_to_unmask[idx] == -1 and token != MASK_TOKEN:
                steps_to_unmask[idx] = step
    return steps_to_unmask


def generate_background_color(step: int, max_step: int) -> str:
    """Map a step onto the light-to-dark blue gradient.

    The step is scaled so that ``0`` lands on the first color stop and
    ``max_step`` on the seventh; colors in between are linearly interpolated
    between the two neighbouring stops, so a later step is never lighter than
    an earlier one. Negative steps are clamped to the first stop, and steps
    past ``max_step`` continue toward (and are clamped at) the last stop.

    Args:
        step: Step to colorize.
        max_step: Step that corresponds to the end of the gradient.

    Returns:
        The color as a ``#rrggbb`` hex string.

    Raises:
        ValueError: If ``max_step`` is not positive.
    """
    if max_step <= 0:
        raise ValueError(f"max_step must be positive, got {max_step}")

    last_segment = len(_COLOR_STOPS) - 2
    position = step * _GRADIENT_SPAN / max_step
    position = min(max(position, 0.0), float(last_segment + 1))

    index = min(int(position), last_segment)
    ratio = position - index  # fractional progress inside the segment

    start = _COLOR_STOPS[index]
    end = _COLOR_STOPS[index + 1]
    r, g, b = (int(lo + (hi - lo) * ratio) for lo, hi in zip(start, end))
    return f"#{r:02x}{g:02x}{b:02x}"


def generate_step_visualization(current_step: int, current_tokens: List[str],
                                token_steps: List[int], max_step: int) -> str:
    """Render one step's tokens as inline-styled HTML ``<span>`` boxes.

    Masked positions are shown as a neutral box labelled ``Mask``. Every other
    token gets a background color derived from the step at which its position
    was first unmasked; a single-space token is displayed as ``␣``. Token text
    is HTML-escaped so that tokens containing ``<``, ``>`` or ``&`` are shown
    literally.

    Args:
        current_step: Step being rendered (not used by the rendering itself).
        current_tokens: Tokens of the step being rendered.
        token_steps: First-unmask step per position, indexed like
            ``current_tokens``.
        max_step: Step mapped to the end of the color gradient.

    Returns:
        The ``<span>`` elements joined by newlines.
    """
    spans = []
    for idx, token in enumerate(current_tokens):
        style = list(_BASE_TOKEN_STYLE)
        if token == MASK_TOKEN:
            style.extend(_MASK_TOKEN_STYLE)
            display_text = "Mask"
        else:
            bg_color = generate_background_color(token_steps[idx], max_step)
            style.append(f"background-color: {bg_color}")
            display_text = token if token != " " else "␣"

        spans.append(
            f'<span style="{"; ".join(style)}">{html.escape(display_text)}</span>'
        )
    return "\n".join(spans)


def main(target_step: int = 64,
         file_path: str = DEFAULT_HISTORY_FILE,
         output_dir: str = DEFAULT_OUTPUT_DIR,
         history: Optional[Dict[int, List[str]]] = None) -> None:
    """Write the HTML visualization page for one generation step.

    The page is written to ``<output_dir>/visualization_step_<target_step>.html``
    and contains the token boxes for the step followed by a color legend.

    Args:
        target_step: Step to render; must be present in the history.
        file_path: Log file to parse when ``history`` is not supplied.
        output_dir: Directory for the generated page; created if missing.
        history: Already parsed history. When given, ``file_path`` is not
            read, which lets a caller render many steps from one parse.

    Raises:
        ValueError: If ``target_step`` is not a key of the history.
    """
    if history is None:
        history = parse_generation_history(file_path)
    if target_step not in history:
        raise ValueError(f"Invalid target step: {target_step}")

    token_steps = track_token_positions(history)
    html_content = generate_step_visualization(
        target_step, history[target_step], token_steps, FINAL_STEP
    )

    example_steps = [0, 16, 32, 48, 64]
    legend_html = "\n".join(
        '<span style="background-color: '
        f'{generate_background_color(s, FINAL_STEP)}; color: #000000; '
        'padding: 4px 10px; margin: 3px; border-radius: 4px; '
        f'display: inline-block">Step {s}</span>'
        for s in example_steps
    )

    page = "\n".join([
        "<!DOCTYPE html>",
        "<html>",
        "<head>",
        '<meta charset="utf-8">',
        f"<title>Step {target_step} Visualization</title>",
        "</head>",
        "<body>",
        f"<h2>Generation Step {target_step}</h2>",
        "<div>",
        html_content,
        "</div>",
        "<h3>Color Legend</h3>",
        "<div>",
        legend_html,
        "</div>",
        "</body>",
        "</html>",
        "",
    ])

    os.makedirs(output_dir, exist_ok=True)
    out_path = os.path.join(output_dir, f"visualization_step_{target_step}.html")
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(page)


if __name__ == "__main__":
    # Parse the log once and reuse it for every page.
    parsed_history = parse_generation_history(DEFAULT_HISTORY_FILE)
    for step in range(1, FINAL_STEP + 1):
        main(target_step=step, history=parsed_history)