import re

# Connective phrases that convert_to_monologue cycles through, in order.
_TRANSITIONS = (
    "Then",
    "After that",
    "Next",
    "Following that",
    "Subsequently",
    "Moving on",
    "Additionally",
)

# A leftover speaker label: at most three plain words at the start of a line,
# then a colon that is followed by whitespace or the end of the line.
# Requiring whitespace after the colon keeps times ("3:00") and URLs
# ("https://...") intact; the word limit keeps ordinary sentences that merely
# contain a colon from being truncated.
_LEFTOVER_LABEL = re.compile(
    r"^[ \t]*(?:[\w'\-]+(?:[ \t]+[\w'\-]+){0,2})?[ \t]*:(?=\s|$)[ \t]*",
    re.MULTILINE,
)


def clean_asterisks(text: str) -> str:
    """Remove every asterisk and collapse all whitespace runs.

    Note that newlines count as whitespace here: the result is always a
    single line with single spaces and no leading/trailing whitespace.
    """
    # Dropping each '*' gives the same result as first unwrapping
    # "*...*" pairs and then deleting the stragglers.
    without_stars = text.replace('*', '')
    return ' '.join(without_stars.split())


def remove_dialog_formatting(text: str) -> str:
    """Remove common dialog markers and formatting.

    Strips speaker labels at the start of a line ("JOHN:", "John:",
    "[John]:") and removes parenthetical / bracketed stage directions
    anywhere in the text. Whitespace that followed a removed label is left
    in place.
    """
    # Bracketed labels of any case, e.g. "[John]:". The upper-case pattern
    # below cannot match these, which used to leave a stray ":" behind.
    text = re.sub(r'^\[[^\]\n]*\]:', '', text, flags=re.MULTILINE)
    # Upper-case / numeric labels, e.g. "JOHN:", "A1:", "[JOHN]:".
    text = re.sub(r'^[A-Z0-9\[\]]+:', '', text, flags=re.MULTILINE)
    # Capitalised single-word labels, e.g. "John:".
    text = re.sub(r'^[A-Z][a-z]+:', '', text, flags=re.MULTILINE)
    # Stage directions such as "(laughs)" or "[pause]".
    text = re.sub(r'\([^)]*\)', '', text)
    text = re.sub(r'\[[^\]]*\]', '', text)
    return text


def remove_breakthrough_formatting(text: str) -> str:
    """Remove any LLM formatting that made it through the prompts.

    Removes, line by line: bracketed and parenthesised asides, quoted
    speaker labels, HTML-like tags, "---...---" separators, "#word"
    sections, and finally a short speaker label at the start of each line.
    """
    leftovers = (
        r'\[.*?\]',    # breakthrough brackets
        r'\(.*?\)',    # breakthrough parentheticals
        r'"\w+:"',     # quoted speaker labels
        r'<.*?>',      # HTML-like tags
        r'---.*?---',  # section separators
        # NOTE(review): this also removes the first word after a markdown
        # heading marker ("# Title here" -> " here"); kept as in the original
        # because the intended scope is unclear.
        r'#\s*\w+',    # hashtag sections
    )
    for pattern in leftovers:
        text = re.sub(pattern, '', text, flags=re.MULTILINE)
    # Labels are stripped last so that ones exposed by the removals above
    # ("Host (excited): ..." -> "Host : ...") are caught as well.
    return _LEFTOVER_LABEL.sub('', text)


def _soften_opening(line: str) -> str:
    """Lower-case the first letter of *line* when that is safe.

    The first word is left untouched when it is the pronoun "I" (including
    "I'm", "I'll", ...) or is not a plain Capitalised word (acronyms such as
    "NASA", mixed-case names such as "JavaScript"). Ordinary capitalised
    proper nouns cannot be told apart from sentence starts and are
    lower-cased.
    """
    leading_letters = re.match(r'[^\W\d_]+', line)
    if leading_letters is None:
        return line
    word = leading_letters.group()
    if word == 'I':
        return line
    if len(word) > 1 and not word[1:].islower():
        return line
    return line[0].lower() + line[1:]


def convert_to_monologue(text: str) -> str:
    """Convert multi-party dialog into a flowing narrative.

    Each non-empty line is stripped of surrounding whitespace and of a
    leading speaker label. Every line after the first that starts with an
    upper-case character is prefixed with the next transition phrase; the
    pieces are joined with single spaces.
    """
    narrative = []
    used_transitions = 0
    for raw_line in text.split('\n'):
        line = raw_line.strip()
        # Remove speaker labels if any.
        line = re.sub(r'^[A-Z0-9\[\]]+:\s*', '', line)
        line = re.sub(r'^[A-Z][a-z]+:\s*', '', line)
        if not line:
            # Blank or label-only line: nothing to narrate (indexing into it
            # would raise IndexError).
            continue
        if narrative and line[0].isupper():
            transition = _TRANSITIONS[used_transitions % len(_TRANSITIONS)]
            narrative.append(f"{transition}, {_soften_opening(line)}")
            used_transitions += 1
        else:
            narrative.append(line)
    return ' '.join(narrative)


def _strip_markdown(text: str) -> str:
    """Remove markdown markup from *text* without touching line breaks."""
    # Code goes first so emphasis markers inside it cannot pair up with
    # markers in the surrounding prose.
    text = re.sub(r'```[\s\S]*?```', '', text)
    text = re.sub(r'`[^`]*`', '', text)
    # Bold / italic: once every '*' is gone no further handling is needed.
    text = text.replace('*', '')
    # Underscore emphasis only counts at word boundaries, so identifiers such
    # as snake_case_name keep their underscores.
    text = re.sub(r'(?<!\w)__(.+?)__(?!\w)', r'\1', text)
    text = re.sub(r'(?<!\w)_(.+?)_(?!\w)', r'\1', text)
    # Strikethrough.
    text = re.sub(r'~~(.+?)~~', r'\1', text)
    return text


def clean_formatting(text: str) -> str:
    """Remove markdown and other formatting symbols.

    Whitespace (including newlines) is collapsed first, then code blocks,
    inline code, asterisks, underscore emphasis and strikethrough markers
    are removed. Removed code can leave doubled spaces behind.
    """
    return _strip_markdown(clean_asterisks(text))


def process_for_podcast(text: str) -> str:
    """Main function to process text for podcast narration.

    Runs the full clean-up pipeline and returns a single line of text with
    normalised whitespace and no asterisks.
    """
    # Strip asterisks up front so "**JOHN:**" is recognised as a label, but
    # keep newlines: the label and monologue stages work line by line.
    text = text.replace('*', '')
    text = remove_dialog_formatting(text)
    text = _strip_markdown(text)
    text = remove_breakthrough_formatting(text)
    text = convert_to_monologue(text)
    # Final asterisk check; this also collapses whitespace and trims.
    return clean_asterisks(text)