# podcaster / text_processor.py
# marks
# More cleaning functions
# 218c539
import re
def clean_asterisks(text):
    """Remove every asterisk from ``text`` and normalise its whitespace.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with all ``*`` characters deleted, every run of whitespace
        (including newlines and tabs) collapsed to a single space, and
        leading/trailing whitespace removed.
    """
    # One plain replace handles paired markers (**bold**, *italic*) and stray
    # asterisks alike, so no separate regex pass over paired markers is needed.
    without_asterisks = text.replace('*', '')
    # split() with no argument splits on any whitespace run and discards empty
    # pieces, so joining with one space also trims both ends.
    return ' '.join(without_asterisks.split())
def remove_dialog_formatting(text):
    """Remove speaker labels and stage directions from dialog-style text.

    Speaker labels are only recognised at the very start of a line. Three
    shapes are handled: a bracketed name (``[John]:``), an upper-case/digit
    label (``JOHN:``, ``SPEAKER1:``) and a single capitalised word
    (``John:``). Any text in parentheses or square brackets is then removed
    wherever it appears.

    Args:
        text: Possibly multi-line dialog text.

    Returns:
        The text with labels and bracketed/parenthesised spans deleted.
        Whitespace is left untouched, so a space that followed a label is
        still present in the result.
    """
    # Bracketed labels must be removed as a unit and before the generic
    # bracket removal below; otherwise "[John]:" loses "[John]" and leaves a
    # stray ":" at the start of the line.
    text = re.sub(r'^\[[^\]\n]*\]:', '', text, flags=re.MULTILINE)
    # Upper-case / numeric labels such as "JOHN:" or "[JOHN]:".
    text = re.sub(r'^[A-Z0-9\[\]]+:', '', text, flags=re.MULTILINE)
    # Single capitalised word such as "John:".
    text = re.sub(r'^[A-Z][a-z]+:', '', text, flags=re.MULTILINE)
    # Parenthetical and bracketed stage directions, e.g. "(laughs)", "[pause]".
    text = re.sub(r'\([^)]*\)', '', text)
    text = re.sub(r'\[[^\]]*\]', '', text)
    return text
def remove_breakthrough_formatting(text):
    """Remove any LLM formatting that made it through the prompts.

    Applied in order: leading speaker labels, ``[...]`` and ``(...)`` spans,
    quoted speaker labels (``"John:"``), HTML-like tags, ``--- ... ---``
    separators on a single line, and ``#`` markers together with the word
    that follows them.

    A speaker label is recognised only when it looks like one: at the start
    of a line, one to three words that each start with a capital letter or
    are a number (optionally wrapped in square brackets), followed by a colon
    that is itself followed by whitespace or the end of the line. Ordinary
    sentences that merely contain a colon ("The ratio is 3:1") are left
    intact, and lower-case prefixes are not treated as labels.

    Args:
        text: Possibly multi-line text.

    Returns:
        The text with the patterns above deleted. Whitespace is not
        normalised here.
    """
    # One "name" word of a label: capitalised word ("Host", "Dr.", "JOHN")
    # or a bare number ("1" in "Speaker 1").
    label_word = r"(?:[A-Z][\w.'-]*|\d+)"
    speaker_label = (
        r'^[ \t]*\[?' + label_word + r'(?:[ \t]+' + label_word + r'){0,2}\]?'
        # The lookahead keeps times and ratios such as "10:30" from matching;
        # [ \t]* (not \s*) avoids swallowing the newline and merging lines.
        r'[ \t]*:(?=\s|$)[ \t]*'
    )
    patterns = [
        (speaker_label, ''),    # Remove any remaining speaker labels
        (r'\[.*?\]', ''),       # Remove any breakthrough brackets
        (r'\(.*?\)', ''),       # Remove any breakthrough parentheticals
        (r'"\w+:"', ''),        # Remove quoted speaker labels
        (r'<.*?>', ''),         # Remove any HTML-like tags
        (r'---.*?---', ''),     # Remove any section separators
        # "#+" so that "## Title" does not leave a stray "#" behind.
        (r'#+\s*\w+', ''),      # Remove any hashtag sections
    ]
    for pattern, replacement in patterns:
        # MULTILINE only affects the "^"/"$" anchors in the label pattern.
        text = re.sub(pattern, replacement, text, flags=re.MULTILINE)
    return text
def _lowercase_leading_letter(sentence):
    """Lower-case the first letter of ``sentence`` unless it must stay capital.

    ``sentence`` must be non-empty with no leading whitespace. The pronoun
    "I" (and contractions such as "I'm") and words whose second character is
    also upper-case (acronyms like "NASA") are returned unchanged.
    """
    first_word = sentence.split(None, 1)[0]
    if (
        first_word == "I"
        or first_word.startswith(("I'", "I\u2019"))
        or first_word[1:2].isupper()
    ):
        return sentence
    return sentence[0].lower() + sentence[1:]


def convert_to_monologue(text):
    """Convert multi-party dialog into a flowing narrative.

    The text is processed line by line. Leading speaker labels are removed,
    lines that end up empty are skipped, and every line after the first that
    starts with a capital letter is prefixed with a transitional phrase
    ("Then, ...", "After that, ..."), cycling through a fixed list.

    Args:
        text: Dialog text with one utterance per line.

    Returns:
        A single string with the processed lines joined by one space. An
        input with no usable lines yields an empty string.
    """
    # Replace dialog markers with transitional phrases
    transitions = [
        "Then", "After that", "Next", "Following that",
        "Subsequently", "Moving on", "Additionally"
    ]
    narrative = []
    transition_index = 0
    for line in text.split('\n'):
        # Remove speaker labels if any
        cleaned_line = re.sub(r'^[A-Z0-9\[\]]+:\s*', '', line.strip())
        cleaned_line = re.sub(r'^[A-Z][a-z]+:\s*', '', cleaned_line).strip()
        # A line consisting only of a label (e.g. "JOHN:") is empty by now;
        # skipping it avoids indexing into an empty string below.
        if not cleaned_line:
            continue
        # Add transition if it seems like a new thought
        if narrative and cleaned_line[0].isupper():
            # Only the leading letter is softened so names and acronyms
            # later in the sentence keep their capitalisation.
            softened = _lowercase_leading_letter(cleaned_line)
            narrative.append(f"{transitions[transition_index]}, {softened}")
            transition_index = (transition_index + 1) % len(transitions)
        else:
            narrative.append(cleaned_line)
    return ' '.join(narrative)
def clean_formatting(text):
    """Remove markdown and other formatting symbols.

    Steps, in order: strip asterisks via ``clean_asterisks`` (which also
    collapses all whitespace to single spaces), unwrap ``_emphasis_`` and
    ``~~strikethrough~~`` while keeping the inner text, then delete fenced
    code blocks and inline code spans together with their contents.

    Args:
        text: Text that may contain markdown markup.

    Returns:
        The text with the markup above removed.
    """
    # Apply asterisk cleaning first. This removes every "*", so bold and
    # italic asterisk markers need no dedicated handling afterwards.
    text = clean_asterisks(text)
    # Underscore emphasis. The lookarounds require the underscores to sit on
    # word boundaries so identifiers such as my_var_name are not mangled.
    text = re.sub(r'(?<!\w)_(.+?)_(?!\w)', r'\1', text)
    text = re.sub(r'~~(.+?)~~', r'\1', text)  # Strikethrough
    # Remove code blocks and inline code (contents included)
    text = re.sub(r'```[\s\S]*?```', '', text)
    text = re.sub(r'`[^`]*`', '', text)
    return text
def process_for_podcast(text):
    """Main function to process text for podcast narration.

    Runs the cleaning steps in an order that keeps the line structure alive
    until the narrative has been built, because speaker-label removal and
    ``convert_to_monologue`` both work line by line:

    1. delete asterisks (so ``**HOST:**`` becomes ``HOST:``);
    2. delete fenced code blocks, which may span several lines;
    3. ``remove_dialog_formatting`` on the whole text;
    4. ``clean_formatting`` on each line separately;
    5. ``remove_breakthrough_formatting`` on the whole text;
    6. ``convert_to_monologue`` to join the lines into one narrative;
    7. collapse whitespace and run a final asterisk check.

    Args:
        text: Raw, possibly multi-line, LLM-generated text.

    Returns:
        A single-line string with no asterisks and no leading/trailing
        whitespace.
    """
    # clean_asterisks() would collapse newlines here and leave the
    # line-anchored steps below with a single line, so only the asterisks
    # themselves are removed at this point.
    text = text.replace('*', '')
    # Fenced code blocks span lines, so they are removed before the per-line
    # pass in step 4 (which could no longer see both fences at once).
    text = re.sub(r'```[\s\S]*?```', '', text)
    text = remove_dialog_formatting(text)
    # clean_formatting() flattens whitespace; applying it per line keeps the
    # newlines that separate utterances.
    text = '\n'.join(clean_formatting(line) for line in text.split('\n'))
    text = remove_breakthrough_formatting(text)
    text = convert_to_monologue(text)
    # Additional cleanups: \s+ already covers newlines and repeated spaces.
    text = re.sub(r'\s+', ' ', text).strip()
    # Final asterisk check before returning
    return clean_asterisks(text)