|
import re |
|
|
|
def clean_asterisks(text):
    """Aggressively remove all asterisk patterns.

    Every ``*`` character is deleted, whether or not it is part of a
    balanced ``*emphasis*`` pair, and the result is whitespace-normalised.

    Args:
        text: The string to clean.

    Returns:
        ``text`` without any asterisks, with every run of whitespace
        (including newlines) collapsed to a single space and with leading
        and trailing whitespace removed.
    """
    # A plain replace already covers paired markers such as **bold**, so no
    # separate regex pass over "*...*" spans is needed.
    without_stars = text.replace('*', '')

    # split() with no argument splits on any whitespace run and drops empty
    # pieces, which both collapses and trims in one step.
    return ' '.join(without_stars.split())
|
|
|
def remove_dialog_formatting(text):
    """Remove common dialog markers and formatting.

    Strips, in this order:

    1. A line-leading label made of capitals, digits and square brackets
       followed by a colon (``HOST:``, ``[NARRATOR]:``, ``S1:``).
    2. A line-leading capitalised single word followed by a colon
       (``Guest:``).
    3. Anything enclosed in parentheses.
    4. Anything enclosed in square brackets.

    Args:
        text: Possibly multi-line dialog text.

    Returns:
        ``text`` with those markers deleted. Line breaks are preserved by
        the label patterns; whitespace around removed parenthesised or
        bracketed spans is left untouched.
    """
    # The label patterns also swallow the spaces/tabs after the colon so the
    # line does not start with a stray blank. Only horizontal whitespace is
    # consumed: a label alone on its line must not pull the next line up.
    text = re.sub(r'^[A-Z0-9\[\]]+:[ \t]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'^[A-Z][a-z]+:[ \t]*', '', text, flags=re.MULTILINE)

    # Parenthesised and bracketed spans; the negated character classes stop
    # each match at the first closing delimiter.
    text = re.sub(r'\([^)]*\)', '', text)
    text = re.sub(r'\[[^\]]*\]', '', text)

    return text
|
|
|
def remove_breakthrough_formatting(text):
    """Remove any LLM formatting that made it through the prompts.

    Deletes every match of a fixed list of regular expressions, applied one
    after another in the order listed. Each is applied with
    ``re.MULTILINE``, so ``^`` anchors at the start of every line.

    Args:
        text: The text to clean.

    Returns:
        ``text`` with all matches removed. Nothing is re-spaced afterwards,
        so a removal from the middle of a sentence can leave doubled spaces.
    """
    patterns = (
        # Line prefix up to and including the first colon, plus any
        # whitespace (newlines included) that follows it.
        # NOTE(review): this is not limited to speaker labels -- on a line
        # such as "Meet at 10:30" it deletes "Meet at 10:". Confirm that is
        # acceptable for the inputs this is used on.
        r'^.*?:\s*',
        r'\[.*?\]',     # [bracketed] spans
        r'\(.*?\)',     # (parenthesised) spans
        r'"\w+:"',      # a quoted single-word label such as "Host:"
        r'<.*?>',       # <angle-bracket> spans
        r'---.*?---',   # text fenced by triple dashes
        r'#\s*\w+',     # '#' followed by one word
    )

    for pattern in patterns:
        text = re.sub(pattern, '', text, flags=re.MULTILINE)

    return text
|
|
|
def convert_to_monologue(text):
    """Convert multi-party dialog into a flowing narrative.

    The text is processed line by line. Blank lines are skipped, a leading
    speaker label (``HOST:``, ``[S1]:``, ``Guest:``) is removed, and the
    remaining sentences are joined with single spaces. Every sentence after
    the first that starts with a capital letter is introduced by the next
    transition phrase from a fixed, cycling list.

    Args:
        text: Dialog text with one utterance per line.

    Returns:
        A single-line string. Empty input, or input consisting only of
        blank lines and bare labels, yields ``""``.
    """
    transitions = (
        "Then", "After that", "Next", "Following that",
        "Subsequently", "Moving on", "Additionally",
    )

    # Applied one after the other, so a line carrying both label shapes
    # loses both. Compiled once rather than looked up per line.
    label_patterns = (
        re.compile(r'^[A-Z0-9\[\]]+:\s*'),
        re.compile(r'^[A-Z][a-z]+:\s*'),
    )

    narrative = []
    used_transitions = 0

    for raw_line in text.split('\n'):
        # Strip first so indentation cannot hide a label from the
        # '^'-anchored patterns or defeat the capital-letter test below.
        sentence = raw_line.strip()
        for label in label_patterns:
            sentence = label.sub('', sentence)

        # A line that held nothing but a label is empty now; indexing it
        # would raise IndexError, so drop it.
        if not sentence:
            continue

        if narrative and sentence[0].isupper():
            lead_in = transitions[used_transitions % len(transitions)]
            narrative.append(f"{lead_in}, {_lower_sentence_start(sentence)}")
            used_transitions += 1
        else:
            narrative.append(sentence)

    return ' '.join(narrative)


def _lower_sentence_start(sentence):
    """Lower-case the first letter of a non-empty sentence when that is safe.

    Only the first character is changed so that names and acronyms later in
    the sentence keep their capitals. The sentence is returned untouched
    when its first word is the pronoun "I" (also "I'm", "I'll", ...) or
    when the first word's second character is upper-case, which is treated
    as an acronym such as "NASA".
    """
    first_word = sentence.split(None, 1)[0]
    starts_with_pronoun_i = (
        first_word.rstrip('.,!?;:') == 'I'
        or first_word[:2] in ("I'", "I\u2019")
    )
    starts_with_acronym = len(first_word) > 1 and first_word[1].isupper()
    if starts_with_pronoun_i or starts_with_acronym:
        return sentence
    return sentence[0].lower() + sentence[1:]
|
|
|
def clean_formatting(text):
    """Remove markdown and other formatting symbols.

    Steps, in order:

    1. ``clean_asterisks`` removes every ``*`` and collapses all whitespace
       (newlines included) to single spaces.
    2. Underscore emphasis markers (``__bold__``, ``_italic_``) are removed
       and the emphasised text is kept.
    3. Strikethrough markers (``~~text~~``) are removed and the text kept.
    4. Fenced code blocks and inline code spans are deleted together with
       their contents.

    Args:
        text: Markdown-flavoured text.

    Returns:
        The cleaned text as a single line.
    """
    # After this call no '*' is left, so no further **bold** / *italic*
    # handling is needed.
    text = clean_asterisks(text)

    # Underscore emphasis only counts when the markers sit at a word edge;
    # otherwise identifiers such as snake_case_name would lose their
    # underscores. [^\W_] means "a letter or digit": the double-underscore
    # pass must not treat a neighbouring '_' as part of a word, while the
    # single-underscore pass uses \w so it cannot match inside a '__' run.
    text = re.sub(r'(?<![^\W_])__(.+?)__(?![^\W_])', r'\1', text)
    text = re.sub(r'(?<!\w)_(.+?)_(?!\w)', r'\1', text)

    # Strikethrough: keep the struck text, drop the tildes.
    text = re.sub(r'~~(.+?)~~', r'\1', text)

    # Code is removed entirely: fenced blocks first so their backticks are
    # not mistaken for inline spans.
    text = re.sub(r'```[\s\S]*?```', '', text)
    text = re.sub(r'`[^`]*`', '', text)

    return text
|
|
|
def process_for_podcast(text):
    """Main function to process text for podcast narration.

    Pipeline, in order:

    1. Remove asterisks (line by line).
    2. Remove dialog markers: speaker labels, parenthesised and bracketed
       spans.
    3. Remove fenced code blocks, then the remaining markdown formatting
       (line by line).
    4. Remove leftover LLM formatting.
    5. Merge the lines into one narrative with transition phrases.
    6. Collapse whitespace and run a final asterisk sweep.

    ``clean_asterisks`` and ``clean_formatting`` collapse every newline to
    a space. They are therefore applied per line here; applying them to the
    whole text would hand the line-oriented stages (2, 4 and 5) a single
    line, and stage 4's line-prefix pattern would then delete everything up
    to the first remaining colon, i.e. the first speaker's whole turn.

    Args:
        text: Raw, possibly multi-line text.

    Returns:
        A single-line string with single spaces and no leading or trailing
        whitespace.
    """
    text = _apply_per_line(clean_asterisks, text)
    text = remove_dialog_formatting(text)

    # Fenced code blocks span several lines, so they have to go before the
    # per-line formatting pass, which could never see a complete fence.
    text = re.sub(r'```[\s\S]*?```', '', text)
    text = _apply_per_line(clean_formatting, text)

    text = remove_breakthrough_formatting(text)
    text = convert_to_monologue(text)

    # \s already matches newlines, so one substitution normalises all
    # remaining whitespace.
    text = re.sub(r'\s+', ' ', text).strip()

    # Final safety sweep for any asterisk that survived the stages above.
    return clean_asterisks(text)


def _apply_per_line(cleaner, text):
    """Run a newline-collapsing cleaner on each line and rejoin with newlines."""
    return '\n'.join(cleaner(line) for line in text.split('\n'))
|
|