Spaces:

phxdev
/

podcaster

Running

podcaster / text_processor.py

marks

More cleaning functions

218c539 2 months ago

3.74 kB

	import re

	def clean_asterisks(text):
	    """Strip every asterisk from ``text`` and normalise its whitespace.

	    Markdown emphasis such as ``*word*`` or ``**word**`` is reduced to the
	    bare word, and stray asterisks are dropped as well. Every run of
	    whitespace (spaces, tabs, newlines) is then collapsed to a single
	    space and leading/trailing whitespace is removed, so the result is
	    always a single line.

	    Args:
	        text: Input string, possibly containing markdown asterisks.

	    Returns:
	        The text without any ``*`` characters, words separated by exactly
	        one space.
	    """
	    # Deleting every '*' covers paired emphasis markers and stray
	    # asterisks alike, so no separate regex pass is needed.
	    without_stars = text.replace('*', '')
	    # str.split() with no argument splits on any whitespace run and
	    # discards empty leading/trailing fields.
	    return ' '.join(without_stars.split())

	def remove_dialog_formatting(text):
	    """Remove speaker labels and bracketed or parenthesised asides.

	    A speaker label is only recognised at the start of a line. Three
	    shapes are removed, colon included: a bracketed name (``[John]:``),
	    an upper-case/digit label (``JOHN:``, ``SPEAKER1:``) and a single
	    capitalised word (``John:``). Whitespace following the colon is left
	    in place. Afterwards every ``(...)`` and ``[...]`` span anywhere in
	    the text is deleted.

	    Args:
	        text: Dialog-style text, possibly spanning several lines.

	    Returns:
	        The text with labels and stage directions removed.
	    """
	    label_patterns = (
	        # Bracketed label of any case. This must run before the bracket
	        # stripping below, which would otherwise leave a dangling ':'.
	        r'^\[[^\]]+\]:',
	        r'^[A-Z0-9\[\]]+:',   # "JOHN:", "SPEAKER1:", "[JOHN]:"
	        r'^[A-Z][a-z]+:',     # "John:"
	    )
	    for pattern in label_patterns:
	        text = re.sub(pattern, '', text, flags=re.MULTILINE)

	    # Remove parenthetical and bracketed stage directions.
	    text = re.sub(r'\([^)]*\)', '', text)
	    text = re.sub(r'\[[^\]]*\]', '', text)

	    return text

	def remove_breakthrough_formatting(text):
	    """Delete leftover LLM markup that slipped past the prompts.

	    Each pattern is removed from the text in the order listed, with
	    ``re.MULTILINE`` so that ``^`` anchors at the start of every line.

	    Args:
	        text: Text that may still contain labels, brackets, tags, etc.

	    Returns:
	        The text with every match of every pattern deleted.
	    """
	    # NOTE(review): the first pattern allows at most one character before
	    # the colon, so it only strips labels like "A: " -- confirm that
	    # longer labels are meant to be handled elsewhere.
	    leftovers = (
	        r'^.?:\s',      # short speaker label at the start of a line
	        r'\[.*?\]',     # bracketed spans
	        r'\(.*?\)',     # parenthesised spans
	        r'"\w+:"',      # quoted speaker labels
	        r'<.*?>',       # HTML-like tags
	        r'---.*?---',   # section separators
	        r'#\s*\w+',     # '#' plus the word that follows it
	    )

	    cleaned = text
	    for leftover in leftovers:
	        cleaned = re.sub(leftover, '', cleaned, flags=re.MULTILINE)
	    return cleaned

	def convert_to_monologue(text):
	    """Convert multi-party dialog into a single flowing narrative.

	    The text is processed line by line. Blank lines are skipped, a
	    leading speaker label (``JOHN:``, ``[JOHN]:``, ``John:``) is removed,
	    and lines that contain nothing but a label are skipped too. The
	    first kept line is used as is. Every later line that starts with an
	    upper-case character is lower-cased and prefixed with the next
	    transition phrase (cycling through the list); other lines are
	    appended unchanged. The pieces are joined with single spaces.

	    Args:
	        text: Dialog text with one utterance per line.

	    Returns:
	        The narrative as one string; ``""`` when no line has content.
	    """
	    transitions = (
	        "Then", "After that", "Next", "Following that",
	        "Subsequently", "Moving on", "Additionally",
	    )

	    narrative = []
	    next_transition = 0

	    for line in text.split('\n'):
	        if not line.strip():
	            continue

	        # Remove speaker labels if any.
	        cleaned_line = re.sub(r'^[A-Z0-9\[\]]+:\s*', '', line)
	        cleaned_line = re.sub(r'^[A-Z][a-z]+:\s*', '', cleaned_line)

	        # A label-only line leaves nothing to narrate; skipping it also
	        # keeps the first-character check below from indexing an empty
	        # string.
	        if not cleaned_line:
	            continue

	        # Add a transition if it seems like a new thought.
	        if narrative and cleaned_line[0].isupper():
	            phrase = transitions[next_transition]
	            narrative.append(f"{phrase}, {cleaned_line.lower()}")
	            next_transition = (next_transition + 1) % len(transitions)
	        else:
	            narrative.append(cleaned_line)

	    return ' '.join(narrative)

	def clean_formatting(text):
	    """Remove markdown emphasis, strikethrough and code from ``text``.

	    Steps, in order:
	      1. ``clean_asterisks`` strips every ``*``, which covers ``*italic*``
	         and ``**bold**``.
	      2. ``__bold__`` and ``_emphasis_`` markers are removed, keeping the
	         enclosed text.
	      3. ``~~strikethrough~~`` markers are removed, keeping the text.
	      4. Fenced code blocks and inline code spans are deleted together
	         with their contents.

	    Args:
	        text: Text that may contain markdown formatting.

	    Returns:
	        The text with the formatting above removed.
	    """
	    # Asterisk-based bold/italic needs no pattern of its own: after this
	    # call no '*' is left in the text.
	    text = clean_asterisks(text)

	    # Double underscores first; the single-underscore pattern alone would
	    # turn "__bold__" into "_bold_".
	    text = re.sub(r'__(.+?)__', r'\1', text)      # Underscore bold
	    text = re.sub(r'_(.+?)_', r'\1', text)        # Underscore emphasis
	    text = re.sub(r'~~(.+?)~~', r'\1', text)      # Strikethrough

	    # Remove code blocks and inline code, contents included.
	    text = re.sub(r'```[\s\S]*?```', '', text)
	    text = re.sub(r'`[^`]*`', '', text)

	    return text

	def process_for_podcast(text):
	    """Run the full cleaning pipeline that prepares text for narration.

	    Steps, in order: ``clean_asterisks``, ``remove_dialog_formatting``,
	    ``clean_formatting``, ``remove_breakthrough_formatting``,
	    ``convert_to_monologue``, a whitespace collapse plus strip, and a
	    final ``clean_asterisks`` pass.

	    Args:
	        text: Raw text to clean.

	    Returns:
	        The cleaned text.
	    """
	    # NOTE(review): clean_asterisks joins the text on single spaces, so
	    # the line-anchored steps that follow see one line -- confirm that
	    # flattening this early is intended.
	    text = clean_asterisks(text)
	    text = remove_dialog_formatting(text)
	    text = clean_formatting(text)
	    text = remove_breakthrough_formatting(text)
	    text = convert_to_monologue(text)

	    # Collapse every whitespace run to one space. '\s' already matches
	    # newlines, so no separate newline substitution is needed.
	    text = re.sub(r'\s+', ' ', text).strip()

	    # Final asterisk check before returning.
	    return clean_asterisks(text)