Spaces:

phxdev
/

podcaster

Running

marks committed on Jan 28

Commit

218c539

1 Parent(s): b5f9861

More cleaning functions

Files changed (2) hide show

text_processor.py CHANGED Viewed

@@ -1,5 +1,15 @@
 import re
 def remove_dialog_formatting(text):
     """Remove common dialog markers and formatting."""
     # Remove speaker labels (e.g., "John:", "JOHN:", "[John]:")
@@ -57,6 +67,9 @@ def convert_to_monologue(text):
 def clean_formatting(text):
     """Remove markdown and other formatting symbols."""
     # Remove markdown formatting
     text = re.sub(r'\*\*(.+?)\*\*', r'\1', text)  # Bold
     text = re.sub(r'\*(.+?)\*', r'\1', text)      # Italic
@@ -71,6 +84,8 @@ def clean_formatting(text):
 def process_for_podcast(text):
     """Main function to process text for podcast narration."""
     text = remove_dialog_formatting(text)
     text = clean_formatting(text)
     text = remove_breakthrough_formatting(text)
@@ -81,4 +96,6 @@ def process_for_podcast(text):
     text = re.sub(r'\n+', ' ', text)  # Remove newlines
     text = text.strip()
     return text

 import re
+def clean_asterisks(text):
+    """Aggressively remove all asterisk patterns."""
+    # Remove any number of asterisks with content between them
+    text = re.sub(r'\*+([^*]*)\*+', r'\1', text)
+    # Remove any remaining single asterisks
+    text = text.replace('*', '')
+    # Remove multiple spaces that might result
+    text = ' '.join(text.split())
+    return text
 def remove_dialog_formatting(text):
     """Remove common dialog markers and formatting."""
     # Remove speaker labels (e.g., "John:", "JOHN:", "[John]:")
 def clean_formatting(text):
     """Remove markdown and other formatting symbols."""
+    # Apply asterisk cleaning first
+    text = clean_asterisks(text)
     # Remove markdown formatting
     text = re.sub(r'\*\*(.+?)\*\*', r'\1', text)  # Bold
     text = re.sub(r'\*(.+?)\*', r'\1', text)      # Italic
 def process_for_podcast(text):
     """Main function to process text for podcast narration."""
+    # Apply asterisk cleaning as first step
+    text = clean_asterisks(text)
     text = remove_dialog_formatting(text)
     text = clean_formatting(text)
     text = remove_breakthrough_formatting(text)
     text = re.sub(r'\n+', ' ', text)  # Remove newlines
     text = text.strip()
+    # Final asterisk check before returning
+    text = clean_asterisks(text)
     return text

tts.py CHANGED Viewed

@@ -1,51 +1,9 @@
-import re
 import requests
 from .text_processor import process_for_podcast
-def clean_text_for_speech(text):
-    # First apply podcast-specific processing
-    text = process_for_podcast(text)
-    # Replace URLs with readable text
-    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
-                  ' link ', text)
-    # Replace common symbols with spoken equivalents
-    replacements = {
-        '*': '',
-        '#': 'hashtag',
-        '@': 'at',
-        '&': 'and',
-        '%': 'percent',
-        '+': 'plus',
-        '=': 'equals',
-        '/': ' or ',
-        '\\': ' ',
-        '|': ' ',
-        '_': ' ',
-        '>': 'greater than',
-        '<': 'less than',
-        '`': '',
-        '~': '',
-        '[': '',
-        ']': '',
-        '{': '',
-        '}': '',
-        '(': '',
-        ')': '',
-    }
-    for old, new in replacements.items():
-        text = text.replace(old, new)
-    # Remove multiple spaces
-    text = ' '.join(text.split())
-    return text
 def text_to_speech(text, api_key):
-    # Clean text before sending to API
-    cleaned_text = clean_text_for_speech(text)
     url = "https://api.elevenlabs.io/v1/text-to-speech"
     headers = {

 import requests
 from .text_processor import process_for_podcast
 def text_to_speech(text, api_key):
+    # Use the enhanced text processor
+    cleaned_text = process_for_podcast(text)
     url = "https://api.elevenlabs.io/v1/text-to-speech"
     headers = {