marks committed on
Commit
218c539
·
1 Parent(s): b5f9861

More cleaning functions

Browse files
Files changed (2) hide show
  1. text_processor.py +17 -0
  2. tts.py +2 -44
text_processor.py CHANGED
@@ -1,5 +1,15 @@
1
  import re
2
 
 
 
 
 
 
 
 
 
 
 
3
  def remove_dialog_formatting(text):
4
  """Remove common dialog markers and formatting."""
5
  # Remove speaker labels (e.g., "John:", "JOHN:", "[John]:")
@@ -57,6 +67,9 @@ def convert_to_monologue(text):
57
 
58
  def clean_formatting(text):
59
  """Remove markdown and other formatting symbols."""
 
 
 
60
  # Remove markdown formatting
61
  text = re.sub(r'\*\*(.+?)\*\*', r'\1', text) # Bold
62
  text = re.sub(r'\*(.+?)\*', r'\1', text) # Italic
@@ -71,6 +84,8 @@ def clean_formatting(text):
71
 
72
  def process_for_podcast(text):
73
  """Main function to process text for podcast narration."""
 
 
74
  text = remove_dialog_formatting(text)
75
  text = clean_formatting(text)
76
  text = remove_breakthrough_formatting(text)
@@ -81,4 +96,6 @@ def process_for_podcast(text):
81
  text = re.sub(r'\n+', ' ', text) # Remove newlines
82
  text = text.strip()
83
 
 
 
84
  return text
 
1
  import re
2
 
3
def clean_asterisks(text):
    """Strip every asterisk from *text* and collapse whitespace.

    Paired markers such as ``**bold**`` or ``*italic*`` and stray single
    asterisks are all handled the same way: the asterisk characters are
    deleted and the text between them is kept.

    Args:
        text: The string to clean.

    Returns:
        The text without any ``*`` characters, with every run of whitespace
        reduced to a single space and no leading or trailing whitespace.
    """
    # One plain replace is enough: a regex pass over "*...*" pairs would only
    # delete asterisks as well, so it adds no behavior on top of this.
    without_asterisks = text.replace('*', '')

    # split() with no argument splits on any whitespace run, so this also
    # flattens newlines and tabs into single spaces, not just double spaces
    # left behind by removed markers.
    return ' '.join(without_asterisks.split())
12
+
13
  def remove_dialog_formatting(text):
14
  """Remove common dialog markers and formatting."""
15
  # Remove speaker labels (e.g., "John:", "JOHN:", "[John]:")
 
67
 
68
  def clean_formatting(text):
69
  """Remove markdown and other formatting symbols."""
70
+ # Apply asterisk cleaning first
71
+ text = clean_asterisks(text)
72
+
73
  # Remove markdown formatting
74
  text = re.sub(r'\*\*(.+?)\*\*', r'\1', text) # Bold
75
  text = re.sub(r'\*(.+?)\*', r'\1', text) # Italic
 
84
 
85
  def process_for_podcast(text):
86
  """Main function to process text for podcast narration."""
87
+ # Apply asterisk cleaning as first step
88
+ text = clean_asterisks(text)
89
  text = remove_dialog_formatting(text)
90
  text = clean_formatting(text)
91
  text = remove_breakthrough_formatting(text)
 
96
  text = re.sub(r'\n+', ' ', text) # Remove newlines
97
  text = text.strip()
98
 
99
+ # Final asterisk check before returning
100
+ text = clean_asterisks(text)
101
  return text
tts.py CHANGED
@@ -1,51 +1,9 @@
1
- import re
2
  import requests
3
  from .text_processor import process_for_podcast
4
 
5
def clean_text_for_speech(text):
    """Prepare *text* for a text-to-speech request.

    Runs ``process_for_podcast`` on the input, replaces URLs with the word
    "link", swaps symbols for spoken words (or drops them), and finally
    collapses every run of whitespace into a single space.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned string with single spaces between words.
    """
    # First apply podcast-specific processing
    text = process_for_podcast(text)

    # Replace URLs with readable text
    text = re.sub(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
        ' link ',
        text,
    )

    # Symbols mapped to what should be spoken instead. Spoken words are padded
    # with spaces so they never fuse with neighbouring text ("R&D" must become
    # "R and D", not "RandD"); the extra spaces are collapsed at the end.
    spoken_symbols = {
        '*': '',
        '#': ' hashtag ',
        '@': ' at ',
        '&': ' and ',
        '%': ' percent ',
        '+': ' plus ',
        '=': ' equals ',
        '/': ' or ',
        '\\': ' ',
        '|': ' ',
        '_': ' ',
        '>': ' greater than ',
        '<': ' less than ',
        '`': '',
        '~': '',
        '[': '',
        ']': '',
        '{': '',
        '}': '',
        '(': '',
        ')': '',
    }

    # Every key is a single character and no replacement contains another key,
    # so one translate() pass gives the same result as chained replace() calls.
    text = text.translate(str.maketrans(spoken_symbols))

    # Remove multiple spaces
    return ' '.join(text.split())
45
-
46
  def text_to_speech(text, api_key):
47
- # Clean text before sending to API
48
- cleaned_text = clean_text_for_speech(text)
49
 
50
  url = "https://api.elevenlabs.io/v1/text-to-speech"
51
  headers = {
 
 
1
  import requests
2
  from .text_processor import process_for_podcast
3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  def text_to_speech(text, api_key):
5
+ # Use the enhanced text processor
6
+ cleaned_text = process_for_podcast(text)
7
 
8
  url = "https://api.elevenlabs.io/v1/text-to-speech"
9
  headers = {