marks
committed on
Commit
·
b5f9861
1
Parent(s):
697ec60
Upgraded prompt
Browse files- prompt_templates.py +29 -0
- setup.py +5 -1
- text_processor.py +84 -0
- tts.py +4 -0
prompt_templates.py
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# System prompt sent verbatim to the model: ten numbered style rules followed
# by one bad (dialog-style) and one good (narrative-style) example.
PODCAST_SYSTEM_PROMPT = "\n".join((
    "You are a professional podcast scriptwriter. Follow these rules strictly:",
    "1. Write in natural, conversational prose only",
    "2. Never use markdown formatting",
    "3. Never write dialog or conversation format",
    "4. Never use speaker labels, colons, or turn-taking",
    "5. Never include stage directions or [bracketed text]",
    "6. Never use asterisks, underscores, or other formatting symbols",
    "7. Write as a continuous narrative",
    "8. Avoid technical jargon unless explicitly explaining it",
    "9. Use complete sentences and proper transitions",
    "10. Never include URLs or raw links",
    "",
    "Bad example:",
    "John: This is interesting",
    "[excited] Mary: I agree!",
    "",
    "Good example:",
    "This topic is particularly interesting, and there's strong agreement "
    "among experts about its significance.",
    "",  # keeps the trailing newline of the original literal
))
|
20 |
+
|
21 |
+
def create_podcast_prompt(topic, duration_minutes=10):
    """Build the user prompt asking for a single-narrator podcast script.

    Args:
        topic: Subject of the episode; interpolated into the prompt as-is.
        duration_minutes: Target length in minutes named in the prompt.

    Returns:
        A three-line prompt string.
    """
    request = (
        f"Using the style guidelines provided, create a {duration_minutes}-minute "
        f"podcast script about {topic}."
    )
    focus = (
        "Focus on creating engaging, flowing narrative content that a single "
        "voice can narrate naturally."
    )
    tone = (
        "The content should be informative yet conversational, avoiding any "
        "formatting or dialog structure."
    )
    return "\n".join((request, focus, tone))
|
25 |
+
|
26 |
+
def create_episode_segments(topic, segments=3):
    """Build the prompt asking for an episode split into linked segments.

    Args:
        topic: Subject of the episode; interpolated into the prompt as-is.
        segments: Number of segments requested in the prompt.

    Returns:
        A three-line prompt string.
    """
    prompt_lines = [
        f"Create {segments} distinct segments about {topic}.",
        "Each segment should flow naturally into the next, using clear transitional phrases.",
        "Remember to maintain a single narrative voice throughout.",
    ]
    return "\n".join(prompt_lines)
|
setup.py
CHANGED
@@ -6,6 +6,10 @@ setup(
|
|
6 |
packages=find_packages(),
|
7 |
install_requires=[
|
8 |
'rich',
|
9 |
-
|
|
|
|
|
|
|
|
|
10 |
]
|
11 |
)
|
|
|
6 |
packages=find_packages(),
|
7 |
install_requires=[
|
8 |
'rich',
|
9 |
+
'requests',
|
10 |
+
'python-dotenv',
|
11 |
+
'openai', # If using OpenAI
|
12 |
+
'anthropic', # If using Claude
|
13 |
+
'regex', # For more advanced regex operations
|
14 |
]
|
15 |
)
|
text_processor.py
ADDED
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
|
3 |
+
def remove_dialog_formatting(text):
    """Strip speaker labels and stage directions from script text.

    Three kinds of markup are removed:

    * bracketed speaker labels at the start of a line, e.g. ``[John]:``
    * parenthetical ``(...)`` and bracketed ``[...]`` stage directions
      anywhere in the text
    * plain speaker labels at the start of a line: an all-caps/digit token
      (``JOHN:``, ``HOST2:``) or a single capitalised word (``John:``)

    Args:
        text: Raw script text, possibly spanning several lines.

    Returns:
        The text with labels and directions removed. Whitespace left inside
        a line where a direction was cut out is not collapsed.
    """
    # Bracketed labels must go first: once the generic bracket pass below has
    # run, "[John]: hi" would be reduced to a dangling ": hi".
    text = re.sub(r'^[ \t]*\[[^\]\n]*\]:[ \t]*', '', text, flags=re.MULTILINE)

    # Stage directions, e.g. "(laughs)" or "[excited]".
    text = re.sub(r'\([^)]*\)', '', text)
    text = re.sub(r'\[[^\]]*\]', '', text)

    # Plain labels are stripped after the directions so that a line such as
    # "[excited] Mary: ..." still loses its label. `(?!\d)` keeps clock times
    # like "10:30" intact, and `[ \t]*` (not `\s*`) avoids joining lines.
    text = re.sub(
        r'^[ \t]*(?:[A-Z0-9]+|[A-Z][a-z]+):(?!\d)[ \t]*',
        '',
        text,
        flags=re.MULTILINE,
    )

    return text
|
14 |
+
|
15 |
+
def remove_breakthrough_formatting(text):
    """Remove LLM formatting that slipped through despite the prompt rules.

    Patterns are applied in order, each over the whole text with
    ``re.MULTILINE`` so that ``^``/``$`` anchor at every line.

    Args:
        text: Script text, possibly spanning several lines.

    Returns:
        The text with speaker labels, bracketed/parenthetical asides, quoted
        labels, HTML-like tags, dash separators, heading markers and hashtags
        removed. Ordinary sentences containing a colon are left untouched.
    """
    patterns = [
        # Bracketed speaker labels ("[John]: ...") before generic brackets,
        # otherwise a dangling ": ..." would be left behind.
        (r"^[ \t]*\[[^\]\n]*\]:[ \t]*", ''),
        (r"\[.*?\]", ''),        # Remove any breakthrough brackets
        (r"\(.*?\)", ''),        # Remove any breakthrough parentheticals
        (r'"\w+:"', ''),         # Remove quoted speaker labels
        # Only real tags: "<" must be followed by a tag name, so comparisons
        # such as "a < b and c > d" survive.
        (r"</?[A-Za-z][^<>]*>", ''),
        # Speaker labels: one to three capitalised/numeric tokens ending in a
        # colon ("Mary:", "Speaker 1:", "Dr. Smith:"). Deliberately narrow so
        # that prose like "The answer is simple: ..." is not truncated;
        # `(?!\d)` protects clock times such as "At 10:30".
        (r"^[ \t]*[A-Z][\w.'-]*(?:[ \t]+[A-Z0-9][\w.'-]*){0,2}:(?!\d)[ \t]*", ''),
        (r"---.*?---", ''),               # Remove "--- title ---" separators
        (r"^[ \t]*-{3,}[ \t]*$", ''),     # Remove bare horizontal rules
        (r"^[ \t]*#{1,6}[ \t]+", ''),     # Drop heading markers, keep the text
        (r"#\w+", ''),                    # Remove inline hashtags
    ]

    for pattern, replacement in patterns:
        text = re.sub(pattern, replacement, text, flags=re.MULTILINE)
    return text
|
30 |
+
|
31 |
+
def _lowercase_sentence_start(sentence):
    """Lowercase the first letter of *sentence* unless its first word must stay capitalised."""
    first_word = sentence.split(None, 1)[0].rstrip(".,;:!?")
    keeps_capital = (
        first_word == "I"
        or first_word.startswith(("I'", "I\u2019"))          # I'm, I've, ...
        or (len(first_word) > 1 and first_word[1].isupper())  # acronyms: NASA, AI
    )
    if keeps_capital:
        return sentence
    return sentence[0].lower() + sentence[1:]


def convert_to_monologue(text):
    """Convert multi-party dialog into a flowing narrative.

    Each non-empty line has a leading speaker label stripped. Every line
    after the first that starts with an uppercase letter is prefixed with
    the next transitional phrase (cycling through a fixed list), and all
    resulting lines are joined with single spaces.

    Args:
        text: Script text with one utterance per line.

    Returns:
        A single-line narrative string; ``""`` when no line has content.
    """
    # Replace dialog markers with transitional phrases
    transitions = [
        "Then", "After that", "Next", "Following that",
        "Subsequently", "Moving on", "Additionally"
    ]

    narrative = []
    current_transition = 0

    for line in text.split('\n'):
        # Strip first so indented lines are recognised like any other.
        cleaned_line = re.sub(r'^[A-Z0-9\[\]]+:(?!\d)\s*', '', line.strip())
        cleaned_line = re.sub(r'^[A-Z][a-z]+:\s*', '', cleaned_line).strip()

        # A line that held only a speaker label is empty now; indexing it
        # below would raise IndexError, so skip it.
        if not cleaned_line:
            continue

        # Add transition if it seems like a new thought
        if narrative and cleaned_line[0].isupper():
            # Only the sentence-initial letter is lowered; lowercasing the
            # whole line would destroy proper nouns and acronyms.
            joined = _lowercase_sentence_start(cleaned_line)
            narrative.append(f"{transitions[current_transition]}, {joined}")
            current_transition = (current_transition + 1) % len(transitions)
        else:
            narrative.append(cleaned_line)

    return ' '.join(narrative)
|
57 |
+
|
58 |
+
def clean_formatting(text):
    """Remove markdown and other formatting symbols.

    Code blocks and inline code are deleted together with their content;
    emphasis markers (bold, italic, strikethrough) are removed while the
    emphasised text is kept.

    Args:
        text: Text that may contain markdown.

    Returns:
        The text without markdown emphasis markers or code spans.
    """
    # Remove code first so that `*` or `_` inside code cannot pair up with
    # emphasis markers in the surrounding prose.
    text = re.sub(r'```[\s\S]*?```', '', text)
    text = re.sub(r'`[^`]*`', '', text)

    # Emphasis markers must hug their content (`(?=\S)` / `(?<=\S)`), so
    # arithmetic like "2 * 3 * 4" is left alone. Underscore forms must also
    # sit on word boundaries, so snake_case identifiers are left alone.
    # Double markers are handled before single ones.
    text = re.sub(r'\*\*(?=\S)(.+?)(?<=\S)\*\*', r'\1', text)              # Bold
    text = re.sub(r'(?<!\w)__(?=\S)(.+?)(?<=\S)__(?!\w)', r'\1', text)     # Bold (underscores)
    text = re.sub(r'\*(?=\S)(.+?)(?<=\S)\*', r'\1', text)                  # Italic
    text = re.sub(r'(?<!\w)_(?=\S)(.+?)(?<=\S)_(?!\w)', r'\1', text)       # Underscore emphasis
    text = re.sub(r'~~(.+?)~~', r'\1', text)                               # Strikethrough

    return text
|
71 |
+
|
72 |
+
def process_for_podcast(text):
    """Main function to process text for podcast narration.

    Runs the cleaning stages in order (dialog markers, markdown, leftover
    formatting, monologue conversion) and then normalises whitespace.

    Args:
        text: Raw script text.

    Returns:
        A single-line string with runs of whitespace collapsed to one space
        and no leading or trailing whitespace.
    """
    text = remove_dialog_formatting(text)
    text = clean_formatting(text)
    text = remove_breakthrough_formatting(text)
    text = convert_to_monologue(text)

    # `\s` already matches newlines, so one pass collapses both repeated
    # spaces and line breaks; a separate newline pass would never match.
    text = re.sub(r'\s+', ' ', text)

    # Cutting out asides leaves gaps such as "we met , then"; pull the
    # punctuation back onto the preceding word. The lookahead restricts this
    # to punctuation that ends a word group, so ".5" or "..." is untouched.
    text = re.sub(r' +([,.;:!?])(?= |$)', r'\1', text)

    return text.strip()
|
tts.py
CHANGED
@@ -1,7 +1,11 @@
|
|
1 |
import re
|
2 |
import requests
|
|
|
3 |
|
4 |
def clean_text_for_speech(text):
|
|
|
|
|
|
|
5 |
# Replace URLs with readable text
|
6 |
text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
|
7 |
' link ', text)
|
|
|
1 |
import re
|
2 |
import requests
|
3 |
+
from .text_processor import process_for_podcast
|
4 |
|
5 |
def clean_text_for_speech(text):
|
6 |
+
# First apply podcast-specific processing
|
7 |
+
text = process_for_podcast(text)
|
8 |
+
|
9 |
# Replace URLs with readable text
|
10 |
text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
|
11 |
' link ', text)
|