Spaces:
Running
Running
..
Browse files
app.py
CHANGED
@@ -63,33 +63,53 @@ english_topic_labels = [
|
|
63 |
]
|
64 |
|
65 |
# Classify a topic by counting keyword matches in the text
|
66 |
-
def classify_topic_by_keywords(text,
|
67 |
-
#
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
78 |
text = text.lower()
|
79 |
-
|
80 |
-
#
|
81 |
-
topic_scores = {
|
82 |
|
83 |
for topic, words in keywords.items():
|
84 |
for word in words:
|
85 |
if word in text:
|
86 |
-
topic_scores[topic] += 1 #
|
87 |
-
|
88 |
-
#
|
|
|
|
|
|
|
|
|
89 |
best_topic = max(topic_scores, key=topic_scores.get)
|
90 |
return best_topic
|
91 |
|
92 |
|
|
|
|
|
93 |
def transcribe_audio(audio):
|
94 |
"""Convert audio to text, translate it, and classify topics in both Darija and English."""
|
95 |
try:
|
@@ -111,8 +131,9 @@ def transcribe_audio(audio):
|
|
111 |
english_topic = classify_topic(translation, bert_tokenizer, bert_model, english_topic_labels)
|
112 |
|
113 |
# Classify topics using keywords-based classification
|
114 |
-
darija_keyword_topic = classify_topic_by_keywords(transcription,
|
115 |
-
english_keyword_topic = classify_topic_by_keywords(
|
|
|
116 |
|
117 |
return transcription, translation, darija_topic, english_topic, darija_keyword_topic, english_keyword_topic
|
118 |
|
|
|
63 |
]
|
64 |
|
65 |
# Classify a topic by counting keyword matches in the text
|
66 |
+
def classify_topic_by_keywords(text, language='ar'):
    """Return the topic whose keywords occur most often in ``text``.

    Each topic has a fixed keyword list per language. A keyword counts once
    when it appears anywhere in the lowercased text (plain substring test,
    not whole-word matching), and the topic with the most matching keywords
    wins. Ties go to the topic listed first in the keyword table.

    Args:
        text: The text to classify. ``None`` or an empty string is treated
            as containing no keywords.
        language: ``'ar'`` to use the Arabic keyword table (default) or
            ``'en'`` to use the English one.

    Returns:
        One of ``"Customer Service"``, ``"Retention Service"``,
        ``"Billing Issue"`` or ``"Other"``. ``"Other"`` is also the
        fallback when no keyword matches at all.

    Raises:
        ValueError: If ``language`` is neither ``'ar'`` nor ``'en'``.
    """
    # Arabic keywords for each topic. Each keyword must be listed once per
    # topic: a repeated entry would count a single occurrence twice.
    arabic_keywords = {
        "Customer Service": ["خدمة", "استفسار", "مساعدة", "دعم", "سؤال"],
        "Retention Service": ["احتفاظ", "تجديد", "خصم", "عرض", "العرض"],
        "Billing Issue": ["فاتورة", "دفع", "مشكلة", "خطأ", "مبلغ"],
        "Other": ["شيء آخر", "غير ذلك", "أخرى"],
    }

    # English keywords for each topic
    english_keywords = {
        "Customer Service": ["service", "inquiry", "help", "support", "question", "assistance"],
        "Retention Service": ["retain", "cut", "discount", "offer", "promotion", "stop"],
        "Billing Issue": ["bill", "payment", "problem", "error", "amount"],
        "Other": ["other", "none of the above", "something else"],
    }

    # Select the keyword table first so an invalid language is always
    # reported, regardless of what the text contains.
    if language == 'ar':
        keywords = arabic_keywords
    elif language == 'en':
        keywords = english_keywords
    else:
        raise ValueError("Invalid language specified. Use 'ar' for Arabic or 'en' for English.")

    # Nothing to scan (e.g. an upstream transcription produced no text).
    if not text:
        return "Other"

    # Lowercase so the English keywords match regardless of input casing.
    text = text.lower()

    # Score each topic by the number of distinct keywords found in the text;
    # set() guards against accidental duplicates in a keyword list.
    topic_scores = {
        topic: sum(1 for word in set(words) if word in text)
        for topic, words in keywords.items()
    }

    # max() returns the first topic with the highest score, so ties resolve
    # in table order.
    best_topic = max(topic_scores, key=topic_scores.get)

    # A top score of zero means no keyword matched anywhere.
    if topic_scores[best_topic] == 0:
        return "Other"
    return best_topic
|
109 |
|
110 |
|
111 |
+
|
112 |
+
|
113 |
def transcribe_audio(audio):
|
114 |
"""Convert audio to text, translate it, and classify topics in both Darija and English."""
|
115 |
try:
|
|
|
131 |
english_topic = classify_topic(translation, bert_tokenizer, bert_model, english_topic_labels)
|
132 |
|
133 |
# Classify topics using keywords-based classification
|
134 |
+
darija_keyword_topic = classify_topic_by_keywords(transcription,language='ar' )
|
135 |
+
english_keyword_topic = classify_topic_by_keywords(transcription,language='en' )
|
136 |
+
#english_keyword_topic = classify_topic_by_keywords(translation )
|
137 |
|
138 |
return transcription, translation, darija_topic, english_topic, darija_keyword_topic, english_keyword_topic
|
139 |
|