Spaces:

Mohssinibra
/

STTDARIJAAPI

Running

App Files Files Community

Mohssinibra committed on Feb 9

Commit

b6cc6ac

verified ·

1 Parent(s): 19bcdca

..

Browse files

Files changed (1) hide show

app.py +41 -20

app.py CHANGED Viewed

@@ -63,33 +63,53 @@ english_topic_labels = [
 ]
 # New Function to Classify Topics by Keywords
def classify_topic_by_keywords(text, topic_labels):
    """Pick the label from *topic_labels* whose Arabic keywords occur most in *text*.

    Each distinct keyword that appears in *text* as a substring adds one
    point to its topic. Only topics that are present in *topic_labels* are
    scored; keyword topics that the caller did not ask for are ignored
    instead of raising ``KeyError``.

    Args:
        text: The text to classify. ``None`` is treated as an empty string.
        topic_labels: Iterable of candidate topic labels. Labels without a
            keyword list always score 0.

    Returns:
        The label with the highest score. On a tie (including the case where
        nothing matched) the label that comes first in *topic_labels* wins.

    Raises:
        ValueError: If *topic_labels* is empty.
    """
    # Keyword dictionary for each topic (keys are the Arabic topic labels).
    keywords = {
        "خدمة العملاء": ["خدمة", "استفسار", "مساعدة", "دعم", "سؤال"],
        "خدمة الاحتفاظ": ["احتفاظ", "تجديد", "خصم", "عرض", "العرض"],
        "مشكلة في الفاتورة": ["فاتورة", "دفع", "مشكلة", "خطأ", "مبلغ"],
        "إلغاء الخدمة": ["إلغاء", "إيقاف", "إلغاء الاشتراك", "إلغاء الخدمة", "إيقاف الخدمة", "إلغاء العقد", "إيقاف الاشتراك", "فسخ", "إيقاف التجديد"],
    }

    # Lower-case the text to avoid case inconsistencies; tolerate None.
    text = (text or "").lower()

    # Every requested label starts at zero so unmatched labels stay eligible.
    topic_scores = {label: 0 for label in topic_labels}

    for topic, words in keywords.items():
        # Skip keyword topics the caller did not request; incrementing them
        # used to raise KeyError.
        if topic not in topic_scores:
            continue
        # dict.fromkeys drops duplicate keywords so none is counted twice.
        topic_scores[topic] = sum(1 for word in dict.fromkeys(words) if word in text)

    # Return the topic with the highest score (first label wins ties).
    return max(topic_scores, key=topic_scores.get)
 def transcribe_audio(audio):
     """Convert audio to text, translate it, and classify topics in both Darija and English."""
     try:
@@ -111,8 +131,9 @@ def transcribe_audio(audio):
         english_topic = classify_topic(translation, bert_tokenizer, bert_model, english_topic_labels)
         # Classify topics using keywords-based classification
-        darija_keyword_topic = classify_topic_by_keywords(transcription, darija_topic_labels)
-        english_keyword_topic = classify_topic_by_keywords(translation, english_topic_labels)
         return transcription, translation, darija_topic, english_topic, darija_keyword_topic, english_keyword_topic

 ]
 # New Function to Classify Topics by Keywords
def classify_topic_by_keywords(text, language='ar'):
    """Classify *text* into a topic by counting keyword occurrences.

    Each distinct keyword found in *text* adds one point to its topic, and
    the topic with the most points is returned.

    Matching rules:
        * ``'ar'``: plain substring matching, so keywords are still found
          when a prefix is attached to the word (e.g. the definite article).
        * ``'en'``: a keyword must start at a word boundary, so "cut" is not
          found inside "execute" and "other" is not found inside "another",
          while inflected forms such as "billing" still match "bill".

    Args:
        text: The text to classify. ``None`` or an empty string yields
            ``"Other"``.
        language: ``'ar'`` to use the Arabic keyword lists, ``'en'`` to use
            the English ones. NOTE(review): the English lists can only match
            English text, so callers should presumably pass the translation
            rather than the raw transcription for ``'en'`` -- confirm at the
            call site.

    Returns:
        One of ``"Customer Service"``, ``"Retention Service"``,
        ``"Billing Issue"`` or ``"Other"``. ``"Other"`` is returned when no
        keyword matches. On a tie the topic defined first wins.

    Raises:
        ValueError: If *language* is neither ``'ar'`` nor ``'en'``.
    """
    # Local import: the file's top-level import block is not visible here.
    import re

    # Arabic keywords for each topic
    arabic_keywords = {
        "Customer Service": ["خدمة", "استفسار", "مساعدة", "دعم", "سؤال"],
        "Retention Service": ["احتفاظ", "تجديد", "خصم", "عرض", "العرض"],
        "Billing Issue": ["فاتورة", "دفع", "مشكلة", "خطأ", "مبلغ"],
        "Other": ["شيء آخر", "غير ذلك", "أخرى"],
    }
    # English keywords for each topic
    english_keywords = {
        "Customer Service": ["service", "inquiry", "help", "support", "question", "assistance"],
        "Retention Service": ["retain", "cut", "discount", "offer", "promotion", "stop"],
        "Billing Issue": ["bill", "payment", "problem", "error", "amount"],
        "Other": ["other", "none of the above", "something else"],
    }

    # Select the appropriate keywords based on the language
    if language == 'ar':
        keywords = arabic_keywords
    elif language == 'en':
        keywords = english_keywords
    else:
        raise ValueError("Invalid language specified. Use 'ar' for Arabic or 'en' for English.")

    # Nothing to scan: same outcome as "no keyword found".
    if not text:
        return "Other"

    # Convert text to lowercase to avoid inconsistencies
    text = text.lower()

    def _contains(keyword):
        """Return True when *keyword* occurs in the lower-cased text."""
        if language == 'en':
            # Anchor at the start of a word to avoid mid-word false positives.
            return re.search(r"\b" + re.escape(keyword), text) is not None
        return keyword in text

    # One point per distinct keyword found; dict.fromkeys removes duplicates
    # so a keyword listed twice cannot inflate its topic's score.
    topic_scores = {
        topic: sum(1 for word in dict.fromkeys(words) if _contains(word))
        for topic, words in keywords.items()
    }

    # max() returns the first topic with the highest score, which would be
    # "Customer Service" when nothing matched -- map that case to "Other".
    best_topic = max(topic_scores, key=topic_scores.get)
    if topic_scores[best_topic] == 0:
        return "Other"
    return best_topic
 def transcribe_audio(audio):
     """Convert audio to text, translate it, and classify topics in both Darija and English."""
     try:
         english_topic = classify_topic(translation, bert_tokenizer, bert_model, english_topic_labels)
         # Classify topics using keywords-based classification
+        darija_keyword_topic = classify_topic_by_keywords(transcription,language='ar' )
+        english_keyword_topic = classify_topic_by_keywords(transcription,language='en' )
+        #english_keyword_topic = classify_topic_by_keywords(translation )
         return transcription, translation, darija_topic, english_topic, darija_keyword_topic, english_keyword_topic