Mohssinibra committed on
Commit
b6cc6ac
·
verified ·
1 Parent(s): 19bcdca
Files changed (1) hide show
  1. app.py +41 -20
app.py CHANGED
@@ -63,33 +63,53 @@ english_topic_labels = [
63
  ]
64
 
65
  # New Function to Classify Topics by Keywords
66
def classify_topic_by_keywords(text, topic_labels):
    """Return the label from *topic_labels* whose keywords occur most in *text*.

    Matching is by substring on the lower-cased text, so a keyword also
    matches longer words that contain it, and a multi-word keyword such as
    "إلغاء الاشتراك" scores in addition to the shorter "إلغاء" it contains.

    Args:
        text: The text to classify.
        topic_labels: Candidate labels. Only labels that also appear as a key
            of the keyword table below can score above zero.

    Returns:
        The label with the highest keyword count. On a tie (including the
        case where nothing matches) the earliest label in *topic_labels* wins.

    Raises:
        ValueError: If *topic_labels* is empty (raised by ``max``).
    """
    # Keyword table for each topic. Each keyword is listed once so that a
    # single occurrence is not counted twice.
    keywords = {
        "خدمة العملاء": ["خدمة", "استفسار", "مساعدة", "دعم", "سؤال"],
        "خدمة الاحتفاظ": ["احتفاظ", "تجديد", "خصم", "عرض", "العرض"],
        "مشكلة في الفاتورة": ["فاتورة", "دفع", "مشكلة", "خطأ", "مبلغ"],
        "إلغاء الخدمة": ["إلغاء", "إيقاف", "إلغاء الاشتراك", "إلغاء الخدمة", "إيقاف الخدمة", "إلغاء العقد", "إيقاف الاشتراك", "فسخ", "إيقاف التجديد"],
    }

    # Lower-case the text to avoid case mismatches.
    text = text.lower()

    # Every requested label starts with a score of zero.
    topic_scores = {label: 0 for label in topic_labels}

    for topic, words in keywords.items():
        # Skip topics the caller did not ask about; indexing topic_scores
        # with them would raise KeyError.
        if topic not in topic_scores:
            continue
        topic_scores[topic] += sum(word in text for word in words)

    # Return the label with the highest score.
    best_topic = max(topic_scores, key=topic_scores.get)
    return best_topic
91
 
92
 
 
 
93
  def transcribe_audio(audio):
94
  """Convert audio to text, translate it, and classify topics in both Darija and English."""
95
  try:
@@ -111,8 +131,9 @@ def transcribe_audio(audio):
111
  english_topic = classify_topic(translation, bert_tokenizer, bert_model, english_topic_labels)
112
 
113
  # Classify topics using keywords-based classification
114
- darija_keyword_topic = classify_topic_by_keywords(transcription, darija_topic_labels)
115
- english_keyword_topic = classify_topic_by_keywords(translation, english_topic_labels)
 
116
 
117
  return transcription, translation, darija_topic, english_topic, darija_keyword_topic, english_keyword_topic
118
 
 
63
  ]
64
 
65
  # New Function to Classify Topics by Keywords
66
# Keyword lists per topic, keyed by language code. Matching is by substring,
# so a keyword also matches longer words that contain it (for example "عرض"
# also matches inside "العرض", and "bill" inside "billing"). Each keyword is
# listed once so that a single occurrence is not counted twice.
_TOPIC_KEYWORDS = {
    "ar": {
        "Customer Service": ["خدمة", "استفسار", "مساعدة", "دعم", "سؤال"],
        "Retention Service": ["احتفاظ", "تجديد", "خصم", "عرض", "العرض"],
        "Billing Issue": ["فاتورة", "دفع", "مشكلة", "خطأ", "مبلغ"],
        "Other": ["شيء آخر", "غير ذلك", "أخرى"],
    },
    "en": {
        "Customer Service": ["service", "inquiry", "help", "support", "question", "assistance"],
        "Retention Service": ["retain", "cut", "discount", "offer", "promotion", "stop"],
        "Billing Issue": ["bill", "payment", "problem", "error", "amount"],
        "Other": ["other", "none of the above", "something else"],
    },
}


def classify_topic_by_keywords(text, language='ar'):
    """Classify *text* into a topic by counting keyword occurrences.

    Args:
        text: The text to classify. ``None`` or an empty string is treated as
            containing no keywords.
        language: ``'ar'`` to use the Arabic keyword lists, ``'en'`` to use
            the English ones.

    Returns:
        One of ``"Customer Service"``, ``"Retention Service"``,
        ``"Billing Issue"`` or ``"Other"``. ``"Other"`` is returned when no
        keyword is found. On a tie the topic listed first in the keyword
        table wins.

    Raises:
        ValueError: If *language* is neither ``'ar'`` nor ``'en'``.
    """
    # Select the keyword table for the language; validate before touching
    # the text so an invalid language is always reported.
    try:
        keywords = _TOPIC_KEYWORDS[language]
    except (KeyError, TypeError):
        raise ValueError("Invalid language specified. Use 'ar' for Arabic or 'en' for English.") from None

    # Nothing to scan: same result as a text with no keyword hits.
    if not text:
        return "Other"

    # Lower-case the text so English keywords match regardless of case.
    text = text.lower()

    # Score each topic by how many of its keywords occur in the text.
    topic_scores = {
        topic: sum(word in text for word in words)
        for topic, words in keywords.items()
    }

    # max() keeps the first topic on ties, following the table's order.
    best_topic = max(topic_scores, key=topic_scores.get)
    if topic_scores[best_topic] == 0:
        return "Other"
    return best_topic
109
 
110
 
111
+
112
+
113
  def transcribe_audio(audio):
114
  """Convert audio to text, translate it, and classify topics in both Darija and English."""
115
  try:
 
131
  english_topic = classify_topic(translation, bert_tokenizer, bert_model, english_topic_labels)
132
 
133
  # Classify topics using keywords-based classification
134
+ darija_keyword_topic = classify_topic_by_keywords(transcription,language='ar' )
135
+ english_keyword_topic = classify_topic_by_keywords(transcription,language='en' )
136
+ #english_keyword_topic = classify_topic_by_keywords(translation )
137
 
138
  return transcription, translation, darija_topic, english_topic, darija_keyword_topic, english_keyword_topic
139