Spaces:
Runtime error
Runtime error
..
Browse files
app.py
CHANGED
|
@@ -63,33 +63,53 @@ english_topic_labels = [
|
|
| 63 |
]
|
| 64 |
|
| 65 |
# New Function to Classify Topics by Keywords
|
| 66 |
-
def classify_topic_by_keywords(text,
|
| 67 |
-
#
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 78 |
text = text.lower()
|
| 79 |
-
|
| 80 |
-
#
|
| 81 |
-
topic_scores = {
|
| 82 |
|
| 83 |
for topic, words in keywords.items():
|
| 84 |
for word in words:
|
| 85 |
if word in text:
|
| 86 |
-
topic_scores[topic] += 1 #
|
| 87 |
-
|
| 88 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
| 89 |
best_topic = max(topic_scores, key=topic_scores.get)
|
| 90 |
return best_topic
|
| 91 |
|
| 92 |
|
|
|
|
|
|
|
| 93 |
def transcribe_audio(audio):
|
| 94 |
"""Convert audio to text, translate it, and classify topics in both Darija and English."""
|
| 95 |
try:
|
|
@@ -111,8 +131,9 @@ def transcribe_audio(audio):
|
|
| 111 |
english_topic = classify_topic(translation, bert_tokenizer, bert_model, english_topic_labels)
|
| 112 |
|
| 113 |
# Classify topics using keywords-based classification
|
| 114 |
-
darija_keyword_topic = classify_topic_by_keywords(transcription,
|
| 115 |
-
english_keyword_topic = classify_topic_by_keywords(
|
|
|
|
| 116 |
|
| 117 |
return transcription, translation, darija_topic, english_topic, darija_keyword_topic, english_keyword_topic
|
| 118 |
|
|
|
|
| 63 |
]
|
| 64 |
|
| 65 |
# New Function to Classify Topics by Keywords
|
| 66 |
+
def classify_topic_by_keywords(text, language='ar'):
    """Classify *text* into a call-centre topic by counting keyword hits.

    Each topic has a list of keywords for the selected language. A topic
    scores one point per distinct keyword found in the text, and the topic
    with the highest score wins. Ties go to the topic listed first.

    Args:
        text: The text to classify. Falsy input (``None`` or ``""``) is
            classified as ``"Other"``.
        language: ``'ar'`` to use the Arabic keyword lists (default) or
            ``'en'`` to use the English ones.

    Returns:
        One of ``"Customer Service"``, ``"Retention Service"``,
        ``"Billing Issue"`` or ``"Other"``. ``"Other"`` is returned when no
        keyword is found at all.

    Raises:
        ValueError: If *language* is neither ``'ar'`` nor ``'en'``.
    """
    # Imported locally so the function does not rely on the file's
    # top-level import block.
    import re

    # Arabic keywords for each topic.
    # NOTE: "عرض" is a substring of "العرض", so text containing "العرض"
    # scores two points for "Retention Service".
    arabic_keywords = {
        "Customer Service": ["خدمة", "استفسار", "مساعدة", "دعم", "سؤال"],
        "Retention Service": ["احتفاظ", "تجديد", "خصم", "عرض", "العرض"],
        "Billing Issue": ["فاتورة", "دفع", "مشكلة", "خطأ", "مبلغ"],
        "Other": ["شيء آخر", "غير ذلك", "أخرى"],
    }

    # English keywords for each topic.
    english_keywords = {
        "Customer Service": ["service", "inquiry", "help", "support", "question", "assistance"],
        "Retention Service": ["retain", "cut", "discount", "offer", "promotion", "stop"],
        "Billing Issue": ["bill", "payment", "problem", "error", "amount"],
        "Other": ["other", "none of the above", "something else"],
    }

    # Select the appropriate keywords based on the language.
    if language == 'ar':
        keywords = arabic_keywords
    elif language == 'en':
        keywords = english_keywords
    else:
        raise ValueError("Invalid language specified. Use 'ar' for Arabic or 'en' for English.")

    # Nothing to scan: avoid calling .lower() on None and skip the scoring.
    if not text:
        return "Other"

    # Lowercase so English matching is case-insensitive (no-op for Arabic).
    text = text.lower()

    def _mentions(word):
        """Return True when *word* occurs in the (lowercased) text."""
        if language == 'en':
            # Anchor at the start of a word so "cut" does not fire inside
            # "execute" nor "other" inside "another", while inflected forms
            # such as "billing" or "payments" still match.
            return re.search(r"\b" + re.escape(word), text) is not None
        # Arabic words commonly carry attached prefixes, so plain substring
        # matching is kept for that language.
        return word in text

    # One point per distinct keyword; set() guards against a keyword that is
    # accidentally listed twice being counted twice.
    topic_scores = {
        topic: sum(1 for word in set(words) if _mentions(word))
        for topic, words in keywords.items()
    }

    # No keyword found anywhere: fall back to "Other".
    if not any(topic_scores.values()):
        return "Other"

    # Highest score wins; max() keeps the first topic on ties.
    return max(topic_scores, key=topic_scores.get)
|
| 109 |
|
| 110 |
|
| 111 |
+
|
| 112 |
+
|
| 113 |
def transcribe_audio(audio):
|
| 114 |
"""Convert audio to text, translate it, and classify topics in both Darija and English."""
|
| 115 |
try:
|
|
|
|
| 131 |
english_topic = classify_topic(translation, bert_tokenizer, bert_model, english_topic_labels)
|
| 132 |
|
| 133 |
# Classify topics using keywords-based classification
|
| 134 |
+
darija_keyword_topic = classify_topic_by_keywords(transcription,language='ar' )
|
| 135 |
+
english_keyword_topic = classify_topic_by_keywords(transcription,language='en' )
|
| 136 |
+
#english_keyword_topic = classify_topic_by_keywords(translation )
|
| 137 |
|
| 138 |
return transcription, translation, darija_topic, english_topic, darija_keyword_topic, english_keyword_topic
|
| 139 |
|