Spaces:
Sleeping
Sleeping
Upload 3 files
Browse files- logic.py +10 -9
- scoring.py +34 -60
logic.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
from phonemizer.separator import Separator
|
| 2 |
from phonemizer import phonemize
|
| 3 |
# from phonemizer.backend.espeak.wrapper import EspeakWrapper
|
| 4 |
-
from Levenshtein import distance as levenshtein_distance
|
| 5 |
from scoring import calculate_fluency_and_pronunciation
|
| 6 |
|
| 7 |
import whisper
|
|
@@ -28,13 +28,14 @@ def rate_pronunciation(expected_phonemes, actual_phonemes):
|
|
| 28 |
results = []
|
| 29 |
for i, base_word in enumerate(actual_phonemes):
|
| 30 |
best_dist = float('inf')
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
|
|
|
| 38 |
if best_dist == 0:
|
| 39 |
results.append(3)
|
| 40 |
elif best_dist <= error_threshold:
|
|
@@ -52,7 +53,7 @@ def Speaker_speech_analysis(audio_path, text):
|
|
| 52 |
transcribtion = text2phoneme(pre_transcribtion)
|
| 53 |
text_phone = text2phoneme(text)
|
| 54 |
scores = rate_pronunciation(transcribtion, text_phone)
|
| 55 |
-
FP_scores = calculate_fluency_and_pronunciation(audio_path, pre_transcribtion, scores, len(text.split()))
|
| 56 |
word_scores = [(word, s) for word, s in zip(text.split(), scores)]
|
| 57 |
|
| 58 |
FP_scores['word_scores'] = word_scores
|
|
|
|
| 1 |
from phonemizer.separator import Separator
|
| 2 |
from phonemizer import phonemize
|
| 3 |
# from phonemizer.backend.espeak.wrapper import EspeakWrapper
|
| 4 |
+
from Levenshtein import distance as levenshtein_distance
|
| 5 |
from scoring import calculate_fluency_and_pronunciation
|
| 6 |
|
| 7 |
import whisper
|
|
|
|
| 28 |
results = []
|
| 29 |
for i, base_word in enumerate(actual_phonemes):
|
| 30 |
best_dist = float('inf')
|
| 31 |
+
if i <= len(expected_phonemes):
|
| 32 |
+
for j in range(max(0, i-1), i + min(3, len(expected_phonemes) - i)):
|
| 33 |
+
dist = levenshtein_distance(expected_phonemes[j], base_word,)
|
| 34 |
+
if dist < best_dist:
|
| 35 |
+
best_dist = dist
|
| 36 |
+
if best_dist == 0: # Early stopping on perfect match
|
| 37 |
+
break
|
| 38 |
+
error_threshold = len(base_word) * 0.40
|
| 39 |
if best_dist == 0:
|
| 40 |
results.append(3)
|
| 41 |
elif best_dist <= error_threshold:
|
|
|
|
| 53 |
transcribtion = text2phoneme(pre_transcribtion)
|
| 54 |
text_phone = text2phoneme(text)
|
| 55 |
scores = rate_pronunciation(transcribtion, text_phone)
|
| 56 |
+
FP_scores = calculate_fluency_and_pronunciation(audio_path, len(pre_transcribtion.split()), scores, len(text.split()))
|
| 57 |
word_scores = [(word, s) for word, s in zip(text.split(), scores)]
|
| 58 |
|
| 59 |
FP_scores['word_scores'] = word_scores
|
scoring.py
CHANGED
|
@@ -2,26 +2,19 @@ import numpy as np
|
|
| 2 |
import librosa
|
| 3 |
|
| 4 |
def calculate_expected_value(scores):
|
| 5 |
-
"""
|
| 6 |
-
Calculate the expected value for a list of outcomes (scores), assuming each unique score
|
| 7 |
-
occurs with a frequency proportional to its count in the list.
|
| 8 |
-
|
| 9 |
-
:param scores: List of outcomes (numeric values).
|
| 10 |
-
:return: The expected value (a weighted average of all possible outcomes).
|
| 11 |
-
"""
|
| 12 |
# First calculate the probability of each unique score
|
| 13 |
unique_scores, counts = np.unique(scores, return_counts=True)
|
| 14 |
probabilities = counts / len(scores)
|
| 15 |
|
| 16 |
-
# Then calculate the expected value as the sum of scores times their probabilities
|
| 17 |
expected_value = np.dot(unique_scores, probabilities)
|
| 18 |
return expected_value
|
| 19 |
|
| 20 |
|
| 21 |
-
def calculate_fluency_score(audio_path,
|
| 22 |
-
|
| 23 |
avg_pronunciation_score = calculate_expected_value(word_pronunciation_scores)
|
| 24 |
-
if (total_words / base_script_len) < 0.15 or avg_pronunciation_score < 1.
|
| 25 |
return 10
|
| 26 |
audio, sr = librosa.load(audio_path)
|
| 27 |
non_silent_intervals = librosa.effects.split(audio, top_db=22)
|
|
@@ -29,28 +22,25 @@ def calculate_fluency_score(audio_path, transcription, word_pronunciation_scores
|
|
| 29 |
|
| 30 |
total_duration = len(audio) / sr
|
| 31 |
|
| 32 |
-
|
| 33 |
-
non_silent_duration = non_silent_duration if total_words > 4 else 0
|
| 34 |
ideal_min_rate, ideal_max_rate = 120 / 60, 140 / 60
|
| 35 |
actual_speech_rate = (total_words / (non_silent_duration + 1e-7)) * (total_words / base_script_len)
|
| 36 |
speaking_ratio = non_silent_duration / total_duration
|
| 37 |
# Existing speech rate score calculation
|
| 38 |
|
| 39 |
# Determine if speech rate is within the ideal range
|
| 40 |
-
if
|
| 41 |
-
# Within the ideal range
|
| 42 |
-
|
|
|
|
|
|
|
|
|
|
| 43 |
else:
|
| 44 |
-
#
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
# Too fast
|
| 50 |
-
speech_rate_score = 2 - (actual_speech_rate / ideal_max_rate)
|
| 51 |
-
# Clamp the score between 0 and 1
|
| 52 |
-
speech_rate_score = max(0, min(speech_rate_score, 1))
|
| 53 |
-
|
| 54 |
# If speaking ratio is significantly less than the gold standard, reduce the fluency score
|
| 55 |
gold_standard_ratio = 0.9 # Assuming 90% speaking time is gold standard for natural speech
|
| 56 |
speaking_ratio_score = min(speaking_ratio / gold_standard_ratio, 1)
|
|
@@ -58,61 +48,45 @@ def calculate_fluency_score(audio_path, transcription, word_pronunciation_scores
|
|
| 58 |
|
| 59 |
# Pronunciation score calculation
|
| 60 |
avg_pronunciation_score = (avg_pronunciation_score - 1) / 2
|
| 61 |
-
|
|
|
|
| 62 |
|
| 63 |
# Weighted combination of scores
|
| 64 |
# Adjust weights as needed
|
| 65 |
-
weight_speech_rate = 0.
|
| 66 |
weight_speaking_ratio = 0.20
|
| 67 |
weight_pronunciation = 0.50
|
| 68 |
-
weight_pronunciation_variance = 0.10
|
| 69 |
|
| 70 |
-
combined_score =
|
| 71 |
-
speaking_ratio_score * weight_speaking_ratio +
|
| 72 |
-
avg_pronunciation_score * weight_pronunciation +
|
| 73 |
-
(1 / (1 + pronunciation_variance)) * weight_pronunciation_variance)
|
| 74 |
|
| 75 |
# Scale the combined score to be between 10% and 100%
|
| 76 |
scaled_fluency_score = 10 + combined_score * 80
|
| 77 |
|
| 78 |
return scaled_fluency_score
|
| 79 |
|
| 80 |
-
def calculate_pronunciation_accuracy(word_pronunciation_scores, fluency_score, base_script_len):
|
| 81 |
-
if
|
| 82 |
-
|
| 83 |
# Calculate average word pronunciation score
|
| 84 |
avg_pronunciation_score = calculate_expected_value(word_pronunciation_scores)
|
| 85 |
-
print(avg_pronunciation_score)
|
| 86 |
-
# Adjust pronunciation score based on fluency
|
| 87 |
-
# fluency_score = fluency_score / 100
|
| 88 |
-
# This is a simplistic adjustment. It can be refined based on more detailed analysis
|
| 89 |
-
fluency_adjustment = fluency_score / 100
|
| 90 |
-
print(fluency_adjustment)
|
| 91 |
-
adjusted_pronunciation_score = avg_pronunciation_score * fluency_adjustment
|
| 92 |
-
print(adjusted_pronunciation_score)
|
| 93 |
-
# Map to 0-5 scale based on score guide
|
| 94 |
-
# These thresholds can be adjusted based on empirical data or further analysis
|
| 95 |
-
if adjusted_pronunciation_score >= 2.4:
|
| 96 |
-
score_guide_level = 5
|
| 97 |
-
elif adjusted_pronunciation_score >= 1.7:
|
| 98 |
-
score_guide_level = 4
|
| 99 |
-
elif adjusted_pronunciation_score >= 1.0:
|
| 100 |
-
score_guide_level = 3
|
| 101 |
-
elif adjusted_pronunciation_score >= 0.5:
|
| 102 |
-
score_guide_level = 2
|
| 103 |
-
else:
|
| 104 |
-
score_guide_level = 1
|
| 105 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 106 |
# Scale to 10% - 90%
|
| 107 |
-
final_score = 10 +
|
| 108 |
|
| 109 |
return final_score
|
| 110 |
|
| 111 |
-
def calculate_fluency_and_pronunciation(audio_path,
|
| 112 |
|
| 113 |
-
fluency_score = calculate_fluency_score(audio_path,
|
| 114 |
|
| 115 |
-
pronunciation_accuracy = calculate_pronunciation_accuracy(word_pronunciation_scores, fluency_score, base_script_len)
|
| 116 |
|
| 117 |
return {'fluency_score': fluency_score, 'pronunciation_accuracy': pronunciation_accuracy}
|
| 118 |
|
|
|
|
| 2 |
import librosa
|
| 3 |
|
| 4 |
def calculate_expected_value(scores):
    """Return the expected value (probability-weighted mean) of *scores*.

    Each unique score is weighted by its relative frequency in the list;
    for a plain list of outcomes this equals the arithmetic mean.

    :param scores: non-empty sequence of numeric outcomes.
    :return: the expected value (numpy scalar).
    """
    # First calculate the probability of each unique score:
    # count of each unique value divided by the total number of scores.
    unique_scores, counts = np.unique(scores, return_counts=True)
    probabilities = counts / len(scores)

    # Then calculate the expected value as the sum of scores times their probabilities
    expected_value = np.dot(unique_scores, probabilities)
    return expected_value
| 12 |
|
| 13 |
|
| 14 |
+
def calculate_fluency_score(audio_path, total_words, word_pronunciation_scores, base_script_len):
|
| 15 |
+
|
| 16 |
avg_pronunciation_score = calculate_expected_value(word_pronunciation_scores)
|
| 17 |
+
if (total_words / base_script_len) < 0.15 or avg_pronunciation_score < 1.5:
|
| 18 |
return 10
|
| 19 |
audio, sr = librosa.load(audio_path)
|
| 20 |
non_silent_intervals = librosa.effects.split(audio, top_db=22)
|
|
|
|
| 22 |
|
| 23 |
total_duration = len(audio) / sr
|
| 24 |
|
| 25 |
+
non_silent_duration = non_silent_duration
|
|
|
|
| 26 |
ideal_min_rate, ideal_max_rate = 120 / 60, 140 / 60
|
| 27 |
actual_speech_rate = (total_words / (non_silent_duration + 1e-7)) * (total_words / base_script_len)
|
| 28 |
speaking_ratio = non_silent_duration / total_duration
|
| 29 |
# Existing speech rate score calculation
|
| 30 |
|
| 31 |
# Determine if speech rate is within the ideal range
|
| 32 |
+
if actual_speech_rate <= ideal_max_rate:
|
| 33 |
+
# Within the ideal range or speaking slow
|
| 34 |
+
max_ratio = actual_speech_rate / ideal_max_rate
|
| 35 |
+
min_ratio = (actual_speech_rate / ideal_min_rate)
|
| 36 |
+
speech_rate_score = np.mean([max_ratio, min_ratio]) - 0.167
|
| 37 |
+
# for normal speaking speech_rate_score between (0.708, 1) and for slow speaking speech_rate_score (0.707, 0)
|
| 38 |
else:
|
| 39 |
+
# Too fast
|
| 40 |
+
# for fast speaking speech_rate_score (0.707, 0)
|
| 41 |
+
max_ratio = actual_speech_rate / ideal_max_rate
|
| 42 |
+
speech_rate_score = 0.7 / max_ratio
|
| 43 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
# If speaking ratio is significantly less than the gold standard, reduce the fluency score
|
| 45 |
gold_standard_ratio = 0.9 # Assuming 90% speaking time is gold standard for natural speech
|
| 46 |
speaking_ratio_score = min(speaking_ratio / gold_standard_ratio, 1)
|
|
|
|
| 48 |
|
| 49 |
# Pronunciation score calculation
|
| 50 |
avg_pronunciation_score = (avg_pronunciation_score - 1) / 2
|
| 51 |
+
|
| 52 |
+
# pronunciation_variance = np.var(word_pronunciation_scores, ddof=1,)
|
| 53 |
|
| 54 |
# Weighted combination of scores
|
| 55 |
# Adjust weights as needed
|
| 56 |
+
weight_speech_rate = 0.30
|
| 57 |
weight_speaking_ratio = 0.20
|
| 58 |
weight_pronunciation = 0.50
|
| 59 |
+
# weight_pronunciation_variance = 0.10
|
| 60 |
|
| 61 |
+
combined_score = speech_rate_score * weight_speech_rate + speaking_ratio_score * weight_speaking_ratio + avg_pronunciation_score * weight_pronunciation
|
|
|
|
|
|
|
|
|
|
| 62 |
|
| 63 |
# Scale the combined score to be between 10% and 100%
|
| 64 |
scaled_fluency_score = 10 + combined_score * 80
|
| 65 |
|
| 66 |
return scaled_fluency_score
|
| 67 |
|
| 68 |
+
def calculate_pronunciation_accuracy(word_pronunciation_scores, fluency_score, base_script_len, total_words):
    """Blend per-word pronunciation scores with fluency into a 10-100 score.

    :param word_pronunciation_scores: per-word scores (1-3 scale, as produced
        by ``rate_pronunciation``).
    :param fluency_score: fluency score on a 0-100 scale.
    :param base_script_len: word count of the reference script (currently unused).
    :param total_words: number of words actually spoken (currently unused).
    :return: pronunciation accuracy scaled into the 10%-90% band.
    """
    # if total_words / base_script_len < 0.25:
    #     return 10
    # Expected (average) word pronunciation score across the utterance.
    mean_word_score = calculate_expected_value(word_pronunciation_scores)

    # Normalise both signals to a 0-1 range before mixing.
    fluency_fraction = fluency_score / 100
    normalized_pronunciation = (mean_word_score - 1) / 2

    # 80/20 weighting of pronunciation versus fluency.
    blended = 0.8 * normalized_pronunciation + 0.2 * fluency_fraction

    # Scale to 10% - 90%
    final_score = 10 + blended * 90

    return final_score
|
| 84 |
|
| 85 |
+
def calculate_fluency_and_pronunciation(audio_path, total_words, word_pronunciation_scores, base_script_len):
    """Run the fluency and pronunciation analyses and bundle both results.

    :param audio_path: path to the speaker's audio file.
    :param total_words: number of words actually transcribed.
    :param word_pronunciation_scores: per-word pronunciation scores.
    :param base_script_len: word count of the reference script.
    :return: dict with keys ``'fluency_score'`` and ``'pronunciation_accuracy'``.
    """
    fluency = calculate_fluency_score(audio_path, total_words, word_pronunciation_scores, base_script_len)

    # Pronunciation accuracy is adjusted by the fluency result above.
    accuracy = calculate_pronunciation_accuracy(word_pronunciation_scores, fluency, base_script_len, total_words)

    return {
        'fluency_score': fluency,
        'pronunciation_accuracy': accuracy,
    }
|
| 92 |
|