import re
from collections import Counter
from typing import Optional

import whisper
from pydub import AudioSegment


# English filler words and phrases counted by analyze_fillers.
_FILLER_WORDS = (
    "um", "uh", "hmm", "ah", "er", "eh",
    "umm", "uhh", "mmm", "ahh", "err",
    "like", "you know", "well", "so", "actually", "basically",
    "right", "okay", "sort of", "kind of",
)

# Compiled once at import time instead of on every call. The look-arounds
# keep "so" from matching inside "some" or "also". Words of a multi-word
# filler may be separated by any run of whitespace (double space, newline),
# not just a single literal space.
_FILLER_PATTERN = re.compile(
    r"(?<!\w)("
    + "|".join(
        r"\s+".join(map(re.escape, phrase.split())) for phrase in _FILLER_WORDS
    )
    + r")(?!\w)",
    re.IGNORECASE,
)

# (exclusive upper bound in fillers per minute, score), checked in order.
_SCORE_BANDS = ((1, 90), (3, 80), (5, 60), (10, 40))


def _score_for_rate(total_fillers: int, fillers_per_min: float) -> int:
    """Return the 20-100 score for a filler count and per-minute rate."""
    if total_fillers == 0:
        return 100
    for upper_bound, score in _SCORE_BANDS:
        if fillers_per_min < upper_bound:
            return score
    return 20


def _build_insight(filler_counts: Counter) -> str:
    """Return a one-sentence summary naming up to two most frequent fillers."""
    total = sum(filler_counts.values())
    if total == 0:
        return "Excellent! No filler words detected."
    # Ties keep first-seen order, so the summary is deterministic.
    top = [word for word, _ in filler_counts.most_common(2)]
    if total <= 2:
        return f"Minimal fillers ({total} total), mostly '{top[0]}'."
    examples = ", ".join(f"'{word}'" for word in top)
    if total <= 5:
        return f"Moderate fillers ({total} total), mainly {examples}."
    return f"Excessive fillers ({total} total), dominated by {examples}."


def analyze_fillers(
    file_path: str,
    model_size: str = "base",
    transcript: Optional[str] = None,
) -> dict:
    """Count English filler words in a recording and score their frequency.

    The audio file is always opened to measure its duration. It is only
    transcribed (with a Whisper model of ``model_size``) when no
    ``transcript`` is supplied.

    Args:
        file_path: Path to the audio file.
        model_size: Whisper model name passed to ``whisper.load_model``;
            ignored when ``transcript`` is given.
        transcript: Existing transcript text. ``None`` triggers
            transcription; an empty string is treated as "no speech".

    Returns:
        A dict with:
            ``filler_counts``: lower-cased filler -> occurrences.
            ``total_fillers``: sum of all occurrences.
            ``filler_score``: 100 for no fillers, otherwise 90/80/60/40/20
                as the per-minute rate crosses 1, 3, 5 and 10.
            ``filler_rate_per_min``: fillers per minute, rounded to one
                decimal (0 when the audio has zero duration).
            ``insight``: one-sentence human-readable summary.

    Raises:
        RuntimeError: If loading, transcription or analysis fails; the
            original exception is attached as ``__cause__``.
    """
    # The whole pipeline stays inside the try so callers only ever need to
    # handle RuntimeError, as before.
    try:
        # The audio is decoded purely to measure it; the length is divided
        # by 1000 to get seconds (pydub reports milliseconds).
        duration = len(AudioSegment.from_file(file_path)) / 1000

        if transcript is None:
            model = whisper.load_model(model_size)
            result = model.transcribe(file_path, word_timestamps=False, fp16=False)
            transcript = result["text"]

        # Lower-case and collapse inner whitespace so "You  know" and
        # "you know" are counted under the same key.
        filler_counts = Counter(
            " ".join(match.group(1).lower().split())
            for match in _FILLER_PATTERN.finditer(transcript)
        )
        total_fillers = sum(filler_counts.values())

        # Guard against zero-length audio to avoid dividing by zero.
        filler_per_min = (total_fillers / duration) * 60 if duration > 0 else 0

        return {
            "filler_counts": dict(filler_counts),
            "total_fillers": total_fillers,
            "filler_score": _score_for_rate(total_fillers, filler_per_min),
            "filler_rate_per_min": round(filler_per_min, 1),
            # Previously computed and then dropped; now returned.
            "insight": _build_insight(filler_counts),
        }
    except Exception as e:
        # Chain the cause so the underlying traceback is not lost.
        raise RuntimeError(f"Analysis failed: {str(e)}") from e