Update main.py
main.py CHANGED

@@ -1,153 +1,69 @@
 import os
-import torch
-import librosa
-import numpy as np
-import tempfile
-from fastapi import FastAPI, UploadFile, File, HTTPException
-from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
-from librosa.sequence import dtw
-from contextlib import asynccontextmanager
-
-class QuranRecitationComparer:
-    def __init__(self, model_name, auth_token=None):
-        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        if auth_token:
-            self.processor = Wav2Vec2Processor.from_pretrained(model_name, token=auth_token)
-            self.model = Wav2Vec2ForCTC.from_pretrained(model_name, token=auth_token)
-        else:
-            self.processor = Wav2Vec2Processor.from_pretrained(model_name)
-            self.model = Wav2Vec2ForCTC.from_pretrained(model_name)
-        self.model = self.model.to(self.device)
-        self.model.eval()
-        self.embedding_cache = {}
-
-    def load_audio(self, file_path, target_sr=16000, trim_silence=True, normalize=True):
-        if not os.path.exists(file_path):
-            raise FileNotFoundError(f"Audio file not found: {file_path}")
-        y, sr = librosa.load(file_path, sr=target_sr)
-        if normalize:
-            y = librosa.util.normalize(y)
-        if trim_silence:
-            y, _ = librosa.effects.trim(y, top_db=30)
-        return y
-
-    def get_deep_embedding(self, audio, sr=16000):
-        input_values = self.processor(
-            audio,
-            sampling_rate=sr,
-            return_tensors="pt"
-        ).input_values.to(self.device)
-        with torch.no_grad():
-            outputs = self.model(input_values, output_hidden_states=True)
-        hidden_states = outputs.hidden_states[-1]
-        embedding_seq = hidden_states.squeeze(0).cpu().numpy()
-        return embedding_seq
-
-    def compute_dtw_distance(self, features1, features2):
-        D, wp = dtw(X=features1, Y=features2, metric='euclidean')
-        distance = D[-1, -1]
-        normalized_distance = distance / len(wp)
-        return normalized_distance
-
-    def interpret_similarity(self, norm_distance):
-        if norm_distance == 0:
-            result = "The recitations are identical based on the deep embeddings."
-            score = 100
-        elif norm_distance < 1:
-            result = "The recitations are extremely similar."
-            score = 95
-        elif norm_distance < 5:
-            result = "The recitations are very similar with minor differences."
-            score = 80
-        elif norm_distance < 10:
-            result = "The recitations show moderate similarity."
-            score = 60
-        elif norm_distance < 20:
-            result = "The recitations show some noticeable differences."
-            score = 40
-        else:
-            result = "The recitations are quite different."
-            score = max(0, 100 - norm_distance)
-        return result, score
-
-    def get_embedding_for_file(self, file_path):
-        if file_path in self.embedding_cache:
-            return self.embedding_cache[file_path]
-        audio = self.load_audio(file_path)
-        embedding = self.get_deep_embedding(audio)
-        self.embedding_cache[file_path] = embedding
-        return embedding
-
-    def predict(self, file_path1, file_path2):
-        embedding1 = self.get_embedding_for_file(file_path1)
-        embedding2 = self.get_embedding_for_file(file_path2)
-        norm_distance = self.compute_dtw_distance(embedding1.T, embedding2.T)
-        interpretation, similarity_score = self.interpret_similarity(norm_distance)
-        print(f"Similarity Score: {similarity_score:.1f}/100")
-        print(f"Interpretation: {interpretation}")
-        return similarity_score, interpretation
-
-    def clear_cache(self):
-        self.embedding_cache = {}
-
-# --- Lifespan Event Handler ---
-@asynccontextmanager
-async def lifespan(app: FastAPI):
-    global comparer
-    auth_token = os.environ.get("HF_TOKEN")
-    comparer = QuranRecitationComparer(
-        model_name="jonatasgrosman/wav2vec2-large-xlsr-53-arabic",
-        auth_token=auth_token
-    )
-    print("Model initialized and ready for predictions!")
-    yield
-    print("Application shutdown: Cleanup if necessary.")
-
-app = FastAPI(
-    title="Quran Recitation Comparer API",
-    description="Compares two Quran recitations using a deep wav2vec2 model.",
-    version="1.0",
-    lifespan=lifespan
-)
-
-# --- API Endpoints ---
-@app.get("/", summary="Health Check")
 async def root():
-    return {
-        ...
-    }
-
-    try:
-        ...
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=str(e))
-    finally:
-        if tmp1_path and os.path.exists(tmp1_path):
-            os.remove(tmp1_path)
-        if tmp2_path and os.path.exists(tmp2_path):
-            os.remove(tmp2_path)
-
-    ...
-    comparer.clear_cache()
-    return {"message": "Cache cleared."}
 import os
+from fastapi import FastAPI, UploadFile, File
+from google import genai
+from google.genai import types
+import uvicorn
 
+app = FastAPI()
 
+# Retrieve the GenAI API key from the environment variable.
+api_key = os.getenv("GENAI_API_KEY")
+if not api_key:
+    raise EnvironmentError("GENAI_API_KEY environment variable not set")
 
+# Initialize the GenAI client.
+client = genai.Client(api_key=api_key)
 
+@app.get("/")
 async def root():
+    return {
+        "message": "Welcome to the Audio Similarity API!",
+        "usage": {
+            "endpoint": "/compare-audio",
+            "description": "POST two audio files (user recitation and professional qari) for similarity analysis.",
+            "instructions": "Send audio files as form-data with keys 'audio1' and 'audio2'."
+        }
+    }
+
+@app.post("/compare-audio")
+async def compare_audio(
+    audio1: UploadFile = File(...),
+    audio2: UploadFile = File(...)
+):
+    # Read the uploaded audio files.
+    audio1_bytes = await audio1.read()
+    audio2_bytes = await audio2.read()
+
+    # Create a refined prompt that clearly identifies the audio sources.
+    prompt = (
+        """Please analyze and compare the two provided audio clips.
+The first audio is the user's recitation, and the second audio is the professional qari recitation.
+Evaluate their similarity on a scale from 0 to 1, where:
+- 1 indicates the user's recitation contains no mistakes compared to the professional version,
+- 0 indicates there are significant mistakes.
+Provide your response with:
+1. A numerical similarity score on the first line.
+2. A single sentence that indicates whether the user's recitation is similar, moderately similar, or dissimilar to the professional qari."""
+    )
 
+    # Generate the content using the Gemini model with the two audio inputs.
+    response = client.models.generate_content(
+        model='gemini-2.0-flash',
+        contents=[
+            prompt,
+            types.Part.from_bytes(
+                data=audio1_bytes,
+                mime_type=audio1.content_type,
+            ),
+            types.Part.from_bytes(
+                data=audio2_bytes,
+                mime_type=audio2.content_type,
+            )
+        ]
+    )
 
+    # Return the model's response.
+    return {"result": response.text}
 
+if __name__ == "__main__":
+    uvicorn.run(app, host="0.0.0.0", port=8000)
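For reference, the removed implementation scored similarity by aligning two wav2vec2 hidden-state sequences with dynamic time warping and normalizing the accumulated cost by the warping-path length. A minimal standalone sketch of that distance computation, assuming feature matrices shaped (n_features, n_frames) as librosa.sequence.dtw expects:

import numpy as np
from librosa.sequence import dtw

def normalized_dtw_distance(features1: np.ndarray, features2: np.ndarray) -> float:
    # features1/features2: (n_features, n_frames) matrices, e.g. the transposed
    # wav2vec2 hidden states that the removed predict() passed in.
    D, wp = dtw(X=features1, Y=features2, metric="euclidean")
    # D[-1, -1] is the accumulated alignment cost; dividing by the warping-path
    # length keeps scores comparable across clips of different durations.
    return float(D[-1, -1] / len(wp))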
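One robustness gap in the new endpoint: UploadFile.content_type can be empty when a client omits the Content-Type of a multipart field, and types.Part.from_bytes needs a concrete MIME type. A hedged hardening sketch (the "audio/wav" default is an assumption about typical uploads):

# Hypothetical fallback inside compare_audio: default the MIME type when the
# client did not send one with the upload, then pass mime_type=mime1/mime2
# to types.Part.from_bytes instead of the raw content_type attributes.
mime1 = audio1.content_type or "audio/wav"
mime2 = audio2.content_type or "audio/wav"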
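The prompt asks Gemini to put the numeric score on the first line and a one-sentence verdict after it. A hypothetical helper for splitting the returned text, guarded in case the model strays from that format:

def parse_comparison(text: str):
    # Assumes the "score first, sentence second" layout requested by the prompt.
    lines = [line.strip() for line in text.strip().splitlines() if line.strip()]
    try:
        score = float(lines[0])
    except (IndexError, ValueError):
        score = None  # the model deviated from the requested format
    verdict = " ".join(lines[1:]) if len(lines) > 1 else ""
    return score, verdict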
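A possible client call against a local run of the app (host, port, and file names are illustrative; the form keys must be 'audio1' and 'audio2'):

import requests

# Post two audio files as multipart form-data to the comparison endpoint.
with open("user_recitation.wav", "rb") as user, open("qari_reference.wav", "rb") as ref:
    response = requests.post(
        "http://localhost:8000/compare-audio",
        files={
            "audio1": ("user_recitation.wav", user, "audio/wav"),
            "audio2": ("qari_reference.wav", ref, "audio/wav"),
        },
    )
print(response.json()["result"])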