Update main.py
main.py CHANGED

@@ -1,153 +1,69 @@
 import os
-import torch
-import librosa
-import numpy as np
-import tempfile
-from fastapi import FastAPI, UploadFile, File, HTTPException
-from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
-from librosa.sequence import dtw
-from contextlib import asynccontextmanager
-
-class QuranRecitationComparer:
-    def __init__(self, model_name, auth_token=None):
-        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        if auth_token:
-            self.processor = Wav2Vec2Processor.from_pretrained(model_name, token=auth_token)
-            self.model = Wav2Vec2ForCTC.from_pretrained(model_name, token=auth_token)
-        else:
-            self.processor = Wav2Vec2Processor.from_pretrained(model_name)
-            self.model = Wav2Vec2ForCTC.from_pretrained(model_name)
-        self.model = self.model.to(self.device)
-        self.model.eval()
-        self.embedding_cache = {}
-
-    def load_audio(self, file_path, target_sr=16000, trim_silence=True, normalize=True):
-        if not os.path.exists(file_path):
-            raise FileNotFoundError(f"Audio file not found: {file_path}")
-        y, sr = librosa.load(file_path, sr=target_sr)
-        if normalize:
-            y = librosa.util.normalize(y)
-        if trim_silence:
-            y, _ = librosa.effects.trim(y, top_db=30)
-        return y
-
-    def get_deep_embedding(self, audio, sr=16000):
-        input_values = self.processor(
-            audio,
-            sampling_rate=sr,
-            return_tensors="pt"
-        ).input_values.to(self.device)
-        with torch.no_grad():
-            outputs = self.model(input_values, output_hidden_states=True)
-        hidden_states = outputs.hidden_states[-1]
-        embedding_seq = hidden_states.squeeze(0).cpu().numpy()
-        return embedding_seq
-
-    def compute_dtw_distance(self, features1, features2):
-        D, wp = dtw(X=features1, Y=features2, metric='euclidean')
-        distance = D[-1, -1]
-        normalized_distance = distance / len(wp)
-        return normalized_distance
-
-    def interpret_similarity(self, norm_distance):
-        if norm_distance == 0:
-            result = "The recitations are identical based on the deep embeddings."
-            score = 100
-        elif norm_distance < 1:
-            result = "The recitations are extremely similar."
-            score = 95
-        elif norm_distance < 5:
-            result = "The recitations are very similar with minor differences."
-            score = 80
-        elif norm_distance < 10:
-            result = "The recitations show moderate similarity."
-            score = 60
-        elif norm_distance < 20:
-            result = "The recitations show some noticeable differences."
-            score = 40
-        else:
-            result = "The recitations are quite different."
-            score = max(0, 100 - norm_distance)
-        return result, score
-
-    def get_embedding_for_file(self, file_path):
-        if file_path in self.embedding_cache:
-            return self.embedding_cache[file_path]
-        audio = self.load_audio(file_path)
-        embedding = self.get_deep_embedding(audio)
-        self.embedding_cache[file_path] = embedding
-        return embedding
-
-    def predict(self, file_path1, file_path2):
-        embedding1 = self.get_embedding_for_file(file_path1)
-        embedding2 = self.get_embedding_for_file(file_path2)
-        norm_distance = self.compute_dtw_distance(embedding1.T, embedding2.T)
-        interpretation, similarity_score = self.interpret_similarity(norm_distance)
-        print(f"Similarity Score: {similarity_score:.1f}/100")
-        print(f"Interpretation: {interpretation}")
-        return similarity_score, interpretation
-
-    def clear_cache(self):
-        self.embedding_cache = {}
-
-# --- Lifespan Event Handler ---
-@asynccontextmanager
-async def lifespan(app: FastAPI):
-    global comparer
-    auth_token = os.environ.get("HF_TOKEN")
-    comparer = QuranRecitationComparer(
-        model_name="jonatasgrosman/wav2vec2-large-xlsr-53-arabic",
-        auth_token=auth_token
-    )
-    print("Model initialized and ready for predictions!")
-    yield
-    print("Application shutdown: Cleanup if necessary.")
-
-app = FastAPI(
-    title="Quran Recitation Comparer API",
-    description="Compares two Quran recitations using a deep wav2vec2 model.",
-    version="1.0",
-    lifespan=lifespan
-)
-
-# --- API Endpoints ---
-@app.get("/", summary="Health Check")
 async def root():
-    return {
-        ...
-    }
-
-    try:
-        ...
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=str(e))
-    finally:
-        if tmp1_path and os.path.exists(tmp1_path):
-            os.remove(tmp1_path)
-        if tmp2_path and os.path.exists(tmp2_path):
-            os.remove(tmp2_path)
-
-    ...
-    comparer.clear_cache()
-    return {"message": "Cache cleared."}
 import os
+from fastapi import FastAPI, UploadFile, File
+from google import genai
+from google.genai import types
+import uvicorn
 
+app = FastAPI()
 
+# Retrieve the GenAI API key from the environment variable.
+api_key = os.getenv("GENAI_API_KEY")
+if not api_key:
+    raise EnvironmentError("GENAI_API_KEY environment variable not set")
 
+# Initialize the GenAI client.
+client = genai.Client(api_key=api_key)
 
+@app.get("/")
 async def root():
+    return {
+        "message": "Welcome to the Audio Similarity API!",
+        "usage": {
+            "endpoint": "/compare-audio",
+            "description": "POST two audio files (user recitation and professional qari) for similarity analysis.",
+            "instructions": "Send audio files as form-data with keys 'audio1' and 'audio2'."
+        }
+    }
+
+@app.post("/compare-audio")
+async def compare_audio(
+    audio1: UploadFile = File(...),
+    audio2: UploadFile = File(...)
+):
+    # Read the uploaded audio files.
+    audio1_bytes = await audio1.read()
+    audio2_bytes = await audio2.read()
+
+    # Create a refined prompt that clearly identifies the audio sources.
+    prompt = (
+        """Please analyze and compare the two provided audio clips.
+The first audio is the user's recitation, and the second audio is the professional qari recitation.
+Evaluate their similarity on a scale from 0 to 1, where:
+- 1 indicates the user's recitation contains no mistakes compared to the professional version,
+- 0 indicates there are significant mistakes.
+Provide your response with:
+1. A numerical similarity score on the first line.
+2. A single sentence that indicates whether the user's recitation is similar, moderately similar, or dissimilar to the professional qari."""
+    )
 
+    # Generate the content using the Gemini model with the two audio inputs.
+    response = client.models.generate_content(
+        model='gemini-2.0-flash',
+        contents=[
+            prompt,
+            types.Part.from_bytes(
+                data=audio1_bytes,
+                mime_type=audio1.content_type,
+            ),
+            types.Part.from_bytes(
+                data=audio2_bytes,
+                mime_type=audio2.content_type,
+            )
+        ]
+    )
 
+    # Return the model's response.
+    return {"result": response.text}
 
+if __name__ == "__main__":
+    uvicorn.run(app, host="0.0.0.0", port=8000)
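For reference, the removed implementation scored similarity by aligning two wav2vec2 hidden-state sequences with dynamic time warping and normalizing the accumulated cost by the warping-path length. A minimal standalone sketch of that distance computation, assuming feature matrices shaped (n_features, n_frames) as librosa.sequence.dtw expects:

import numpy as np
from librosa.sequence import dtw

def normalized_dtw_distance(features1: np.ndarray, features2: np.ndarray) -> float:
    # features1/features2: (n_features, n_frames) matrices, e.g. the transposed
    # wav2vec2 hidden states that the removed predict() passed in.
    D, wp = dtw(X=features1, Y=features2, metric="euclidean")
    # D[-1, -1] is the accumulated alignment cost; dividing by the warping-path
    # length keeps scores comparable across clips of different durations.
    return float(D[-1, -1] / len(wp))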
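One robustness gap in the new endpoint: UploadFile.content_type can be empty when a client omits the Content-Type of a multipart field, and types.Part.from_bytes needs a concrete MIME type. A hedged hardening sketch (the "audio/wav" default is an assumption about typical uploads):

# Hypothetical fallback inside compare_audio: default the MIME type when the
# client did not send one with the upload, then pass mime_type=mime1/mime2
# to types.Part.from_bytes instead of the raw content_type attributes.
mime1 = audio1.content_type or "audio/wav"
mime2 = audio2.content_type or "audio/wav"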
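The prompt asks Gemini to put the numeric score on the first line and a one-sentence verdict after it. A hypothetical helper for splitting the returned text, guarded in case the model strays from that format:

def parse_comparison(text: str):
    # Assumes the "score first, sentence second" layout requested by the prompt.
    lines = [line.strip() for line in text.strip().splitlines() if line.strip()]
    try:
        score = float(lines[0])
    except (IndexError, ValueError):
        score = None  # the model deviated from the requested format
    verdict = " ".join(lines[1:]) if len(lines) > 1 else ""
    return score, verdict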
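A possible client call against a local run of the app (host, port, and file names are illustrative; the form keys must be 'audio1' and 'audio2'):

import requests

# Post two audio files as multipart form-data to the comparison endpoint.
with open("user_recitation.wav", "rb") as user, open("qari_reference.wav", "rb") as ref:
    response = requests.post(
        "http://localhost:8000/compare-audio",
        files={
            "audio1": ("user_recitation.wav", user, "audio/wav"),
            "audio2": ("qari_reference.wav", ref, "audio/wav"),
        },
    )
print(response.json()["result"])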