Spaces: Sleeping
sachin committed
Commit 6a2d9d9 · 1 Parent(s): 436af15
add cpu mode
Browse files
- Dockerfile +28 -0
- Dockerfile.cpu +28 -0
- LICENSE +21 -0
- compose.yaml +16 -0
- docs/api_endpoints.md +3 -0
- docs/kannada_sample_3_out.md +3 -0
- docs/kannada_sample_4_out.md +3 -0
- docs/load_testing.md +6 -0
- docs/optimisations_grok.md +274 -0
- requirements.txt +177 -0
- samples/kannada_sample_1.wav +0 -0
- samples/kannada_sample_2.wav +0 -0
- server-setup.sh +17 -0
- src/asr_api.py +307 -0
- src/hf_asr.py +18 -0
- src/hf_asr_advanced.py +42 -0
- src/nemo_asr.py +52 -0
- src/test/api_device_performance_test.py +57 -0
- src/test/hf_performance_test.py +25 -0
- src/test/locustfile.py +39 -0
Dockerfile
ADDED
@@ -0,0 +1,28 @@
FROM nvidia/cuda:12.6.0-cudnn-devel-ubuntu22.04

WORKDIR /app

RUN apt-get update && apt-get install -y \
    python3 \
    python3-pip \
    git \
    ffmpeg \
    sudo \
    wget \
    && ln -s /usr/bin/python3 /usr/bin/python \
    && rm -rf /var/lib/apt/lists/*

COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY . .

RUN useradd -ms /bin/bash appuser \
    && chown -R appuser:appuser /app

USER appuser

EXPOSE 7860

# Use absolute path for clarity
CMD ["python", "/app/src/asr_api.py", "--host", "0.0.0.0", "--port", "7860", "--device", "cuda"]
Dockerfile.cpu
ADDED
@@ -0,0 +1,28 @@
FROM ubuntu:22.04

WORKDIR /app

RUN apt-get update && apt-get install -y \
    python3 \
    python3-pip \
    git \
    ffmpeg \
    sudo \
    wget \
    && ln -s /usr/bin/python3 /usr/bin/python \
    && rm -rf /var/lib/apt/lists/*

COPY cpu-requirements.txt .
RUN pip install --no-cache-dir -r cpu-requirements.txt

COPY . .

RUN useradd -ms /bin/bash appuser \
    && chown -R appuser:appuser /app

USER appuser

EXPOSE 7860

# Use absolute path for clarity
CMD ["python", "/app/src/asr_api.py", "--host", "0.0.0.0", "--port", "7860", "--device", "cpu"]
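Hypothetical usage from the repo root (standard Docker flags, not commands from this commit): `docker build -t asr-indic-server .` builds the GPU image from Dockerfile, and `docker build -f Dockerfile.cpu -t asr-indic-server:cpu .` builds the CPU variant.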
LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2025 Sachin Shetty

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
compose.yaml
ADDED
@@ -0,0 +1,16 @@
services:
  asr-indic-server:
    image: slabstech/asr_indic_server
    volumes:
      - ~/.cache/huggingface:/root/.cache/huggingface
    ports:
      - 8000:8000
    environment:
      - LANGUAGE=kn
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ['0']
              capabilities: [gpu]
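With this file in the repo root, `docker compose up` starts the container pinned to GPU 0 with the Hugging Face cache mounted. Note the 8000:8000 mapping assumes the API listens on 8000, while the Dockerfile's CMD starts it on 7860; one of the two likely needs adjusting.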
docs/api_endpoints.md
ADDED
@@ -0,0 +1,3 @@
API endpoints

- sarvam batch ASR - https://github.com/sarvamai/sarvam-ai-cookbook/blob/main/notebooks/stt/stt-batch-api/Sarvam_STT_Batch_API_Demo.ipynb
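Once the server is running, this repo's own `/transcribe/` endpoint (defined in src/asr_api.py below) can be exercised like this; a minimal sketch, assuming the compose file's port 8000:

```python
import requests

# POST a sample WAV to /transcribe/; the endpoint requires a `language` query parameter.
with open("samples/kannada_sample_1.wav", "rb") as f:
    resp = requests.post(
        "http://localhost:8000/transcribe/",
        files={"file": ("kannada_sample_1.wav", f, "audio/wav")},
        params={"language": "kannada"},
    )
print(resp.json())  # {"text": "..."}
```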
docs/kannada_sample_3_out.md
ADDED
@@ -0,0 +1,3 @@
{
"text": " ಹೋ ನಾವಾಡುವ ನುಡಿಗೆ ಕನ್ನಡ ನುಡಿ ಚಿನ್ನದ ನುಡಿ ಸಿರಿಗನ್ನಡದ ನುಡಿ ನಾವಿರುವ ತಾಣವೆ ಗಂಧದ ಗುಡಿ ಅಂದದ ಗುಡಿ ಚಂದದ ಗುಡಿ ನವಾಡುವ ನುಡಿಯೇ ಕನ್ನಡ ನುಡಿಯೇ ನಾವಿರುವ ತಾಣವೆ ಗಂಧದ ಗುಡಿ ಅಂಧದ ಗುಡಿ ಗಂಧದ ಗುಡಿ ಚಂದದ ಗುಡಿ ಶ್ರೀ ಗಂಧಿ ಯದಗುಡಿ ಹಾಕಿದರು ಹಸುರಿನ ಬಣ್ಣ ಸಿರಿಗೆ ಒಲಿದು ಸೌಂದರ್ಯ ಸರಸ್ವತಿ ಧರೆಗಿಳಿದು ಅಹ್ ಹಾ ಹೊಲಜು ಬನದಿ ಜಲ ಹೂ ಬನದಲ್ಲಿ ನಲಿಯುತವೋ ಲಾಡಿ ಚಲುವಿನ ಬಲಜಾ ಬಿ ಈ ಕಂಧದ ಗುಡಿಗೆಲಿ ನೆಲೆಸಿದಳು ಇದು ಯಾರ ತಪಸ್ಸಿನ ಫಲವೋ ಈ ಕಂಗಳು ಮಾಡಿದ ಪುಣ್ಯವೋಹ ನಾವಿಲ್ಲ ತಾಣವೆ ಗಂಧಬೆಗುಡಿ ಹ ಹಿಂಬುತ ಓಡಿ ವೆಜಿಂಕೆಗಳು ಕುಣಿದು ಆಡುತ್ತಲಿದಿವೆ ನವಿಲುಗಳು ಹಾ ಮುಗಿಲನ್ನು ಚುಂಬಿಸುವ ಸೆಲಿ ತೂಗಾಡುತ್ತ ನಿಂತು ಮರಗಳಲಿ ಕಾಡು ತಿರೇ ಬಾನಾಡಿಗಳು ಎದೆಯಲ್ಲಿ ಸಂತಸ ದಾವನಲು ಇದು ಒನ್ಯ ಮೃಗಗಳ ಲೋಕವೋ ಈ ಭೂಮಿಗೆ ಇಳಿದ ನಾಕವೋ ನವಾಡುವ ನುಡಿಗೆ ಕನ್ನಡ ನುಡಿ ನಾವಿರುವ ತಾಣವೆ ಗಂಧದ ಗುಡಿ ಅಂದದ ಗುಡಿ ಗಂಧದ ದುಡಿ ಚಂದದ ಗುಡಿ ಶ್ರೀ ದಂತದ ಗುಡಿ ಅಹ್ ಹ್ಞೂ ಹೋ ನಾವಾಡುವ ನುಡಿಗೆ ಕನ್ನಡ ನುಡಿ ಚಿನ್ನದ ನುಡಿ ಸಿರಿಗನ್ನಡದ ನುಡಿ ನಾವಿರುವ ತಾಣವೆ ಗಂಧದ ಗುಡಿ ಅಂದದ ಗುಡಿ ಚಂದದ ಗುಡಿ ನವಾಡುವ ನುಡಿಯೇ ಕನ್ನಡ ನುಡಿಯೇ ನಾವಿರುವ ತಾಣವೆ ಗಂಧದ ಗುಡಿ ಅಂಧದ ಗುಡಿ ಗಂಧದ ಗುಡಿ ಚಂದದ ಗುಡಿ ಶ್ರೀ ಗಂಧಿ ಯದಗುಡಿ ಹಾಕಿದರು ಹಸುರಿನ ಬಣ್ಣ ಸಿರಿಗೆ ಒಲಿದು ಸೌಂದರ್ಯ ಸರಸ್ವತಿ ಧರೆಗಿಳಿದು ಅಹ್ ಹಾ ಹೊಲಜು ಬನದಿ ಜಲ ಹೂ ಬನದಲ್ಲಿ ನಲಿಯುತವೋ ಲಾಡಿ ಚಲುವಿನ ಬಲಜಾ ಬಿ ಈ ಕಂಧದ ಗುಡಿಗೆಲಿ ನೆಲೆಸಿದಳು ಇದು ಯಾರ ತಪಸ್ಸಿನ ಫಲವೋ ಈ ಕಂಗಳು ಮಾಡಿದ ಪುಣ್ಯವೋಹ ನಾವಿಲ್ಲ ತಾಣವೆ ಗಂಧಬೆಗುಡಿ ಹ ಹಿಂಬುತ ಓಡಿ ವೆಜಿಂಕೆಗಳು ಕುಣಿದು ಆಡುತ್ತಲಿದಿವೆ ನವಿಲುಗಳು ಹಾ ಮುಗಿಲನ್ನು ಚುಂಬಿಸುವ ಸೆಲಿ ತೂಗಾಡುತ್ತ ನಿಂತು ಮರಗಳಲಿ ಕಾಡು ತಿರೇ ಬಾನಾಡಿಗಳು ಎದೆಯಲ್ಲಿ ಸಂತಸ ದಾವನಲು ಇದು ಒನ್ಯ ಮೃಗಗಳ ಲೋಕವೋ ಈ ಭೂಮಿಗೆ ಇಳಿದ ನಾಕವೋ ನವಾಡುವ ನುಡಿಗೆ ಕನ್ನಡ ನುಡಿ ನಾವಿರುವ ತಾಣವೆ ಗಂಧದ ಗುಡಿ ಅಂದದ ಗುಡಿ ಗಂಧದ ದುಡಿ ಚಂದದ ಗುಡಿ ಶ್ರೀ ದಂತದ ಗುಡಿ ಅಹ್ ಹ್ಞೂ"
}
docs/kannada_sample_4_out.md
ADDED
@@ -0,0 +1,3 @@
{
"text": " ನನ್ನ ಆಶೀರ್ವಾದ ಸಾಕಾರ ಇದ್ದೇ ಐತೆ ಅದು ಬಿಡು ಅಲ ನೀ ಕೈ ಹಾಕೊಂಡಿರೋ ಕೆಲಸಗೆ ಉದ್ಧಾರ ಆಕ್ಯ ಅನ್ನೋ ನಂಬಿಕೆ ನಂಗ್ ಎಳ್ಳಷ್ಟು ಇಲ್ವಲಪ್ಪು ಮಾರಾಯ ಇದಾಗದ ಕೆಲಸ ಇದ ಆಗದು ಎಂದು ಕೈಲಾಗದು ಎಂದು ಕೈ ಕಟ್ಟಿ ಕುಳಿತರೆ ಆಗದು ಎಂದು ಕೈ ಕಟ್ಟಿ ಕುಳಿತರೆ ಸಾಗದು ಕೆಲಸ ಒಂದು ಮುಂದೆ ಮನಸೊಂದಿ ಕೆಚ್ಚದೆ ಇರಬೇಕೆಂದೆಂದು ಆಗದು ಎಂದು ಕೈಲಾಗದು ಎಂದು ಕೈ ಕಟ್ಟಿ ಕುಳಿತರೆ ಸಾಗದು ಕೆಲಸವು ಮುಂದೆ ಸಾಗದು ಕೆಲಸವು ಮುಂದೆ ಕೆತ್ತಲಾಗದು ಕಗ್ಗಲ್ಲೆಂದು ಎದೆಗುಂದಿ ಕರೆ ಶಿಲ್ಪಿ ಕೆತ್ತಲಾಗದು ಕಗ್ಗಲ್ಲೆಂದು ಎದೆಗುಂದಿ ಕರೆ ಶಿಲ್ಪಿ ಆಗುತ್ತಿದ್ದೇವೆ ತಲೆಗಳ ಬೀಡು ತಮ್ಮ ಟೆ ಚನಮೆಲೆ ನಾಡು ಬೇಲೂ ಹಳೆಬೀಡು ಬೇಲೂ ಹಳೆಬೀಡು ಕೆತ್ತಲಾಗದು ಕಗ್ಗಲ್ಲೆಂದು ಎದೆಗುಂದಿದ್ದರೆ ಶೀಪೀ ಆಗುತ್ತಿದ್ದೇ ಕಲೆಗಳ ಬೀಡು ಗೊಮ್ಮಟೆ ದೇಶನ ನೆಲೆನಾಡು ಬೇಲೂರು ಹಳೆ ಬೀಡು ಬೇಲೂರು ಹಳೆ ಬೀಡು ಸಾಗದು ಎಂದು ಕೈಲಾಗದು ಎಂದು ಕೈ ಕಟ್ಟಿ ಕುಳಿತರೆ ಸಾಗದು ಕೆಲಸವು ಮುಂದೆ ಸಾಗದು ಕೆಲಸವು ಮುಂದೆ ಕಾದೇರಿಯನು ಹರಿಯಲು ಬಿಟ್ಟು ಕಾದೇರಿಯನು ಹರಿಯಲು ಬಿಟ್ಟು ವಿಶೇಷ ರೈ ಶ್ರಮ ಪಡದಿದ್ದರೇ ತನ್ನ ಪಾಡಿಯ ಪಟ್ಟದಿದ್ದರೇ ಕಾದೇರಿಯನ್ನು ಹರಿಯಲು ಬಿಟ್ಟು ವಿಶೇಷತೆಯ ಶ್ರಮ ಪಡೆದಿದ್ದರೇ ತನ್ನಂಬ ಹಡಿಯ ಕಟ್ಟತೆ ಬಂಗಾರ ಬೆಳೆವ ಹೊನ್ನಾಡು ಆ ಬಂಗಾರ ಬೆಳೆವ ಹೊನ್ನಾಡು ಆಗುತ್ತಿದ್ದೇ ನಾಡು ಕನ್ನಡ ಸಿರಿ ನಾಡು ನಮ್ಮ ಕನ್ನಡ ಸಿರಿ ನಾಡು ಆಗದು ಎಂದು ಕೈ ಸಾಗದು ಎಂತು ಕೈ ಕಟ್ಟಿ ಕುಳಿತರೆ ಸಾಗದು ಕೆಲಸವು ನುಂಬೆ ಸಾಗದು ಕೆಲಸವು ನುಂಬೆ ಕೈ ಕೆಸರಾದರೆ ಬಾಯಿ ಮೊಸದೆಂಬ ಹೀಗೆ ಹಿರಿಯರ ಅನುಭವ ಸತ್ಯ ಇದ ನೆನಪಿಡದೆ ಮುನಿತ್ಯ ಕೈ ಕೆಸರಾದರೆ ಬಾಯ್ಮಸದೆಂಬ ಹಿರಿಯರ ಅನುಭವ ಸತ್ಯ ಇದು ನೆನಪಿಡಬೇಕು ನಿತ್ಯ ದುಡಿಮೆಯ ನಂಬಿ ಬದುಕ ದುಡಿಮೆಯ ನಂಬಿ ಬದುಕು ಅದರಲ್ಲಿ ದೇವರ ಹುಡುಕು ಬಾಳಲಿ ಬದುಬುದು ಬೆಳಕು ನಮ್ಮ ಬಾಳು ಕೈಲಾಗದು ಎಂದು ಕೈ ಕಟ್ಟಿ ಕುಳಿತರೆ ಸಾಗದು ಕೆಲಸವು ಮುಂದೆ ಮನಸೊಂದಿದ್ದರೆ ಆಗವು ಉಂಟು ಕೆಚ್ಚೆದೆಗಿರಬೇಕೆಂದು ಕೆಚ್ಚೆದೆಗಿರಬೇಕು ೆಂದು ಆಗದು ಎಂದು ಕೈಲಾಗದು ಎಂದು ಕೈ ಕಟ್ಟಿ ಕುಳಿತರೆ ಸಾಗದು ಕೆಲಸವು ಮುಂದೆಯೇ ಸಾಗದು ಕೆಲಸವು ಮುಂದೆ ನನ್ನ ಆಶೀರ್ವಾದ ಸಾಕಾರ ಇದ್ದೇ ಐತೆ ಅದು ಬಿಡು ಅಲ ನೀ ಕೈ ಹಾಕೊಂಡಿರೋ ಕೆಲಸಗೆ ಉದ್ಧಾರ ಆಕ್ಯ ಅನ್ನೋ ನಂಬಿಕೆ ನಂಗ್ ಎಳ್ಳಷ್ಟು ಇಲ್ವಲಪ್ಪು ಮಾರಾಯ ಇದಾಗದ ಕೆಲಸ ಇದ ಆಗದು ಎಂದು ಕೈಲಾಗದು ಎಂದು ಕೈ ಕಟ್ಟಿ ಕುಳಿತರೆ ಆಗದು ಎಂದು ಕೈ ಕಟ್ಟಿ ಕುಳಿತರೆ ಸಾಗದು ಕೆಲಸ ಒಂದು ಮುಂದೆ ಮನಸೊಂದಿ ಕೆಚ್ಚದೆ ಇರಬೇಕೆಂದೆಂದು ಆಗದು ಎಂದು ಕೈಲಾಗದು ಎಂದು ಕೈ ಕಟ್ಟಿ ಕುಳಿತರೆ ಸಾಗದು ಕೆಲಸವು ಮುಂದೆ ಸಾಗದು ಕೆಲಸವು ಮುಂದೆ ಕೆತ್ತಲಾಗದು ಕಗ್ಗಲ್ಲೆಂದು ಎದೆಗುಂದಿ ಕರೆ ಶಿಲ್ಪಿ ಕೆತ್ತಲಾಗದು ಕಗ್ಗಲ್ಲೆಂದು ಎದೆಗುಂದಿ ಕರೆ ಶಿಲ್ಪಿ ಆಗುತ್ತಿದ್ದೇವೆ ತಲೆಗಳ ಬೀ��ು ತಮ್ಮ ಟೆ ಚನಮೆಲೆ ನಾಡು ಬೇಲೂ ಹಳೆಬೀಡು ಬೇಲೂ ಹಳೆಬೀಡು ಕೆತ್ತಲಾಗದು ಕಗ್ಗಲ್ಲೆಂದು ಎದೆಗುಂದಿದ್ದರೆ ಶೀಪೀ ಆಗುತ್ತಿದ್ದೇ ಕಲೆಗಳ ಬೀಡು ಗೊಮ್ಮಟೆ ದೇಶನ ನೆಲೆನಾಡು ಬೇಲೂರು ಹಳೆ ಬೀಡು ಬೇಲೂರು ಹಳೆ ಬೀಡು ಸಾಗದು ಎಂದು ಕೈಲಾಗದು ಎಂದು ಕೈ ಕಟ್ಟಿ ಕುಳಿತರೆ ಸಾಗದು ಕೆಲಸವು ಮುಂದೆ ಸಾಗದು ಕೆಲಸವು ಮುಂದೆ ಕಾದೇರಿಯನು ಹರಿಯಲು ಬಿಟ್ಟು ಕಾದೇರಿಯನು ಹರಿಯಲು ಬಿಟ್ಟು ವಿಶೇಷ ರೈ ಶ್ರಮ ಪಡದಿದ್ದರೇ ತನ್ನ ಪಾಡಿಯ ಪಟ್ಟದಿದ್ದರೇ ಕಾದೇರಿಯನ್ನು ಹರಿಯಲು ಬಿಟ್ಟು ವಿಶೇಷತೆಯ ಶ್ರಮ ಪಡೆದಿದ್ದರೇ ತನ್ನಂಬ ಹಡಿಯ ಕಟ್ಟತೆ ಬಂಗಾರ ಬೆಳೆವ ಹೊನ್ನಾಡು ಆ ಬಂಗಾರ ಬೆಳೆವ ಹೊನ್ನಾಡು ಆಗುತ್ತಿದ್ದೇ ನಾಡು ಕನ್ನಡ ಸಿರಿ ನಾಡು ನಮ್ಮ ಕನ್ನಡ ಸಿರಿ ನಾಡು ಆಗದು ಎಂದು ಕೈ ಸಾಗದು ಎಂತು ಕೈ ಕಟ್ಟಿ ಕುಳಿತರೆ ಸಾಗದು ಕೆಲಸವು ನುಂಬೆ ಸಾಗದು ಕೆಲಸವು ನುಂಬೆ ಕೈ ಕೆಸರಾದರೆ ಬಾಯಿ ಮೊಸದೆಂಬ ಹೀಗೆ ಹಿರಿಯರ ಅನುಭವ ಸತ್ಯ ಇದ ನೆನಪಿಡದೆ ಮುನಿತ್ಯ ಕೈ ಕೆಸರಾದರೆ ಬಾಯ್ಮಸದೆಂಬ ಹಿರಿಯರ ಅನುಭವ ಸತ್ಯ ಇದು ನೆನಪಿಡಬೇಕು ನಿತ್ಯ ದುಡಿಮೆಯ ನಂಬಿ ಬದುಕ ದುಡಿಮೆಯ ನಂಬಿ ಬದುಕು ಅದರಲ್ಲಿ ದೇವರ ಹುಡುಕು ಬಾಳಲಿ ಬದುಬುದು ಬೆಳಕು ನಮ್ಮ ಬಾಳು ಕೈಲಾಗದು ಎಂದು ಕೈ ಕಟ್ಟಿ ಕುಳಿತರೆ ಸಾಗದು ಕೆಲಸವು ಮುಂದೆ ಮನಸೊಂದಿದ್ದರೆ ಆಗವು ಉಂಟು ಕೆಚ್ಚೆದೆಗಿರಬೇಕೆಂದು ಕೆಚ್ಚೆದೆಗಿರಬೇಕು ೆಂದು ಆಗದು ಎಂದು ಕೈಲಾಗದು ಎಂದು ಕೈ ಕಟ್ಟಿ ಕುಳಿತರೆ ಸಾಗದು ಕೆಲಸವು ಮುಂದೆಯೇ ಸಾಗದು ಕೆಲಸವು ಮುಂದೆ"
}
docs/load_testing.md
ADDED
@@ -0,0 +1,6 @@
Load Test API

- pip install locust
- cd src/test
- locust --host http://localhost:8000 --run-time 60s
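Note: in recent Locust releases `--run-time` only applies together with `--headless` (or web-UI autostart), so a non-interactive run looks like `locust --headless -u 10 -r 2 --host http://localhost:8000 --run-time 60s`; the user counts here are illustrative.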
docs/optimisations_grok.md
ADDED
@@ -0,0 +1,274 @@
Your code is a well-structured FastAPI application for transcribing audio files using NVIDIA NeMo ASR models, supporting multiple Indian languages and batch processing. It includes logging, audio preprocessing, and error handling, which are solid foundations for a production-ready service. However, there are several areas where improvements can be made to reduce latency, increase throughput (serve more users), and ensure production readiness. Below, I'll suggest enhancements and identify missing elements for a robust production deployment.

## Key Improvements for Latency and Scalability

### 1. Optimize Model Inference

- **Mixed Precision**: Enable mixed-precision inference with `torch.cuda.amp.autocast` (currently commented out) to reduce latency on CUDA GPUs. This uses bfloat16 or float16, cutting computation time without significant accuracy loss.

```python
with torch.cuda.amp.autocast(dtype=torch.bfloat16):
    rnnt_texts = asr_manager.model.transcribe(chunk_file_paths, batch_size=1, language_id=language_id)
```

- **CUDA Graphs**: If using NeMo 2.0.0+, enable CUDA Graphs for repeated inference calls to reduce kernel launch overhead. This requires a fixed input shape, so test compatibility with dynamic audio lengths; a generic PyTorch sketch follows.
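A generic capture/replay sketch with PyTorch's CUDA Graph API (not NeMo-specific; `model_fn` is a hypothetical fixed-shape inference callable standing in for the model):

```python
import torch

def model_fn(x):
    # Hypothetical fixed-shape inference callable; a real model would go here.
    return x * 2.0

static_input = torch.randn(8, 16000, device="cuda")

# Warm up on a side stream before capture, as the PyTorch docs recommend.
s = torch.cuda.Stream()
s.wait_stream(torch.cuda.current_stream())
with torch.cuda.stream(s):
    _ = model_fn(static_input)
torch.cuda.current_stream().wait_stream(s)

# Capture one iteration into a graph, then replay it with new data.
g = torch.cuda.CUDAGraph()
with torch.cuda.graph(g):
    static_output = model_fn(static_input)

static_input.copy_(torch.randn(8, 16000, device="cuda"))  # refill the static buffer
g.replay()                                                # reruns the captured kernels
```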
- **Batch Size**: Dynamically adjust `batch_size` based on input size or GPU memory. Currently, it's hardcoded (1 for single files, `len(files)` for batch). For larger batches, process in smaller chunks to avoid OOM errors:

```python
batch_size = min(len(chunk_file_paths), 32)  # Cap at 32, adjust based on GPU memory
rnnt_texts = asr_manager.model.transcribe(chunk_file_paths, batch_size=batch_size, language_id=language_id)
```

### 2. Model Management

- **Preload Models**: Loading a new model for every language switch (e.g., `load_model`) is slow and memory-intensive. Preload all required models at startup if memory allows, or use a caching mechanism:

```python
class ASRModelManager:
    def __init__(self, default_language="kn"):
        self.models = {}
        self.default_language = default_language
        self.load_initial_model(default_language)

    def load_initial_model(self, language_id):
        model = self.load_model(language_id)
        self.models[language_id] = model

    def get_model(self, language_id):
        if language_id not in self.models:
            self.models[language_id] = self.load_model(language_id)
        return self.models[language_id]
```

Then update `/transcribe/` and `/transcribe_batch/` to use `asr_manager.get_model(language_id)` instead of reloading; a combined sketch follows the Model Sharing snippet below.

- **Model Sharing**: Ensure thread-safety when sharing models across requests. FastAPI runs async, so use a lock if multiple workers access the same model:

```python
from threading import Lock

class ASRModelManager:
    def __init__(self, default_language="kn"):
        self.model_locks = {lang: Lock() for lang in self.model_language.keys()}
        ...

    async def transcribe(self, paths, language_id, batch_size):
        with self.model_locks[language_id]:
            model = self.get_model(language_id)
            return model.transcribe(paths, batch_size=batch_size, language_id=language_id)
```
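Tying the two points together, a sketch of the updated handler body (names reused from the original handler; `transcribe_chunks` is a hypothetical helper):

```python
def transcribe_chunks(asr_manager, chunk_file_paths, language: str):
    """Resolve the language, fetch the cached model, and transcribe the chunks."""
    language_id = asr_manager.model_language.get(language, asr_manager.default_language)
    model = asr_manager.get_model(language_id)  # cached; no reload on language switch
    model.cur_decoder = "rnnt"
    return model.transcribe(chunk_file_paths, batch_size=1, language_id=language_id)
```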
### 3. Audio Preprocessing

- **In-Memory Processing**: Avoid writing to disk with temporary files (`tempfile.NamedTemporaryFile`) and splitting chunks to disk (`split_audio`). Process audio in memory to reduce I/O latency:

```python
def split_audio_in_memory(self, audio_segment, chunk_duration_ms=15000):
    duration_ms = len(audio_segment)
    if duration_ms <= chunk_duration_ms:
        return [audio_segment]
    chunks = [audio_segment[i:i + chunk_duration_ms] for i in range(0, duration_ms, chunk_duration_ms)]
    return chunks
```

Modify `/transcribe/` to:

```python
audio_chunks = asr_manager.split_audio_in_memory(audio)
chunk_buffers = [io.BytesIO() for _ in audio_chunks]
for chunk, buffer in zip(audio_chunks, chunk_buffers):
    chunk.export(buffer, format="wav")
    buffer.seek(0)
rnnt_texts = asr_manager.model.transcribe(chunk_buffers, batch_size=len(chunk_buffers), language_id=language_id)
```

- **Async Preprocessing**: Offload audio conversion (e.g., sample rate adjustment) to an async task or worker queue to free up the main thread (see the sketch below).
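One way to do that with the standard library alone (a sketch; `prepare_wav` is a hypothetical helper wrapping the pydub resampling shown above):

```python
import asyncio
import io
from pydub import AudioSegment

def prepare_wav(file_content: bytes) -> io.BytesIO:
    # Blocking pydub/ffmpeg work: decode, resample to 16 kHz mono, re-encode as WAV.
    audio = AudioSegment.from_file(io.BytesIO(file_content))
    audio = audio.set_frame_rate(16000).set_channels(1)
    buf = io.BytesIO()
    audio.export(buf, format="wav")
    buf.seek(0)
    return buf

async def prepare_wav_async(file_content: bytes) -> io.BytesIO:
    # Run the blocking conversion in a worker thread so the event loop stays free.
    return await asyncio.to_thread(prepare_wav, file_content)
```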
### 4. Async and Concurrency

- **Worker Queue**: For heavy loads, integrate a task queue (e.g., Celery with Redis) to handle transcription jobs asynchronously. This decouples preprocessing and inference from the HTTP response:

```python
from celery import Celery

celery_app = Celery('asr', broker='redis://localhost:6379/0')

@celery_app.task
def transcribe_task(file_paths, language_id):
    model = asr_manager.get_model(language_id)
    return model.transcribe(file_paths, batch_size=len(file_paths), language_id=language_id)

@app.post("/transcribe_async/")
async def transcribe_async(file: UploadFile = File(...), language: str = Query(...)):
    # Save file temporarily or process in memory
    task = transcribe_task.delay([tmp_file_path], asr_manager.model_language[language])
    return {"task_id": task.id}
```

- **Increase Workers**: Run FastAPI with multiple Uvicorn workers (`uvicorn --workers 4`) to handle concurrent requests, leveraging multiple CPU cores.
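Note that `uvicorn.run(app, ...)` with an app object stays single-process even if `workers` is passed; workers take effect with an import string (a sketch, assuming the module path used in this repo):

```python
import uvicorn

if __name__ == "__main__":
    # Workers only take effect when uvicorn is given an import string, not an app instance.
    uvicorn.run("src.asr_api:app", host="0.0.0.0", port=8000, workers=4)
```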
### 5. FastAPI Performance

- **Response Streaming**: For long transcriptions, stream results back to the client instead of waiting for full processing:

```python
from fastapi.responses import StreamingResponse

async def stream_transcriptions(chunk_file_paths, language_id):
    model = asr_manager.get_model(language_id)
    for chunk in chunk_file_paths:
        text = model.transcribe([chunk], batch_size=1, language_id=language_id)[0]
        yield f"data: {text}\n\n"

@app.post("/transcribe_stream/")
async def transcribe_stream(file: UploadFile = File(...), language: str = Query(...)):
    audio_chunks = asr_manager.split_audio(tmp_file_path)
    return StreamingResponse(stream_transcriptions(audio_chunks, asr_manager.model_language[language]), media_type="text/event-stream")
```

- **Rate Limiting**: Add rate limiting (e.g., `slowapi`) to prevent overload:

```python
from slowapi import Limiter
from slowapi.util import get_remote_address

limiter = Limiter(key_func=get_remote_address)
app.state.limiter = limiter

@app.post("/transcribe/", response_model=TranscriptionResponse)
@limiter.limit("10/minute")
async def transcribe_audio(...):
    ...
```
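For completeness, slowapi also expects its exception handler to be registered and the decorated endpoint to accept the request object (the standard slowapi wiring, shown as a minimal sketch):

```python
from fastapi import FastAPI, Request
from slowapi import Limiter, _rate_limit_exceeded_handler
from slowapi.errors import RateLimitExceeded
from slowapi.util import get_remote_address

app = FastAPI()
limiter = Limiter(key_func=get_remote_address)
app.state.limiter = limiter
app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler)

@app.post("/transcribe/")
@limiter.limit("10/minute")
async def transcribe_audio(request: Request):  # slowapi needs the Request argument
    return {"ok": True}
```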
## Production Readiness: Missing Elements

### 1. Scalability

- **Load Balancing**: Deploy behind a load balancer (e.g., NGINX, HAProxy) to distribute requests across multiple instances.
- **Containerization**: Use Docker for consistent deployment:

```dockerfile
FROM nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu20.04
RUN apt-get update && apt-get install -y python3-pip ffmpeg
RUN pip3 install torch nemo_toolkit[asr] fastapi uvicorn pydub
COPY . /app
WORKDIR /app
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000", "--workers", "4"]
```

Build and run:

```bash
docker build -t asr-api .
docker run --gpus all -p 8000:8000 asr-api
```

- **Horizontal Scaling**: Use Kubernetes or Docker Swarm to scale instances based on demand.
### 2. Monitoring and Logging

- **Metrics**: Add Prometheus metrics (e.g., `prometheus-fastapi-instrumentator`) to track latency, request rate, and errors:

```python
from prometheus_fastapi_instrumentator import Instrumentator

Instrumentator().instrument(app).expose(app)
```

- **Distributed Logging**: Send logs to a centralized system (e.g., ELK Stack, Loki) instead of local files for better analysis.
### 3. Security

- **Authentication**: Add API key or JWT authentication (e.g., `fastapi-users`) to restrict access; a minimal API-key sketch follows this list.
- **Input Validation**: Validate audio file size and duration to prevent abuse:

```python
MAX_FILE_SIZE = 50 * 1024 * 1024  # 50MB
if len(file_content) > MAX_FILE_SIZE:
    raise HTTPException(status_code=400, detail="File too large")
```

- **HTTPS**: Configure SSL/TLS with NGINX or a cloud provider.
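A minimal API-key sketch using FastAPI's built-in security helpers (the header name and environment variable are assumptions):

```python
import os
from fastapi import Depends, FastAPI, HTTPException, Security
from fastapi.security.api_key import APIKeyHeader

app = FastAPI()
API_KEY = os.environ.get("ASR_API_KEY", "change-me")  # assumed env-based key
api_key_header = APIKeyHeader(name="X-API-Key", auto_error=False)

async def require_api_key(api_key: str = Security(api_key_header)):
    # Reject requests without a matching X-API-Key header.
    if api_key != API_KEY:
        raise HTTPException(status_code=401, detail="Invalid or missing API key")

@app.post("/transcribe/", dependencies=[Depends(require_api_key)])
async def transcribe_audio():
    return {"ok": True}
```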
### 4. Error Handling and Resilience

- **Retry Logic**: Add retries for transient failures (e.g., model inference errors) using `tenacity`:

```python
from tenacity import retry, stop_after_attempt, wait_fixed

@retry(stop=stop_after_attempt(3), wait=wait_fixed(2))
def transcribe_with_retry(model, paths, batch_size, language_id):
    return model.transcribe(paths, batch_size=batch_size, language_id=language_id)
```

- **Graceful Degradation**: If a model fails to load, fall back to a default (e.g., Kannada), as sketched below.
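A sketch of that fallback around the cached loader (`get_model_with_fallback` is a hypothetical helper):

```python
import logging

def get_model_with_fallback(manager, language_id: str, fallback: str = "kn"):
    """Return the requested model, falling back to the default Kannada model on failure."""
    try:
        return manager.get_model(language_id)
    except Exception as exc:
        logging.warning("Model load failed for %s (%s); falling back to %s",
                        language_id, exc, fallback)
        return manager.get_model(fallback)
```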
### 5. Configuration

- **Environment Variables**: Use `python-dotenv` or `pydantic-settings` for configurable settings (e.g., port, host, chunk duration):

```python
from pydantic_settings import BaseSettings

class Settings(BaseSettings):
    host: str = "127.0.0.1"
    port: int = 8000
    chunk_duration_ms: int = 15000

settings = Settings()
uvicorn.run(app, host=settings.host, port=settings.port)
```

## Final Optimized Code Snippet

Here's an example incorporating some key improvements:

```python
import torch
import nemo.collections.asr as nemo_asr
from fastapi import FastAPI, File, UploadFile, HTTPException, Query
from fastapi.responses import JSONResponse
from pydub import AudioSegment
import io
import logging
from threading import Lock

app = FastAPI()
logging.basicConfig(level=logging.INFO)

class ASRModelManager:
    def __init__(self, default_language="kn"):
        self.default_language = default_language
        self.model_language = {...}  # Same as original
        self.config_models = {...}  # Same as original
        self.models = {}
        self.model_locks = {lang: Lock() for lang in self.model_language.keys()}
        self.load_initial_model(default_language)

    def load_model(self, language_id):
        model = nemo_asr.models.ASRModel.from_pretrained(self.config_models[language_id])
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        return model.to(device).eval()

    def load_initial_model(self, language_id):
        self.models[language_id] = self.load_model(language_id)

    def get_model(self, language_id):
        if language_id not in self.models:
            with self.model_locks[language_id]:
                if language_id not in self.models:  # Double-check locking
                    self.models[language_id] = self.load_model(language_id)
        return self.models[language_id]

    def split_audio_in_memory(self, audio_segment, chunk_duration_ms=15000):
        duration_ms = len(audio_segment)
        if duration_ms <= chunk_duration_ms:
            return [audio_segment]
        return [audio_segment[i:i + chunk_duration_ms] for i in range(0, duration_ms, chunk_duration_ms)]

asr_manager = ASRModelManager()

@app.post("/transcribe/")
async def transcribe_audio(file: UploadFile = File(...), language: str = Query(...)):
    file_content = await file.read()
    audio = AudioSegment.from_file(io.BytesIO(file_content), format=file.filename.split(".")[-1].lower())
    if audio.frame_rate != 16000:
        audio = audio.set_frame_rate(16000).set_channels(1)

    audio_chunks = asr_manager.split_audio_in_memory(audio)
    chunk_buffers = [io.BytesIO() for _ in audio_chunks]
    for chunk, buffer in zip(audio_chunks, chunk_buffers):
        chunk.export(buffer, format="wav")
        buffer.seek(0)

    language_id = asr_manager.model_language.get(language, asr_manager.default_language)
    model = asr_manager.get_model(language_id)
    model.cur_decoder = "rnnt"
    with torch.cuda.amp.autocast(dtype=torch.bfloat16):
        rnnt_texts = model.transcribe(chunk_buffers, batch_size=min(len(chunk_buffers), 32), language_id=language_id)
    text = " ".join(rnnt_texts)
    return JSONResponse(content={"text": text})

if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000, workers=4)
```

## Summary

- **Latency**: Mixed precision, in-memory processing, and dynamic batching reduce inference time.
- **Scalability**: Preloaded models, async workers, and Triton (as an alternative) handle more users.
- **Production**: Add monitoring, security, and containerization for reliability.

For maximum performance, consider switching to NVIDIA Triton Inference Server (as suggested previously) instead of FastAPI if inference throughput is the top priority. Let me know if you'd like a deeper dive into any specific improvement!
requirements.txt
ADDED
@@ -0,0 +1,177 @@
absl-py==2.1.0
aiohappyeyeballs==2.4.6
aiohttp==3.11.12
aiosignal==1.3.2
alembic==1.14.1
annotated-types==0.7.0
antlr4-python3-runtime==4.9.3
anyio==4.8.0
asteroid-filterbanks==0.4.0
asttokens==3.0.0
async-timeout==5.0.1
attrs==25.1.0
audioread==3.0.1
blinker==1.9.0
braceexpand==0.1.7
Brotli==1.1.0
certifi==2025.1.31
cffi==1.17.1
charset-normalizer==3.4.1
click==8.1.8
coloredlogs==15.0.1
colorlog==6.9.0
ConfigArgParse==1.7
contourpy==1.3.1
cycler==0.12.1
Cython==0.29.37
cytoolz==1.0.1
datasets==2.21.0
decorator==5.2.0
dill==0.3.8
docopt==0.6.2
editdistance==0.8.1
einops==0.8.1
exceptiongroup==1.2.2
executing==2.2.0
fastapi==0.115.8
filelock==3.17.0
Flask==3.1.0
flask-cors==5.0.1
Flask-Login==0.6.3
flatbuffers==25.2.10
fonttools==4.56.0
frozenlist==1.5.0
fsspec==2024.6.1
gevent==24.11.1
geventhttpclient==2.3.3
greenlet==3.1.1
grpcio==1.70.0
h11==0.14.0
huggingface-hub==0.23.2
humanfriendly==10.0
hydra-core==1.3.2
HyperPyYAML==1.2.2
idna==3.10
intervaltree==3.1.0
ipython==8.27.0
itsdangerous==2.2.0
jedi==0.19.2
Jinja2==3.1.5
jiwer==3.0.4
joblib==1.4.2
julius==0.2.7
kiwisolver==1.4.8
lazy_loader==0.4
lhotse==1.27.0
librosa==0.10.2.post1
lightning==2.5.0.post0
lightning-utilities==0.12.0
lilcom==1.8.0
llvmlite==0.44.0
locust==2.33.0
Mako==1.3.9
Markdown==3.7
markdown-it-py==3.0.0
MarkupSafe==3.0.2
matplotlib==3.10.0
matplotlib-inline==0.1.7
mdurl==0.1.2
mpmath==1.3.0
msgpack==1.1.0
multidict==6.1.0
multiprocess==0.70.16
nemo_toolkit @ git+https://github.com/AI4Bharat/NeMo@0a1560e398ee97dd3ff17b495d05cad31938cef0
networkx==3.4.2
numba==0.61.0
numpy==1.26.4
omegaconf==2.3.0
onnx==1.17.0
onnxruntime==1.19.0
optuna==4.2.1
packaging==24.2
pandas==2.2.2
parso==0.8.4
pexpect==4.9.0
pillow==11.1.0
platformdirs==4.3.6
pooch==1.8.2
primePy==1.3
prompt_toolkit==3.0.50
propcache==0.3.0
protobuf==5.29.3
psutil==7.0.0
ptyprocess==0.7.0
pure_eval==0.2.3
pyannote.audio==3.3.1
pyannote.core==5.0.0
pyannote.database==5.1.3
pyannote.metrics==3.2.1
pyannote.pipeline==3.0.1
pyarrow==19.0.1
pycparser==2.22
pydantic==2.10.6
pydantic_core==2.27.2
pydub==0.25.1
Pygments==2.19.1
pyparsing==3.2.1
python-dateutil==2.9.0.post0
python-multipart==0.0.20
pytorch-lightning==2.4.0
pytorch-metric-learning==2.8.1
pytz==2025.1
PyYAML==6.0.2
pyzmq==26.2.1
RapidFuzz==3.12.1
regex==2024.11.6
requests==2.32.3
rich==13.9.4
ruamel.yaml==0.18.10
ruamel.yaml.clib==0.2.12
safetensors==0.5.2
scikit-learn==1.6.1
scipy==1.15.2
semver==3.0.4
sentencepiece==0.2.0
shellingham==1.5.4
six==1.17.0
sniffio==1.3.1
sortedcontainers==2.4.0
soundfile==0.13.1
soxr==0.5.0.post1
speechbrain==1.0.2
SQLAlchemy==2.0.38
stack-data==0.6.3
starlette==0.45.3
sympy==1.13.1
tabulate==0.9.0
tensorboard==2.19.0
tensorboard-data-server==0.7.2
tensorboardX==2.6.2.2
text-unidecode==1.3
threadpoolctl==3.5.0
tokenizers==0.19.1
tomli==2.2.1
toolz==1.0.0
torch==2.6.0
torch-audiomentations==0.12.0
torch_pitch_shift==1.2.5
torchaudio==2.6.0
torchmetrics==1.6.1
tqdm==4.66.5
traitlets==5.14.3
transformers==4.40.0
triton==3.2.0
typer==0.15.1
typing_extensions==4.12.2
tzdata==2025.1
urllib3==2.3.0
uvicorn==0.34.0
wcwidth==0.2.13
webdataset==0.2.100
Werkzeug==3.1.3
wget==3.2
wrapt==1.17.2
xxhash==3.5.0
yarl==1.18.3
zope.event==5.0
zope.interface==7.2
samples/kannada_sample_1.wav
ADDED
Binary file (157 kB)
samples/kannada_sample_2.wav
ADDED
Binary file (378 kB)
server-setup.sh
ADDED
@@ -0,0 +1,17 @@
sudo apt-get update -y
sudo apt-get upgrade -y

sudo apt-get install -y python3-venv
sudo apt-get install -y python3-pip
sudo apt-get install -y ffmpeg
sudo apt install net-tools -y


python3 -m venv venv
source venv/bin/activate

pip install -r requirements.txt

#cd src/asr_indic_server
python src/asr_api.py
src/asr_api.py
ADDED
@@ -0,0 +1,307 @@
import torch
import nemo.collections.asr as nemo_asr
from fastapi import FastAPI, File, UploadFile, HTTPException, Query
from fastapi.responses import RedirectResponse
from fastapi.responses import JSONResponse
from pydantic import BaseModel
from pydub import AudioSegment
import os
import tempfile
import subprocess
import asyncio
import io
import logging
from logging.handlers import RotatingFileHandler
from time import time
from typing import List
import argparse
import uvicorn

# Configure logging with log rotation
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        RotatingFileHandler("transcription_api.log", maxBytes=10*1024*1024, backupCount=5),  # 10MB per file, keep 5 backup files
        logging.StreamHandler()  # This will also print logs to the console
    ]
)

class ASRModelManager:
    def __init__(self, default_language="kn", device_type="cuda"):
        self.default_language = default_language
        self.device_type = device_type
        self.model_language = {
            "kannada": "kn",
            "hindi": "hi",
            "malayalam": "ml",
            "assamese": "as",
            "bengali": "bn",
            "bodo": "brx",
            "dogri": "doi",
            "gujarati": "gu",
            "kashmiri": "ks",
            "konkani": "kok",
            "maithili": "mai",
            "manipuri": "mni",
            "marathi": "mr",
            "nepali": "ne",
            "odia": "or",
            "punjabi": "pa",
            "sanskrit": "sa",
            "santali": "sat",
            "sindhi": "sd",
            "tamil": "ta",
            "telugu": "te",
            "urdu": "ur"
        }
        self.config_models = {
            "as": "ai4bharat/indicconformer_stt_as_hybrid_rnnt_large",
            "bn": "ai4bharat/indicconformer_stt_bn_hybrid_rnnt_large",
            "brx": "ai4bharat/indicconformer_stt_brx_hybrid_rnnt_large",
            "doi": "ai4bharat/indicconformer_stt_doi_hybrid_rnnt_large",
            "gu": "ai4bharat/indicconformer_stt_gu_hybrid_rnnt_large",
            "hi": "ai4bharat/indicconformer_stt_hi_hybrid_rnnt_large",
            "kn": "ai4bharat/indicconformer_stt_kn_hybrid_rnnt_large",
            "ks": "ai4bharat/indicconformer_stt_ks_hybrid_rnnt_large",
            "kok": "ai4bharat/indicconformer_stt_kok_hybrid_rnnt_large",
            "mai": "ai4bharat/indicconformer_stt_mai_hybrid_rnnt_large",
            "ml": "ai4bharat/indicconformer_stt_ml_hybrid_rnnt_large",
            "mni": "ai4bharat/indicconformer_stt_mni_hybrid_rnnt_large",
            "mr": "ai4bharat/indicconformer_stt_mr_hybrid_rnnt_large",
            "ne": "ai4bharat/indicconformer_stt_ne_hybrid_rnnt_large",
            "or": "ai4bharat/indicconformer_stt_or_hybrid_rnnt_large",
            "pa": "ai4bharat/indicconformer_stt_pa_hybrid_rnnt_large",
            "sa": "ai4bharat/indicconformer_stt_sa_hybrid_rnnt_large",
            "sat": "ai4bharat/indicconformer_stt_sat_hybrid_rnnt_large",
            "sd": "ai4bharat/indicconformer_stt_sd_hybrid_rnnt_large",
            "ta": "ai4bharat/indicconformer_stt_ta_hybrid_rnnt_large",
            "te": "ai4bharat/indicconformer_stt_te_hybrid_rnnt_large",
            "ur": "ai4bharat/indicconformer_stt_ur_hybrid_rnnt_large"
        }
        self.model = self.load_model(self.default_language)

    def load_model(self, language_id="kn"):
        model_name = self.config_models.get(language_id, self.config_models["kn"])
        model = nemo_asr.models.ASRModel.from_pretrained(model_name)

        device = torch.device(self.device_type if torch.cuda.is_available() and self.device_type == "cuda" else "cpu")
        model.freeze()  # inference mode
        model = model.to(device)  # transfer model to device

        return model

    def split_audio(self, file_path, chunk_duration_ms=15000):
        """
        Splits an audio file into chunks of specified duration if the audio duration exceeds the chunk duration.

        :param file_path: Path to the audio file.
        :param chunk_duration_ms: Duration of each chunk in milliseconds (default is 15000 ms or 15 seconds).
        """
        # Load the audio file
        audio = AudioSegment.from_file(file_path)

        # Get the duration of the audio in milliseconds
        duration_ms = len(audio)

        # Check if the duration is more than the specified chunk duration
        if duration_ms > chunk_duration_ms:
            # Calculate the number of chunks needed
            num_chunks = duration_ms // chunk_duration_ms
            if duration_ms % chunk_duration_ms != 0:
                num_chunks += 1

            # Split the audio into chunks
            chunks = [audio[i*chunk_duration_ms:(i+1)*chunk_duration_ms] for i in range(num_chunks)]

            # Create a directory to save the chunks
            output_dir = "audio_chunks"
            os.makedirs(output_dir, exist_ok=True)

            # Export each chunk to separate files
            chunk_file_paths = []
            for i, chunk in enumerate(chunks):
                chunk_file_path = os.path.join(output_dir, f"chunk_{i}.wav")
                chunk.export(chunk_file_path, format="wav")
                chunk_file_paths.append(chunk_file_path)
                print(f"Chunk {i} exported successfully to {chunk_file_path}.")

            return chunk_file_paths
        else:
            return [file_path]

app = FastAPI()
asr_manager = ASRModelManager()

# Define the response model
class TranscriptionResponse(BaseModel):
    text: str

class BatchTranscriptionResponse(BaseModel):
    transcriptions: List[str]

@app.post("/transcribe/", response_model=TranscriptionResponse)
async def transcribe_audio(file: UploadFile = File(...), language: str = Query(..., enum=list(asr_manager.model_language.keys()))):
    start_time = time()
    try:
        # Check file extension
        file_extension = file.filename.split(".")[-1].lower()
        if file_extension not in ["wav", "mp3"]:
            logging.warning(f"Unsupported file format: {file_extension}")
            raise HTTPException(status_code=400, detail="Unsupported file format. Please upload a WAV or MP3 file.")

        # Read the file content
        file_content = await file.read()

        # Convert MP3 to WAV if necessary
        if file_extension == "mp3":
            audio = AudioSegment.from_mp3(io.BytesIO(file_content))
        else:
            audio = AudioSegment.from_wav(io.BytesIO(file_content))

        # Check the sample rate of the WAV file
        sample_rate = audio.frame_rate

        # Convert WAV to the required format using ffmpeg if necessary
        if sample_rate != 16000:
            audio = audio.set_frame_rate(16000).set_channels(1)

        # Export the audio to a temporary WAV file
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
            audio.export(tmp_file.name, format="wav")
            tmp_file_path = tmp_file.name

        # Split the audio if necessary
        chunk_file_paths = asr_manager.split_audio(tmp_file_path)

        try:
            # Transcribe the audio
            language_id = asr_manager.model_language.get(language, asr_manager.default_language)

            if language_id != asr_manager.default_language:
                asr_manager.model = asr_manager.load_model(language_id)
                asr_manager.default_language = language_id

            asr_manager.model.cur_decoder = "rnnt"

            #with torch.amp.autocast('cuda', dtype=torch.bfloat16):
            #    rnnt_texts = asr_manager.model.transcribe(chunk_file_paths, batch_size=1, language_id=language_id)
            rnnt_texts = asr_manager.model.transcribe(chunk_file_paths, batch_size=1, language_id=language_id)

            # Flatten the list of transcriptions
            rnnt_text = " ".join([text for sublist in rnnt_texts for text in sublist])

            end_time = time()
            logging.info(f"Transcription completed in {end_time - start_time:.2f} seconds")
            return JSONResponse(content={"text": rnnt_text})
        except subprocess.CalledProcessError as e:
            logging.error(f"FFmpeg conversion failed: {str(e)}")
            raise HTTPException(status_code=500, detail=f"FFmpeg conversion failed: {str(e)}")
        except Exception as e:
            logging.error(f"An error occurred during processing: {str(e)}")
            raise HTTPException(status_code=500, detail=f"An error occurred during processing: {str(e)}")
        finally:
            # Clean up temporary files
            for chunk_file_path in chunk_file_paths:
                if os.path.exists(chunk_file_path):
                    os.remove(chunk_file_path)
    except HTTPException as e:
        logging.error(f"HTTPException: {str(e)}")
        raise e
    except Exception as e:
        logging.error(f"An unexpected error occurred: {str(e)}")
        raise HTTPException(status_code=500, detail=f"An unexpected error occurred: {str(e)}")

@app.get("/")
async def home():
    return RedirectResponse(url="/docs")

@app.post("/transcribe_batch/", response_model=BatchTranscriptionResponse)
async def transcribe_audio_batch(files: List[UploadFile] = File(...), language: str = Query(..., enum=list(asr_manager.model_language.keys()))):
    start_time = time()
    tmp_file_paths = []
    transcriptions = []
    try:
        for file in files:
            # Check file extension
            file_extension = file.filename.split(".")[-1].lower()
            if file_extension not in ["wav", "mp3"]:
                logging.warning(f"Unsupported file format: {file_extension}")
                raise HTTPException(status_code=400, detail="Unsupported file format. Please upload WAV or MP3 files.")

            # Read the file content
            file_content = await file.read()

            # Convert MP3 to WAV if necessary
            if file_extension == "mp3":
                audio = AudioSegment.from_mp3(io.BytesIO(file_content))
            else:
                audio = AudioSegment.from_wav(io.BytesIO(file_content))

            # Check the sample rate of the WAV file
            sample_rate = audio.frame_rate

            # Convert WAV to the required format using ffmpeg if necessary
            if sample_rate != 16000:
                audio = audio.set_frame_rate(16000).set_channels(1)

            # Export the audio to a temporary WAV file
            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
                audio.export(tmp_file.name, format="wav")
                tmp_file_path = tmp_file.name

            # Split the audio if necessary
            chunk_file_paths = asr_manager.split_audio(tmp_file_path)
            tmp_file_paths.extend(chunk_file_paths)

        logging.info(f"Temporary file paths: {tmp_file_paths}")
        try:
            # Transcribe the audio files in batch
            language_id = asr_manager.model_language.get(language, asr_manager.default_language)

            if language_id != asr_manager.default_language:
                asr_manager.model = asr_manager.load_model(language_id)
                asr_manager.default_language = language_id

            asr_manager.model.cur_decoder = "rnnt"

            #with torch.amp.autocast('cuda', dtype=torch.bfloat16):
            #    rnnt_texts = asr_manager.model.transcribe(tmp_file_paths, batch_size=len(files), language_id=language_id)
            rnnt_texts = asr_manager.model.transcribe(tmp_file_paths, batch_size=len(files), language_id=language_id)

            logging.info(f"Raw transcriptions from model: {rnnt_texts}")
            end_time = time()
            logging.info(f"Transcription completed in {end_time - start_time:.2f} seconds")

            # Flatten the list of transcriptions
            transcriptions = [text for sublist in rnnt_texts for text in sublist]
        except subprocess.CalledProcessError as e:
            logging.error(f"FFmpeg conversion failed: {str(e)}")
            raise HTTPException(status_code=500, detail=f"FFmpeg conversion failed: {str(e)}")
        except Exception as e:
            logging.error(f"An error occurred during processing: {str(e)}")
            raise HTTPException(status_code=500, detail=f"An error occurred during processing: {str(e)}")
        finally:
            # Clean up temporary files
            for tmp_file_path in tmp_file_paths:
                if os.path.exists(tmp_file_path):
                    os.remove(tmp_file_path)
    except HTTPException as e:
        logging.error(f"HTTPException: {str(e)}")
        raise e
    except Exception as e:
        logging.error(f"An unexpected error occurred: {str(e)}")
        raise HTTPException(status_code=500, detail=f"An unexpected error occurred: {str(e)}")

    return JSONResponse(content={"transcriptions": transcriptions})

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Run the FastAPI server for ASR.")
    parser.add_argument("--port", type=int, default=8888, help="Port to run the server on.")
    parser.add_argument("--language", type=str, default="kn", help="Default language for the ASR model.")
    parser.add_argument("--host", type=str, default="0.0.0.0", help="Host to run the server on.")
    parser.add_argument("--device", type=str, default="cuda", help="Device type to run the model on (cuda or cpu).")
    args = parser.parse_args()

    # Note: this rebinds the manager created at module import time, so the default
    # model is loaded twice on startup; the CLI-configured instance replaces the global.
    asr_manager = ASRModelManager(default_language=args.language, device_type=args.device)
    uvicorn.run(app, host=args.host, port=args.port)
src/hf_asr.py
ADDED
@@ -0,0 +1,18 @@
import torch
import nemo.collections.asr as nemo_asr

model = nemo_asr.models.ASRModel.from_pretrained("ai4bharat/indicconformer_stt_kn_hybrid_rnnt_large")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.freeze()  # inference mode
model = model.to(device)  # transfer model to device

'''
model.cur_decoder = "ctc"
ctc_text = model.transcribe(['kannada_query_infer.wav'], batch_size=1, logprobs=False, language_id='kn')[0]
print(ctc_text)
'''

model.cur_decoder = "rnnt"
rnnt_text = model.transcribe(['kannada_query_infer.wav'], batch_size=1, language_id='kn')[0]
print(rnnt_text)
src/hf_asr_advanced.py
ADDED
@@ -0,0 +1,42 @@
import torch
import nemo.collections.asr as nemo_asr
import time
import argparse

def load_model(model_name, device):
    model = nemo_asr.models.ASRModel.from_pretrained(model_name)
    model.freeze()  # inference mode
    model = model.to(device)  # transfer model to device
    return model

def transcribe_audio(model, audio_file, batch_size, language_id, decoder_type):
    model.cur_decoder = decoder_type
    transcribed_text = model.transcribe([audio_file], batch_size=batch_size, language_id=language_id)[0]
    return transcribed_text

def measure_execution_time(model, audio_file, batch_size, language_id, decoder_type):
    start_time = time.time()
    transcribed_text = transcribe_audio(model, audio_file, batch_size, language_id, decoder_type)
    end_time = time.time()
    execution_time = end_time - start_time
    return transcribed_text, execution_time

def main(device_type):
    model_name = "ai4bharat/indicconformer_stt_kn_hybrid_rnnt_large"
    audio_file = 'kannada_query_infer.wav'
    batch_size = 1
    language_id = 'kn'
    decoder_type = "rnnt"

    device = torch.device(device_type if torch.cuda.is_available() and device_type == "cuda" else "cpu")
    model = load_model(model_name, device)
    transcribed_text, execution_time = measure_execution_time(model, audio_file, batch_size, language_id, decoder_type)

    print(f"Execution time on {device_type}: {execution_time:.4f} seconds")
    print(f"Transcribed text: {transcribed_text}")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Transcribe audio using ASR model.")
    parser.add_argument("--device", type=str, default="cpu", choices=["cpu", "cuda"], help="Device type to use for inference (cpu or cuda).")
    args = parser.parse_args()
    main(args.device)
src/nemo_asr.py
ADDED
@@ -0,0 +1,52 @@
model_path = "kannada.nemo"
lang_id = "kn"

import torch
import soundfile as sf
import nemo.collections.asr as nemo_asr

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = nemo_asr.models.EncDecCTCModel.restore_from(restore_path=model_path)
model.eval()  # inference mode
model = model.to(device)

'''
model.cur_decoder = "ctc"
ctc_text = model.transcribe(['kannada_query_infer.wav'], batch_size=1, logprobs=False, language_id=lang_id)[0]
print(ctc_text)
'''
model.cur_decoder = "rnnt"
rnnt_text = model.transcribe(['kannada_query_infer.wav'], batch_size=1, logprobs=False, language_id=lang_id)[0]
print(rnnt_text)


'''
import time

# Start timing for CTC decoder
start_time_ctc = time.time()

model.cur_decoder = "ctc"
ctc_text = model.transcribe(['kannada_query_infer.wav'], batch_size=1, logprobs=False, language_id=lang_id)[0]
print(ctc_text)

end_time_ctc = time.time()
ctc_duration = end_time_ctc - start_time_ctc
print(f"CTC transcription took {ctc_duration:.4f} seconds")

# Start timing for RNNT decoder
start_time_rnnt = time.time()

model.cur_decoder = "rnnt"
rnnt_text = model.transcribe(['kannada_query_infer.wav'], batch_size=1, logprobs=False, language_id=lang_id)[0]
print(rnnt_text)

end_time_rnnt = time.time()
rnnt_duration = end_time_rnnt - start_time_rnnt
print(f"RNNT transcription took {rnnt_duration:.4f} seconds")

# Calculate and print the speed difference
speed_difference = rnnt_duration - ctc_duration
print(f"Speed difference: {speed_difference:.4f} seconds")

'''
src/test/api_device_performance_test.py
ADDED
@@ -0,0 +1,57 @@
import pytest
import torch
import nemo.collections.asr as nemo_asr
from fastapi.testclient import TestClient
from src.asr_api import app, ASRModelManager
import time

# Initialize the FastAPI test client
client = TestClient(app)

# Mock audio file paths for testing
AUDIO_FILE_PATH_WAV = "path/to/your/test_audio.wav"
AUDIO_FILE_PATH_MP3 = "path/to/your/test_audio.mp3"

# Function to measure the time taken for a request
def measure_time(func, *args, **kwargs):
    start_time = time.time()
    result = func(*args, **kwargs)
    end_time = time.time()
    return result, end_time - start_time

# Test case for CUDA mode
def test_transcribe_audio_cuda():
    asr_manager = ASRModelManager(default_language="kn", device_type="cuda")
    # Note: the endpoints read the module-level asr_manager directly, so this
    # override is illustrative; it does not swap the manager used by /transcribe/.
    app.dependency_overrides[ASRModelManager] = lambda: asr_manager

    with open(AUDIO_FILE_PATH_WAV, "rb") as audio_file:
        response, duration = measure_time(
            client.post,
            "/transcribe/",
            files={"file": ("test_audio.wav", audio_file, "audio/wav")},
            params={"language": "kannada"}
        )

    assert response.status_code == 200
    assert "text" in response.json()
    print(f"CUDA mode transcription time: {duration:.2f} seconds")

# Test case for CPU mode
def test_transcribe_audio_cpu():
    asr_manager = ASRModelManager(default_language="kn", device_type="cpu")
    app.dependency_overrides[ASRModelManager] = lambda: asr_manager

    with open(AUDIO_FILE_PATH_WAV, "rb") as audio_file:
        response, duration = measure_time(
            client.post,
            "/transcribe/",
            files={"file": ("test_audio.wav", audio_file, "audio/wav")},
            params={"language": "kannada"}
        )

    assert response.status_code == 200
    assert "text" in response.json()
    print(f"CPU mode transcription time: {duration:.2f} seconds")

if __name__ == "__main__":
    pytest.main()
src/test/hf_performance_test.py
ADDED
@@ -0,0 +1,25 @@
import subprocess
import time
import torch  # required for the CUDA availability check in main()

def run_transcription(device_type):
    start_time = time.time()
    # hf_asr_advanced.py is the script in this repo that accepts a --device flag
    result = subprocess.run(["python", "src/hf_asr_advanced.py", "--device", device_type], capture_output=True, text=True)
    end_time = time.time()
    execution_time = end_time - start_time
    transcribed_text = result.stdout.split("\n")[-2]  # Assuming the transcribed text is the second last line
    return transcribed_text, execution_time

def main():
    # Measure execution time for CUDA
    if torch.cuda.is_available():
        cuda_text, cuda_time = run_transcription("cuda")
        print(f"CUDA execution time: {cuda_time:.4f} seconds")
        print(f"Transcribed text (CUDA): {cuda_text}")

    # Measure execution time for CPU
    cpu_text, cpu_time = run_transcription("cpu")
    print(f"CPU execution time: {cpu_time:.4f} seconds")
    print(f"Transcribed text (CPU): {cpu_text}")

if __name__ == "__main__":
    main()
src/test/locustfile.py
ADDED
@@ -0,0 +1,39 @@
from locust import HttpUser, task, between
import os

class TranscribeUser(HttpUser):
    wait_time = between(1, 5)  # Wait time between tasks

    @task
    def transcribe_audio(self):
        audio_file_path = "./../../kannada_sample_1.wav"
        with open(audio_file_path, 'rb') as audio_file:
            files = {'file': ('kannada_query_infer.wav', audio_file, 'audio/x-wav')}
            headers = {
                'accept': 'application/json'
            }
            # Use a relative path so --host applies; the endpoint requires a language query parameter
            response = self.client.post("/transcribe/", params={"language": "kannada"}, files=files, headers=headers)
            if response.status_code == 200:
                print("Success:", response.json())
            else:
                print("Failed:", response.status_code, response.text)

    @task
    def transcribe_batch(self):
        batch_files = [
            "./../../kannada_sample_1.wav",
            "./../../kannada_sample_2.wav"
        ]
        files = []
        # Read the bytes up front so the handles are not closed before the request is sent;
        # the form field must be named "files" to match the endpoint's parameter.
        for i, file_path in enumerate(batch_files):
            with open(file_path, 'rb') as audio_file:
                files.append(('files', (f'kannada_query_infer_{i}.wav', audio_file.read(), 'audio/x-wav')))

        headers = {
            'accept': 'application/json'
        }
        response = self.client.post("/transcribe_batch/", params={"language": "kannada"}, files=files, headers=headers)
        if response.status_code == 200:
            print("Batch Success:", response.json())
        else:
            print("Batch Failed:", response.status_code, response.text)