sachin committed
Commit 6a2d9d9 · Parent(s): 436af15

add cpu mode
Dockerfile ADDED
@@ -0,0 +1,28 @@
+ FROM nvidia/cuda:12.6.0-cudnn-devel-ubuntu22.04
+
+ WORKDIR /app
+
+ RUN apt-get update && apt-get install -y \
+     python3 \
+     python3-pip \
+     git \
+     ffmpeg \
+     sudo \
+     wget \
+     && ln -s /usr/bin/python3 /usr/bin/python \
+     && rm -rf /var/lib/apt/lists/*
+
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ COPY . .
+
+ RUN useradd -ms /bin/bash appuser \
+     && chown -R appuser:appuser /app
+
+ USER appuser
+
+ EXPOSE 7860
+
+ # Use absolute path for clarity
+ CMD ["python", "/app/src/asr_api.py", "--host", "0.0.0.0", "--port", "7860", "--device", "cuda"]
Dockerfile.cpu ADDED
@@ -0,0 +1,28 @@
+ FROM ubuntu:22.04
+
+ WORKDIR /app
+
+ RUN apt-get update && apt-get install -y \
+     python3 \
+     python3-pip \
+     git \
+     ffmpeg \
+     sudo \
+     wget \
+     && ln -s /usr/bin/python3 /usr/bin/python \
+     && rm -rf /var/lib/apt/lists/*
+
+ COPY cpu-requirements.txt .
+ RUN pip install --no-cache-dir -r cpu-requirements.txt
+
+ COPY . .
+
+ RUN useradd -ms /bin/bash appuser \
+     && chown -R appuser:appuser /app
+
+ USER appuser
+
+ EXPOSE 7860
+
+ # Use absolute path for clarity; CPU image runs the server with --device cpu
+ CMD ["python", "/app/src/asr_api.py", "--host", "0.0.0.0", "--port", "7860", "--device", "cpu"]
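A minimal sketch of building and running the CPU image (the image tag is an assumed example, not defined in the repo):

    docker build -f Dockerfile.cpu -t asr-indic-server:cpu .
    docker run -p 7860:7860 asr-indic-server:cpu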
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2025 Sachin Shetty
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
compose.yaml ADDED
@@ -0,0 +1,16 @@
+ services:
+   asr-indic-server:
+     image: slabstech/asr_indic_server
+     volumes:
+       - ~/.cache/huggingface:/root/.cache/huggingface
+     ports:
+       - 8000:8000
+     environment:
+       - LANGUAGE=kn
+     deploy:
+       resources:
+         reservations:
+           devices:
+             - driver: nvidia
+               device_ids: ['0']
+               capabilities: [gpu]
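The GPU reservation above requires the NVIDIA Container Toolkit on the host; assuming that is installed, the service starts with:

    docker compose up -d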
docs/api_endpoints.md ADDED
@@ -0,0 +1,3 @@
+ API endpoints
+
+ - Sarvam batch ASR - https://github.com/sarvamai/sarvam-ai-cookbook/blob/main/notebooks/stt/stt-batch-api/Sarvam_STT_Batch_API_Demo.ipynb
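+
+ For this repo's own server (src/asr_api.py), a single-file request looks like the sketch below; the port follows the compose.yaml mapping and the sample file is from this commit:
+
+ ```bash
+ curl -X POST "http://localhost:8000/transcribe/?language=kannada" \
+   -H "accept: application/json" \
+   -F "file=@samples/kannada_sample_1.wav;type=audio/x-wav"
+ ```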
docs/kannada_sample_3_out.md ADDED
@@ -0,0 +1,3 @@
+ {
+ "text": " ಹೋ ನಾವಾಡುವ ನುಡಿಗೆ ಕನ್ನಡ ನುಡಿ ಚಿನ್ನದ ನುಡಿ ಸಿರಿಗನ್ನಡದ ನುಡಿ ನಾವಿರುವ ತಾಣವೆ ಗಂಧದ ಗುಡಿ ಅಂದದ ಗುಡಿ ಚಂದದ ಗುಡಿ ನವಾಡುವ ನುಡಿಯೇ ಕನ್ನಡ ನುಡಿಯೇ ನಾವಿರುವ ತಾಣವೆ ಗಂಧದ ಗುಡಿ ಅಂಧದ ಗುಡಿ ಗಂಧದ ಗುಡಿ ಚಂದದ ಗುಡಿ ಶ್ರೀ ಗಂಧಿ ಯದಗುಡಿ ಹಾಕಿದರು ಹಸುರಿನ ಬಣ್ಣ ಸಿರಿಗೆ ಒಲಿದು ಸೌಂದರ್ಯ ಸರಸ್ವತಿ ಧರೆಗಿಳಿದು ಅಹ್ ಹಾ ಹೊಲಜು ಬನದಿ ಜಲ ಹೂ ಬನದಲ್ಲಿ ನಲಿಯುತವೋ ಲಾಡಿ ಚಲುವಿನ ಬಲಜಾ ಬಿ ಈ ಕಂಧದ ಗುಡಿಗೆಲಿ ನೆಲೆಸಿದಳು ಇದು ಯಾರ ತಪಸ್ಸಿನ ಫಲವೋ ಈ ಕಂಗಳು ಮಾಡಿದ ಪುಣ್ಯವೋಹ ನಾವಿಲ್ಲ ತಾಣವೆ ಗಂಧಬೆಗುಡಿ ಹ ಹಿಂಬುತ ಓಡಿ ವೆಜಿಂಕೆಗಳು ಕುಣಿದು ಆಡುತ್ತಲಿದಿವೆ ನವಿಲುಗಳು ಹಾ ಮುಗಿಲನ್ನು ಚುಂಬಿಸುವ ಸೆಲಿ ತೂಗಾಡುತ್ತ ನಿಂತು ಮರಗಳಲಿ ಕಾಡು ತಿರೇ ಬಾನಾಡಿಗಳು ಎದೆಯಲ್ಲಿ ಸಂತಸ ದಾವನಲು ಇದು ಒನ್ಯ ಮೃಗಗಳ ಲೋಕವೋ ಈ ಭೂಮಿಗೆ ಇಳಿದ ನಾಕವೋ ನವಾಡುವ ನುಡಿಗೆ ಕನ್ನಡ ನುಡಿ ನಾವಿರುವ ತಾಣವೆ ಗಂಧದ ಗುಡಿ ಅಂದದ ಗುಡಿ ಗಂಧದ ದುಡಿ ಚಂದದ ಗುಡಿ ಶ್ರೀ ದಂತದ ಗುಡಿ ಅಹ್ ಹ್ಞೂ ಹೋ ನಾವಾಡುವ ನುಡಿಗೆ ಕನ್ನಡ ನುಡಿ ಚಿನ್ನದ ನುಡಿ ಸಿರಿಗನ್ನಡದ ನುಡಿ ನಾವಿರುವ ತಾಣವೆ ಗಂಧದ ಗುಡಿ ಅಂದದ ಗುಡಿ ಚಂದದ ಗುಡಿ ನವಾಡುವ ನುಡಿಯೇ ಕನ್ನಡ ನುಡಿಯೇ ನಾವಿರುವ ತಾಣವೆ ಗಂಧದ ಗುಡಿ ಅಂಧದ ಗುಡಿ ಗಂಧದ ಗುಡಿ ಚಂದದ ಗುಡಿ ಶ್ರೀ ಗಂಧಿ ಯದಗುಡಿ ಹಾಕಿದರು ಹಸುರಿನ ಬಣ್ಣ ಸಿರಿಗೆ ಒಲಿದು ಸೌಂದರ್ಯ ಸರಸ್ವತಿ ಧರೆಗಿಳಿದು ಅಹ್ ಹಾ ಹೊಲಜು ಬನದಿ ಜಲ ಹೂ ಬನದಲ್ಲಿ ನಲಿಯುತವೋ ಲಾಡಿ ಚಲುವಿನ ಬಲಜಾ ಬಿ ಈ ಕಂಧದ ಗುಡಿಗೆಲಿ ನೆಲೆಸಿದಳು ಇದು ಯಾರ ತಪಸ್ಸಿನ ಫಲವೋ ಈ ಕಂಗಳು ಮಾಡಿದ ಪುಣ್ಯವೋಹ ನಾವಿಲ್ಲ ತಾಣವೆ ಗಂಧಬೆಗುಡಿ ಹ ಹಿಂಬುತ ಓಡಿ ವೆಜಿಂಕೆಗಳು ಕುಣಿದು ಆಡುತ್ತಲಿದಿವೆ ನವಿಲುಗಳು ಹಾ ಮುಗಿಲನ್ನು ಚುಂಬಿಸುವ ಸೆಲಿ ತೂಗಾಡುತ್ತ ನಿಂತು ಮರಗಳಲಿ ಕಾಡು ತಿರೇ ಬಾನಾಡಿಗಳು ಎದೆಯಲ್ಲಿ ಸಂತಸ ದಾವನಲು ಇದು ಒನ್ಯ ಮೃಗಗಳ ಲೋಕವೋ ಈ ಭೂಮಿಗೆ ಇಳಿದ ನಾಕವೋ ನವಾಡುವ ನುಡಿಗೆ ಕನ್ನಡ ನುಡಿ ನಾವಿರುವ ತಾಣವೆ ಗಂಧದ ಗುಡಿ ಅಂದದ ಗುಡಿ ಗಂಧದ ದುಡಿ ಚಂದದ ಗುಡಿ ಶ್ರೀ ದಂತದ ಗುಡಿ ಅಹ್ ಹ್ಞೂ"
+ }
docs/kannada_sample_4_out.md ADDED
@@ -0,0 +1,3 @@
+ {
+ "text": " ನನ್ನ ಆಶೀರ್ವಾದ ಸಾಕಾರ ಇದ್ದೇ ಐತೆ ಅದು ಬಿಡು ಅಲ ನೀ ಕೈ ಹಾಕೊಂಡಿರೋ ಕೆಲಸಗೆ ಉದ್ಧಾರ ಆಕ್ಯ ಅನ್ನೋ ನಂಬಿಕೆ ನಂಗ್ ಎಳ್ಳಷ್ಟು ಇಲ್ವಲಪ್ಪು ಮಾರಾಯ ಇದಾಗದ ಕೆಲಸ ಇದ ಆಗದು ಎಂದು ಕೈಲಾಗದು ಎಂದು ಕೈ ಕಟ್ಟಿ ಕುಳಿತರೆ ಆಗದು ಎಂದು ಕೈ ಕಟ್ಟಿ ಕುಳಿತರೆ ಸಾಗದು ಕೆಲಸ ಒಂದು ಮುಂದೆ ಮನಸೊಂದಿ ಕೆಚ್ಚದೆ ಇರಬೇಕೆಂದೆಂದು ಆಗದು ಎಂದು ಕೈಲಾಗದು ಎಂದು ಕೈ ಕಟ್ಟಿ ಕುಳಿತರೆ ಸಾಗದು ಕೆಲಸವು ಮುಂದೆ ಸಾಗದು ಕೆಲಸವು ಮುಂದೆ ಕೆತ್ತಲಾಗದು ಕಗ್ಗಲ್ಲೆಂದು ಎದೆಗುಂದಿ ಕರೆ ಶಿಲ್ಪಿ ಕೆತ್ತಲಾಗದು ಕಗ್ಗಲ್ಲೆಂದು ಎದೆಗುಂದಿ ಕರೆ ಶಿಲ್ಪಿ ಆಗುತ್ತಿದ್ದೇವೆ ತಲೆಗಳ ಬೀಡು ತಮ್ಮ ಟೆ ಚನಮೆಲೆ ನಾಡು ಬೇಲೂ ಹಳೆಬೀಡು ಬೇಲೂ ಹಳೆಬೀಡು ಕೆತ್ತಲಾಗದು ಕಗ್ಗಲ್ಲೆಂದು ಎದೆಗುಂದಿದ್ದರೆ ಶೀಪೀ ಆಗುತ್ತಿದ್ದೇ ಕಲೆಗಳ ಬೀಡು ಗೊಮ್ಮಟೆ ದೇಶನ ನೆಲೆನಾಡು ಬೇಲೂರು ಹಳೆ ಬೀಡು ಬೇಲೂರು ಹಳೆ ಬೀಡು ಸಾಗದು ಎಂದು ಕೈಲಾಗದು ಎಂದು ಕೈ ಕಟ್ಟಿ ಕುಳಿತರೆ ಸಾಗದು ಕೆಲಸವು ಮುಂದೆ ಸಾಗದು ಕೆಲಸವು ಮುಂದೆ ಕಾದೇರಿಯನು ಹರಿಯಲು ಬಿಟ್ಟು ಕಾದೇರಿಯನು ಹರಿಯಲು ಬಿಟ್ಟು ವಿಶೇಷ ರೈ ಶ್ರಮ ಪಡದಿದ್ದರೇ ತನ್ನ ಪಾಡಿಯ ಪಟ್ಟದಿದ್ದರೇ ಕಾದೇರಿಯನ್ನು ಹರಿಯಲು ಬಿಟ್ಟು ವಿಶೇಷತೆಯ ಶ್ರಮ ಪಡೆದಿದ್ದರೇ ತನ್ನಂಬ ಹಡಿಯ ಕಟ್ಟತೆ ಬಂಗಾರ ಬೆಳೆವ ಹೊನ್ನಾಡು ಆ ಬಂಗಾರ ಬೆಳೆವ ಹೊನ್ನಾಡು ಆಗುತ್ತಿದ್ದೇ ನಾಡು ಕನ್ನಡ ಸಿರಿ ನಾಡು ನಮ್ಮ ಕನ್ನಡ ಸಿರಿ ನಾಡು ಆಗದು ಎಂದು ಕೈ ಸಾಗದು ಎಂತು ಕೈ ಕಟ್ಟಿ ಕುಳಿತರೆ ಸಾಗದು ಕೆಲಸವು ನುಂಬೆ ಸಾಗದು ಕೆಲಸವು ನುಂಬೆ ಕೈ ಕೆಸರಾದರೆ ಬಾಯಿ ಮೊಸದೆಂಬ ಹೀಗೆ ಹಿರಿಯರ ಅನುಭವ ಸತ್ಯ ಇದ ನೆನಪಿಡದೆ ಮುನಿತ್ಯ ಕೈ ಕೆಸರಾದರೆ ಬಾಯ್ಮಸದೆಂಬ ಹಿರಿಯರ ಅನುಭವ ಸತ್ಯ ಇದು ನೆನಪಿಡಬೇಕು ನಿತ್ಯ ದುಡಿಮೆಯ ನಂಬಿ ಬದುಕ ದುಡಿಮೆಯ ನಂಬಿ ಬದುಕು ಅದರಲ್ಲಿ ದೇವರ ಹುಡುಕು ಬಾಳಲಿ ಬದುಬುದು ಬೆಳಕು ನಮ್ಮ ಬಾಳು ಕೈಲಾಗದು ಎಂದು ಕೈ ಕಟ್ಟಿ ಕುಳಿತರೆ ಸಾಗದು ಕೆಲಸವು ಮುಂದೆ ಮನಸೊಂದಿದ್ದರೆ ಆಗವು ಉಂಟು ಕೆಚ್ಚೆದೆಗಿರಬೇಕೆಂದು ಕೆಚ್ಚೆದೆಗಿರಬೇಕು ೆಂದು ಆಗದು ಎಂದು ಕೈಲಾಗದು ಎಂದು ಕೈ ಕಟ್ಟಿ ಕುಳಿತರೆ ಸಾಗದು ಕೆಲಸವು ಮುಂದೆಯೇ ಸಾಗದು ಕೆಲಸವು ಮುಂದೆ ನನ್ನ ಆಶೀರ್ವಾದ ಸಾಕಾರ ಇದ್ದೇ ಐತೆ ಅದು ಬಿಡು ಅಲ ನೀ ಕೈ ಹಾಕೊಂಡಿರೋ ಕೆಲಸಗೆ ಉದ್ಧಾರ ಆಕ್ಯ ಅನ್ನೋ ನಂಬಿಕೆ ನಂಗ್ ಎಳ್ಳಷ್ಟು ಇಲ್ವಲಪ್ಪು ಮಾರಾಯ ಇದಾಗದ ಕೆಲಸ ಇದ ಆಗದು ಎಂದು ಕೈಲಾಗದು ಎಂದು ಕೈ ಕಟ್ಟಿ ಕುಳಿತರೆ ಆಗದು ಎಂದು ಕೈ ಕಟ್ಟಿ ಕುಳಿತರೆ ಸಾಗದು ಕೆಲಸ ಒಂದು ಮುಂದೆ ಮನಸೊಂದಿ ಕೆಚ್ಚದೆ ಇರಬೇಕೆಂದೆಂದು ಆಗದು ಎಂದು ಕೈಲಾಗದು ಎಂದು ಕೈ ಕಟ್ಟಿ ಕುಳಿತರೆ ಸಾಗದು ಕೆಲಸವು ಮುಂದೆ ಸಾಗದು ಕೆಲಸವು ಮುಂದೆ ಕೆತ್ತಲಾಗದು ಕಗ್ಗಲ್ಲೆಂದು ಎದೆಗುಂದಿ ಕರೆ ಶಿಲ್ಪಿ ಕೆತ್ತಲಾಗದು ಕಗ್ಗಲ್ಲೆಂದು ಎದೆಗುಂದಿ ಕರೆ ಶಿಲ್ಪಿ ಆಗುತ್ತಿದ್ದೇವೆ ತಲೆಗಳ ಬೀ��ು ತಮ್ಮ ಟೆ ಚನಮೆಲೆ ನಾಡು ಬೇಲೂ ಹಳೆಬೀಡು ಬೇಲೂ ಹಳೆಬೀಡು ಕೆತ್ತಲಾಗದು ಕಗ್ಗಲ್ಲೆಂದು ಎದೆಗುಂದಿದ್ದರೆ ಶೀಪೀ ಆಗುತ್ತಿದ್ದೇ ಕಲೆಗಳ ಬೀಡು ಗೊಮ್ಮಟೆ ದೇಶನ ನೆಲೆನಾಡು ಬೇಲೂರು ಹಳೆ ಬೀಡು ಬೇಲೂರು ಹಳೆ ಬೀಡು ಸಾಗದು ಎಂದು ಕೈಲಾಗದು ಎಂದು ಕೈ ಕಟ್ಟಿ ಕುಳಿತರೆ ಸಾಗದು ಕೆಲಸವು ಮುಂದೆ ಸಾಗದು ಕೆಲಸವು ಮುಂದೆ ಕಾದೇರಿಯನು ಹರಿಯಲು ಬಿಟ್ಟು ಕಾದೇರಿಯನು ಹರಿಯಲು ಬಿಟ್ಟು ವಿಶೇಷ ರೈ ಶ್ರಮ ಪಡದಿದ್ದರೇ ತನ್ನ ಪಾಡಿಯ ಪಟ್ಟದಿದ್ದರೇ ಕಾದೇರಿಯನ್ನು ಹರಿಯಲು ಬಿಟ್ಟು ವಿಶೇಷತೆಯ ಶ್ರಮ ಪಡೆದಿದ್ದರೇ ತನ್ನಂಬ ಹಡಿಯ ಕಟ್ಟತೆ ಬಂಗಾರ ಬೆಳೆವ ಹೊನ್ನಾಡು ಆ ಬಂಗಾರ ಬೆಳೆವ ಹೊನ್ನಾಡು ಆಗುತ್ತಿದ್ದೇ ನಾಡು ಕನ್ನಡ ಸಿರಿ ನಾಡು ನಮ್ಮ ಕನ್ನಡ ಸಿರಿ ನಾಡು ಆಗದು ಎಂದು ಕೈ ಸಾಗದು ಎಂತು ಕೈ ಕಟ್ಟಿ ಕುಳಿತರೆ ಸಾಗದು ಕೆಲಸವು ನುಂಬೆ ಸಾಗದು ಕೆಲಸವು ನುಂಬೆ ಕೈ ಕೆಸರಾದರೆ ಬಾಯಿ ಮೊಸದೆಂಬ ಹೀಗೆ ಹಿರಿಯರ ಅನುಭವ ಸತ್ಯ ಇದ ನೆನಪಿಡದೆ ಮುನಿತ್ಯ ಕೈ ಕೆಸರಾದರೆ ಬಾಯ್ಮಸದೆಂಬ ಹಿರಿಯರ ಅನುಭವ ಸತ್ಯ ಇದು ನೆನಪಿಡಬೇಕು ನಿತ್ಯ ದುಡಿಮೆಯ ನಂಬಿ ಬದುಕ ದುಡಿಮೆಯ ನಂಬಿ ಬದುಕು ಅದರಲ್ಲಿ ದೇವರ ಹುಡುಕು ಬಾಳಲಿ ಬದುಬುದು ಬೆಳಕು ನಮ್ಮ ಬಾಳು ಕೈಲಾಗದು ಎಂದು ಕೈ ಕಟ್ಟಿ ಕುಳಿತರೆ ಸಾಗದು ಕೆಲಸವು ಮುಂದೆ ಮನಸೊಂದಿದ್ದರೆ ಆಗವು ಉಂಟು ಕೆಚ್ಚೆದೆಗಿರಬೇಕೆಂದು ಕೆಚ್ಚೆದೆಗಿರಬೇಕು ೆಂದು ಆಗದು ಎಂದು ಕೈಲಾಗದು ಎಂದು ಕೈ ಕಟ್ಟಿ ಕುಳಿತರೆ ಸಾಗದು ಕೆಲಸವು ಮುಂದೆಯೇ ಸಾಗದು ಕೆಲಸವು ಮುಂದೆ"
+ }
docs/load_testing.md ADDED
@@ -0,0 +1,6 @@
+ Load Test API
+
+
+ - pip install locust
+ - cd src/test
+ - locust --host http://localhost:8000 --headless --run-time 60s
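+
+ A fuller headless invocation might look like the sketch below; the user count and spawn rate are illustrative values, not from the repo:
+
+ ```bash
+ locust -f locustfile.py --host http://localhost:8000 --headless -u 10 -r 2 --run-time 60s
+ ```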
docs/optimisations_grok.md ADDED
@@ -0,0 +1,274 @@
+ Your code is a well-structured FastAPI application for transcribing audio files using NVIDIA NeMo ASR models, supporting multiple Indian languages and batch processing. It includes logging, audio preprocessing, and error handling, which are solid foundations for a production-ready service. However, several areas can still be improved to reduce latency, increase throughput (serve more users), and ensure production readiness. Below are suggested enhancements and the elements still missing for a robust production deployment.
+
+ ## Key Improvements for Latency and Scalability
+
+ ### 1. Optimize Model Inference
+
+ - **Mixed Precision**: Enable mixed-precision inference with torch.cuda.amp.autocast (currently commented out) to reduce latency on CUDA GPUs. This uses bfloat16 or float16, cutting computation time without significant accuracy loss.
+
+ ```python
+ with torch.cuda.amp.autocast(dtype=torch.bfloat16):
+     rnnt_texts = asr_manager.model.transcribe(chunk_file_paths, batch_size=1, language_id=language_id)
+ ```
+
+ - **CUDA Graphs**: If using NeMo 2.0.0+, enable CUDA Graphs for repeated inference calls to reduce kernel launch overhead. This requires a fixed input shape, so test compatibility with dynamic audio lengths.
+ - **Batch Size**: Dynamically adjust batch_size based on input size or GPU memory. Currently it is hardcoded (1 for single files, len(files) for batches). For larger batches, process in smaller chunks to avoid OOM errors:
+
+ ```python
+ batch_size = min(len(chunk_file_paths), 32)  # Cap at 32; adjust based on GPU memory
+ rnnt_texts = asr_manager.model.transcribe(chunk_file_paths, batch_size=batch_size, language_id=language_id)
+ ```
+
+ ### 2. Model Management
+
+ - **Preload Models**: Loading a new model on every language switch (in load_model) is slow and memory-intensive. Preload all required models at startup if memory allows, or use a caching mechanism:
+
+ ```python
+ class ASRModelManager:
+     def __init__(self, default_language="kn"):
+         self.models = {}
+         self.default_language = default_language
+         self.load_initial_model(default_language)
+
+     def load_initial_model(self, language_id):
+         model = self.load_model(language_id)
+         self.models[language_id] = model
+
+     def get_model(self, language_id):
+         if language_id not in self.models:
+             self.models[language_id] = self.load_model(language_id)
+         return self.models[language_id]
+ ```
+
+ Then update /transcribe/ and /transcribe_batch/ to use asr_manager.get_model(language_id) instead of reloading.
+
+ - **Model Sharing**: Ensure thread-safety when sharing models across requests. FastAPI runs async, so use a lock if multiple workers access the same model. Note the locks must be keyed by language ID (the values of model_language), since that is how get_model is called:
+
+ ```python
+ from threading import Lock
+
+ class ASRModelManager:
+     def __init__(self, default_language="kn"):
+         # One lock per language ID ("kn", "hi", ...)
+         self.model_locks = {lang_id: Lock() for lang_id in self.model_language.values()}
+         ...
+
+     async def transcribe(self, paths, language_id, batch_size):
+         with self.model_locks[language_id]:
+             model = self.get_model(language_id)
+             return model.transcribe(paths, batch_size=batch_size, language_id=language_id)
+ ```
+
+ ### 3. Audio Preprocessing
+
+ - **In-Memory Processing**: Avoid writing temporary files to disk (tempfile.NamedTemporaryFile) and splitting chunks to disk (split_audio). Process audio in memory to reduce I/O latency:
+
+ ```python
+ def split_audio_in_memory(self, audio_segment, chunk_duration_ms=15000):
+     duration_ms = len(audio_segment)
+     if duration_ms <= chunk_duration_ms:
+         return [audio_segment]
+     chunks = [audio_segment[i:i + chunk_duration_ms] for i in range(0, duration_ms, chunk_duration_ms)]
+     return chunks
+ ```
+
+ Modify /transcribe/ to:
+
+ ```python
+ audio_chunks = asr_manager.split_audio_in_memory(audio)
+ chunk_buffers = [io.BytesIO() for _ in audio_chunks]
+ for chunk, buffer in zip(audio_chunks, chunk_buffers):
+     chunk.export(buffer, format="wav")
+     buffer.seek(0)
+ rnnt_texts = asr_manager.model.transcribe(chunk_buffers, batch_size=len(chunk_buffers), language_id=language_id)
+ ```
+
+ - **Async Preprocessing**: Offload audio conversion (e.g., sample rate adjustment) to an async task or worker queue to free up the main thread.
+
+ ### 4. Async and Concurrency
+
+ - **Worker Queue**: For heavy loads, integrate a task queue (e.g., Celery with Redis) to handle transcription jobs asynchronously. This decouples preprocessing and inference from the HTTP response:
+
+ ```python
+ from celery import Celery
+
+ celery_app = Celery('asr', broker='redis://localhost:6379/0')
+
+ @celery_app.task
+ def transcribe_task(file_paths, language_id):
+     model = asr_manager.get_model(language_id)
+     return model.transcribe(file_paths, batch_size=len(file_paths), language_id=language_id)
+
+ @app.post("/transcribe_async/")
+ async def transcribe_async(file: UploadFile = File(...), language: str = Query(...)):
+     # Save file temporarily or process in memory, yielding tmp_file_path
+     task = transcribe_task.delay([tmp_file_path], asr_manager.model_language[language])
+     return {"task_id": task.id}
+ ```
+
+ - **Increase Workers**: Run FastAPI with multiple Uvicorn workers (uvicorn --workers 4) to handle concurrent requests, leveraging multiple CPU cores.
+
+ ### 5. FastAPI Performance
+
+ - **Response Streaming**: For long transcriptions, stream results back to the client instead of waiting for full processing:
+
+ ```python
+ from fastapi.responses import StreamingResponse
+
+ async def stream_transcriptions(chunk_file_paths, language_id):
+     model = asr_manager.get_model(language_id)
+     for chunk in chunk_file_paths:
+         text = model.transcribe([chunk], batch_size=1, language_id=language_id)[0]
+         yield f"data: {text}\n\n"
+
+ @app.post("/transcribe_stream/")
+ async def transcribe_stream(file: UploadFile = File(...), language: str = Query(...)):
+     audio_chunks = asr_manager.split_audio(tmp_file_path)
+     return StreamingResponse(stream_transcriptions(audio_chunks, asr_manager.model_language[language]), media_type="text/event-stream")
+ ```
+
+ - **Rate Limiting**: Add rate limiting (e.g., slowapi) to prevent overload:
+
+ ```python
+ from slowapi import Limiter
+ from slowapi.util import get_remote_address
+
+ limiter = Limiter(key_func=get_remote_address)
+ app.state.limiter = limiter
+
+ @app.post("/transcribe/", response_model=TranscriptionResponse)
+ @limiter.limit("10/minute")
+ async def transcribe_audio(...):
+     ...
+ ```
+
+ ## Production Readiness: Missing Elements
+
+ ### 1. Scalability
+
+ - **Load Balancing**: Deploy behind a load balancer (e.g., NGINX, HAProxy) to distribute requests across multiple instances.
+ - **Containerization**: Use Docker for consistent deployment:
+
+ ```dockerfile
+ FROM nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu20.04
+ RUN apt-get update && apt-get install -y python3-pip ffmpeg
+ RUN pip3 install torch nemo_toolkit[asr] fastapi uvicorn pydub
+ COPY . /app
+ WORKDIR /app
+ CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000", "--workers", "4"]
+ ```
+
+ Build and run:
+
+ ```bash
+ docker build -t asr-api .
+ docker run --gpus all -p 8000:8000 asr-api
+ ```
+
+ - **Horizontal Scaling**: Use Kubernetes or Docker Swarm to scale instances based on demand.
+
+ ### 2. Monitoring and Logging
+
+ - **Metrics**: Add Prometheus metrics (e.g., prometheus-fastapi-instrumentator) to track latency, request rate, and errors:
+
+ ```python
+ from prometheus_fastapi_instrumentator import Instrumentator
+
+ Instrumentator().instrument(app).expose(app)
+ ```
+
+ - **Distributed Logging**: Send logs to a centralized system (e.g., ELK Stack, Loki) instead of local files for better analysis.
+
+ ### 3. Security
+
+ - **Authentication**: Add API key or JWT authentication (e.g., fastapi-users) to restrict access; a minimal sketch follows this list.
+ - **Input Validation**: Validate audio file size and duration to prevent abuse:
+
+ ```python
+ MAX_FILE_SIZE = 50 * 1024 * 1024  # 50 MB
+ if len(file_content) > MAX_FILE_SIZE:
+     raise HTTPException(status_code=400, detail="File too large")
+ ```
+
+ - **HTTPS**: Configure SSL/TLS with NGINX or a cloud provider.
+
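+ A minimal header-based key check takes a few lines with FastAPI's built-in security utilities; this sketch assumes a single static key held in EXPECTED_API_KEY (in practice, load it from the environment):
+
+ ```python
+ from fastapi import Depends, HTTPException, Security
+ from fastapi.security.api_key import APIKeyHeader
+
+ EXPECTED_API_KEY = "change-me"  # assumption: a real deployment reads this from env/secrets
+ api_key_header = APIKeyHeader(name="X-API-Key", auto_error=False)
+
+ async def verify_api_key(api_key: str = Security(api_key_header)):
+     if api_key != EXPECTED_API_KEY:
+         raise HTTPException(status_code=403, detail="Invalid or missing API key")
+
+ # Attach to an endpoint: @app.post("/transcribe/", dependencies=[Depends(verify_api_key)])
+ ```
+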
+ ### 4. Error Handling and Resilience
+
+ - **Retry Logic**: Add retries for transient failures (e.g., model inference errors) using tenacity:
+
+ ```python
+ from tenacity import retry, stop_after_attempt, wait_fixed
+
+ @retry(stop=stop_after_attempt(3), wait=wait_fixed(2))
+ def transcribe_with_retry(model, paths, batch_size, language_id):
+     return model.transcribe(paths, batch_size=batch_size, language_id=language_id)
+ ```
+
+ - **Graceful Degradation**: If a model fails to load, fall back to a default (e.g., Kannada), as sketched below.
+
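+ A minimal sketch of that fallback on top of the get_model cache above (the method name and default-language choice are illustrative):
+
+ ```python
+ def get_model_or_default(self, language_id):
+     """Return the requested model, falling back to the default language on load failure."""
+     try:
+         return self.get_model(language_id)
+     except Exception as exc:
+         logging.warning(f"Loading model for '{language_id}' failed ({exc}); falling back to '{self.default_language}'")
+         return self.get_model(self.default_language)
+ ```
+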
+ ### 5. Configuration
+
+ - **Environment Variables**: Use python-dotenv or pydantic-settings for configurable settings (e.g., port, host, chunk duration):
+
+ ```python
+ from pydantic_settings import BaseSettings
+
+ class Settings(BaseSettings):
+     host: str = "127.0.0.1"
+     port: int = 8000
+     chunk_duration_ms: int = 15000
+
+ settings = Settings()
+ uvicorn.run(app, host=settings.host, port=settings.port)
+ ```
+
+ ## Final Optimized Code Snippet
+
+ Here's an example incorporating some key improvements:
+
+ ```python
+ import torch
+ import nemo.collections.asr as nemo_asr
+ from fastapi import FastAPI, File, UploadFile, HTTPException, Query
+ from fastapi.responses import JSONResponse
+ from pydub import AudioSegment
+ import io
+ import logging
+ from threading import Lock
+
+ app = FastAPI()
+ logging.basicConfig(level=logging.INFO)
+
+ class ASRModelManager:
+     def __init__(self, default_language="kn"):
+         self.default_language = default_language
+         self.model_language = {...}  # Same as original
+         self.config_models = {...}  # Same as original
+         self.models = {}
+         # Lock per language ID (the values of model_language), since get_model is keyed by ID
+         self.model_locks = {lang_id: Lock() for lang_id in self.model_language.values()}
+         self.load_initial_model(default_language)
+
+     def load_model(self, language_id):
+         model = nemo_asr.models.ASRModel.from_pretrained(self.config_models[language_id])
+         device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+         return model.to(device).eval()
+
+     def load_initial_model(self, language_id):
+         self.models[language_id] = self.load_model(language_id)
+
+     def get_model(self, language_id):
+         if language_id not in self.models:
+             with self.model_locks[language_id]:
+                 if language_id not in self.models:  # Double-checked locking
+                     self.models[language_id] = self.load_model(language_id)
+         return self.models[language_id]
+
+     def split_audio_in_memory(self, audio_segment, chunk_duration_ms=15000):
+         duration_ms = len(audio_segment)
+         if duration_ms <= chunk_duration_ms:
+             return [audio_segment]
+         return [audio_segment[i:i + chunk_duration_ms] for i in range(0, duration_ms, chunk_duration_ms)]
+
+ asr_manager = ASRModelManager()
+
+ @app.post("/transcribe/")
+ async def transcribe_audio(file: UploadFile = File(...), language: str = Query(...)):
+     file_content = await file.read()
+     audio = AudioSegment.from_file(io.BytesIO(file_content), format=file.filename.split(".")[-1].lower())
+     if audio.frame_rate != 16000:
+         audio = audio.set_frame_rate(16000).set_channels(1)
+
+     audio_chunks = asr_manager.split_audio_in_memory(audio)
+     chunk_buffers = [io.BytesIO() for _ in audio_chunks]
+     for chunk, buffer in zip(audio_chunks, chunk_buffers):
+         chunk.export(buffer, format="wav")
+         buffer.seek(0)
+
+     language_id = asr_manager.model_language.get(language, asr_manager.default_language)
+     model = asr_manager.get_model(language_id)
+     model.cur_decoder = "rnnt"
+     with torch.cuda.amp.autocast(dtype=torch.bfloat16):
+         rnnt_texts = model.transcribe(chunk_buffers, batch_size=min(len(chunk_buffers), 32), language_id=language_id)
+     text = " ".join(rnnt_texts)
+     return JSONResponse(content={"text": text})
+
+ if __name__ == "__main__":
+     import uvicorn
+     # Pass an import string so --workers takes effect; uvicorn ignores workers > 1 with an app object
+     uvicorn.run("main:app", host="0.0.0.0", port=8000, workers=4)
+ ```
+
+ ## Summary
+
+ - **Latency**: Mixed precision, in-memory processing, and dynamic batching reduce inference time.
+ - **Scalability**: Preloaded models, async workers, and Triton (as an alternative) handle more users.
+ - **Production**: Add monitoring, security, and containerization for reliability.
+
+ For maximum performance, consider switching to NVIDIA Triton Inference Server (as suggested previously) instead of FastAPI if inference throughput is the top priority. Let me know if you'd like a deeper dive into any specific improvement!
requirements.txt ADDED
@@ -0,0 +1,177 @@
+ absl-py==2.1.0
+ aiohappyeyeballs==2.4.6
+ aiohttp==3.11.12
+ aiosignal==1.3.2
+ alembic==1.14.1
+ annotated-types==0.7.0
+ antlr4-python3-runtime==4.9.3
+ anyio==4.8.0
+ asteroid-filterbanks==0.4.0
+ asttokens==3.0.0
+ async-timeout==5.0.1
+ attrs==25.1.0
+ audioread==3.0.1
+ blinker==1.9.0
+ braceexpand==0.1.7
+ Brotli==1.1.0
+ certifi==2025.1.31
+ cffi==1.17.1
+ charset-normalizer==3.4.1
+ click==8.1.8
+ coloredlogs==15.0.1
+ colorlog==6.9.0
+ ConfigArgParse==1.7
+ contourpy==1.3.1
+ cycler==0.12.1
+ Cython==0.29.37
+ cytoolz==1.0.1
+ datasets==2.21.0
+ decorator==5.2.0
+ dill==0.3.8
+ docopt==0.6.2
+ editdistance==0.8.1
+ einops==0.8.1
+ exceptiongroup==1.2.2
+ executing==2.2.0
+ fastapi==0.115.8
+ filelock==3.17.0
+ Flask==3.1.0
+ flask-cors==5.0.1
+ Flask-Login==0.6.3
+ flatbuffers==25.2.10
+ fonttools==4.56.0
+ frozenlist==1.5.0
+ fsspec==2024.6.1
+ gevent==24.11.1
+ geventhttpclient==2.3.3
+ greenlet==3.1.1
+ grpcio==1.70.0
+ h11==0.14.0
+ huggingface-hub==0.23.2
+ humanfriendly==10.0
+ hydra-core==1.3.2
+ HyperPyYAML==1.2.2
+ idna==3.10
+ intervaltree==3.1.0
+ ipython==8.27.0
+ itsdangerous==2.2.0
+ jedi==0.19.2
+ Jinja2==3.1.5
+ jiwer==3.0.4
+ joblib==1.4.2
+ julius==0.2.7
+ kiwisolver==1.4.8
+ lazy_loader==0.4
+ lhotse==1.27.0
+ librosa==0.10.2.post1
+ lightning==2.5.0.post0
+ lightning-utilities==0.12.0
+ lilcom==1.8.0
+ llvmlite==0.44.0
+ locust==2.33.0
+ Mako==1.3.9
+ Markdown==3.7
+ markdown-it-py==3.0.0
+ MarkupSafe==3.0.2
+ matplotlib==3.10.0
+ matplotlib-inline==0.1.7
+ mdurl==0.1.2
+ mpmath==1.3.0
+ msgpack==1.1.0
+ multidict==6.1.0
+ multiprocess==0.70.16
+ nemo_toolkit @ git+https://github.com/AI4Bharat/NeMo@0a1560e398ee97dd3ff17b495d05cad31938cef0
+ networkx==3.4.2
+ numba==0.61.0
+ numpy==1.26.4
+ omegaconf==2.3.0
+ onnx==1.17.0
+ onnxruntime==1.19.0
+ optuna==4.2.1
+ packaging==24.2
+ pandas==2.2.2
+ parso==0.8.4
+ pexpect==4.9.0
+ pillow==11.1.0
+ platformdirs==4.3.6
+ pooch==1.8.2
+ primePy==1.3
+ prompt_toolkit==3.0.50
+ propcache==0.3.0
+ protobuf==5.29.3
+ psutil==7.0.0
+ ptyprocess==0.7.0
+ pure_eval==0.2.3
+ pyannote.audio==3.3.1
+ pyannote.core==5.0.0
+ pyannote.database==5.1.3
+ pyannote.metrics==3.2.1
+ pyannote.pipeline==3.0.1
+ pyarrow==19.0.1
+ pycparser==2.22
+ pydantic==2.10.6
+ pydantic_core==2.27.2
+ pydub==0.25.1
+ Pygments==2.19.1
+ pyparsing==3.2.1
+ python-dateutil==2.9.0.post0
+ python-multipart==0.0.20
+ pytorch-lightning==2.4.0
+ pytorch-metric-learning==2.8.1
+ pytz==2025.1
+ PyYAML==6.0.2
+ pyzmq==26.2.1
+ RapidFuzz==3.12.1
+ regex==2024.11.6
+ requests==2.32.3
+ rich==13.9.4
+ ruamel.yaml==0.18.10
+ ruamel.yaml.clib==0.2.12
+ safetensors==0.5.2
+ scikit-learn==1.6.1
+ scipy==1.15.2
+ semver==3.0.4
+ sentencepiece==0.2.0
+ shellingham==1.5.4
+ six==1.17.0
+ sniffio==1.3.1
+ sortedcontainers==2.4.0
+ soundfile==0.13.1
+ soxr==0.5.0.post1
+ speechbrain==1.0.2
+ SQLAlchemy==2.0.38
+ stack-data==0.6.3
+ starlette==0.45.3
+ sympy==1.13.1
+ tabulate==0.9.0
+ tensorboard==2.19.0
+ tensorboard-data-server==0.7.2
+ tensorboardX==2.6.2.2
+ text-unidecode==1.3
+ threadpoolctl==3.5.0
+ tokenizers==0.19.1
+ tomli==2.2.1
+ toolz==1.0.0
+ torch==2.6.0
+ torch-audiomentations==0.12.0
+ torch_pitch_shift==1.2.5
+ torchaudio==2.6.0
+ torchmetrics==1.6.1
+ tqdm==4.66.5
+ traitlets==5.14.3
+ transformers==4.40.0
+ triton==3.2.0
+ typer==0.15.1
+ typing_extensions==4.12.2
+ tzdata==2025.1
+ urllib3==2.3.0
+ uvicorn==0.34.0
+ wcwidth==0.2.13
+ webdataset==0.2.100
+ Werkzeug==3.1.3
+ wget==3.2
+ wrapt==1.17.2
+ xxhash==3.5.0
+ yarl==1.18.3
+ zope.event==5.0
+ zope.interface==7.2
samples/kannada_sample_1.wav ADDED
Binary file (157 kB).
 
samples/kannada_sample_2.wav ADDED
Binary file (378 kB).
 
server-setup.sh ADDED
@@ -0,0 +1,17 @@
+ sudo apt-get update -y
+ sudo apt-get upgrade -y
+
+ sudo apt-get install -y python3-venv
+ sudo apt-get install -y python3-pip
+ sudo apt-get install -y ffmpeg
+ sudo apt-get install -y net-tools
+
+
+ python3 -m venv venv
+ source venv/bin/activate
+
+ pip install -r requirements.txt
+
+ #cd src/asr_indic_server
+ python src/asr_api.py
+
src/asr_api.py ADDED
@@ -0,0 +1,307 @@
+ import torch
+ import nemo.collections.asr as nemo_asr
+ from fastapi import FastAPI, File, UploadFile, HTTPException, Query
+ from fastapi.responses import RedirectResponse, JSONResponse
+ from pydantic import BaseModel
+ from pydub import AudioSegment
+ import os
+ import tempfile
+ import subprocess
+ import io
+ import logging
+ from logging.handlers import RotatingFileHandler
+ from time import time
+ from typing import List
+ import argparse
+ import uvicorn
+
+ # Configure logging with log rotation
+ logging.basicConfig(
+     level=logging.INFO,
+     format='%(asctime)s - %(levelname)s - %(message)s',
+     handlers=[
+         RotatingFileHandler("transcription_api.log", maxBytes=10*1024*1024, backupCount=5),  # 10 MB per file, keep 5 backups
+         logging.StreamHandler()  # Also print logs to the console
+     ]
+ )
+
+ class ASRModelManager:
+     def __init__(self, default_language="kn", device_type="cuda"):
+         self.default_language = default_language
+         self.device_type = device_type
+         self.model_language = {
+             "kannada": "kn",
+             "hindi": "hi",
+             "malayalam": "ml",
+             "assamese": "as",
+             "bengali": "bn",
+             "bodo": "brx",
+             "dogri": "doi",
+             "gujarati": "gu",
+             "kashmiri": "ks",
+             "konkani": "kok",
+             "maithili": "mai",
+             "manipuri": "mni",
+             "marathi": "mr",
+             "nepali": "ne",
+             "odia": "or",
+             "punjabi": "pa",
+             "sanskrit": "sa",
+             "santali": "sat",
+             "sindhi": "sd",
+             "tamil": "ta",
+             "telugu": "te",
+             "urdu": "ur"
+         }
+         self.config_models = {
+             "as": "ai4bharat/indicconformer_stt_as_hybrid_rnnt_large",
+             "bn": "ai4bharat/indicconformer_stt_bn_hybrid_rnnt_large",
+             "brx": "ai4bharat/indicconformer_stt_brx_hybrid_rnnt_large",
+             "doi": "ai4bharat/indicconformer_stt_doi_hybrid_rnnt_large",
+             "gu": "ai4bharat/indicconformer_stt_gu_hybrid_rnnt_large",
+             "hi": "ai4bharat/indicconformer_stt_hi_hybrid_rnnt_large",
+             "kn": "ai4bharat/indicconformer_stt_kn_hybrid_rnnt_large",
+             "ks": "ai4bharat/indicconformer_stt_ks_hybrid_rnnt_large",
+             "kok": "ai4bharat/indicconformer_stt_kok_hybrid_rnnt_large",
+             "mai": "ai4bharat/indicconformer_stt_mai_hybrid_rnnt_large",
+             "ml": "ai4bharat/indicconformer_stt_ml_hybrid_rnnt_large",
+             "mni": "ai4bharat/indicconformer_stt_mni_hybrid_rnnt_large",
+             "mr": "ai4bharat/indicconformer_stt_mr_hybrid_rnnt_large",
+             "ne": "ai4bharat/indicconformer_stt_ne_hybrid_rnnt_large",
+             "or": "ai4bharat/indicconformer_stt_or_hybrid_rnnt_large",
+             "pa": "ai4bharat/indicconformer_stt_pa_hybrid_rnnt_large",
+             "sa": "ai4bharat/indicconformer_stt_sa_hybrid_rnnt_large",
+             "sat": "ai4bharat/indicconformer_stt_sat_hybrid_rnnt_large",
+             "sd": "ai4bharat/indicconformer_stt_sd_hybrid_rnnt_large",
+             "ta": "ai4bharat/indicconformer_stt_ta_hybrid_rnnt_large",
+             "te": "ai4bharat/indicconformer_stt_te_hybrid_rnnt_large",
+             "ur": "ai4bharat/indicconformer_stt_ur_hybrid_rnnt_large"
+         }
+         self.model = self.load_model(self.default_language)
+
+     def load_model(self, language_id="kn"):
+         model_name = self.config_models.get(language_id, self.config_models["kn"])
+         model = nemo_asr.models.ASRModel.from_pretrained(model_name)
+
+         device = torch.device(self.device_type if torch.cuda.is_available() and self.device_type == "cuda" else "cpu")
+         model.freeze()  # inference mode
+         model = model.to(device)  # transfer model to device
+
+         return model
+
+     def split_audio(self, file_path, chunk_duration_ms=15000):
+         """
+         Split an audio file into chunks of the specified duration if the audio is longer than one chunk.
+
+         :param file_path: Path to the audio file.
+         :param chunk_duration_ms: Duration of each chunk in milliseconds (default 15000 ms, i.e. 15 seconds).
+         """
+         # Load the audio file
+         audio = AudioSegment.from_file(file_path)
+
+         # Get the duration of the audio in milliseconds
+         duration_ms = len(audio)
+
+         # Check if the duration exceeds the chunk duration
+         if duration_ms > chunk_duration_ms:
+             # Calculate the number of chunks needed
+             num_chunks = duration_ms // chunk_duration_ms
+             if duration_ms % chunk_duration_ms != 0:
+                 num_chunks += 1
+
+             # Split the audio into chunks
+             chunks = [audio[i*chunk_duration_ms:(i+1)*chunk_duration_ms] for i in range(num_chunks)]
+
+             # Create a directory to save the chunks
+             output_dir = "audio_chunks"
+             os.makedirs(output_dir, exist_ok=True)
+
+             # Export each chunk to a separate file
+             chunk_file_paths = []
+             for i, chunk in enumerate(chunks):
+                 chunk_file_path = os.path.join(output_dir, f"chunk_{i}.wav")
+                 chunk.export(chunk_file_path, format="wav")
+                 chunk_file_paths.append(chunk_file_path)
+                 logging.info(f"Chunk {i} exported successfully to {chunk_file_path}.")
+
+             return chunk_file_paths
+         else:
+             return [file_path]
+
+ app = FastAPI()
+ asr_manager = ASRModelManager()  # module-level default; rebound with CLI options under __main__
+
+ # Define the response models
+ class TranscriptionResponse(BaseModel):
+     text: str
+
+ class BatchTranscriptionResponse(BaseModel):
+     transcriptions: List[str]
+
+ @app.post("/transcribe/", response_model=TranscriptionResponse)
+ async def transcribe_audio(file: UploadFile = File(...), language: str = Query(..., enum=list(asr_manager.model_language.keys()))):
+     start_time = time()
+     try:
+         # Check file extension
+         file_extension = file.filename.split(".")[-1].lower()
+         if file_extension not in ["wav", "mp3"]:
+             logging.warning(f"Unsupported file format: {file_extension}")
+             raise HTTPException(status_code=400, detail="Unsupported file format. Please upload a WAV or MP3 file.")
+
+         # Read the file content
+         file_content = await file.read()
+
+         # Convert MP3 to WAV if necessary
+         if file_extension == "mp3":
+             audio = AudioSegment.from_mp3(io.BytesIO(file_content))
+         else:
+             audio = AudioSegment.from_wav(io.BytesIO(file_content))
+
+         # Check the sample rate of the audio
+         sample_rate = audio.frame_rate
+
+         # Resample to 16 kHz mono if necessary
+         if sample_rate != 16000:
+             audio = audio.set_frame_rate(16000).set_channels(1)
+
+         # Export the audio to a temporary WAV file
+         with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
+             audio.export(tmp_file.name, format="wav")
+             tmp_file_path = tmp_file.name
+
+         # Split the audio if necessary
+         chunk_file_paths = asr_manager.split_audio(tmp_file_path)
+
+         try:
+             # Transcribe the audio
+             language_id = asr_manager.model_language.get(language, asr_manager.default_language)
+
+             if language_id != asr_manager.default_language:
+                 asr_manager.model = asr_manager.load_model(language_id)
+                 asr_manager.default_language = language_id
+
+             asr_manager.model.cur_decoder = "rnnt"
+
+             # with torch.amp.autocast('cuda', dtype=torch.bfloat16):
+             #     rnnt_texts = asr_manager.model.transcribe(chunk_file_paths, batch_size=1, language_id=language_id)
+             rnnt_texts = asr_manager.model.transcribe(chunk_file_paths, batch_size=1, language_id=language_id)
+
+             # Flatten the list of transcriptions
+             rnnt_text = " ".join([text for sublist in rnnt_texts for text in sublist])
+
+             end_time = time()
+             logging.info(f"Transcription completed in {end_time - start_time:.2f} seconds")
+             return JSONResponse(content={"text": rnnt_text})
+         except subprocess.CalledProcessError as e:
+             logging.error(f"FFmpeg conversion failed: {str(e)}")
+             raise HTTPException(status_code=500, detail=f"FFmpeg conversion failed: {str(e)}")
+         except Exception as e:
+             logging.error(f"An error occurred during processing: {str(e)}")
+             raise HTTPException(status_code=500, detail=f"An error occurred during processing: {str(e)}")
+         finally:
+             # Clean up the chunk files and the temporary source file
+             for chunk_file_path in chunk_file_paths:
+                 if os.path.exists(chunk_file_path):
+                     os.remove(chunk_file_path)
+             if os.path.exists(tmp_file_path):
+                 os.remove(tmp_file_path)
+     except HTTPException as e:
+         logging.error(f"HTTPException: {str(e)}")
+         raise e
+     except Exception as e:
+         logging.error(f"An unexpected error occurred: {str(e)}")
+         raise HTTPException(status_code=500, detail=f"An unexpected error occurred: {str(e)}")
+
216
+ async def home():
217
+ return RedirectResponse(url="/docs")
218
+
219
+ @app.post("/transcribe_batch/", response_model=BatchTranscriptionResponse)
220
+ async def transcribe_audio_batch(files: List[UploadFile] = File(...), language: str = Query(..., enum=list(asr_manager.model_language.keys()))):
221
+ start_time = time()
222
+ tmp_file_paths = []
223
+ transcriptions = []
224
+ try:
225
+ for file in files:
226
+ # Check file extension
227
+ file_extension = file.filename.split(".")[-1].lower()
228
+ if file_extension not in ["wav", "mp3"]:
229
+ logging.warning(f"Unsupported file format: {file_extension}")
230
+ raise HTTPException(status_code=400, detail="Unsupported file format. Please upload WAV or MP3 files.")
231
+
232
+ # Read the file content
233
+ file_content = await file.read()
234
+
235
+ # Convert MP3 to WAV if necessary
236
+ if file_extension == "mp3":
237
+ audio = AudioSegment.from_mp3(io.BytesIO(file_content))
238
+ else:
239
+ audio = AudioSegment.from_wav(io.BytesIO(file_content))
240
+
241
+ # Check the sample rate of the WAV file
242
+ sample_rate = audio.frame_rate
243
+
244
+ # Convert WAV to the required format using ffmpeg if necessary
245
+ if sample_rate != 16000:
246
+ audio = audio.set_frame_rate(16000).set_channels(1)
247
+
248
+ # Export the audio to a temporary WAV file
249
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
250
+ audio.export(tmp_file.name, format="wav")
251
+ tmp_file_path = tmp_file.name
252
+
253
+ # Split the audio if necessary
254
+ chunk_file_paths = asr_manager.split_audio(tmp_file_path)
255
+ tmp_file_paths.extend(chunk_file_paths)
256
+
257
+ logging.info(f"Temporary file paths: {tmp_file_paths}")
258
+ try:
259
+ # Transcribe the audio files in batch
260
+ language_id = asr_manager.model_language.get(language, asr_manager.default_language)
261
+
262
+ if language_id != asr_manager.default_language:
263
+ asr_manager.model = asr_manager.load_model(language_id)
264
+ asr_manager.default_language = language_id
265
+
266
+ asr_manager.model.cur_decoder = "rnnt"
267
+
268
+ #with torch.amp.autocast('cuda', dtype=torch.bfloat16):
269
+ # rnnt_texts = asr_manager.model.transcribe(tmp_file_paths, batch_size=len(files), language_id=language_id)
270
+ rnnt_texts = asr_manager.model.transcribe(tmp_file_paths, batch_size=len(files), language_id=language_id)
271
+
272
+ logging.info(f"Raw transcriptions from model: {rnnt_texts}")
273
+ end_time = time()
274
+ logging.info(f"Transcription completed in {end_time - start_time:.2f} seconds")
275
+
276
+ # Flatten the list of transcriptions
277
+ transcriptions = [text for sublist in rnnt_texts for text in sublist]
278
+ except subprocess.CalledProcessError as e:
279
+ logging.error(f"FFmpeg conversion failed: {str(e)}")
280
+ raise HTTPException(status_code=500, detail=f"FFmpeg conversion failed: {str(e)}")
281
+ except Exception as e:
282
+ logging.error(f"An error occurred during processing: {str(e)}")
283
+ raise HTTPException(status_code=500, detail=f"An error occurred during processing: {str(e)}")
284
+ finally:
285
+ # Clean up temporary files
286
+ for tmp_file_path in tmp_file_paths:
287
+ if os.path.exists(tmp_file_path):
288
+ os.remove(tmp_file_path)
289
+ except HTTPException as e:
290
+ logging.error(f"HTTPException: {str(e)}")
291
+ raise e
292
+ except Exception as e:
293
+ logging.error(f"An unexpected error occurred: {str(e)}")
294
+ raise HTTPException(status_code=500, detail=f"An unexpected error occurred: {str(e)}")
295
+
296
+ return JSONResponse(content={"transcriptions": transcriptions})
297
+
298
+ if __name__ == "__main__":
299
+ parser = argparse.ArgumentParser(description="Run the FastAPI server for ASR.")
300
+ parser.add_argument("--port", type=int, default=8888, help="Port to run the server on.")
301
+ parser.add_argument("--language", type=str, default="kn", help="Default language for the ASR model.")
302
+ parser.add_argument("--host", type=str, default="0.0.0.0", help="Host to run the server on.")
303
+ parser.add_argument("--device", type=str, default="cuda", help="Device type to run the model on (cuda or cpu).")
304
+ args = parser.parse_args()
305
+
306
+ asr_manager = ASRModelManager(default_language=args.language, device_type=args.device)
307
+ uvicorn.run(app, host=args.host, port=args.port)
src/hf_asr.py ADDED
@@ -0,0 +1,18 @@
+ import torch
+ import nemo.collections.asr as nemo_asr
+
+ model = nemo_asr.models.ASRModel.from_pretrained("ai4bharat/indicconformer_stt_kn_hybrid_rnnt_large")
+
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ model.freeze()  # inference mode
+ model = model.to(device)  # transfer model to device
+
+ '''
+ model.cur_decoder = "ctc"
+ ctc_text = model.transcribe(['kannada_query_infer.wav'], batch_size=1, logprobs=False, language_id='kn')[0]
+ print(ctc_text)
+ '''
+
+ model.cur_decoder = "rnnt"
+ rnnt_text = model.transcribe(['kannada_query_infer.wav'], batch_size=1, language_id='kn')[0]
+ print(rnnt_text)
src/hf_asr_advanced.py ADDED
@@ -0,0 +1,42 @@
+ import torch
+ import nemo.collections.asr as nemo_asr
+ import time
+ import argparse
+
+ def load_model(model_name, device):
+     model = nemo_asr.models.ASRModel.from_pretrained(model_name)
+     model.freeze()  # inference mode
+     model = model.to(device)  # transfer model to device
+     return model
+
+ def transcribe_audio(model, audio_file, batch_size, language_id, decoder_type):
+     model.cur_decoder = decoder_type
+     transcribed_text = model.transcribe([audio_file], batch_size=batch_size, language_id=language_id)[0]
+     return transcribed_text
+
+ def measure_execution_time(model, audio_file, batch_size, language_id, decoder_type):
+     start_time = time.time()
+     transcribed_text = transcribe_audio(model, audio_file, batch_size, language_id, decoder_type)
+     end_time = time.time()
+     execution_time = end_time - start_time
+     return transcribed_text, execution_time
+
+ def main(device_type):
+     model_name = "ai4bharat/indicconformer_stt_kn_hybrid_rnnt_large"
+     audio_file = 'kannada_query_infer.wav'
+     batch_size = 1
+     language_id = 'kn'
+     decoder_type = "rnnt"
+
+     device = torch.device(device_type if torch.cuda.is_available() and device_type == "cuda" else "cpu")
+     model = load_model(model_name, device)
+     transcribed_text, execution_time = measure_execution_time(model, audio_file, batch_size, language_id, decoder_type)
+
+     print(f"Execution time on {device_type}: {execution_time:.4f} seconds")
+     print(f"Transcribed text: {transcribed_text}")
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(description="Transcribe audio using an ASR model.")
+     parser.add_argument("--device", type=str, default="cpu", choices=["cpu", "cuda"], help="Device type to use for inference (cpu or cuda).")
+     args = parser.parse_args()
+     main(args.device)
src/nemo_asr.py ADDED
@@ -0,0 +1,52 @@
+ model_path = "kannada.nemo"
+ lang_id = "kn"
+
+ import torch
+ import soundfile as sf
+ import nemo.collections.asr as nemo_asr
+
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ model = nemo_asr.models.EncDecCTCModel.restore_from(restore_path=model_path)
+ model.eval()  # inference mode
+ model = model.to(device)
+
+ '''
+ model.cur_decoder = "ctc"
+ ctc_text = model.transcribe(['kannada_query_infer.wav'], batch_size=1, logprobs=False, language_id=lang_id)[0]
+ print(ctc_text)
+ '''
+ model.cur_decoder = "rnnt"
+ rnnt_text = model.transcribe(['kannada_query_infer.wav'], batch_size=1, logprobs=False, language_id=lang_id)[0]
+ print(rnnt_text)
+
+
+ '''
+ import time
+
+ # Start timing for CTC decoder
+ start_time_ctc = time.time()
+
+ model.cur_decoder = "ctc"
+ ctc_text = model.transcribe(['kannada_query_infer.wav'], batch_size=1, logprobs=False, language_id=lang_id)[0]
+ print(ctc_text)
+
+ end_time_ctc = time.time()
+ ctc_duration = end_time_ctc - start_time_ctc
+ print(f"CTC transcription took {ctc_duration:.4f} seconds")
+
+ # Start timing for RNNT decoder
+ start_time_rnnt = time.time()
+
+ model.cur_decoder = "rnnt"
+ rnnt_text = model.transcribe(['kannada_query_infer.wav'], batch_size=1, logprobs=False, language_id=lang_id)[0]
+ print(rnnt_text)
+
+ end_time_rnnt = time.time()
+ rnnt_duration = end_time_rnnt - start_time_rnnt
+ print(f"RNNT transcription took {rnnt_duration:.4f} seconds")
+
+ # Calculate and print the speed difference
+ speed_difference = rnnt_duration - ctc_duration
+ print(f"Speed difference: {speed_difference:.4f} seconds")
+
+ '''
src/test/api_device_performance_test.py ADDED
@@ -0,0 +1,57 @@
+ import pytest
+ import time
+ from fastapi.testclient import TestClient
+ from src.asr_api import app, ASRModelManager
+
+ # Initialize the FastAPI test client
+ client = TestClient(app)
+
+ # Mock audio file paths for testing
+ AUDIO_FILE_PATH_WAV = "path/to/your/test_audio.wav"
+ AUDIO_FILE_PATH_MP3 = "path/to/your/test_audio.mp3"
+
+ # Helper to measure the time taken for a request
+ def measure_time(func, *args, **kwargs):
+     start_time = time.time()
+     result = func(*args, **kwargs)
+     end_time = time.time()
+     return result, end_time - start_time
+
+ # Test case for CUDA mode
+ def test_transcribe_audio_cuda():
+     asr_manager = ASRModelManager(default_language="kn", device_type="cuda")
+     app.dependency_overrides[ASRModelManager] = lambda: asr_manager
+
+     with open(AUDIO_FILE_PATH_WAV, "rb") as audio_file:
+         response, duration = measure_time(
+             client.post,
+             "/transcribe/",
+             files={"file": ("test_audio.wav", audio_file, "audio/wav")},
+             params={"language": "kannada"}
+         )
+
+     assert response.status_code == 200
+     assert "text" in response.json()
+     print(f"CUDA mode transcription time: {duration:.2f} seconds")
+
+ # Test case for CPU mode
+ def test_transcribe_audio_cpu():
+     asr_manager = ASRModelManager(default_language="kn", device_type="cpu")
+     app.dependency_overrides[ASRModelManager] = lambda: asr_manager
+
+     with open(AUDIO_FILE_PATH_WAV, "rb") as audio_file:
+         response, duration = measure_time(
+             client.post,
+             "/transcribe/",
+             files={"file": ("test_audio.wav", audio_file, "audio/wav")},
+             params={"language": "kannada"}
+         )
+
+     assert response.status_code == 200
+     assert "text" in response.json()
+     print(f"CPU mode transcription time: {duration:.2f} seconds")
+
+ if __name__ == "__main__":
+     pytest.main()
src/test/hf_performance_test.py ADDED
@@ -0,0 +1,25 @@
+ import subprocess
+ import time
+ import torch
+
+ def run_transcription(device_type):
+     start_time = time.time()
+     # hf_asr_advanced.py accepts the --device flag (hf_asr.py does not)
+     result = subprocess.run(["python", "src/hf_asr_advanced.py", "--device", device_type], capture_output=True, text=True)
+     end_time = time.time()
+     execution_time = end_time - start_time
+     transcribed_text = result.stdout.split("\n")[-2]  # Assuming the transcribed text is the second-to-last line
+     return transcribed_text, execution_time
+
+ def main():
+     # Measure execution time for CUDA
+     if torch.cuda.is_available():
+         cuda_text, cuda_time = run_transcription("cuda")
+         print(f"CUDA execution time: {cuda_time:.4f} seconds")
+         print(f"Transcribed text (CUDA): {cuda_text}")
+
+     # Measure execution time for CPU
+     cpu_text, cpu_time = run_transcription("cpu")
+     print(f"CPU execution time: {cpu_time:.4f} seconds")
+     print(f"Transcribed text (CPU): {cpu_text}")
+
+ if __name__ == "__main__":
+     main()
src/test/locustfile.py ADDED
@@ -0,0 +1,39 @@
+ from locust import HttpUser, task, between
+
+ class TranscribeUser(HttpUser):
+     wait_time = between(1, 5)  # Wait time between tasks
+
+     @task
+     def transcribe_audio(self):
+         audio_file_path = "./../../samples/kannada_sample_1.wav"
+         with open(audio_file_path, 'rb') as audio_file:
+             files = {'file': ('kannada_query_infer.wav', audio_file, 'audio/x-wav')}
+             headers = {'accept': 'application/json'}
+             # The endpoint requires a `language` query parameter (see src/asr_api.py)
+             response = self.client.post("/transcribe/", params={"language": "kannada"}, files=files, headers=headers)
+         if response.status_code == 200:
+             print("Success:", response.json())
+         else:
+             print("Failed:", response.status_code, response.text)
+
+     @task
+     def transcribe_batch(self):
+         batch_files = [
+             "./../../samples/kannada_sample_1.wav",
+             "./../../samples/kannada_sample_2.wav"
+         ]
+         # Read the bytes up front so the handles are not closed before the request is sent;
+         # the multipart field name must be "files" to match the endpoint's parameter.
+         files = []
+         for i, file_path in enumerate(batch_files):
+             with open(file_path, 'rb') as audio_file:
+                 files.append(('files', (f'kannada_query_infer_{i}.wav', audio_file.read(), 'audio/x-wav')))
+
+         headers = {'accept': 'application/json'}
+         response = self.client.post("/transcribe_batch/", params={"language": "kannada"}, files=files, headers=headers)
+         if response.status_code == 200:
+             print("Batch Success:", response.json())
+         else:
+             print("Batch Failed:", response.status_code, response.text)