gresqdf committed
Commit ea25f01 · 1 Parent(s): 3459eb7
Files changed (2):
  1. Dockerfile +1 -1
  2. main.py +26 -6
Dockerfile CHANGED
@@ -19,7 +19,7 @@ RUN pip install --upgrade pip
 RUN pip install torch torchvision
 
 # Install Hugging Face Transformers and other dependencies
-RUN pip install transformers librosa deep-translator python-multipart fastapi uvicorn
+RUN pip install transformers librosa deep-translator python-multipart fastapi uvicorn sentencepiece
 
 # Copy the main script
 COPY --chown=user main.py .
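
The one-word change adds sentencepiece, which the SpeechT5 checkpoint introduced in main.py below presumably needs: SpeechT5's tokenizer is SentencePiece-backed and cannot load without the package. A minimal sanity check, assuming the model repo ships the standard processor files:

# Hedged sanity check, not part of the commit: loading the SpeechT5 processor
# is expected to fail in the old image (no sentencepiece) and work in the new
# one. Assumes roshna-omer/speecht5_tts_krd-kmr_CV17.0 includes processor files.
from transformers import SpeechT5Processor

processor = SpeechT5Processor.from_pretrained("roshna-omer/speecht5_tts_krd-kmr_CV17.0")
print(type(processor.tokenizer).__name__)  # SpeechT5Tokenizer (SentencePiece-based)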
main.py CHANGED
@@ -1,4 +1,4 @@
-from fastapi import FastAPI, UploadFile, File
+from fastapi import FastAPI, UploadFile, File, Response
 from transformers import pipeline
 import librosa
 from deep_translator import GoogleTranslator
@@ -6,23 +6,43 @@ import io
 
 app = FastAPI()
 
-print("Loading Speech Recognition")
-pipe = pipeline("automatic-speech-recognition", model="Akashpb13/xlsr_kurmanji_kurdish")
-print("Speech Recognition Loaded")
+
+# print("Loading Speech Recognition")
+
+# print("Speech Recognition Loaded")
 
 print("Loading translator")
 translator = GoogleTranslator(source='ku', target='fr')
 print("Translator loaded")
 
+# print("Loading tts")
+
+# print("TTS loaded")
+
+
 
 def speech2text(audio_data: bytes):
-    audio_array, _ = librosa.load(io.BytesIO(audio_data), sr=16000)
+    audio_array, _ = librosa.load(io.BytesIO(audio_data), sr=16000)
+    pipe = pipeline("automatic-speech-recognition", model="Akashpb13/xlsr_kurmanji_kurdish")
     output = pipe(audio_array)
     return output["text"]
 
+def text2speech(text: str):
+    tts = pipeline("text-to-audio", model="roshna-omer/speecht5_tts_krd-kmr_CV17.0")
+    output = tts(text)
+    return output["audio"]
+
 @app.post("/transcribe")
 async def transcribe(file: UploadFile = File(...)):
     audio_data = await file.read()
     text_output = speech2text(audio_data)
     translated = translator.translate(text_output)
-    return {"text": text_output, "translation": translated}
+    return {"text": text_output, "translation": translated}
+
+@app.post("/transcribe_audio")
+async def transcribe_and_return_audio(file: UploadFile = File(...)):
+    audio_data = await file.read()
+    text_output = speech2text(audio_data)
+    audio_output = text2speech(text_output)
+
+    return Response(content=audio_output, media_type="audio/wav")
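
Two hedged notes on the new code. The commit also moves pipeline construction inside speech2text and text2speech, so both models are now loaded on every request rather than once at startup. More importantly, the text-to-audio pipeline returns a dict with a NumPy float array under "audio" plus a "sampling_rate", not encoded WAV bytes, so passing output["audio"] straight into Response with media_type="audio/wav" is unlikely to yield a playable file. A minimal serialization sketch, assuming soundfile were added as a dependency (it is not in the Dockerfile above):

# Minimal sketch, NOT part of the commit: encode the pipeline's float samples
# as WAV bytes before returning them. soundfile is an assumed extra dependency.
import io
import numpy as np
import soundfile as sf

def audio_to_wav_bytes(audio, sampling_rate: int) -> bytes:
    buf = io.BytesIO()
    sf.write(buf, np.squeeze(audio), sampling_rate, format="WAV")  # float samples -> WAV
    return buf.getvalue()

And a hypothetical client call against the new route; the host and port are assumptions (7860 is the usual uvicorn port on a Space), and sample.wav is any local recording:

# Hypothetical client sketch: POST a recording, save the synthesized reply.
import requests

with open("sample.wav", "rb") as f:
    resp = requests.post(
        "http://localhost:7860/transcribe_audio",
        files={"file": ("sample.wav", f, "audio/wav")},
    )
resp.raise_for_status()
with open("reply.wav", "wb") as out:
    out.write(resp.content)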