vitorcalvi committed on
Commit
5630c13
·
1 Parent(s): 339e2ea
Files changed (3) hide show
  1. Dockerfile +6 -9
  2. app.py +40 -32
  3. requirements.txt +1 -9
Dockerfile CHANGED
@@ -1,21 +1,18 @@
 
1
  FROM python:3.9
2
 
3
- # Install system dependencies
4
- RUN apt-get update && apt-get install -y ffmpeg libsndfile1
5
-
6
- # Set the working directory
7
  WORKDIR /app
8
 
9
- # Copy requirements and install dependencies
10
  COPY requirements.txt .
11
  RUN pip install --no-cache-dir -r requirements.txt
12
 
13
- # Copy the application code
14
  COPY . .
15
 
16
- # Expose the required port
17
  EXPOSE 7860
18
 
19
- # Run the app
20
-
21
  CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
 
1
+ # Use the official Python image.
2
  FROM python:3.9
3
 
4
+ # Set the working directory.
 
 
 
5
  WORKDIR /app
6
 
7
+ # Copy requirements file and install dependencies.
8
  COPY requirements.txt .
9
  RUN pip install --no-cache-dir -r requirements.txt
10
 
11
+ # Copy the application code.
12
  COPY . .
13
 
14
+ # Expose port 7860 for the FastAPI app.
15
  EXPOSE 7860
16
 
17
+ # Run the app using Uvicorn.
 
18
  CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py CHANGED
@@ -1,37 +1,49 @@
1
  from fastapi import FastAPI, File, UploadFile, HTTPException, Form
2
  from fastapi.responses import JSONResponse
3
  from pydantic import BaseModel
4
- import librosa
5
  import numpy as np
6
  import tempfile
7
  import os
8
  import warnings
9
- import soundfile as sf
10
- from pydub import AudioSegment
11
 
12
- warnings.filterwarnings("ignore", category=UserWarning, module='librosa')
13
 
14
  app = FastAPI()
15
 
16
- def convert_mp3_to_wav(mp3_path, wav_path):
17
- # Convert mp3 to wav using pydub and ffmpeg
18
- audio = AudioSegment.from_mp3(mp3_path)
19
- audio.export(wav_path, format="wav")
20
-
21
  def extract_audio_features(audio_file_path):
22
- # Load the audio file and extract features
23
- y, sr = sf.read(audio_file_path)
24
- f0, voiced_flag, voiced_probs = librosa.pyin(y, fmin=75, fmax=600)
25
- f0 = f0[~np.isnan(f0)]
26
- energy = librosa.feature.rms(y=y)[0]
27
- mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
28
- onset_env = librosa.onset.onset_strength(y=y, sr=sr)
29
- tempo, _ = librosa.beat.beat_track(onset_envelope=onset_env, sr=sr)
30
- speech_rate = tempo / 60
31
- return f0, energy, speech_rate, mfccs, y, sr
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
 
33
  def analyze_voice_stress(audio_file_path):
34
- f0, energy, speech_rate, mfccs, y, sr = extract_audio_features(audio_file_path)
 
 
35
  mean_f0 = np.mean(f0)
36
  std_f0 = np.std(f0)
37
  mean_energy = np.mean(energy)
@@ -79,20 +91,17 @@ async def analyze_stress(
79
  # Handle audio file analysis
80
  if file or file_path:
81
  if file:
82
- if not file.filename.endswith(".mp3"):
83
- raise HTTPException(status_code=400, detail="Only .mp3 files are supported.")
84
- with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_file:
85
  temp_file.write(await file.read())
86
- temp_mp3_path = temp_file.name
87
- temp_wav_path = temp_mp3_path.replace(".mp3", ".wav")
88
- convert_mp3_to_wav(temp_mp3_path, temp_wav_path)
89
  else:
90
- if not file_path.endswith(".mp3"):
91
- raise HTTPException(status_code=400, detail="Only .mp3 files are supported.")
92
  if not os.path.exists(file_path):
93
  raise HTTPException(status_code=400, detail="File path does not exist.")
94
- temp_wav_path = file_path.replace(".mp3", ".wav")
95
- convert_mp3_to_wav(file_path, temp_wav_path)
96
 
97
  try:
98
  result = analyze_voice_stress(temp_wav_path)
@@ -102,7 +111,6 @@ async def analyze_stress(
102
  finally:
103
  # Clean up temporary files
104
  if file:
105
- os.remove(temp_mp3_path)
106
  os.remove(temp_wav_path)
107
 
108
  # Handle text analysis
@@ -112,5 +120,5 @@ async def analyze_stress(
112
 
113
  if __name__ == "__main__":
114
  import uvicorn
115
- port = int(os.getenv("PORT", 7860)) # Use the PORT environment variable for Render compatibility
116
  uvicorn.run("app:app", host="0.0.0.0", port=port, reload=True)
 
1
  from fastapi import FastAPI, File, UploadFile, HTTPException, Form
2
  from fastapi.responses import JSONResponse
3
  from pydantic import BaseModel
4
+ import torchaudio
5
  import numpy as np
6
  import tempfile
7
  import os
8
  import warnings
 
 
9
 
10
+ warnings.filterwarnings("ignore")
11
 
12
  app = FastAPI()
13
 
 
 
 
 
 
14
  def extract_audio_features(audio_file_path):
15
+ # Load the audio file using torchaudio
16
+ waveform, sample_rate = torchaudio.load(audio_file_path)
17
+
18
+ # Ensure waveform is mono by averaging channels if necessary
19
+ if waveform.shape[0] > 1:
20
+ waveform = waveform.mean(dim=0, keepdim=True)
21
+
22
+ waveform = waveform.squeeze() # Remove channel dimension if it's 1
23
+
24
+ # Extract pitch (fundamental frequency)
25
+ pitch_frequencies, voiced_flags, _ = torchaudio.functional.detect_pitch_frequency(
26
+ waveform, sample_rate, frame_time=0.01, win_length=1024
27
+ )
28
+ f0 = pitch_frequencies[voiced_flags > 0]
29
+
30
+ # Extract energy
31
+ energy = waveform.pow(2).numpy()
32
+
33
+ # Extract MFCCs
34
+ mfcc_transform = torchaudio.transforms.MFCC(sample_rate=sample_rate, n_mfcc=13)
35
+ mfccs = mfcc_transform(waveform.unsqueeze(0)).squeeze(0).numpy()
36
+
37
+ # Estimate speech rate (simplified)
38
+ tempo = torchaudio.functional.estimate_tempo(waveform, sample_rate)
39
+ speech_rate = tempo / 60 if tempo is not None else 0
40
+
41
+ return f0.numpy(), energy, speech_rate, mfccs, waveform.numpy(), sample_rate
42
 
43
  def analyze_voice_stress(audio_file_path):
44
+ f0, energy, speech_rate, mfccs, waveform, sample_rate = extract_audio_features(audio_file_path)
45
+ if len(f0) == 0:
46
+ raise ValueError("Could not extract fundamental frequency from the audio.")
47
  mean_f0 = np.mean(f0)
48
  std_f0 = np.std(f0)
49
  mean_energy = np.mean(energy)
 
91
  # Handle audio file analysis
92
  if file or file_path:
93
  if file:
94
+ if not file.filename.endswith(".wav"):
95
+ raise HTTPException(status_code=400, detail="Only .wav files are supported.")
96
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_file:
97
  temp_file.write(await file.read())
98
+ temp_wav_path = temp_file.name
 
 
99
  else:
100
+ if not file_path.endswith(".wav"):
101
+ raise HTTPException(status_code=400, detail="Only .wav files are supported.")
102
  if not os.path.exists(file_path):
103
  raise HTTPException(status_code=400, detail="File path does not exist.")
104
+ temp_wav_path = file_path
 
105
 
106
  try:
107
  result = analyze_voice_stress(temp_wav_path)
 
111
  finally:
112
  # Clean up temporary files
113
  if file:
 
114
  os.remove(temp_wav_path)
115
 
116
  # Handle text analysis
 
120
 
121
  if __name__ == "__main__":
122
  import uvicorn
123
+ port = int(os.getenv("PORT", 7860)) # Use the PORT environment variable if needed
124
  uvicorn.run("app:app", host="0.0.0.0", port=port, reload=True)
requirements.txt CHANGED
@@ -1,13 +1,5 @@
1
  fastapi
2
  uvicorn
3
- librosa
4
  numpy
5
  pydantic
6
- soundfile
7
- pydub
8
- ffmpeg-python
9
- python-multipart
10
-
11
-
12
-
13
-
 
1
  fastapi
2
  uvicorn
3
+ torchaudio
4
  numpy
5
  pydantic