Spaces:
				
			
			
	
			
			
					
		Running
		
	
	
	
			
			
	
	
	
	
		
		
					
		Running
		
	Update speech_edit.py
Browse files: speech_edit.py (+41 -6)
    	
        speech_edit.py
    CHANGED
    
    | @@ -8,7 +8,8 @@ CosyVoice gRPC back‑end – updated to mirror the FastAPI logic | |
| 8 | 
             
            *   inference_instruct2  ➜  new:  prompt‑audio + speed (no speaker‑ID)
         | 
| 9 | 
             
            """
         | 
| 10 |  | 
| 11 | 
            -
            import io,  | 
|  | |
| 12 | 
             
            import sys
         | 
| 13 | 
             
            from concurrent import futures
         | 
| 14 | 
             
            import argparse
         | 
| @@ -148,14 +149,48 @@ class CosyVoiceServiceImpl(cosyvoice_pb2_grpc.CosyVoiceServicer): | |
| 148 | 
             
                    if request.HasField("cross_lingual_request"):
         | 
| 149 | 
             
                        logging.info("Received cross‑lingual inference request")
         | 
| 150 | 
             
                        cr = request.cross_lingual_request
         | 
| 151 | 
            -
                         | 
| 152 | 
            -
             | 
| 153 | 
            -
             | 
| 154 | 
            -
                             | 
| 155 | 
            -
             | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 156 | 
             
                        yield from _yield_audio(mo)
         | 
| 157 | 
             
                        return
         | 
| 158 |  | 
|  | |
| 159 | 
             
                    # 4. Instruction‑TTS (two flavours)
         | 
| 160 | 
             
                    if request.HasField("instruct_request"):
         | 
| 161 | 
             
                        ir = request.instruct_request
         | 
|  | |
| 8 | 
             
            *   inference_instruct2  ➜  new:  prompt‑audio + speed (no speaker‑ID)
         | 
| 9 | 
             
            """
         | 
| 10 |  | 
| 11 | 
            +
            import io, tempfile, requests, soundfile as sf, torchaudio
         | 
| 12 | 
            +
            import os
         | 
| 13 | 
             
            import sys
         | 
| 14 | 
             
            from concurrent import futures
         | 
| 15 | 
             
            import argparse
         | 
|  | |
| 149 | 
             
                    if request.HasField("cross_lingual_request"):
         | 
| 150 | 
             
                        logging.info("Received cross‑lingual inference request")
         | 
| 151 | 
             
                        cr = request.cross_lingual_request
         | 
| 152 | 
            +
                        tmp_path = None
         | 
| 153 | 
            +
                    
         | 
| 154 | 
            +
                        try:
         | 
| 155 | 
            +
                            if cr.prompt_audio.startswith(b'http'):          # S3 URL case
         | 
| 156 | 
            +
                                url = cr.prompt_audio.decode('utf‑8')
         | 
| 157 | 
            +
                                logging.info("Downloading cross‑lingual prompt from %s", url)
         | 
| 158 | 
            +
                                resp = requests.get(url, timeout=10)
         | 
| 159 | 
            +
                                resp.raise_for_status()
         | 
| 160 | 
            +
                    
         | 
| 161 | 
            +
                                with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
         | 
| 162 | 
            +
                                    f.write(resp.content)
         | 
| 163 | 
            +
                                    tmp_path = f.name
         | 
| 164 | 
            +
                    
         | 
| 165 | 
            +
                                wav, sr = sf.read(tmp_path, dtype='float32')
         | 
| 166 | 
            +
                                if wav.ndim > 1:
         | 
| 167 | 
            +
                                    wav = wav.mean(axis=1)
         | 
| 168 | 
            +
                                if sr != 16_000:
         | 
| 169 | 
            +
                                    wav = torchaudio.functional.resample(
         | 
| 170 | 
            +
                                        torch.from_numpy(wav).unsqueeze(0), sr, 16_000
         | 
| 171 | 
            +
                                    )[0].numpy()
         | 
| 172 | 
            +
                                prompt = torch.from_numpy(wav).unsqueeze(0)
         | 
| 173 | 
            +
                    
         | 
| 174 | 
            +
                            else:                                           # legacy raw bytes
         | 
| 175 | 
            +
                                prompt = _bytes_to_tensor(cr.prompt_audio)
         | 
| 176 | 
            +
                    
         | 
| 177 | 
            +
                            mo = self.cosyvoice.inference_cross_lingual(
         | 
| 178 | 
            +
                                cr.tts_text,
         | 
| 179 | 
            +
                                prompt
         | 
| 180 | 
            +
                            )
         | 
| 181 | 
            +
                    
         | 
| 182 | 
            +
                        finally:
         | 
| 183 | 
            +
                            if tmp_path and os.path.exists(tmp_path):
         | 
| 184 | 
            +
                                try:
         | 
| 185 | 
            +
                                    os.remove(tmp_path)
         | 
| 186 | 
            +
                                except Exception as e:
         | 
| 187 | 
            +
                                    logging.warning("Could not remove temp file %s: %s",
         | 
| 188 | 
            +
                                                    tmp_path, e)
         | 
| 189 | 
            +
                    
         | 
| 190 | 
             
                        yield from _yield_audio(mo)
         | 
| 191 | 
             
                        return
         | 
| 192 |  | 
| 193 | 
            +
             | 
| 194 | 
             
                    # 4. Instruction‑TTS (two flavours)
         | 
| 195 | 
             
                    if request.HasField("instruct_request"):
         | 
| 196 | 
             
                        ir = request.instruct_request
         | 
 
			
