Spaces:
				
			
			
	
			
			
		Runtime error
		
	
	
	
			
			
	
	
	
	
		
		
		Runtime error
		
	upd: speech-to-text
Browse files
    	
        app.py
    CHANGED
    
    | @@ -7,7 +7,6 @@ from transformers import ( | |
| 7 | 
             
                pipeline,
         | 
| 8 | 
             
            )
         | 
| 9 | 
             
            import torch
         | 
| 10 | 
            -
            import torchaudio
         | 
| 11 |  | 
| 12 | 
             
            processor = AutoProcessor.from_pretrained(
         | 
| 13 | 
             
                "MariaK/layoutlmv2-base-uncased_finetuned_docvqa_v2"
         | 
| @@ -22,7 +21,7 @@ tokenizer_en2ru = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-ru") | |
| 22 | 
             
            model_en2ru = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-ru")
         | 
| 23 |  | 
| 24 | 
             
            transcriber = pipeline(
         | 
| 25 | 
            -
                "automatic-speech-recognition", model=" | 
| 26 | 
             
            )
         | 
| 27 |  | 
| 28 |  | 
| @@ -80,30 +79,7 @@ def transcribe(image, audio): | |
| 80 |  | 
| 81 | 
             
                if y.ndim > 1:
         | 
| 82 | 
             
                    y = y.mean(axis=1)
         | 
| 83 | 
            -
             | 
| 84 | 
            -
                y_tensor = torch.tensor(y, dtype=torch.float32)
         | 
| 85 | 
            -
                print(y.shape)
         | 
| 86 | 
            -
             | 
| 87 | 
            -
                if sr != 16000:
         | 
| 88 | 
            -
                    resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)
         | 
| 89 | 
            -
                    y_tensor = resampler(y_tensor)
         | 
| 90 | 
            -
                    sr = 16000
         | 
| 91 | 
            -
             | 
| 92 | 
            -
                y_tensor /= torch.max(torch.abs(y_tensor))
         | 
| 93 | 
            -
             | 
| 94 | 
            -
                y = y_tensor.numpy()
         | 
| 95 | 
            -
                print(y.shape)
         | 
| 96 | 
            -
             | 
| 97 | 
            -
                input_features = transcriber.feature_extractor(
         | 
| 98 | 
            -
                    y, sampling_rate=sr, return_tensors="pt"
         | 
| 99 | 
            -
                ).input_features
         | 
| 100 | 
            -
                print(input_features.shape)
         | 
| 101 | 
            -
                print(input_features)
         | 
| 102 | 
            -
             | 
| 103 | 
            -
                transcription = transcriber.model.generate(input_features)
         | 
| 104 | 
            -
                transcription_text = transcriber.tokenizer.decode(
         | 
| 105 | 
            -
                    transcription[0], skip_special_tokens=True
         | 
| 106 | 
            -
                )
         | 
| 107 |  | 
| 108 | 
             
                return generate_answer(image, transcription_text)
         | 
| 109 |  | 
|  | |
| 7 | 
             
                pipeline,
         | 
| 8 | 
             
            )
         | 
| 9 | 
             
            import torch
         | 
|  | |
| 10 |  | 
| 11 | 
             
            processor = AutoProcessor.from_pretrained(
         | 
| 12 | 
             
                "MariaK/layoutlmv2-base-uncased_finetuned_docvqa_v2"
         | 
|  | |
| 21 | 
             
            model_en2ru = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-ru")
         | 
| 22 |  | 
| 23 | 
             
            transcriber = pipeline(
         | 
| 24 | 
            +
                "automatic-speech-recognition", model="lorenzoncina/whisper-medium-ru"
         | 
| 25 | 
             
            )
         | 
| 26 |  | 
| 27 |  | 
|  | |
| 79 |  | 
| 80 | 
             
                if y.ndim > 1:
         | 
| 81 | 
             
                    y = y.mean(axis=1)
         | 
| 82 | 
            +
                transcription_text = transcriber({"sampling_rate": sr, "raw": y})["text"]
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 83 |  | 
| 84 | 
             
                return generate_answer(image, transcription_text)
         | 
| 85 |  |