Upload app.py
Browse files
    	
        app.py
    CHANGED
    
    | @@ -34,7 +34,6 @@ for key, state_dict in torch.load(os.path.join(snapshot, 'net.pth'), map_locatio | |
| 34 | 
             
                    model[key].load_state_dict(state_dict, strict=False)
         | 
| 35 |  | 
| 36 | 
             
            PARAM_COUNT = sum(p.numel() for value in model.values() for p in value.parameters())
         | 
| 37 | 
            -
            print('PARAM_COUNT', PARAM_COUNT)
         | 
| 38 | 
             
            assert PARAM_COUNT < 82_000_000, PARAM_COUNT
         | 
| 39 |  | 
| 40 | 
             
            random_texts = {}
         | 
| @@ -442,6 +441,36 @@ with gr.Blocks() as lf_tts: | |
| 442 | 
             
                segment_btn.click(segment_and_tokenize, inputs=[text, voice, skip_square_brackets, newline_split], outputs=[segments])
         | 
| 443 | 
             
                generate_btn.click(lf_generate, inputs=[segments, voice, speed, reduce_noise, opening_cut, closing_cut, ease_in, ease_out, pad_before, pad_after, pad_between], outputs=[audio])
         | 
| 444 |  | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 445 | 
             
            with gr.Blocks() as api_info:
         | 
| 446 | 
             
                gr.Markdown("""
         | 
| 447 | 
             
            This Space can be used via API. The following code block can be copied and run in one Google Colab cell.
         | 
| @@ -465,13 +494,13 @@ from IPython.display import display, Audio | |
| 465 | 
             
            display(Audio(audio_path))
         | 
| 466 | 
             
            print(out_ps)
         | 
| 467 | 
             
            ```
         | 
| 468 | 
            -
            Note that this Space and the underlying Kokoro model are both under development and subject to change.  | 
| 469 | 
             
            """)
         | 
| 470 |  | 
| 471 | 
             
            with gr.Blocks() as app:
         | 
| 472 | 
             
                gr.TabbedInterface(
         | 
| 473 | 
            -
                    [basic_tts, lf_tts, api_info],
         | 
| 474 | 
            -
                    ['Basic TTS', 'Long-Form', 'Gradio API'],
         | 
| 475 | 
             
                )
         | 
| 476 |  | 
| 477 | 
             
            if __name__ == '__main__':
         | 
|  | |
| 34 | 
             
                    model[key].load_state_dict(state_dict, strict=False)
         | 
| 35 |  | 
| 36 | 
             
            PARAM_COUNT = sum(p.numel() for value in model.values() for p in value.parameters())
         | 
|  | |
| 37 | 
             
            assert PARAM_COUNT < 82_000_000, PARAM_COUNT
         | 
| 38 |  | 
| 39 | 
             
            random_texts = {}
         | 
|  | |
| 441 | 
             
                segment_btn.click(segment_and_tokenize, inputs=[text, voice, skip_square_brackets, newline_split], outputs=[segments])
         | 
| 442 | 
             
                generate_btn.click(lf_generate, inputs=[segments, voice, speed, reduce_noise, opening_cut, closing_cut, ease_in, ease_out, pad_before, pad_after, pad_between], outputs=[audio])
         | 
| 443 |  | 
| 444 | 
            +
            with gr.Blocks() as about:
         | 
| 445 | 
            +
                gr.Markdown("""
         | 
| 446 | 
            +
            Kokoro is a frontier TTS model for its size. It has about 80 million parameters,<sup>[1]</sup> uses a lean StyleTTS 2 architecture,<sup>[2]</sup> and was trained on high-quality data.
         | 
| 447 | 
            +
             | 
| 448 | 
            +
            The weights are currently private, but a free public demo is hosted at https://hf.co/spaces/hexgrad/Kokoro-TTS
         | 
| 449 | 
            +
             | 
| 450 | 
            +
            ### Compute
         | 
| 451 | 
            +
            The model was trained on 1x A100-class 80GB instances rented from [Vast.ai](https://cloud.vast.ai/?ref_id=79907).<sup>[3]</sup> Vast was selected over other compute providers due to its competitive on-demand hourly rates. The average hourly cost of these instances was below $1/hr — around half the rates quoted by other providers.
         | 
| 452 | 
            +
             | 
| 453 | 
            +
            ### Updates
         | 
| 454 | 
            +
            This Space and the underlying Kokoro model are both under development and subject to change.
         | 
| 455 | 
            +
            Last model update: 2024 Nov 15
         | 
| 456 | 
            +
            Model trained by: Raven (@rzvzn on Discord)
         | 
| 457 | 
            +
             | 
| 458 | 
            +
            ### Licenses
         | 
| 459 | 
            +
            Inference code: MIT
         | 
| 460 | 
            +
            espeak-ng dependency: GPL-3.0<sup>[4]</sup>
         | 
| 461 | 
            +
            Random English texts: Unknown<sup>[5]</sup>
         | 
| 462 | 
            +
            Random Japanese texts: CC0 public domain<sup>[6]</sup>
         | 
| 463 | 
            +
            Kokoro model weights: N/A
         | 
| 464 | 
            +
             | 
| 465 | 
            +
            ### References
         | 
| 466 | 
            +
            1. Kokoro parameter count | https://hf.co/spaces/hexgrad/Kokoro-TTS/blob/main/app.py#L37
         | 
| 467 | 
            +
            2. StyleTTS 2 | https://github.com/yl4579/StyleTTS2
         | 
| 468 | 
            +
            3. Vast.ai referral link | https://cloud.vast.ai/?ref_id=79907
         | 
| 469 | 
            +
            4. eSpeak NG | https://github.com/espeak-ng/espeak-ng
         | 
| 470 | 
            +
            5. Quotable Data | https://github.com/quotable-io/data/blob/master/data/quotes.json
         | 
| 471 | 
            +
            6. Common Voice Japanese sentences | https://github.com/common-voice/common-voice/tree/main/server/data/ja
         | 
| 472 | 
            +
            """)
         | 
| 473 | 
            +
             | 
| 474 | 
             
            with gr.Blocks() as api_info:
         | 
| 475 | 
             
                gr.Markdown("""
         | 
| 476 | 
             
            This Space can be used via API. The following code block can be copied and run in one Google Colab cell.
         | 
|  | |
| 494 | 
             
            display(Audio(audio_path))
         | 
| 495 | 
             
            print(out_ps)
         | 
| 496 | 
             
            ```
         | 
| 497 | 
            +
            Note that this Space and the underlying Kokoro model are both under development and subject to change. Reliability is not guaranteed. Hugging Face and/or Gradio might enforce their own rate limits.
         | 
| 498 | 
             
            """)
         | 
| 499 |  | 
| 500 | 
             
            with gr.Blocks() as app:
         | 
| 501 | 
             
                gr.TabbedInterface(
         | 
| 502 | 
            +
                    [basic_tts, lf_tts, about, api_info],
         | 
| 503 | 
            +
                    ['Basic TTS', 'Long-Form', 'About', 'Gradio API'],
         | 
| 504 | 
             
                )
         | 
| 505 |  | 
| 506 | 
             
            if __name__ == '__main__':
         | 
 
			
