init
Browse files- README.md +24 -0
- checkpoints/base_speakers/EN/checkpoint.pth +3 -0
- checkpoints/base_speakers/EN/config.json +145 -0
- checkpoints/base_speakers/EN/en_default_se.pth +3 -0
- checkpoints/base_speakers/EN/en_style_se.pth +3 -0
- checkpoints/base_speakers/ZH/checkpoint.pth +3 -0
- checkpoints/base_speakers/ZH/config.json +137 -0
- checkpoints/base_speakers/ZH/zh_default_se.pth +3 -0
- checkpoints/converter/checkpoint.pth +3 -0
- checkpoints/converter/config.json +57 -0
    	
        README.md
    CHANGED
    
    | @@ -1,3 +1,27 @@ | |
| 1 | 
             
            ---
         | 
| 2 | 
             
            license: cc-by-nc-4.0
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 3 | 
             
            ---
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
             
            ---
         | 
| 2 | 
             
            license: cc-by-nc-4.0
         | 
| 3 | 
            +
            tags:
         | 
| 4 | 
            +
            - audio
         | 
| 5 | 
            +
            - text-to-speech
         | 
| 6 | 
            +
            - instant-voice-cloning
         | 
| 7 | 
            +
            language:
         | 
| 8 | 
            +
            - en
         | 
| 9 | 
            +
            - zh
         | 
| 10 | 
             
            ---
         | 
| 11 | 
            +
             | 
| 12 | 
            +
            # OpenVoice
         | 
| 13 | 
            +
            OpenVoice is a versatile instant voice cloning approach that requires only a short audio clip from the reference speaker to replicate their voice and generate speech in multiple languages. OpenVoice enables granular control over voice styles, including emotion, accent, rhythm, pauses, and intonation, in addition to replicating the tone color of the reference speaker. OpenVoice also achieves zero-shot cross-lingual voice cloning for languages not included in the massive-speaker training set.
         | 
| 14 | 
            +
             | 
| 15 | 
            +
            ### Features
         | 
| 16 | 
            +
            - **Accurate Tone Color Cloning.** OpenVoice can accurately clone the reference tone color and generate speech in multiple languages and accents.
         | 
| 17 | 
            +
            - **Flexible Voice Style Control.** OpenVoice enables granular control over voice styles, such as emotion and accent, as well as other style parameters including rhythm, pauses, and intonation.
         | 
| 18 | 
            +
            - **Zero-shot Cross-lingual Voice Cloning.** Neither the language of the generated speech nor the language of the reference speech needs to be present in the massive-speaker multi-lingual training dataset.
         | 
| 19 | 
            +
             | 
| 20 | 
            +
            ### Languages
         | 
| 21 | 
            +
            This version of the model weights supports only **English** and **Chinese**. OpenVoice can adapt to any other language as long as a base speaker is provided. **For multi-lingual and cross-lingual examples, please refer to [this Jupyter notebook](https://github.com/myshell-ai/OpenVoice/blob/main/demo_part2.ipynb).**
         | 
| 22 | 
            +
             | 
| 23 | 
            +
            ### Links
         | 
| 24 | 
            +
            [GitHub](https://github.com/myshell-ai/OpenVoice)
         | 
| 25 | 
            +
            [HFDemo](https://huggingface.co/spaces/myshell-ai/OpenVoice)
         | 
| 26 | 
            +
            [Discord](https://discord.gg/myshell)
         | 
| 27 | 
            +
             | 
    	
        checkpoints/base_speakers/EN/checkpoint.pth
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:1db1ae1a5c8ded049bd1536051489aefbfad4a5077c01c2257e9e88fa1bb8422
         | 
| 3 | 
            +
            size 160467309
         | 
    	
        checkpoints/base_speakers/EN/config.json
    ADDED
    
    | @@ -0,0 +1,145 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
              "data": {
         | 
| 3 | 
            +
                "text_cleaners": [
         | 
| 4 | 
            +
                  "cjke_cleaners2"
         | 
| 5 | 
            +
                ],
         | 
| 6 | 
            +
                "sampling_rate": 22050,
         | 
| 7 | 
            +
                "filter_length": 1024,
         | 
| 8 | 
            +
                "hop_length": 256,
         | 
| 9 | 
            +
                "win_length": 1024,
         | 
| 10 | 
            +
                "n_mel_channels": 80,
         | 
| 11 | 
            +
                "add_blank": true,
         | 
| 12 | 
            +
                "cleaned_text": true,
         | 
| 13 | 
            +
                "n_speakers": 10
         | 
| 14 | 
            +
              },
         | 
| 15 | 
            +
              "model": {
         | 
| 16 | 
            +
                "inter_channels": 192,
         | 
| 17 | 
            +
                "hidden_channels": 192,
         | 
| 18 | 
            +
                "filter_channels": 768,
         | 
| 19 | 
            +
                "n_heads": 2,
         | 
| 20 | 
            +
                "n_layers": 6,
         | 
| 21 | 
            +
                "n_layers_trans_flow": 3,
         | 
| 22 | 
            +
                "kernel_size": 3,
         | 
| 23 | 
            +
                "p_dropout": 0.1,
         | 
| 24 | 
            +
                "resblock": "1",
         | 
| 25 | 
            +
                "resblock_kernel_sizes": [
         | 
| 26 | 
            +
                  3,
         | 
| 27 | 
            +
                  7,
         | 
| 28 | 
            +
                  11
         | 
| 29 | 
            +
                ],
         | 
| 30 | 
            +
                "resblock_dilation_sizes": [
         | 
| 31 | 
            +
                  [
         | 
| 32 | 
            +
                    1,
         | 
| 33 | 
            +
                    3,
         | 
| 34 | 
            +
                    5
         | 
| 35 | 
            +
                  ],
         | 
| 36 | 
            +
                  [
         | 
| 37 | 
            +
                    1,
         | 
| 38 | 
            +
                    3,
         | 
| 39 | 
            +
                    5
         | 
| 40 | 
            +
                  ],
         | 
| 41 | 
            +
                  [
         | 
| 42 | 
            +
                    1,
         | 
| 43 | 
            +
                    3,
         | 
| 44 | 
            +
                    5
         | 
| 45 | 
            +
                  ]
         | 
| 46 | 
            +
                ],
         | 
| 47 | 
            +
                "upsample_rates": [
         | 
| 48 | 
            +
                  8,
         | 
| 49 | 
            +
                  8,
         | 
| 50 | 
            +
                  2,
         | 
| 51 | 
            +
                  2
         | 
| 52 | 
            +
                ],
         | 
| 53 | 
            +
                "upsample_initial_channel": 512,
         | 
| 54 | 
            +
                "upsample_kernel_sizes": [
         | 
| 55 | 
            +
                  16,
         | 
| 56 | 
            +
                  16,
         | 
| 57 | 
            +
                  4,
         | 
| 58 | 
            +
                  4
         | 
| 59 | 
            +
                ],
         | 
| 60 | 
            +
                "n_layers_q": 3,
         | 
| 61 | 
            +
                "use_spectral_norm": false,
         | 
| 62 | 
            +
                "gin_channels": 256
         | 
| 63 | 
            +
              },
         | 
| 64 | 
            +
              "symbols": [
         | 
| 65 | 
            +
                "_",
         | 
| 66 | 
            +
                ",",
         | 
| 67 | 
            +
                ".",
         | 
| 68 | 
            +
                "!",
         | 
| 69 | 
            +
                "?",
         | 
| 70 | 
            +
                "-",
         | 
| 71 | 
            +
                "~",
         | 
| 72 | 
            +
                "\u2026",
         | 
| 73 | 
            +
                "N",
         | 
| 74 | 
            +
                "Q",
         | 
| 75 | 
            +
                "a",
         | 
| 76 | 
            +
                "b",
         | 
| 77 | 
            +
                "d",
         | 
| 78 | 
            +
                "e",
         | 
| 79 | 
            +
                "f",
         | 
| 80 | 
            +
                "g",
         | 
| 81 | 
            +
                "h",
         | 
| 82 | 
            +
                "i",
         | 
| 83 | 
            +
                "j",
         | 
| 84 | 
            +
                "k",
         | 
| 85 | 
            +
                "l",
         | 
| 86 | 
            +
                "m",
         | 
| 87 | 
            +
                "n",
         | 
| 88 | 
            +
                "o",
         | 
| 89 | 
            +
                "p",
         | 
| 90 | 
            +
                "s",
         | 
| 91 | 
            +
                "t",
         | 
| 92 | 
            +
                "u",
         | 
| 93 | 
            +
                "v",
         | 
| 94 | 
            +
                "w",
         | 
| 95 | 
            +
                "x",
         | 
| 96 | 
            +
                "y",
         | 
| 97 | 
            +
                "z",
         | 
| 98 | 
            +
                "\u0251",
         | 
| 99 | 
            +
                "\u00e6",
         | 
| 100 | 
            +
                "\u0283",
         | 
| 101 | 
            +
                "\u0291",
         | 
| 102 | 
            +
                "\u00e7",
         | 
| 103 | 
            +
                "\u026f",
         | 
| 104 | 
            +
                "\u026a",
         | 
| 105 | 
            +
                "\u0254",
         | 
| 106 | 
            +
                "\u025b",
         | 
| 107 | 
            +
                "\u0279",
         | 
| 108 | 
            +
                "\u00f0",
         | 
| 109 | 
            +
                "\u0259",
         | 
| 110 | 
            +
                "\u026b",
         | 
| 111 | 
            +
                "\u0265",
         | 
| 112 | 
            +
                "\u0278",
         | 
| 113 | 
            +
                "\u028a",
         | 
| 114 | 
            +
                "\u027e",
         | 
| 115 | 
            +
                "\u0292",
         | 
| 116 | 
            +
                "\u03b8",
         | 
| 117 | 
            +
                "\u03b2",
         | 
| 118 | 
            +
                "\u014b",
         | 
| 119 | 
            +
                "\u0266",
         | 
| 120 | 
            +
                "\u207c",
         | 
| 121 | 
            +
                "\u02b0",
         | 
| 122 | 
            +
                "`",
         | 
| 123 | 
            +
                "^",
         | 
| 124 | 
            +
                "#",
         | 
| 125 | 
            +
                "*",
         | 
| 126 | 
            +
                "=",
         | 
| 127 | 
            +
                "\u02c8",
         | 
| 128 | 
            +
                "\u02cc",
         | 
| 129 | 
            +
                "\u2192",
         | 
| 130 | 
            +
                "\u2193",
         | 
| 131 | 
            +
                "\u2191",
         | 
| 132 | 
            +
                " "
         | 
| 133 | 
            +
              ],
         | 
| 134 | 
            +
              "speakers": {
         | 
| 135 | 
            +
                "default": 1,
         | 
| 136 | 
            +
                "whispering": 2,
         | 
| 137 | 
            +
                "shouting": 3,
         | 
| 138 | 
            +
                "excited": 4,
         | 
| 139 | 
            +
                "cheerful": 5,
         | 
| 140 | 
            +
                "terrified": 6,
         | 
| 141 | 
            +
                "angry": 7,
         | 
| 142 | 
            +
                "sad": 8,
         | 
| 143 | 
            +
                "friendly": 9
         | 
| 144 | 
            +
              }
         | 
| 145 | 
            +
            }
         | 
    	
        checkpoints/base_speakers/EN/en_default_se.pth
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:9cab24002eec738d0fe72cb73a34e57fbc3999c1bd4a1670a7b56ee4e3590ac9
         | 
| 3 | 
            +
            size 1789
         | 
    	
        checkpoints/base_speakers/EN/en_style_se.pth
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:6f698153be5004b90a8642d1157c89cae7dd296752a3276450ced6a17b8b98a9
         | 
| 3 | 
            +
            size 1783
         | 
    	
        checkpoints/base_speakers/ZH/checkpoint.pth
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:de9fb0eb749f3254130fe0172fcbb20e75f88a9b16b54dd0b73cac0dc40da7d9
         | 
| 3 | 
            +
            size 160467309
         | 
    	
        checkpoints/base_speakers/ZH/config.json
    ADDED
    
    | @@ -0,0 +1,137 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
              "data": {
         | 
| 3 | 
            +
                "text_cleaners": [
         | 
| 4 | 
            +
                  "cjke_cleaners2"
         | 
| 5 | 
            +
                ],
         | 
| 6 | 
            +
                "sampling_rate": 22050,
         | 
| 7 | 
            +
                "filter_length": 1024,
         | 
| 8 | 
            +
                "hop_length": 256,
         | 
| 9 | 
            +
                "win_length": 1024,
         | 
| 10 | 
            +
                "n_mel_channels": 80,
         | 
| 11 | 
            +
                "add_blank": true,
         | 
| 12 | 
            +
                "cleaned_text": true,
         | 
| 13 | 
            +
                "n_speakers": 10
         | 
| 14 | 
            +
              },
         | 
| 15 | 
            +
              "model": {
         | 
| 16 | 
            +
                "inter_channels": 192,
         | 
| 17 | 
            +
                "hidden_channels": 192,
         | 
| 18 | 
            +
                "filter_channels": 768,
         | 
| 19 | 
            +
                "n_heads": 2,
         | 
| 20 | 
            +
                "n_layers": 6,
         | 
| 21 | 
            +
                "n_layers_trans_flow": 3,
         | 
| 22 | 
            +
                "kernel_size": 3,
         | 
| 23 | 
            +
                "p_dropout": 0.1,
         | 
| 24 | 
            +
                "resblock": "1",
         | 
| 25 | 
            +
                "resblock_kernel_sizes": [
         | 
| 26 | 
            +
                  3,
         | 
| 27 | 
            +
                  7,
         | 
| 28 | 
            +
                  11
         | 
| 29 | 
            +
                ],
         | 
| 30 | 
            +
                "resblock_dilation_sizes": [
         | 
| 31 | 
            +
                  [
         | 
| 32 | 
            +
                    1,
         | 
| 33 | 
            +
                    3,
         | 
| 34 | 
            +
                    5
         | 
| 35 | 
            +
                  ],
         | 
| 36 | 
            +
                  [
         | 
| 37 | 
            +
                    1,
         | 
| 38 | 
            +
                    3,
         | 
| 39 | 
            +
                    5
         | 
| 40 | 
            +
                  ],
         | 
| 41 | 
            +
                  [
         | 
| 42 | 
            +
                    1,
         | 
| 43 | 
            +
                    3,
         | 
| 44 | 
            +
                    5
         | 
| 45 | 
            +
                  ]
         | 
| 46 | 
            +
                ],
         | 
| 47 | 
            +
                "upsample_rates": [
         | 
| 48 | 
            +
                  8,
         | 
| 49 | 
            +
                  8,
         | 
| 50 | 
            +
                  2,
         | 
| 51 | 
            +
                  2
         | 
| 52 | 
            +
                ],
         | 
| 53 | 
            +
                "upsample_initial_channel": 512,
         | 
| 54 | 
            +
                "upsample_kernel_sizes": [
         | 
| 55 | 
            +
                  16,
         | 
| 56 | 
            +
                  16,
         | 
| 57 | 
            +
                  4,
         | 
| 58 | 
            +
                  4
         | 
| 59 | 
            +
                ],
         | 
| 60 | 
            +
                "n_layers_q": 3,
         | 
| 61 | 
            +
                "use_spectral_norm": false,
         | 
| 62 | 
            +
                "gin_channels": 256
         | 
| 63 | 
            +
              },
         | 
| 64 | 
            +
              "symbols": [
         | 
| 65 | 
            +
                "_",
         | 
| 66 | 
            +
                ",",
         | 
| 67 | 
            +
                ".",
         | 
| 68 | 
            +
                "!",
         | 
| 69 | 
            +
                "?",
         | 
| 70 | 
            +
                "-",
         | 
| 71 | 
            +
                "~",
         | 
| 72 | 
            +
                "\u2026",
         | 
| 73 | 
            +
                "N",
         | 
| 74 | 
            +
                "Q",
         | 
| 75 | 
            +
                "a",
         | 
| 76 | 
            +
                "b",
         | 
| 77 | 
            +
                "d",
         | 
| 78 | 
            +
                "e",
         | 
| 79 | 
            +
                "f",
         | 
| 80 | 
            +
                "g",
         | 
| 81 | 
            +
                "h",
         | 
| 82 | 
            +
                "i",
         | 
| 83 | 
            +
                "j",
         | 
| 84 | 
            +
                "k",
         | 
| 85 | 
            +
                "l",
         | 
| 86 | 
            +
                "m",
         | 
| 87 | 
            +
                "n",
         | 
| 88 | 
            +
                "o",
         | 
| 89 | 
            +
                "p",
         | 
| 90 | 
            +
                "s",
         | 
| 91 | 
            +
                "t",
         | 
| 92 | 
            +
                "u",
         | 
| 93 | 
            +
                "v",
         | 
| 94 | 
            +
                "w",
         | 
| 95 | 
            +
                "x",
         | 
| 96 | 
            +
                "y",
         | 
| 97 | 
            +
                "z",
         | 
| 98 | 
            +
                "\u0251",
         | 
| 99 | 
            +
                "\u00e6",
         | 
| 100 | 
            +
                "\u0283",
         | 
| 101 | 
            +
                "\u0291",
         | 
| 102 | 
            +
                "\u00e7",
         | 
| 103 | 
            +
                "\u026f",
         | 
| 104 | 
            +
                "\u026a",
         | 
| 105 | 
            +
                "\u0254",
         | 
| 106 | 
            +
                "\u025b",
         | 
| 107 | 
            +
                "\u0279",
         | 
| 108 | 
            +
                "\u00f0",
         | 
| 109 | 
            +
                "\u0259",
         | 
| 110 | 
            +
                "\u026b",
         | 
| 111 | 
            +
                "\u0265",
         | 
| 112 | 
            +
                "\u0278",
         | 
| 113 | 
            +
                "\u028a",
         | 
| 114 | 
            +
                "\u027e",
         | 
| 115 | 
            +
                "\u0292",
         | 
| 116 | 
            +
                "\u03b8",
         | 
| 117 | 
            +
                "\u03b2",
         | 
| 118 | 
            +
                "\u014b",
         | 
| 119 | 
            +
                "\u0266",
         | 
| 120 | 
            +
                "\u207c",
         | 
| 121 | 
            +
                "\u02b0",
         | 
| 122 | 
            +
                "`",
         | 
| 123 | 
            +
                "^",
         | 
| 124 | 
            +
                "#",
         | 
| 125 | 
            +
                "*",
         | 
| 126 | 
            +
                "=",
         | 
| 127 | 
            +
                "\u02c8",
         | 
| 128 | 
            +
                "\u02cc",
         | 
| 129 | 
            +
                "\u2192",
         | 
| 130 | 
            +
                "\u2193",
         | 
| 131 | 
            +
                "\u2191",
         | 
| 132 | 
            +
                " "
         | 
| 133 | 
            +
              ],
         | 
| 134 | 
            +
              "speakers": {
         | 
| 135 | 
            +
                "default": 0
         | 
| 136 | 
            +
              }
         | 
| 137 | 
            +
            }
         | 
    	
        checkpoints/base_speakers/ZH/zh_default_se.pth
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:3b62e8264962059b8a84dd00b29e2fcccc92f5d3be90eec67dfa082c0cf58ccf
         | 
| 3 | 
            +
            size 1789
         | 
    	
        checkpoints/converter/checkpoint.pth
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:89ae83aa4e3668fef64b388b789ff7b0ce0def9f801069edfc18a00ea420748d
         | 
| 3 | 
            +
            size 131327338
         | 
    	
        checkpoints/converter/config.json
    ADDED
    
    | @@ -0,0 +1,57 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
              "data": {
         | 
| 3 | 
            +
                "sampling_rate": 22050,
         | 
| 4 | 
            +
                "filter_length": 1024,
         | 
| 5 | 
            +
                "hop_length": 256,
         | 
| 6 | 
            +
                "win_length": 1024,
         | 
| 7 | 
            +
                "n_speakers": 0
         | 
| 8 | 
            +
              },
         | 
| 9 | 
            +
              "model": {
         | 
| 10 | 
            +
                "inter_channels": 192,
         | 
| 11 | 
            +
                "hidden_channels": 192,
         | 
| 12 | 
            +
                "filter_channels": 768,
         | 
| 13 | 
            +
                "n_heads": 2,
         | 
| 14 | 
            +
                "n_layers": 6,
         | 
| 15 | 
            +
                "kernel_size": 3,
         | 
| 16 | 
            +
                "p_dropout": 0.1,
         | 
| 17 | 
            +
                "resblock": "1",
         | 
| 18 | 
            +
                "resblock_kernel_sizes": [
         | 
| 19 | 
            +
                  3,
         | 
| 20 | 
            +
                  7,
         | 
| 21 | 
            +
                  11
         | 
| 22 | 
            +
                ],
         | 
| 23 | 
            +
                "resblock_dilation_sizes": [
         | 
| 24 | 
            +
                  [
         | 
| 25 | 
            +
                    1,
         | 
| 26 | 
            +
                    3,
         | 
| 27 | 
            +
                    5
         | 
| 28 | 
            +
                  ],
         | 
| 29 | 
            +
                  [
         | 
| 30 | 
            +
                    1,
         | 
| 31 | 
            +
                    3,
         | 
| 32 | 
            +
                    5
         | 
| 33 | 
            +
                  ],
         | 
| 34 | 
            +
                  [
         | 
| 35 | 
            +
                    1,
         | 
| 36 | 
            +
                    3,
         | 
| 37 | 
            +
                    5
         | 
| 38 | 
            +
                  ]
         | 
| 39 | 
            +
                ],
         | 
| 40 | 
            +
                "upsample_rates": [
         | 
| 41 | 
            +
                  8,
         | 
| 42 | 
            +
                  8,
         | 
| 43 | 
            +
                  2,
         | 
| 44 | 
            +
                  2
         | 
| 45 | 
            +
                ],
         | 
| 46 | 
            +
                "upsample_initial_channel": 512,
         | 
| 47 | 
            +
                "upsample_kernel_sizes": [
         | 
| 48 | 
            +
                  16,
         | 
| 49 | 
            +
                  16,
         | 
| 50 | 
            +
                  4,
         | 
| 51 | 
            +
                  4
         | 
| 52 | 
            +
                ],
         | 
| 53 | 
            +
                "n_layers_q": 3,
         | 
| 54 | 
            +
                "use_spectral_norm": false,
         | 
| 55 | 
            +
                "gin_channels": 256
         | 
| 56 | 
            +
              }
         | 
| 57 | 
            +
            }
         | 

