File size: 3,909 Bytes
87337b1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
/**
 *
 * Agora Real Time Engagement
 * Created by XinHui Li in 2024.
 * Copyright (c) 2024 Agora IO. All rights reserved.
 *
 */
// An extension written in Go for TTS
package extension

import (
	"bufio"
	"bytes"
	"encoding/hex"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"time"

	"ten_framework/ten"

	"github.com/go-resty/resty/v2"
)

// minimaxTTS is a text-to-speech client for the MiniMax HTTP API. It bundles
// the HTTP client used to issue requests with the settings applied to them.
type minimaxTTS struct {
	// client is the resty HTTP client used to POST synthesis requests.
	client *resty.Client
	// config holds the credentials, endpoint and voice settings for requests.
	config minimaxTTSConfig
}

// minimaxTTSConfig holds the settings used by minimaxTTS when building and
// sending a synthesis request.
type minimaxTTSConfig struct {
	// ApiKey is sent as the bearer token in the Authorization header.
	ApiKey                string
	// GroupId is appended to Url as the "GroupId" query parameter.
	GroupId               string
	// Model is sent as the "model" field of the request body.
	Model                 string
	// RequestTimeoutSeconds is the HTTP client timeout, in seconds.
	RequestTimeoutSeconds int
	// SampleRate is sent as "audio_setting.sample_rate" in the request body.
	SampleRate            int32
	// Url is the endpoint the synthesis request is POSTed to.
	Url                   string
	// VoiceId is sent as "voice_setting.voice_id" in the request body.
	VoiceId               string
}

// defaultMinimaxTTSConfig returns the baseline configuration: empty
// credentials (ApiKey and GroupId), the "speech-01-turbo" model, a 10 second
// request timeout, a 32000 Hz sample rate, the t2a_v2 endpoint and the
// "male-qn-qingse" voice.
func defaultMinimaxTTSConfig() minimaxTTSConfig {
	var cfg minimaxTTSConfig

	// Credentials have no usable default; they stay empty until configured.
	cfg.ApiKey = ""
	cfg.GroupId = ""

	// Endpoint and request behaviour.
	cfg.Url = "https://api.minimax.chat/v1/t2a_v2"
	cfg.RequestTimeoutSeconds = 10

	// Synthesis parameters.
	cfg.Model = "speech-01-turbo"
	cfg.VoiceId = "male-qn-qingse"
	cfg.SampleRate = 32000

	return cfg
}

// newMinimaxTTS builds a minimaxTTS from config. The underlying resty client
// is created with retries disabled and a timeout of
// config.RequestTimeoutSeconds seconds. The returned error is always nil.
func newMinimaxTTS(config minimaxTTSConfig) (*minimaxTTS, error) {
	requestTimeout := time.Duration(config.RequestTimeoutSeconds) * time.Second

	httpClient := resty.New()
	httpClient.SetRetryCount(0)
	httpClient.SetTimeout(requestTimeout)

	tts := &minimaxTTS{
		client: httpClient,
		config: config,
	}
	return tts, nil
}

// textToSpeechStream POSTs text to the configured endpoint with streaming
// enabled (mono "pcm" at config.SampleRate is requested) and writes the
// decoded audio to streamWriter as chunks arrive.
//
// The response body is consumed line by line. Lines starting with "data:" are
// parsed as JSON chunks whose data.audio field is hex-encoded audio; every
// other line is logged at debug level and skipped. Reading stops at end of
// stream or at the first chunk whose data.status is 2.
//
// Parameters:
//   - tenEnv: used only for debug/error logging.
//   - streamWriter: receives the decoded audio bytes, one Write per chunk.
//   - text: the text to synthesize.
//
// It returns a non-nil error when the request fails, the response status is
// not 200, the body cannot be read, a chunk cannot be decoded (JSON or hex),
// or streamWriter.Write fails. Audio written before the failure is not
// rolled back.
func (e *minimaxTTS) textToSpeechStream(tenEnv ten.TenEnv, streamWriter io.Writer, text string) (err error) {
	tenEnv.LogDebug("textToSpeechStream start tts")

	payload := map[string]any{
		"audio_setting": map[string]any{
			"channel":     1,
			"format":      "pcm",
			"sample_rate": e.config.SampleRate,
		},
		"model": e.config.Model,
		"pronunciation_dict": map[string]any{
			"tone": []string{},
		},
		"stream": true,
		"text":   text,
		"voice_setting": map[string]any{
			"pitch":    0,
			"speed":    1.0,
			"voice_id": e.config.VoiceId,
			"vol":      1.0,
		},
	}

	// SetDoNotParseResponse leaves the body unread so it can be streamed below;
	// it also makes this function responsible for closing the raw body.
	resp, err := e.client.R().
		SetHeader("Content-Type", "application/json").
		SetHeader("Authorization", "Bearer "+e.config.ApiKey).
		SetDoNotParseResponse(true).
		SetBody(payload).
		Post(fmt.Sprintf("%s?GroupId=%s", e.config.Url, e.config.GroupId))
	if err != nil {
		tenEnv.LogError(fmt.Sprintf("request failed, err: %v, text: %s", err, text))
		return fmt.Errorf("textToSpeechStream failed, err: %w", err)
	}

	// err is the named result, so this log reflects the value actually
	// returned to the caller.
	defer func() {
		resp.RawBody().Close()

		tenEnv.LogDebug(fmt.Sprintf("textToSpeechStream close response, err: %v, text: %s", err, text))
	}()

	// Check the response status code
	if resp.StatusCode() != http.StatusOK {
		tenEnv.LogError(fmt.Sprintf("unexpected response status: %d", resp.StatusCode()))
		return fmt.Errorf("unexpected response status: %d", resp.StatusCode())
	}

	dataPrefix := []byte("data:")
	reader := bufio.NewReader(resp.RawBody())

	// Every error inside the loop uses its own variable and is returned
	// explicitly. Reusing the name err with := here would shadow the named
	// result and make failures look like success to the caller.
	for eof := false; !eof; {
		line, readErr := reader.ReadBytes('\n')
		if readErr != nil {
			if readErr != io.EOF {
				tenEnv.LogError(fmt.Sprintf("failed to read line: %v", readErr))
				return readErr
			}

			// ReadBytes may return a final unterminated line together with
			// io.EOF; process it before stopping instead of dropping it.
			eof = true
			if len(line) == 0 {
				break
			}
		}

		if !bytes.HasPrefix(line, dataPrefix) {
			tenEnv.LogDebug(fmt.Sprintf("drop chunk, text: %s, line: %s", text, line))
			continue
		}

		var chunk struct {
			Data struct {
				Audio  string `json:"audio"`
				Status int    `json:"status"`
			} `json:"data"`
			TraceId  string `json:"trace_id"`
			BaseResp struct {
				StatusCode int    `json:"status_code"`
				StatusMsg  string `json:"status_msg"`
			} `json:"base_resp"`
		}

		if decodeErr := json.Unmarshal(line[len(dataPrefix):], &chunk); decodeErr != nil {
			tenEnv.LogError(fmt.Sprintf("failed to decode JSON chunk: %v", decodeErr))
			return fmt.Errorf("decode JSON chunk: %w", decodeErr)
		}

		// NOTE(review): status 2 appears to mark the final chunk of the
		// stream; its audio is deliberately not written. Confirm against the
		// MiniMax API documentation.
		if chunk.Data.Status == 2 {
			break
		}

		audioData, hexErr := hex.DecodeString(chunk.Data.Audio)
		if hexErr != nil {
			tenEnv.LogError(fmt.Sprintf("failed to decode audio data: %v, traceId: %s, BaseResp: %v", hexErr, chunk.TraceId, chunk.BaseResp))
			return fmt.Errorf("decode audio data: %w", hexErr)
		}

		if _, writeErr := streamWriter.Write(audioData); writeErr != nil {
			tenEnv.LogError(fmt.Sprintf("failed to write to streamWriter: %v, traceId: %s, BaseResp: %v", writeErr, chunk.TraceId, chunk.BaseResp))
			return fmt.Errorf("write to streamWriter: %w", writeErr)
		}
	}

	return nil
}