vumichien committed
Commit c1ec7d2 · 1 Parent(s): 678d0c9

Update main.py

Files changed (1)
  1. main.py +71 -31
main.py CHANGED
@@ -45,6 +45,7 @@ def client_settings_api():
 async def camera_picture_api(
         file: UploadFile = File(...),
         last_seen: Optional[Union[str, UploadFile]] = Form(None),
+        return_voice: Annotated[bool, Form()] = True,
 ):
     # parameters
     total_time = time.time()
@@ -71,12 +72,18 @@ async def camera_picture_api(
 
     # check detect people or not
     if out_img is None:
-        return {
-            "status": "No face detected",
-            "text": None,
-            "voice": None,
-            "image": None
-        }
+        if return_voice:
+            return {
+                "status": "No face detected",
+                "text": None,
+                "voice": None,
+                "image": None
+            }
+        else:
+            return {
+                "status": "No face detected",
+                "image": None
+            }
     else:
         if ZIP:
             image_bot_path = pil_to_base64(out_img, encode=False)
@@ -109,44 +116,77 @@ async def camera_picture_api(
                 headers={"Content-Disposition": f"attachment;filename=%s" % zip_filename}
             )
         else:
-            voice_bot_path = tts(default_bot_voice, language="ja", encode=True)
-            print("Total time", time.time() - total_time)
-            return {
-                "status": "New people",
-                "text": default_bot_voice,
-                "voice": voice_bot_path,
-                "image": image_bot_path
-            }
-    else:
-        print("Total time", time.time() - total_time)
-        return {
-            "status": "Old people",
-            "text": None,
-            "voice": None,
-            "image": image_bot_path,
-        }
+            if return_voice:
+                print("Total time", time.time() - total_time)
+                return {
+                    "status": "New people",
+                    "text": default_bot_voice,
+                    "voice": tts(default_bot_voice, language="ja", encode=True),
+                    "image": image_bot_path
+                }
+            else:
+                print("Total time", time.time() - total_time)
+                return {
+                    "status": "New people",
+                    "image": image_bot_path
+                }
+    else:
+        if return_voice:
+            print("Total time", time.time() - total_time)
+            return {
+                "status": "Old people",
+                "text": None,
+                "voice": None,
+                "image": image_bot_path,
+            }
+        else:
+            print("Total time", time.time() - total_time)
+            return {
+                "status": "Old people",
+                "image": image_bot_path,
+            }
 
 
 @app.post("/human_input/")
 async def human_input_api(
-        input_data: Union[str, bytes],
-        temperature: float = 0.7,
-        max_tokens: int = 1000,
+        voice_input: bytes = File(None),
+        text_input: str = Form(None),
+        temperature: Annotated[float, Form()] = 0.7,
+        max_tokens: Annotated[int, Form()] = 100,
+        return_voice: Annotated[bool, Form()] = False,
 ):
-    print("Input data type", type(input_data))
-    if type(input_data) != str:
-        upload_audio = ffmpeg_read(input_data, sampling_rate=24000)
+    if text_input:
+        text = text_input
+    elif text_input is None and voice_input is not None:
+        upload_audio = ffmpeg_read(voice_input, sampling_rate=24000)
         sf.write('temp.wav', upload_audio, 24000, subtype='PCM_16')
         text = stt('temp.wav')
+        print(text)
     else:
-        text = input_data
+        if return_voice:
+            return {
+                "human_text": None,
+                "robot_text": None,
+                "robot_voice": None
+            }
+        else:
+            return {
+                "human_text": None,
+                "robot_text": None,
+            }
     prompt_msg = {"role": "user", "content": text}
     messages = system_prompt + [prompt_msg]
     completion = openai.ChatCompletion.create(model="gpt-3.5-turbo", messages=messages, temperature=temperature,
                                               max_tokens=max_tokens)
     print(completion['usage']['total_tokens'])
-    return {
-        "human_text": str(text),
-        "robot_text": completion.choices[0].message.content,
-        "robot_voice": tts(completion.choices[0].message.content, language="ja", encode=True)
-    }
+    if return_voice:
+        return {
+            "human_text": text,
+            "robot_text": completion.choices[0].message.content,
+            "robot_voice": tts(completion.choices[0].message.content, language="ja", encode=True)
+        }
+    else:
+        return {
+            "human_text": text,
+            "robot_text": completion.choices[0].message.content,
+        }
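
For context, after this commit /human_input/ takes its inputs as multipart form fields (text_input, voice_input, temperature, max_tokens, return_voice) instead of a single input_data body. Below is a minimal client sketch; the base URL http://localhost:8000 and the file name question.wav are illustrative assumptions, not part of the commit.

import requests

BASE_URL = "http://localhost:8000"  # assumption: where the FastAPI app is served

# Text input as a plain form field, asking for a voice reply as well.
resp = requests.post(
    f"{BASE_URL}/human_input/",
    data={
        "text_input": "こんにちは",
        "temperature": 0.7,
        "max_tokens": 100,
        "return_voice": "true",
    },
)
print(resp.json()["robot_text"])

# Voice input: the raw audio bytes go in the voice_input file part;
# return_voice defaults to False, so the reply carries no audio.
with open("question.wav", "rb") as f:  # placeholder audio file
    resp = requests.post(
        f"{BASE_URL}/human_input/",
        files={"voice_input": f},
    )
print(resp.json())

The camera_picture_api endpoint gains the same return_voice form flag, sent the same way alongside the uploaded image.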