Skip to content

添加Azure TTS文本转语音支持和 Grok LLM配置 #785

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 21 additions & 1 deletion main/xiaozhi-server/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -314,6 +314,13 @@ LLM:
bot_id: "你的bot_id"
user_id: "你的user_id"
personal_access_token: 你的coze个人令牌
XaiLLM:
# 定义LLM API类型
type: openai
api_key: 你的Xai web key # xAI API:需要先在xAI平台创建API密钥并获取api_key
model_name: "grok-3-fast-beta"
base_url: "https://api.x.ai/v1"
max_tokens: 131072
LMStudioLLM:
# 定义LLM API类型
type: openai
Expand Down Expand Up @@ -344,7 +351,7 @@ LLM:
model_name: qwen2.5:3b-AWQ # 使用的小模型名称,用于意图识别
base_url: http://localhost:9997 # Xinference服务地址
TTS:
# 当前支持的type为edge、doubao,可自行适配
# 当前支持的type为edge、doubao、azure,可自行适配
EdgeTTS:
# 定义TTS API类型
type: edge
Expand All @@ -365,6 +372,19 @@ TTS:
appid: 你的火山引擎语音合成服务appid
access_token: 你的火山引擎语音合成服务access_token
cluster: volcano_tts
AzureTTS:
# 定义TTS API类型
type: azure
# Azure语音服务订阅密钥,可在Azure门户获取
# 创建资源地址:https://portal.azure.com/#create/Microsoft.CognitiveServicesSpeechServices
subscription_key: 你的Azure语音合成服务密钥
# 服务区域,如eastus、westus等
region: westus
# 语音名称,可选值参考:https://learn.microsoft.com/zh-cn/azure/cognitive-services/speech-service/language-support?tabs=tts
voice_name: zh-CN-XiaochenMultilingualNeural
# 输出格式,可选值参考:https://learn.microsoft.com/zh-cn/azure/cognitive-services/speech-service/rest-text-to-speech?tabs=streaming#audio-outputs
output_format: riff-16khz-16bit-mono-pcm
output_dir: tmp/
CosyVoiceSiliconflow:
type: siliconflow
# 硅基流动TTS
Expand Down
80 changes: 80 additions & 0 deletions main/xiaozhi-server/core/providers/tts/azure.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
import os
import asyncio
import aiohttp
import time
from .base import TTSProviderBase

class TTSProvider(TTSProviderBase):
    """Text-to-speech provider that calls the Azure Speech REST endpoints.

    A bearer token is requested from the regional ``issueToken`` endpoint with
    the configured subscription key and cached on the instance. Synthesis
    requests post an SSML document to the regional TTS endpoint and write the
    returned audio bytes to the requested file.
    """

    def __init__(self, config, delete_audio_file):
        """Read the Azure settings from ``config`` and build the endpoint URLs.

        Args:
            config: Object exposing ``get``; the keys read here are
                ``subscription_key``, ``region``, ``voice_name`` and
                ``output_format``.
            delete_audio_file: Passed through unchanged to the base class.
        """
        super().__init__(config, delete_audio_file)
        self.subscription_key = config.get("subscription_key")
        self.region = config.get("region", "eastus")
        self.voice_name = config.get("voice_name", "zh-CN-YunxiNeural")
        self.output_format = config.get(
            "output_format", "audio-24khz-48kbitrate-mono-mp3"
        )
        self.api_url = (
            f"https://{self.region}.tts.speech.microsoft.com/cognitiveservices/v1"
        )
        self.token_url = (
            f"https://{self.region}.api.cognitive.microsoft.com/sts/v1.0/issueToken"
        )
        # Token cache: an expiry of 0 forces a fetch on first use.
        self.access_token = None
        self.token_expiry = 0

    def generate_filename(self, extension=".wav"):
        """Return a unique audio file path under ``self.output_file``.

        The name is ``azure_tts_`` followed by 4 random bytes in hex and the
        given extension.

        Args:
            extension: File suffix including the leading dot.

        Returns:
            The joined path as a string.
        """
        # ``self.output_file`` is not assigned in this class; presumably the
        # base class sets it to the output directory -- TODO confirm.
        # NOTE(review): the default extension is ".wav" while the default
        # ``output_format`` above is an MP3 format; confirm that callers do not
        # rely on the extension matching the actual audio encoding.
        return os.path.join(
            self.output_file, f"azure_tts_{os.urandom(4).hex()}{extension}"
        )

    async def _get_access_token(self):
        """Return a bearer token for the TTS endpoint, reusing the cached one.

        Returns:
            The token text returned by the ``issueToken`` endpoint.

        Raises:
            Exception: If the request fails or returns a non-200 status. The
                underlying error is attached as ``__cause__``.
        """
        if self.access_token and time.time() < self.token_expiry:
            return self.access_token

        headers = {
            "Ocp-Apim-Subscription-Key": self.subscription_key,
            "Content-Type": "application/x-www-form-urlencoded",
        }

        try:
            async with aiohttp.ClientSession() as session:
                async with session.post(self.token_url, headers=headers) as response:
                    if response.status != 200:
                        error = await response.text()
                        raise Exception(
                            f"获取Azure TTS令牌失败: {response.status} - {error}"
                        )
                    self.access_token = await response.text()
                    # Treat the token as valid for 9 minutes (540 seconds).
                    self.token_expiry = time.time() + 540
                    return self.access_token
        except Exception as e:
            # Keep the original message format, but chain the cause so the
            # underlying traceback is not lost.
            raise Exception(f"获取Azure TTS令牌异常: {e}") from e

    async def text_to_speak(self, text, output_file):
        """Synthesize ``text`` with Azure TTS and write the audio to ``output_file``.

        Args:
            text: Plain text to speak. It is XML-escaped before being embedded
                in the SSML document.
            output_file: Path of the file that receives the audio bytes.

        Raises:
            Exception: If the token cannot be obtained, the HTTP request fails,
                or the service returns a non-200 status.
        """
        # Local import keeps this block self-contained without touching the
        # file-level import list.
        from xml.sax.saxutils import escape, quoteattr

        token = await self._get_access_token()
        headers = {
            "Authorization": f"Bearer {token}",
            "Content-Type": "application/ssml+xml",
            "X-Microsoft-OutputFormat": self.output_format,
            "User-Agent": "xiaozhi-server",
        }

        # Escape the dynamic parts: characters such as "&", "<" or quotes in
        # the text or voice name would otherwise produce malformed SSML.
        ssml = "\n".join(
            (
                "<speak version='1.0' xml:lang='zh-CN'>",
                f"<voice name={quoteattr(str(self.voice_name))}>",
                escape(str(text)),
                "</voice>",
                "</speak>",
            )
        )

        try:
            async with aiohttp.ClientSession() as session:
                async with session.post(
                    self.api_url,
                    headers=headers,
                    data=ssml.encode("utf-8"),
                ) as response:
                    if response.status != 200:
                        error = await response.text()
                        raise Exception(
                            f"Azure TTS请求失败: {response.status} - 错误信息: {error}, 完整响应: {response}"
                        )
                    # Read the whole body before opening the file so a failed
                    # read does not leave an empty or truncated file behind.
                    audio = await response.read()
                    with open(output_file, "wb") as f:
                        f.write(audio)
        except Exception as e:
            raise Exception(f"Azure TTS请求异常: {e}") from e