feat: add ElevenLabs TTS API provider

This commit is contained in:
Zayn
2026-06-10 16:47:14 +08:00
committed by GitHub
parent 56d2b3fb55
commit 0b22349363
6 changed files with 268 additions and 0 deletions

View File

@@ -1807,6 +1807,25 @@ CONFIG_METADATA_2 = {
"gemini_tts_voice_name": "Leda",
"proxy": "",
},
"ElevenLabs TTS(API)": {
"hint": "API Key 从 https://elevenlabs.io/app/settings/api-keys 获取。Voice ID 可在 https://elevenlabs.io/app/voice-library 浏览选择。",
"id": "elevenlabs_tts",
"type": "elevenlabs_tts_api",
"provider": "elevenlabs",
"provider_type": "text_to_speech",
"enable": False,
"api_key": "",
"api_base": "https://api.elevenlabs.io/v1",
"model": "eleven_multilingual_v2",
"elevenlabs-tts-voice-id": "JBFqnCBsd6RMkjVDRZzb",
"elevenlabs-tts-output-format": "mp3_44100_128",
"elevenlabs-tts-stability": "",
"elevenlabs-tts-similarity-boost": "",
"elevenlabs-tts-style": "",
"elevenlabs-tts-use-speaker-boost": True,
"timeout": "20",
"proxy": "",
},
"OpenAI Embedding": {
"id": "openai_embedding",
"type": "openai_embedding",

View File

@@ -467,6 +467,10 @@ class ProviderManager:
from .sources.gemini_tts_source import (
ProviderGeminiTTSAPI as ProviderGeminiTTSAPI,
)
case "elevenlabs_tts_api":
from .sources.elevenlabs_tts_source import (
ProviderElevenLabsTTSAPI as ProviderElevenLabsTTSAPI,
)
case "openai_embedding":
from .sources.openai_embedding_source import (
OpenAIEmbeddingProvider as OpenAIEmbeddingProvider,

View File

@@ -0,0 +1,173 @@
import uuid
from pathlib import Path
import httpx
from astrbot import logger
from astrbot.core.utils.astrbot_path import get_astrbot_temp_path
from ..entities import ProviderType
from ..provider import TTSProvider
from ..register import register_provider_adapter
# Output-format prefixes this provider accepts; the configured format must
# start with one of them.
SUPPORTED_CONTAINER_OUTPUT_PREFIXES = (
    "mp3",
    "wav",
    "opus",
)
# Prefixes of ElevenLabs raw audio formats; these are rejected with a dedicated
# error message during output-format validation.
RAW_AUDIO_OUTPUT_PREFIXES = (
    "pcm",
    "ulaw",
    "alaw",
)
def _parse_optional_float(
provider_config: dict,
cfg_name: str,
) -> float | None:
value = provider_config.get(cfg_name, "")
if value in ("", None):
return None
try:
parsed = float(value)
except (TypeError, ValueError) as exc:
raise ValueError(f"{cfg_name} must be a number between 0 and 1.") from exc
if not 0 <= parsed <= 1:
raise ValueError(f"{cfg_name} must be between 0 and 1.")
return parsed
def _parse_bool(provider_config: dict, cfg_name: str) -> bool:
    """Interpret the config entry ``cfg_name`` as a boolean.

    Args:
        provider_config: Provider configuration mapping; must contain
            ``cfg_name`` (a missing key raises ``KeyError``).
        cfg_name: Key of the setting to read.

    Returns:
        The boolean meaning of the stored value. Integers follow Python
        truthiness; strings are matched case-insensitively, ignoring
        surrounding whitespace, against a fixed set of true/false words.

    Raises:
        ValueError: If the value cannot be interpreted as a boolean.
    """
    raw = provider_config[cfg_name]
    # bool is a subclass of int, so a single isinstance check covers both and
    # bool() leaves real booleans unchanged.
    if isinstance(raw, int):
        return bool(raw)
    if isinstance(raw, str):
        words = {
            **dict.fromkeys(("true", "1", "yes", "y", "on"), True),
            **dict.fromkeys(("false", "0", "no", "n", "off"), False),
        }
        token = raw.strip().lower()
        if token in words:
            return words[token]
    raise ValueError(f"{cfg_name} must be a boolean value.")
def _normalize_timeout(value: int | str | None) -> int:
if value in ("", None):
return 20
try:
timeout = int(value)
except (TypeError, ValueError) as exc:
raise ValueError("timeout must be a positive integer.") from exc
if timeout <= 0:
raise ValueError("timeout must be a positive integer.")
return timeout
def _validate_output_format(output_format: str) -> None:
    """Check that ``output_format`` is one this provider accepts.

    The comparison is case-insensitive and prefix-based. Raw audio formats are
    checked first so they get their own, more specific error message.

    Args:
        output_format: ElevenLabs output format identifier.

    Raises:
        ValueError: If the format is a raw audio format or does not start
            with a supported prefix.
    """
    normalized = output_format.lower()
    is_raw = normalized.startswith(RAW_AUDIO_OUTPUT_PREFIXES)
    if is_raw:
        raise ValueError(
            "ElevenLabs raw audio output formats are not supported by this provider. "
            "Use an mp3, wav, or opus output format instead."
        )
    if normalized.startswith(SUPPORTED_CONTAINER_OUTPUT_PREFIXES):
        return
    raise ValueError(
        "Unsupported ElevenLabs output format. "
        "Use an mp3, wav, or opus output format."
    )
@register_provider_adapter(
    "elevenlabs_tts_api",
    "ElevenLabs TTS API",
    provider_type=ProviderType.TEXT_TO_SPEECH,
)
class ProviderElevenLabsTTSAPI(TTSProvider):
    """Text-to-speech provider that calls the ElevenLabs HTTP API.

    Each synthesis request is POSTed to
    ``{api_base}/text-to-speech/{voice_id}`` and the response body is written
    to a file under the directory returned by ``get_astrbot_temp_path()``.
    """

    def __init__(
        self,
        provider_config: dict,
        provider_settings: dict,
    ) -> None:
        """Read the provider configuration and create the HTTP client.

        Args:
            provider_config: Provider configuration mapping (API key, base
                URL, voice, model, output format, voice settings, timeout,
                proxy).
            provider_settings: Passed through to the base class.

        Raises:
            ValueError: If the output format, a voice setting, the speaker
                boost flag, or the timeout is invalid.
        """
        super().__init__(provider_config, provider_settings)
        self.api_key = provider_config.get("api_key", "")
        # `dict.get(key, default)` only falls back when the key is absent. A
        # key that is present but blank or null would otherwise produce a
        # relative URL, an empty voice ID or an empty model, so blank values
        # fall back to the defaults as well. All trailing slashes are removed
        # so the URL built in get_audio() never contains "//".
        self.api_base = (
            provider_config.get("api_base") or "https://api.elevenlabs.io/v1"
        ).rstrip("/")
        self.voice_id = (
            provider_config.get("elevenlabs-tts-voice-id") or "JBFqnCBsd6RMkjVDRZzb"
        )
        self.model_id = provider_config.get("model") or "eleven_multilingual_v2"
        self.set_model(self.model_id)
        self.output_format = (
            provider_config.get("elevenlabs-tts-output-format") or "mp3_44100_128"
        )
        _validate_output_format(self.output_format)

        # Only send explicitly configured voice settings so the API can apply
        # its own defaults for everything else.
        self.voice_settings: dict = {}
        for key, cfg_name in (
            ("stability", "elevenlabs-tts-stability"),
            ("similarity_boost", "elevenlabs-tts-similarity-boost"),
            ("style", "elevenlabs-tts-style"),
        ):
            value = _parse_optional_float(provider_config, cfg_name)
            if value is not None:
                self.voice_settings[key] = value
        if "elevenlabs-tts-use-speaker-boost" in provider_config:
            self.voice_settings["use_speaker_boost"] = _parse_bool(
                provider_config,
                "elevenlabs-tts-use-speaker-boost",
            )

        timeout = _normalize_timeout(provider_config.get("timeout", 20))
        proxy = provider_config.get("proxy", "")
        if proxy:
            logger.info(f"[ElevenLabs TTS] 使用代理: {proxy}")
        # trust_env=False keeps httpx from picking up proxy settings from the
        # environment; only the explicitly configured proxy is used.
        self.client = httpx.AsyncClient(
            timeout=timeout,
            proxy=proxy or None,
            trust_env=False,
        )

    def _output_extension(self) -> str:
        """Infer the audio file extension from the configured output format."""
        fmt = self.output_format.lower()
        for extension in ("mp3", "opus", "wav"):
            if fmt.startswith(extension):
                return extension
        # Formats that passed validation in __init__ always match one of the
        # prefixes above; this is only a fallback.
        return "mp3"

    async def get_audio(self, text: str) -> str:
        """Synthesize ``text`` and return the path of the saved audio file.

        Args:
            text: Text to convert to speech.

        Returns:
            Path of the newly written audio file, as a string.

        Raises:
            Exception: If the API answers with a status other than 200. The
                message contains the status code and up to the first 1024
                characters of the response body.
        """
        url = f"{self.api_base}/text-to-speech/{self.voice_id}"
        headers = {
            "xi-api-key": self.api_key,
            "Content-Type": "application/json",
        }
        payload: dict = {
            "text": text,
            # NOTE(review): model_name is presumably populated by set_model()
            # in the base class - confirm against TTSProvider.
            "model_id": self.model_name,
        }
        if self.voice_settings:
            payload["voice_settings"] = self.voice_settings

        response = await self.client.post(
            url,
            headers=headers,
            params={"output_format": self.output_format},
            json=payload,
        )
        if response.status_code != 200:
            # Truncate the body so an unexpectedly large error page does not
            # flood the logs.
            error_text = response.text[:1024]
            raise Exception(
                f"ElevenLabs TTS API 请求失败: {response.status_code}, {error_text}"
            )

        temp_dir = Path(get_astrbot_temp_path())
        temp_dir.mkdir(parents=True, exist_ok=True)
        # A random UUID in the file name keeps concurrent requests from
        # overwriting each other's output.
        path = (
            temp_dir / f"elevenlabs_tts_api_{uuid.uuid4()}.{self._output_extension()}"
        )
        path.write_bytes(response.content)
        return str(path)

    async def terminate(self):
        """Close the underlying HTTP client."""
        if self.client:
            await self.client.aclose()

View File

@@ -1590,6 +1590,30 @@
"description": "voice",
"hint": "OpenAI TTS voice. OpenAI defaults: 'alloy', 'echo', 'fable', 'onyx', 'nova', 'shimmer'."
},
"elevenlabs-tts-voice-id": {
"description": "Voice ID",
"hint": "ElevenLabs voice ID. Browse and copy voice IDs at https://elevenlabs.io/app/voice-library. Default 'JBFqnCBsd6RMkjVDRZzb' (George)."
},
"elevenlabs-tts-output-format": {
"description": "Output format",
"hint": "Audio output format, e.g. 'mp3_44100_128', 'mp3_22050_32', 'wav_44100', or 'opus_48000_128'. Raw PCM/u-law/a-law formats are not supported. Default 'mp3_44100_128'."
},
"elevenlabs-tts-stability": {
"description": "Stability",
"hint": "Voice stability, range [0, 1]. Higher is more consistent, lower is more expressive. Leave empty to use the server default."
},
"elevenlabs-tts-similarity-boost": {
"description": "Similarity boost",
"hint": "How closely the output matches the original voice, range [0, 1]. Leave empty to use the server default."
},
"elevenlabs-tts-style": {
"description": "Style exaggeration",
"hint": "Style exaggeration of the voice, range [0, 1]. Higher values increase latency. Leave empty to use the server default."
},
"elevenlabs-tts-use-speaker-boost": {
"description": "Speaker boost",
"hint": "Boost similarity to the original speaker. May slightly increase latency."
},
"mimo-tts-voice": {
"description": "Voice",
"hint": "MiMo TTS voice name. Supported values include 'mimo_default', 'default_en', and 'default_zh'."

View File

@@ -1587,6 +1587,30 @@
"description": "API Base URL",
"hint": "Голоса OpenAI TTS: alloy, echo и др."
},
"elevenlabs-tts-voice-id": {
"description": "ID голоса",
"hint": "ID голоса ElevenLabs. Просмотрите и скопируйте ID на https://elevenlabs.io/app/voice-library. По умолчанию 'JBFqnCBsd6RMkjVDRZzb' (George)."
},
"elevenlabs-tts-output-format": {
"description": "Формат вывода",
"hint": "Формат аудио, например 'mp3_44100_128', 'mp3_22050_32', 'wav_44100' или 'opus_48000_128'. Raw PCM/u-law/a-law форматы не поддерживаются. По умолчанию 'mp3_44100_128'."
},
"elevenlabs-tts-stability": {
"description": "Стабильность",
"hint": "Стабильность голоса, диапазон [0, 1]. Оставьте пустым для значения по умолчанию."
},
"elevenlabs-tts-similarity-boost": {
"description": "Усиление сходства",
"hint": "Насколько вывод соответствует исходному голосу, диапазон [0, 1]. Оставьте пустым для значения по умолчанию."
},
"elevenlabs-tts-style": {
"description": "Выразительность стиля",
"hint": "Выразительность стиля голоса, диапазон [0, 1]. Высокие значения увеличивают задержку. Оставьте пустым для значения по умолчанию."
},
"elevenlabs-tts-use-speaker-boost": {
"description": "Усиление диктора",
"hint": "Усиливает сходство с исходным диктором. Может немного увеличить задержку."
},
"mimo-tts-voice": {
"description": "Голос",
"hint": "Имя голоса MiMo TTS. Поддерживаются значения 'mimo_default', 'default_en' и 'default_zh'."

View File

@@ -1592,6 +1592,30 @@
"description": "voice",
"hint": "OpenAI TTS 的声音。OpenAI 默认支持:'alloy', 'echo', 'fable', 'onyx', 'nova', 'shimmer'"
},
"elevenlabs-tts-voice-id": {
"description": "音色 ID",
"hint": "ElevenLabs 音色 ID。可在 https://elevenlabs.io/app/voice-library 浏览并复制音色 ID。默认 'JBFqnCBsd6RMkjVDRZzb' (George)。"
},
"elevenlabs-tts-output-format": {
"description": "输出格式",
"hint": "音频输出格式,例如 'mp3_44100_128'、'mp3_22050_32'、'wav_44100'、'opus_48000_128'。不支持裸 PCM/u-law/a-law 格式。默认 'mp3_44100_128'。"
},
"elevenlabs-tts-stability": {
"description": "稳定性",
"hint": "音色稳定性,范围 [0, 1]。值越高越稳定,越低越富有表现力。留空则使用服务端默认值。"
},
"elevenlabs-tts-similarity-boost": {
"description": "相似度增强",
"hint": "输出与原始音色的接近程度,范围 [0, 1]。留空则使用服务端默认值。"
},
"elevenlabs-tts-style": {
"description": "风格夸张度",
"hint": "音色风格的夸张程度,范围 [0, 1]。值越高延迟越大。留空则使用服务端默认值。"
},
"elevenlabs-tts-use-speaker-boost": {
"description": "说话人增强",
"hint": "增强与原始说话人的相似度,可能略微增加延迟。"
},
"mimo-tts-voice": {
"description": "音色",
"hint": "MiMo TTS 的音色名称。可选值包括 'mimo_default'、'default_en'、'default_zh'。"