Compare commits

...

1 Commits

Author SHA1 Message Date
Soulter
5ddd8ae1f1 feat: support stt selfhost provider 2026-04-24 14:35:34 +08:00
7 changed files with 196 additions and 0 deletions

View File

@@ -1588,6 +1588,15 @@ CONFIG_METADATA_2 = {
"stt_model": "iic/SenseVoiceSmall",
"is_emotion": False,
},
"Faster Whisper(Local)": {
"type": "faster_whisper_stt_selfhost",
"provider": "faster-whisper",
"provider_type": "speech_to_text",
"enable": False,
"id": "faster_whisper",
"model": "small",
"faster_whisper_device": "auto",
},
"OpenAI TTS(API)": {
"id": "openai_tts",
"type": "openai_tts_api",
@@ -2489,6 +2498,11 @@ CONFIG_METADATA_2 = {
"type": "string",
"hint": "启用前请 pip 安装 funasr、funasr_onnx、torchaudio、torch、modelscope、jieba 库默认使用CPU大约下载 1 GB并且安装 ffmpeg。否则将无法正常转文字。",
},
"faster_whisper_hint": {
"description": "部署 faster-whisper",
"type": "string",
"hint": "启用前请 pip 安装 faster-whisper。CPU 可直接使用NVIDIA GPU 需要额外准备 CTranslate2 对应的 CUDA/cuDNN 运行库。模型会在首次加载时自动下载到 Hugging Face 缓存目录或你指定的下载目录。",
},
"is_emotion": {
"description": "情绪识别",
"type": "bool",
@@ -2583,6 +2597,12 @@ CONFIG_METADATA_2 = {
"hint": "Whisper 推理设备。Apple Silicon 可选 mps其他环境建议使用 cpu。若指定 mps 但当前环境不可用,将自动回退到 cpu。",
"options": ["cpu", "mps"],
},
"faster_whisper_device": {
"description": "推理设备",
"type": "string",
"hint": "faster-whisper 推理设备。可选 auto、cpu、cuda。通常建议保留 auto让底层自行选择。",
"options": ["auto", "cpu", "cuda"],
},
"id": {
"description": "ID",
"type": "string",

View File

@@ -397,6 +397,10 @@ class ProviderManager:
from .sources.sensevoice_selfhosted_source import (
ProviderSenseVoiceSTTSelfHost as ProviderSenseVoiceSTTSelfHost,
)
case "faster_whisper_stt_selfhost":
from .sources.faster_whisper_selfhosted_source import (
ProviderFasterWhisperSTTSelfHost as ProviderFasterWhisperSTTSelfHost,
)
case "openai_whisper_api":
from .sources.whisper_api_source import (
ProviderOpenAIWhisperAPI as ProviderOpenAIWhisperAPI,

View File

@@ -0,0 +1,147 @@
import asyncio
import importlib
import uuid
from functools import partial
from pathlib import Path
from typing import Any, cast
from urllib.parse import urlparse
from astrbot.core import logger
from astrbot.core.utils.astrbot_path import get_astrbot_temp_path
from astrbot.core.utils.io import download_file
from astrbot.core.utils.tencent_record_helper import (
convert_to_pcm_wav,
tencent_silk_to_wav,
)
from ..entities import ProviderType
from ..provider import STTProvider
from ..register import register_provider_adapter
@register_provider_adapter(
    "faster_whisper_stt_selfhost",
    "faster-whisper 模型部署",
    provider_type=ProviderType.SPEECH_TO_TEXT,
)
class ProviderFasterWhisperSTTSelfHost(STTProvider):
    """Speech-to-text provider that runs a faster-whisper model in-process.

    The model is loaded lazily by :meth:`initialize`; :meth:`get_text`
    accepts either a local file path or an ``http(s)`` URL, converts
    silk/amr recordings to wav when needed, and returns the transcription.
    """

    def __init__(
        self,
        provider_config: dict,
        provider_settings: dict,
    ) -> None:
        """Store the configured model name and inference device.

        Args:
            provider_config: Provider configuration. ``model`` is required;
                ``faster_whisper_device`` is optional and defaults to
                ``"auto"``.
            provider_settings: Forwarded unchanged to the base class.

        Raises:
            KeyError: If ``provider_config`` has no ``model`` entry.
        """
        super().__init__(provider_config, provider_settings)
        self.set_model(provider_config["model"])
        # A null, empty or whitespace-only device would otherwise be handed
        # to the model constructor as "None" / ""; fall back to "auto".
        configured_device = provider_config.get("faster_whisper_device")
        self.device = str(configured_device or "auto").strip() or "auto"
        # Populated by initialize(); None means "not loaded yet".
        self.model: Any = None

    async def initialize(self) -> None:
        """Load the faster-whisper model in the default executor.

        ``faster_whisper`` is imported inside the worker so the dependency is
        only required when this provider is actually initialized.
        """
        loop = asyncio.get_running_loop()

        def _load_model() -> Any:
            faster_whisper = importlib.import_module("faster_whisper")
            whisper_model_cls = faster_whisper.WhisperModel
            # NOTE(review): self.model_name is presumably set by set_model()
            # in the base class — confirm against STTProvider.
            return whisper_model_cls(
                self.model_name,
                device=self.device,
            )

        logger.info("下载或者加载 faster-whisper 模型中,这可能需要一些时间 ...")
        self.model = await loop.run_in_executor(None, _load_model)
        logger.info(
            "faster-whisper 模型加载完成。device=%s",
            self.device,
        )

    def _get_temp_dir(self) -> Path:
        """Return the AstrBot temp directory, creating it if necessary."""
        temp_dir = Path(get_astrbot_temp_path())
        temp_dir.mkdir(parents=True, exist_ok=True)
        return temp_dir

    async def _detect_audio_format(self, file_path: Path) -> str | None:
        """Sniff the first 8 bytes of ``file_path`` for a silk/amr signature.

        Returns:
            ``"silk"`` or ``"amr"`` when the matching marker is found in the
            header, otherwise ``None`` (including when the file is missing).
        """
        try:
            with file_path.open("rb") as file:
                file_header = file.read(8)
        except FileNotFoundError:
            return None
        if b"SILK" in file_header:
            return "silk"
        if b"#!AMR" in file_header:
            return "amr"
        return None

    @staticmethod
    def _remove_temp_files(paths: list[Path]) -> None:
        """Best-effort removal of temporary files; failures are only logged."""
        for path in paths:
            try:
                path.unlink(missing_ok=True)
            except Exception as exc:
                logger.warning(
                    "Failed to remove temporary faster-whisper file %s: %s",
                    path,
                    exc,
                )

    async def _materialize_audio(
        self,
        audio_url: str,
        cleanup_paths: list[Path],
    ) -> Path:
        """Download and/or convert the audio, recording temp files as it goes.

        Every temporary path is appended to ``cleanup_paths`` *before* the
        file is produced, so the caller can clean up even if a later step
        fails part-way through.
        """
        source_path = Path(audio_url)
        is_remote = audio_url.startswith(("http://", "https://"))
        is_tencent = is_remote and "multimedia.nt.qq.com.cn" in audio_url

        if is_remote:
            parsed_url = urlparse(audio_url)
            suffix = Path(parsed_url.path).suffix or ".input"
            download_path = (
                self._get_temp_dir()
                / f"faster_whisper_selfhost_{uuid.uuid4().hex[:8]}{suffix}"
            )
            cleanup_paths.append(download_path)
            await download_file(audio_url, str(download_path))
            source_path = download_path

        if not source_path.exists():
            raise FileNotFoundError(f"文件不存在: {source_path}")

        # Only sniff the header for files that may be silk/amr: by extension,
        # or because they come from the Tencent media host.
        if source_path.suffix.lower() in {".amr", ".silk"} or is_tencent:
            file_format = await self._detect_audio_format(source_path)
            if file_format in {"silk", "amr"}:
                converted_path = (
                    self._get_temp_dir()
                    / f"faster_whisper_selfhost_{uuid.uuid4().hex[:8]}.wav"
                )
                cleanup_paths.append(converted_path)
                if file_format == "silk":
                    logger.info("Converting silk file to wav ...")
                    await tencent_silk_to_wav(str(source_path), str(converted_path))
                else:
                    logger.info("Converting amr file to wav ...")
                    await convert_to_pcm_wav(str(source_path), str(converted_path))
                source_path = converted_path

        return source_path

    async def _prepare_audio_input(self, audio_url: str) -> tuple[Path, list[Path]]:
        """Resolve ``audio_url`` to a local file ready for transcription.

        Returns:
            A tuple ``(audio_path, cleanup_paths)`` where ``cleanup_paths``
            lists the temporary files the caller must delete afterwards.

        Raises:
            FileNotFoundError: If the (downloaded or local) file is missing.
        """
        cleanup_paths: list[Path] = []
        try:
            source_path = await self._materialize_audio(audio_url, cleanup_paths)
        except BaseException:
            # On failure the caller never receives cleanup_paths, so the
            # temp files must be removed here. BaseException is caught so
            # that task cancellation is covered too.
            self._remove_temp_files(cleanup_paths)
            raise
        return source_path, cleanup_paths

    def _transcribe_audio(self, audio_path: Path) -> str:
        """Run the loaded model on ``audio_path`` and return the joined text.

        Raises:
            RuntimeError: If :meth:`initialize` has not loaded the model yet.
        """
        if self.model is None:
            raise RuntimeError("faster-whisper 模型未初始化")
        segments, info = self.model.transcribe(str(audio_path))
        text = "".join(segment.text for segment in segments).strip()
        logger.debug(
            "faster-whisper transcription completed. language=%s, text=%s",
            getattr(info, "language", None),
            text,
        )
        return text

    async def get_text(self, audio_url: str) -> str:
        """Transcribe the audio at ``audio_url`` (local path or http(s) URL).

        Transcription runs in the default executor. Temporary files created
        while preparing the input are removed afterwards, whether or not
        transcription succeeds.
        """
        loop = asyncio.get_running_loop()
        audio_path, cleanup_paths = await self._prepare_audio_input(audio_url)
        try:
            return await loop.run_in_executor(
                None,
                partial(self._transcribe_audio, audio_path),
            )
        finally:
            self._remove_temp_files(cleanup_paths)

View File

@@ -329,6 +329,7 @@ export function useProviderSources(options: UseProviderSourcesOptions) {
mimo_stt_api: 'speech_to_text',
openai_whisper_selfhost: 'speech_to_text',
sensevoice_stt_selfhost: 'speech_to_text',
faster_whisper_stt_selfhost: 'speech_to_text',
openai_tts_api: 'text_to_speech',
mimo_tts_api: 'text_to_speech',
edge_tts: 'text_to_speech',

View File

@@ -1462,6 +1462,10 @@
"description": "Deploy SenseVoice",
"hint": "Before enabling, install funasr, funasr_onnx, torchaudio, torch, modelscope, and jieba (CPU by default, about 1 GB download), and install ffmpeg. Otherwise STT will not work."
},
"faster_whisper_hint": {
"description": "Deploy faster-whisper",
"hint": "Before enabling, install faster-whisper. CPU inference works directly; NVIDIA GPU inference also requires the matching CUDA/cuDNN runtime libraries for CTranslate2."
},
"is_emotion": {
"description": "Emotion recognition",
"hint": "Enable emotion recognition. happy?sad?angry?neutral?fearful?disgusted?surprised?unknown"
@@ -1530,6 +1534,10 @@
"description": "Inference device",
"hint": "Whisper inference device. Apple Silicon can use mps; other environments should use cpu. If mps is selected but unavailable, AstrBot will fall back to cpu."
},
"faster_whisper_device": {
"description": "Inference device",
"hint": "faster-whisper inference device. Supported values are auto, cpu, and cuda. In most cases keeping auto is the safest choice."
},
"id": {
"description": "ID"
},

View File

@@ -1459,6 +1459,10 @@
"description": "Развертывание SenseVoice",
"hint": "Перед включением установите необходимые библиотеки: funasr, torch и др. Также требуется ffmpeg."
},
"faster_whisper_hint": {
"description": "Развертывание faster-whisper",
"hint": "Перед включением установите faster-whisper. CPU-режим работает сразу; для NVIDIA GPU дополнительно потребуются совместимые CUDA/cuDNN runtime-библиотеки для CTranslate2."
},
"is_emotion": {
"description": "Распознавание эмоций",
"hint": "Включить распознавание эмоций (радость, грусть, гнев и т.д.)."
@@ -1527,6 +1531,10 @@
"description": "Устройство инференса",
"hint": "Устройство для инференса Whisper. На Apple Silicon можно выбрать mps; в остальных средах рекомендуется cpu. Если выбран mps, но он недоступен, AstrBot автоматически переключится на cpu."
},
"faster_whisper_device": {
"description": "Устройство инференса",
"hint": "Устройство инференса faster-whisper. Поддерживаются значения auto, cpu и cuda. В большинстве случаев лучше оставить auto."
},
"id": {
"description": "ID провайдера"
},

View File

@@ -1464,6 +1464,10 @@
"description": "部署SenseVoice",
"hint": "启用前请 pip 安装 funasr、funasr_onnx、torchaudio、torch、modelscope、jieba 库默认使用CPU大约下载 1 GB并且安装 ffmpeg。否则将无法正常转文字。"
},
"faster_whisper_hint": {
"description": "部署 faster-whisper",
"hint": "启用前请 pip 安装 faster-whisper。CPU 可直接使用NVIDIA GPU 需要额外准备 CTranslate2 对应的 CUDA/cuDNN 运行库。"
},
"is_emotion": {
"description": "情绪识别",
"hint": "是否开启情绪识别。happysadangryneutralfearfuldisgustedsurprisedunknown"
@@ -1532,6 +1536,10 @@
"description": "推理设备",
"hint": "Whisper 推理设备。Apple Silicon 可选 mps其他环境建议使用 cpu。若指定 mps 但当前环境不可用,将自动回退到 cpu。"
},
"faster_whisper_device": {
"description": "推理设备",
"hint": "faster-whisper 推理设备。可选 auto、cpu、cuda。通常建议保留 auto让底层自行选择。"
},
"id": {
"description": "ID"
},