feat: support stt selfhost provider

2026-07-02 10:40:15 +08:00 · 2026-04-24 14:35:34 +08:00
7 changed files with 196 additions and 0 deletions
--- a/astrbot/core/config/default.py
+++ b/astrbot/core/config/default.py
@@ -1588,6 +1588,15 @@ CONFIG_METADATA_2 = {
                        "stt_model": "iic/SenseVoiceSmall",
                        "is_emotion": False,
                    },
+                    "Faster Whisper(Local)": {
+                        "type": "faster_whisper_stt_selfhost",
+                        "provider": "faster-whisper",
+                        "provider_type": "speech_to_text",
+                        "enable": False,
+                        "id": "faster_whisper",
+                        "model": "small",
+                        "faster_whisper_device": "auto",
+                    },
                    "OpenAI TTS(API)": {
                        "id": "openai_tts",
                        "type": "openai_tts_api",
@@ -2489,6 +2498,11 @@ CONFIG_METADATA_2 = {
                        "type": "string",
                        "hint": "启用前请 pip 安装 funasr、funasr_onnx、torchaudio、torch、modelscope、jieba 库（默认使用CPU，大约下载 1 GB），并且安装 ffmpeg。否则将无法正常转文字。",
                    },
+                    "faster_whisper_hint": {
+                        "description": "部署 faster-whisper",
+                        "type": "string",
+                        "hint": "启用前请 pip 安装 faster-whisper。CPU 可直接使用；NVIDIA GPU 需要额外准备 CTranslate2 对应的 CUDA/cuDNN 运行库。模型会在首次加载时自动下载到 Hugging Face 缓存目录或你指定的下载目录。",
+                    },
                    "is_emotion": {
                        "description": "情绪识别",
                        "type": "bool",
@@ -2583,6 +2597,12 @@ CONFIG_METADATA_2 = {
                        "hint": "Whisper 推理设备。Apple Silicon 可选 mps；其他环境建议使用 cpu。若指定 mps 但当前环境不可用，将自动回退到 cpu。",
                        "options": ["cpu", "mps"],
                    },
+                    "faster_whisper_device": {
+                        "description": "推理设备",
+                        "type": "string",
+                        "hint": "faster-whisper 推理设备。可选 auto、cpu、cuda。通常建议保留 auto，让底层自行选择。",
+                        "options": ["auto", "cpu", "cuda"],
+                    },
                    "id": {
                        "description": "ID",
                        "type": "string",
--- a/astrbot/core/provider/manager.py
+++ b/astrbot/core/provider/manager.py
@@ -397,6 +397,10 @@ class ProviderManager:
                from .sources.sensevoice_selfhosted_source import (
                    ProviderSenseVoiceSTTSelfHost as ProviderSenseVoiceSTTSelfHost,
                )
+            case "faster_whisper_stt_selfhost":
+                from .sources.faster_whisper_selfhosted_source import (
+                    ProviderFasterWhisperSTTSelfHost as ProviderFasterWhisperSTTSelfHost,
+                )
            case "openai_whisper_api":
                from .sources.whisper_api_source import (
                    ProviderOpenAIWhisperAPI as ProviderOpenAIWhisperAPI,
--- a/astrbot/core/provider/sources/faster_whisper_selfhosted_source.py
+++ b/astrbot/core/provider/sources/faster_whisper_selfhosted_source.py
@@ -0,0 +1,221 @@
+import asyncio
+import importlib
+import uuid
+from functools import partial
+from pathlib import Path
+from typing import Any, cast
+from urllib.parse import urlparse
+
+from astrbot.core import logger
+from astrbot.core.utils.astrbot_path import get_astrbot_temp_path
+from astrbot.core.utils.io import download_file
+from astrbot.core.utils.tencent_record_helper import (
+    convert_to_pcm_wav,
+    tencent_silk_to_wav,
+)
+
+from ..entities import ProviderType
+from ..provider import STTProvider
+from ..register import register_provider_adapter
+
+
+@register_provider_adapter(
+    "faster_whisper_stt_selfhost",
+    "faster-whisper 模型部署",
+    provider_type=ProviderType.SPEECH_TO_TEXT,
+)
+class ProviderFasterWhisperSTTSelfHost(STTProvider):
+    """Speech-to-text provider backed by a locally loaded faster-whisper model."""
+
+    def __init__(
+        self,
+        provider_config: dict,
+        provider_settings: dict,
+    ) -> None:
+        """Store the configured model name and inference device.
+
+        The model itself is not loaded here; ``initialize`` does that.
+
+        Args:
+            provider_config: Provider entry. ``model`` is required;
+                ``faster_whisper_device`` defaults to ``"auto"``.
+            provider_settings: Forwarded unchanged to the base class.
+        """
+        super().__init__(provider_config, provider_settings)
+        self.set_model(provider_config["model"])
+        # A missing, null or blank device falls back to "auto" instead of
+        # reaching the backend as an empty or "None" string.
+        configured_device = provider_config.get("faster_whisper_device") or "auto"
+        self.device = str(configured_device).strip() or "auto"
+        self.model: Any = None
+
+    async def initialize(self) -> None:
+        """Load the faster-whisper model in the default executor.
+
+        ``faster_whisper`` is imported inside the worker, so a missing package
+        surfaces when this method runs rather than when the module is imported.
+        """
+        loop = asyncio.get_running_loop()
+
+        def _load_model() -> Any:
+            faster_whisper = importlib.import_module("faster_whisper")
+            whisper_model_cls = faster_whisper.WhisperModel
+            return whisper_model_cls(
+                self.model_name,
+                device=self.device,
+            )
+
+        logger.info("下载或者加载 faster-whisper 模型中，这可能需要一些时间 ...")
+        self.model = await loop.run_in_executor(None, _load_model)
+        logger.info(
+            "faster-whisper 模型加载完成。device=%s",
+            self.device,
+        )
+
+    def _get_temp_dir(self) -> Path:
+        """Return the AstrBot temp directory, creating it if necessary."""
+        temp_dir = Path(get_astrbot_temp_path())
+        temp_dir.mkdir(parents=True, exist_ok=True)
+        return temp_dir
+
+    def _remove_temp_files(self, paths: list[Path]) -> None:
+        """Delete temporary files best-effort; failures are logged, not raised."""
+        for path in paths:
+            try:
+                path.unlink(missing_ok=True)
+            except OSError as exc:
+                logger.warning(
+                    "Failed to remove temporary faster-whisper file %s: %s",
+                    path,
+                    exc,
+                )
+
+    async def _detect_audio_format(self, file_path: Path) -> str | None:
+        """Sniff the first 8 bytes of ``file_path`` for a SILK or AMR marker.
+
+        Returns:
+            ``"silk"``, ``"amr"``, or ``None`` when neither marker is found
+            or the file does not exist.
+        """
+        try:
+            with file_path.open("rb") as file:
+                file_header = file.read(8)
+        except FileNotFoundError:
+            return None
+
+        if b"SILK" in file_header:
+            return "silk"
+        if b"#!AMR" in file_header:
+            return "amr"
+        return None
+
+    async def _prepare_audio_input(self, audio_url: str) -> tuple[Path, list[Path]]:
+        """Turn ``audio_url`` into a local file the model can read.
+
+        HTTP(S) URLs are downloaded into the temp directory, and SILK/AMR
+        recordings are converted to WAV.
+
+        Args:
+            audio_url: Local file path or ``http://`` / ``https://`` URL.
+
+        Returns:
+            The path to transcribe, and the temporary files the caller must
+            delete once transcription is done.
+
+        Raises:
+            FileNotFoundError: If the resolved source file does not exist.
+        """
+        cleanup_paths: list[Path] = []
+        try:
+            source_path = Path(audio_url)
+            is_remote = audio_url.startswith(("http://", "https://"))
+            is_tencent = is_remote and "multimedia.nt.qq.com.cn" in audio_url
+
+            if is_remote:
+                suffix = Path(urlparse(audio_url).path).suffix or ".input"
+                source_path = (
+                    self._get_temp_dir()
+                    / f"faster_whisper_selfhost_{uuid.uuid4().hex[:8]}{suffix}"
+                )
+                # Registered before downloading so that a partially written
+                # file is removed too if the download fails.
+                cleanup_paths.append(source_path)
+                await download_file(audio_url, str(source_path))
+
+            if not source_path.exists():
+                raise FileNotFoundError(f"文件不存在: {source_path}")
+
+            if source_path.suffix.lower() in {".amr", ".silk"} or is_tencent:
+                file_format = await self._detect_audio_format(source_path)
+                if file_format in {"silk", "amr"}:
+                    converted_path = (
+                        self._get_temp_dir()
+                        / f"faster_whisper_selfhost_{uuid.uuid4().hex[:8]}.wav"
+                    )
+                    cleanup_paths.append(converted_path)
+
+                    if file_format == "silk":
+                        logger.info("Converting silk file to wav ...")
+                        await tencent_silk_to_wav(
+                            str(source_path),
+                            str(converted_path),
+                        )
+                    else:
+                        logger.info("Converting amr file to wav ...")
+                        await convert_to_pcm_wav(
+                            str(source_path),
+                            str(converted_path),
+                        )
+
+                    source_path = converted_path
+        except BaseException:
+            # On failure (including cancellation) the caller never receives
+            # cleanup_paths, so the temp files have to be removed here.
+            self._remove_temp_files(cleanup_paths)
+            raise
+
+        return source_path, cleanup_paths
+
+    def _transcribe_audio(self, audio_path: Path) -> str:
+        """Run a blocking transcription and return the joined segment text.
+
+        Raises:
+            RuntimeError: If the model has not been loaded yet.
+        """
+        if self.model is None:
+            raise RuntimeError("faster-whisper 模型未初始化")
+
+        segments, info = self.model.transcribe(str(audio_path))
+        # Joining iterates ``segments`` here, in the calling (worker) thread;
+        # faster-whisper documents it as a generator that decodes lazily.
+        text = "".join(segment.text for segment in segments).strip()
+        logger.debug(
+            "faster-whisper transcription completed. language=%s, text=%s",
+            getattr(info, "language", None),
+            text,
+        )
+        return cast(str, text)
+
+    async def get_text(self, audio_url: str) -> str:
+        """Transcribe the audio at ``audio_url`` and return the text.
+
+        Args:
+            audio_url: Local file path or ``http://`` / ``https://`` URL.
+
+        Raises:
+            RuntimeError: If ``initialize`` has not loaded the model.
+            FileNotFoundError: If the audio file cannot be found.
+        """
+        # Fail fast so nothing is downloaded or converted without a model.
+        if self.model is None:
+            raise RuntimeError("faster-whisper 模型未初始化")
+
+        loop = asyncio.get_running_loop()
+        audio_path, cleanup_paths = await self._prepare_audio_input(audio_url)
+        try:
+            return await loop.run_in_executor(
+                None,
+                partial(self._transcribe_audio, audio_path),
+            )
+        finally:
+            self._remove_temp_files(cleanup_paths)
--- a/dashboard/src/composables/useProviderSources.ts
+++ b/dashboard/src/composables/useProviderSources.ts
@@ -329,6 +329,7 @@ export function useProviderSources(options: UseProviderSourcesOptions) {
      mimo_stt_api: 'speech_to_text',
      openai_whisper_selfhost: 'speech_to_text',
      sensevoice_stt_selfhost: 'speech_to_text',
+      faster_whisper_stt_selfhost: 'speech_to_text',
      openai_tts_api: 'text_to_speech',
      mimo_tts_api: 'text_to_speech',
      edge_tts: 'text_to_speech',
--- a/dashboard/src/i18n/locales/en-US/features/config-metadata.json
+++ b/dashboard/src/i18n/locales/en-US/features/config-metadata.json
@@ -1462,6 +1462,10 @@
        "description": "Deploy SenseVoice",
        "hint": "Before enabling, install funasr, funasr_onnx, torchaudio, torch, modelscope, and jieba (CPU by default, about 1 GB download), and install ffmpeg. Otherwise STT will not work."
      },
+      "faster_whisper_hint": {
+        "description": "Deploy faster-whisper",
+        "hint": "Before enabling, install faster-whisper. CPU inference works directly; NVIDIA GPU inference also requires the matching CUDA/cuDNN runtime libraries for CTranslate2."
+      },
      "is_emotion": {
        "description": "Emotion recognition",
        "hint": "Enable emotion recognition. happy?sad?angry?neutral?fearful?disgusted?surprised?unknown"
@@ -1530,6 +1534,10 @@
        "description": "Inference device",
        "hint": "Whisper inference device. Apple Silicon can use mps; other environments should use cpu. If mps is selected but unavailable, AstrBot will fall back to cpu."
      },
+      "faster_whisper_device": {
+        "description": "Inference device",
+        "hint": "faster-whisper inference device. Supported values are auto, cpu, and cuda. In most cases keeping auto is the safest choice."
+      },
      "id": {
        "description": "ID"
      },
--- a/dashboard/src/i18n/locales/ru-RU/features/config-metadata.json
+++ b/dashboard/src/i18n/locales/ru-RU/features/config-metadata.json
@@ -1459,6 +1459,10 @@
                "description": "Развертывание SenseVoice",
                "hint": "Перед включением установите необходимые библиотеки: funasr, torch и др. Также требуется ffmpeg."
            },
+            "faster_whisper_hint": {
+                "description": "Развертывание faster-whisper",
+                "hint": "Перед включением установите faster-whisper. CPU-режим работает сразу; для NVIDIA GPU дополнительно потребуются совместимые CUDA/cuDNN runtime-библиотеки для CTranslate2."
+            },
            "is_emotion": {
                "description": "Распознавание эмоций",
                "hint": "Включить распознавание эмоций (радость, грусть, гнев и т.д.)."
@@ -1527,6 +1531,10 @@
                "description": "Устройство инференса",
                "hint": "Устройство для инференса Whisper. На Apple Silicon можно выбрать mps; в остальных средах рекомендуется cpu. Если выбран mps, но он недоступен, AstrBot автоматически переключится на cpu."
            },
+            "faster_whisper_device": {
+                "description": "Устройство инференса",
+                "hint": "Устройство инференса faster-whisper. Поддерживаются значения auto, cpu и cuda. В большинстве случаев лучше оставить auto."
+            },
            "id": {
                "description": "ID провайдера"
            },
--- a/dashboard/src/i18n/locales/zh-CN/features/config-metadata.json
+++ b/dashboard/src/i18n/locales/zh-CN/features/config-metadata.json
@@ -1464,6 +1464,10 @@
        "description": "部署SenseVoice",
        "hint": "启用前请 pip 安装 funasr、funasr_onnx、torchaudio、torch、modelscope、jieba 库（默认使用CPU，大约下载 1 GB），并且安装 ffmpeg。否则将无法正常转文字。"
      },
+      "faster_whisper_hint": {
+        "description": "部署 faster-whisper",
+        "hint": "启用前请 pip 安装 faster-whisper。CPU 可直接使用；NVIDIA GPU 需要额外准备 CTranslate2 对应的 CUDA/cuDNN 运行库。"
+      },
      "is_emotion": {
        "description": "情绪识别",
        "hint": "是否开启情绪识别。happy｜sad｜angry｜neutral｜fearful｜disgusted｜surprised｜unknown"
@@ -1532,6 +1536,10 @@
        "description": "推理设备",
        "hint": "Whisper 推理设备。Apple Silicon 可选 mps；其他环境建议使用 cpu。若指定 mps 但当前环境不可用，将自动回退到 cpu。"
      },
+      "faster_whisper_device": {
+        "description": "推理设备",
+        "hint": "faster-whisper 推理设备。可选 auto、cpu、cuda。通常建议保留 auto，让底层自行选择。"
+      },
      "id": {
        "description": "ID"
      },