mirror of
https://github.com/AstrBotDevs/AstrBot
synced 2026-07-02 10:40:15 +08:00
Compare commits
1 Commits
codex/fix-
...
feat/faste
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
5ddd8ae1f1 |
@@ -1588,6 +1588,15 @@ CONFIG_METADATA_2 = {
|
||||
"stt_model": "iic/SenseVoiceSmall",
|
||||
"is_emotion": False,
|
||||
},
|
||||
"Faster Whisper(Local)": {
|
||||
"type": "faster_whisper_stt_selfhost",
|
||||
"provider": "faster-whisper",
|
||||
"provider_type": "speech_to_text",
|
||||
"enable": False,
|
||||
"id": "faster_whisper",
|
||||
"model": "small",
|
||||
"faster_whisper_device": "auto",
|
||||
},
|
||||
"OpenAI TTS(API)": {
|
||||
"id": "openai_tts",
|
||||
"type": "openai_tts_api",
|
||||
@@ -2489,6 +2498,11 @@ CONFIG_METADATA_2 = {
|
||||
"type": "string",
|
||||
"hint": "启用前请 pip 安装 funasr、funasr_onnx、torchaudio、torch、modelscope、jieba 库(默认使用CPU,大约下载 1 GB),并且安装 ffmpeg。否则将无法正常转文字。",
|
||||
},
|
||||
"faster_whisper_hint": {
|
||||
"description": "部署 faster-whisper",
|
||||
"type": "string",
|
||||
"hint": "启用前请 pip 安装 faster-whisper。CPU 可直接使用;NVIDIA GPU 需要额外准备 CTranslate2 对应的 CUDA/cuDNN 运行库。模型会在首次加载时自动下载到 Hugging Face 缓存目录或你指定的下载目录。",
|
||||
},
|
||||
"is_emotion": {
|
||||
"description": "情绪识别",
|
||||
"type": "bool",
|
||||
@@ -2583,6 +2597,12 @@ CONFIG_METADATA_2 = {
|
||||
"hint": "Whisper 推理设备。Apple Silicon 可选 mps;其他环境建议使用 cpu。若指定 mps 但当前环境不可用,将自动回退到 cpu。",
|
||||
"options": ["cpu", "mps"],
|
||||
},
|
||||
"faster_whisper_device": {
|
||||
"description": "推理设备",
|
||||
"type": "string",
|
||||
"hint": "faster-whisper 推理设备。可选 auto、cpu、cuda。通常建议保留 auto,让底层自行选择。",
|
||||
"options": ["auto", "cpu", "cuda"],
|
||||
},
|
||||
"id": {
|
||||
"description": "ID",
|
||||
"type": "string",
|
||||
|
||||
@@ -397,6 +397,10 @@ class ProviderManager:
|
||||
from .sources.sensevoice_selfhosted_source import (
|
||||
ProviderSenseVoiceSTTSelfHost as ProviderSenseVoiceSTTSelfHost,
|
||||
)
|
||||
case "faster_whisper_stt_selfhost":
|
||||
from .sources.faster_whisper_selfhosted_source import (
|
||||
ProviderFasterWhisperSTTSelfHost as ProviderFasterWhisperSTTSelfHost,
|
||||
)
|
||||
case "openai_whisper_api":
|
||||
from .sources.whisper_api_source import (
|
||||
ProviderOpenAIWhisperAPI as ProviderOpenAIWhisperAPI,
|
||||
|
||||
@@ -0,0 +1,147 @@
|
||||
import asyncio
|
||||
import importlib
|
||||
import uuid
|
||||
from functools import partial
|
||||
from pathlib import Path
|
||||
from typing import Any, cast
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from astrbot.core import logger
|
||||
from astrbot.core.utils.astrbot_path import get_astrbot_temp_path
|
||||
from astrbot.core.utils.io import download_file
|
||||
from astrbot.core.utils.tencent_record_helper import (
|
||||
convert_to_pcm_wav,
|
||||
tencent_silk_to_wav,
|
||||
)
|
||||
|
||||
from ..entities import ProviderType
|
||||
from ..provider import STTProvider
|
||||
from ..register import register_provider_adapter
|
||||
|
||||
|
||||
@register_provider_adapter(
    "faster_whisper_stt_selfhost",
    "faster-whisper 模型部署",
    provider_type=ProviderType.SPEECH_TO_TEXT,
)
class ProviderFasterWhisperSTTSelfHost(STTProvider):
    """Speech-to-text provider that runs a faster-whisper model in-process.

    The model is loaded by :meth:`initialize`; :meth:`get_text` accepts either
    a local path or an http(s) URL, converts silk/amr recordings to wav when
    needed, and transcribes the audio in the default executor.
    """

    def __init__(
        self,
        provider_config: dict,
        provider_settings: dict,
    ) -> None:
        """Record the configured model name and inference device.

        Args:
            provider_config: Provider entry. Must contain ``model``; may
                contain ``faster_whisper_device``.
            provider_settings: Forwarded unchanged to the base class.

        Raises:
            KeyError: If ``provider_config`` has no ``model`` key.
        """
        super().__init__(provider_config, provider_settings)
        self.set_model(provider_config["model"])
        # A missing, null or blank device would otherwise reach WhisperModel
        # verbatim (e.g. "None" or ""); fall back to "auto" instead.
        configured_device = provider_config.get("faster_whisper_device") or "auto"
        self.device = str(configured_device).strip() or "auto"
        # Set by initialize(); None means the model has not been loaded yet.
        self.model: Any = None

    async def initialize(self) -> None:
        """Import faster-whisper and load the model without blocking the loop."""
        loop = asyncio.get_running_loop()

        def _load_model() -> Any:
            # Imported lazily so the dependency is only needed when this
            # provider is actually initialized.
            faster_whisper = importlib.import_module("faster_whisper")
            whisper_model_cls = faster_whisper.WhisperModel
            return whisper_model_cls(
                self.model_name,
                device=self.device,
            )

        logger.info("下载或者加载 faster-whisper 模型中,这可能需要一些时间 ...")
        self.model = await loop.run_in_executor(None, _load_model)
        logger.info(
            "faster-whisper 模型加载完成。device=%s",
            self.device,
        )

    def _get_temp_dir(self) -> Path:
        """Return the AstrBot temp directory, creating it if necessary."""
        temp_dir = Path(get_astrbot_temp_path())
        temp_dir.mkdir(parents=True, exist_ok=True)
        return temp_dir

    @staticmethod
    def _remove_temp_files(paths: list[Path]) -> None:
        """Best-effort removal of temporary files; failures are only logged."""
        for path in paths:
            try:
                path.unlink(missing_ok=True)
            except Exception as exc:
                logger.warning(
                    "Failed to remove temporary faster-whisper file %s: %s",
                    path,
                    exc,
                )

    async def _detect_audio_format(self, file_path: Path) -> str | None:
        """Sniff the first bytes of ``file_path`` for a silk or amr signature.

        Returns:
            ``"silk"``, ``"amr"``, or ``None`` when neither marker is present
            or the file does not exist.
        """
        try:
            with file_path.open("rb") as file:
                file_header = file.read(8)
        except FileNotFoundError:
            return None

        if b"SILK" in file_header:
            return "silk"
        if b"#!AMR" in file_header:
            return "amr"
        return None

    async def _prepare_audio_input(self, audio_url: str) -> tuple[Path, list[Path]]:
        """Resolve ``audio_url`` to a local file the model can read.

        Remote URLs are downloaded into the temp directory. Files with an
        ``.amr``/``.silk`` suffix, or fetched from the Tencent multimedia
        host, are sniffed and converted to wav when they really are silk/amr.

        Args:
            audio_url: Local file path or http(s) URL.

        Returns:
            ``(path_to_transcribe, temp_files_to_delete_afterwards)``.

        Raises:
            FileNotFoundError: If the resolved source file does not exist.
        """
        cleanup_paths: list[Path] = []
        try:
            source_path = Path(audio_url)
            is_remote = audio_url.startswith(("http://", "https://"))
            is_tencent = is_remote and "multimedia.nt.qq.com.cn" in audio_url

            if is_remote:
                suffix = Path(urlparse(audio_url).path).suffix or ".input"
                download_path = (
                    self._get_temp_dir()
                    / f"faster_whisper_selfhost_{uuid.uuid4().hex[:8]}{suffix}"
                )
                # Register before downloading so a partially written file is
                # still removed if the download fails.
                cleanup_paths.append(download_path)
                await download_file(audio_url, str(download_path))
                source_path = download_path

            if not source_path.exists():
                raise FileNotFoundError(f"文件不存在: {source_path}")

            if source_path.suffix.lower() in {".amr", ".silk"} or is_tencent:
                file_format = await self._detect_audio_format(source_path)
                if file_format in {"silk", "amr"}:
                    converted_path = (
                        self._get_temp_dir()
                        / f"faster_whisper_selfhost_{uuid.uuid4().hex[:8]}.wav"
                    )
                    cleanup_paths.append(converted_path)

                    if file_format == "silk":
                        logger.info("Converting silk file to wav ...")
                        await tencent_silk_to_wav(str(source_path), str(converted_path))
                    else:
                        logger.info("Converting amr file to wav ...")
                        await convert_to_pcm_wav(str(source_path), str(converted_path))

                    source_path = converted_path

            return source_path, cleanup_paths
        except BaseException:
            # The caller only receives cleanup_paths on success, so anything
            # created before a failure (or cancellation) must be removed here.
            self._remove_temp_files(cleanup_paths)
            raise

    def _transcribe_audio(self, audio_path: Path) -> str:
        """Run a blocking transcription of ``audio_path`` and return the text.

        Raises:
            RuntimeError: If the model has not been loaded yet.
        """
        if self.model is None:
            raise RuntimeError("faster-whisper 模型未初始化")

        segments, info = self.model.transcribe(str(audio_path))
        # Materialize the segments before joining so the whole transcription
        # has run by the time the text is built.
        segment_list = list(segments)
        text = "".join(segment.text for segment in segment_list).strip()
        logger.debug(
            "faster-whisper transcription completed. language=%s, text=%s",
            getattr(info, "language", None),
            text,
        )
        return cast(str, text)

    async def get_text(self, audio_url: str) -> str:
        """Transcribe the audio at ``audio_url`` and return the recognized text.

        Args:
            audio_url: Local file path or http(s) URL of the recording.

        Returns:
            The concatenated, stripped text of all recognized segments.

        Raises:
            FileNotFoundError: If the audio file cannot be found.
            RuntimeError: If the model has not been initialized.
        """
        loop = asyncio.get_running_loop()
        audio_path, cleanup_paths = await self._prepare_audio_input(audio_url)
        try:
            return await loop.run_in_executor(
                None,
                partial(self._transcribe_audio, audio_path),
            )
        finally:
            self._remove_temp_files(cleanup_paths)
|
||||
@@ -329,6 +329,7 @@ export function useProviderSources(options: UseProviderSourcesOptions) {
|
||||
mimo_stt_api: 'speech_to_text',
|
||||
openai_whisper_selfhost: 'speech_to_text',
|
||||
sensevoice_stt_selfhost: 'speech_to_text',
|
||||
faster_whisper_stt_selfhost: 'speech_to_text',
|
||||
openai_tts_api: 'text_to_speech',
|
||||
mimo_tts_api: 'text_to_speech',
|
||||
edge_tts: 'text_to_speech',
|
||||
|
||||
@@ -1462,6 +1462,10 @@
|
||||
"description": "Deploy SenseVoice",
|
||||
"hint": "Before enabling, install funasr, funasr_onnx, torchaudio, torch, modelscope, and jieba (CPU by default, about 1 GB download), and install ffmpeg. Otherwise STT will not work."
|
||||
},
|
||||
"faster_whisper_hint": {
|
||||
"description": "Deploy faster-whisper",
|
||||
"hint": "Before enabling, install faster-whisper. CPU inference works directly; NVIDIA GPU inference also requires the matching CUDA/cuDNN runtime libraries for CTranslate2."
|
||||
},
|
||||
"is_emotion": {
|
||||
"description": "Emotion recognition",
|
||||
"hint": "Enable emotion recognition. happy|sad|angry|neutral|fearful|disgusted|surprised|unknown"
|
||||
@@ -1530,6 +1534,10 @@
|
||||
"description": "Inference device",
|
||||
"hint": "Whisper inference device. Apple Silicon can use mps; other environments should use cpu. If mps is selected but unavailable, AstrBot will fall back to cpu."
|
||||
},
|
||||
"faster_whisper_device": {
|
||||
"description": "Inference device",
|
||||
"hint": "faster-whisper inference device. Supported values are auto, cpu, and cuda. In most cases keeping auto is the safest choice."
|
||||
},
|
||||
"id": {
|
||||
"description": "ID"
|
||||
},
|
||||
|
||||
@@ -1459,6 +1459,10 @@
|
||||
"description": "Развертывание SenseVoice",
|
||||
"hint": "Перед включением установите необходимые библиотеки: funasr, torch и др. Также требуется ffmpeg."
|
||||
},
|
||||
"faster_whisper_hint": {
|
||||
"description": "Развертывание faster-whisper",
|
||||
"hint": "Перед включением установите faster-whisper. CPU-режим работает сразу; для NVIDIA GPU дополнительно потребуются совместимые CUDA/cuDNN runtime-библиотеки для CTranslate2."
|
||||
},
|
||||
"is_emotion": {
|
||||
"description": "Распознавание эмоций",
|
||||
"hint": "Включить распознавание эмоций (радость, грусть, гнев и т.д.)."
|
||||
@@ -1527,6 +1531,10 @@
|
||||
"description": "Устройство инференса",
|
||||
"hint": "Устройство для инференса Whisper. На Apple Silicon можно выбрать mps; в остальных средах рекомендуется cpu. Если выбран mps, но он недоступен, AstrBot автоматически переключится на cpu."
|
||||
},
|
||||
"faster_whisper_device": {
|
||||
"description": "Устройство инференса",
|
||||
"hint": "Устройство инференса faster-whisper. Поддерживаются значения auto, cpu и cuda. В большинстве случаев лучше оставить auto."
|
||||
},
|
||||
"id": {
|
||||
"description": "ID провайдера"
|
||||
},
|
||||
|
||||
@@ -1464,6 +1464,10 @@
|
||||
"description": "部署SenseVoice",
|
||||
"hint": "启用前请 pip 安装 funasr、funasr_onnx、torchaudio、torch、modelscope、jieba 库(默认使用CPU,大约下载 1 GB),并且安装 ffmpeg。否则将无法正常转文字。"
|
||||
},
|
||||
"faster_whisper_hint": {
|
||||
"description": "部署 faster-whisper",
|
||||
"hint": "启用前请 pip 安装 faster-whisper。CPU 可直接使用;NVIDIA GPU 需要额外准备 CTranslate2 对应的 CUDA/cuDNN 运行库。"
|
||||
},
|
||||
"is_emotion": {
|
||||
"description": "情绪识别",
|
||||
"hint": "是否开启情绪识别。happy|sad|angry|neutral|fearful|disgusted|surprised|unknown"
|
||||
@@ -1532,6 +1536,10 @@
|
||||
"description": "推理设备",
|
||||
"hint": "Whisper 推理设备。Apple Silicon 可选 mps;其他环境建议使用 cpu。若指定 mps 但当前环境不可用,将自动回退到 cpu。"
|
||||
},
|
||||
"faster_whisper_device": {
|
||||
"description": "推理设备",
|
||||
"hint": "faster-whisper 推理设备。可选 auto、cpu、cuda。通常建议保留 auto,让底层自行选择。"
|
||||
},
|
||||
"id": {
|
||||
"description": "ID"
|
||||
},
|
||||
|
||||
Reference in New Issue
Block a user