Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions src-tauri/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ rusqlite = { version = "0.37", features = ["bundled"] }
tar = "0.4.44"
flate2 = "1.0"
sha2 = "0.10"
transcribe-rs = { version = "0.3.8", features = ["whisper-cpp", "onnx"] }
transcribe-rs = { version = "0.3.8", features = ["whisper-cpp", "onnx", "vad-silero"] }
handy-keys = "0.2.4"
ferrous-opencc = "0.2.3"
clap = { version = "4", features = ["derive"] }
Expand All @@ -88,7 +88,7 @@ tauri-plugin-single-instance = "2.3.2"
tauri-plugin-updater = "2.10.0"

[target.'cfg(windows)'.dependencies]
transcribe-rs = { version = "0.3.3", features = ["whisper-vulkan", "ort-directml"] }
transcribe-rs = { version = "0.3.3", features = ["whisper-vulkan", "ort-directml", "vad-silero"] }
windows = { version = "0.61.3", features = [
"Win32_Media_Audio_Endpoints",
"Win32_System_Com_StructuredStorage",
Expand All @@ -100,12 +100,12 @@ winreg = "0.55"

[target.'cfg(target_os = "macos")'.dependencies]
tauri-nspanel = { git = "https://github.com/ahkohd/tauri-nspanel", branch = "v2.1" }
transcribe-rs = { version = "0.3.3", features = ["whisper-metal"] }
transcribe-rs = { version = "0.3.3", features = ["whisper-metal", "vad-silero"] }

[target.'cfg(target_os = "linux")'.dependencies]
gtk-layer-shell = { version = "0.8", features = ["v0_6"] }
gtk = "0.18"
transcribe-rs = { version = "0.3.3", features = ["whisper-vulkan"] }
transcribe-rs = { version = "0.3.3", features = ["whisper-vulkan", "vad-silero"] }

[patch.crates-io]
tauri-runtime = { git = "https://github.com/cjpais/tauri.git", branch = "handy-2.10.2" }
Expand Down
61 changes: 57 additions & 4 deletions src-tauri/src/managers/transcription.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ use transcribe_rs::{
sense_voice::{SenseVoiceModel, SenseVoiceParams},
Quantization,
},
transcriber::{Transcriber, VadChunked, VadChunkedConfig},
vad::{SileroVad, SmoothedVad},
whisper_cpp::{WhisperEngine, WhisperInferenceParams},
SpeechModel, TranscribeOptions,
};
Expand All @@ -42,7 +44,7 @@ enum LoadedEngine {
Moonshine(MoonshineModel),
MoonshineStreaming(StreamingModel),
SenseVoice(SenseVoiceModel),
GigaAM(GigaAMModel),
GigaAM(GigaAMModel, VadChunked),
Canary(CanaryModel),
Cohere(CohereModel),
}
Expand Down Expand Up @@ -359,7 +361,27 @@ impl TranscriptionManager {
emit_loading_failed(&error_msg);
anyhow::anyhow!(error_msg)
})?;
LoadedEngine::GigaAM(engine)
// GigaAM-v3 was trained on ≤25-30s segments; longer audio breaks
// the model. Wrap inference in a VadChunked so the model only
// sees short chunks. See `build_longform_chunker` below.
let vad_path = self
.app_handle
.path()
.resolve(
"resources/models/silero_vad_v4.onnx",
tauri::path::BaseDirectory::Resource,
)
.map_err(|e| {
let error_msg = format!("Failed to resolve Silero VAD path: {}", e);
emit_loading_failed(&error_msg);
anyhow::anyhow!(error_msg)
})?;
let chunker = build_longform_chunker(&vad_path).map_err(|e| {
let error_msg = format!("Failed to build longform chunker: {}", e);
emit_loading_failed(&error_msg);
anyhow::anyhow!(e)
})?;
LoadedEngine::GigaAM(engine, chunker)
}
EngineType::Canary => {
let engine = CanaryModel::load(&model_path, &Quantization::Int8).map_err(|e| {
Expand Down Expand Up @@ -593,8 +615,8 @@ impl TranscriptionManager {
anyhow::anyhow!("SenseVoice transcription failed: {}", e)
})
}
LoadedEngine::GigaAM(gigaam_engine) => gigaam_engine
.transcribe(&audio, &TranscribeOptions::default())
LoadedEngine::GigaAM(gigaam_engine, chunker) => chunker
.transcribe(gigaam_engine, &audio)
.map_err(|e| anyhow::anyhow!("GigaAM transcription failed: {}", e)),
LoadedEngine::Canary(canary_engine) => {
let lang = if validated_language == "auto" {
Expand Down Expand Up @@ -733,6 +755,37 @@ impl TranscriptionManager {
}
}

/// Construct the `VadChunked` wrapper used for long-form transcription.
///
/// Intended for CTC models that were trained on short segments (≤25-30s).
/// GigaAM-v3 is currently the only such model in Handy, but any other model
/// with the same limitation can share this builder. Should per-model tuning
/// become necessary, turn the hardcoded values below into function arguments.
///
/// The chunking strategy follows Sber's `segment_audio_file` in
/// <https://github.com/salute-developers/GigaAM/blob/main/gigaam/vad_utils.py>.
///
/// # Errors
///
/// Returns an error when `SileroVad::new` fails for the model at `vad_path`.
fn build_longform_chunker(vad_path: &std::path::Path) -> anyhow::Result<VadChunked> {
    // Raw Silero detector. NOTE(review): 0.3 is presumably the speech
    // probability threshold — confirm against the transcribe-rs API.
    let detector = match SileroVad::new(vad_path, 0.3) {
        Ok(vad) => vad,
        Err(e) => {
            return Err(anyhow::anyhow!(
                "Failed to load Silero VAD for longform chunker: {}",
                e
            ));
        }
    };

    // Chunk limits, mirroring Sber's reference pipeline where one exists.
    let chunking = VadChunkedConfig {
        // Same value as Sber's `new_chunk_threshold`.
        min_chunk_secs: 0.2,
        // Same value as Sber's `strict_limit_duration`.
        max_chunk_secs: 30.0,
        // The Sber pipeline adds no padding; CTC copes with raw chunk edges.
        padding_secs: 0.0,
        // Improvement over Sber's uniform-time split.
        smart_split_search_secs: Some(3.0),
        merge_separator: " ".into(),
    };

    // Smoothing parameters, in frames:
    //   prefill  = 15 -> 450ms of history kept before onset, so the attack of
    //                    the first word is not lost;
    //   hangover = 30 -> 900ms of patience after speech stops, emulating
    //                    Sber's greedy packing of pyannote segments that are
    //                    separated only by short pauses;
    //   onset    = 2  -> 60ms of consecutive speech needed to confirm onset.
    let frame_smoother = SmoothedVad::new(Box::new(detector), 15, 30, 2);

    Ok(VadChunked::new(
        Box::new(frame_smoother),
        chunking,
        TranscribeOptions::default(),
    ))
}

/// Apply the user's accelerator preferences to the transcribe-rs global atomics.
/// Called on startup and whenever the user changes the setting.
pub fn apply_accelerator_settings(app: &tauri::AppHandle) {
Expand Down