diff --git a/src-tauri/src/lib.rs b/src-tauri/src/lib.rs index 0acb6e3a8..6fc70353b 100644 --- a/src-tauri/src/lib.rs +++ b/src-tauri/src/lib.rs @@ -358,6 +358,7 @@ pub fn run(cli_args: CliArgs) { shortcut::delete_post_process_prompt, shortcut::set_post_process_selected_prompt, shortcut::update_custom_words, + shortcut::update_transcription_prompt, shortcut::suspend_binding, shortcut::resume_binding, shortcut::change_mute_while_recording_setting, diff --git a/src-tauri/src/managers/audio.rs b/src-tauri/src/managers/audio.rs index 24dd04fc7..7a3ee5e3f 100644 --- a/src-tauri/src/managers/audio.rs +++ b/src-tauri/src/managers/audio.rs @@ -465,8 +465,12 @@ impl AudioRecordingManager { // Pad if very short let s_len = samples.len(); - // debug!("Got {} samples", s_len); - if s_len < WHISPER_SAMPLE_RATE && s_len > 0 { + const MIN_SPEECH_SAMPLES: usize = 1600; // 100ms at 16kHz + if s_len < MIN_SPEECH_SAMPLES { + // Too short to be real speech — SmoothedVad minimum real output is ~8000 samples + // (15 prefill + 2 onset + 15 hangover frames). Anything shorter is leakage. + Some(Vec::new()) + } else if s_len < WHISPER_SAMPLE_RATE { let mut padded = samples; padded.resize(WHISPER_SAMPLE_RATE * 5 / 4, 0.0); Some(padded) diff --git a/src-tauri/src/managers/transcription.rs b/src-tauri/src/managers/transcription.rs index 2ccd6af43..fa71693dc 100644 --- a/src-tauri/src/managers/transcription.rs +++ b/src-tauri/src/managers/transcription.rs @@ -458,6 +458,17 @@ impl TranscriptionManager { return Ok(String::new()); } + const RMS_SILENCE_THRESHOLD: f32 = 0.005; + let rms = (audio.iter().map(|&s| s * s).sum::() / audio.len() as f32).sqrt(); + if rms < RMS_SILENCE_THRESHOLD { + debug!( + "Audio RMS {:.6} below silence threshold {:.4}; skipping transcription", + rms, RMS_SILENCE_THRESHOLD + ); + self.maybe_unload_immediately("silent audio"); + return Ok(String::new()); + } + // Check if model is loaded, if not try to load it { // If the model is loading, wait for it to complete. @@ -543,10 +554,21 @@ impl TranscriptionManager { let params = WhisperInferenceParams { language: whisper_language, translate: settings.translate_to_english, - initial_prompt: if settings.custom_words.is_empty() { - None - } else { - Some(settings.custom_words.join(", ")) + initial_prompt: { + let mut parts = Vec::new(); + if !settings.custom_words.is_empty() { + parts.push(settings.custom_words.join(", ")); + } + if let Some(ref prompt) = settings.transcription_prompt { + if !prompt.trim().is_empty() { + parts.push(prompt.clone()); + } + } + if parts.is_empty() { + None + } else { + Some(parts.join("\n\n")) + } }, ..Default::default() }; diff --git a/src-tauri/src/settings.rs b/src-tauri/src/settings.rs index d930599cc..303bdef6a 100644 --- a/src-tauri/src/settings.rs +++ b/src-tauri/src/settings.rs @@ -430,6 +430,8 @@ pub struct AppSettings { pub whisper_gpu_device: i32, #[serde(default)] pub extra_recording_buffer_ms: u64, + #[serde(default)] + pub transcription_prompt: Option, } fn default_model() -> String { @@ -804,6 +806,7 @@ pub fn get_default_settings() -> AppSettings { ort_accelerator: OrtAcceleratorSetting::default(), whisper_gpu_device: default_whisper_gpu_device(), extra_recording_buffer_ms: 0, + transcription_prompt: None, } } diff --git a/src-tauri/src/shortcut/mod.rs b/src-tauri/src/shortcut/mod.rs index 6d179f175..53b4c45fa 100644 --- a/src-tauri/src/shortcut/mod.rs +++ b/src-tauri/src/shortcut/mod.rs @@ -648,6 +648,15 @@ pub fn update_custom_words(app: AppHandle, words: Vec) -> Result<(), Str Ok(()) } +#[tauri::command] +#[specta::specta] +pub fn update_transcription_prompt(app: AppHandle, prompt: Option) -> Result<(), String> { + let mut settings = settings::get_settings(&app); + settings.transcription_prompt = prompt; + settings::write_settings(&app, settings); + Ok(()) +} + #[tauri::command] #[specta::specta] pub fn change_word_correction_threshold_setting( diff --git a/src/bindings.ts b/src/bindings.ts index 378d630da..62e5ad5c7 100644 --- a/src/bindings.ts +++ b/src/bindings.ts @@ -272,6 +272,14 @@ async updateCustomWords(words: string[]) : Promise> { else return { status: "error", error: e as any }; } }, +async updateTranscriptionPrompt(prompt: string | null) : Promise> { + try { + return { status: "ok", data: await TAURI_INVOKE("update_transcription_prompt", { prompt }) }; +} catch (e) { + if(e instanceof Error) throw e; + else return { status: "error", error: e as any }; +} +}, /** * Temporarily unregister a binding while the user is editing it in the UI. * This avoids firing the action while keys are being recorded. @@ -827,7 +835,7 @@ historyUpdatePayload: "history-update-payload" /** user-defined types **/ -export type AppSettings = { bindings: Partial<{ [key in string]: ShortcutBinding }>; push_to_talk: boolean; audio_feedback: boolean; audio_feedback_volume?: number; sound_theme?: SoundTheme; start_hidden?: boolean; autostart_enabled?: boolean; update_checks_enabled?: boolean; selected_model?: string; always_on_microphone?: boolean; selected_microphone?: string | null; clamshell_microphone?: string | null; selected_output_device?: string | null; translate_to_english?: boolean; selected_language?: string; overlay_position?: OverlayPosition; debug_mode?: boolean; log_level?: LogLevel; custom_words?: string[]; model_unload_timeout?: ModelUnloadTimeout; word_correction_threshold?: number; history_limit?: number; recording_retention_period?: RecordingRetentionPeriod; paste_method?: PasteMethod; clipboard_handling?: ClipboardHandling; auto_submit?: boolean; auto_submit_key?: AutoSubmitKey; post_process_enabled?: boolean; post_process_provider_id?: string; post_process_providers?: PostProcessProvider[]; post_process_api_keys?: SecretMap; post_process_models?: Partial<{ [key in string]: string }>; post_process_prompts?: LLMPrompt[]; post_process_selected_prompt_id?: string | null; mute_while_recording?: boolean; append_trailing_space?: boolean; app_language?: string; experimental_enabled?: boolean; lazy_stream_close?: boolean; keyboard_implementation?: KeyboardImplementation; show_tray_icon?: boolean; paste_delay_ms?: number; typing_tool?: TypingTool; external_script_path: string | null; custom_filler_words?: string[] | null; whisper_accelerator?: WhisperAcceleratorSetting; ort_accelerator?: OrtAcceleratorSetting; whisper_gpu_device?: number; extra_recording_buffer_ms?: number } +export type AppSettings = { bindings: Partial<{ [key in string]: ShortcutBinding }>; push_to_talk: boolean; audio_feedback: boolean; audio_feedback_volume?: number; sound_theme?: SoundTheme; start_hidden?: boolean; autostart_enabled?: boolean; update_checks_enabled?: boolean; selected_model?: string; always_on_microphone?: boolean; selected_microphone?: string | null; clamshell_microphone?: string | null; selected_output_device?: string | null; translate_to_english?: boolean; selected_language?: string; overlay_position?: OverlayPosition; debug_mode?: boolean; log_level?: LogLevel; custom_words?: string[]; model_unload_timeout?: ModelUnloadTimeout; word_correction_threshold?: number; history_limit?: number; recording_retention_period?: RecordingRetentionPeriod; paste_method?: PasteMethod; clipboard_handling?: ClipboardHandling; auto_submit?: boolean; auto_submit_key?: AutoSubmitKey; post_process_enabled?: boolean; post_process_provider_id?: string; post_process_providers?: PostProcessProvider[]; post_process_api_keys?: SecretMap; post_process_models?: Partial<{ [key in string]: string }>; post_process_prompts?: LLMPrompt[]; post_process_selected_prompt_id?: string | null; mute_while_recording?: boolean; append_trailing_space?: boolean; app_language?: string; experimental_enabled?: boolean; lazy_stream_close?: boolean; keyboard_implementation?: KeyboardImplementation; show_tray_icon?: boolean; paste_delay_ms?: number; typing_tool?: TypingTool; external_script_path: string | null; custom_filler_words?: string[] | null; whisper_accelerator?: WhisperAcceleratorSetting; ort_accelerator?: OrtAcceleratorSetting; whisper_gpu_device?: number; extra_recording_buffer_ms?: number; transcription_prompt?: string | null } export type AudioDevice = { index: string; name: string; is_default: boolean } export type AutoSubmitKey = "enter" | "ctrl_enter" | "cmd_enter" export type AvailableAccelerators = { whisper: string[]; ort: string[]; gpu_devices: GpuDeviceOption[] } diff --git a/src/components/settings/TranscriptionPrompt.tsx b/src/components/settings/TranscriptionPrompt.tsx new file mode 100644 index 000000000..42c340cf7 --- /dev/null +++ b/src/components/settings/TranscriptionPrompt.tsx @@ -0,0 +1,238 @@ +import React, { useState, useCallback, useEffect, useMemo } from "react"; +import { useTranslation } from "react-i18next"; +import { useSettings } from "../../hooks/useSettings"; +import { useModelStore } from "../../stores/modelStore"; +import { SettingContainer } from "../ui/SettingContainer"; +import { Textarea } from "../ui/Textarea"; +import { Dropdown } from "../ui/Dropdown"; +import type { DropdownOption } from "../ui/Dropdown"; + +interface TranscriptionPromptProps { + descriptionMode?: "inline" | "tooltip"; + grouped?: boolean; +} + +function estimateTokens(text: string): number { + let tokens = 0; + for (const ch of text) { + const code = ch.codePointAt(0)!; + if ( + (code >= 0x3000 && code <= 0x9fff) || + (code >= 0xf900 && code <= 0xfaff) || + (code >= 0xff00 && code <= 0xffef) + ) { + tokens += 2.2; // CJK ideographs, compatibility, fullwidth + } else if (code >= 0x0400 && code <= 0x04ff) { + tokens += 0.5; // Cyrillic + } else { + tokens += 0.25; // Latin/spaces/punctuation + } + } + return Math.round(tokens); +} + +const TOKEN_BUDGET = 112; + +const PRESETS: Record = { + english: `Hello! How are you? He said: "Let's do this today — while we have time." Of course, it's not that simple.`, + spanish: `¡Hola! ¿Cómo estás? Él dijo: "Hagámoslo hoy, mientras tengamos tiempo." Claro, no es tan sencillo.`, + french: `Bonjour ! Comment allez-vous ? Il a dit : « Faisons-le aujourd'hui — tant qu'on a le temps. » Ce n'est pas si simple.`, + german: `Hallo! Wie geht es Ihnen? Er sagte: „Machen wir es heute — solange wir Zeit haben." So einfach ist es nicht.`, + portuguese: `Olá! Como você está? Ele disse: "Vamos fazer isso hoje — enquanto temos tempo." Claro, não é tão simples.`, + italian: `Ciao! Come stai? Ha detto: "Facciamolo oggi — finché abbiamo tempo." Non è così semplice.`, + russian: `Привет! Как дела? Он сказал: «Сделаем это сегодня — пока есть время». Конечно, не всё так просто; нужно учесть погоду.`, + japanese: `こんにちは!元気ですか?「今日やりましょう。」もちろん、簡単ではない。`, + chinese_simplified: `你好!你怎么样?他说:"今天就做吧。"当然,事情没那么简单。`, + chinese_traditional: `你好!你怎麼樣?他說:「今天就做吧。」當然,事情沒那麼簡單。`, +}; + +export const TranscriptionPrompt: React.FC = + React.memo(({ descriptionMode = "tooltip", grouped = false }) => { + const { t } = useTranslation(); + const { getSetting, updateSetting, isUpdating } = useSettings(); + const currentPrompt = getSetting("transcription_prompt") ?? ""; + const selectedLanguage = getSetting("selected_language"); + const currentModelId = useModelStore((s) => s.currentModel); + const getModelInfo = useModelStore((s) => s.getModelInfo); + const isWhisper = + getModelInfo(currentModelId)?.engine_type === "Whisper"; + const [localValue, setLocalValue] = useState(currentPrompt); + const [isDirty, setIsDirty] = useState(false); + + const activePreset = + Object.entries(PRESETS).find( + ([, text]) => text === localValue.trim(), + )?.[0] ?? "none"; + + const presetOptions: DropdownOption[] = useMemo( + () => [ + { + value: "none", + label: t("settings.advanced.transcriptionPrompt.presets.none"), + }, + { + value: "english", + label: t("settings.advanced.transcriptionPrompt.presets.english"), + }, + { + value: "spanish", + label: t("settings.advanced.transcriptionPrompt.presets.spanish"), + }, + { + value: "french", + label: t("settings.advanced.transcriptionPrompt.presets.french"), + }, + { + value: "german", + label: t("settings.advanced.transcriptionPrompt.presets.german"), + }, + { + value: "portuguese", + label: t("settings.advanced.transcriptionPrompt.presets.portuguese"), + }, + { + value: "italian", + label: t("settings.advanced.transcriptionPrompt.presets.italian"), + }, + { + value: "russian", + label: t("settings.advanced.transcriptionPrompt.presets.russian"), + }, + { + value: "japanese", + label: t("settings.advanced.transcriptionPrompt.presets.japanese"), + }, + { + value: "chinese_simplified", + label: t( + "settings.advanced.transcriptionPrompt.presets.chineseSimplified", + ), + }, + { + value: "chinese_traditional", + label: t( + "settings.advanced.transcriptionPrompt.presets.chineseTraditional", + ), + }, + ], + [t], + ); + + useEffect(() => { + if (!isDirty) { + setLocalValue(currentPrompt); + } + }, [currentPrompt, isDirty]); + + const handleChange = useCallback( + (e: React.ChangeEvent) => { + const value = e.target.value; + if (estimateTokens(value) <= TOKEN_BUDGET) { + setLocalValue(value); + setIsDirty(true); + } + }, + [], + ); + + const handleBlur = useCallback(() => { + if (!isDirty) return; + const trimmed = localValue.trim(); + updateSetting( + "transcription_prompt", + trimmed.length > 0 ? trimmed : null, + ); + setIsDirty(false); + }, [localValue, isDirty, updateSetting]); + + const handlePreset = useCallback( + (key: string) => { + if (key === "none") { + setLocalValue(""); + updateSetting("transcription_prompt", null); + } else { + const preset = PRESETS[key] ?? ""; + setLocalValue(preset); + updateSetting("transcription_prompt", preset); + } + setIsDirty(false); + }, + [updateSetting], + ); + + const estimatedTokens = estimateTokens(localValue); + const percentage = Math.min( + 100, + Math.round((estimatedTokens / TOKEN_BUDGET) * 100), + ); + + return ( + +
+
+ + +
+