Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src-tauri/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -358,6 +358,7 @@ pub fn run(cli_args: CliArgs) {
shortcut::delete_post_process_prompt,
shortcut::set_post_process_selected_prompt,
shortcut::update_custom_words,
shortcut::update_transcription_prompt,
shortcut::suspend_binding,
shortcut::resume_binding,
shortcut::change_mute_while_recording_setting,
Expand Down
8 changes: 6 additions & 2 deletions src-tauri/src/managers/audio.rs
Original file line number Diff line number Diff line change
Expand Up @@ -465,8 +465,12 @@ impl AudioRecordingManager {

// Pad if very short
let s_len = samples.len();
// debug!("Got {} samples", s_len);
if s_len < WHISPER_SAMPLE_RATE && s_len > 0 {
const MIN_SPEECH_SAMPLES: usize = 1600; // 100ms at 16kHz
if s_len < MIN_SPEECH_SAMPLES {
// Too short to be real speech — SmoothedVad minimum real output is ~8000 samples
// (15 prefill + 2 onset + 15 hangover frames). Anything shorter is leakage.
Some(Vec::new())
} else if s_len < WHISPER_SAMPLE_RATE {
let mut padded = samples;
padded.resize(WHISPER_SAMPLE_RATE * 5 / 4, 0.0);
Some(padded)
Expand Down
30 changes: 26 additions & 4 deletions src-tauri/src/managers/transcription.rs
Original file line number Diff line number Diff line change
Expand Up @@ -458,6 +458,17 @@ impl TranscriptionManager {
return Ok(String::new());
}

const RMS_SILENCE_THRESHOLD: f32 = 0.005;
let rms = (audio.iter().map(|&s| s * s).sum::<f32>() / audio.len() as f32).sqrt();
if rms < RMS_SILENCE_THRESHOLD {
debug!(
"Audio RMS {:.6} below silence threshold {:.4}; skipping transcription",
rms, RMS_SILENCE_THRESHOLD
);
self.maybe_unload_immediately("silent audio");
return Ok(String::new());
}

// Check if model is loaded, if not try to load it
{
// If the model is loading, wait for it to complete.
Expand Down Expand Up @@ -543,10 +554,21 @@ impl TranscriptionManager {
let params = WhisperInferenceParams {
language: whisper_language,
translate: settings.translate_to_english,
initial_prompt: if settings.custom_words.is_empty() {
None
} else {
Some(settings.custom_words.join(", "))
initial_prompt: {
let mut parts = Vec::new();
if !settings.custom_words.is_empty() {
parts.push(settings.custom_words.join(", "));
}
if let Some(ref prompt) = settings.transcription_prompt {
if !prompt.trim().is_empty() {
parts.push(prompt.clone());
}
}
if parts.is_empty() {
None
} else {
Some(parts.join("\n\n"))
}
},
..Default::default()
};
Expand Down
3 changes: 3 additions & 0 deletions src-tauri/src/settings.rs
Original file line number Diff line number Diff line change
Expand Up @@ -430,6 +430,8 @@ pub struct AppSettings {
pub whisper_gpu_device: i32,
#[serde(default)]
pub extra_recording_buffer_ms: u64,
#[serde(default)]
pub transcription_prompt: Option<String>,
}

fn default_model() -> String {
Expand Down Expand Up @@ -804,6 +806,7 @@ pub fn get_default_settings() -> AppSettings {
ort_accelerator: OrtAcceleratorSetting::default(),
whisper_gpu_device: default_whisper_gpu_device(),
extra_recording_buffer_ms: 0,
transcription_prompt: None,
}
}

Expand Down
9 changes: 9 additions & 0 deletions src-tauri/src/shortcut/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -648,6 +648,15 @@ pub fn update_custom_words(app: AppHandle, words: Vec<String>) -> Result<(), Str
Ok(())
}

/// Persists the free-form transcription prompt in the application settings.
///
/// The prompt is trimmed before it is stored. A missing, empty, or
/// whitespace-only prompt is stored as `None`, so the settings never hold a
/// blank prompt that looks "set" but carries no text.
///
/// Always returns `Ok(())`.
#[tauri::command]
#[specta::specta]
pub fn update_transcription_prompt(app: AppHandle, prompt: Option<String>) -> Result<(), String> {
    // Normalize once at the write boundary so every reader sees either real
    // text or `None`.
    let prompt = prompt
        .map(|text| text.trim().to_owned())
        .filter(|text| !text.is_empty());

    let mut settings = settings::get_settings(&app);
    settings.transcription_prompt = prompt;
    settings::write_settings(&app, settings);
    Ok(())
}

#[tauri::command]
#[specta::specta]
pub fn change_word_correction_threshold_setting(
Expand Down
10 changes: 9 additions & 1 deletion src/bindings.ts
Original file line number Diff line number Diff line change
Expand Up @@ -272,6 +272,14 @@ async updateCustomWords(words: string[]) : Promise<Result<null, string>> {
else return { status: "error", error: e as any };
}
},
/**
 * Invokes the `update_transcription_prompt` backend command with the given prompt
 * (`null` is passed through unchanged).
 * Resolves to `{ status: "ok" }` when the invoke succeeds. If the invoke rejects with
 * an `Error` instance it is rethrown; any other rejection value is returned as
 * `{ status: "error" }`.
 */
async updateTranscriptionPrompt(prompt: string | null) : Promise<Result<null, string>> {
try {
return { status: "ok", data: await TAURI_INVOKE("update_transcription_prompt", { prompt }) };
} catch (e) {
if(e instanceof Error) throw e;
else return { status: "error", error: e as any };
}
},
/**
* Temporarily unregister a binding while the user is editing it in the UI.
* This avoids firing the action while keys are being recorded.
Expand Down Expand Up @@ -827,7 +835,7 @@ historyUpdatePayload: "history-update-payload"

/** user-defined types **/

export type AppSettings = { bindings: Partial<{ [key in string]: ShortcutBinding }>; push_to_talk: boolean; audio_feedback: boolean; audio_feedback_volume?: number; sound_theme?: SoundTheme; start_hidden?: boolean; autostart_enabled?: boolean; update_checks_enabled?: boolean; selected_model?: string; always_on_microphone?: boolean; selected_microphone?: string | null; clamshell_microphone?: string | null; selected_output_device?: string | null; translate_to_english?: boolean; selected_language?: string; overlay_position?: OverlayPosition; debug_mode?: boolean; log_level?: LogLevel; custom_words?: string[]; model_unload_timeout?: ModelUnloadTimeout; word_correction_threshold?: number; history_limit?: number; recording_retention_period?: RecordingRetentionPeriod; paste_method?: PasteMethod; clipboard_handling?: ClipboardHandling; auto_submit?: boolean; auto_submit_key?: AutoSubmitKey; post_process_enabled?: boolean; post_process_provider_id?: string; post_process_providers?: PostProcessProvider[]; post_process_api_keys?: SecretMap; post_process_models?: Partial<{ [key in string]: string }>; post_process_prompts?: LLMPrompt[]; post_process_selected_prompt_id?: string | null; mute_while_recording?: boolean; append_trailing_space?: boolean; app_language?: string; experimental_enabled?: boolean; lazy_stream_close?: boolean; keyboard_implementation?: KeyboardImplementation; show_tray_icon?: boolean; paste_delay_ms?: number; typing_tool?: TypingTool; external_script_path: string | null; custom_filler_words?: string[] | null; whisper_accelerator?: WhisperAcceleratorSetting; ort_accelerator?: OrtAcceleratorSetting; whisper_gpu_device?: number; extra_recording_buffer_ms?: number }
export type AppSettings = { bindings: Partial<{ [key in string]: ShortcutBinding }>; push_to_talk: boolean; audio_feedback: boolean; audio_feedback_volume?: number; sound_theme?: SoundTheme; start_hidden?: boolean; autostart_enabled?: boolean; update_checks_enabled?: boolean; selected_model?: string; always_on_microphone?: boolean; selected_microphone?: string | null; clamshell_microphone?: string | null; selected_output_device?: string | null; translate_to_english?: boolean; selected_language?: string; overlay_position?: OverlayPosition; debug_mode?: boolean; log_level?: LogLevel; custom_words?: string[]; model_unload_timeout?: ModelUnloadTimeout; word_correction_threshold?: number; history_limit?: number; recording_retention_period?: RecordingRetentionPeriod; paste_method?: PasteMethod; clipboard_handling?: ClipboardHandling; auto_submit?: boolean; auto_submit_key?: AutoSubmitKey; post_process_enabled?: boolean; post_process_provider_id?: string; post_process_providers?: PostProcessProvider[]; post_process_api_keys?: SecretMap; post_process_models?: Partial<{ [key in string]: string }>; post_process_prompts?: LLMPrompt[]; post_process_selected_prompt_id?: string | null; mute_while_recording?: boolean; append_trailing_space?: boolean; app_language?: string; experimental_enabled?: boolean; lazy_stream_close?: boolean; keyboard_implementation?: KeyboardImplementation; show_tray_icon?: boolean; paste_delay_ms?: number; typing_tool?: TypingTool; external_script_path: string | null; custom_filler_words?: string[] | null; whisper_accelerator?: WhisperAcceleratorSetting; ort_accelerator?: OrtAcceleratorSetting; whisper_gpu_device?: number; extra_recording_buffer_ms?: number; transcription_prompt?: string | null }
export type AudioDevice = { index: string; name: string; is_default: boolean }
export type AutoSubmitKey = "enter" | "ctrl_enter" | "cmd_enter"
export type AvailableAccelerators = { whisper: string[]; ort: string[]; gpu_devices: GpuDeviceOption[] }
Expand Down
238 changes: 238 additions & 0 deletions src/components/settings/TranscriptionPrompt.tsx
Original file line number Diff line number Diff line change
@@ -0,0 +1,238 @@
import React, { useState, useCallback, useEffect, useMemo } from "react";
import { useTranslation } from "react-i18next";
import { useSettings } from "../../hooks/useSettings";
import { useModelStore } from "../../stores/modelStore";
import { SettingContainer } from "../ui/SettingContainer";
import { Textarea } from "../ui/Textarea";
import { Dropdown } from "../ui/Dropdown";
import type { DropdownOption } from "../ui/Dropdown";

/** Props accepted by the TranscriptionPrompt settings component. */
interface TranscriptionPromptProps {
// Forwarded to SettingContainer's `descriptionMode` prop.
descriptionMode?: "inline" | "tooltip";
// Forwarded to SettingContainer's `grouped` prop.
grouped?: boolean;
}

/**
 * Returns a rough token estimate for `text`.
 *
 * Every Unicode code point contributes a fixed weight chosen by its range,
 * and the summed weight is rounded to the nearest integer.
 */
function estimateTokens(text: string): number {
  // Inclusive code point ranges weighted at 2.2 per character
  // (CJK ideographs, compatibility, fullwidth).
  const heavyRanges: ReadonlyArray<readonly [number, number]> = [
    [0x3000, 0x9fff],
    [0xf900, 0xfaff],
    [0xff00, 0xffef],
  ];

  const weightOf = (codePoint: number): number => {
    if (heavyRanges.some(([lo, hi]) => codePoint >= lo && codePoint <= hi)) {
      return 2.2;
    }
    if (codePoint >= 0x0400 && codePoint <= 0x04ff) {
      return 0.5; // Cyrillic
    }
    return 0.25; // Latin/spaces/punctuation
  };

  // Array.from splits the string by code point, so a surrogate pair counts once.
  const total = Array.from(text).reduce(
    (sum, ch) => sum + weightOf(ch.codePointAt(0)!),
    0,
  );
  return Math.round(total);
}

// Upper bound on the estimated token count (see estimateTokens) of the prompt;
// used to limit typing in the textarea and to scale the usage bar.
// NOTE(review): presumably half of Whisper's 224-token prompt limit, leaving
// room for custom words — confirm.
const TOKEN_BUDGET = 112;

// Sample prompt text per language, keyed by the preset dropdown option value.
// Selecting a preset copies its text into the prompt verbatim.
const PRESETS: Record<string, string> = {
english: `Hello! How are you? He said: "Let's do this today — while we have time." Of course, it's not that simple.`,
spanish: `¡Hola! ¿Cómo estás? Él dijo: "Hagámoslo hoy, mientras tengamos tiempo." Claro, no es tan sencillo.`,
french: `Bonjour ! Comment allez-vous ? Il a dit : « Faisons-le aujourd'hui — tant qu'on a le temps. » Ce n'est pas si simple.`,
german: `Hallo! Wie geht es Ihnen? Er sagte: „Machen wir es heute — solange wir Zeit haben." So einfach ist es nicht.`,
portuguese: `Olá! Como você está? Ele disse: "Vamos fazer isso hoje — enquanto temos tempo." Claro, não é tão simples.`,
italian: `Ciao! Come stai? Ha detto: "Facciamolo oggi — finché abbiamo tempo." Non è così semplice.`,
russian: `Привет! Как дела? Он сказал: «Сделаем это сегодня — пока есть время». Конечно, не всё так просто; нужно учесть погоду.`,
japanese: `こんにちは!元気ですか?「今日やりましょう。」もちろん、簡単ではない。`,
chinese_simplified: `你好!你怎么样?他说:"今天就做吧。"当然,事情没那么简单。`,
chinese_traditional: `你好!你怎麼樣?他說:「今天就做吧。」當然,事情沒那麼簡單。`,
};

/**
 * Settings control for the free-form transcription prompt.
 *
 * Renders a preset dropdown, a textarea holding the prompt, warnings for
 * non-Whisper models and for automatic language selection, and a bar showing
 * how much of the estimated token budget the current text uses.
 *
 * The textarea edits a local draft that is persisted (trimmed, or `null` when
 * empty) when it loses focus; choosing a preset persists immediately.
 */
export const TranscriptionPrompt: React.FC<TranscriptionPromptProps> =
  React.memo(({ descriptionMode = "tooltip", grouped = false }) => {
    const { t } = useTranslation();
    const { getSetting, updateSetting, isUpdating } = useSettings();
    const currentPrompt = getSetting("transcription_prompt") ?? "";
    const selectedLanguage = getSetting("selected_language");
    const currentModelId = useModelStore((s) => s.currentModel);
    const getModelInfo = useModelStore((s) => s.getModelInfo);
    const isWhisper =
      getModelInfo(currentModelId)?.engine_type === "Whisper";
    const [localValue, setLocalValue] = useState(currentPrompt);
    // True while the draft holds edits that have not been persisted yet.
    const [isDirty, setIsDirty] = useState(false);

    // The preset whose text equals the trimmed draft, or "none".
    const activePreset =
      Object.entries(PRESETS).find(
        ([, text]) => text === localValue.trim(),
      )?.[0] ?? "none";

    // Translation keys are kept as static string literals at each call site.
    const presetOptions: DropdownOption[] = useMemo(
      () => [
        {
          value: "none",
          label: t("settings.advanced.transcriptionPrompt.presets.none"),
        },
        {
          value: "english",
          label: t("settings.advanced.transcriptionPrompt.presets.english"),
        },
        {
          value: "spanish",
          label: t("settings.advanced.transcriptionPrompt.presets.spanish"),
        },
        {
          value: "french",
          label: t("settings.advanced.transcriptionPrompt.presets.french"),
        },
        {
          value: "german",
          label: t("settings.advanced.transcriptionPrompt.presets.german"),
        },
        {
          value: "portuguese",
          label: t("settings.advanced.transcriptionPrompt.presets.portuguese"),
        },
        {
          value: "italian",
          label: t("settings.advanced.transcriptionPrompt.presets.italian"),
        },
        {
          value: "russian",
          label: t("settings.advanced.transcriptionPrompt.presets.russian"),
        },
        {
          value: "japanese",
          label: t("settings.advanced.transcriptionPrompt.presets.japanese"),
        },
        {
          value: "chinese_simplified",
          label: t(
            "settings.advanced.transcriptionPrompt.presets.chineseSimplified",
          ),
        },
        {
          value: "chinese_traditional",
          label: t(
            "settings.advanced.transcriptionPrompt.presets.chineseTraditional",
          ),
        },
      ],
      [t],
    );

    // Follow the stored prompt unless the user has unsaved edits in the draft.
    useEffect(() => {
      if (!isDirty) {
        setLocalValue(currentPrompt);
      }
    }, [currentPrompt, isDirty]);

    const handleChange = useCallback(
      (e: React.ChangeEvent<HTMLTextAreaElement>) => {
        const value = e.target.value;
        const nextEstimate = estimateTokens(value);
        // Accept edits that fit the budget. Also accept edits that do not
        // raise the estimate, so a draft that is already over budget can
        // still be shortened instead of having every keystroke rejected.
        if (
          nextEstimate <= TOKEN_BUDGET ||
          nextEstimate <= estimateTokens(localValue)
        ) {
          setLocalValue(value);
          setIsDirty(true);
        }
      },
      [localValue],
    );

    // Persist the trimmed draft on blur; an empty draft is stored as null.
    const handleBlur = useCallback(() => {
      if (!isDirty) return;
      const trimmed = localValue.trim();
      updateSetting(
        "transcription_prompt",
        trimmed.length > 0 ? trimmed : null,
      );
      setIsDirty(false);
    }, [localValue, isDirty, updateSetting]);

    const handlePreset = useCallback(
      (key: string) => {
        // "none" (or a key with no preset text) clears the prompt; an empty
        // prompt is stored as null, the same way handleBlur stores it.
        const preset = key === "none" ? "" : (PRESETS[key] ?? "");
        setLocalValue(preset);
        updateSetting(
          "transcription_prompt",
          preset.length > 0 ? preset : null,
        );
        setIsDirty(false);
      },
      [updateSetting],
    );

    const estimatedTokens = estimateTokens(localValue);
    // Share of the budget in use, clamped to 100 for display.
    const percentage = Math.min(
      100,
      Math.round((estimatedTokens / TOKEN_BUDGET) * 100),
    );
    const barColor =
      percentage >= 95
        ? "bg-red-400"
        : percentage >= 80
          ? "bg-yellow-400"
          : "bg-mid-gray/50";

    return (
      <SettingContainer
        title={t("settings.advanced.transcriptionPrompt.title")}
        description={t("settings.advanced.transcriptionPrompt.description")}
        descriptionMode={descriptionMode}
        grouped={grouped}
        layout="stacked"
      >
        <div className="flex flex-col gap-2 w-full">
          <div className="flex items-center gap-2">
            <label className="text-xs text-mid-gray">
              {t("settings.advanced.transcriptionPrompt.presets.label")}
            </label>
            <Dropdown
              options={presetOptions}
              selectedValue={activePreset}
              onSelect={handlePreset}
              disabled={isUpdating("transcription_prompt")}
              className="min-w-[140px]"
            />
          </div>
          <Textarea
            variant="compact"
            className="w-full"
            value={localValue}
            onChange={handleChange}
            onBlur={handleBlur}
            placeholder={t("settings.advanced.transcriptionPrompt.placeholder")}
            disabled={isUpdating("transcription_prompt")}
          />
          <div className="flex items-start justify-between gap-2 text-xs">
            <div className="flex flex-col gap-0.5 text-yellow-500">
              {!isWhisper && (
                <span>
                  {t("settings.advanced.transcriptionPrompt.whisperOnly")}
                </span>
              )}
              {selectedLanguage === "auto" && localValue.length > 0 && (
                <span>
                  {t("settings.advanced.transcriptionPrompt.languageWarning")}
                </span>
              )}
            </div>
            <div className="flex items-center gap-2 shrink-0">
              <div className="w-24 h-1.5 rounded-full bg-mid-gray/20 overflow-hidden">
                <div
                  className={`h-full rounded-full transition-all ${barColor}`}
                  style={{ width: `${percentage}%` }}
                />
              </div>
              <span className="text-mid-gray text-xs tabular-nums">
                {percentage}%
              </span>
            </div>
          </div>
          {localValue.length > 0 && (
            <span className="text-mid-gray/60 text-xs">
              {t("settings.advanced.transcriptionPrompt.tokenBudgetHint")}
            </span>
          )}
        </div>
      </SettingContainer>
    );
  });
2 changes: 2 additions & 0 deletions src/components/settings/advanced/AdvancedSettings.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ import { useTranslation } from "react-i18next";
import { ShowOverlay } from "../ShowOverlay";
import { ModelUnloadTimeoutSetting } from "../ModelUnloadTimeout";
import { CustomWords } from "../CustomWords";
import { TranscriptionPrompt } from "../TranscriptionPrompt";
import { SettingsGroup } from "../../ui/SettingsGroup";
import { StartHidden } from "../StartHidden";
import { AutostartToggle } from "../AutostartToggle";
Expand Down Expand Up @@ -46,6 +47,7 @@ export const AdvancedSettings: React.FC = () => {

<SettingsGroup title={t("settings.advanced.groups.transcription")}>
<CustomWords descriptionMode="tooltip" grouped />
<TranscriptionPrompt descriptionMode="tooltip" grouped />
<AppendTrailingSpace descriptionMode="tooltip" grouped={true} />
</SettingsGroup>

Expand Down
Loading