Fix premature speech cutoff in Julia AI voice agent

- Increase Silero VAD min_silence_duration from 0.9s to 1.8s to allow natural pauses
- Lower activation_threshold from 0.4 to 0.35 for better quiet speaker detection
- Increase padding_duration from 0.3s to 0.5s to capture soft word endings
- Add Deepgram STT endpointing=1500ms to prevent early transcript finalization
- Add Deepgram utterance_end_ms=2000ms for slow speakers

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Sergei 2026-01-29 16:33:28 -08:00
parent a1e30939a6
commit 4b91aba08e

View File

@@ -312,13 +312,30 @@ class WellNuoLLMStream(llm.LLMStream):
 def prewarm(proc: JobProcess):
     """Preload VAD model for faster startup."""
-    # Increase min_silence_duration to prevent cutting off user speech during barge-in
-    # Default is 0.55s which is too short - user pauses between words get interpreted as end of speech
-    # 0.9s gives user more time to continue speaking without being cut off
+    # CRITICAL FIX: Prevent premature speech cutoff
+    #
+    # The VAD (Voice Activity Detection) determines when the user has finished speaking.
+    # Default settings are too aggressive and cut off speech during natural pauses.
+    #
+    # Key parameters:
+    # - min_silence_duration: How long to wait after silence before ending speech
+    #   Default 0.55s is WAY too short - people pause between sentences/thoughts
+    #   1.8s allows natural conversation flow without being cut off
+    #
+    # - min_speech_duration: Minimum speech length to be considered valid
+    #   Keeping it low (0.1s) allows short responses but filters noise
+    #
+    # - activation_threshold: Voice detection sensitivity (0-1)
+    #   Lower = more sensitive to quiet speech, but may pick up background noise
+    #   0.35 is a good balance for typical indoor environments
+    #
+    # - padding_duration: Audio padding around detected speech (default: 0.3)
+    #   Increased to 0.5s to capture soft word endings
     proc.userdata["vad"] = silero.VAD.load(
-        min_silence_duration=0.9,   # Wait 0.9s of silence before ending speech (default: 0.55)
-        min_speech_duration=0.05,   # Keep low for quick interruption detection (default: 0.05)
-        activation_threshold=0.4,   # Slightly lower for better sensitivity (default: 0.5)
+        min_silence_duration=1.8,   # Wait 1.8s of silence before ending speech (was 0.9s)
+        min_speech_duration=0.1,    # Minimum valid speech duration (was 0.05s)
+        activation_threshold=0.35,  # Slightly more sensitive for quiet speakers (was 0.4)
+        padding_duration=0.5,       # Extra audio padding around speech (default: 0.3)
     )
@@ -408,6 +425,13 @@ async def entrypoint(ctx: JobContext):
     # Deepgram for STT - better accuracy and faster than AssemblyAI
     # AssemblyAI was giving garbage like "shambhala balashambal" instead of actual speech
+    #
+    # CRITICAL FIX: Endpointing settings prevent premature speech cutoff
+    # - endpointing: Time in ms of silence before finalizing transcript
+    #   Default is ~500ms which is too aggressive for natural speech
+    #   1500ms (1.5s) allows for thinking pauses without cutting off
+    # - utterance_end_ms: Additional buffer for utterance detection
+    #   2000ms gives extra time for slow speakers or complex sentences
     session = AgentSession(
         # Deepgram Nova-2 model for best STT accuracy
         stt=deepgram.STT(
@@ -415,6 +439,8 @@ async def entrypoint(ctx: JobContext):
             language="en-US",
             smart_format=True,  # Better punctuation and formatting
             no_delay=True,      # Faster response for real-time
+            endpointing=1500,       # Wait 1.5s of silence before finalizing (default: ~500ms)
+            utterance_end_ms=2000,  # Extra 2s buffer for utterance end detection
         ),
         # WellNuo voice_ask API for LLM with dynamic beneficiary data
         llm=WellNuoLLM(