Fix premature speech cutoff in Julia AI voice agent
- Increase Silero VAD min_silence_duration from 0.9s to 1.8s to allow natural pauses
- Lower activation_threshold from 0.4 to 0.35 for better quiet-speaker detection
- Increase padding_duration from 0.3s to 0.5s to capture soft word endings
- Add Deepgram STT endpointing=1500ms to prevent early transcript finalization
- Add Deepgram utterance_end_ms=2000ms to give slow speakers extra time

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
parent
a1e30939a6
commit
4b91aba08e
@ -312,13 +312,30 @@ class WellNuoLLMStream(llm.LLMStream):
|
|||||||
|
|
||||||
def prewarm(proc: JobProcess):
|
def prewarm(proc: JobProcess):
|
||||||
"""Preload VAD model for faster startup."""
|
"""Preload VAD model for faster startup."""
|
||||||
# Increase min_silence_duration to prevent cutting off user speech during barge-in
|
# CRITICAL FIX: Prevent premature speech cutoff
|
||||||
# Default is 0.55s which is too short - user pauses between words get interpreted as end of speech
|
#
|
||||||
# 0.9s gives user more time to continue speaking without being cut off
|
# The VAD (Voice Activity Detection) determines when the user has finished speaking.
|
||||||
|
# Default settings are too aggressive and cut off speech during natural pauses.
|
||||||
|
#
|
||||||
|
# Key parameters:
|
||||||
|
# - min_silence_duration: How long to wait after silence before ending speech
|
||||||
|
# Default 0.55s is WAY too short - people pause between sentences/thoughts
|
||||||
|
# 1.8s allows natural conversation flow without being cut off
|
||||||
|
#
|
||||||
|
# - min_speech_duration: Minimum speech length to be considered valid
|
||||||
|
# Keeping it low (0.1s) allows short responses but filters noise
|
||||||
|
#
|
||||||
|
# - activation_threshold: Voice detection sensitivity (0-1)
|
||||||
|
# Lower = more sensitive to quiet speech, but may pick up background noise
|
||||||
|
# 0.35 is a good balance for typical indoor environments
|
||||||
|
#
|
||||||
|
# - padding_duration: Audio padding around detected speech (default: 0.3)
|
||||||
|
# Increased to 0.5s to capture soft word endings
|
||||||
proc.userdata["vad"] = silero.VAD.load(
|
proc.userdata["vad"] = silero.VAD.load(
|
||||||
min_silence_duration=0.9, # Wait 0.9s of silence before ending speech (default: 0.55)
|
min_silence_duration=1.8, # Wait 1.8s of silence before ending speech (was 0.9s)
|
||||||
min_speech_duration=0.05, # Keep low for quick interruption detection (default: 0.05)
|
min_speech_duration=0.1, # Minimum valid speech duration (was 0.05s)
|
||||||
activation_threshold=0.4, # Slightly lower for better sensitivity (default: 0.5)
|
activation_threshold=0.35, # Slightly more sensitive for quiet speakers (was 0.4)
|
||||||
|
padding_duration=0.5, # Extra audio padding around speech (default: 0.3)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@ -408,13 +425,22 @@ async def entrypoint(ctx: JobContext):
|
|||||||
|
|
||||||
# Deepgram for STT - better accuracy and faster than AssemblyAI
|
# Deepgram for STT - better accuracy and faster than AssemblyAI
|
||||||
# AssemblyAI was giving garbage like "shambhala balashambal" instead of actual speech
|
# AssemblyAI was giving garbage like "shambhala balashambal" instead of actual speech
|
||||||
|
#
|
||||||
|
# CRITICAL FIX: Endpointing settings prevent premature speech cutoff
|
||||||
|
# - endpointing: Time in ms of silence before finalizing transcript
|
||||||
|
# Default is ~500ms which is too aggressive for natural speech
|
||||||
|
# 1500ms (1.5s) allows for thinking pauses without cutting off
|
||||||
|
# - utterance_end_ms: Additional buffer for utterance detection
|
||||||
|
# 2000ms gives extra time for slow speakers or complex sentences
|
||||||
session = AgentSession(
|
session = AgentSession(
|
||||||
# Deepgram Nova-2 model for best STT accuracy
|
# Deepgram Nova-2 model for best STT accuracy
|
||||||
stt=deepgram.STT(
|
stt=deepgram.STT(
|
||||||
model="nova-2-general",
|
model="nova-2-general",
|
||||||
language="en-US",
|
language="en-US",
|
||||||
smart_format=True, # Better punctuation and formatting
|
smart_format=True, # Better punctuation and formatting
|
||||||
no_delay=True, # Faster response for real-time
|
no_delay=True, # Faster response for real-time
|
||||||
|
endpointing=1500, # Wait 1.5s of silence before finalizing (default: ~500ms)
|
||||||
|
utterance_end_ms=2000, # Extra 2s buffer for utterance end detection
|
||||||
),
|
),
|
||||||
# WellNuo voice_ask API for LLM with dynamic beneficiary data
|
# WellNuo voice_ask API for LLM with dynamic beneficiary data
|
||||||
llm=WellNuoLLM(
|
llm=WellNuoLLM(
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user