diff --git a/julia-agent/julia-ai/src/agent.py b/julia-agent/julia-ai/src/agent.py index 9bf7c53..11ae0b4 100644 --- a/julia-agent/julia-ai/src/agent.py +++ b/julia-agent/julia-ai/src/agent.py @@ -312,13 +312,30 @@ class WellNuoLLMStream(llm.LLMStream): def prewarm(proc: JobProcess): """Preload VAD model for faster startup.""" - # Increase min_silence_duration to prevent cutting off user speech during barge-in - # Default is 0.55s which is too short - user pauses between words get interpreted as end of speech - # 0.9s gives user more time to continue speaking without being cut off + # CRITICAL FIX: Prevent premature speech cutoff + # + # The VAD (Voice Activity Detection) determines when the user has finished speaking. + # Default settings are too aggressive and cut off speech during natural pauses. + # + # Key parameters: + # - min_silence_duration: How long to wait after silence before ending speech + # Default 0.55s is WAY too short - people pause between sentences/thoughts + # 1.8s allows natural conversation flow without being cut off + # + # - min_speech_duration: Minimum speech length to be considered valid + # Keeping it low (0.1s) allows short responses but filters noise + # + # - activation_threshold: Voice detection sensitivity (0-1) + # Lower = more sensitive to quiet speech, but may pick up background noise + # 0.35 is a good balance for typical indoor environments + # + # - prefix_padding_duration: Audio padding around detected speech (default: 0.5) + # 0.5s of padding helps capture soft word endings proc.userdata["vad"] = silero.VAD.load( - min_silence_duration=0.9, # Wait 0.9s of silence before ending speech (default: 0.55) - min_speech_duration=0.05, # Keep low for quick interruption detection (default: 0.05) - activation_threshold=0.4, # Slightly lower for better sensitivity (default: 0.5) + min_silence_duration=1.8, # Wait 1.8s of silence before ending speech (was 0.9s) + min_speech_duration=0.1, # Minimum valid speech duration (was 0.05s) + 
activation_threshold=0.35, # Slightly more sensitive for quiet speakers (was 0.4) + prefix_padding_duration=0.5, # Extra audio padding around speech (default: 0.5) ) @@ -408,13 +425,22 @@ async def entrypoint(ctx: JobContext): # Deepgram for STT - better accuracy and faster than AssemblyAI # AssemblyAI was giving garbage like "shambhala balashambal" instead of actual speech + # + # CRITICAL FIX: Endpointing settings prevent premature speech cutoff + # - endpointing_ms: Time in ms of silence before finalizing transcript + # Default is ~10-25ms which is far too aggressive for natural speech + # 1500ms (1.5s) allows for thinking pauses without cutting off + # - utterance_end_ms: Additional buffer for utterance detection + # 2000ms gives extra time for slow speakers or complex sentences session = AgentSession( # Deepgram Nova-2 model for best STT accuracy stt=deepgram.STT( model="nova-2-general", language="en-US", - smart_format=True, # Better punctuation and formatting - no_delay=True, # Faster response for real-time + smart_format=True, # Better punctuation and formatting + no_delay=True, # Faster response for real-time + endpointing_ms=1500, # Wait 1.5s of silence before finalizing (plugin default: 25ms) + utterance_end_ms=2000, # Extra 2s buffer for utterance end detection ), # WellNuo voice_ask API for LLM with dynamic beneficiary data llm=WellNuoLLM(