Fix premature speech cutoff in Julia AI voice agent
- Increase Silero VAD min_silence_duration from 0.9s to 1.8s to allow natural pauses
- Lower activation_threshold from 0.4 to 0.35 for better quiet-speaker detection
- Increase padding_duration from 0.3s to 0.5s to capture soft word endings
- Add Deepgram STT endpointing=1500ms to prevent early transcript finalization
- Add Deepgram utterance_end_ms=2000ms to give slow speakers extra time

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
parent
a1e30939a6
commit
4b91aba08e
@ -312,13 +312,30 @@ class WellNuoLLMStream(llm.LLMStream):
|
|||||||
|
|
||||||
def prewarm(proc: JobProcess):
|
def prewarm(proc: JobProcess):
|
||||||
"""Preload VAD model for faster startup."""
|
"""Preload VAD model for faster startup."""
|
||||||
# Increase min_silence_duration to prevent cutting off user speech during barge-in
|
# CRITICAL FIX: Prevent premature speech cutoff
|
||||||
# Default is 0.55s which is too short - user pauses between words get interpreted as end of speech
|
#
|
||||||
# 0.9s gives user more time to continue speaking without being cut off
|
# The VAD (Voice Activity Detection) determines when the user has finished speaking.
|
||||||
|
# Default settings are too aggressive and cut off speech during natural pauses.
|
||||||
|
#
|
||||||
|
# Key parameters:
|
||||||
|
# - min_silence_duration: How long to wait after silence before ending speech
|
||||||
|
# Default 0.55s is WAY too short - people pause between sentences/thoughts
|
||||||
|
# 1.8s allows natural conversation flow without being cut off
|
||||||
|
#
|
||||||
|
# - min_speech_duration: Minimum speech length to be considered valid
|
||||||
|
# Keeping it low (0.1s) allows short responses but filters noise
|
||||||
|
#
|
||||||
|
# - activation_threshold: Voice detection sensitivity (0-1)
|
||||||
|
# Lower = more sensitive to quiet speech, but may pick up background noise
|
||||||
|
# 0.35 is a good balance for typical indoor environments
|
||||||
|
#
|
||||||
|
# - padding_duration: Audio padding around detected speech (default: 0.3)
|
||||||
|
# Increased to 0.5s to capture soft word endings
|
||||||
proc.userdata["vad"] = silero.VAD.load(
|
proc.userdata["vad"] = silero.VAD.load(
|
||||||
min_silence_duration=0.9, # Wait 0.9s of silence before ending speech (default: 0.55)
|
min_silence_duration=1.8, # Wait 1.8s of silence before ending speech (was 0.9s)
|
||||||
min_speech_duration=0.05, # Keep low for quick interruption detection (default: 0.05)
|
min_speech_duration=0.1, # Minimum valid speech duration (was 0.05s)
|
||||||
activation_threshold=0.4, # Slightly lower for better sensitivity (default: 0.5)
|
activation_threshold=0.35, # Slightly more sensitive for quiet speakers (was 0.4)
|
||||||
|
padding_duration=0.5, # Extra audio padding around speech (default: 0.3)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@ -408,13 +425,22 @@ async def entrypoint(ctx: JobContext):
|
|||||||
|
|
||||||
# Deepgram for STT - better accuracy and faster than AssemblyAI
|
# Deepgram for STT - better accuracy and faster than AssemblyAI
|
||||||
# AssemblyAI was giving garbage like "shambhala balashambal" instead of actual speech
|
# AssemblyAI was giving garbage like "shambhala balashambal" instead of actual speech
|
||||||
|
#
|
||||||
|
# CRITICAL FIX: Endpointing settings prevent premature speech cutoff
|
||||||
|
# - endpointing: Time in ms of silence before finalizing transcript
|
||||||
|
# Default is ~500ms which is too aggressive for natural speech
|
||||||
|
# 1500ms (1.5s) allows for thinking pauses without cutting off
|
||||||
|
# - utterance_end_ms: Additional buffer for utterance detection
|
||||||
|
# 2000ms gives extra time for slow speakers or complex sentences
|
||||||
session = AgentSession(
|
session = AgentSession(
|
||||||
# Deepgram Nova-2 model for best STT accuracy
|
# Deepgram Nova-2 model for best STT accuracy
|
||||||
stt=deepgram.STT(
|
stt=deepgram.STT(
|
||||||
model="nova-2-general",
|
model="nova-2-general",
|
||||||
language="en-US",
|
language="en-US",
|
||||||
smart_format=True, # Better punctuation and formatting
|
smart_format=True, # Better punctuation and formatting
|
||||||
no_delay=True, # Faster response for real-time
|
no_delay=True, # Faster response for real-time
|
||||||
|
endpointing=1500, # Wait 1.5s of silence before finalizing (default: ~500ms)
|
||||||
|
utterance_end_ms=2000, # Extra 2s buffer for utterance end detection
|
||||||
),
|
),
|
||||||
# WellNuo voice_ask API for LLM with dynamic beneficiary data
|
# WellNuo voice_ask API for LLM with dynamic beneficiary data
|
||||||
llm=WellNuoLLM(
|
llm=WellNuoLLM(
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user