From ad0fe41ee9f8f3582d3e9b487871fac4d92aaf8b Mon Sep 17 00:00:00 2001 From: Sergei Date: Sat, 24 Jan 2026 21:51:20 -0800 Subject: [PATCH] Improve voice call UX and disable agent interruption MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Chat improvements: - Add pulsing animation to voice call button during active call - Log call start/end with duration to chat history - End call automatically when deployment ID changes - Reduce bottom padding (removed SafeArea bottom edge) Julia Agent: - Disable user interruption (min_interruption_duration=999) - Agent now speaks without being interrupted 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- app/(tabs)/chat.tsx | 472 +++++++++++----------------- app/(tabs)/profile.tsx | 4 +- julia-agent/julia-ai/livekit.toml | 6 + julia-agent/julia-ai/pyproject.toml | 2 + julia-agent/julia-ai/src/agent.py | 44 ++- 5 files changed, 228 insertions(+), 300 deletions(-) diff --git a/app/(tabs)/chat.tsx b/app/(tabs)/chat.tsx index d7506f3..8a58554 100644 --- a/app/(tabs)/chat.tsx +++ b/app/(tabs)/chat.tsx @@ -17,6 +17,7 @@ import { Keyboard, Platform, Alert, + Animated, } from 'react-native'; import { KeyboardAvoidingView } from 'react-native-keyboard-controller'; import { Ionicons } from '@expo/vector-icons'; @@ -36,12 +37,10 @@ import { LiveKitRoom, useVoiceAssistant, useConnectionState, - useRoomContext, - BarVisualizer, useTrackTranscription, useTracks, } from '@livekit/react-native'; -import { ConnectionState, RoomEvent, Track, TranscriptionSegment } from 'livekit-client'; +import { ConnectionState, Track } from 'livekit-client'; import { getToken, type BeneficiaryData } from '@/services/livekitService'; import { useAuth } from '@/contexts/AuthContext'; @@ -128,21 +127,17 @@ function normalizeQuestion(userMessage: string): string { } // ============================================================================ -// Voice Call Overlay Component +// Voice Call Transcript Handler (invisible - just captures transcripts) // ============================================================================ -interface VoiceCallOverlayProps { - onHangUp: () => void; - onMinimize: () => void; +interface VoiceCallTranscriptHandlerProps { onTranscript: (role: 'user' | 'assistant', text: string) => void; onDurationUpdate: (seconds: number) => void; - beneficiaryName?: string; } -function VoiceCallContent({ onHangUp, onMinimize, onTranscript, onDurationUpdate, beneficiaryName }: VoiceCallOverlayProps) { - const room = useRoomContext(); +function VoiceCallTranscriptHandler({ onTranscript, onDurationUpdate }: VoiceCallTranscriptHandlerProps) { const connectionState = useConnectionState(); - const { state: agentState, audioTrack } = useVoiceAssistant(); + const { audioTrack } = useVoiceAssistant(); const [callDuration, setCallDuration] = useState(0); const [lastProcessedId, setLastProcessedId] = useState(null); @@ -181,15 +176,13 @@ function VoiceCallContent({ onHangUp, onMinimize, onTranscript, onDurationUpdate } }, [userSegments, lastUserSegmentId, onTranscript]); - // Call duration timer + // Call duration timer - use ref to avoid state updates during render + const durationRef = useRef(0); useEffect(() => { if (connectionState === ConnectionState.Connected) { const interval = setInterval(() => { - setCallDuration(prev => { - const newDuration = prev + 1; - onDurationUpdate(newDuration); - return newDuration; - }); + durationRef.current += 1; + onDurationUpdate(durationRef.current); }, 1000); 
return () => clearInterval(interval); } @@ -203,187 +196,10 @@ function VoiceCallContent({ onHangUp, onMinimize, onTranscript, onDurationUpdate }; }, []); - // Format duration as mm:ss - const formatDuration = (seconds: number) => { - const mins = Math.floor(seconds / 60); - const secs = seconds % 60; - return `${mins.toString().padStart(2, '0')}:${secs.toString().padStart(2, '0')}`; - }; - - // Get status text based on agent state - const getStatusText = () => { - if (connectionState === ConnectionState.Connecting) return 'Connecting...'; - if (connectionState === ConnectionState.Reconnecting) return 'Reconnecting...'; - if (connectionState !== ConnectionState.Connected) return 'Disconnected'; - - switch (agentState) { - case 'listening': return 'Listening...'; - case 'thinking': return 'Thinking...'; - case 'speaking': return 'Speaking...'; - case 'connecting': return 'Connecting to Julia...'; - case 'initializing': return 'Starting...'; - default: return 'Connected'; - } - }; - - return ( - - - {/* Avatar */} - - - J - - {agentState === 'speaking' && ( - - )} - - - {/* Name and status */} - Julia AI - {beneficiaryName && ( - About {beneficiaryName} - )} - {getStatusText()} - - {/* Duration */} - {connectionState === ConnectionState.Connected && ( - {formatDuration(callDuration)} - )} - - {/* Audio Visualizer */} - {audioTrack && agentState === 'speaking' && ( - - - - )} - - - {/* Call controls */} - - {/* Minimize button */} - - - - - {/* Hang up button */} - - - - - {/* Placeholder for symmetry */} - - - - ); + // This component renders nothing - it just handles transcripts + return null; } -const voiceStyles = StyleSheet.create({ - container: { - flex: 1, - backgroundColor: 'rgba(0, 0, 0, 0.95)', - justifyContent: 'space-between', - alignItems: 'center', - paddingVertical: 60, - }, - content: { - flex: 1, - justifyContent: 'center', - alignItems: 'center', - }, - avatarContainer: { - position: 'relative', - marginBottom: Spacing.lg, - }, - avatar: { - width: 120, - height: 120, - borderRadius: 60, - backgroundColor: AppColors.success, - justifyContent: 'center', - alignItems: 'center', - }, - avatarSpeaking: { - backgroundColor: AppColors.primary, - }, - avatarText: { - fontSize: 48, - fontWeight: '600', - color: AppColors.white, - }, - speakingRing: { - position: 'absolute', - top: -10, - left: -10, - right: -10, - bottom: -10, - borderRadius: 70, - borderWidth: 3, - borderColor: AppColors.primary, - opacity: 0.5, - }, - name: { - fontSize: FontSizes['2xl'], - fontWeight: '600', - color: AppColors.white, - marginBottom: Spacing.xs, - }, - beneficiary: { - fontSize: FontSizes.base, - color: 'rgba(255, 255, 255, 0.7)', - marginBottom: Spacing.sm, - }, - status: { - fontSize: FontSizes.base, - color: AppColors.success, - marginBottom: Spacing.md, - }, - duration: { - fontSize: FontSizes.lg, - color: 'rgba(255, 255, 255, 0.8)', - fontVariant: ['tabular-nums'], - }, - visualizerContainer: { - marginTop: Spacing.xl, - height: 60, - width: 200, - }, - callControls: { - flexDirection: 'row', - alignItems: 'center', - justifyContent: 'center', - gap: Spacing.xl, - marginBottom: Spacing.xl, - }, - minimizeButton: { - width: 56, - height: 56, - borderRadius: 28, - backgroundColor: 'rgba(255, 255, 255, 0.2)', - justifyContent: 'center', - alignItems: 'center', - }, - hangUpButton: { - width: 72, - height: 72, - borderRadius: 36, - backgroundColor: AppColors.error, - justifyContent: 'center', - alignItems: 'center', - }, - controlPlaceholder: { - width: 56, - height: 56, - }, -}); - export 
default function ChatScreen() { const router = useRouter(); const { currentBeneficiary, setCurrentBeneficiary } = useBeneficiary(); @@ -399,20 +215,51 @@ export default function ChatScreen() { isCallActive, } = useVoiceCall(); - // Chat state - const [messages, setMessages] = useState([ - { - id: '1', - role: 'assistant', - content: 'Hello! I\'m Julia, your AI wellness assistant. You can type a message or tap the phone button to start a voice call.', - timestamp: new Date(), - }, - ]); + // Helper to create initial message with deployment ID + const createInitialMessage = useCallback((deploymentId?: string | null): Message => ({ + id: '1', + role: 'assistant', + content: `Hello! I'm Julia, your AI wellness companion.${deploymentId ? `\n\nDeployment ID: ${deploymentId}` : ''}\n\nTap the phone button to start a voice call, or type a message below.`, + timestamp: new Date(), + }), []); + + // Custom deployment ID from settings + const [customDeploymentId, setCustomDeploymentId] = useState(null); + + // Chat state - initialized after deployment ID is loaded + const [messages, setMessages] = useState([createInitialMessage(null)]); const [sortNewestFirst, setSortNewestFirst] = useState(false); // Voice call state (local connecting state only) const [isConnectingVoice, setIsConnectingVoice] = useState(false); + // Pulsing animation for active call + const pulseAnim = useRef(new Animated.Value(1)).current; + + // Start pulsing animation when call is active + useEffect(() => { + if (isCallActive) { + const pulse = Animated.loop( + Animated.sequence([ + Animated.timing(pulseAnim, { + toValue: 1.15, + duration: 600, + useNativeDriver: true, + }), + Animated.timing(pulseAnim, { + toValue: 1, + duration: 600, + useNativeDriver: true, + }), + ]) + ); + pulse.start(); + return () => pulse.stop(); + } else { + pulseAnim.setValue(1); + } + }, [isCallActive, pulseAnim]); + // Track if we've shown the voice call separator for current call const [hasShownVoiceSeparator, setHasShownVoiceSeparator] = useState(false); @@ -433,17 +280,40 @@ export default function ChatScreen() { const [beneficiaries, setBeneficiaries] = useState([]); const [loadingBeneficiaries, setLoadingBeneficiaries] = useState(false); - // Custom deployment ID from settings - const [customDeploymentId, setCustomDeploymentId] = useState(null); - - // Load custom deployment ID from settings + // Load custom deployment ID from settings and update initial message useEffect(() => { const loadCustomDeploymentId = async () => { const saved = await api.getDeploymentId(); setCustomDeploymentId(saved); + // Update initial message with deployment ID + if (saved) { + setMessages([createInitialMessage(saved)]); + } }; loadCustomDeploymentId(); - }, []); + }, [createInitialMessage]); + + // When deployment ID changes, end call and clear chat + const previousDeploymentId = useRef(null); + useEffect(() => { + // Skip initial load + if (previousDeploymentId.current === null) { + previousDeploymentId.current = customDeploymentId; + return; + } + // If deployment ID actually changed + if (previousDeploymentId.current !== customDeploymentId) { + console.log('[Chat] Deployment ID changed, ending call and clearing chat'); + // End any active call + if (isCallActive) { + endVoiceCallContext(); + } + // Clear chat with new initial message + setMessages([createInitialMessage(customDeploymentId)]); + setHasShownVoiceSeparator(false); + previousDeploymentId.current = customDeploymentId; + } + }, [customDeploymentId, createInitialMessage, isCallActive, 
endVoiceCallContext]); // Load beneficiaries const loadBeneficiaries = useCallback(async () => { @@ -546,6 +416,16 @@ export default function ChatScreen() { console.log('[Chat] Got voice token, connecting to room:', tokenResponse.data.roomName); + // Add call start message to chat + const callStartMessage: Message = { + id: `call-start-${Date.now()}`, + role: 'assistant', + content: 'Voice call started', + timestamp: new Date(), + isSystem: true, + }; + setMessages(prev => [...prev, callStartMessage]); + // Clear previous transcript and start call via context clearTranscript(); startCall({ @@ -565,29 +445,33 @@ export default function ChatScreen() { } }, [isConnectingVoice, isCallActive, currentBeneficiary, beneficiaries, user, clearTranscript, startCall, customDeploymentId]); - // End voice call + // End voice call and log to chat const endVoiceCall = useCallback(() => { console.log('[Chat] Ending voice call...'); + + // Add call end message to chat with duration + const duration = callState.callDuration; + const minutes = Math.floor(duration / 60); + const seconds = duration % 60; + const durationStr = `${minutes}:${seconds.toString().padStart(2, '0')}`; + + const callEndMessage: Message = { + id: `call-end-${Date.now()}`, + role: 'assistant', + content: `Call ended (${durationStr})`, + timestamp: new Date(), + isSystem: true, + }; + setMessages(prev => [...prev, callEndMessage]); + setHasShownVoiceSeparator(false); + endVoiceCallContext(); - }, [endVoiceCallContext]); + }, [endVoiceCallContext, callState.callDuration]); // Handle voice transcript entries - add to chat in real-time const handleVoiceTranscript = useCallback((role: 'user' | 'assistant', text: string) => { if (!text.trim()) return; - // Add separator before first voice message of this call - if (!hasShownVoiceSeparator) { - const separatorMessage: Message = { - id: `voice-separator-${Date.now()}`, - role: 'assistant', - content: 'Voice Call', - timestamp: new Date(), - isSystem: true, - }; - setMessages(prev => [...prev, separatorMessage]); - setHasShownVoiceSeparator(true); - } - // Create voice message and add to chat immediately const voiceMessage: Message = { id: `voice-${Date.now()}-${Math.random().toString(36).slice(2)}`, @@ -772,7 +656,7 @@ export default function ChatScreen() { }; return ( - + {/* Header */} router.push('/(tabs)')}> @@ -904,23 +788,37 @@ export default function ChatScreen() { {/* Input */} - {/* Voice Call Button */} - - {isConnectingVoice ? ( - - ) : isCallActive ? ( - - ) : ( - - )} - + {/* Voice Call Button - becomes pulsing bubble during call */} + + + {isConnectingVoice ? ( + + ) : isCallActive ? ( + + + + ) : ( + + )} + + + {/* Call duration badge */} + {isCallActive && ( + + + {Math.floor(callState.callDuration / 60).toString().padStart(2, '0')}: + {(callState.callDuration % 60).toString().padStart(2, '0')} + + + )} - {/* Voice Call Modal */} - - - {callState.token && callState.wsUrl ? ( - console.log('[Chat] LiveKit connected')} - onDisconnected={endVoiceCall} - onError={(error) => { - console.error('[Chat] LiveKit error:', error); - Alert.alert('Voice Call Error', error.message); - endVoiceCall(); - }} - > - - - ) : ( - - - Connecting... 
- - )} - - + {/* Invisible LiveKit Room - runs in background during call */} + {isCallActive && callState.token && callState.wsUrl && ( + console.log('[Chat] LiveKit connected')} + onDisconnected={endVoiceCall} + onError={(error) => { + console.error('[Chat] LiveKit error:', error); + Alert.alert('Voice Call Error', error.message); + endVoiceCall(); + }} + > + + + )} ); }
@@ -1138,6 +1019,33 @@ const styles = StyleSheet.create({
     borderColor: AppColors.success,
     backgroundColor: 'rgba(90, 200, 168, 0.1)',
   },
+  voiceButtonActive: {
+    backgroundColor: AppColors.success,
+    borderColor: AppColors.success,
+  },
+  callActiveIndicator: {
+    width: '100%',
+    height: '100%',
+    justifyContent: 'center',
+    alignItems: 'center',
+  },
+  callDurationBadge: {
+    position: 'absolute',
+    left: 32,
+    top: -8,
+    backgroundColor: AppColors.success,
+    paddingHorizontal: 6,
+    paddingVertical: 2,
+    borderRadius: 8,
+    minWidth: 42,
+    alignItems: 'center',
+  },
+  callDurationText: {
+    fontSize: 10,
+    fontWeight: '600',
+    color: AppColors.white,
+    fontVariant: ['tabular-nums'],
+  },
   sendButton: {
     width: 44,
     height: 44,
diff --git a/app/(tabs)/profile.tsx b/app/(tabs)/profile.tsx
index 022faf1..f27f0f4 100644
--- a/app/(tabs)/profile.tsx
+++ b/app/(tabs)/profile.tsx
@@ -166,8 +166,8 @@ export default function ProfileScreen() {
diff --git a/julia-agent/julia-ai/livekit.toml b/julia-agent/julia-ai/livekit.toml
index fe1c113..51f4b64 100644
--- a/julia-agent/julia-ai/livekit.toml
+++ b/julia-agent/julia-ai/livekit.toml
@@ -6,3 +6,9 @@ id = "CA_Yd3qcuYEVKKE"
 
 [build]
 dockerfile = "Dockerfile"
+
+[env]
+# Deepgram for TTS
+DEEPGRAM_API_KEY = "cec33b489b0ba12c4e4f1ea888e887e88fba5848"
+# AssemblyAI for STT (best accuracy - correctly recognizes "dad" vs "dead")
+ASSEMBLYAI_API_KEY = "42e753b65b6a4360ae4a77ac76961857"
diff --git a/julia-agent/julia-ai/pyproject.toml b/julia-agent/julia-ai/pyproject.toml
index 09ebec4..49a3411 100644
--- a/julia-agent/julia-ai/pyproject.toml
+++ b/julia-agent/julia-ai/pyproject.toml
@@ -12,6 +12,8 @@ dependencies = [
     "livekit-agents[silero]~=1.3",
     "livekit-plugins-noise-cancellation~=0.2",
     "livekit-plugins-deepgram~=1.0",
+    # Removed assemblyai - was giving garbage transcriptions
+    # Deepgram Nova-2 is faster and more accurate
     "python-dotenv",
     "aiohttp",
 ]
diff --git a/julia-agent/julia-ai/src/agent.py b/julia-agent/julia-ai/src/agent.py
index 3dc4ad1..65ba516 100644
--- a/julia-agent/julia-ai/src/agent.py
+++ b/julia-agent/julia-ai/src/agent.py
@@ -312,7 +312,14 @@ class WellNuoLLMStream(llm.LLMStream):
 
 def prewarm(proc: JobProcess):
     """Preload VAD model for faster startup."""
-    proc.userdata["vad"] = silero.VAD.load()
+    # Increase min_silence_duration to prevent cutting off user speech during barge-in
+    # Default is 0.55s which is too short - user pauses between words get interpreted as end of speech
+    # 0.9s gives user more time to continue speaking without being cut off
+    proc.userdata["vad"] = silero.VAD.load(
+        min_silence_duration=0.9,  # Wait 0.9s of silence before ending speech (default: 0.55)
+        min_speech_duration=0.05,  # Keep low for quick interruption detection (default: 0.05)
+        activation_threshold=0.4,  # Slightly lower for better sensitivity (default: 0.5)
+    )
 
 
 async def wait_for_participant_with_metadata(
@@ -389,34 +396,39 @@ async def entrypoint(ctx: JobContext):
 
     logger.info(f"Starting Julia AI session in room {ctx.room.name}")
 
-    # Wait for participant with metadata (fixes race condition)
-    # The mobile app sends deploymentId and beneficiaryNamesDict in token metadata
+    # Wait for participant with metadata - short timeout since metadata arrives immediately if present
+    # The mobile app sends deploymentId via token metadata
     deployment_id, beneficiary_names_dict = await wait_for_participant_with_metadata(
-        ctx, timeout=10.0
+        ctx, timeout=2.0  # 2 seconds is enough - if metadata exists, it arrives within 0.5s
     )
 
-    # Log what we're using
+    # Use deployment_id from metadata, or fall back to default
     effective_deployment_id = deployment_id or DEPLOYMENT_ID
-    logger.info(
-        f"Using WellNuo ask_wellnuo_ai API with deployment_id: {effective_deployment_id}"
-    )
-    if beneficiary_names_dict:
-        logger.info(f"Beneficiary names dict: {beneficiary_names_dict}")
-    else:
-        logger.info("No beneficiary_names_dict provided, using default behavior")
+    logger.info(f"Using deployment_id={effective_deployment_id} (from_metadata={deployment_id is not None})")
 
+    # Deepgram for STT - better accuracy and faster than AssemblyAI
+    # AssemblyAI was giving garbage like "shambhala balashambal" instead of actual speech
     session = AgentSession(
-        # Deepgram Nova-2 for accurate speech-to-text
-        stt=deepgram.STT(model="nova-2"),
+        # Deepgram Nova-2 model for best STT accuracy
+        stt=deepgram.STT(
+            model="nova-2-general",
+            language="en-US",
+            smart_format=True,  # Better punctuation and formatting
+            no_delay=True,  # Faster response for real-time
+        ),
         # WellNuo voice_ask API for LLM with dynamic beneficiary data
         llm=WellNuoLLM(
-            deployment_id=deployment_id,
+            deployment_id=effective_deployment_id,
             beneficiary_names_dict=beneficiary_names_dict,
         ),
         # Deepgram Aura Asteria for natural female voice
         tts=deepgram.TTS(model="aura-asteria-en"),
-        # Silero VAD for voice activity detection
+        # Silero VAD for voice activity detection (prewarmed with tuned settings)
         vad=ctx.proc.userdata["vad"],
+        # INTERRUPTION SETTINGS:
+        # min_interruption_duration: How long user must speak to trigger interruption (default 0.5s)
+        # Set to 999.0 to effectively DISABLE interruption - user cannot interrupt the agent
+        min_interruption_duration=999.0,
     )
 
     # Start the session with Julia assistant
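Note: the agent.py hunk above is truncated just before the session is actually started. For context, a minimal sketch of how the entrypoint likely continues under the livekit-agents 1.x API is shown below. It is not part of this patch: the `JuliaAssistant` class name and the greeting text are placeholders, and the noise-cancellation option is only an assumption based on livekit-plugins-noise-cancellation being listed in pyproject.toml.

    # Sketch only - not lines from this patch. Assumes the usual livekit-agents 1.x imports:
    #   from livekit.agents import RoomInputOptions
    #   from livekit.plugins import noise_cancellation
    # `JuliaAssistant` stands in for the Agent subclass defined elsewhere in agent.py.
    await session.start(
        room=ctx.room,
        agent=JuliaAssistant(),
        room_input_options=RoomInputOptions(
            # BVC noise cancellation from livekit-plugins-noise-cancellation (already a dependency)
            noise_cancellation=noise_cancellation.BVC(),
        ),
    )

    # Initial greeting; with min_interruption_duration=999.0 above, the caller cannot barge in on it.
    await session.generate_reply(
        instructions="Greet the caller briefly and ask how you can help."
    )

If the installed livekit-agents version exposes it, passing allow_interruptions=False to AgentSession may be a more direct way to express "the agent cannot be interrupted" than the min_interruption_duration=999.0 workaround.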