diff --git a/.gitignore b/.gitignore
index 2a6edee..d5b9584 100644
--- a/.gitignore
+++ b/.gitignore
@@ -42,3 +42,15 @@ app-example
 /ios
 /android
 .git-credentials
+# TTS Models (too large for git)
+assets/tts-models/**/*.onnx
+assets/tts-models/**/espeak-ng-data/
+
+# Credentials
+credentials.json
+
+# Store screenshots
+store-screenshots/
+
+# Build artifacts
+WellNuoLite-Android/
diff --git a/APP_REVIEW_NOTES.txt b/APP_REVIEW_NOTES.txt
new file mode 100644
index 0000000..590907e
--- /dev/null
+++ b/APP_REVIEW_NOTES.txt
@@ -0,0 +1,100 @@
+═══════════════════════════════════════════════════════════════
+WELLNUO - APP REVIEW NOTES FOR APPLE
+═══════════════════════════════════════════════════════════════
+
+IMPORTANT: This is a B2B professional monitoring application
+
+═══════════════════════════════════════════════════════════════
+1. BUSINESS MODEL CLARIFICATION
+═══════════════════════════════════════════════════════════════
+
+WellNuo is a professional elderly care monitoring system, not a consumer application.
+
+Our business operates as follows:
+• Clients purchase physical monitoring equipment (sensors, devices)
+• Professional technicians install the equipment at the client's location
+• WellNuo staff creates accounts after the service contract is signed
+• The app provides FREE access to monitoring data from installed equipment
+
+The app is 100% FREE - no in-app purchases, no paid features, no subscriptions.
+
+═══════════════════════════════════════════════════════════════
+2. ACCOUNT MANAGEMENT (Guideline 5.1.1(v) - Account Deletion)
+═══════════════════════════════════════════════════════════════
+
+WHY ACCOUNTS ARE NOT CREATED IN-APP:
+• Accounts are created by WellNuo staff, not by end users
+• Account creation happens AFTER the service contract is signed
+• Users receive login credentials from our support team
+• This is a B2B professional service, not consumer self-service
+
+ACCOUNT DELETION PROCESS:
+• Account deletion is part of service contract termination
+• Requires equipment uninstallation by our technicians
+• Users contact WellNuo support to request account deletion
+• The deletion process is documented in the Privacy Policy
+• Support contact: [YOUR SUPPORT EMAIL - e.g., support@wellnuo.com]
+
+This follows Apple's guidelines for B2B/enterprise apps where account management happens through business contracts, not self-service.
+
+═══════════════════════════════════════════════════════════════
+3. IN-APP PURCHASE (Guideline 3.1.1 - Payments)
+═══════════════════════════════════════════════════════════════
+
+The app is completely FREE with no paid features.
+
+WHY NO IN-APP PURCHASE:
+• App is 100% free to download and use
+• No digital content is sold within the app
+• No premium features behind a paywall
+• No subscriptions through the App Store
+
+REVENUE MODEL:
+• Revenue comes from physical monitoring equipment sales
+• Professional installation service fees
+• B2B service contracts (outside the app)
+• Similar business model to: Ring, Nest, Arlo (IoT companion apps)
+
+The app is a FREE companion to physical monitoring equipment, similar to security camera apps or smart home device apps.
+
+═══════════════════════════════════════════════════════════════
+4. TEST ACCOUNT CREDENTIALS
+═══════════════════════════════════════════════════════════════
+
+For review purposes, please use these credentials:
+
+Username: [YOUR TEST USERNAME]
+Password: [YOUR TEST PASSWORD]
+
+The test account provides access to demo monitoring data.
+ +═══════════════════════════════════════════════════════════════ +5. PRIVACY & DATA SECURITY +═══════════════════════════════════════════════════════════════ + +• All data is encrypted in transit (HTTPS) +• User credentials stored in iOS SecureStore (encrypted) +• Privacy Policy available at: [YOUR PRIVACY POLICY URL] +• Terms of Service available at: [YOUR TERMS URL] +• Account deletion instructions in Privacy Policy + +═══════════════════════════════════════════════════════════════ +6. PREVIOUS REVIEW ISSUES - RESOLVED +═══════════════════════════════════════════════════════════════ + +Submission ID: 0992528e-4ce9-4167-9a1b-07f4334a8055 + +✅ Guideline 4.0 - Navigation: Added back button to Chat screen +✅ Guideline 2.1 - Account Creation: Clarified B2B model (no self-service signup) +✅ Guideline 5.1.1(v) - Account Deletion: Documented support-based deletion process +✅ Guideline 3.1.1 - IAP: App is 100% free, no paid features + +═══════════════════════════════════════════════════════════════ + +Thank you for reviewing WellNuo! +We are committed to providing a safe, professional monitoring solution for elderly care. + +If you have any questions about our B2B model or need additional information, +please don't hesitate to contact us. + +═══════════════════════════════════════════════════════════════ diff --git a/assets/tts-models/vits-piper-en_US-lessac-medium/MODEL_CARD b/assets/tts-models/vits-piper-en_US-lessac-medium/MODEL_CARD new file mode 100644 index 0000000..c0b9a3d --- /dev/null +++ b/assets/tts-models/vits-piper-en_US-lessac-medium/MODEL_CARD @@ -0,0 +1,15 @@ +# Model card for lessac (medium) + +* Language: en_US (English, United States) +* Speakers: 1 +* Quality: medium +* Samplerate: 22,050Hz + +## Dataset + +* URL: https://www.cstr.ed.ac.uk/projects/blizzard/2013/lessac_blizzard2013/ +* License: https://www.cstr.ed.ac.uk/projects/blizzard/2013/lessac_blizzard2013/license.html + +## Training + +Trained from scratch. 
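The en_US-lessac-medium.onnx.json added in the next hunk carries the sample rate, the VITS inference parameters (noise_scale, length_scale, noise_w), and the phoneme-to-ID table that inference code needs alongside the .onnx weights. Below is a minimal sketch of reading that table from a Node-style script; the lookup itself is illustrative only, since real Piper inference also interleaves the pad symbol `_` between IDs and wraps sequences in the `^`/`$` sentinels:

```ts
import { readFileSync } from 'fs';

// Subset of en_US-lessac-medium.onnx.json that inference code reads (see next hunk).
interface PiperVoiceConfig {
  audio: { sample_rate: number; quality: string };
  inference: { noise_scale: number; length_scale: number; noise_w: number };
  phoneme_id_map: Record<string, number[]>;
}

// Map an espeak phoneme string to model input IDs via phoneme_id_map.
// Unknown symbols are skipped rather than failing the whole utterance.
function phonemesToIds(cfg: PiperVoiceConfig, phonemes: string): number[] {
  const ids: number[] = [];
  for (const symbol of phonemes) {
    const mapped = cfg.phoneme_id_map[symbol];
    if (mapped) ids.push(...mapped);
  }
  return ids;
}

const cfg: PiperVoiceConfig = JSON.parse(
  readFileSync(
    'assets/tts-models/vits-piper-en_US-lessac-medium/en_US-lessac-medium.onnx.json',
    'utf8',
  ),
);
console.log(cfg.audio.sample_rate);        // 22050
console.log(phonemesToIds(cfg, 'həlˈoʊ')); // [20, 59, 24, 120, 27, 100]
```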
diff --git a/assets/tts-models/vits-piper-en_US-lessac-medium/en_US-lessac-medium.onnx.json b/assets/tts-models/vits-piper-en_US-lessac-medium/en_US-lessac-medium.onnx.json new file mode 100644 index 0000000..c67cea2 --- /dev/null +++ b/assets/tts-models/vits-piper-en_US-lessac-medium/en_US-lessac-medium.onnx.json @@ -0,0 +1,493 @@ +{ + "audio": { + "sample_rate": 22050, + "quality": "medium" + }, + "espeak": { + "voice": "en-us" + }, + "inference": { + "noise_scale": 0.667, + "length_scale": 1, + "noise_w": 0.8 + }, + "phoneme_type": "espeak", + "phoneme_map": {}, + "phoneme_id_map": { + "_": [ + 0 + ], + "^": [ + 1 + ], + "$": [ + 2 + ], + " ": [ + 3 + ], + "!": [ + 4 + ], + "'": [ + 5 + ], + "(": [ + 6 + ], + ")": [ + 7 + ], + ",": [ + 8 + ], + "-": [ + 9 + ], + ".": [ + 10 + ], + ":": [ + 11 + ], + ";": [ + 12 + ], + "?": [ + 13 + ], + "a": [ + 14 + ], + "b": [ + 15 + ], + "c": [ + 16 + ], + "d": [ + 17 + ], + "e": [ + 18 + ], + "f": [ + 19 + ], + "h": [ + 20 + ], + "i": [ + 21 + ], + "j": [ + 22 + ], + "k": [ + 23 + ], + "l": [ + 24 + ], + "m": [ + 25 + ], + "n": [ + 26 + ], + "o": [ + 27 + ], + "p": [ + 28 + ], + "q": [ + 29 + ], + "r": [ + 30 + ], + "s": [ + 31 + ], + "t": [ + 32 + ], + "u": [ + 33 + ], + "v": [ + 34 + ], + "w": [ + 35 + ], + "x": [ + 36 + ], + "y": [ + 37 + ], + "z": [ + 38 + ], + "æ": [ + 39 + ], + "ç": [ + 40 + ], + "ð": [ + 41 + ], + "ø": [ + 42 + ], + "ħ": [ + 43 + ], + "ŋ": [ + 44 + ], + "œ": [ + 45 + ], + "ǀ": [ + 46 + ], + "ǁ": [ + 47 + ], + "ǂ": [ + 48 + ], + "ǃ": [ + 49 + ], + "ɐ": [ + 50 + ], + "ɑ": [ + 51 + ], + "ɒ": [ + 52 + ], + "ɓ": [ + 53 + ], + "ɔ": [ + 54 + ], + "ɕ": [ + 55 + ], + "ɖ": [ + 56 + ], + "ɗ": [ + 57 + ], + "ɘ": [ + 58 + ], + "ə": [ + 59 + ], + "ɚ": [ + 60 + ], + "ɛ": [ + 61 + ], + "ɜ": [ + 62 + ], + "ɞ": [ + 63 + ], + "ɟ": [ + 64 + ], + "ɠ": [ + 65 + ], + "ɡ": [ + 66 + ], + "ɢ": [ + 67 + ], + "ɣ": [ + 68 + ], + "ɤ": [ + 69 + ], + "ɥ": [ + 70 + ], + "ɦ": [ + 71 + ], + "ɧ": [ + 72 + ], + "ɨ": [ + 73 + ], + "ɪ": [ + 74 + ], + "ɫ": [ + 75 + ], + "ɬ": [ + 76 + ], + "ɭ": [ + 77 + ], + "ɮ": [ + 78 + ], + "ɯ": [ + 79 + ], + "ɰ": [ + 80 + ], + "ɱ": [ + 81 + ], + "ɲ": [ + 82 + ], + "ɳ": [ + 83 + ], + "ɴ": [ + 84 + ], + "ɵ": [ + 85 + ], + "ɶ": [ + 86 + ], + "ɸ": [ + 87 + ], + "ɹ": [ + 88 + ], + "ɺ": [ + 89 + ], + "ɻ": [ + 90 + ], + "ɽ": [ + 91 + ], + "ɾ": [ + 92 + ], + "ʀ": [ + 93 + ], + "ʁ": [ + 94 + ], + "ʂ": [ + 95 + ], + "ʃ": [ + 96 + ], + "ʄ": [ + 97 + ], + "ʈ": [ + 98 + ], + "ʉ": [ + 99 + ], + "ʊ": [ + 100 + ], + "ʋ": [ + 101 + ], + "ʌ": [ + 102 + ], + "ʍ": [ + 103 + ], + "ʎ": [ + 104 + ], + "ʏ": [ + 105 + ], + "ʐ": [ + 106 + ], + "ʑ": [ + 107 + ], + "ʒ": [ + 108 + ], + "ʔ": [ + 109 + ], + "ʕ": [ + 110 + ], + "ʘ": [ + 111 + ], + "ʙ": [ + 112 + ], + "ʛ": [ + 113 + ], + "ʜ": [ + 114 + ], + "ʝ": [ + 115 + ], + "ʟ": [ + 116 + ], + "ʡ": [ + 117 + ], + "ʢ": [ + 118 + ], + "ʲ": [ + 119 + ], + "ˈ": [ + 120 + ], + "ˌ": [ + 121 + ], + "ː": [ + 122 + ], + "ˑ": [ + 123 + ], + "˞": [ + 124 + ], + "β": [ + 125 + ], + "θ": [ + 126 + ], + "χ": [ + 127 + ], + "ᵻ": [ + 128 + ], + "ⱱ": [ + 129 + ], + "0": [ + 130 + ], + "1": [ + 131 + ], + "2": [ + 132 + ], + "3": [ + 133 + ], + "4": [ + 134 + ], + "5": [ + 135 + ], + "6": [ + 136 + ], + "7": [ + 137 + ], + "8": [ + 138 + ], + "9": [ + 139 + ], + "̧": [ + 140 + ], + "̃": [ + 141 + ], + "̪": [ + 142 + ], + "̯": [ + 143 + ], + "̩": [ + 144 + ], + "ʰ": [ + 145 + ], + "ˤ": [ + 146 + ], + "ε": [ + 147 + ], + "↓": [ + 148 + ], + "#": [ + 149 + ], + "\"": [ + 150 + ], + "↑": [ + 151 + ], + "̺": [ + 152 + ], + "̻": [ 
+ 153 + ] + }, + "num_symbols": 256, + "num_speakers": 1, + "speaker_id_map": {}, + "piper_version": "1.0.0", + "language": { + "code": "en_US", + "family": "en", + "region": "US", + "name_native": "English", + "name_english": "English", + "country_english": "United States" + }, + "dataset": "lessac" +} \ No newline at end of file diff --git a/assets/tts-models/vits-piper-en_US-lessac-medium/tokens.txt b/assets/tts-models/vits-piper-en_US-lessac-medium/tokens.txt new file mode 100644 index 0000000..8244983 --- /dev/null +++ b/assets/tts-models/vits-piper-en_US-lessac-medium/tokens.txt @@ -0,0 +1,154 @@ +_ 0 +^ 1 +$ 2 + 3 +! 4 +' 5 +( 6 +) 7 +, 8 +- 9 +. 10 +: 11 +; 12 +? 13 +a 14 +b 15 +c 16 +d 17 +e 18 +f 19 +h 20 +i 21 +j 22 +k 23 +l 24 +m 25 +n 26 +o 27 +p 28 +q 29 +r 30 +s 31 +t 32 +u 33 +v 34 +w 35 +x 36 +y 37 +z 38 +æ 39 +ç 40 +ð 41 +ø 42 +ħ 43 +ŋ 44 +œ 45 +ǀ 46 +ǁ 47 +ǂ 48 +ǃ 49 +ɐ 50 +ɑ 51 +ɒ 52 +ɓ 53 +ɔ 54 +ɕ 55 +ɖ 56 +ɗ 57 +ɘ 58 +ə 59 +ɚ 60 +ɛ 61 +ɜ 62 +ɞ 63 +ɟ 64 +ɠ 65 +ɡ 66 +ɢ 67 +ɣ 68 +ɤ 69 +ɥ 70 +ɦ 71 +ɧ 72 +ɨ 73 +ɪ 74 +ɫ 75 +ɬ 76 +ɭ 77 +ɮ 78 +ɯ 79 +ɰ 80 +ɱ 81 +ɲ 82 +ɳ 83 +ɴ 84 +ɵ 85 +ɶ 86 +ɸ 87 +ɹ 88 +ɺ 89 +ɻ 90 +ɽ 91 +ɾ 92 +ʀ 93 +ʁ 94 +ʂ 95 +ʃ 96 +ʄ 97 +ʈ 98 +ʉ 99 +ʊ 100 +ʋ 101 +ʌ 102 +ʍ 103 +ʎ 104 +ʏ 105 +ʐ 106 +ʑ 107 +ʒ 108 +ʔ 109 +ʕ 110 +ʘ 111 +ʙ 112 +ʛ 113 +ʜ 114 +ʝ 115 +ʟ 116 +ʡ 117 +ʢ 118 +ʲ 119 +ˈ 120 +ˌ 121 +ː 122 +ˑ 123 +˞ 124 +β 125 +θ 126 +χ 127 +ᵻ 128 +ⱱ 129 +0 130 +1 131 +2 132 +3 133 +4 134 +5 135 +6 136 +7 137 +8 138 +9 139 +̧ 140 +̃ 141 +̪ 142 +̯ 143 +̩ 144 +ʰ 145 +ˤ 146 +ε 147 +↓ 148 +# 149 +" 150 +↑ 151 +̺ 152 +̻ 153 diff --git a/plugins/withTTSModels.js b/plugins/withTTSModels.js new file mode 100644 index 0000000..14a5721 --- /dev/null +++ b/plugins/withTTSModels.js @@ -0,0 +1,96 @@ +/** + * Expo Config Plugin to bundle TTS model files for iOS + * Uses a Run Script build phase to copy models during build + */ +const { withXcodeProject } = require('@expo/config-plugins'); +const path = require('path'); +const fs = require('fs'); + +const withTTSModels = (config) => { + return withXcodeProject(config, async (config) => { + const projectRoot = config.modRequest.projectRoot; + const xcodeProject = config.modResults; + + // Source model directory (relative to project root) + const modelSrcDir = path.join(projectRoot, 'assets', 'tts-models'); + + if (!fs.existsSync(modelSrcDir)) { + console.warn('[withTTSModels] ⚠️ Model directory not found:', modelSrcDir); + return config; + } + + console.log('[withTTSModels] Found models at:', modelSrcDir); + + // Get the first target + const target = xcodeProject.getFirstTarget(); + if (!target) { + console.error('[withTTSModels] ❌ No target found'); + return config; + } + + // Path relative to ios directory (where Xcode project lives) + const iosDir = path.join(projectRoot, 'ios'); + const relativeModelPath = path.relative(iosDir, modelSrcDir); + + console.log('[withTTSModels] Relative path from ios/:', relativeModelPath); + + // Create a Run Script build phase to copy ONLY Lessac model during build + const scriptName = '[TTS] Copy Model Files'; + const scriptContent = [ + '# Copy TTS model files to app bundle (ONLY Lessac voice)', + 'echo "📦 Copying TTS model to app bundle..."', + '', + 'SOURCE_DIR="${SRCROOT}/' + relativeModelPath + '"', + 'DEST_DIR="${BUILT_PRODUCTS_DIR}/${PRODUCT_NAME}.app/assets/tts-models"', + 'LESSAC_MODEL="vits-piper-en_US-lessac-medium"', + '', + '# Only copy Lessac model', + 'if [ -d "$SOURCE_DIR/$LESSAC_MODEL" ]; then', + ' mkdir -p "$DEST_DIR"', + ' cp -R "$SOURCE_DIR/$LESSAC_MODEL" 
"$DEST_DIR/"', + ' MODEL_SIZE=$(du -sh "$SOURCE_DIR/$LESSAC_MODEL" | cut -f1)', + ' echo "✅ Lessac TTS model copied successfully ($MODEL_SIZE)"', + ' echo " From: $SOURCE_DIR/$LESSAC_MODEL"', + ' echo " To: $DEST_DIR/$LESSAC_MODEL"', + 'else', + ' echo "⚠️ Lessac model not found at: $SOURCE_DIR/$LESSAC_MODEL"', + ' exit 1', + 'fi' + ].join('\n'); + + // Check if script already exists + const buildPhases = xcodeProject.hash.project.objects.PBXShellScriptBuildPhase || {}; + let scriptExists = false; + + for (const key in buildPhases) { + if (buildPhases[key].name === scriptName) { + scriptExists = true; + console.log('[withTTSModels] Run Script phase already exists'); + break; + } + } + + if (!scriptExists) { + // Add the Run Script build phase + xcodeProject.addBuildPhase( + [], + 'PBXShellScriptBuildPhase', + scriptName, + target.uuid, + { + shellPath: '/bin/sh', + shellScript: scriptContent, + // Run before Copy Bundle Resources phase + runOnlyForDeploymentPostprocessing: 0 + } + ); + + console.log('[withTTSModels] ✅ Added Run Script build phase'); + } + + console.log('[withTTSModels] ✅ Plugin configured successfully'); + return config; + }); +}; + +module.exports = withTTSModels; diff --git a/specs/FEATURE-001-voice-integration.md b/specs/FEATURE-001-voice-integration.md new file mode 100644 index 0000000..d212e5c --- /dev/null +++ b/specs/FEATURE-001-voice-integration.md @@ -0,0 +1,342 @@ +# FEATURE-001: Voice Integration for Chat + +## Summary +Integrate voice communication with AI in the chat screen - speech recognition for input and text-to-speech for AI responses. + +## Status: 🟡 In Progress + +## Priority: High + +## Dependencies +- expo-speech-recognition (STT) +- expo-speech (fallback TTS) +- react-native-sherpa-onnx-offline-tts (neural TTS - cross-platform iOS/Android) + +--- + +## Requirements + +### Functional +1. **Voice Input (STT)** + - Tap microphone button to start listening + - Real-time transcript display + - Auto-send when user stops speaking OR tap again to stop + - Visual indicator when listening (pulsing animation) + +2. **Voice Output (TTS)** + - AI responses are spoken automatically + - Visual indicator when speaking + - Stop button to interrupt speech + - Multiple voice options (Lessac/Ryan/Alba) + +3. 
**States & Indicators** + - `isListening` - microphone active, user speaking + - `isSpeaking` - AI voice response playing + - `ttsInitialized` - TTS engine ready + - Animated pulse on microphone when listening + +### Non-Functional +- Works offline (SherpaTTS uses local neural models) +- Cross-platform: iOS and Android +- Low latency speech synthesis + +--- + +## Technical Design + +### Architecture +``` +┌─────────────────────────────────────────────────────────┐ +│ chat.tsx │ +├─────────────────────────────────────────────────────────┤ +│ State: │ +│ - isListening (from useSpeechRecognition) │ +│ - recognizedText (from useSpeechRecognition) │ +│ - isSpeaking │ +│ - ttsInitialized │ +│ - pulseAnim (Animated.Value) │ +├─────────────────────────────────────────────────────────┤ +│ Handlers: │ +│ - handleVoiceToggle() - start/stop listening │ +│ - handleVoiceSend() - send recognized text │ +│ - speakText(text) - speak AI response │ +│ - stopSpeaking() - interrupt speech │ +└─────────────────────────────────────────────────────────┘ + │ │ + ▼ ▼ +┌─────────────────────┐ ┌─────────────────────────────────┐ +│ useSpeechRecognition│ │ sherpaTTS.ts │ +│ (hooks/) │ │ (services/) │ +├─────────────────────┤ ├─────────────────────────────────┤ +│ - startListening() │ │ - initializeSherpaTTS() │ +│ - stopListening() │ │ - speak(text, options) │ +│ - recognizedText │ │ - stop() │ +│ - isListening │ │ - isAvailable() │ +│ - isAvailable │ │ - setVoice(voiceId) │ +└─────────────────────┘ └─────────────────────────────────┘ + │ │ + ▼ ▼ +┌─────────────────────┐ ┌─────────────────────────────────┐ +│expo-speech- │ │ react-native-sherpa-onnx- │ +│recognition │ │ offline-tts (Piper VITS) │ +│(native module) │ │ (native module) │ +└─────────────────────┘ └─────────────────────────────────┘ +``` + +### Available Piper Voices +| ID | Name | Gender | Accent | Model | +|----|------|--------|--------|-------| +| lessac | Lessac | Female | US | en_US-lessac-medium | +| ryan | Ryan | Male | US | en_US-ryan-medium | +| alba | Alba | Female | UK | en_GB-alba-medium | + +### Voice Flow + +``` +User taps mic button + │ + ▼ + handleVoiceToggle() + │ + ┌─────┴─────┐ + │ isListening?│ + └─────┬─────┘ + │ + NO │ YES + │ │ │ + │ │ ▼ + │ │ stopListening() + │ │ handleVoiceSend() + │ │ + ▼ │ +startListening() + │ + ▼ +Speech Recognition active +(recognizedText updates) + │ + ▼ +User stops speaking / taps again + │ + ▼ +handleVoiceSend() + │ + ▼ +sendMessage(recognizedText) + │ + ▼ +AI responds + │ + ▼ +speakText(response) + │ + ▼ +SherpaTTS plays audio +``` + +--- + +## Implementation Steps + +### Phase 1: Setup (DONE) +- [x] Add dependencies to package.json +- [x] Create sherpaTTS.ts service +- [x] Create useSpeechRecognition.ts hook +- [x] Add voice imports to chat.tsx +- [x] Add voice states (isListening, isSpeaking, ttsInitialized, pulseAnim) + +### Phase 2: Logic (DONE) +- [x] Implement handleVoiceToggle() +- [x] Implement handleVoiceSend() +- [x] Implement speakText() +- [x] Implement stopSpeaking() +- [x] TTS initialization on component mount +- [x] Auto-speak AI responses + +### Phase 3: UI (DONE) +- [x] Add microphone button to input area +- [x] Add voice status indicator (Listening.../Speaking...) 
+- [x] Add stop button for speech +- [x] Add pulse animation for listening state +- [x] Add styles for voice UI elements + +### Phase 4: Build & Test (IN PROGRESS) +- [ ] Run npm install +- [ ] Run expo prebuild --clean +- [ ] Build iOS (native modules required) +- [ ] Test on iOS simulator +- [ ] Test on Android (emulator or device) + +### Phase 5: Polish (TODO) +- [ ] Handle permissions properly (microphone access) +- [ ] Add voice picker UI +- [ ] Add speech rate control +- [ ] Test edge cases (no network, no mic permission) + +--- + +## Files Modified/Created + +| File | Status | Description | +|------|--------|-------------| +| `package.json` | Modified | Added voice dependencies | +| `services/sherpaTTS.ts` | Created | SherpaTTS service for offline TTS | +| `hooks/useSpeechRecognition.ts` | Created | Speech recognition hook | +| `app/(tabs)/chat.tsx` | Modified | Voice integration in chat | + +--- + +## Testing Checklist + +### Manual Testing +- [ ] Tap mic button - starts listening +- [ ] Speak - text appears in input field +- [ ] Tap again - sends message +- [ ] AI responds - voice speaks response +- [ ] Tap stop - speech stops immediately +- [ ] Mic button disabled during sending +- [ ] Visual indicators show correct state + +### Edge Cases +- [ ] No microphone permission - shows alert +- [ ] TTS not available - falls back to expo-speech +- [ ] Empty speech recognition - doesn't send +- [ ] Long AI response - speech handles gracefully +- [ ] Interrupt speech and start new input + +--- + +## Notes + +### SherpaTTS Cross-Platform Support +- **iOS**: Uses native module via bridged ObjC/Swift +- **Android**: Uses native module via JNI/Kotlin +- **Model files**: Must be bundled in app (assets/tts-models/) +- **Size**: ~20MB per voice model + +### Known Limitations +- Speech recognition requires device microphone permission +- SherpaTTS requires native build (not Expo Go) +- Model download may be needed on first launch + +--- + +## Voice Interaction Scenarios (All Cases) + +### State Machine + +``` +┌─────────────────────────────────────────────────────────────┐ +│ VOICE STATE MACHINE │ +├─────────────────────────────────────────────────────────────┤ +│ │ +│ ┌──────────┐ │ +│ │ IDLE │◄────────────────────────────────────┐ │ +│ └────┬─────┘ │ │ +│ │ tap mic │ │ +│ ▼ │ │ +│ ┌──────────┐ │ │ +│ │LISTENING │───── user stops / tap ─────────┐ │ │ +│ └────┬─────┘ │ │ │ +│ │ recognized text │ │ │ +│ ▼ │ │ │ +│ ┌──────────┐ ▼ │ │ +│ │PROCESSING│─────────────────────────► SENDING │ │ +│ └────┬─────┘ │ │ │ +│ │ AI responded │ │ │ +│ ▼ │ │ │ +│ ┌──────────┐ │ │ │ +│ │ SPEAKING │◄─────────────────────────────┘ │ │ +│ └────┬─────┘ │ │ +│ │ finished / user tap stop │ │ +│ └───────────────────────────────────────────┘ │ +│ │ +└─────────────────────────────────────────────────────────────┘ +``` + +### A. Happy Path Scenarios + +| # | Scenario | Expected Behavior | Status | +|---|----------|-------------------|--------| +| A1 | User taps mic → speaks → taps again | Text recognized → sent → AI responds → spoken | ✅ | +| A2 | User listens to full AI response | TTS finishes → returns to IDLE | ✅ | +| A3 | User stops TTS with stop button | TTS interrupted → can tap mic again | ✅ | +| A4 | User types text manually | Message sent → AI responds → spoken | ✅ | + +### B. 
Interruptions & Conflicts + +| # | Scenario | Problem | Solution | Status | +|---|----------|---------|----------|--------| +| B1 | Tap mic while AI speaking | Mic would hear TTS | Block mic while `isSpeaking` | ✅ DONE | +| B2 | AI speaking, user wants to stop | No way to interrupt | Stop button (red) | ✅ DONE | +| B3 | User speaking, changes mind | Need to cancel without sending | Tap again = cancel (no text = don't send) | ✅ DONE | +| B4 | AI speaking, user switches tab | Should TTS stop? | Stop TTS on blur | ⚠️ TODO | +| B5 | App goes to background during TTS | TTS continues in background? | Platform-specific behavior | ⚠️ TODO | +| B6 | Double/triple tap on mic | States get confused | Debounce + transition lock | ⚠️ TODO | + +### C. Speech Recognition Errors (STT) + +| # | Scenario | Problem | Solution | Status | +|---|----------|---------|----------|--------| +| C1 | No microphone permission | Speech recognition fails | Show permission alert + Open Settings | ✅ DONE | +| C2 | Microphone busy (other app) | Can't start recording | Show "Microphone busy" error | ⚠️ TODO | +| C3 | User silent for 5+ seconds | No text to send | Auto-cancel with hint | ⚠️ TODO | +| C4 | Speech recognition returns empty | Nothing recognized | Show "Didn't catch that" + auto-hide | ✅ DONE | +| C5 | Network unavailable (Android) | Recognition doesn't work | Expo STT needs network on Android | ⚠️ NOTE | +| C6 | Unsupported language | Recognition works poorly | Hardcode 'en-US' | ✅ DONE | + +### D. Text-to-Speech Errors (TTS) + +| # | Scenario | Problem | Solution | Status | +|---|----------|---------|----------|--------| +| D1 | SherpaTTS not initialized | Model not loaded | Fallback to expo-speech | ⚠️ TODO | +| D2 | SherpaTTS crashes mid-playback | Speech interrupted | Handle error, reset state | ⚠️ TODO | +| D3 | Very long AI response | TTS plays for 2+ minutes | Show progress or split | ⚠️ TODO | +| D4 | TTS model not downloaded | First launch without network | Bundle model or pre-download | ⚠️ NOTE | +| D5 | Voice sounds bad | Model quality issue | Voice picker (Lessac/Ryan/Alba) | ⚠️ TODO | + +### E. UI Edge Cases + +| # | Scenario | Problem | Solution | Status | +|---|----------|---------|----------|--------| +| E1 | TextInput focused + tap mic | Keyboard in the way | Hide keyboard when listening | ⚠️ TODO | +| E2 | User typing + taps mic | What to do with typed text? | Keep or replace? | ⚠️ TODO | +| E3 | Scroll chat during TTS | Unclear which message is playing | Highlight speaking message | ⚠️ TODO | +| E4 | Multiple messages queued | Which one to speak? | Only latest AI message | ✅ DONE | +| E5 | AI responds in chunks (streaming) | When to start TTS? | After full response | ✅ DONE | + +### F. 
Permission Scenarios + +| # | Scenario | Action | Status | +|---|----------|--------|--------| +| F1 | First launch - no permission | Show custom UI → request | ⚠️ TODO | +| F2 | Permission denied before | Open Settings app | ⚠️ TODO | +| F3 | Permission "Ask Every Time" (iOS) | Request each time | ⚠️ TODO | +| F4 | Permission revoked during session | Graceful degradation | ⚠️ TODO | + +### Implementation Priority + +**🔴 Critical (voice won't work without these):** +- B1: Block mic during speaking ✅ DONE +- B2: Stop button ✅ DONE +- C1: Permission handling +- D1: TTS fallback + +**🟡 Important (UX suffers without these):** +- B3: Cancel recording without sending +- C3: Timeout on silence +- C4: "Didn't catch that" feedback +- E1: Hide keyboard +- E3: Visual indicator for speaking message + +**🟢 Nice to have:** +- B4-B5: Background behavior +- E5: Streaming TTS +- Voice picker UI + +--- + +## Related +- Main WellNuo voice.tsx (reference implementation) +- [expo-speech-recognition docs](https://docs.expo.dev/versions/latest/sdk/speech-recognition/) +- [sherpa-onnx](https://github.com/k2-fsa/sherpa-onnx) diff --git a/specs/voice-integration-flow.json b/specs/voice-integration-flow.json new file mode 100644 index 0000000..3d8d1c3 --- /dev/null +++ b/specs/voice-integration-flow.json @@ -0,0 +1,378 @@ +{ + "elements": [ + { + "id": "legend", + "type": "card", + "title": "LEGEND: Voice Integration", + "borderColor": "gray", + "tags": ["Reference"], + "description": "**Color Coding:**\n\n🔴 `red` = User Action (tap, speak)\n🔵 `blue` = App Logic / Screen\n🟣 `purple` = Native Module\n🟢 `green` = External Service (AI API)\n🟠 `orange` = Warning / Edge Case\n⚫ `gray` = Reference\n\n**States:**\n- `isListening` - Microphone active\n- `isSpeaking` - TTS playing\n- `ttsInitialized` - TTS ready\n- `recognizedText` - Speech transcript", + "x": 50, + "y": 50, + "connections": [] + }, + { + "id": "step-001", + "type": "card", + "title": "Chat Screen", + "borderColor": "blue", + "tags": ["Screen"], + "description": "**User sees:**\n- Message list\n- Input field\n- 🎤 Microphone button\n- Send button\n\n**Initial state:**\n```\nisListening: false\nisSpeaking: false\nttsInitialized: false\n```\n\n**On mount:** Initialize TTS", + "x": 100, + "y": 200, + "connections": [ + { "to": "step-002" }, + { "to": "step-010" } + ] + }, + { + "id": "step-002", + "type": "card", + "title": "App: Initialize TTS", + "borderColor": "blue", + "tags": ["App"], + "description": "**useEffect on mount:**\n```javascript\nconst initTTS = async () => {\n const success = await \n sherpaTTS.initialize();\n setTtsInitialized(success);\n};\ninitTTS();\n```\n\n**Cleanup on unmount:**\n```javascript\nsherpaTTS.deinitialize();\n```", + "x": 500, + "y": 200, + "connections": [ + { "to": "step-003" }, + { "to": "step-004" } + ] + }, + { + "id": "step-003", + "type": "card", + "title": "SherpaTTS: Load Model", + "borderColor": "purple", + "tags": ["Native"], + "description": "**Native Module: TTSManager**\n\n1. Load Piper ONNX model\n2. Load tokens.txt\n3. 
Initialize espeak-ng-data\n\n**Model paths (iOS):**\n```\nassets/tts-models/\n vits-piper-en_US-lessac-medium/\n en_US-lessac-medium.onnx\n tokens.txt\n espeak-ng-data/\n```", + "x": 900, + "y": 200, + "connections": [ + { "to": "step-005" }, + { "to": "err-001" } + ] + }, + { + "id": "step-004", + "type": "card", + "title": "Fallback: expo-speech", + "borderColor": "orange", + "tags": ["App", "Fallback"], + "description": "**When SherpaTTS unavailable:**\n- Expo Go mode (no native)\n- Model files missing\n- Device not supported\n\n**Fallback:**\n```javascript\nif (!sherpaTTS.isAvailable()) {\n ExpoSpeech.speak(text, {\n language: 'en-US',\n rate: 0.9\n });\n}\n```", + "x": 500, + "y": 400, + "connections": [] + }, + { + "id": "step-005", + "type": "card", + "title": "TTS Ready", + "borderColor": "blue", + "tags": ["App"], + "description": "**State updated:**\n```\nttsInitialized: true\n```\n\n**Available voices:**\n| ID | Name | Gender |\n|----|------|--------|\n| lessac | Lessac | Female US |\n| ryan | Ryan | Male US |\n| alba | Alba | Female UK |", + "x": 900, + "y": 400, + "connections": [] + }, + { + "id": "err-001", + "type": "card", + "title": "ERROR: TTS Init Failed", + "borderColor": "red", + "tags": ["Error"], + "description": "**When:**\n- Native module missing\n- Model files not found\n- Memory allocation failed\n\n**App state:**\n```\nttsInitialized: false\nerror: 'Native module not available'\n```\n\n**Fallback:** Use expo-speech", + "x": 1300, + "y": 200, + "connections": [ + { "to": "step-004" } + ] + }, + { + "id": "step-010", + "type": "card", + "title": "User: Tap 🎤 Button", + "borderColor": "red", + "tags": ["User"], + "description": "**User taps microphone button**\n\nButton appearance:\n- Default: Outline mic icon\n- Active: Filled mic, primary color\n- Disabled: Grayed out (0.5 opacity)\n\n**Triggers:** `handleVoiceToggle()`", + "x": 100, + "y": 600, + "connections": [ + { "to": "step-011" } + ] + }, + { + "id": "step-011", + "type": "card", + "title": "App: handleVoiceToggle()", + "borderColor": "blue", + "tags": ["App"], + "description": "**Decision logic:**\n```javascript\nif (isListening) {\n stopListening();\n handleVoiceSend();\n} else {\n startListening();\n}\n```\n\n**Check availability:**\n```javascript\nif (!speechRecognitionAvailable) {\n Alert.alert('Not Available');\n return;\n}\n```", + "x": 500, + "y": 600, + "connections": [ + { "to": "step-012" }, + { "to": "step-020" }, + { "to": "err-002" } + ] + }, + { + "id": "err-002", + "type": "card", + "title": "ERROR: No Mic Permission", + "borderColor": "red", + "tags": ["Error"], + "description": "**When:**\n- User denied microphone access\n- Permission not requested\n\n**App shows:**\n```\nAlert: 'Microphone Access Required'\n\n'Please enable microphone access\nin Settings to use voice input.'\n```\n\n**Resolution:** Open Settings", + "x": 500, + "y": 800, + "connections": [] + }, + { + "id": "step-012", + "type": "card", + "title": "App: Start Listening", + "borderColor": "blue", + "tags": ["App"], + "description": "**Actions:**\n1. Reset `recognizedText`\n2. Start pulse animation\n3. 
Call native speech recognition\n\n```javascript\nsetRecognizedText('');\nAnimated.loop(\n Animated.sequence([...])\n).start();\nawait startListening();\n```", + "x": 900, + "y": 600, + "connections": [ + { "to": "step-013" } + ] + }, + { + "id": "step-013", + "type": "card", + "title": "expo-speech-recognition", + "borderColor": "purple", + "tags": ["Native"], + "description": "**Native Module: ExpoSpeechRecognition**\n\n```javascript\nExpoSpeechRecognitionModule.start({\n lang: 'en-US',\n interimResults: true,\n maxAlternatives: 1,\n continuous: false\n});\n```\n\n**Events:**\n- `start` → setIsListening(true)\n- `result` → setRecognizedText()\n- `end` → setIsListening(false)\n- `error` → handle error", + "x": 1300, + "y": 600, + "connections": [ + { "to": "step-014" } + ] + }, + { + "id": "step-014", + "type": "card", + "title": "UI: Listening State", + "borderColor": "blue", + "tags": ["Screen"], + "description": "**Visual indicators:**\n\n1. **Mic button:**\n - Background: Primary color\n - Pulsing animation (scale 1.0 → 1.2)\n\n2. **Status bar:**\n ```\n 🔵 Listening...\n ```\n\n3. **Input field:**\n - Shows real-time transcript\n - Updates on each interim result", + "x": 1300, + "y": 800, + "connections": [ + { "to": "step-015" } + ] + }, + { + "id": "step-015", + "type": "card", + "title": "User: Speaking", + "borderColor": "red", + "tags": ["User"], + "description": "**User speaks into microphone**\n\n**Real-time transcript:**\n```\n\"Hello, how are you today?\"\n```\n\n**Interim results update:**\n- Partial words appear as spoken\n- Final result when silence detected\n\n**To stop:** Tap mic again OR stop speaking", + "x": 1300, + "y": 1000, + "connections": [ + { "to": "step-020" } + ] + }, + { + "id": "step-020", + "type": "card", + "title": "App: Stop & Send", + "borderColor": "blue", + "tags": ["App"], + "description": "**handleVoiceSend():**\n```javascript\nconst textToSend = \n recognizedText.trim();\n\nif (textToSend) {\n setInputText(textToSend);\n sendMessage(textToSend);\n setRecognizedText('');\n}\n```\n\n**Validation:**\n- Skip if empty transcript\n- Trim whitespace", + "x": 100, + "y": 1000, + "connections": [ + { "to": "step-021" }, + { "to": "err-003" } + ] + }, + { + "id": "err-003", + "type": "card", + "title": "WARNING: Empty Transcript", + "borderColor": "orange", + "tags": ["Warning"], + "description": "**When:**\n- User tapped mic but didn't speak\n- Background noise only\n- Recognition failed\n\n**Behavior:**\n- Don't send empty message\n- Return to idle state\n- No error shown to user", + "x": 100, + "y": 1200, + "connections": [] + }, + { + "id": "step-021", + "type": "card", + "title": "App: Send Message", + "borderColor": "blue", + "tags": ["App", "API"], + "description": "**Add user message to chat:**\n```javascript\nsetMessages(prev => [...prev, {\n role: 'user',\n content: textToSend\n}]);\n```\n\n**Call AI API:**\n```\nPOST /ai/stream\nBody: { messages, beneficiaryId }\n```", + "x": 500, + "y": 1000, + "connections": [ + { "to": "step-022" } + ] + }, + { + "id": "step-022", + "type": "card", + "title": "AI Backend: Process", + "borderColor": "green", + "tags": ["External", "API"], + "description": "**Server processes request:**\n\n1. Validate JWT token\n2. Get beneficiary context\n3. Call OpenAI/OpenRouter API\n4. Stream response chunks\n\n**Response:**\n```\ndata: {\"delta\":\"Hello\"}\ndata: {\"delta\":\"! 
How\"}\ndata: {\"delta\":\" can I\"}\ndata: {\"delta\":\" help?\"}\n[DONE]\n```", + "x": 900, + "y": 1000, + "connections": [ + { "to": "step-023" }, + { "to": "err-004" } + ] + }, + { + "id": "err-004", + "type": "card", + "title": "ERROR: AI API Failed", + "borderColor": "red", + "tags": ["Error"], + "description": "**When:**\n- Network error\n- API rate limit\n- Invalid token\n- Server error (500)\n\n**App shows:**\n```\n\"Sorry, I couldn't process your \nrequest. Please try again.\"\n```\n\n**TTS:** Speaks error message", + "x": 900, + "y": 1200, + "connections": [] + }, + { + "id": "step-023", + "type": "card", + "title": "App: Receive AI Response", + "borderColor": "blue", + "tags": ["App"], + "description": "**Stream handling:**\n```javascript\nfor await (const chunk of stream) {\n setMessages(prev => {\n // Append chunk to last message\n const updated = [...prev];\n updated[updated.length-1]\n .content += chunk;\n return updated;\n });\n}\n```\n\n**On complete:** Trigger TTS", + "x": 1300, + "y": 1000, + "connections": [ + { "to": "step-030" } + ] + }, + { + "id": "step-030", + "type": "card", + "title": "App: speakText(response)", + "borderColor": "blue", + "tags": ["App"], + "description": "**Auto-speak AI response:**\n```javascript\nconst speakText = async (text) => {\n if (!ttsInitialized) {\n // Fallback to expo-speech\n ExpoSpeech.speak(text);\n return;\n }\n \n setIsSpeaking(true);\n await sherpaTTS.speak(text, {\n speed: 1.0,\n onDone: () => setIsSpeaking(false)\n });\n};\n```", + "x": 100, + "y": 1400, + "connections": [ + { "to": "step-031" } + ] + }, + { + "id": "step-031", + "type": "card", + "title": "SherpaTTS: Generate Audio", + "borderColor": "purple", + "tags": ["Native"], + "description": "**Native TTS processing:**\n\n1. Text → phonemes (espeak-ng)\n2. Phonemes → audio (Piper VITS)\n3. Audio → device speaker\n\n**Parameters:**\n```javascript\nTTSManager.generateAndPlay(\n text,\n speakerId: 0,\n speed: 1.0\n);\n```\n\n**Model:** ~20MB neural network", + "x": 500, + "y": 1400, + "connections": [ + { "to": "step-032" } + ] + }, + { + "id": "step-032", + "type": "card", + "title": "UI: Speaking State", + "borderColor": "blue", + "tags": ["Screen"], + "description": "**Visual indicators:**\n\n1. **Status bar:**\n ```\n 🟢 Speaking... [⏹ Stop]\n ```\n\n2. **Stop button:**\n - Red stop circle icon\n - Tapping interrupts speech\n\n3. 
**Mic button:**\n - Disabled while speaking\n - Prevents overlap", + "x": 900, + "y": 1400, + "connections": [ + { "to": "step-033" }, + { "to": "step-040" } + ] + }, + { + "id": "step-033", + "type": "card", + "title": "TTS: Playback Complete", + "borderColor": "blue", + "tags": ["App"], + "description": "**On done callback:**\n```javascript\nonDone: () => {\n setIsSpeaking(false);\n}\n```\n\n**State reset:**\n```\nisSpeaking: false\n```\n\n**User can:**\n- Start new voice input\n- Type manually\n- Scroll chat history", + "x": 1300, + "y": 1400, + "connections": [] + }, + { + "id": "step-040", + "type": "card", + "title": "User: Tap Stop", + "borderColor": "red", + "tags": ["User"], + "description": "**User interrupts speech:**\n\nTaps stop button (⏹) to cancel TTS playback immediately.\n\n**Use cases:**\n- Response too long\n- User wants to ask follow-up\n- Wrong response", + "x": 900, + "y": 1600, + "connections": [ + { "to": "step-041" } + ] + }, + { + "id": "step-041", + "type": "card", + "title": "App: stopSpeaking()", + "borderColor": "blue", + "tags": ["App"], + "description": "**Stop playback:**\n```javascript\nconst stopSpeaking = () => {\n if (ttsInitialized) {\n sherpaTTS.stop();\n } else {\n ExpoSpeech.stop();\n }\n setIsSpeaking(false);\n};\n```\n\n**Immediate effect:**\n- Audio stops\n- UI returns to idle", + "x": 1300, + "y": 1600, + "connections": [] + }, + { + "id": "state-machine", + "type": "card", + "title": "STATE MACHINE: Voice", + "borderColor": "gray", + "tags": ["Reference"], + "description": "```\n ┌─────────────┐\n │ IDLE │\n │ isListening:│\n │ false │\n │ isSpeaking: │\n │ false │\n └──────┬──────┘\n │ tap mic\n ┌──────▼──────┐\n │ LISTENING │\n │ isListening:│\n │ true │\n │ (pulsing) │\n └──────┬──────┘\n │ stop/send\n ┌──────▼──────┐\n │ PROCESSING │\n │ isSending: │\n │ true │\n └──────┬──────┘\n │ AI responds\n ┌──────▼──────┐\n │ SPEAKING │\n │ isSpeaking: │\n │ true │\n └──────┬──────┘\n │ done/stop\n ┌──────▼──────┐\n │ IDLE │\n └─────────────┘\n```", + "x": 50, + "y": 1800, + "connections": [] + }, + { + "id": "files-ref", + "type": "card", + "title": "FILES: Voice Integration", + "borderColor": "gray", + "tags": ["Reference"], + "description": "**Modified files:**\n\n📄 `package.json`\n- expo-speech\n- expo-speech-recognition\n- react-native-sherpa-onnx-offline-tts\n\n📄 `services/sherpaTTS.ts`\n- Initialize, speak, stop\n- Voice selection\n- Native bridge\n\n📄 `hooks/useSpeechRecognition.ts`\n- Start/stop listening\n- Event handlers\n- Permission request\n\n📄 `app/(tabs)/chat.tsx`\n- Voice states\n- UI integration\n- Handlers", + "x": 500, + "y": 1800, + "connections": [] + }, + { + "id": "voices-ref", + "type": "card", + "title": "VOICES: Piper Models", + "borderColor": "gray", + "tags": ["Reference"], + "description": "**Available neural voices:**\n\n| Voice | Gender | Accent | Quality |\n|-------|--------|--------|--------|\n| Lessac | Female | US | Natural |\n| Ryan | Male | US | Natural |\n| Alba | Female | UK | Clear |\n\n**Model size:** ~20MB each\n\n**Audio:** 22kHz mono\n\n**Location:**\n```\nassets/tts-models/\n vits-piper-en_US-lessac-medium/\n vits-piper-en_US-ryan-medium/\n vits-piper-en_GB-alba-medium/\n```", + "x": 900, + "y": 1800, + "connections": [] + }, + { + "id": "build-ref", + "type": "card", + "title": "BUILD REQUIREMENTS", + "borderColor": "orange", + "tags": ["Reference"], + "description": "**Native build required!**\n\n⚠️ Will NOT work in Expo Go\n\n**Steps:**\n1. `npm install`\n2. `npx expo prebuild --clean`\n3. 
`npx expo run:ios`\n4. Test on simulator/device\n\n**iOS:** Native modules bridged\n**Android:** JNI/Kotlin bindings\n\n**Permissions:**\n- iOS: `NSMicrophoneUsageDescription`\n- Android: `RECORD_AUDIO`", + "x": 1300, + "y": 1800, + "connections": [] + } + ] +}
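Appendix note: `services/sherpaTTS.ts` and `hooks/useSpeechRecognition.ts` are referenced throughout this diff but are not part of it. For orientation, here is a minimal sketch of the TTS service surface described in the spec's architecture diagram. The method names (initialize, speak, stop, isAvailable) come from the spec; the `TtsEngine` interface and the module's default-export shape are assumptions standing in for the real react-native-sherpa-onnx-offline-tts bridge, and `setVoice` is omitted:

```ts
// services/sherpaTTS.ts (sketch; not the shipped implementation)
import * as ExpoSpeech from 'expo-speech';

// Assumed shape of the native bridge; the real module's names may differ.
interface TtsEngine {
  init(modelDir: string): Promise<boolean>;
  generateAndPlay(text: string, speakerId: number, speed: number): Promise<void>;
  stop(): void;
}

export interface SpeakOptions {
  speed?: number;
  onDone?: () => void;
}

class SherpaTTS {
  private engine: TtsEngine | null = null;

  // Load the bundled Lessac model; resolves false so callers can fall back (err-001).
  async initialize(): Promise<boolean> {
    try {
      // Hypothetical default export; verify against the installed package.
      const engine: TtsEngine = require('react-native-sherpa-onnx-offline-tts').default;
      const ok = await engine.init('assets/tts-models/vits-piper-en_US-lessac-medium');
      this.engine = ok ? engine : null;
      return ok;
    } catch {
      this.engine = null; // native module missing (e.g., Expo Go)
      return false;
    }
  }

  isAvailable(): boolean {
    return this.engine !== null;
  }

  // Speak with the neural engine, or degrade to expo-speech (scenario D1, step-004).
  async speak(text: string, opts: SpeakOptions = {}): Promise<void> {
    if (!this.engine) {
      ExpoSpeech.speak(text, { language: 'en-US', rate: 0.9, onDone: opts.onDone });
      return;
    }
    await this.engine.generateAndPlay(text, 0, opts.speed ?? 1.0); // speakerId 0: single-speaker model
    opts.onDone?.();
  }

  stop(): void {
    if (this.engine) {
      this.engine.stop();
    } else {
      ExpoSpeech.stop();
    }
  }
}

export default new SherpaTTS();
```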
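A matching sketch of the speech-recognition hook, built around the `ExpoSpeechRecognitionModule.start()` options and the start/result/end events quoted in step-013; `useSpeechRecognitionEvent` and `requestPermissionsAsync` are the package's documented helpers, though exact signatures should be verified against the installed version:

```ts
// hooks/useSpeechRecognition.ts (sketch)
import { useState } from 'react';
import {
  ExpoSpeechRecognitionModule,
  useSpeechRecognitionEvent,
} from 'expo-speech-recognition';

export function useSpeechRecognition() {
  const [isListening, setIsListening] = useState(false);
  const [recognizedText, setRecognizedText] = useState('');

  // Events mirror step-013: start / result / end (error handling elided here).
  useSpeechRecognitionEvent('start', () => setIsListening(true));
  useSpeechRecognitionEvent('end', () => setIsListening(false));
  useSpeechRecognitionEvent('result', (event) => {
    // Interim results stream in while the user speaks (step-014/015).
    setRecognizedText(event.results[0]?.transcript ?? '');
  });

  const startListening = async () => {
    const { granted } = await ExpoSpeechRecognitionModule.requestPermissionsAsync();
    if (!granted) return false; // C1: caller shows the "Microphone Access Required" alert
    ExpoSpeechRecognitionModule.start({
      lang: 'en-US',
      interimResults: true,
      maxAlternatives: 1,
      continuous: false,
    });
    return true;
  };

  const stopListening = () => ExpoSpeechRecognitionModule.stop();

  return { isListening, recognizedText, startListening, stopListening };
}
```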
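Finally, the pulse animation that step-012 elides (`Animated.sequence([...])`): a plausible completion of the 1.0 → 1.2 scale loop described in step-014. The 500 ms durations are assumptions, not values from the implementation:

```ts
import { Animated } from 'react-native';

// pulseAnim is the Animated.Value(1) held in chat.tsx state.
// Loops the mic button scale 1.0 → 1.2 → 1.0 while listening (step-014).
export function startPulse(pulseAnim: Animated.Value): Animated.CompositeAnimation {
  const loop = Animated.loop(
    Animated.sequence([
      Animated.timing(pulseAnim, { toValue: 1.2, duration: 500, useNativeDriver: true }),
      Animated.timing(pulseAnim, { toValue: 1.0, duration: 500, useNativeDriver: true }),
    ]),
  );
  loop.start();
  return loop; // caller keeps the handle and calls loop.stop() when listening ends
}
```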