基于 ESP32-S3 的四博AI双目智能音箱方案:0.71/1.28双目光屏、四路触控、三轴姿态、震动马达、语音克隆与专属知识库接入
基于 ESP32-S3 的四博AI双目智能音箱方案:0.71/1.28双目光屏、四路触控、三轴姿态、震动马达、语音克隆与专属知识库接入

1. 方案概述

四博AI双目智能音箱方案是一套基于ESP32-S3 / ESPS3-32 / ESPS3-32E的多模态AI硬件平台。它不是普通智能音箱,而是把AI语音交互、双目表情显示、四路触摸感应、三轴姿态检测、震动反馈、语音克隆、专属知识库、小程序配置、Wi-Fi/4G/BLE联网集成在一起的AI陪伴终端。

该方案适合落地到以下产品:

- AI双目智能音箱
- AI桌面宠物
- 儿童AI陪伴机器人
- AI语音机器人
- AI智能玩具
- AI情绪陪伴终端
- IP互动摆件
- AI学习机外设
- AI小夜灯升级版

产品核心定位可以总结为:

- 会说话:支持AI大模型、TTS、语音克隆
- 会表达:支持0.71/1.28双目光屏同显/异显
- 会互动:支持四路触控、三轴姿态、震动马达
- 会扩展:支持客户小程序、客户后端、专属知识库
- 会联网:支持Wi-Fi、4G、BLE配网

2. 硬件系统架构

推荐硬件架构如下:

```text
+------------------------------------------------------+
|                四博AI双目智能音箱系统                |
+------------------------------------------------------+
| 主控层                                               |
| └── ESP32-S3 / ESPS3-32 / ESPS3-32E                  |
|                                                      |
| 显示层                                               |
| ├── 左眼 0.71 / 1.28 光屏                            |
| ├── 右眼 0.71 / 1.28 光屏                            |
| ├── 双目同显                                         |
| ├── 双目异显                                         |
| └── 图片 / 动图 / MJPEG / QOI素材播放                |
|                                                      |
| 交互层                                               |
| ├── TOUCH0:头部触摸                                 |
| ├── TOUCH1:左侧触摸                                 |
| ├── TOUCH2:右侧触摸                                 |
| ├── TOUCH3:下巴触摸 / AI对话                        |
| ├── 三轴G-sensor:拿起 / 放下 / 摇晃 / 倾斜          |
| └── 震动马达:触控反馈 / AI状态反馈 / 闹钟反馈       |
|                                                      |
| 音频层                                               |
| ├── MIC:语音采集                                    |
| ├── Speaker:本地播放                                |
| ├── TTS语音播放                                      |
| ├── 声音克隆音色播放                                 |
| └── 可扩展TWS耳机 / 蓝牙音箱                         |
|                                                      |
| 网络层                                               |
| ├── Wi-Fi:连接AI服务、客户后端、知识库服务          |
| ├── 4G:移动联网                                     |
| └── BLE:小程序配网、设备绑定、近场控制              |
+------------------------------------------------------+
```

3. 推荐工程目录

```text
sibo_ai_dual_eye_speaker/
├── CMakeLists.txt
├── sdkconfig.defaults
├── partitions.csv
└── main/
    ├── app_main.c
    ├── app_config.h
    ├── board_pins.h
    ├── system_state.c
    ├── system_state.h
    ├── eye_display.c
    ├── eye_display.h
    ├── asset_manager.c
    ├── asset_manager.h
    ├── touch_manager.c
    ├── touch_manager.h
    ├── gsensor_manager.c
    ├── gsensor_manager.h
    ├── motor_manager.c
    ├── motor_manager.h
    ├── audio_manager.c
    ├── audio_manager.h
    ├── network_manager.c
    ├── network_manager.h
    ├── ai_client.c
    ├── ai_client.h
    ├── voice_clone.c
    ├── voice_clone.h
    ├── kb_client.c
    ├── kb_client.h
    ├── app_protocol.c
    └── app_protocol.h
```

4. 
分区表设计

双目智能音箱需要存储表情素材、动画素材、语音提示音、客户上传资源、配置文件和OTA固件,因此建议使用双OTA + assets资源分区 + config分区。

```csv
# Name,   Type, SubType, Offset,  Size,     Flags
nvs,      data, nvs,     0x9000,  0x4000,
otadata,  data, ota,     0xd000,  0x2000,
phy_init, data, phy,     0xf000,  0x1000,
ota_0,    app,  ota_0,   0x20000, 0x400000,
ota_1,    app,  ota_1,   ,        0x400000,
assets,   data, fat,     ,        0x600000,
config,   data, nvs,     ,        0x10000,
```

资源目录建议:

```text
/assets/
├── eyes/
│   ├── idle_left.qoi
│   ├── idle_right.qoi
│   ├── happy_left.qoi
│   ├── happy_right.qoi
│   ├── blink_left.qoi
│   ├── blink_right.qoi
│   ├── thinking_left.qoi
│   ├── thinking_right.qoi
│   ├── speaking_left.qoi
│   └── speaking_right.qoi
├── video/
│   ├── boot.mjpeg
│   ├── thinking.mjpeg
│   └── speaking.mjpeg
├── audio/
│   ├── wake.wav
│   ├── hello.wav
│   ├── error.wav
│   └── cloned_voice_demo.opus
└── manifest.json
```

5. app_config.h

```c
#pragma once

#define DEVICE_MODEL            "SIBO_AI_DUAL_EYE_SPEAKER"
#define FW_VERSION              "1.0.0"

#define DEFAULT_VOLUME          55
#define DEFAULT_EYE_FPS         15

#define ENABLE_WIFI             1
#define ENABLE_4G               1
#define ENABLE_BLE_CONFIG       1
#define ENABLE_DUAL_EYE         1
#define ENABLE_TOUCH            1
#define ENABLE_GSENSOR          1
#define ENABLE_MOTOR            1
#define ENABLE_VOICE_CLONE      1
#define ENABLE_KNOWLEDGE_BASE   1

#define AI_WS_URL               "wss://your-ai-server.com/device/ws"
#define KB_HTTP_URL             "https://your-ai-server.com/kb/query"
#define ASSET_SERVER_URL        "https://your-cdn.com/assets"

#define DEVICE_ID_PREFIX        "sibo_dual_eye_"
```

6. 
board_pins.h

实际量产时要按PCB重新分配,下面是工程模板:

```c
#pragma once

/* 双目屏 SPI */
#define PIN_LCD_SPI_MOSI    11
#define PIN_LCD_SPI_CLK     12
#define PIN_LCD_LEFT_CS     13
#define PIN_LCD_RIGHT_CS    14
#define PIN_LCD_DC          15
#define PIN_LCD_RST         16
#define PIN_LCD_BL          17

/* I2S 音频 */
#define PIN_I2S_BCLK        4
#define PIN_I2S_WS          5
#define PIN_I2S_DOUT        6
#define PIN_I2S_DIN         7
#define PIN_AMP_EN          8

/* 四路触摸 */
#define PIN_TOUCH_HEAD      1
#define PIN_TOUCH_LEFT      2
#define PIN_TOUCH_RIGHT     3
#define PIN_TOUCH_CHIN      9

/* G-sensor I2C */
#define PIN_I2C_SDA         41
#define PIN_I2C_SCL         42

/* 震动马达 */
#define PIN_MOTOR           10

/* 4G模组 */
#define PIN_4G_TX           43
#define PIN_4G_RX           44
#define PIN_4G_PWRKEY       45
```

7. 双目显示状态机

双目屏幕建议抽象为四种模式:

1. 双目同显:左右眼显示同一种表情
2. 双目异显:左右眼显示不同表情
3. 视频模式:播放MJPEG/QOI帧动画
4. 客户素材模式:播放小程序上传的图片/动图/视频

eye_display.h

```c
#pragma once

#include <stdbool.h>

typedef enum {
    EYE_MODE_SYNC = 0,
    EYE_MODE_ASYNC,
    EYE_MODE_VIDEO,
    EYE_MODE_CUSTOM
} eye_mode_t;

typedef enum {
    EYE_EXPR_IDLE = 0,
    EYE_EXPR_WAKEUP,
    EYE_EXPR_HAPPY,
    EYE_EXPR_BLINK,
    EYE_EXPR_THINKING,
    EYE_EXPR_SPEAKING,
    EYE_EXPR_SLEEP,
    EYE_EXPR_ERROR,
    EYE_EXPR_CUSTOM
} eye_expr_t;

typedef struct {
    eye_mode_t mode;
    eye_expr_t left_expr;
    eye_expr_t right_expr;
    char left_asset[128];
    char right_asset[128];
    int fps;
    bool loop;
} eye_display_ctx_t;

void eye_display_init(void);
void eye_display_show_sync(eye_expr_t expr);
void eye_display_show_async(eye_expr_t left, eye_expr_t right);
void eye_display_play_assets(const char *left_path, const char *right_path, int fps, bool loop);
void eye_display_stop(void);
```

eye_display.c

```c
#include <stdio.h>
#include <string.h>
#include "eye_display.h"

static eye_display_ctx_t s_eye = {
    .mode = EYE_MODE_SYNC,
    .left_expr = EYE_EXPR_IDLE,
    .right_expr = EYE_EXPR_IDLE,
    .fps = 15,
    .loop = true,
};

static const char *expr_to_asset(eye_expr_t expr, bool left)
{
    switch (expr) {
    case EYE_EXPR_WAKEUP:
        return left ? "/assets/eyes/wakeup_left.qoi" : "/assets/eyes/wakeup_right.qoi";
    case EYE_EXPR_HAPPY:
        return left ? "/assets/eyes/happy_left.qoi" : "/assets/eyes/happy_right.qoi";
    case EYE_EXPR_BLINK:
        return left ? "/assets/eyes/blink_left.qoi" : "/assets/eyes/blink_right.qoi";
    case EYE_EXPR_THINKING:
        return left ? "/assets/eyes/thinking_left.qoi" : "/assets/eyes/thinking_right.qoi";
    case EYE_EXPR_SPEAKING:
        return left ? "/assets/eyes/speaking_left.qoi" : "/assets/eyes/speaking_right.qoi";
    case EYE_EXPR_SLEEP:
        return left ? "/assets/eyes/sleep_left.qoi" : "/assets/eyes/sleep_right.qoi";
    case EYE_EXPR_ERROR:
        return left ? "/assets/eyes/error_left.qoi" : "/assets/eyes/error_right.qoi";
    case EYE_EXPR_IDLE:
    default:
        return left ? "/assets/eyes/idle_left.qoi" : "/assets/eyes/idle_right.qoi";
    }
}

void eye_display_init(void)
{
    printf("[eye] init dual lcd\n");

    /*
     * lcd_bus_init();
     * lcd_left_panel_init();
     * lcd_right_panel_init();
     * lvgl_port_init();
     */
}

void eye_display_show_sync(eye_expr_t expr)
{
    s_eye.mode = EYE_MODE_SYNC;
    s_eye.left_expr = expr;
    s_eye.right_expr = expr;

    const char *left = expr_to_asset(expr, true);
    const char *right = expr_to_asset(expr, false);

    printf("[eye] sync expr=%d L=%s R=%s\n", expr, left, right);

    /*
     * image_decode_and_draw_left(left);
     * image_decode_and_draw_right(right);
     */
}

void eye_display_show_async(eye_expr_t left, eye_expr_t right)
{
    s_eye.mode = EYE_MODE_ASYNC;
    s_eye.left_expr = left;
    s_eye.right_expr = right;

    const char *left_asset = expr_to_asset(left, true);
    const char *right_asset = expr_to_asset(right, false);

    printf("[eye] async L=%s R=%s\n", left_asset, right_asset);

    /*
     * image_decode_and_draw_left(left_asset);
     * image_decode_and_draw_right(right_asset);
     */
}

void eye_display_play_assets(const char *left_path, const char *right_path, int fps, bool loop)
{
    if (!left_path || !right_path) {
        return;
    }

    s_eye.mode = EYE_MODE_CUSTOM;
    s_eye.fps = fps;
    s_eye.loop = loop;

    strncpy(s_eye.left_asset, left_path, sizeof(s_eye.left_asset) - 1);
    strncpy(s_eye.right_asset, right_path, sizeof(s_eye.right_asset) - 1);

    printf("[eye] custom L=%s R=%s fps=%d loop=%d\n",
           s_eye.left_asset, s_eye.right_asset, s_eye.fps, s_eye.loop);

    /*
     * anim_player_start_left(s_eye.left_asset, fps, loop);
     * anim_player_start_right(s_eye.right_asset, fps, loop);
     */
}

void eye_display_stop(void)
{
    printf("[eye] stop display animation\n");
}
```

8. 四路触摸交互

四路触摸建议映射为:

- TOUCH_HEAD:摸头,开心反馈
- TOUCH_LEFT:左侧触摸,左眼眨眼
- TOUCH_RIGHT:右侧触摸,右眼眨眼
- TOUCH_CHIN:下巴触摸,进入AI对话

```c
typedef enum {
    TOUCH_EVT_HEAD = 0,
    TOUCH_EVT_LEFT,
    TOUCH_EVT_RIGHT,
    TOUCH_EVT_CHIN
} touch_event_t;

void motor_vibrate_once(int ms);
void ai_client_start_listen(void);

void touch_manager_handle_event(touch_event_t evt)
{
    switch (evt) {
    case TOUCH_EVT_HEAD:
        printf("[touch] head touched\n");
        eye_display_show_sync(EYE_EXPR_HAPPY);
        motor_vibrate_once(80);
        break;

    case TOUCH_EVT_LEFT:
        printf("[touch] left touched\n");
        eye_display_show_async(EYE_EXPR_BLINK, EYE_EXPR_IDLE);
        break;

    case TOUCH_EVT_RIGHT:
        printf("[touch] right touched\n");
        eye_display_show_async(EYE_EXPR_IDLE, EYE_EXPR_BLINK);
        break;

    case TOUCH_EVT_CHIN:
        printf("[touch] chin touched, start AI chat\n");
        eye_display_show_sync(EYE_EXPR_THINKING);
        motor_vibrate_once(50);
        ai_client_start_listen();
        break;

    default:
        break;
    }
}
```

9. 三轴姿态传感器交互

三轴传感器可以识别拿起、放下、摇晃、左倾、右倾等动作,让设备交互更自然。

```c
typedef enum {
    GSENSOR_EVT_NONE = 0,
    GSENSOR_EVT_SHAKE,
    GSENSOR_EVT_PICK_UP,
    GSENSOR_EVT_PUT_DOWN,
    GSENSOR_EVT_TILT_LEFT,
    GSENSOR_EVT_TILT_RIGHT
} gsensor_event_t;

void gsensor_handle_event(gsensor_event_t evt)
{
    switch (evt) {
    case GSENSOR_EVT_SHAKE:
        printf("[gsensor] shake\n");
        eye_display_show_sync(EYE_EXPR_HAPPY);
        motor_vibrate_once(100);
        break;

    case GSENSOR_EVT_PICK_UP:
        printf("[gsensor] pick up\n");
        eye_display_show_sync(EYE_EXPR_WAKEUP);
        motor_vibrate_once(60);
        break;

    case GSENSOR_EVT_PUT_DOWN:
        printf("[gsensor] put down\n");
        eye_display_show_sync(EYE_EXPR_SLEEP);
        break;

    case GSENSOR_EVT_TILT_LEFT:
        printf("[gsensor] tilt left\n");
        eye_display_show_async(EYE_EXPR_BLINK, EYE_EXPR_IDLE);
        break;

    case GSENSOR_EVT_TILT_RIGHT:
        printf("[gsensor] tilt right\n");
        eye_display_show_async(EYE_EXPR_IDLE, EYE_EXPR_BLINK);
        break;

    default:
        break;
    }
}
```

10. 
震动马达驱动

```c
#include "driver/gpio.h"
#include "freertos/FreeRTOS.h"
#include "freertos/task.h"
#include "board_pins.h"

void motor_init(void)
{
    gpio_config_t io = {
        .pin_bit_mask = 1ULL << PIN_MOTOR,
        .mode = GPIO_MODE_OUTPUT,
        .pull_up_en = GPIO_PULLUP_DISABLE,
        .pull_down_en = GPIO_PULLDOWN_DISABLE,
        .intr_type = GPIO_INTR_DISABLE,
    };

    gpio_config(&io);
    gpio_set_level(PIN_MOTOR, 0);
}

void motor_vibrate_once(int ms)
{
    gpio_set_level(PIN_MOTOR, 1);
    vTaskDelay(pdMS_TO_TICKS(ms));
    gpio_set_level(PIN_MOTOR, 0);
}

void motor_vibrate_pattern(void)
{
    motor_vibrate_once(60);
    vTaskDelay(pdMS_TO_TICKS(80));
    motor_vibrate_once(60);
}
```

11. 语音克隆与专属知识库协议设计

小程序端可以完成两个配置:

1. 语音克隆:选择用户自己的克隆音色 voice_id
2. 专属知识库:绑定知识库 kb_id

小程序下发配置:

```json
{
  "cmd": "ai_profile_set",
  "data": {
    "agent_id": "sibo_pet_001",
    "voice_id": "clone_voice_zhangsan",
    "kb_id": "kb_customer_product_manual",
    "role_prompt": "你是一个可爱的双目AI陪伴音箱,回答要简洁、温柔、有陪伴感。",
    "tts_format": "opus",
    "asr_language": "zh-CN"
  }
}
```

设备端结构体:

```c
typedef struct {
    char agent_id[64];
    char voice_id[64];
    char kb_id[64];
    char role_prompt[256];
    char tts_format[16];
    char asr_language[16];
} ai_profile_t;

static ai_profile_t s_ai_profile = {
    .agent_id = "sibo_pet_default",
    .voice_id = "default_soft_voice",
    .kb_id = "default_kb",
    .role_prompt = "你是四博AI双目智能音箱。",
    .tts_format = "opus",
    .asr_language = "zh-CN"
};
```

配置解析代码:

```c
#include <stdio.h>
#include <string.h>
#include "cJSON.h"

void ai_profile_handle_json(const char *json)
{
    cJSON *root = cJSON_Parse(json);
    if (!root) {
        printf("[ai-profile] parse failed\n");
        return;
    }

    cJSON *data = cJSON_GetObjectItem(root, "data");
    if (!data) {
        cJSON_Delete(root);
        return;
    }

    cJSON *agent_id = cJSON_GetObjectItem(data, "agent_id");
    cJSON *voice_id = cJSON_GetObjectItem(data, "voice_id");
    cJSON *kb_id = cJSON_GetObjectItem(data, "kb_id");
    cJSON *role_prompt = cJSON_GetObjectItem(data, "role_prompt");

    if (agent_id && agent_id->valuestring) {
        strncpy(s_ai_profile.agent_id, agent_id->valuestring, sizeof(s_ai_profile.agent_id) - 1);
    }

    if (voice_id && voice_id->valuestring) {
        strncpy(s_ai_profile.voice_id, voice_id->valuestring, sizeof(s_ai_profile.voice_id) - 1);
    }

    if (kb_id && kb_id->valuestring) {
        strncpy(s_ai_profile.kb_id, kb_id->valuestring, sizeof(s_ai_profile.kb_id) - 1);
    }

    if (role_prompt && role_prompt->valuestring) {
        strncpy(s_ai_profile.role_prompt, role_prompt->valuestring, sizeof(s_ai_profile.role_prompt) - 1);
    }

    printf("[ai-profile] agent=%s voice=%s kb=%s\n",
           s_ai_profile.agent_id, s_ai_profile.voice_id, s_ai_profile.kb_id);

    cJSON_Delete(root);
}
```

12. 专属知识库查询流程

AI对话前可以先查询客户知识库,再把结果作为上下文发送给大模型。

```text
用户语音
  ↓
ASR转文字
  ↓
查询专属知识库 kb_id
  ↓
拼接:知识库结果 + 用户问题 + 角色设定
  ↓
发送给AI大模型
  ↓
使用 voice_id 生成克隆语音
  ↓
播放TTS,同时驱动双目说话表情
```

知识库查询请求:

```json
{
  "device_id": "sibo_dual_eye_aabbcc",
  "kb_id": "kb_customer_product_manual",
  "query": "这个设备怎么切换蓝牙音箱",
  "top_k": 3
}
```

C端封装:

```c
void kb_client_query(const char *question)
{
    char json[512];

    snprintf(json, sizeof(json),
             "{"
             "\"device_id\":\"%s\","
             "\"kb_id\":\"%s\","
             "\"query\":\"%s\","
             "\"top_k\":3"
             "}",
             "sibo_dual_eye_aabbcc",
             s_ai_profile.kb_id,
             question
    );

    printf("[kb] query: %s\n", json);

    /*
     * http_post(KB_HTTP_URL, json);
     */
}
```

13. AI对话请求协议

```json
{
  "type": "ai_chat",
  "device_id": "sibo_dual_eye_aabbcc",
  "agent_id": "sibo_pet_001",
  "voice_id": "clone_voice_zhangsan",
  "kb_id": "kb_customer_product_manual",
  "scene": "dual_eye_speaker",
  "text": "你能介绍一下自己吗",
  "context": {
    "touch": "head",
    "motion": "pick_up",
    "eye_state": "happy"
  }
}
```

AI请求代码:

```c
void ai_client_send_text(const char *text)
{
    char json[1024];

    snprintf(json, sizeof(json),
             "{"
             "\"type\":\"ai_chat\","
             "\"device_id\":\"%s\","
             "\"agent_id\":\"%s\","
             "\"voice_id\":\"%s\","
             "\"kb_id\":\"%s\","
             "\"scene\":\"dual_eye_speaker\","
             "\"text\":\"%s\","
             "\"context\":{"
             "\"eye_state\":\"happy\""
             "}"
             "}",
             "sibo_dual_eye_aabbcc",
             s_ai_profile.agent_id,
             s_ai_profile.voice_id,
             s_ai_profile.kb_id,
             text
    );

    printf("[ai] send: %s\n", json);

    eye_display_show_sync(EYE_EXPR_THINKING);

    /*
     * websocket_send_text(json);
     */
}
```

14. 
AI状态与表情联动

```c
typedef enum {
    AI_STATE_IDLE = 0,
    AI_STATE_WAKEUP,
    AI_STATE_LISTENING,
    AI_STATE_THINKING,
    AI_STATE_SPEAKING,
    AI_STATE_ERROR
} ai_state_t;

void ai_state_update(ai_state_t state)
{
    switch (state) {
    case AI_STATE_IDLE:
        eye_display_show_sync(EYE_EXPR_IDLE);
        break;

    case AI_STATE_WAKEUP:
        eye_display_show_sync(EYE_EXPR_WAKEUP);
        motor_vibrate_once(60);
        break;

    case AI_STATE_LISTENING:
        eye_display_show_sync(EYE_EXPR_THINKING);
        break;

    case AI_STATE_THINKING:
        eye_display_play_assets(
            "/assets/eyes/thinking_left.qoi",
            "/assets/eyes/thinking_right.qoi",
            15,
            true
        );
        break;

    case AI_STATE_SPEAKING:
        eye_display_show_sync(EYE_EXPR_SPEAKING);
        break;

    case AI_STATE_ERROR:
        eye_display_show_sync(EYE_EXPR_ERROR);
        motor_vibrate_pattern();
        break;

    default:
        break;
    }
}
```

15. 小程序统一控制协议

适配“四博小助手小程序”或客户自己的小程序时,可以采用统一 JSON 控制协议。

```json
{
  "cmd": "device_control",
  "data": {
    "eye": {
      "mode": "async",
      "left": "happy",
      "right": "blink"
    },
    "audio": {
      "volume": 65,
      "output": "speaker"
    },
    "ai": {
      "voice_id": "clone_voice_zhangsan",
      "kb_id": "kb_customer_product_manual"
    },
    "vibration": {
      "enable": true,
      "duration_ms": 80
    }
  }
}
```

统一解析代码:

```c
static eye_expr_t parse_eye_expr(const char *name)
{
    if (!name) return EYE_EXPR_IDLE;

    if (strcmp(name, "happy") == 0) return EYE_EXPR_HAPPY;
    if (strcmp(name, "blink") == 0) return EYE_EXPR_BLINK;
    if (strcmp(name, "thinking") == 0) return EYE_EXPR_THINKING;
    if (strcmp(name, "speaking") == 0) return EYE_EXPR_SPEAKING;
    if (strcmp(name, "sleep") == 0) return EYE_EXPR_SLEEP;
    if (strcmp(name, "error") == 0) return EYE_EXPR_ERROR;

    return EYE_EXPR_IDLE;
}

void app_protocol_handle_json(const char *json)
{
    cJSON *root = cJSON_Parse(json);
    if (!root) {
        printf("[proto] parse failed\n");
        return;
    }

    cJSON *cmd = cJSON_GetObjectItem(root, "cmd");
    cJSON *data = cJSON_GetObjectItem(root, "data");

    if (!cmd || !data) {
        cJSON_Delete(root);
        return;
    }

    if (strcmp(cmd->valuestring, "device_control") == 0) {
        cJSON *eye = cJSON_GetObjectItem(data, "eye");
        cJSON *audio = cJSON_GetObjectItem(data, "audio");
        cJSON *ai = cJSON_GetObjectItem(data, "ai");
        cJSON *vibration = cJSON_GetObjectItem(data, "vibration");

        if (eye) {
            const char *mode = cJSON_GetObjectItem(eye, "mode")->valuestring;
            const char *left = cJSON_GetObjectItem(eye, "left")->valuestring;
            const char *right = cJSON_GetObjectItem(eye, "right")->valuestring;

            if (strcmp(mode, "sync") == 0) {
                eye_display_show_sync(parse_eye_expr(left));
            } else {
                eye_display_show_async(parse_eye_expr(left), parse_eye_expr(right));
            }
        }

        if (ai) {
            cJSON *voice_id = cJSON_GetObjectItem(ai, "voice_id");
            cJSON *kb_id = cJSON_GetObjectItem(ai, "kb_id");

            if (voice_id && voice_id->valuestring) {
                strncpy(s_ai_profile.voice_id, voice_id->valuestring, sizeof(s_ai_profile.voice_id) - 1);
            }

            if (kb_id && kb_id->valuestring) {
                strncpy(s_ai_profile.kb_id, kb_id->valuestring, sizeof(s_ai_profile.kb_id) - 1);
            }
        }

        if (vibration) {
            cJSON *duration = cJSON_GetObjectItem(vibration, "duration_ms");
            if (duration) {
                motor_vibrate_once(duration->valueint);
            }
        }

        if (audio) {
            cJSON *volume = cJSON_GetObjectItem(audio, "volume");
            if (volume) {
                printf("[audio] set volume=%d\n", volume->valueint);
                // audio_manager_set_volume(volume->valueint);
            }
        }
    }

    cJSON_Delete(root);
}
```

16. app_main.c

```c
#include <stdio.h>
#include "freertos/FreeRTOS.h"
#include "freertos/task.h"

void app_main(void)
{
    printf("SIBO AI Dual Eye Speaker Start\n");

    network_auto_connect();
    eye_display_init();
    motor_init();

    /*
     * 实际工程继续初始化:
     * touch_manager_init();
     * gsensor_manager_init();
     * audio_manager_init();
     * ai_client_init();
     * blufi_manager_init();
     * asset_manager_init();
     */

    eye_display_show_sync(EYE_EXPR_IDLE);

    while (1) {
        /*
         * 1. 处理触摸事件
         * 2. 处理三轴姿态事件
         * 3. 处理AI对话状态
         * 4. 处理小程序/后端命令
         * 5. 刷新双目动画
         * 6. 处理网络状态
         */
        vTaskDelay(pdMS_TO_TICKS(20));
    }
}
```

17. 总结

四博AI双目智能音箱方案的关键不是“加两块屏”,而是把ESP32-S3主控、双目光屏、四路触摸、三轴姿态、震动马达、AI大模型、语音克隆、专属知识库、小程序配置、Wi-Fi/4G/BLE联网统一成一个完整可量产平台。

它的优势可以总结为:

1. 基于ESP32-S3,适合AI音视频产品开发
2. 通配0.71/1.28双目光屏
3. 支持双目同显、双目异显
4. 标配四路触控感应和震动马达
5. 内置三轴姿态传感器,交互更灵敏
6. 支持小程序语音克隆和专属知识库配置
7. 支持客户后端、客户小程序、客户AI服务接入
8. 支持Wi-Fi、4G、BLE多种联网方式
9. 可扩展蓝牙音箱、TWS耳机、OTA和素材更新

对于B端客户来说,这套方案可以快速把一个普通智能音箱升级为有表情、有动作、有知识库、有专属声音的AI陪伴终端。