|
|
Arduino Nano ESP32-S3 |
x 1 | |
|
|
SSD1331_DISPLAY |
x 1 | |
|
|
BZT52C33-TP Micro Crystal
|
x 1 | |
|
|
AS01808PR-N50-R PUI Audio, Inc.
|
x 1 |
|
Arduino IDE Arduino
|
ESP32-S3 Based AI-Powered Conversational Device
This project is an ESP32-S3 based AI-enabled conversational device designed to serve as a friendly interface between users and a cloud-based AI system. The device does not run AI locally; instead, it focuses on capturing user input, handling connectivity, and delivering responses generated by the server in real time. The motivation behind this project is to demonstrate that natural and accessible human–AI interaction does not require powerful or expensive hardware. By using the ESP32-S3 as a lightweight communication bridge, the system achieves low cost, low power consumption, and scalability while still providing an AI friend experience. The project showcases how embedded devices can effectively enable real-time conversational interaction through a client–server architecture, supported by images and videos demonstrating the hardware, system design, and live operation.
This is a short YouTube video summarizing this project:
#include <Arduino.h>
#include <Wire.h>
#include <Adafruit_GFX.h>
#include <Adafruit_SSD1306.h>
#include <driver/i2s.h>
#include <WiFi.h>
#include <ArduinoWebsockets.h>
#include <Adafruit_NeoPixel.h>
#include <herBinary.h>
using namespace websockets;
#define AUDIO_INPUT_SAMPLE_RATE 16000
#define AUDIO_I2S_MIC_GPIO_WS 4
#define AUDIO_I2S_MIC_GPIO_SCK 5
#define AUDIO_I2S_MIC_GPIO_DIN 6
#define AUDIO_I2S_SPK_GPIO_DOUT 7
#define AUDIO_I2S_SPK_GPIO_BCLK 15
#define AUDIO_I2S_SPK_GPIO_LRCK 16
#define BUILTIN_LED_GPIO 48
#define VOLUME_DOWN_BUTTON_GPIO 39
#define I2S_MIC I2S_NUM_0
#define I2S_SPK I2S_NUM_1
#define BUFFER_SIZE 1024
#define SNOW_COUNT 16
// One flake of the idle-screen snow animation: a position plus the
// amount added to that position on every animation step.
struct Snow {
  float x;
  float y;
  float vx;
  float vy;
};

// Fixed pool of flakes; filled by initSnow() and advanced by drawIdle().
Snow snow[SNOW_COUNT];
void initSnow()
{
for (int i = 0; i < SNOW_COUNT; i++)
{
snow[i].x = random(-128, 128);
snow[i].y = random(-64, 0);
snow[i].vx = 0.3f + random(0, 10) * 0.02f;
snow[i].vy = 0.5f + random(0, 10) * 0.03f;
}
}
void respawnSnow(int i)
{
if (random(0, 2) == 0)
{
snow[i].x = random(0, 128);
snow[i].y = random(-10, 0);
}
else
{
snow[i].x = random(-10, 0);
snow[i].y = random(0, 64);
}
snow[i].vx = 0.3f + random(0, 10) * 0.02f;
snow[i].vy = 0.5f + random(0, 10) * 0.03f;
}
// Destination for the 16-bit samples produced by readAudioBuffer().
static int16_t audio_buffer[BUFFER_SIZE];
// Level fed to drawSiriWave() by displayTask(); written by
// streamAudioForMs() (clamped to 0..1) and by onMessageCallback().
volatile float g_audio_level = 0.0f;
// 128x64 SSD1306 display on the Wire bus. The trailing -1 is presumably
// "no reset pin" — confirm against the Adafruit_SSD1306 constructor.
Adafruit_SSD1306 display(128, 64, &Wire, -1);
// Single NeoPixel on BUILTIN_LED_GPIO, driven through led().
Adafruit_NeoPixel pixel(1, BUILTIN_LED_GPIO, NEO_GRB + NEO_KHZ800);
// NOTE(review): WiFi credentials are hard-coded in the sketch; consider
// moving them to a file that is not published with the source.
const char* ssid = "Hot Toc Hoang Anh";
const char* password = "07011979";
// Previous button sample; loop() compares it with the current one to
// detect a HIGH -> LOW transition.
bool lastButtonState = HIGH;
// Device states; displayTask() selects the animation from the current one.
enum {
STATE_IDLE = 0,
STATE_RECORDING = 1,
STATE_PROCESSING = 2,
STATE_SPEAKING = 3,
};
// Current state: written by loop() and onMessageCallback(), read by displayTask().
volatile int8_t state = STATE_IDLE;
// WebSocket connection to the server (opened in connectWS()).
WebsocketsClient client;
// Clears the display and prints `str` in size-1 white text starting at
// the top-left corner, then pushes the frame to the panel.
void disp(const String str)
{
  display.clearDisplay();

  // Text attributes and cursor are re-applied on every call.
  display.setTextSize(1);
  display.setTextColor(SSD1306_WHITE);
  display.setCursor(0, 0);

  display.println(str);
  display.display();
}
void drawIdle()
{
static uint32_t last = 0;
if (millis() - last < 40) return;
last = millis();
display.clearDisplay();
display.drawBitmap(
0, 0,
reze_bitmap_rezesmileSmol,
128, 64,
SSD1306_WHITE
);
for (int i = 0; i < SNOW_COUNT; i++)
{
snow[i].x += snow[i].vx;
snow[i].y += snow[i].vy;
int x = (int)snow[i].x;
int y = (int)snow[i].y;
if (x >= 0 && x < 128 && y >= 0 && y < 64)
{
display.drawPixel(x, y, SSD1306_WHITE);
if (i % 5 == 0)
display.drawPixel(x + 1, y, SSD1306_WHITE);
}
if (snow[i].x > 130 || snow[i].y > 66)
{
respawnSnow(i);
}
}
display.display();
}
// Draws four overlaid sine layers whose amplitude grows with `level`
// and is shaped by a bell curve centred on the screen. The phase is
// advanced after each drawn frame; frames are limited to one per 40 ms.
void drawSiriWave(float level)
{
  static float phase = 0;
  static uint32_t lastFrameMs = 0;
  if (millis() - lastFrameMs < 40) return;
  lastFrameMs = millis();

  display.clearDisplay();

  const int kMidY = 32;
  const float kCenter = 64.0;
  const float amplitude = 6 + level * 18;
  const int kLayers = 4;

  for (int layer = 0; layer < kLayers; layer++)
  {
    // Each successive layer is smaller, slightly faster and less "soft".
    float layerAmp = amplitude * (1.0 - layer * 0.2f);
    float freq = 0.04f + layer * 0.005f;
    float softness = 1.0f - layer * 0.15f;

    for (int col = 0; col < 128; col++)
    {
      float dist = abs(col - kCenter) / kCenter;
      float envelope = expf(-dist * dist * 3.5f);
      float wave =
        sinf(freq * col + phase) +
        softness * 0.4f * sinf(freq * 1.7f * col + phase);
      int row = kMidY + (int)(wave * layerAmp * envelope);

      if (row < 0 || row >= 64) continue;
      display.drawPixel(col, row, SSD1306_WHITE);
    }
  }

  display.display();
  phase += 0.12f + level * 0.35f;
}
// Draws a single fixed-amplitude sine wave with a bell-shaped envelope
// centred on the screen and advances its phase by 0.08 per call.
// Unlike drawSiriWave() this function has no frame-rate limit of its own.
void drawSpeakingWave()
{
  static float phase = 0;

  display.clearDisplay();

  const int kMidY = 32;
  const float kCenter = 64;

  for (int col = 0; col < 128; col++)
  {
    float dist = abs(col - kCenter) / kCenter;
    float env = expf(-dist * dist * 3);
    float wave = sinf(0.045f * col + phase);
    int row = kMidY + wave * 8 * env;
    display.drawPixel(col, row, SSD1306_WHITE);
  }

  display.display();
  phase += 0.08f;
}
// "Thinking" animation: three filled dots moving on an ellipse around
// the screen centre, 2.1 rad apart; the phase advances 0.08 per call.
void drawProcessing()
{
  static float phase = 0;

  display.clearDisplay();

  const int centerX = 64;
  const int centerY = 32;

  for (int dot = 0; dot < 3; dot++)
  {
    float angle = phase + dot * 2.1f;
    int px = centerX + cosf(angle) * 10;
    int py = centerY + sinf(angle) * 6;
    display.fillCircle(px, py, 2, SSD1306_WHITE);
  }

  display.display();
  phase += 0.08f;
}
// Sets the NeoPixel to (r, g, b) when `on` is true, otherwise clears it;
// the new value is latched with show() in both cases.
void led(uint8_t r, uint8_t g, uint8_t b, bool on) {
  if (!on) {
    pixel.clear();
  } else {
    pixel.setPixelColor(0, pixel.Color(r, g, b));
  }
  pixel.show();
}
bool readAudioBuffer() {
size_t bytes_read;
int32_t raw_buffer[BUFFER_SIZE];
esp_err_t res = i2s_read(
I2S_MIC,
raw_buffer,
BUFFER_SIZE * sizeof(int32_t),
&bytes_read,
portMAX_DELAY
);
if (res != ESP_OK || bytes_read == 0) return false;
int samples = bytes_read / sizeof(int32_t);
for (int i = 0; i < samples; i++) {
int32_t s = raw_buffer[i] >> 14;
audio_buffer[i] = (int16_t) constrain(s, -32768, 32767);
}
return true;
}
// Installs and configures the I2S driver for the microphone port (I2S_MIC):
// master/receive mode, AUDIO_INPUT_SAMPLE_RATE, 32-bit slots, left channel only.
// NOTE(review): the esp_err_t results of i2s_driver_install / i2s_set_pin /
// i2s_set_clk are not checked, so a failed init goes unnoticed.
void i2sInitMic() {
i2s_config_t cfg = {
.mode = (i2s_mode_t)(I2S_MODE_MASTER | I2S_MODE_RX),
.sample_rate = AUDIO_INPUT_SAMPLE_RATE,
// Samples are read as 32-bit words; readAudioBuffer() and
// streamAudioForMs() shift them right by 14 bits before use.
.bits_per_sample = I2S_BITS_PER_SAMPLE_32BIT,
.channel_format = I2S_CHANNEL_FMT_ONLY_LEFT,
.communication_format = I2S_COMM_FORMAT_I2S,
.intr_alloc_flags = ESP_INTR_FLAG_LEVEL1,
.dma_buf_count = 6,
.dma_buf_len = 1024,
.use_apll = true,
.tx_desc_auto_clear = false,
.fixed_mclk = 0
};
// Receive-only port: clock, word-select and data-in pins are assigned,
// data-out is left unchanged.
i2s_pin_config_t pin = {
.bck_io_num = AUDIO_I2S_MIC_GPIO_SCK,
.ws_io_num = AUDIO_I2S_MIC_GPIO_WS,
.data_out_num = I2S_PIN_NO_CHANGE,
.data_in_num = AUDIO_I2S_MIC_GPIO_DIN
};
i2s_driver_install(I2S_MIC, &cfg, 0, NULL);
i2s_set_pin(I2S_MIC, &pin);
i2s_set_clk(I2S_MIC, AUDIO_INPUT_SAMPLE_RATE, I2S_BITS_PER_SAMPLE_32BIT, I2S_CHANNEL_MONO);
}
// Installs and configures the I2S driver for the speaker port (I2S_SPK):
// master/transmit mode, 24000 Hz, 16-bit samples, left channel only.
// The 24000 Hz rate matches SR_OUT used by the server-side TTS code.
// NOTE(review): driver call results are not checked here either.
void i2sInitSpeaker() {
i2s_config_t cfg = {
.mode = (i2s_mode_t)(I2S_MODE_MASTER | I2S_MODE_TX),
.sample_rate = 24000,
.bits_per_sample = I2S_BITS_PER_SAMPLE_16BIT,
.channel_format = I2S_CHANNEL_FMT_ONLY_LEFT,
.communication_format = I2S_COMM_FORMAT_I2S,
.intr_alloc_flags = ESP_INTR_FLAG_LEVEL1,
.dma_buf_count = 6,
.dma_buf_len = 512,
.use_apll = true,
// Differs from the microphone config, where this flag is false.
.tx_desc_auto_clear = true,
.fixed_mclk = 0
};
// Transmit-only port: clock, word-select and data-out pins are assigned,
// data-in is left unchanged.
i2s_pin_config_t pin = {
.bck_io_num = AUDIO_I2S_SPK_GPIO_BCLK,
.ws_io_num = AUDIO_I2S_SPK_GPIO_LRCK,
.data_out_num = AUDIO_I2S_SPK_GPIO_DOUT,
.data_in_num = I2S_PIN_NO_CHANGE
};
i2s_driver_install(I2S_SPK, &cfg, 0, NULL);
i2s_set_pin(I2S_SPK, &pin);
i2s_set_clk(I2S_SPK, 24000, I2S_BITS_PER_SAMPLE_16BIT, I2S_CHANNEL_MONO);
}
// Starts the WiFi connection with the configured ssid/password and polls
// every 200 ms until connected. Gives up silently once more than 20 s
// have elapsed; the caller is not told whether the connection succeeded.
void initWiFi() {
  WiFi.begin(ssid, password);

  const unsigned long startedAt = millis();
  for (;;) {
    if (WiFi.status() == WL_CONNECTED) {
      return;
    }
    delay(200);
    if (millis() - startedAt > 20000) {
      return;
    }
  }
}
// WebSocket connection-event handler: logs connection open/close to Serial.
// `data` is part of the callback signature and is not used here.
//
// The original body contained bare string expressions such as
// ("WS connected"); which compile but do nothing, so the events were
// never reported. They are now printed to Serial, which setup() opens
// before the WebSocket connection is made.
void onEventCallback(WebsocketsEvent event, String data) {
  if (event == WebsocketsEvent::ConnectionOpened) {
    Serial.println("WS connected");
  } else if (event == WebsocketsEvent::ConnectionClosed) {
    Serial.println("WS closed");
  }
}
// LED animation selector polled by ledTask(): "rainbow" and "warning" select
// a colour cycle, any other value turns the LED off. onMessageCallback()
// sets it to "rainbow" / "None" from server text messages.
String led_mode = "None";
// WebSocket message handler.
//  - Text "tts_end": reset the audio level and return to STATE_IDLE.
//  - Any other text: switch to STATE_SPEAKING; "rainbow" / "off" also
//    change led_mode.
//  - Binary: switch to STATE_SPEAKING, derive a display level from the
//    payload size, and write the payload to the speaker I2S port (blocking).
void onMessageCallback(WebsocketsMessage message) {
  if (message.isText()) {
    String text = message.data();

    if (text == "tts_end") {
      g_audio_level = 0.0f;
      state = STATE_IDLE;
      return;
    }

    state = STATE_SPEAKING;
    if (text == "rainbow") {
      led_mode = "rainbow";
    } else if (text == "off") {
      led_mode = "None";
    }
    return;
  }

  if (!message.isBinary()) {
    return;
  }

  state = STATE_SPEAKING;
  const std::string& pcm = message.rawData();

  // The level is not measured from the audio: it is a value in
  // 0.3 .. 0.8 derived from the payload size modulo 4000.
  const float fraction = (pcm.size() % 4000) / 4000.0f;
  g_audio_level = 0.3f + fraction * 0.5f;

  size_t written = 0;
  i2s_write(I2S_SPK, pcm.data(), pcm.size(), &written, portMAX_DELAY);
}
// Registers the message/event callbacks and then retries the WebSocket
// connection to `ws_server` once per second until it succeeds.
// This never gives up, so it blocks forever if the server is unreachable.
void connectWS(String ws_server) {
  client.onMessage(onMessageCallback);
  client.onEvent(onEventCallback);

  for (;;) {
    if (client.connect(ws_server)) {
      break;
    }
    delay(1000);
  }
}
// One-pole envelope follower with separate rise and fall rates.
// The stored envelope moves toward `input` by 25% of the difference when
// the input is above it and by 4% otherwise; the updated envelope is
// returned. State is kept in a function-local static, so all callers
// share one envelope.
float envelopeFollow(float input)
{
  static float level = 0.0f;
  const float kAttack = 0.25f;
  const float kRelease = 0.04f;

  const float coeff = (input > level) ? kAttack : kRelease;
  level += coeff * (input - level);
  return level;
}
void streamAudioForMs(uint32_t duration_ms) {
client.send("{\"type\":\"stt_start\"}");
const size_t SAMPLES_PER_CHUNK = BUFFER_SIZE / sizeof(int32_t);
int32_t buffer32[SAMPLES_PER_CHUNK];
int16_t buffer16[SAMPLES_PER_CHUNK];
size_t bytes_read = 0;
unsigned long start = millis();
static int32_t dc_offset = 0;
const float alpha = 0.999f;
while (millis() - start < duration_ms) {
i2s_read(I2S_MIC, (void*)buffer32, sizeof(buffer32), &bytes_read, portMAX_DELAY);
if (bytes_read == 0) {
delay(2);
continue;
}
int samples = bytes_read / sizeof(int32_t);
for (int i = 0; i < samples; i++) {
int32_t s = buffer32[i] >> 14;
dc_offset = alpha * dc_offset + (1.0f - alpha) * s;
s -= dc_offset;
s *= 2;
if (s > 32767) s = 32767;
if (s < -32768) s = -32768;
buffer16[i] = (int16_t)s;
}
uint32_t sum = 0;
for (int i = 0; i < samples; i++) {
sum += abs(buffer16[i]);
}
float energy = (float)sum / samples;
float env = envelopeFollow(energy);
const float MIN_E = 150.0f;
const float MAX_E = 3500.0f;
float lvl = (env - MIN_E) / (MAX_E - MIN_E);
if (lvl < 0) lvl = 0;
if (lvl > 1) lvl = 1;
g_audio_level = lvl;
client.sendBinary((const char*)buffer16, samples * sizeof(int16_t));
client.poll();
}
client.send("{\"type\":\"stt_end\"}");
}
// Task handles for the display and LED tasks.
// NOTE(review): setup() passes NULL as the handle argument of xTaskCreate,
// so these stay NULL in the code shown here.
TaskHandle_t displayHandle = NULL;
TaskHandle_t ledHandle = NULL;
// Queue created in setup() (10 items of 128 chars).
// NOTE(review): no sender or receiver is visible in this file.
QueueHandle_t dispQueue;
// FreeRTOS task that animates the status LED according to led_mode.
//   "rainbow": cycles through four colours, 300 ms each.
//   "warning": alternates two colours, 300 ms each.
//   otherwise: LED off, re-checked every 300 ms.
// led_mode is re-read only at the start of each cycle. `param` is unused.
void ledTask(void *param) {
  struct Rgb {
    uint8_t r, g, b;
  };
  static const Rgb kRainbow[] = {
    {255, 0, 180}, {255, 180, 0}, {0, 120, 255}, {0, 255, 120}
  };
  static const Rgb kWarning[] = {
    {255, 0, 0}, {255, 165, 0}
  };

  for (;;) {
    if (led_mode == "rainbow") {
      for (const Rgb &c : kRainbow) {
        led(c.r, c.g, c.b, 1);
        vTaskDelay(300 / portTICK_PERIOD_MS);
      }
    } else if (led_mode == "warning") {
      for (const Rgb &c : kWarning) {
        led(c.r, c.g, c.b, 1);
        vTaskDelay(300 / portTICK_PERIOD_MS);
      }
    } else {
      led(0, 0, 0, 0);
      vTaskDelay(300 / portTICK_PERIOD_MS);
    }
  }
}
void displayTask(void *param)
{
while (1)
{
switch (state)
{
case STATE_IDLE:
drawIdle();
break;
case STATE_RECORDING:
drawSiriWave(g_audio_level);
break;
case STATE_PROCESSING:
drawProcessing();
break;
case STATE_SPEAKING:
drawSiriWave(g_audio_level);
break;
default:
disp("Unknown");
break;
}
}
}
// One-time bring-up: reads the server IP from the serial port, initialises
// the display, LED, WiFi, microphone and speaker, connects the WebSocket,
// and starts the display and LED tasks.
void setup() {
Serial.begin(115200);
dispQueue = xQueueCreate(10, sizeof(char) * 128);
// Prompt (Vietnamese): "Waiting for IP data from UART..."
Serial.println("Dang cho du lieu IP tu UART...");
// Wait up to 10 s for one line on Serial containing the server IP.
unsigned long start = millis();
String ip_from_serial = "";
while (millis() - start < 10000) {
if (Serial.available()) {
ip_from_serial = Serial.readStringUntil('\n');
ip_from_serial.trim();
break;
}
}
if (ip_from_serial.length() == 0) {
// Message (Vietnamese): "No IP received, using the default IP."
Serial.println("Không nhận được IP, dùng IP mặc định.");
// NOTE(review): "x.x.x.x" is a placeholder and must be replaced with a real address.
ip_from_serial = "x.x.x.x";
}
// NOTE(review): ":portnumber" is a placeholder too; the server code in this
// project listens on PORT = 8765.
String ws_server = "ws://" + ip_from_serial + ":portnumber";
Serial.println(ws_server);
delay(500);
// Reset GPIO 41/42 before handing them to the Wire bus
// (presumably SDA = 41, SCL = 42 — confirm against Wire.begin's parameter order).
gpio_reset_pin(GPIO_NUM_41);
gpio_reset_pin(GPIO_NUM_42);
Wire.begin(41, 42);
delay(100);
pixel.begin(); pixel.clear(); pixel.show();
// NOTE(review): the return value of display.begin() is not checked.
display.begin(SSD1306_SWITCHCAPVCC, 0x3C);
delay(100);
initSnow();
// Each stage below is announced on the display and on Serial before it runs.
disp("init Wifi");
Serial.println("init wifi");
delay(500);
initWiFi();
disp("init Mic");
Serial.println("init mic");
delay(500);
i2sInitMic();
disp("init Speaker");
Serial.println("init speaker");
delay(500);
i2sInitSpeaker();
disp(ws_server.c_str());
Serial.println("init connectWS");
delay(500);
// Blocks until the WebSocket connection succeeds (see connectWS()).
connectWS(ws_server);
// Seed the edge detector in loop() with the current button level.
// NOTE(review): no pinMode() call for VOLUME_DOWN_BUTTON_GPIO is visible in this file.
lastButtonState = digitalRead(VOLUME_DOWN_BUTTON_GPIO);
// Both tasks are created with priority 1 and no stored handle.
xTaskCreate(displayTask, "DisplayTask", 4096, NULL, 1, NULL);
delay(200);
xTaskCreate(ledTask, "LedTask", 2048, NULL, 1, NULL);
delay(200);
}
void loop() {
bool buttonState = digitalRead(VOLUME_DOWN_BUTTON_GPIO);
if (buttonState == LOW && lastButtonState == HIGH) {
state = STATE_RECORDING;
streamAudioForMs(5000);
state = STATE_PROCESSING;
}
lastButtonState = buttonState;
client.poll();
vTaskDelay(10);
}
// 'rezesmileSmolBinary', 128x64px
const unsigned char reze_bitmap_rezesmileSmol [] PROGMEM = {
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x3f, 0xff, 0xf8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0xff, 0xff, 0xff, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0f, 0xff, 0xff, 0xff, 0xf0, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1f, 0xff, 0xff, 0xff, 0xf8, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x3f, 0xff, 0xff, 0xff, 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x7f, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0x80, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xc0, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xc0, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0xff, 0xdf, 0xff, 0xff, 0xff, 0xe0, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x07, 0xff, 0xdf, 0xff, 0xff, 0xff, 0xf0, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x07, 0xff, 0xff, 0xff, 0xff, 0xff, 0xf0, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x0f, 0xff, 0xff, 0xfb, 0xff, 0xff, 0xf8, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xf8, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x1f, 0xff, 0xff, 0xff, 0xfb, 0xff, 0xfc, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x1f, 0xff, 0xff, 0xff, 0xf3, 0xff, 0xfc, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x1f, 0xff, 0xff, 0xff, 0xe3, 0xff, 0xfc, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x1f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfe, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x3f, 0xff, 0xff, 0xff, 0x87, 0xff, 0xfe, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x3f, 0xff, 0xff, 0xff, 0x03, 0xff, 0xfe, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x3f, 0xff, 0xff, 0xfe, 0x01, 0xff, 0xfe, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x3f, 0xff, 0xff, 0xff, 0xfd, 0xff, 0xfe, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x3f, 0xff, 0xff, 0xff, 0xe7, 0xff, 0xfe, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x3f, 0xfb, 0xff, 0xff, 0xf5, 0xff, 0xfe, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x3f, 0xf3, 0xff, 0xef, 0x7d, 0xff, 0xfe, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x1f, 0xf3, 0xff, 0xc1, 0xd8, 0xff, 0xfe, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x1f, 0xe7, 0xff, 0xc0, 0x28, 0xff, 0xfe, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x1f, 0xe5, 0xff, 0xc4, 0x00, 0xff, 0xfe, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x3f, 0xf1, 0xff, 0xcd, 0x80, 0x7f, 0xfe, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x7f, 0xf9, 0xff, 0xc9, 0x80, 0x3f, 0xfe, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x7f, 0xf8, 0xff, 0xc1, 0x60, 0x3f, 0xfe, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xfc, 0x7f, 0x80, 0x40, 0x1f, 0xfc, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0x80, 0x00, 0x0f, 0xfc, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x01, 0xff, 0xff, 0x7f, 0xc0, 0x00, 0x07, 0xfc, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x01, 0xff, 0xfe, 0x7f, 0xc0, 0x00, 0x7f, 0xfc, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x01, 0xff, 0xfe, 0x7f, 0xc0, 0x00, 0x37, 0xfc, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xfe, 0x3f, 0xe0, 0x40, 0x07, 0xf8, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xfe, 0x3f, 0xe0, 0x38, 0x07, 0xf8, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x7f, 0xfe, 0x3f, 0xd0, 0x0e, 0x0f, 0xf8, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x7f, 0xfe, 0x3f, 0xc8, 0x00, 0x1f, 0xf8, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x3f, 0xfe, 0x37, 0xc4, 0x01, 0x1f, 0xf0, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x0f, 0xfe, 0x1f, 0xc0, 0x00, 0x3f, 0xf0, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x02, 0x07, 0xfc, 0x1b, 0xe0, 0x00, 0xff, 0xf0, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x02, 0x0f, 0xfc, 0x1f, 0xf8, 0x01, 0xff, 0xd0, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x03, 0x0f, 0xfe, 0x0f, 0xae, 0x07, 0xff, 0xe0, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x01, 0x8b, 0xff, 0x0f, 0xff, 0x0d, 0xf7, 0xe0, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xc7, 0xdf, 0xf1, 0x7f, 0xe0, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x7f, 0xff, 0xf9, 0xdf, 0xc6, 0x63, 0x90, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x1f, 0xff, 0xff, 0xff, 0xf4, 0x63, 0x0c, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x3f, 0xff, 0xff, 0xff, 0xf8, 0xc3, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x7f, 0xdf, 0xff, 0xff, 0xf7, 0x83, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0xd7, 0xce, 0xff, 0xff, 0xfa, 0x06, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x97, 0x8f, 0xcf, 0xff, 0xf9, 0x3c, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x01, 0x0f, 0x07, 0x80, 0x1f, 0xf9, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x01, 0x06, 0x00, 0x60, 0x0f, 0xf9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x01, 0x03, 0x00, 0x18, 0x0e, 0x79, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x01, 0x07, 0x80, 0x07, 0x06, 0x39, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x0f, 0xc0, 0x00, 0xe6, 0x39, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x48, 0x70, 0x00, 0x1f, 0x79, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x29, 0xd8, 0x00, 0x07, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x1f, 0x7c, 0x00, 0x01, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x1f, 0xff, 0x00, 0x00, 0xbf, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
};
// Array of all bitmaps for convenience. (Total bytes used to store images in PROGMEM = 1040)
// Number of entries in reze_bitmap_allArray.
const int reze_bitmap_allArray_LEN = 1;
// Table of bitmap pointers; it currently holds only reze_bitmap_rezesmileSmol.
const unsigned char* reze_bitmap_allArray[1] = {
reze_bitmap_rezesmileSmol
};
import asyncio, websockets, json, tempfile, os, wave
import unicodedata
import time
import numpy as np
from vosk import Model, KaldiRecognizer
from piper import PiperVoice
import soundfile as sf
import re
from google import generativeai as genai
from asyncio import Queue
# Sample rate (Hz) of audio received from the device; passed to KaldiRecognizer.
SR_IN = 16000
# Sample rate (Hz) that synthesized speech is resampled to before sending.
SR_OUT = 24000
# TCP port the WebSocket server listens on.
PORT = 8765
# Speech-to-text and text-to-speech models are loaded once at import time.
model = Model("vosk-model-en-us-0.22-lgraph/vosk-model-en-us-0.22-lgraph")
voice = PiperVoice.load("models/en-us-libritts-high.onnx")
print("Models loaded OK.")
# NOTE(review): "gemini_api" is a placeholder; supply the real key from the
# environment or a config file rather than committing it to source.
genai.configure(api_key="gemini_api")
model_gemini = genai.GenerativeModel("gemini-2.5-flash-lite")
print('Gemini ok')
# NOTE(review): no producer or consumer of this queue is visible in this file.
music_ack_queue = Queue()
def do_stt(audio_bytes: bytes) -> str:
    """Transcribe raw PCM16, 16 kHz, mono audio and return the recognized text.

    The whole buffer is fed to a fresh recognizer in one call. Returns an
    empty string when the result contains no "text" field.
    """
    recognizer = KaldiRecognizer(model, SR_IN)
    recognizer.AcceptWaveform(audio_bytes)
    final_result = json.loads(recognizer.FinalResult())
    return final_result.get("text", "")
def do_tts(text: str) -> bytes:
    """Synthesize `text` and return it as raw PCM16 bytes at SR_OUT Hz.

    The voice writes a WAV into a temporary file, which is read back as
    int16 samples and linearly resampled to SR_OUT when its rate differs.

    Changes from the previous version:
      * The temporary file is removed in a ``finally`` block, so it no
        longer leaks when synthesis or reading raises.
      * The resampling grid uses ``endpoint=False``. Output sample k then
        sits at input position k * fs / SR_OUT; the old grid ran up to
        index ``len(data)``, one past the last sample.
      * Empty audio is returned as-is instead of failing inside np.interp.
    """
    # delete=False: only the name is needed here; the file is reopened by
    # wave.open below and removed explicitly afterwards.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tf:
        out_path = tf.name
    try:
        with wave.open(out_path, "wb") as wavf:
            voice.synthesize_wav(text, wavf)
        data, fs = sf.read(out_path, dtype='int16')
    finally:
        os.remove(out_path)

    if fs != SR_OUT and len(data) > 0:
        n_out = int(len(data) * SR_OUT / fs)
        data = np.interp(
            np.linspace(0, len(data), n_out, endpoint=False),
            np.arange(len(data)),
            data,
        ).astype(np.int16)
    return data.tobytes()
def Gemini_resp(msg):
    """Ask the Gemini model to answer `msg` in the "Yuki" persona.

    The reply is cleaned up for speech synthesis: combining accents are
    stripped, characters outside the allowed set are removed, and runs of
    whitespace are collapsed. The cleaned text is printed and returned.
    """
    persona = (
        "You are Yuki, a gothic-style girlfriend with a calm, thoughtful personality. "
        "You speak in a warm, slightly mysterious tone — never robotic, never too formal. "
        "Respond very shortly, naturally and clearly, with enough detail to sound human, but keep sentences concise. "
        "Plain text answer. "
    )
    prompt = f"{persona}Question: {msg}"
    response = model_gemini.generate_content(prompt)

    # Decompose accented characters, then drop the combining marks.
    reply = unicodedata.normalize("NFD", response.text.strip())
    reply = re.sub(r'[\u0300-\u036f]', '', reply)
    # Keep word characters, whitespace and basic punctuation only.
    reply = re.sub(r'[^\w\s.,!?:;\'"À-ỹ-]', '', reply, flags=re.UNICODE)
    reply = re.sub(r'\s+', ' ', reply).strip()

    print("Gemini response:", reply)
    return reply
async def handle_client(ws):
    """Serve one device connection.

    Protocol (as used by this file):
      * text ``{"type": "stt_start"}``  -> start buffering binary audio
      * binary frames                  -> appended to the buffer while collecting
      * text ``{"type": "stt_end"}``    -> run STT, build a reply, stream TTS back
      * buffering longer than ``max_collect_duration`` seconds also triggers
        processing.
    The reply is sent sentence by sentence: the sentence text, then its
    PCM16 audio in binary chunks, and finally the text ``tts_end``.

    Changes from the previous version:
      * Song requests are detected on whole words. The old substring test
        matched "sing" inside words such as "using" or "missing".
      * ``user_wants_song`` is initialised up front instead of being probed
        through ``locals()``.
      * Text that is valid JSON but not an object (e.g. ``5``) is treated as
        plain text instead of raising on ``.get`` and ending the handler;
        the bare ``except:`` is narrowed to ``ValueError``.
      * Speech synthesis goes through ``do_tts`` instead of a duplicated copy
        of its body.
    """
    print("Client connected")
    collecting = False
    collected = bytearray()
    collect_start_time = None
    max_collect_duration = 6.0  # seconds

    # Same chunk size as before: 4096 int16 samples per binary frame.
    chunk_bytes = 4096 * 2

    async def process_collected_and_respond():
        """Run STT on the buffered audio, answer, and reset the buffer."""
        nonlocal collected, collecting, collect_start_time
        if len(collected) == 0:
            return

        user_wants_song = False
        print(f"Running STT on {len(collected)} bytes...")
        try:
            text = do_stt(bytes(collected))
        except Exception as e:
            print("STT error:", e)
            text = ""
        print("STT result:", text)

        if text.strip():
            lower = text.lower()
            user_wants_song = re.search(
                r"\b(sing|sings|singing|song|songs)\b", lower
            ) is not None
            if user_wants_song:
                resp = "Of course"
            else:
                try:
                    resp = Gemini_resp(text)
                except Exception as e:
                    print("Gemini error:", e)
                    resp = "I'm sorry, what"
        else:
            resp = "I cant hear you. please speak slow and clearly"

        try:
            sentences = re.split(r'(?<=[.!?]) +', resp)
            for idx, sent in enumerate(sentences):
                sent = sent.strip()
                if not sent:
                    continue
                print(f"TTS part {idx+1}/{len(sentences)}: {sent}")
                # The sentence text goes first, then its audio.
                await ws.send(sent)
                await asyncio.sleep(0.1)
                pcm = do_tts(sent)
                for i in range(0, len(pcm), chunk_bytes):
                    await ws.send(pcm[i:i + chunk_bytes])
                    await asyncio.sleep(0.001)
                await asyncio.sleep(0.05)
            print("TTS streaming done")
            await ws.send("tts_end")
            if user_wants_song:
                print("Trigger music playback...")
                asyncio.create_task(play_music(ws, name='caramelldansen'))
        except Exception as e:
            print("TTS streaming error:", e)

        collected = bytearray()
        collecting = False
        collect_start_time = None

    try:
        async for msg in ws:
            if isinstance(msg, str):
                try:
                    j = json.loads(msg)
                except ValueError:
                    j = None
                msg_type = j.get("type") if isinstance(j, dict) else None
                if msg_type == "stt_start":
                    collecting = True
                    collected = bytearray()
                    collect_start_time = time.time()
                    print("Start collecting audio")
                elif msg_type == "stt_end":
                    print("Received stt_end -> processing")
                    await process_collected_and_respond()
                else:
                    print("Text msg:", msg)
            else:
                if collecting:
                    collected.extend(msg)
                    if time.time() - collect_start_time > max_collect_duration:
                        print("Max collect duration exceeded -> process")
                        await process_collected_and_respond()
                else:
                    print(f"Binary received but not collecting ({len(msg)} bytes)")
    except websockets.exceptions.ConnectionClosedOK:
        print("Client closed")
    except Exception as e:
        print("Error:", e)
    print("Client disconnected")
async def play_music(ws, name='caramelldansen', chunk_bytes=1024):
    """Stream ``<name>.wav`` to the device as raw PCM16 binary frames.

    Sends the text ``rainbow`` before the audio and ``off`` followed by
    ``tts_end`` after it. Returns without sending anything when the file
    does not exist. Only the first channel of a multi-channel file is used.

    Change from the previous version: the audio is resampled to SR_OUT when
    the file uses a different sample rate, as the TTS path already does.
    The device plays everything at a fixed 24 kHz, so a file at another
    rate used to play at the wrong speed.
    """
    file_name = f"{name}.wav"
    if not os.path.exists(file_name):
        print("Music file missing:", file_name)
        return

    data, fs = sf.read(file_name, dtype='int16')
    if len(data.shape) == 2:
        data = data[:, 0]
    if fs != SR_OUT and len(data) > 0:
        n_out = int(len(data) * SR_OUT / fs)
        data = np.interp(
            np.linspace(0, len(data), n_out, endpoint=False),
            np.arange(len(data)),
            data,
        ).astype(np.int16)
    raw = data.tobytes()

    await ws.send("rainbow")
    try:
        for start in range(0, len(raw), chunk_bytes):
            await ws.send(raw[start:start + chunk_bytes])
            await asyncio.sleep(0.001)
        print("Music stream done")
        await ws.send("off")
        await ws.send("tts_end")
    except Exception as e:
        print("Music error:", e)
async def main():
    """Run the WebSocket server on all interfaces at PORT until cancelled.

    Message size, queue length and keep-alive pings are all disabled
    (``None``) in the server options.
    """
    print(f"Starting WebSocket server on port {PORT}...")
    server = websockets.serve(
        handle_client,
        "0.0.0.0",
        PORT,
        max_size=None,
        max_queue=None,
        ping_interval=None,
    )
    async with server:
        # A future that is never resolved keeps the server alive.
        await asyncio.Future()
# Script entry point: run the server until interrupted with Ctrl+C.
if __name__ == "__main__":
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        print('Exit.')
        # exit() is a helper injected by the `site` module for interactive
        # use and is not guaranteed to exist; raising SystemExit is the
        # equivalent that always works.
        raise SystemExit(0)
ESP32-S3 Based AI-Powered Conversational Device
Raspberry Pi 5 7 Inch Touch Screen IPS 1024x600 HD LCD HDMI-compatible Display for RPI 4B 3B+ OPI 5 AIDA64 PC Secondary Screen(Without Speaker)
BUY NOW
- Comments(1)
- Likes(1)
-
knisa
Jan 08,2026
- 0 USER VOTES
- YOUR VOTE 0.00 0.00
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
More by Yuki Akira
-
AI-driven LoRa & LLM-enabled Kiosk & Food Delivery System
111 2 0 -
-
-
-
ESP32-C3 BLE Keyboard - Battery Powered with USB-C Charging
361 0 0 -
-
mammoth-3D SLM Voron Toolhead – Manual Drill & Tap Edition
463 0 1 -
-
AEL-2011 Power Supply Module
1067 0 2 -
AEL-2011 50W Power Amplifier
920 0 2 -
-







