ESP32 Audio Input Optimization

Hi everyone, I have been able to get the ESP32 working with the Edge Impulse library and my microphone, but it seems laggy, and I am trying to find a way to optimize the “hearing” aspect. When I say a word, it can take about a second for the LED to turn on. If anyone has any ideas, I would greatly appreciate it!

#include <driver/i2s.h>
#include <Sylo_inferencing.h>

// GPIO pin driving the indicator LED (configured as an output in setup()).
int LED = 13; 
// Timing bookkeeping for loop(); it is not used for any decision.
long prev = 0;

// Forward declaration of the microphone task. The task creation in setup() is
// commented out in this sketch.
void TaskMic( void *pvParameters );
 
 
// Busy flag associated with the `features` window.
bool semaphore = false;
 
// Sliding window of the most recent 16000 samples (1 s at the 16 kHz rate
// configured in setup()); this is the input handed to the classifier.
float features[16000];
//float features_mic[16000];
 
// Signal callback for the classifier: copies `length` floats starting at
// `offset` in the global `features` window into `out_ptr`. Always returns 0.
int raw_feature_get_data(size_t offset, size_t length, float *out_ptr) {
  const float *window_start = &features[offset];
  const size_t byte_count = sizeof(float) * length;
  memcpy(out_ptr, window_start, byte_count);
  return 0;
}
 
 
// printf-style logging hook: formats into a static 1 KiB buffer and writes
// the result to Serial. Nothing is written when formatting yields no output.
void ei_printf(const char *format, ...) {
  static char print_buf[1024] = { 0 };

  va_list arg_list;
  va_start(arg_list, format);
  const int formatted_len = vsnprintf(print_buf, sizeof(print_buf), format, arg_list);
  va_end(arg_list);

  if (formatted_len <= 0) {
    return;
  }
  Serial.write(print_buf);
}
 
 
// I2S peripheral instance used for the driver install, pin setup and reads.
const i2s_port_t I2S_PORT = I2S_NUM_0;
// Samples per DMA buffer and per i2s_read_bytes() call
// (1000 samples = 62.5 ms at the configured 16 kHz).
const int BLOCK_SIZE = 1000;
 
void setup() {
  Serial.begin(115200);
    pinMode(LED, OUTPUT); 

  Serial.println("Configuring I2S...");
  esp_err_t err;
 
  // The I2S config as per the example
  const i2s_config_t i2s_config = {
    .mode = i2s_mode_t(I2S_MODE_MASTER | I2S_MODE_RX), // Receive, not transfer
    .sample_rate = 16000,                         // 16KHz
    .bits_per_sample = I2S_BITS_PER_SAMPLE_32BIT, // could only get it to work with 32bits
    .channel_format = I2S_CHANNEL_FMT_ONLY_LEFT, // although the SEL config should be left, it seems to transmit on right
    .communication_format = i2s_comm_format_t(I2S_COMM_FORMAT_I2S | I2S_COMM_FORMAT_I2S_MSB),
    .intr_alloc_flags = ESP_INTR_FLAG_LEVEL1,     // Interrupt level 1
    .dma_buf_count = 10,                           // number of buffers
    .dma_buf_len = BLOCK_SIZE                     // samples per buffer
  };
 
  // The pin config as per the setup
  const i2s_pin_config_t pin_config = {
    .bck_io_num = 14,   // BCKL
    .ws_io_num = 15,    // LRCL
    .data_out_num = -1, // not used (only for speakers)
    .data_in_num = 32   // DOUT
  };
 
  // Configuring the I2S driver and pins.
  // This function must be called before any I2S driver read/write operations.
  err = i2s_driver_install(I2S_PORT, &i2s_config, 0, NULL);
  if (err != ESP_OK) {
    Serial.printf("Failed installing driver: %d\n", err);
    while (true);
  }
  err = i2s_set_pin(I2S_PORT, &pin_config);
  if (err != ESP_OK) {
    Serial.printf("Failed setting pin: %d\n", err);
    while (true);
  }
  Serial.println("I2S driver installed.");
 
//  xTaskCreatePinnedToCore(
//    TaskMic
//    ,  "TaskMic"   // A name just for humans
//    ,  10024  // This stack size can be checked & adjusted by reading the Stack Highwater
//    ,  NULL
//    ,  2  // Priority, with 3 (configMAX_PRIORITIES - 1) being the highest, and 0 being the lowest.
//    ,  NULL
//    ,  0);
}
 
 
void loop() {
  
    int32_t samples[BLOCK_SIZE];
    int num_bytes_read = i2s_read_bytes(I2S_PORT,
                                        (char *)samples,
                                        BLOCK_SIZE * 4,     // the doc says bytes, but its elements.
                                        portMAX_DELAY); // no timeout
 
    int samples_read = num_bytes_read / 8;
 
    long tr = millis();
    long raz = tr - prev;
    prev = tr;
    semaphore = true;
    
while(semaphore){
    for (int i = 0; i < 16000; i++) {
      if (i < 15000) {
        features[i] = features[i + 1000];
 
      } else {
        features[i] = (float)samples[i - 15000];
      }
 
    }
    semaphore = false;
}
 
  if (sizeof(features) / sizeof(float) != EI_CLASSIFIER_DSP_INPUT_FRAME_SIZE) {
    ei_printf("The size of your 'features' array is not correct. Expected %lu items, but had %lu\n",
              EI_CLASSIFIER_DSP_INPUT_FRAME_SIZE, sizeof(features) / sizeof(float));
    delay(1000);
    return;
  }
 
  ei_impulse_result_t result = { 0 };
 
  // the features are stored into flash, and we don't want to load everything into RAM
  signal_t features_signal;
  features_signal.total_length = sizeof(features) / sizeof(features[0]);
  features_signal.get_data = &raw_feature_get_data;
 
  // invoke the impulse
  EI_IMPULSE_ERROR res = run_classifier(&features_signal, &result, false /* debug */);
 
 
  int ind = -1;
  float maxx = 0.0;
 
  for (int in = 0; in < EI_CLASSIFIER_LABEL_COUNT; in++) {
 
    if (result.classification[in].value > maxx && result.classification[in].value > 0.80) {
 
      maxx = result.classification[in].value;
      ind = in;
    }
  }
 
  if (ind >= 0) {
    String to_prnt = "";
 
    to_prnt += result.classification[ind].label;
    to_prnt += "  ";
    to_prnt += result.classification[ind].value;
 
    Serial.println(to_prnt);
  }
 
  ei_printf("run_classifier returned: %d\n", res);
 
  if (res != 0) return;
 
  // print the predictions
  ei_printf("Predictions ");
  ei_printf("(DSP: %d ms., Classification: %d ms., Anomaly: %d ms.)",
            result.timing.dsp, result.timing.classification, result.timing.anomaly);
  ei_printf(": \n");
  ei_printf("[");
  for (size_t ix = 0; ix < EI_CLASSIFIER_LABEL_COUNT; ix++) {
    ei_printf("%.5f", result.classification[ix].value);
#if EI_CLASSIFIER_HAS_ANOMALY == 1
    ei_printf(", ");
#else
    if (ix != EI_CLASSIFIER_LABEL_COUNT - 1) {
      ei_printf(", ");
    }
#endif
  }
#if EI_CLASSIFIER_HAS_ANOMALY == 1
  ei_printf("%.3f", result.anomaly);
#endif
  ei_printf("]\n");
 
  // human-readable predictions
  for (size_t ix = 0; ix < EI_CLASSIFIER_LABEL_COUNT; ix++) {
    ei_printf("    %s: %.5f\n", result.classification[ix].label, result.classification[ix].value);
  }
#if EI_CLASSIFIER_HAS_ANOMALY == 1
  ei_printf("    anomaly score: %.3f\n", result.anomaly);
#endif

   if(result.classification[2].value > 0.8){
     digitalWrite(LED, HIGH);
     delay(1000);
    digitalWrite(LED, LOW);
  }
 
  delay(10);
}

Hi @JVanlla,

You are currently running sequentially: first sampling, then classifying, and then repeating the process. This is blocking; while the classification process is running, you are not capturing any audio data.

One way you can improve the responsiveness is to capture audio in parallel to running the classification. You may do this with a DMA and a double buffering mechanism. See run continuous audio for more explanation on how you can achieve this.

For an example of how we do this with the Arduino Nano 33 BLE Sense, you can export an Arduino library and take a look at the nano_ble33_sense_microphone_continuous.ino example.

I have tried to adapt my code to the EI microphone continuous example but have run into issues when trying to compile. Could you please take a look and let me know what you think?

//Libraries--------------------------------------------------------------------------------------------------------------------------

#include <driver/i2s.h>
#include <Sylo_inferencing.h>

//Global Variables-------------------------------------------------------------------------------------------------------------------

// I2S peripheral instance used for the driver install, pin setup and reads.
const i2s_port_t I2S_PORT = I2S_NUM_0;
// Samples per DMA buffer and per i2s_read_bytes() call.
const int BLOCK_SIZE = 1000;

// State of the two-buffer capture scheme shared by the capture callback and
// the inference code.
typedef struct {
// The two 16-bit sample buffers, allocated in microphone_inference_start().
signed short *buffers[2];
// Index of the buffer the capture callback writes into; the classifier reads
// the other one (buf_select ^ 1).
unsigned char buf_select;
// Set to 1 by the capture callback when a buffer has been filled; cleared by
// microphone_inference_record().
unsigned char buf_ready;
// Write position inside the buffer currently being filled.
unsigned int buf_count;
// Capacity of each buffer, in samples.
unsigned int n_samples;
} inference_t;

// The single capture/inference state instance.
static inference_t inference;
// Set to true at the end of microphone_inference_start(); the capture callback
// stores nothing while it is false.
static bool record_ready = false;
// Scratch buffer allocated by microphone_inference_start().
static signed short *sampleBuffer;
// Slice counter used by loop() to decide when to print; it starts negative.
static int print_results = -(EI_CLASSIFIER_SLICES_PER_MODEL_WINDOW);

//Edge Impulse Print Function--------------------------------------------------------------------------------------------------------

// printf-style logging hook: formats into a static 1 KiB buffer and writes
// the result to Serial. Nothing is written when formatting yields no output.
void ei_printf(const char *format, ...) {
  static char print_buf[1024] = { 0 };

  va_list arg_list;
  va_start(arg_list, format);
  const int formatted_len = vsnprintf(print_buf, sizeof(print_buf), format, arg_list);
  va_end(arg_list);

  if (formatted_len <= 0) {
    return;
  }
  Serial.write(print_buf);
}

//Void Setup-------------------------------------------------------------------------------------------------------------------------

void setup() {
  
  Serial.begin(115200); //Serial Begin

//ESP32 I2S Config-------------------------------------------------------------------------------------------------------------------
  
  Serial.println("Configuring I2S..."); //I2S Config
  esp_err_t err;
 
  // The I2S config as per the example
  const i2s_config_t i2s_config = {
.mode = i2s_mode_t(I2S_MODE_MASTER | I2S_MODE_RX), // Receive, not transfer
.sample_rate = 16000,                         // 16KHz
.bits_per_sample = I2S_BITS_PER_SAMPLE_32BIT, // could only get it to work with 32bits
.channel_format = I2S_CHANNEL_FMT_ONLY_LEFT, // although the SEL config should be left, it seems to transmit on right
.communication_format = i2s_comm_format_t(I2S_COMM_FORMAT_I2S | I2S_COMM_FORMAT_I2S_MSB),
.intr_alloc_flags = ESP_INTR_FLAG_LEVEL1,     // Interrupt level 1
.dma_buf_count = 10,                           // number of buffers
.dma_buf_len = BLOCK_SIZE                     // samples per buffer
  };
 
  // The pin config as per the setup
  const i2s_pin_config_t pin_config = {
.bck_io_num = 14,   // BCKL
.ws_io_num = 15,    // LRCL
.data_out_num = -1, // not used (only for speakers)
.data_in_num = 32   // DOUT
  };
 
  // Configuring the I2S driver and pins.
  // This function must be called before any I2S driver read/write operations.
  err = i2s_driver_install(I2S_PORT, &i2s_config, 0, NULL);
  if (err != ESP_OK) {
Serial.printf("Failed installing driver: %d\n", err);
while (true);
  }
  err = i2s_set_pin(I2S_PORT, &pin_config);
  if (err != ESP_OK) {
Serial.printf("Failed setting pin: %d\n", err);
while (true);
  }
  Serial.println("I2S driver installed.");

//Summary of Infrencing Settings-----------------------------------------------------------------------------------------------------

  ei_printf("Inferencing settings:\n");
  ei_printf("\tInterval: %.2f ms.\n", (float)EI_CLASSIFIER_INTERVAL_MS);
  ei_printf("\tFrame size: %d\n", EI_CLASSIFIER_DSP_INPUT_FRAME_SIZE);
  ei_printf("\tSample length: %d ms.\n", EI_CLASSIFIER_RAW_SAMPLE_COUNT / 16);
  ei_printf("\tNo. of classes: %d\n", sizeof(ei_classifier_inferencing_categories) / sizeof(ei_classifier_inferencing_categories[0]));

  run_classifier_init();
  if (microphone_inference_start(EI_CLASSIFIER_SLICE_SIZE) == false) {
    ei_printf("ERR: Failed to setup audio sampling\r\n");
    return;
  }

//Void Setup Continued---------------------------------------------------------------------------------------------------------------

}

//Void Loop-------------------------------------------------------------------------------------------------------------------------

void loop() {

//Edge Impulse Infrencing Loop-------------------------------------------------------------------------------------------------------

   bool m = microphone_inference_record();
   if (!m) {
    ei_printf("ERR: Failed to record audio...\n");
    return;
   }
   
   signal_t signal;
   signal.total_length = EI_CLASSIFIER_SLICE_SIZE;
   signal.get_data = &microphone_audio_signal_get_data;
   ei_impulse_result_t result = {0};

   EI_IMPULSE_ERROR r = run_classifier_continuous(&signal, &result, false);
   if (r != EI_IMPULSE_OK) {
    ei_printf("ERR: Failed to run classifier (%d)\n", r);
    return;
}

   if (++print_results >= (EI_CLASSIFIER_SLICES_PER_MODEL_WINDOW)) {
    // print the predictions
    ei_printf("Predictions ");
    ei_printf("(DSP: %d ms., Classification: %d ms., Anomaly: %d ms.)",
        result.timing.dsp, result.timing.classification, result.timing.anomaly);
    ei_printf(": \n");
    for (size_t ix = 0; ix < EI_CLASSIFIER_LABEL_COUNT; ix++) {
        ei_printf("    %s: %.5f\n", result.classification[ix].label,
                  result.classification[ix].value);
    }
#if EI_CLASSIFIER_HAS_ANOMALY == 1
    ei_printf("    anomaly score: %.3f\n", result.anomaly);
#endif

    print_results = 0;
}

//Void Loop Continued----------------------------------------------------------------------------------------------------------------

}

//Edge Impulse Get Data Function-----------------------------------------------------------------------------------------------------

// Signal callback for the classifier: converts `length` 16-bit samples,
// starting at `offset` in the buffer that is NOT currently being filled, to
// floats in `out_ptr`. Always returns 0.
int microphone_audio_signal_get_data(size_t offset, size_t length, float *out_ptr){
  const int completed_buffer = inference.buf_select ^ 1;
  signed short *slice_start = &inference.buffers[completed_buffer][offset];
  numpy::int16_to_float(slice_start, out_ptr, length);
  return 0;
}

//Edge Impulse Record Audio Function-------------------------------------------------------------------------------------------------

// Blocks until the capture side reports a filled buffer, then clears the
// ready flag. Returns false (after still waiting for the next buffer) when a
// buffer was already pending on entry, i.e. the previous one was not consumed
// in time; returns true otherwise.
static bool microphone_inference_record(void){
  const bool overrun = (inference.buf_ready == 1);

  if (overrun) {
    ei_printf(
        "Error sample buffer overrun. Decrease the number of slices per model window "
        "(EI_CLASSIFIER_SLICES_PER_MODEL_WINDOW)\n");
  }

  // Poll once per millisecond until a buffer is ready.
  for (; inference.buf_ready == 0; delay(1)) {
  }

  inference.buf_ready = 0;

  return !overrun;
}

//Edge Impulse Read Buffer Function-------------------------------------------------------------------------------------------------

// Reads one block of BLOCK_SIZE 32-bit samples from I2S (blocking until it is
// available) and appends it, narrowed to 16 bits, to the inference buffer that
// is currently being filled. When that buffer is full, the buffers are swapped
// and `inference.buf_ready` is raised for microphone_inference_record().
// Nothing is stored until `record_ready` is true.
//
// Fixes over the previous version:
//  * samples are taken from the block that was just read, not from
//    `sampleBuffer`, which was never filled;
//  * the sample count is bytes / 4 (32-bit samples), not bytes >> 1, which
//    ran past the end of both arrays;
//  * a failed or empty read is ignored.
static void pdm_data_ready_inference_callback(void){
  int32_t samples[BLOCK_SIZE];

  // The size argument is in bytes: BLOCK_SIZE samples of 4 bytes each.
  const int bytesRead = i2s_read_bytes(I2S_PORT, (char *)samples, sizeof(samples), portMAX_DELAY);

  if (bytesRead <= 0 || record_ready != true) {
    return;
  }

  const int samplesRead = bytesRead / (int)sizeof(samples[0]);

  for (int i = 0; i < samplesRead; i++) {
    // Keep the 16 most significant bits of each 32-bit sample.
    // NOTE(review): a smaller shift gives more gain if the signal is too
    // quiet; confirm against the microphone's data format.
    inference.buffers[inference.buf_select][inference.buf_count++] = (signed short)(samples[i] >> 16);

    if (inference.buf_count >= inference.n_samples) {
      inference.buf_select ^= 1;
      inference.buf_count = 0;
      inference.buf_ready = 1;
    }
  }
}

//Edge Impulse Initialize Inferencing Structure Function-----------------------------------------------------------------------------

static bool microphone_inference_start(uint32_t n_samples)
{
inference.buffers[0] = (signed short *)malloc(n_samples * sizeof(signed short));

if (inference.buffers[0] == NULL) {
    return false;
}

inference.buffers[1] = (signed short *)malloc(n_samples * sizeof(signed short));

if (inference.buffers[1] == NULL) {
    free(inference.buffers[0]);
    return false;
}

sampleBuffer = (signed short *)malloc((n_samples >> 1) * sizeof(signed short));

if (sampleBuffer == NULL) {
    free(inference.buffers[0]);
    free(inference.buffers[1]);
    return false;
}

inference.buf_select = 0;
inference.buf_count = 0;
inference.n_samples = n_samples;
inference.buf_ready = 0;

// configure the data receive callback
pdm_data_ready_inference_callback();

record_ready = true;

return true;
}

Hello @JVanlla,

Could you share your compilation issues?

Regards,

Louis

@JVanlla Hi!
We’re working on official integration of ESP32 with Edge Impulse. Continuous inference of audio signal is going to be a part of firmware. However, while it is still WIP, do you mind sharing the code compilation output, so we could see what the problem is?

Hi! Since my last post I have been able to get some help. The code below uses both cores to transfer and process the data; however, I am having a hard time combining it onto one core, as I need the second core for other functions. Will the support for ESP32 come as an Arduino library and .ino file? If you have any suggestions on how I can convert this to single-core without increasing latency, I would appreciate it. The code is shown below:

#include <driver/i2s.h>
#include <Sylo_inferencing.h>
 
// Forward declaration of the microphone capture task started from setup().
void TaskMic( void *pvParameters );
 
 
// Hand-rolled busy flag shared by loop() and TaskMic(): each side spins while
// it is true and sets it while it uses `features`.
// NOTE(review): this is a plain bool, so the check-then-set is not atomic
// between the two tasks.
bool semaphore = false;
 
// Sliding window of the most recent 16000 samples (1 s at the configured
// 16 kHz); written by TaskMic() and read by the classifier.
float features[16000];
//float features_mic[16000];
 
// Signal callback for the classifier: copies `length` floats starting at
// `offset` in the global `features` window into `out_ptr`. Always returns 0.
int raw_feature_get_data(size_t offset, size_t length, float *out_ptr) {
  const float *window_start = &features[offset];
  const size_t byte_count = sizeof(float) * length;
  memcpy(out_ptr, window_start, byte_count);
  return 0;
}
 
 
// printf-style logging hook: formats into a static 1 KiB buffer and writes
// the result to Serial. Nothing is written when formatting yields no output.
void ei_printf(const char *format, ...) {
  static char print_buf[1024] = { 0 };

  va_list arg_list;
  va_start(arg_list, format);
  const int formatted_len = vsnprintf(print_buf, sizeof(print_buf), format, arg_list);
  va_end(arg_list);

  if (formatted_len <= 0) {
    return;
  }
  Serial.write(print_buf);
}
 
 
// I2S peripheral instance used for the driver install, pin setup and reads.
const i2s_port_t I2S_PORT = I2S_NUM_0;
// Samples per DMA buffer and per i2s_read_bytes() call
// (1000 samples = 62.5 ms at the configured 16 kHz).
const int BLOCK_SIZE = 1000;
 
void setup() {
  Serial.begin(115200);
 
  xTaskCreatePinnedToCore(
    TaskMic
    ,  "TaskMic"   // A name just for humans
    ,  10024  // This stack size can be checked & adjusted by reading the Stack Highwater
    ,  NULL
    ,  2  // Priority, with 3 (configMAX_PRIORITIES - 1) being the highest, and 0 being the lowest.
    ,  NULL
    ,  0);
}
 
 
// Main loop: takes the `semaphore` flag, classifies the current `features`
// window, releases the flag, then prints the results.
//
// Fixes over the previous version:
//  * The flag is released on every path. Previously both early returns left
//    `semaphore` set, after which loop() and TaskMic() both waited forever.
//  * The flag is held only while run_classifier() reads `features`, not during
//    the serial printing, so TaskMic() is blocked for a shorter time and less
//    audio backs up.
// NOTE(review): `semaphore` is still a plain bool whose check-then-set is not
// atomic between the two tasks; a FreeRTOS mutex would close that gap.
void loop() {

  // Compares sizes only and touches no shared data, so it runs before the
  // flag is taken.
  if (sizeof(features) / sizeof(float) != EI_CLASSIFIER_DSP_INPUT_FRAME_SIZE) {
    ei_printf("The size of your 'features' array is not correct. Expected %lu items, but had %lu\n",
              EI_CLASSIFIER_DSP_INPUT_FRAME_SIZE, sizeof(features) / sizeof(float));
    delay(1000);
    return;
  }

  // Wait until TaskMic() is not updating the window, then claim it.
  while (semaphore) {
    vTaskDelay(1);
  }
  semaphore = true;

  ei_impulse_result_t result = { 0 };

  // The classifier pulls its input from `features` through the callback.
  signal_t features_signal;
  features_signal.total_length = sizeof(features) / sizeof(features[0]);
  features_signal.get_data = &raw_feature_get_data;

  // invoke the impulse
  EI_IMPULSE_ERROR res = run_classifier(&features_signal, &result, false /* debug */);

  // `features` is not read past this point; let the capture task continue.
  semaphore = false;

  // Find the highest-scoring label above the 0.80 threshold, if any.
  int ind = -1;
  float maxx = 0.0;

  for (int in = 0; in < EI_CLASSIFIER_LABEL_COUNT; in++) {
    if (result.classification[in].value > maxx && result.classification[in].value > 0.80) {
      maxx = result.classification[in].value;
      ind = in;
    }
  }

  if (ind >= 0) {
    String to_prnt = "";

    to_prnt += result.classification[ind].label;
    to_prnt += "  ";
    to_prnt += result.classification[ind].value;

    Serial.println(to_prnt);
  }

  ei_printf("run_classifier returned: %d\n", res);

  if (res != 0) return;

  // print the predictions
  ei_printf("Predictions ");
  ei_printf("(DSP: %d ms., Classification: %d ms., Anomaly: %d ms.)",
            result.timing.dsp, result.timing.classification, result.timing.anomaly);
  ei_printf(": \n");
  ei_printf("[");
  for (size_t ix = 0; ix < EI_CLASSIFIER_LABEL_COUNT; ix++) {
    ei_printf("%.5f", result.classification[ix].value);
#if EI_CLASSIFIER_HAS_ANOMALY == 1
    ei_printf(", ");
#else
    if (ix != EI_CLASSIFIER_LABEL_COUNT - 1) {
      ei_printf(", ");
    }
#endif
  }
#if EI_CLASSIFIER_HAS_ANOMALY == 1
  ei_printf("%.3f", result.anomaly);
#endif
  ei_printf("]\n");

  // human-readable predictions
  for (size_t ix = 0; ix < EI_CLASSIFIER_LABEL_COUNT; ix++) {
    ei_printf("    %s: %.5f\n", result.classification[ix].label, result.classification[ix].value);
  }
#if EI_CLASSIFIER_HAS_ANOMALY == 1
  ei_printf("    anomaly score: %.3f\n", result.anomaly);
#endif

  // Label index 2 is the wake word in this model.
  if (result.classification[2].value > 0.8) {
    Serial.println("Sylo Was Heard");
  }

  delay(10);
}
 
 
void TaskMic(void *pvParameters) { // This is a task.
 
  Serial.println("Configuring I2S...");
  esp_err_t err;
 
  // The I2S config as per the example
  const i2s_config_t i2s_config = {
    .mode = i2s_mode_t(I2S_MODE_MASTER | I2S_MODE_RX), // Receive, not transfer
    .sample_rate = 16000,                         // 16KHz
    .bits_per_sample = I2S_BITS_PER_SAMPLE_32BIT, // could only get it to work with 32bits
    .channel_format = I2S_CHANNEL_FMT_ONLY_LEFT, // although the SEL config should be left, it seems to transmit on right
    .communication_format = i2s_comm_format_t(I2S_COMM_FORMAT_I2S | I2S_COMM_FORMAT_I2S_MSB),
    .intr_alloc_flags = ESP_INTR_FLAG_LEVEL1,     // Interrupt level 1
    .dma_buf_count = 10,                           // number of buffers
    .dma_buf_len = BLOCK_SIZE                     // samples per buffer
  };
 
  // The pin config as per the setup
  const i2s_pin_config_t pin_config = {
    .bck_io_num = 14,   // BCKL
    .ws_io_num = 15,    // LRCL
    .data_out_num = -1, // not used (only for speakers)
    .data_in_num = 32   // DOUT
  };
 
  // Configuring the I2S driver and pins.
  // This function must be called before any I2S driver read/write operations.
  err = i2s_driver_install(I2S_PORT, &i2s_config, 0, NULL);
  if (err != ESP_OK) {
    Serial.printf("Failed installing driver: %d\n", err);
    while (true);
  }
  err = i2s_set_pin(I2S_PORT, &pin_config);
  if (err != ESP_OK) {
    Serial.printf("Failed setting pin: %d\n", err);
    while (true);
  }
  Serial.println("I2S driver installed.");

 
  for (;;) {
 
    // Read multiple samples at once and calculate the sound pressure
    int32_t samples[BLOCK_SIZE];
    int num_bytes_read = i2s_read_bytes(I2S_PORT,
                                        (char *)samples,
                                        BLOCK_SIZE * 4,     // the doc says bytes, but its elements.
                                        portMAX_DELAY); // no timeout
 
    int samples_read = num_bytes_read / 8;
 
    while (semaphore) {
      vTaskDelay(1);
    }
    semaphore = true;
 
    for (int i = 0; i < 16000; i++) {
      if (i < 15000) {
        features[i] = features[i + 1000];
 
      } else {
        features[i] = (float)samples[i - 15000];
      }
 
    }
 
    semaphore = false;
 
    vTaskDelay(1);
 
  }
 
 
}

Hello!
The EI firmware has been officially released. For pointers on the continuous audio inference implementation, you can have a look here

and here

You don’t need to explicitly bind the task to a second core, FreeRTOS (SMP) handles that for you on a dual-core ESP32.
Hope that was helpful!

1 Like