Running Inference on M5Core2 for Audio Classification

I am using the sample project with code from record.ino. I can speak and listen to my voice when I use the microphonedata0 variable in the startRec() function in loop(). But I can't figure out how to use sampleBuffer. My code follows. Could someone please help me get it working on the M5Core2? Thanks

#define EIDSP_QUANTIZE_FILTERBANK   0

#include <Tutorial_Responding_to_your_voice_inferencing.h>
#include <M5Core2.h>
#include <driver/i2s.h>

#define CONFIG_I2S_BCK_PIN 12
#define CONFIG_I2S_LRCK_PIN 0
#define CONFIG_I2S_DATA_PIN 2
#define CONFIG_I2S_DATA_IN_PIN 34

#define Speak_I2S_NUMBER I2S_NUM_0

#define MODE_MIC 0
#define MODE_SPK 1
#define DATA_SIZE 1024

//uint8_t microphonedata0[1024 * 50];
int data_offset = 0;

/** Audio buffers, pointers and selectors */
typedef struct {
    int16_t *buffer;
    uint8_t buf_ready;
    uint32_t buf_count;
    uint32_t n_samples;
} inference_t;

static inference_t inference;
static signed short sampleBuffer[2048];
static bool debug_nn = false; // Set this to true to see e.g. features generated from the raw signal

bool InitI2SSpeakOrMic(int mode)
{
    esp_err_t err = ESP_OK;
    i2s_driver_uninstall(Speak_I2S_NUMBER);
    i2s_config_t i2s_config = {
        .mode = (i2s_mode_t)(I2S_MODE_MASTER),
        .sample_rate = 44100,
        .bits_per_sample = I2S_BITS_PER_SAMPLE_16BIT, // is fixed at 12bit, stereo, MSB
        .channel_format = I2S_CHANNEL_FMT_ONLY_RIGHT,
        .communication_format = I2S_COMM_FORMAT_I2S,
        .intr_alloc_flags = ESP_INTR_FLAG_LEVEL1,
        .dma_buf_count = 2,
        .dma_buf_len = 128,
    };
    if (mode == MODE_MIC)
    {
        i2s_config.mode = (i2s_mode_t)(I2S_MODE_MASTER | I2S_MODE_RX | I2S_MODE_PDM);
    }
    else
    {
        i2s_config.mode = (i2s_mode_t)(I2S_MODE_MASTER | I2S_MODE_TX);
        i2s_config.use_apll = false;
        i2s_config.tx_desc_auto_clear = true;
    }
    err += i2s_driver_install(Speak_I2S_NUMBER, &i2s_config, 0, NULL);
    i2s_pin_config_t tx_pin_config;
    tx_pin_config.bck_io_num = CONFIG_I2S_BCK_PIN;
    tx_pin_config.ws_io_num = CONFIG_I2S_LRCK_PIN;
    tx_pin_config.data_out_num = CONFIG_I2S_DATA_PIN;
    tx_pin_config.data_in_num = CONFIG_I2S_DATA_IN_PIN;
    err += i2s_set_pin(Speak_I2S_NUMBER, &tx_pin_config);
    err += i2s_set_clk(Speak_I2S_NUMBER, 44100, I2S_BITS_PER_SAMPLE_16BIT, I2S_CHANNEL_MONO);
    return true;
}

void DisplayInit(void)
{
  M5.Lcd.fillScreen(WHITE);
  M5.Lcd.setTextColor(BLACK);
  M5.Lcd.setTextSize(2);
}

void setup() {
 
  // put your setup code here, to run once:
    //Serial.begin(115200);

    M5.begin(true, true, true, true);
    M5.Axp.SetSpkEnable(true);
    DisplayInit();
    M5.Lcd.setTextColor(RED);
    M5.Lcd.setCursor(10, 10);
    M5.Lcd.printf("Recorder!");
    M5.Lcd.setTextColor(BLACK);
    M5.Lcd.setCursor(10, 26);
    M5.Lcd.printf("Edge Impulse Demo");
    delay(100);

    Serial.println("Edge Impulse Inferencing Demo");

    // summary of inferencing settings (from model_metadata.h)
    ei_printf("Inferencing settings:\n");
    ei_printf("\tInterval: %.2f ms.\n", (float)EI_CLASSIFIER_INTERVAL_MS);
    ei_printf("\tFrame size: %d\n", EI_CLASSIFIER_DSP_INPUT_FRAME_SIZE);
    ei_printf("\tSample length: %d ms.\n", EI_CLASSIFIER_RAW_SAMPLE_COUNT / 16);
    ei_printf("\tNo. of classes: %d\n", sizeof(ei_classifier_inferencing_categories) / sizeof(ei_classifier_inferencing_categories[0]));


    M5.Lcd.setCursor(10, 56);
    M5.Lcd.printf("Inferencing setup");
    
    if (microphone_inference_start(EI_CLASSIFIER_RAW_SAMPLE_COUNT) == false) {
        ei_printf("ERR: Failed to setup audio sampling\r\n");
        return;
    }
}

void loop() {


   ei_printf("Starting inferencing in 2 seconds...\n");

    delay(2000);

    ei_printf("Recording...\n");

    bool m = microphone_inference_record();
    if (!m) {
        ei_printf("ERR: Failed to record audio...\n");
        return;
    }

    ei_printf("Recording done\n");

    signal_t signal;
    signal.total_length = EI_CLASSIFIER_RAW_SAMPLE_COUNT;
    signal.get_data = &microphone_audio_signal_get_data;
    ei_impulse_result_t result = { 0 };

    EI_IMPULSE_ERROR r = run_classifier(&signal, &result, debug_nn);
    if (r != EI_IMPULSE_OK) {
        ei_printf("ERR: Failed to run classifier (%d)\n", r);
        return;
    }

    // print the predictions
    ei_printf("Predictions ");
    ei_printf("(DSP: %d ms., Classification: %d ms., Anomaly: %d ms.)",
        result.timing.dsp, result.timing.classification, result.timing.anomaly);
    M5.Lcd.setCursor(10, 116);
    M5.Lcd.printf("(DSP: %d ms., Classification: %d ms., Anomaly: %d ms.)",
        result.timing.dsp, result.timing.classification, result.timing.anomaly);
    ei_printf(": \n");
    for (size_t ix = 0; ix < EI_CLASSIFIER_LABEL_COUNT; ix++) {
        ei_printf("    %s: %.5f\n", result.classification[ix].label, result.classification[ix].value);
    }
#if EI_CLASSIFIER_HAS_ANOMALY == 1
    ei_printf("    anomaly score: %.3f\n", result.anomaly);
    M5.Lcd.setCursor(10, 116);
    M5.Lcd.printf("anomaly score: %.3f\n", result.anomaly);
#endif
  
}


static bool microphone_inference_start(uint32_t n_samples)
{
    inference.buffer = (int16_t *)malloc(n_samples * sizeof(int16_t));

    if(inference.buffer == NULL) {
        return false;
    }

    inference.buf_count  = 0;
    inference.n_samples  = n_samples;
    inference.buf_ready  = 0;
 
    pdm_data_ready_inference_callback();
    microphone_inference_end();
    return true;
}



static void pdm_data_ready_inference_callback(void)
{
    
    // read into the sample buffer
    int bytesRead = startRec();

    if (inference.buf_ready == 0) {
        for(int i = 0; i < bytesRead>>1; i++) {
            inference.buffer[inference.buf_count++] = sampleBuffer[i];

            if(inference.buf_count >= inference.n_samples) {
                inference.buf_count = 0;
                inference.buf_ready = 1;
                break;
            }
        }
    }
}


static void microphone_inference_end(void)
{
   
    free(inference.buffer);
}


int startRec() {
    TouchPoint_t pos = M5.Touch.getPressPoint();

    size_t byte_read;

    M5.Lcd.setCursor(10, 76);
    M5.Lcd.printf("Starting recording...!");
    M5.Axp.SetLDOEnable(3, true);
    delay(1000);
    M5.Axp.SetLDOEnable(3, false);
    data_offset = 0;
    InitI2SSpeakOrMic(MODE_MIC);

    if (M5.Touch.ispressed() == true) {
        i2s_read(Speak_I2S_NUMBER, (char *)(sampleBuffer), DATA_SIZE, &byte_read, (100 / portTICK_RATE_MS));
    }
    delay(1000);

    /*while(1){
        i2s_read(Speak_I2S_NUMBER, (char *)(microphonedata0 + data_offset), DATA_SIZE, &byte_read, (100 / portTICK_RATE_MS));
        data_offset += 1024;
        if(data_offset == 1024 * 50 || M5.Touch.ispressed() != true)
            break;
    }
    */
    size_t bytes_written;
    InitI2SSpeakOrMic(MODE_SPK);
    i2s_write(Speak_I2S_NUMBER, sampleBuffer, data_offset, &bytes_written, portMAX_DELAY);

    M5.Lcd.setCursor(10, 96);
    M5.Lcd.printf("Ending recording...!");

    return byte_read;
}

static bool microphone_inference_record(void)
{
    inference.buf_ready = 0;
    inference.buf_count = 0;

    while(inference.buf_ready == 0) {
        delay(10);
    }

    return true;
}

static int microphone_audio_signal_get_data(size_t offset, size_t length, float *out_ptr)
{
    numpy::int16_to_float(&inference.buffer[offset], out_ptr, length);

    return 0;
}

#if !defined(EI_CLASSIFIER_SENSOR) || EI_CLASSIFIER_SENSOR != EI_CLASSIFIER_SENSOR_MICROPHONE
#error "Invalid model for current sensor."
#endif

Hi @timothy.malche,
The EI audio examples use a callback, pdm_data_ready_inference_callback(), that is called periodically when an audio buffer is full. The M5Core2 example calls the audio driver directly and waits for the buffer to be filled. I've modified your code to handle the audio data using the M5Core2 driver, to give you an idea of how it should work, although I couldn't actually test it.

Hopefully this works for you.
Arjan

#define EIDSP_QUANTIZE_FILTERBANK   0

#include <Tutorial_Responding_to_your_voice_inferencing.h>
#include <M5Core2.h>
#include <driver/i2s.h>

#define CONFIG_I2S_BCK_PIN 12
#define CONFIG_I2S_LRCK_PIN 0
#define CONFIG_I2S_DATA_PIN 2
#define CONFIG_I2S_DATA_IN_PIN 34

#define Speak_I2S_NUMBER I2S_NUM_0

#define MODE_MIC 0
#define MODE_SPK 1
#define DATA_SIZE 1024

//uint8_t microphonedata0[1024 * 50];
int data_offset = 0;

/** Audio buffers, pointers and selectors */
typedef struct {
    int16_t *buffer;
    uint8_t buf_ready;
    uint32_t buf_count;
    uint32_t n_samples;
} inference_t;

static inference_t inference;
static signed short sampleBuffer[2048];
static bool debug_nn = false; // Set this to true to see e.g. features generated from the raw signal

bool InitI2SSpeakOrMic(int mode)
{
    esp_err_t err = ESP_OK;
    i2s_driver_uninstall(Speak_I2S_NUMBER);
    i2s_config_t i2s_config = {
        .mode = (i2s_mode_t)(I2S_MODE_MASTER),
        .sample_rate = 44100,
        .bits_per_sample = I2S_BITS_PER_SAMPLE_16BIT, // is fixed at 12bit, stereo, MSB
        .channel_format = I2S_CHANNEL_FMT_ONLY_RIGHT,
        .communication_format = I2S_COMM_FORMAT_I2S,
        .intr_alloc_flags = ESP_INTR_FLAG_LEVEL1,
        .dma_buf_count = 2,
        .dma_buf_len = 128,
    };
    if (mode == MODE_MIC)
    {
        i2s_config.mode = (i2s_mode_t)(I2S_MODE_MASTER | I2S_MODE_RX | I2S_MODE_PDM);
    }
    else
    {
        i2s_config.mode = (i2s_mode_t)(I2S_MODE_MASTER | I2S_MODE_TX);
        i2s_config.use_apll = false;
        i2s_config.tx_desc_auto_clear = true;
    }
    err += i2s_driver_install(Speak_I2S_NUMBER, &i2s_config, 0, NULL);
    i2s_pin_config_t tx_pin_config;
    tx_pin_config.bck_io_num = CONFIG_I2S_BCK_PIN;
    tx_pin_config.ws_io_num = CONFIG_I2S_LRCK_PIN;
    tx_pin_config.data_out_num = CONFIG_I2S_DATA_PIN;
    tx_pin_config.data_in_num = CONFIG_I2S_DATA_IN_PIN;
    err += i2s_set_pin(Speak_I2S_NUMBER, &tx_pin_config);
    err += i2s_set_clk(Speak_I2S_NUMBER, 44100, I2S_BITS_PER_SAMPLE_16BIT, I2S_CHANNEL_MONO);
    return true;
}

void DisplayInit(void)
{
  M5.Lcd.fillScreen(WHITE);
  M5.Lcd.setTextColor(BLACK);
  M5.Lcd.setTextSize(2);
}

void setup() {
 
  // put your setup code here, to run once:
    //Serial.begin(115200);

    M5.begin(true, true, true, true);
    M5.Axp.SetSpkEnable(true);
    DisplayInit();
    M5.Lcd.setTextColor(RED);
    M5.Lcd.setCursor(10, 10);
    M5.Lcd.printf("Recorder!");
    M5.Lcd.setTextColor(BLACK);
    M5.Lcd.setCursor(10, 26);
    M5.Lcd.printf("Edge Impulse Demo");
    delay(100);

    Serial.println("Edge Impulse Inferencing Demo");

    // summary of inferencing settings (from model_metadata.h)
    ei_printf("Inferencing settings:\n");
    ei_printf("\tInterval: %.2f ms.\n", (float)EI_CLASSIFIER_INTERVAL_MS);
    ei_printf("\tFrame size: %d\n", EI_CLASSIFIER_DSP_INPUT_FRAME_SIZE);
    ei_printf("\tSample length: %d ms.\n", EI_CLASSIFIER_RAW_SAMPLE_COUNT / 16);
    ei_printf("\tNo. of classes: %d\n", sizeof(ei_classifier_inferencing_categories) / sizeof(ei_classifier_inferencing_categories[0]));


    M5.Lcd.setCursor(10, 56);
    M5.Lcd.printf("Inferencing setup");
    
    if (microphone_inference_start(EI_CLASSIFIER_RAW_SAMPLE_COUNT) == false) {
        ei_printf("ERR: Failed to setup audio sampling\r\n");
        return;
    }
}

void loop() {


   ei_printf("Starting inferencing in 2 seconds...\n");

    delay(2000);

    ei_printf("Recording...\n");

    bool m = microphone_inference_record();
    if (!m) {
        ei_printf("ERR: Failed to record audio...\n");
        return;
    }

    ei_printf("Recording done\n");

    signal_t signal;
    signal.total_length = EI_CLASSIFIER_RAW_SAMPLE_COUNT;
    signal.get_data = &microphone_audio_signal_get_data;
    ei_impulse_result_t result = { 0 };

    EI_IMPULSE_ERROR r = run_classifier(&signal, &result, debug_nn);
    if (r != EI_IMPULSE_OK) {
        ei_printf("ERR: Failed to run classifier (%d)\n", r);
        return;
    }

    // print the predictions
    ei_printf("Predictions ");
    ei_printf("(DSP: %d ms., Classification: %d ms., Anomaly: %d ms.)",
        result.timing.dsp, result.timing.classification, result.timing.anomaly);
    M5.Lcd.setCursor(10, 116);
    M5.Lcd.printf("(DSP: %d ms., Classification: %d ms., Anomaly: %d ms.)",
        result.timing.dsp, result.timing.classification, result.timing.anomaly);
    ei_printf(": \n");
    for (size_t ix = 0; ix < EI_CLASSIFIER_LABEL_COUNT; ix++) {
        ei_printf("    %s: %.5f\n", result.classification[ix].label, result.classification[ix].value);
    }
#if EI_CLASSIFIER_HAS_ANOMALY == 1
    ei_printf("    anomaly score: %.3f\n", result.anomaly);
    M5.Lcd.setCursor(10, 116);
    M5.Lcd.printf("anomaly score: %.3f\n", result.anomaly);
#endif
  
}


static bool microphone_inference_start(uint32_t n_samples)
{
    inference.buffer = (int16_t *)malloc(n_samples * sizeof(int16_t));

    if(inference.buffer == NULL) {
        return false;
    }

    inference.buf_count  = 0;
    inference.n_samples  = n_samples;
    inference.buf_ready  = 0;
 
    return true;
}



static void pdm_data_ready_inference_callback(void)
{
    
    // read into the sample buffer
    size_t bytesRead = 0; // i2s_read() expects a size_t*, not an int*
    i2s_read(Speak_I2S_NUMBER, (char *)(sampleBuffer), DATA_SIZE, &bytesRead, (100 / portTICK_RATE_MS));

    if (inference.buf_ready == 0) {
        for(int i = 0; i < bytesRead>>1; i++) {
            inference.buffer[inference.buf_count++] = sampleBuffer[i];

            if(inference.buf_count >= inference.n_samples) {
                inference.buf_count = 0;
                inference.buf_ready = 1;
                break;
            }
        }
    }
}


static void microphone_inference_end(void)
{
   
    free(inference.buffer);
}


int startRec() {
    TouchPoint_t pos = M5.Touch.getPressPoint();

    size_t byte_read;

    M5.Lcd.setCursor(10, 76);
    M5.Lcd.printf("Starting recording...!");
    M5.Axp.SetLDOEnable(3, true);
    delay(1000);
    M5.Axp.SetLDOEnable(3, false);
    data_offset = 0;
    InitI2SSpeakOrMic(MODE_MIC);

    if (M5.Touch.ispressed() == true) {
        i2s_read(Speak_I2S_NUMBER, (char *)(sampleBuffer), DATA_SIZE, &byte_read, (100 / portTICK_RATE_MS));
    }
    delay(1000);

    /*while(1){
        i2s_read(Speak_I2S_NUMBER, (char *)(microphonedata0 + data_offset), DATA_SIZE, &byte_read, (100 / portTICK_RATE_MS));
        data_offset += 1024;
        if(data_offset == 1024 * 50 || M5.Touch.ispressed() != true)
            break;
    }
    */
    size_t bytes_written;
    InitI2SSpeakOrMic(MODE_SPK);
    i2s_write(Speak_I2S_NUMBER, sampleBuffer, data_offset, &bytes_written, portMAX_DELAY);

    M5.Lcd.setCursor(10, 96);
    M5.Lcd.printf("Ending recording...!");

    return byte_read;
}

static bool microphone_inference_record(void)
{
    inference.buf_ready = 0;
    inference.buf_count = 0;

    while (inference.buf_ready == 0) {
        pdm_data_ready_inference_callback();
    }

    inference.buf_ready = 0;
    return true;
}

static int microphone_audio_signal_get_data(size_t offset, size_t length, float *out_ptr)
{
    numpy::int16_to_float(&inference.buffer[offset], out_ptr, length);

    return 0;
}

#if !defined(EI_CLASSIFIER_SENSOR) || EI_CLASSIFIER_SENSOR != EI_CLASSIFIER_SENSOR_MICROPHONE
#error "Invalid model for current sensor."
#endif

Hi @Arjan, thanks. I'll try it and check. However, per the suggestions given by an engineer from M5Stack, I was asked to fix the following:

  1. The sampleBuffer array size is too small; it should be 1024 * 100 or bigger.
  2. For i2s_read, the buffer pointer position should be advanced gradually as data comes in. Please refer to our reference.

As per suggestion point 1, when I declare sampleBuffer[1024 * 100] the code does not compile (presumably because so large a static array no longer fits in the ESP32's available data RAM); however, when I declare sampleBuffer[1024 * 90], it compiles.
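
(As an aside, if a buffer that large is really needed, one workaround, untested and not from the M5Stack reference, is to allocate it at runtime instead of declaring it as a static array, e.g. from PSRAM on boards like the Core2 that have it:)

// Untested sketch: bigSampleBuffer and allocateSampleBuffer are hypothetical
// names; heap_caps_malloc() is the standard ESP-IDF capability-aware allocator.
#include <esp_heap_caps.h>

static int16_t *bigSampleBuffer = NULL;

void allocateSampleBuffer(void) {
    // MALLOC_CAP_SPIRAM places the buffer in PSRAM (the Core2 has 8 MB);
    // use MALLOC_CAP_8BIT instead for internal RAM on boards without PSRAM.
    bigSampleBuffer = (int16_t *)heap_caps_malloc(
        1024 * 100 * sizeof(int16_t), MALLOC_CAP_SPIRAM);
}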

As per suggestion point 2, my code should be as follows:

int startRec(){
.
.
.
    while (1) {
        i2s_read(Speak_I2S_NUMBER, (char *)(sampleBuffer + data_offset), DATA_SIZE, &byte_read, (100 / portTICK_RATE_MS));
        data_offset += 1024;
        if (data_offset == 1024 * 90 || M5.Touch.ispressed() != true)
            break;
    }
.
.
.
}

So should I replace the code you suggested with the above code? Please advise.

Hard to tell without knowing exactly what the i2s_read() function does. But in the example, a buffer is filled on each iteration with a block of sample data until the buffer is full. The same happens in the pdm_data_ready_inference_callback() function, where the samples are placed into inference.buffer. So a sampleBuffer of 1024 bytes should be sufficient.
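
To illustrate, here is a minimal, untested sketch of that fill loop, using the names from the code above (sampleBuffer, inference, Speak_I2S_NUMBER, DATA_SIZE). It is essentially what pdm_data_ready_inference_callback() already does, written as one blocking loop:

// Untested sketch: repeatedly read one DATA_SIZE block into the small
// sampleBuffer and append it to inference.buffer until n_samples are collected.
static void fill_inference_buffer(void)
{
    size_t bytes_read = 0;
    while (inference.buf_ready == 0) {
        i2s_read(Speak_I2S_NUMBER, (char *)sampleBuffer, DATA_SIZE,
                 &bytes_read, 100 / portTICK_RATE_MS);
        for (size_t i = 0; i < bytes_read / 2; i++) { // 2 bytes per int16_t sample
            inference.buffer[inference.buf_count++] = sampleBuffer[i];
            if (inference.buf_count >= inference.n_samples) {
                inference.buf_count = 0;
                inference.buf_ready = 1;
                break;
            }
        }
    }
}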


Thanks @Arjan for the explanation.


Hi @timothy.malche, did you manage to get this working? Did your classification results come back accurate?

I'm trying to do edge inferencing for audio classification on the M5Core2 too and ran into the same struggle, with the examples using either PDM or static data. I'm trying to convert the I2S audio data into the format the inference code can take, and Arjan's example looks very promising.

I got it working after some minor tweaking, but the inference output always shows one category (with 90+% confidence), despite testing the same noises simultaneously on the Edge Impulse web capture, which gives quite accurate results.

All good now. Inference is detecting correctly after a few more minor tweaks. The sample code you guys posted above really helped. Thanks, guys!


Hi @Wsantoso, good to know that you got it tweaked and working :wink:
Thanks for letting us know. All the best.

Hi @Arjan, for whatever reason the above suggestions did not work for me. I am using the M5Stack AtomU microphone, and I am trying to use an Edge Impulse example to detect water flow. But I always get an inference of 1 or 0.99 when using M5Stack boards (StickC, StickC Plus, or AtomU). If I try the same examples using an ESP32 DevKit (with an INMP441 mic) or an ESP-EYE (with its built-in mic), I get correct results and good water-flow detection. I tried to follow all the suggestions above, and I'm unsure what I am doing wrong. Do you have any pointers or things to check for? Thanks, I appreciate it.

We have official firmware for the ESP-EYE board; here is the code for the microphone.

You can use it as a reference. Of course, the microphone config needs to match that of the board you are actually using.


Hi @AIWintermuteAI,
Can we run that code on an ESP32 DevKit with an INMP441 mic directly? Or is there any line I should take care to change?

The firmware only supports the ESP-EYE microphone configuration; for all other boards you need to change the microphone config. The best way to do that is to look at a microphone config example for your board and then modify the config in ei_microphone.cpp.
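
For illustration only, an INMP441 on a plain ESP32 DevKit typically needs a standard I2S (non-PDM) configuration along these lines. This is an untested sketch: the GPIO numbers are placeholders for whatever pins you actually wired, and the 32-bit frames must be shifted down to 16-bit samples before being fed to the classifier:

// Untested sketch of an INMP441 mic config (standard I2S, not PDM).
// Pin numbers are placeholders; match them to your wiring.
i2s_config_t i2s_config = {
    .mode = (i2s_mode_t)(I2S_MODE_MASTER | I2S_MODE_RX),
    .sample_rate = 16000,                         // match your impulse's sampling rate
    .bits_per_sample = I2S_BITS_PER_SAMPLE_32BIT, // INMP441 sends 24-bit data in 32-bit frames
    .channel_format = I2S_CHANNEL_FMT_ONLY_LEFT,  // with the mic's L/R pin tied to GND
    .communication_format = I2S_COMM_FORMAT_I2S,
    .intr_alloc_flags = ESP_INTR_FLAG_LEVEL1,
    .dma_buf_count = 8,
    .dma_buf_len = 512,
};
i2s_pin_config_t pin_config = {
    .bck_io_num = 26,                 // SCK (placeholder)
    .ws_io_num = 25,                  // WS  (placeholder)
    .data_out_num = I2S_PIN_NO_CHANGE,
    .data_in_num = 33,                // SD  (placeholder)
};
// Convert each 32-bit frame to a 16-bit sample, e.g.:
//   int16_t sample = (int16_t)(raw32 >> 16);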