Running Inference on M5Core2 for Audio Classification

I am using the sample project with code from record.ino. I can record and play back my voice when I use the microphonedata0 variable in the startRec() function in loop(). But I can’t figure out how to use sampleBuffer for inference. My code follows. If someone could please help me figure out how to get this working on the M5Core2, thanks.

#define EIDSP_QUANTIZE_FILTERBANK   0

#include <Tutorial_Responding_to_your_voice_inferencing.h>
#include <M5Core2.h>
#include <driver/i2s.h>

#define CONFIG_I2S_BCK_PIN 12
#define CONFIG_I2S_LRCK_PIN 0
#define CONFIG_I2S_DATA_PIN 2
#define CONFIG_I2S_DATA_IN_PIN 34

#define Speak_I2S_NUMBER I2S_NUM_0

#define MODE_MIC 0
#define MODE_SPK 1
#define DATA_SIZE 1024

//uint8_t microphonedata0[1024 * 50];
int data_offset = 0;

/** Audio buffers, pointers and selectors */
typedef struct {
    int16_t *buffer;
    uint8_t buf_ready;
    uint32_t buf_count;
    uint32_t n_samples;
} inference_t;

static inference_t inference;
static signed short sampleBuffer[2048];
static bool debug_nn = false; // Set this to true to see e.g. features generated from the raw signal

bool InitI2SSpeakOrMic(int mode)
{
    esp_err_t err = ESP_OK;
    i2s_driver_uninstall(Speak_I2S_NUMBER);
    i2s_config_t i2s_config = {
        .mode = (i2s_mode_t)(I2S_MODE_MASTER),
        .sample_rate = 44100,
        .bits_per_sample = I2S_BITS_PER_SAMPLE_16BIT, // is fixed at 12bit, stereo, MSB
        .channel_format = I2S_CHANNEL_FMT_ONLY_RIGHT,
        .communication_format = I2S_COMM_FORMAT_I2S,
        .intr_alloc_flags = ESP_INTR_FLAG_LEVEL1,
        .dma_buf_count = 2,
        .dma_buf_len = 128,
    };
    if (mode == MODE_MIC)
    {
        i2s_config.mode = (i2s_mode_t)(I2S_MODE_MASTER | I2S_MODE_RX | I2S_MODE_PDM);
    }
    else
    {
        i2s_config.mode = (i2s_mode_t)(I2S_MODE_MASTER | I2S_MODE_TX);
        i2s_config.use_apll = false;
        i2s_config.tx_desc_auto_clear = true;
    }
    err += i2s_driver_install(Speak_I2S_NUMBER, &i2s_config, 0, NULL);
    i2s_pin_config_t tx_pin_config;
    tx_pin_config.bck_io_num = CONFIG_I2S_BCK_PIN;
    tx_pin_config.ws_io_num = CONFIG_I2S_LRCK_PIN;
    tx_pin_config.data_out_num = CONFIG_I2S_DATA_PIN;
    tx_pin_config.data_in_num = CONFIG_I2S_DATA_IN_PIN;
    err += i2s_set_pin(Speak_I2S_NUMBER, &tx_pin_config);
    err += i2s_set_clk(Speak_I2S_NUMBER, 44100, I2S_BITS_PER_SAMPLE_16BIT, I2S_CHANNEL_MONO);
    return true;
}

void DisplayInit(void)
{
  M5.Lcd.fillScreen(WHITE);
  M5.Lcd.setTextColor(BLACK);
  M5.Lcd.setTextSize(2);
}

void setup() {
 
  // put your setup code here, to run once:
    //Serial.begin(115200);

    M5.begin(true, true, true, true);
    M5.Axp.SetSpkEnable(true);
    DisplayInit();
    M5.Lcd.setTextColor(RED);
    M5.Lcd.setCursor(10, 10);
    M5.Lcd.printf("Recorder!");
    M5.Lcd.setTextColor(BLACK);
    M5.Lcd.setCursor(10, 26);
    M5.Lcd.printf("Edge Impulse Demo");
    delay(100);

    Serial.println("Edge Impulse Inferencing Demo");

    // summary of inferencing settings (from model_metadata.h)
    ei_printf("Inferencing settings:\n");
    ei_printf("\tInterval: %.2f ms.\n", (float)EI_CLASSIFIER_INTERVAL_MS);
    ei_printf("\tFrame size: %d\n", EI_CLASSIFIER_DSP_INPUT_FRAME_SIZE);
    ei_printf("\tSample length: %d ms.\n", EI_CLASSIFIER_RAW_SAMPLE_COUNT / 16);
    ei_printf("\tNo. of classes: %d\n", sizeof(ei_classifier_inferencing_categories) / sizeof(ei_classifier_inferencing_categories[0]));


    M5.Lcd.setCursor(10, 56);
    M5.Lcd.printf("Inferencing setup");
    
    if (microphone_inference_start(EI_CLASSIFIER_RAW_SAMPLE_COUNT) == false) {
        ei_printf("ERR: Failed to setup audio sampling\r\n");
        return;
    }
}

void loop() {


   ei_printf("Starting inferencing in 2 seconds...\n");

    delay(2000);

    ei_printf("Recording...\n");

    bool m = microphone_inference_record();
    if (!m) {
        ei_printf("ERR: Failed to record audio...\n");
        return;
    }

    ei_printf("Recording done\n");

    signal_t signal;
    signal.total_length = EI_CLASSIFIER_RAW_SAMPLE_COUNT;
    signal.get_data = &microphone_audio_signal_get_data;
    ei_impulse_result_t result = { 0 };

    EI_IMPULSE_ERROR r = run_classifier(&signal, &result, debug_nn);
    if (r != EI_IMPULSE_OK) {
        ei_printf("ERR: Failed to run classifier (%d)\n", r);
        return;
    }

    // print the predictions
    ei_printf("Predictions ");
    ei_printf("(DSP: %d ms., Classification: %d ms., Anomaly: %d ms.)",
        result.timing.dsp, result.timing.classification, result.timing.anomaly);
    M5.Lcd.setCursor(10, 116);
    M5.Lcd.printf("(DSP: %d ms., Classification: %d ms., Anomaly: %d ms.)",
        result.timing.dsp, result.timing.classification, result.timing.anomaly);
    ei_printf(": \n");
    for (size_t ix = 0; ix < EI_CLASSIFIER_LABEL_COUNT; ix++) {
        ei_printf("    %s: %.5f\n", result.classification[ix].label, result.classification[ix].value);
    }
#if EI_CLASSIFIER_HAS_ANOMALY == 1
    ei_printf("    anomaly score: %.3f\n", result.anomaly);
    M5.Lcd.setCursor(10, 116);
    M5.Lcd.printf("anomaly score: %.3f\n", result.anomaly);
#endif
  
}


static bool microphone_inference_start(uint32_t n_samples)
{
    inference.buffer = (int16_t *)malloc(n_samples * sizeof(int16_t));

    if(inference.buffer == NULL) {
        return false;
    }

    inference.buf_count  = 0;
    inference.n_samples  = n_samples;
    inference.buf_ready  = 0;
 
    pdm_data_ready_inference_callback();
    microphone_inference_end();
    return true;
}



static void pdm_data_ready_inference_callback(void)
{
    
    // read into the sample buffer
    int bytesRead = startRec();

    if (inference.buf_ready == 0) {
        for(int i = 0; i < bytesRead>>1; i++) {
            inference.buffer[inference.buf_count++] = sampleBuffer[i];

            if(inference.buf_count >= inference.n_samples) {
                inference.buf_count = 0;
                inference.buf_ready = 1;
                break;
            }
        }
    }
}


static void microphone_inference_end(void)
{
   
    free(inference.buffer);
}


int startRec(){
TouchPoint_t pos= M5.Touch.getPressPoint();

        size_t byte_read;
  
        M5.Lcd.setCursor(10, 76);
        M5.Lcd.printf("Starting recording...!");
        M5.Axp.SetLDOEnable(3,true);
        delay(1000);
        M5.Axp.SetLDOEnable(3,false); 
        data_offset = 0;
        InitI2SSpeakOrMic(MODE_MIC);

        if(M5.Touch.ispressed() == true){
          i2s_read(Speak_I2S_NUMBER, (char *)(sampleBuffer), DATA_SIZE, &byte_read, (100 / portTICK_RATE_MS));
        }
        delay(1000);
        
        /*while(1){
            i2s_read(Speak_I2S_NUMBER, (char *)(microphonedata0 + data_offset), DATA_SIZE, &byte_read, (100 / portTICK_RATE_MS));
            data_offset += 1024;
                if(data_offset == 1024 * 50 || M5.Touch.ispressed() != true)
                  break;  
        }
        */
        size_t bytes_written;
        InitI2SSpeakOrMic(MODE_SPK);
        i2s_write(Speak_I2S_NUMBER, sampleBuffer, data_offset, &bytes_written, portMAX_DELAY);
    
    M5.Lcd.setCursor(10, 96);
    M5.Lcd.printf("Ending recording...!");

    return byte_read;
}

static bool microphone_inference_record(void)
{
    inference.buf_ready = 0;
    inference.buf_count = 0;

    while(inference.buf_ready == 0) {
        delay(10);
    }

    return true;
}

static int microphone_audio_signal_get_data(size_t offset, size_t length, float *out_ptr)
{
    numpy::int16_to_float(&inference.buffer[offset], out_ptr, length);

    return 0;
}

#if !defined(EI_CLASSIFIER_SENSOR) || EI_CLASSIFIER_SENSOR != EI_CLASSIFIER_SENSOR_MICROPHONE
#error "Invalid model for current sensor."
#endif

Hi @timothy.malche,
The EI audio examples use a callback, pdm_data_ready_inference_callback(), that is called periodically whenever an audio buffer is full. The M5Core2 example instead calls the audio driver directly and waits for the buffer to be filled. I’ve modified your code below to handle the audio data with the M5Core2 driver, to give you an idea of how it should work; note that I couldn’t actually test it.

Hopefully this works for you.
Arjan
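
Since the two listings are nearly identical, here is the gist of what changed, pulled out of the full listing below (same identifiers as the listing; a sketch, untested on hardware):

// microphone_inference_record() now pulls blocks from the I2S driver itself
// instead of waiting for a PDM interrupt:
static bool microphone_inference_record(void)
{
    inference.buf_ready = 0;
    inference.buf_count = 0;

    // keep polling until a full window of n_samples has been collected
    while (inference.buf_ready == 0) {
        pdm_data_ready_inference_callback();
    }
    return true;
}

// ...and each call of the "callback" does one i2s_read() and appends the
// block to inference.buffer:
static void pdm_data_ready_inference_callback(void)
{
    // one blocking read of up to DATA_SIZE bytes from the microphone
    size_t bytesRead = 0;
    i2s_read(Speak_I2S_NUMBER, (char *)sampleBuffer, DATA_SIZE, &bytesRead, (100 / portTICK_RATE_MS));

    // append the block to the inference window (two bytes per int16_t sample)
    for (int i = 0; i < (int)(bytesRead >> 1); i++) {
        inference.buffer[inference.buf_count++] = sampleBuffer[i];
        if (inference.buf_count >= inference.n_samples) {
            inference.buf_count = 0;
            inference.buf_ready = 1;   // full window ready for run_classifier()
            break;
        }
    }
}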

#define EIDSP_QUANTIZE_FILTERBANK   0

#include <Tutorial_Responding_to_your_voice_inferencing.h>
#include <M5Core2.h>
#include <driver/i2s.h>

#define CONFIG_I2S_BCK_PIN 12
#define CONFIG_I2S_LRCK_PIN 0
#define CONFIG_I2S_DATA_PIN 2
#define CONFIG_I2S_DATA_IN_PIN 34

#define Speak_I2S_NUMBER I2S_NUM_0

#define MODE_MIC 0
#define MODE_SPK 1
#define DATA_SIZE 1024

//uint8_t microphonedata0[1024 * 50];
int data_offset = 0;

/** Audio buffers, pointers and selectors */
typedef struct {
    int16_t *buffer;
    uint8_t buf_ready;
    uint32_t buf_count;
    uint32_t n_samples;
} inference_t;

static inference_t inference;
static signed short sampleBuffer[2048];
static bool debug_nn = false; // Set this to true to see e.g. features generated from the raw signal

bool InitI2SSpeakOrMic(int mode)
{
    esp_err_t err = ESP_OK;
    i2s_driver_uninstall(Speak_I2S_NUMBER);
    i2s_config_t i2s_config = {
        .mode = (i2s_mode_t)(I2S_MODE_MASTER),
        .sample_rate = 44100,
        .bits_per_sample = I2S_BITS_PER_SAMPLE_16BIT, // is fixed at 12bit, stereo, MSB
        .channel_format = I2S_CHANNEL_FMT_ONLY_RIGHT,
        .communication_format = I2S_COMM_FORMAT_I2S,
        .intr_alloc_flags = ESP_INTR_FLAG_LEVEL1,
        .dma_buf_count = 2,
        .dma_buf_len = 128,
    };
    if (mode == MODE_MIC)
    {
        i2s_config.mode = (i2s_mode_t)(I2S_MODE_MASTER | I2S_MODE_RX | I2S_MODE_PDM);
    }
    else
    {
        i2s_config.mode = (i2s_mode_t)(I2S_MODE_MASTER | I2S_MODE_TX);
        i2s_config.use_apll = false;
        i2s_config.tx_desc_auto_clear = true;
    }
    err += i2s_driver_install(Speak_I2S_NUMBER, &i2s_config, 0, NULL);
    i2s_pin_config_t tx_pin_config;
    tx_pin_config.bck_io_num = CONFIG_I2S_BCK_PIN;
    tx_pin_config.ws_io_num = CONFIG_I2S_LRCK_PIN;
    tx_pin_config.data_out_num = CONFIG_I2S_DATA_PIN;
    tx_pin_config.data_in_num = CONFIG_I2S_DATA_IN_PIN;
    err += i2s_set_pin(Speak_I2S_NUMBER, &tx_pin_config);
    err += i2s_set_clk(Speak_I2S_NUMBER, 44100, I2S_BITS_PER_SAMPLE_16BIT, I2S_CHANNEL_MONO);
    return true;
}

void DisplayInit(void)
{
  M5.Lcd.fillScreen(WHITE);
  M5.Lcd.setTextColor(BLACK);
  M5.Lcd.setTextSize(2);
}

void setup() {
 
  // put your setup code here, to run once:
    //Serial.begin(115200);

    M5.begin(true, true, true, true);
    M5.Axp.SetSpkEnable(true);
    DisplayInit();
    M5.Lcd.setTextColor(RED);
    M5.Lcd.setCursor(10, 10);
    M5.Lcd.printf("Recorder!");
    M5.Lcd.setTextColor(BLACK);
    M5.Lcd.setCursor(10, 26);
    M5.Lcd.printf("Edge Impulse Demo");
    delay(100);

    Serial.println("Edge Impulse Inferencing Demo");

    // summary of inferencing settings (from model_metadata.h)
    ei_printf("Inferencing settings:\n");
    ei_printf("\tInterval: %.2f ms.\n", (float)EI_CLASSIFIER_INTERVAL_MS);
    ei_printf("\tFrame size: %d\n", EI_CLASSIFIER_DSP_INPUT_FRAME_SIZE);
    ei_printf("\tSample length: %d ms.\n", EI_CLASSIFIER_RAW_SAMPLE_COUNT / 16);
    ei_printf("\tNo. of classes: %d\n", sizeof(ei_classifier_inferencing_categories) / sizeof(ei_classifier_inferencing_categories[0]));


    M5.Lcd.setCursor(10, 56);
    M5.Lcd.printf("Inferencing setup");
    
    if (microphone_inference_start(EI_CLASSIFIER_RAW_SAMPLE_COUNT) == false) {
        ei_printf("ERR: Failed to setup audio sampling\r\n");
        return;
    }
}

void loop() {


   ei_printf("Starting inferencing in 2 seconds...\n");

    delay(2000);

    ei_printf("Recording...\n");

    bool m = microphone_inference_record();
    if (!m) {
        ei_printf("ERR: Failed to record audio...\n");
        return;
    }

    ei_printf("Recording done\n");

    signal_t signal;
    signal.total_length = EI_CLASSIFIER_RAW_SAMPLE_COUNT;
    signal.get_data = &microphone_audio_signal_get_data;
    ei_impulse_result_t result = { 0 };

    EI_IMPULSE_ERROR r = run_classifier(&signal, &result, debug_nn);
    if (r != EI_IMPULSE_OK) {
        ei_printf("ERR: Failed to run classifier (%d)\n", r);
        return;
    }

    // print the predictions
    ei_printf("Predictions ");
    ei_printf("(DSP: %d ms., Classification: %d ms., Anomaly: %d ms.)",
        result.timing.dsp, result.timing.classification, result.timing.anomaly);
    M5.Lcd.setCursor(10, 116);
    M5.Lcd.printf("(DSP: %d ms., Classification: %d ms., Anomaly: %d ms.)",
        result.timing.dsp, result.timing.classification, result.timing.anomaly);
    ei_printf(": \n");
    for (size_t ix = 0; ix < EI_CLASSIFIER_LABEL_COUNT; ix++) {
        ei_printf("    %s: %.5f\n", result.classification[ix].label, result.classification[ix].value);
    }
#if EI_CLASSIFIER_HAS_ANOMALY == 1
    ei_printf("    anomaly score: %.3f\n", result.anomaly);
    M5.Lcd.setCursor(10, 116);
    M5.Lcd.printf("anomaly score: %.3f\n", result.anomaly);
#endif
  
}


static bool microphone_inference_start(uint32_t n_samples)
{
    inference.buffer = (int16_t *)malloc(n_samples * sizeof(int16_t));

    if(inference.buffer == NULL) {
        return false;
    }

    inference.buf_count  = 0;
    inference.n_samples  = n_samples;
    inference.buf_ready  = 0;
 
    return true;
}



static void pdm_data_ready_inference_callback(void)
{
    
    // read one block from the I2S driver into the sample buffer
    // (i2s_read() expects a size_t*, so bytesRead must be size_t, not int)
    size_t bytesRead = 0;
    i2s_read(Speak_I2S_NUMBER, (char *)sampleBuffer, DATA_SIZE, &bytesRead, (100 / portTICK_RATE_MS));

    if (inference.buf_ready == 0) {
        for (int i = 0; i < (int)(bytesRead >> 1); i++) {   // two bytes per int16_t sample
            inference.buffer[inference.buf_count++] = sampleBuffer[i];

            if(inference.buf_count >= inference.n_samples) {
                inference.buf_count = 0;
                inference.buf_ready = 1;
                break;
            }
        }
    }
}


static void microphone_inference_end(void)
{
   
    free(inference.buffer);
}


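// NOTE: startRec() below is no longer called by the inference path
// (pdm_data_ready_inference_callback() reads from the driver directly);
// it is kept here from the original sketch for reference.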
int startRec(){
TouchPoint_t pos= M5.Touch.getPressPoint();

        size_t byte_read;
  
        M5.Lcd.setCursor(10, 76);
        M5.Lcd.printf("Starting recording...!");
        M5.Axp.SetLDOEnable(3,true);
        delay(1000);
        M5.Axp.SetLDOEnable(3,false); 
        data_offset = 0;
        InitI2SSpeakOrMic(MODE_MIC);

        if(M5.Touch.ispressed() == true){
          i2s_read(Speak_I2S_NUMBER, (char *)(sampleBuffer), DATA_SIZE, &byte_read, (100 / portTICK_RATE_MS));
        }
        delay(1000);
        
        /*while(1){
            i2s_read(Speak_I2S_NUMBER, (char *)(microphonedata0 + data_offset), DATA_SIZE, &byte_read, (100 / portTICK_RATE_MS));
            data_offset += 1024;
                if(data_offset == 1024 * 50 || M5.Touch.ispressed() != true)
                  break;  
        }
        */
        size_t bytes_written;
        InitI2SSpeakOrMic(MODE_SPK);
        i2s_write(Speak_I2S_NUMBER, sampleBuffer, data_offset, &bytes_written, portMAX_DELAY);
    
    M5.Lcd.setCursor(10, 96);
    M5.Lcd.printf("Ending recording...!");

    return byte_read;
}

static bool microphone_inference_record(void)
{
    inference.buf_ready = 0;
    inference.buf_count = 0;

    while(inference.buf_ready == 0) {
        // poll the driver until a full window of samples has been collected
        pdm_data_ready_inference_callback();
    }

    inference.buf_ready = 0;
    return true;
}

static int microphone_audio_signal_get_data(size_t offset, size_t length, float *out_ptr)
{
    numpy::int16_to_float(&inference.buffer[offset], out_ptr, length);

    return 0;
}

#if !defined(EI_CLASSIFIER_SENSOR) || EI_CLASSIFIER_SENSOR != EI_CLASSIFIER_SENSOR_MICROPHONE
#error "Invalid model for current sensor."
#endif

Hi @Arjan, thanks. I’ll try it and check. However, as per the suggestions given by an engineer from M5Stack, I was asked to fix the following:

  1. The sampleBuffer array size is too small. It should be 1024 * 100 or bigger.
  2. For i2s_read, a position pointer is used, which should be advanced gradually. Please refer to our reference.

So, as per suggestion point 1, when I declare sampleBuffer[1024 * 100] the code does not compile; however, when I declare sampleBuffer[1024 * 90] it compiles (presumably because 1024 * 100 int16_t samples comes to 200 KB of static data, more than the ESP32 has available for statically allocated RAM).

As per suggestion point 2, my code should be as follows:

int startRec(){
.
.
.
while(1){
    i2s_read(Speak_I2S_NUMBER, (char *)(sampleBuffer + data_offset), DATA_SIZE, &byte_read, (100 / portTICK_RATE_MS));
    data_offset += 1024;
    if(data_offset == 1024 * 90 || M5.Touch.ispressed() != true)
        break;
}
.
.
.
}

So, should I replace the code that you suggested with the above code? Please advise.

Hard to tell without knowing exactly what the i2s_read() function does. But in the example, a buffer is filled on each iteration with one block of sample data until the inference buffer is full. The same happens in the pdm_data_ready_inference_callback() function, where the samples are appended to inference.buffer. So a sampleBuffer of 1024 bytes should be sufficient.
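
To make the arithmetic concrete, here is roughly how the block sizes work out. This sketch assumes a 1-second window at 16 kHz, i.e. EI_CLASSIFIER_RAW_SAMPLE_COUNT = 16000; check model_metadata.h for your project’s actual value.

// Each i2s_read() call delivers at most one block of DATA_SIZE bytes:
const size_t samples_per_block = DATA_SIZE / sizeof(int16_t);  // 1024 / 2 = 512 samples

// Blocks needed to fill one model window, rounding up. For a 16 kHz,
// 1-second window (16000 samples) this comes to roughly 32 calls to
// pdm_data_ready_inference_callback().
const size_t blocks_per_window =
    (EI_CLASSIFIER_RAW_SAMPLE_COUNT + samples_per_block - 1) / samples_per_block;

// sampleBuffer only ever holds one block at a time; the accumulated
// window lives in inference.buffer.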


Thanks @Arjan for the explanation.


Hi @timothy.malche, did you manage to get this working? Did your classification come back accurate?

I’m trying to do edge inferencing for audio classification on an M5Core2 too, and I came across the same struggle, with the examples using either PDM or static data. I’m trying to convert the I2S audio data into the format the inference code can take, and Arjan’s example looks very promising.

I got it working after some minor tweaking, but the inference output always shows one category (with 90+% confidence), even though testing the same noises simultaneously with the Edge Impulse web capture gives quite accurate results.

All good now. Inference is detecting correctly after a few more minor tweaks. The sample code you both posted above really helped. Thanks, guys!


Hi @Wsantoso, good to know that you got it tweaked and working. ;)
Thanks for letting us know. All the best.