Continuous audio recognition in Arduino

I am trying to build a siren detector that can recognize police and fire engine sirens. Since this is a long, continuous sound, I have found that using 2 second samples seems to do a good job at removing false positives. I think it is because most similar sounds do not last as long.

I have it working very reliably and it is able to pick up sirens even when they are very faint. It does have occasional false positives, such as when trucks are backing up, people singing, and jazz (a saxophone seems to sound similar).

Like @janvda I tried switching from MFCC to MFE and I am seeing similar things. The accuracy of the MFE based model is only a few percent lower than MFCC. When I run it against the training data, the accuracy is much lower. The MFE based model appears to be better at blocking false positives, but it is not as good at picking out sirens. This seems to be especially true when the siren is quieter, so the background silence gets amplified more during normalization.

I am using Arduino because I am lazy and they make it easy to record to SD and add in LoRa. I want to do continuous classification, perhaps with smaller sample windows. The documentation is really helpful, but I wanted to see if there was some good code to start from or some pointers. I am targeting the Adafruit Feather Sense board, which is nRF52840 based.

Could I use the run_nn_continuous() function as a starting point? I am sure you guys have already looked into this, so I want to check if there are some gotchas I will run into.

@Robotastic yeah,

  1. Create two buffers with size EI_CLASSIFIER_SLICE_SIZE.
  2. Fill buffer 1 with audio data on a separate thread or through DMA.
  3. When buffer 1 is full, switch writing the audio data to buffer 2.
  4. Construct a signal_t struct which reads back buffer 1.
  5. Call run_nn_continuous.
  6. When buffer 2 is full go back to 3, and switch to buffer 1 again.

I’ll have one of our embedded engineers pull together an example in a sketch at some point in the near future.

Awesome! I was able to pull together something that seems to be working well. I have 2 of the same HW setups, so I am going to try running the continuous version against the standard version. Here is the basic gist of it:

// If your target is limited in memory remove this macro to save 10K RAM
#define EIDSP_QUANTIZE_FILTERBANK   0


/* Includes ---------------------------------------------------------------- */
#include <PDM.h>
#include <urban-sound_inference.h>
#include <edge-impulse-sdk/dsp/numpy.hpp>
#include <SPI.h>

#include <Wire.h>


#define BUFFER_LENGTH   EI_CLASSIFIER_SLICE_SIZE //EI_CLASSIFIER_RAW_SAMPLE_COUNT



// Blinky on receipt
#define LED 13




/** Audio buffers, pointers and selectors for double-buffered slice capture. */
typedef struct {
    int16_t *buffer;        // slice buffer A (heap-allocated in microphone_inference_start)
    int16_t *other_buffer;  // slice buffer B; the ISR writes one while the classifier reads the other
    bool use_buffer;        // true -> ISR writes `buffer`; false -> ISR writes `other_buffer`
    uint8_t buf_ready;      // set to 1 by the ISR when a slice is full; cleared by loop()
    uint32_t buf_count;     // number of samples written into the active buffer so far
    uint32_t n_samples;     // slice size in samples (EI_CLASSIFIER_SLICE_SIZE)
} inference_t;

static inference_t inference;
static bool record_ready = false;           // set once PDM capture has started
static signed short sampleBuffer[2048];     // scratch buffer drained from the PDM FIFO in the ISR
static bool debug_nn = false; // Set this to true to see e.g. features generated from the raw signal
static int samples_saved = 0; // NOTE(review): written nowhere in this chunk -- presumably used by SD-card code elsewhere

/**
 * @brief      Arduino setup function
 */
void setup()
{

    Serial.begin(115200);

    // summary of inferencing settings (from model_metadata.h)
    ei_printf("Inferencing settings:\n");
    ei_printf("\tInterval: %.2f ms.\n", (float)EI_CLASSIFIER_INTERVAL_MS);
    ei_printf("\tFrame size: %d\n", EI_CLASSIFIER_DSP_INPUT_FRAME_SIZE);
    ei_printf("\tSample length: %d ms.\n", BUFFER_LENGTH / 16);
    ei_printf("\tNo. of classes: %d\n", sizeof(ei_classifier_inferencing_categories) / sizeof(ei_classifier_inferencing_categories[0]));
    
    run_classifier_init();
    
    if (microphone_inference_start(BUFFER_LENGTH) == false) {
        ei_printf("ERR: Failed to setup audio sampling\r\n");
        return;
    }
}

// Starts negative so the first full model window of slices accumulates before
// any predictions are printed (avoids reporting on a partially-filled window).
int print_results = -(EI_CLASSIFIER_SLICES_PER_MODEL_WINDOW);

/**
 * @brief      Arduino main function. Runs the continuous inferencing loop:
 *             waits for the ISR to fill a slice, swaps buffers, classifies
 *             the completed slice, and periodically prints predictions.
 */
void loop()
{
    ei_printf("Recording...\n");

    bool m = microphone_inference_record();
    if (!m) {
        ei_printf("ERR: Failed to record audio...\n");
        return;
    }

    // Swap buffers and release the ISR. Reset buf_count BEFORE clearing
    // buf_ready: the ISR only resumes writing once buf_ready == 0, so
    // clearing it last avoids a race where the ISR stores samples using a
    // stale buf_count that the main loop then zeroes out.
    inference.use_buffer = !inference.use_buffer;
    inference.buf_count = 0;
    inference.buf_ready = 0;
    ei_printf("Recording done\n");

    signal_t signal;
    signal.total_length = BUFFER_LENGTH;
    signal.get_data = &microphone_audio_signal_get_data;
    ei_impulse_result_t result = { 0 };

    // Pass debug_nn through instead of hard-coding false, so flipping the
    // file-scope debug flag actually enables feature dumps.
    EI_IMPULSE_ERROR r = run_classifier_continuous(&signal, &result, debug_nn);
    if (r != EI_IMPULSE_OK) {
        ei_printf("ERR: Failed to run classifier (%d)\n", r);
    }
    ei_printf("Amount already recorded: %d of %d, is it done: %d\n",
              inference.buf_count, inference.n_samples, inference.buf_ready);

    if (++print_results >= (EI_CLASSIFIER_SLICES_PER_MODEL_WINDOW >> 1)) {
        // print the predictions
        ei_printf("Predictions (DSP: %d ms., Classification: %d ms., Anomaly: %d ms.): \n",
                  result.timing.dsp, result.timing.classification, result.timing.anomaly);
        for (size_t ix = 0; ix < EI_CLASSIFIER_LABEL_COUNT; ix++) {
            ei_printf("    %s: %.5f\n", result.classification[ix].label,
                      result.classification[ix].value);
        }
#if EI_CLASSIFIER_HAS_ANOMALY == 1
        ei_printf("    anomaly score: %.3f\n", result.anomaly);
#endif

        print_results = 0;
    }

    // Percent scores for downstream use (e.g. LoRa uplink). Assumes class 0
    // is background noise and class 1 is siren -- TODO confirm label order
    // against the model metadata.
    int noise = (int)(result.classification[0].value * 100);
    int siren = (int)(result.classification[1].value * 100);
    (void)noise; // silence unused-variable warnings until the uplink is wired up
    (void)siren;
}

/**
 * @brief      Printf-style output routed to the Arduino Serial port.
 *
 * Formats into a static 1 KB buffer with vsnprintf, then writes the result
 * to Serial. Output is suppressed entirely when SERIAL_SOUND is defined
 * (formatting still happens so the va_list is always consumed).
 *
 * @param[in]  format  printf-style format string, followed by its arguments
 */
void ei_printf(const char *format, ...) {
    static char print_buf[1024] = { 0 };

    va_list arg_list;
    va_start(arg_list, format);
    int formatted = vsnprintf(print_buf, sizeof(print_buf), format, arg_list);
    va_end(arg_list);

#ifndef SERIAL_SOUND
    if (formatted > 0) {
        Serial.write(print_buf);
    }
#endif
}


/**
 * @brief      PDM buffer-full callback (runs in interrupt context).
 *             Drains the PDM FIFO into whichever slice buffer is currently
 *             active and marks the slice ready once n_samples are collected.
 */
static void pdm_data_ready_inference_callback(void)
{
    int bytesAvailable = PDM.available();

    // read into the scratch sample buffer
    int bytesRead = PDM.read((char *)&sampleBuffer[0], bytesAvailable);

    // Drop audio while the main loop is still consuming the finished slice
    // (buf_ready == 1). Samples arriving in that window are lost.
    if (inference.buf_ready != 1) {
        // bytesRead >> 1 converts bytes to 16-bit sample count.
        for(int i = 0; i < bytesRead>>1; i++) {
            // use_buffer selects which of the two slice buffers the ISR fills.
            if (inference.use_buffer) {
              inference.buffer[inference.buf_count++] = sampleBuffer[i];
            } else {
              inference.other_buffer[inference.buf_count++] = sampleBuffer[i];
            }
            if(inference.buf_count >= inference.n_samples) {
                // Slice complete: rewind the counter and publish it to the
                // main loop, which will swap buffers and run the classifier.
                inference.buf_count = 0;
                inference.buf_ready = 1;
                break;
            }
        }
    }
}


/**
 * @brief      Init the inferencing struct and setup/start PDM capture.
 *
 *             Allocates both slice buffers, registers the PDM data callback,
 *             and starts mono capture at the model's sample rate.
 *
 * @param[in]  n_samples  Slice size in samples (EI_CLASSIFIER_SLICE_SIZE)
 *
 * @return     true on success; false on allocation or PDM failure (in which
 *             case any allocated buffers are freed -- no leak on error paths)
 */
static bool microphone_inference_start(uint32_t n_samples)
{
    inference.buffer = (int16_t *)malloc(n_samples * sizeof(int16_t));
    if (inference.buffer == NULL) {
        ei_printf("ERR: not enough memory for buffer 1\r\n");
        return false;
    }

    inference.other_buffer = (int16_t *)malloc(n_samples * sizeof(int16_t));
    if (inference.other_buffer == NULL) {
        ei_printf("ERR: not enough memory for buffer 2\r\n");
        // Free the first buffer too -- the original leaked it here.
        free(inference.buffer);
        inference.buffer = NULL;
        return false;
    }

    inference.buf_count  = 0;
    inference.n_samples  = n_samples;
    inference.buf_ready  = 0;
    inference.use_buffer = true; // ISR starts by filling `buffer`

    // configure the data receive callback
    PDM.onReceive(&pdm_data_ready_inference_callback);

    // optionally set the gain, defaults to 20
    PDM.setGain(60);

    PDM.setBufferSize(4096);

    // initialize PDM with:
    // - one channel (mono mode)
    // - the model's sample rate (e.g. 16 kHz)
    if (!PDM.begin(1, EI_CLASSIFIER_FREQUENCY)) {
        ei_printf("ERR: Failed to start PDM!\r\n");
        // The original fell through and returned true here, so callers never
        // learned that capture was dead. Clean up and report the failure.
        free(inference.buffer);
        inference.buffer = NULL;
        free(inference.other_buffer);
        inference.other_buffer = NULL;
        return false;
    }

    record_ready = true;

    return true;
}

/**
 * @brief      Block until the ISR has filled the active slice buffer.
 *
 * Polls the buf_ready flag every 10 ms; the flag is set from the PDM
 * interrupt callback once n_samples have been collected.
 *
 * @return     Always true once a slice is ready
 */
static bool microphone_inference_record(void)
{
    for (;;) {
        if (inference.buf_ready != 0) {
            break;
        }
        delay(10);
    }

    return true;
}

/**
 * Get raw audio signal data for the classifier.
 *
 * The buffers are swapped before inference starts, so the buffer NOT
 * currently selected for ISR writes (use_buffer) holds the slice that was
 * just recorded; read samples from that one and convert int16 -> float.
 */
static int microphone_audio_signal_get_data(size_t offset, size_t length, float *out_ptr)
{
    int16_t *recorded = inference.use_buffer ? inference.other_buffer
                                             : inference.buffer;
    numpy::int16_to_float(&recorded[offset], out_ptr, length);

    return 0;
}

/**
 * @brief      Stop PDM and release buffers
 */
static void microphone_inference_end(void)
{
    PDM.end();
    free(inference.buffer);
    free(inference.other_buffer);
}

#if !defined(EI_CLASSIFIER_SENSOR) || EI_CLASSIFIER_SENSOR != EI_CLASSIFIER_SENSOR_MICROPHONE
#error "Invalid model for current sensor."
#endif
1 Like

@Robotastic Awesome!!

I have 2 of the same HW setups so I am going to try running the continuous version against the standard version.

No real benefit for non-continuous. Underlying algorithms are the same, we just do some smart incremental feature generation for continuous mode, so would just stick to that!