Seeed Xiao ESP32-S3 Sense Camera Object Detection with TFT Display

In the Edge Impulse Arduino library examples for camera-equipped ESP32 boards, there is no image output with the bounding boxes drawn; the only output is the inference information printed to the Serial Monitor.

I acquired a Seeed Xiao ESP32-S3 Sense and a Seeed Studio Round Display for Xiao, to run an object recognition project and decided to take on the challenge of adding this functionality to the original library example.

I followed the steps of the excellent tutorials written by Marcelo Rovai — "TinyML Made Easy: Object Detection with XIAO ESP32S3 Sense" and "XIAO-ESP32S3-Sense/camera_round_display_save_jpeg" (in the Mjrovai/XIAO-ESP32S3-Sense repository on GitHub) — and therefore I will not reproduce the steps here.

I’m just going to share the code with the comments and adaptations I made to be able to have the bounding boxes drawn on the TFT display at the same time the camera captures them.

/* Adding TFT Support Libraries */
#include <Arduino.h>
#include <TFT_eSPI.h>
#include <SPI.h>

/* Adding Edge Impulse Inferencing Library */
#include <Add_Your_inferencing_here.h> // Remember to change this line!
#include "edge-impulse-sdk/dsp/image/image.hpp"

/* Adding ESP32S3 Sense Camera Support */
#include "esp_camera.h"

/*  The bounding boxes were coming out small and the coordinates were out of place.
    I understood that, although the camera configuration points to 240x240,
    the images of the bounding boxes in the TFT were not in the same proportion.
    I decided to divide the maximum resolutions (1600x1200) by 240 and arrived at values 6.67 and 5.
*/
// Maximum resolution used for the ratio described above (1600 / 240 = 6.67, 1200 / 240 = 5).
#define MAX_WIDTH 1600
#define MAX_HEIGHT 1200


// Camera signal-to-GPIO assignments; these macros fill the pin fields of camera_config.
// NOTE(review): presumably -1 marks a line that is not wired on this board — confirm against the camera driver.
#define PWDN_GPIO_NUM     -1
#define RESET_GPIO_NUM    -1
#define XCLK_GPIO_NUM     10
#define SIOD_GPIO_NUM     40
#define SIOC_GPIO_NUM     39
// Y9..Y2 are assigned to the data pins pin_d7..pin_d0 of camera_config, in that order.
#define Y9_GPIO_NUM       48
#define Y8_GPIO_NUM       11
#define Y7_GPIO_NUM       12
#define Y6_GPIO_NUM       14
#define Y5_GPIO_NUM       16
#define Y4_GPIO_NUM       18
#define Y3_GPIO_NUM       17
#define Y2_GPIO_NUM       15
#define VSYNC_GPIO_NUM    38
#define HREF_GPIO_NUM     47
#define PCLK_GPIO_NUM     13

// NOTE(review): presumably the display's touch interrupt pin; it is not referenced in the visible code — confirm it is still needed.
#define TOUCH_INT D7

/* Original EDGE IMPULSE example parameters - Change Camera Capture to display same size as TFT */
// Width and height, in pixels, of the raw camera frame. ei_camera_capture()
// flags a resize only when the requested size differs from these values.
#define EI_CAMERA_RAW_FRAME_BUFFER_COLS           240
#define EI_CAMERA_RAW_FRAME_BUFFER_ROWS           240
// NOTE(review): presumably bytes per pixel of the RGB888 snapshot buffer
// (ei_camera_get_data() advances 3 bytes per pixel) — confirm where this is used.
#define EI_CAMERA_FRAME_BYTE_SIZE                 3

/* Private variables ------------------------------------------------------- */
// Passed as the debug argument of run_classifier() in loop().
static bool debug_nn = false; // Set this to true to see e.g. features generated from the raw signal
// Set by ei_camera_init() and cleared by ei_camera_deinit(); ei_camera_capture() refuses to run while it is false.
static bool is_initialised = false;
// Destination of fmt2rgb888() in ei_camera_capture(); read back three bytes per pixel by ei_camera_get_data().
uint8_t *snapshot_buf; //points to the output of the capture

/* Camera Config Parameters
   Driver configuration handed to esp_camera_init() by ei_camera_init(). */
static camera_config_t camera_config = {
  .pin_pwdn = PWDN_GPIO_NUM,
  .pin_reset = RESET_GPIO_NUM,
  .pin_xclk = XCLK_GPIO_NUM,
  .pin_sscb_sda = SIOD_GPIO_NUM,
  .pin_sscb_scl = SIOC_GPIO_NUM,

  .pin_d7 = Y9_GPIO_NUM,
  .pin_d6 = Y8_GPIO_NUM,
  .pin_d5 = Y7_GPIO_NUM,
  .pin_d4 = Y6_GPIO_NUM,
  .pin_d3 = Y5_GPIO_NUM,
  .pin_d2 = Y4_GPIO_NUM,
  .pin_d1 = Y3_GPIO_NUM,
  .pin_d0 = Y2_GPIO_NUM,
  .pin_vsync = VSYNC_GPIO_NUM,
  .pin_href = HREF_GPIO_NUM,
  .pin_pclk = PCLK_GPIO_NUM,

  //XCLK 20MHz or 10MHz for OV2640 double FPS (Experimental)
  .xclk_freq_hz = 10000000,
  .ledc_timer = LEDC_TIMER_0,
  .ledc_channel = LEDC_CHANNEL_0,

  /* This config pixel format needed to be the same in function ei_camera_capture ()
     bool converted = fmt2rgb888(fb->buf, fb->len, PIXFORMAT_RGB565, snapshot_buf);
  */
  .pixel_format = PIXFORMAT_RGB565, //YUV422,GRAYSCALE,RGB565,JPEG
  .frame_size = FRAMESIZE_240X240,    //QQVGA-UXGA Do not use sizes above QVGA when not JPEG

  .jpeg_quality = 12, //0-63 lower number means higher quality
  // NOTE(review): the inherited remark below recommends more than one buffer only with JPEG,
  // while this sketch captures RGB565 — confirm two buffers are intended.
  .fb_count = 2,       //if more than one, i2s runs in continuous mode. Use only with JPEG
  .fb_location = CAMERA_FB_IN_PSRAM, // Frame Buffer in PSRAM - YES!
  .grab_mode = CAMERA_GRAB_WHEN_EMPTY,
};

/* Function declarations ------------------------------------------------------- */
// Camera helpers; setup() and loop() call them before their definitions appear in this file.
bool ei_camera_init(void);
void ei_camera_deinit(void);
bool ei_camera_capture(uint32_t img_width, uint32_t img_height, uint8_t *out_buf) ;

/* Create TFT_SPI object: the display that receives the camera frame and the bounding boxes */
TFT_eSPI tft = TFT_eSPI();

/* Bounding box position (x, y) and size (w, h) handed to tft.drawRect() in loop().
   NOTE(review): the original note states these need to be uint32_t for the TFT_eSPI
   library — confirm against the drawRect() declaration of the installed version. */
uint32_t x;
uint32_t y;
uint32_t w;
uint32_t h;

/**
  @brief      Arduino setup function
*/
void setup()
  // put your setup code here, to run once:
  //comment out the below line to start inference immediately after upload
  // while (!Serial);

  /* Init TFT */

  /*   Need to set Rotation to 3 and change Camera Sensor 
      Config to Sync coordinates of bounding boxes*/

  if (ei_camera_init() == false) {
    ei_printf("Failed to initialize Camera!\r\n");
  else {
    ei_printf("Camera initialized\r\n");

  ei_printf("\nStarting continious inference in 2 seconds...\n");

/**
  @brief      Get data and run inferencing

  @param[in]  debug  Get debug info if true
*/
void loop()

  // instead of wait_ms, we'll wait on the signal, this allows threads to cancel us...
  if (ei_sleep(5) != EI_IMPULSE_OK) {


  // check if allocation was successful
  if (snapshot_buf == nullptr) {
    ei_printf("ERR: Failed to allocate snapshot buffer!\n");

  /* The signal that will be inferred is this one */
  ei::signal_t signal;
  signal.get_data = &ei_camera_get_data;

  /*  Makes the call to the ei_camera_capture function,
      which captures the image to the framebuffer,
      converts the image to the correct format and resizes it.
  if (ei_camera_capture((size_t)EI_CLASSIFIER_INPUT_WIDTH, (size_t)EI_CLASSIFIER_INPUT_HEIGHT, snapshot_buf) == false) {
    ei_printf("Failed to capture image\r\n");

  // Run the classifier
  ei_impulse_result_t result = { 0 };

  EI_IMPULSE_ERROR err = run_classifier(&signal, &result, debug_nn);
  if (err != EI_IMPULSE_OK) {
    ei_printf("ERR: Failed to run classifier (%d)\n", err);

  // Print the predictions to Serial Debug
  ei_printf("TFT Example -  Predictions (DSP: %d ms., Classification: %d ms., Anomaly: %d ms.): \n",
            result.timing.dsp, result.timing.classification, result.timing.anomaly);

  bool bb_found = result.bounding_boxes[0].value > 0;

  for (size_t ix = 0; ix < result.bounding_boxes_count; ix++) {
    auto bb = result.bounding_boxes[ix];
    if (bb.value == 0) {
      commenting this line, just to compare dimensions above
      ei_printf("   %s (%f) [ x: %u, y: %u, width: %u, height: %u ]\n", bb.label, bb.value, bb.x, bb.y, bb.width, bb.height);


    /* Create Coordinates and Size for Bounding Boxes */
    x = bb.x;
    y = bb.y;
    w = bb.width * 6; // WIDTH MAX / CURRENT
    h = bb.height * 5; // HEIGHT MAX / CURRENT

    /* Checking Sizes */
    ei_printf("BB Coord [ x: %u, y: %u, width: %u, height: %u ]\n",  bb.x, bb.y, bb.width, bb.height);
    ei_printf("Percentage %u",  result.classification[0].value);

    /* Draw Bounding Boxes in Display */
    tft.drawRect (x, y, w, h, TFT_GREEN);
    tft.setCursor(x, y);
  if (!bb_found) {
    ei_printf("TFT Example -     No objects found\n");
  for (size_t ix = 0; ix < EI_CLASSIFIER_LABEL_COUNT; ix++) {
    ei_printf("    %s: %.5f\n", result.classification[ix].label,

  ei_printf("TFT Example -     anomaly score: %.3f\n", result.anomaly);



/**
   @brief   Setup image sensor & start streaming

   @retval  false if initialisation failed
*/
// Initialises the camera driver with camera_config and flips/mirrors the sensor
// image. Returns true when the camera is ready (including when it was already
// initialised) and false when esp_camera_init() reports an error.
bool ei_camera_init(void) {

  if (is_initialised) return true;

  //initialize the camera
  esp_err_t err = esp_camera_init(&camera_config);
  if (err != ESP_OK) {
    Serial.printf("Camera init failed with error 0x%x\n", err);
    return false;
  }

  sensor_t * s = esp_camera_sensor_get();
  // initial sensors are flipped vertically and colors are a bit saturated

  /*  You need to change these settings, otherwise
      the x and y coordinates of the TFT and the camera will be
      different from each other.
      It appears that the TFT is rotated 90 counterclockwise in relation to the camera... */
  s->set_vflip(s, 1); // flip vertical
  s->set_hmirror(s, 1); // mirror horizontal

  is_initialised = true;
  return true;
}

/**
   @brief      Stop streaming of sensor data
*/
// Shuts the camera driver down. The initialised flag is cleared only when
// esp_camera_deinit() succeeds, so a failed shutdown is not reported as done.
void ei_camera_deinit(void) {

  //deinitialize the camera
  esp_err_t err = esp_camera_deinit();

  if (err != ESP_OK) {
    ei_printf("Camera deinit failed\n");
    return;
  }

  is_initialised = false;
}

/**
   @brief      Capture, rescale and crop image

   @param[in]  img_width     width of output image
   @param[in]  img_height    height of output image
   @param[in]  out_buf       pointer to store output image, NULL may be used
                             if ei_camera_frame_buffer is to be used for capture and resize/cropping.

   @retval     false if not initialised, or if image capture, rescaling or cropping failed
*/

// Grabs one frame, shows it on the TFT, converts it to RGB888 in snapshot_buf and,
// when the requested size differs from the raw frame size, resizes it in place.
// Returns false if the camera is not initialised, the capture fails or the
// conversion fails; true otherwise.
bool ei_camera_capture(uint32_t img_width, uint32_t img_height, uint8_t *out_buf) {
  bool do_resize = false;

  if (!is_initialised) {
    ei_printf("ERR: Camera is not initialized\r\n");
    return false;
  }

  camera_fb_t *fb = esp_camera_fb_get();

  if (!fb) {
    ei_printf("Camera capture failed\n");
    return false;
  }

  /*  Write camera captured framebuffer into TFT Display.
      The address window is reset to the full frame first, because the
      bounding boxes drawn in loop() leave it pointing at the last shape. */
  tft.startWrite();
  tft.setAddrWindow(0, 0, fb->width, fb->height);
  tft.pushColors(fb->buf, fb->len);
  tft.endWrite();

  /*  Pay attention to this PIXFORMAT_RGB565 parameter in camera config. Need to be equal or convertion will fail  */
  // NOTE(review): the conversion writes to the global snapshot_buf, not out_buf;
  // loop() passes snapshot_buf as out_buf, so both name the same memory there.
  bool converted = fmt2rgb888(fb->buf, fb->len, PIXFORMAT_RGB565, snapshot_buf);

  // Hand the frame buffer back to the driver in every case, or later captures run out of buffers.
  esp_camera_fb_return(fb);

  if (!converted) {
    ei_printf("Conversion failed\n");
    return false;
  }

  if ((img_width != EI_CAMERA_RAW_FRAME_BUFFER_COLS)
      || (img_height != EI_CAMERA_RAW_FRAME_BUFFER_ROWS)) {
    do_resize = true;
  }

  if (do_resize) {
    ei::image::processing::crop_and_interpolate_rgb888(
      out_buf,
      EI_CAMERA_RAW_FRAME_BUFFER_COLS,
      EI_CAMERA_RAW_FRAME_BUFFER_ROWS,
      out_buf,
      img_width,
      img_height);
  }

  return true;
}

static int ei_camera_get_data(size_t offset, size_t length, float *out_ptr)
  // we already have a RGB888 buffer, so recalculate offset into pixel index
  size_t pixel_ix = offset * 3;
  size_t pixels_left = length;
  size_t out_ptr_ix = 0;

  while (pixels_left != 0) {
    out_ptr[out_ptr_ix] = (snapshot_buf[pixel_ix] << 16) + (snapshot_buf[pixel_ix + 1] << 8) + snapshot_buf[pixel_ix + 2];

    // go to the next pixel
    pixel_ix += 3;
  // and done!
  return 0;

// Build-time guard: stop compilation only when the deployed model was not trained for a camera sensor.
#if !defined(EI_CLASSIFIER_SENSOR) || EI_CLASSIFIER_SENSOR != EI_CLASSIFIER_SENSOR_CAMERA
#error "Invalid model for current sensor"
#endif
1 Like

Hi @djairguilherme

Nice work!

@mjrovai will be happy to see this!

Can you print out the bounding box coordinates (x, y, w, h) to the serial monitor, along with any other relevant information, during runtime?

Maybe there is something happening with the 90-degree screen rotation, etc. Paste the output here for the community to review and offer suggestions :rocket: