demo.cpp
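
// demo.cpp -- VitTrack object tracking demo built on OpenCV's TrackerVit (cv::dnn).
// Typical invocation (the binary name is illustrative; the flags come from the
// CommandLineParser below):
//   ./demo --input=/path/to/video.mp4 --model_path=object_tracking_vittrack_2023sep.onnx --backend_target=0
// Omitting --input uses the default camera.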
#include <iostream>
#include <string>
#include <vector>

#include <opencv2/opencv.hpp>
using namespace std;
using namespace cv;
using namespace dnn;
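
// Outcome of a single tracking update: whether the target was located,
// its bounding box, and the tracker's confidence score.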
struct TrackingResult
{
    bool isLocated;
    Rect bbox;
    float score;
};
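
// Thin wrapper around cv::TrackerVit: creates the tracker from the ONNX model
// and the selected DNN backend/target, and exposes init/update as init/infer.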
class VitTrack
{
public:
    VitTrack(const string& model_path, int backend_id = 0, int target_id = 0)
    {
        params.net = model_path;
        params.backend = backend_id;
        params.target = target_id;
        model = TrackerVit::create(params);
    }

    void init(const Mat& image, const Rect& roi)
    {
        model->init(image, roi);
    }

    TrackingResult infer(const Mat& image)
    {
        TrackingResult result;
        result.isLocated = model->update(image, result.bbox);
        result.score = model->getTrackingScore();
        return result;
    }

private:
    TrackerVit::Params params;
    Ptr<TrackerVit> model;
};
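
// Draw the tracking result onto a copy of the frame: an FPS readout (when fps >= 0),
// the bounding box and score when the target is located with score >= 0.3,
// otherwise a centered "Target lost!" message in red.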
Mat visualize(const Mat& image, const Rect& bbox, float score, bool isLocated, double fps = -1.0,
              const Scalar& box_color = Scalar(0, 255, 0), const Scalar& text_color = Scalar(0, 255, 0),
              double fontScale = 1.0, int fontSize = 1)
{
    Mat output = image.clone();
    int h = output.rows;
    int w = output.cols;

    if (fps >= 0)
    {
        putText(output, "FPS: " + to_string(fps), Point(0, 30), FONT_HERSHEY_DUPLEX, fontScale, text_color, fontSize);
    }

    if (isLocated && score >= 0.3)
    {
        rectangle(output, bbox, box_color, 2);
        putText(output, format("%.2f", score), Point(bbox.x, bbox.y + 25),
                FONT_HERSHEY_DUPLEX, fontScale, text_color, fontSize);
    }
    else
    {
        Size text_size = getTextSize("Target lost!", FONT_HERSHEY_DUPLEX, fontScale, fontSize, nullptr);
        int text_x = (w - text_size.width) / 2;
        int text_y = (h - text_size.height) / 2;
        putText(output, "Target lost!", Point(text_x, text_y), FONT_HERSHEY_DUPLEX, fontScale, Scalar(0, 0, 255), fontSize);
    }

    return output;
}
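
// Parse command-line options, pick the backend/target pair, let the user select
// an ROI on the first frame, then track it frame by frame, optionally saving
// and/or displaying the annotated output.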
int main(int argc, char** argv)
{
    CommandLineParser parser(argc, argv,
        "{help h | | Print help message. }"
        "{input i | |Set path to the input video. Omit for using default camera.}"
        "{model_path |object_tracking_vittrack_2023sep.onnx |Set model path}"
        "{backend_target bt |0 |Choose backend-target pair: 0 - OpenCV implementation + CPU, 1 - CUDA + GPU (CUDA), 2 - CUDA + GPU (CUDA FP16), 3 - TIM-VX + NPU, 4 - CANN + NPU}"
        "{save s |false |Specify to save a file with results.}"
        "{vis v |true |Specify to open a new window to show results.}");

    if (parser.has("help"))
    {
        parser.printMessage();
        return 0;
    }

    string input = parser.get<string>("input");
    string model_path = parser.get<string>("model_path");
    int backend_target = parser.get<int>("backend_target");
    bool save = parser.get<bool>("save");
    bool vis = parser.get<bool>("vis");
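
    // Backend/target pairs indexed by the backend_target option above.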
    vector<vector<int>> backend_target_pairs =
    {
        {DNN_BACKEND_OPENCV, DNN_TARGET_CPU},
        {DNN_BACKEND_CUDA, DNN_TARGET_CUDA},
        {DNN_BACKEND_CUDA, DNN_TARGET_CUDA_FP16},
        {DNN_BACKEND_TIMVX, DNN_TARGET_NPU},
        {DNN_BACKEND_CANN, DNN_TARGET_NPU}
    };
    int backend_id = backend_target_pairs[backend_target][0];
    int target_id = backend_target_pairs[backend_target][1];
    // Create VitTrack tracker
    VitTrack tracker(model_path, backend_id, target_id);

    // Open video capture
    VideoCapture video;
    if (input.empty())
    {
        video.open(0); // Default camera
    }
    else
    {
        video.open(input);
    }

    if (!video.isOpened())
    {
        cerr << "Error: Could not open video source" << endl;
        return -1;
    }
    // Select an object
    Mat first_frame;
    video >> first_frame;

    if (first_frame.empty())
    {
        cerr << "No frames grabbed!" << endl;
        return -1;
    }

    Mat first_frame_copy = first_frame.clone();
    putText(first_frame_copy, "1. Drag a bounding box to track.", Point(0, 25), FONT_HERSHEY_SIMPLEX, 1, Scalar(0, 255, 0));
    putText(first_frame_copy, "2. Press ENTER to confirm", Point(0, 50), FONT_HERSHEY_SIMPLEX, 1, Scalar(0, 255, 0));
    Rect roi = selectROI("VitTrack Demo", first_frame_copy);

    if (roi.area() == 0)
    {
        cerr << "No ROI is selected! Exiting..." << endl;
        return -1;
    }
    else
    {
        cout << "Selected ROI: " << roi << endl;
    }
    // Create VideoWriter if save option is specified
    VideoWriter output_video;
    if (save)
    {
        Size frame_size = first_frame.size();
        output_video.open("output.mp4", VideoWriter::fourcc('m', 'p', '4', 'v'), video.get(CAP_PROP_FPS), frame_size);
        if (!output_video.isOpened())
        {
            cerr << "Error: Could not create output video stream" << endl;
            return -1;
        }
    }
    // Initialize tracker with ROI
    tracker.init(first_frame, roi);

    // Track frame by frame
    TickMeter tm;
    while (waitKey(1) < 0)
    {
        video >> first_frame;
        if (first_frame.empty())
        {
            cout << "End of video" << endl;
            break;
        }

        // Inference
        tm.start();
        TrackingResult result = tracker.infer(first_frame);
        tm.stop();

        // Visualize
        Mat frame = first_frame.clone();
        frame = visualize(frame, result.bbox, result.score, result.isLocated, tm.getFPS());
        if (save)
        {
            output_video.write(frame);
        }
        if (vis)
        {
            imshow("VitTrack Demo", frame);
        }
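        // Reset the timer so getFPS() reflects only the upcoming frame on the next iteration.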
        tm.reset();
    }

    if (save)
    {
        output_video.release();
    }

    video.release();
    destroyAllWindows();

    return 0;
}