diff --git a/models/facial_expression_recognition/CMakeLists.txt b/models/facial_expression_recognition/CMakeLists.txt
new file mode 100644
index 00000000..7a138782
--- /dev/null
+++ b/models/facial_expression_recognition/CMakeLists.txt
@@ -0,0 +1,30 @@
+cmake_minimum_required(VERSION 3.24)
+set(CMAKE_CXX_STANDARD 11)
+set(project_name "opencv_zoo_face_expression_recognition")
+
+PROJECT (${project_name})
+
+set(OPENCV_VERSION "4.9.0")
+set(OPENCV_INSTALLATION_PATH "" CACHE PATH "Where to look for OpenCV installation")
+find_package(OpenCV ${OPENCV_VERSION} REQUIRED HINTS ${OPENCV_INSTALLATION_PATH})
+# Find OpenCV, you may need to set OpenCV_DIR variable
+# to the absolute path to the directory containing OpenCVConfig.cmake file
+# via the command line or GUI
+
+file(GLOB SourceFile
+    "demo.cpp")
+# If the package has been found, several variables will
+# be set, you can find the full list with descriptions
+# in the OpenCVConfig.cmake file.
+# Print some message showing some of them
+message(STATUS "OpenCV library status:")
+message(STATUS "    config: ${OpenCV_DIR}")
+message(STATUS "    version: ${OpenCV_VERSION}")
+message(STATUS "    libraries: ${OpenCV_LIBS}")
+message(STATUS "    include path: ${OpenCV_INCLUDE_DIRS}")
+
+# Declare the executable target built from your sources
+add_executable(${project_name} ${SourceFile})
+
+# Link your application with OpenCV libraries
+target_link_libraries(${project_name} PRIVATE ${OpenCV_LIBS})
diff --git a/models/facial_expression_recognition/README.md b/models/facial_expression_recognition/README.md
index f5d1415c..0b0004a0 100644
--- a/models/facial_expression_recognition/README.md
+++ b/models/facial_expression_recognition/README.md
@@ -19,12 +19,30 @@ Results of accuracy evaluation on [RAF-DB](http://whdeng.cn/RAF/model1.html).
 
 ***NOTE***: This demo uses [../face_detection_yunet](../face_detection_yunet) as face detector, which supports 5-landmark detection for now (2021sep).
 
+### Python
+
 Run the following command to try the demo:
 
 ```shell
 # recognize the facial expression on images
 python demo.py --input /path/to/image -v
 ```
 
+### C++
+
+Install the latest OpenCV and CMake >= 3.24.0 to get started with:
+
+```shell
+# A typical and default installation path of OpenCV is /usr/local
+cmake -B build -D OPENCV_INSTALLATION_PATH=/path/to/opencv/installation .
+cmake --build build
+
+# detect on camera input
+./build/opencv_zoo_face_expression_recognition
+# detect on an image
+./build/opencv_zoo_face_expression_recognition -i=/path/to/image
+# get help messages
+./build/opencv_zoo_face_expression_recognition -h
+```
+
 ### Example outputs
 
 Note: Zoom in to see the recognized facial expression in the top-left corner of each face box.
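The demo.cpp added below relies on the README note above: YuNet's 5 landmarks are used to warp each detected face onto a fixed 112x112 template before classification. The following is a minimal, self-contained sketch of that preprocessing step only; the function name `alignAndNormalize` is illustrative and not part of the diff, while the reference points and normalization constants are the ones hard-coded in demo.cpp.

```cpp
// Illustrative sketch, not part of the diff: align a face to the canonical
// 5-point template and produce the NCHW blob expected by the FER network.
#include "opencv2/opencv.hpp"

cv::Mat alignAndNormalize(const cv::Mat& image, const cv::Mat& landmarks /* 5x2, CV_32F */)
{
    // Reference landmark positions on the 112x112 patch
    // (right eye, left eye, nose tip, right/left mouth corners), as in demo.cpp.
    float ref[5][2] = {
        {38.2946f, 51.6963f}, {73.5318f, 51.5014f}, {56.0252f, 71.7366f},
        {41.5493f, 92.3655f}, {70.7299f, 92.2041f}
    };
    cv::Mat refPoints(5, 2, CV_32F, ref);

    // Estimate the affine transform from the detected landmarks to the template
    // and warp the face onto a 112x112 patch.
    cv::Mat M = cv::estimateAffine2D(landmarks, refPoints);
    cv::Mat aligned;
    cv::warpAffine(image, aligned, M, cv::Size(112, 112));

    // (x / 255 - 0.5) / 0.5 == x / 127.5 - 1, i.e. the mean 0.5 / std 0.5
    // normalization folded into a single convertTo call.
    aligned.convertTo(aligned, CV_32F, 1.0 / 127.5, -1.0);

    // Pack into a 1x3x112x112 blob for cv::dnn.
    return cv::dnn::blobFromImage(aligned);
}
```

demo.cpp performs the same arithmetic with separate mean-subtraction and std-division steps; the collapsed convertTo above is just a compact equivalent.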
diff --git a/models/facial_expression_recognition/demo.cpp b/models/facial_expression_recognition/demo.cpp
new file mode 100644
index 00000000..bba5cb3f
--- /dev/null
+++ b/models/facial_expression_recognition/demo.cpp
@@ -0,0 +1,304 @@
+#include "opencv2/opencv.hpp"
+
+#include <string>
+#include <vector>
+#include <utility>
+#include <iostream>
+
+using namespace std;
+using namespace cv;
+using namespace dnn;
+
+std::vector<std::pair<int, int>> backend_target_pairs = {
+    {DNN_BACKEND_OPENCV, DNN_TARGET_CPU},
+    {DNN_BACKEND_CUDA, DNN_TARGET_CUDA},
+    {DNN_BACKEND_CUDA, DNN_TARGET_CUDA_FP16},
+    {DNN_BACKEND_TIMVX, DNN_TARGET_NPU},
+    {DNN_BACKEND_CANN, DNN_TARGET_NPU}
+};
+
+class FER
+{
+private:
+    Net model;
+    string modelPath;
+    // Reference landmark positions on the 112x112 aligned patch
+    float std[5][2] = {
+        {38.2946, 51.6963},
+        {73.5318, 51.5014},
+        {56.0252, 71.7366},
+        {41.5493, 92.3655},
+        {70.7299, 92.2041}
+    };
+    vector<String> expressionEnum = {
+        "angry", "disgust", "fearful",
+        "happy", "neutral", "sad", "surprised"
+    };
+    Mat stdPoints = Mat(5, 2, CV_32F, this->std);
+    Size patchSize = Size(112, 112);
+    Scalar imageMean = Scalar(0.5, 0.5, 0.5);
+    Scalar imageStd = Scalar(0.5, 0.5, 0.5);
+
+    const String inputNames = "data";
+    const String outputNames = "label";
+
+    int backend_id;
+    int target_id;
+
+public:
+    FER(const string& modelPath,
+        int backend_id = 0,
+        int target_id = 0)
+        : modelPath(modelPath), backend_id(backend_id), target_id(target_id)
+    {
+        this->model = readNet(modelPath);
+        this->model.setPreferableBackend(backend_id);
+        this->model.setPreferableTarget(target_id);
+    }
+
+    Mat preprocess(const Mat image, const Mat points)
+    {
+        // image alignment: warp the detected landmarks onto the reference points
+        Mat transformation = estimateAffine2D(points, this->stdPoints);
+        Mat aligned = Mat::zeros(this->patchSize.height, this->patchSize.width, image.type());
+        warpAffine(image, aligned, transformation, this->patchSize);
+
+        // image normalization: scale to [0, 1], then mean/std normalize
+        aligned.convertTo(aligned, CV_32F, 1.0 / 255.0);
+        aligned -= imageMean;
+        aligned /= imageStd;
+
+        return blobFromImage(aligned);
+    }
+
+    String infer(const Mat image, const Mat facePoints)
+    {
+        // Columns 4..13 of a YuNet detection row hold the 5 landmarks as (x, y) pairs
+        Mat points = facePoints(Rect(4, 0, facePoints.cols - 5, facePoints.rows)).reshape(2, 5);
+        Mat inputBlob = preprocess(image, points);
+
+        this->model.setInput(inputBlob, this->inputNames);
+        Mat outputBlob = this->model.forward(this->outputNames);
+
+        Point maxLoc;
+        minMaxLoc(outputBlob, nullptr, nullptr, nullptr, &maxLoc);
+
+        return getDesc(maxLoc.x);
+    }
+
+    String getDesc(int ind)
+    {
+        if (ind >= 0 && ind < static_cast<int>(this->expressionEnum.size()))
+        {
+            return this->expressionEnum[ind];
+        }
+        else
+        {
+            cerr << "Error: Index out of bounds." << endl;
+            return "";
+        }
+    }
+};
+
+class YuNet
+{
+public:
+    YuNet(const string& model_path,
+          const Size& input_size = Size(320, 320),
+          float conf_threshold = 0.6f,
+          float nms_threshold = 0.3f,
+          int top_k = 5000,
+          int backend_id = 0,
+          int target_id = 0)
+        : model_path_(model_path), input_size_(input_size),
+          conf_threshold_(conf_threshold), nms_threshold_(nms_threshold),
+          top_k_(top_k), backend_id_(backend_id), target_id_(target_id)
+    {
+        model = FaceDetectorYN::create(model_path_, "", input_size_, conf_threshold_, nms_threshold_, top_k_, backend_id_, target_id_);
+    }
+
+    void setBackendAndTarget(int backend_id, int target_id)
+    {
+        backend_id_ = backend_id;
+        target_id_ = target_id;
+        model = FaceDetectorYN::create(model_path_, "", input_size_, conf_threshold_, nms_threshold_, top_k_, backend_id_, target_id_);
+    }
+
+    /* Overwrite the input size when creating the model. Size format: [Width, Height].
+    */
+    void setInputSize(const Size& input_size)
+    {
+        input_size_ = input_size;
+        model->setInputSize(input_size_);
+    }
+
+    Mat infer(const Mat image)
+    {
+        Mat res;
+        model->detect(image, res);
+        return res;
+    }
+
+private:
+    Ptr<FaceDetectorYN> model;
+
+    string model_path_;
+    Size input_size_;
+    float conf_threshold_;
+    float nms_threshold_;
+    int top_k_;
+    int backend_id_;
+    int target_id_;
+};
+
+cv::Mat visualize(const cv::Mat& image, const cv::Mat& faces, const vector<String>& expressions, float fps = -1.f)
+{
+    static cv::Scalar box_color{0, 255, 0};
+    static std::vector<cv::Scalar> landmark_color{
+        cv::Scalar(255,   0,   0), // right eye
+        cv::Scalar(  0,   0, 255), // left eye
+        cv::Scalar(  0, 255,   0), // nose tip
+        cv::Scalar(255,   0, 255), // right mouth corner
+        cv::Scalar(  0, 255, 255)  // left mouth corner
+    };
+    static cv::Scalar text_color{0, 255, 0};
+
+    auto output_image = image.clone();
+
+    if (fps >= 0)
+    {
+        cv::putText(output_image, cv::format("FPS: %.2f", fps), cv::Point(0, 15), cv::FONT_HERSHEY_SIMPLEX, 0.5, text_color, 2);
+    }
+
+    for (int i = 0; i < faces.rows; ++i)
+    {
+        // Draw bounding boxes
+        int x1 = static_cast<int>(faces.at<float>(i, 0));
+        int y1 = static_cast<int>(faces.at<float>(i, 1));
+        int w = static_cast<int>(faces.at<float>(i, 2));
+        int h = static_cast<int>(faces.at<float>(i, 3));
+        cv::rectangle(output_image, cv::Rect(x1, y1, w, h), box_color, 2);
+
+        // Expression as text
+        String exp = expressions[i];
+        cv::putText(output_image, exp, cv::Point(x1, y1 + 12), cv::FONT_HERSHEY_DUPLEX, 0.5, text_color);
+
+        // Draw landmarks
+        for (int j = 0; j < static_cast<int>(landmark_color.size()); ++j)
+        {
+            int x = static_cast<int>(faces.at<float>(i, 2 * j + 4)), y = static_cast<int>(faces.at<float>(i, 2 * j + 5));
+            cv::circle(output_image, cv::Point(x, y), 2, landmark_color[j], 2);
+        }
+    }
+    return output_image;
+}
+
+string keys =
+"{ help h | | Print help message. }"
+"{ model m | facial_expression_recognition_mobilefacenet_2022july.onnx | Usage: Path to the model, defaults to facial_expression_recognition_mobilefacenet_2022july.onnx }"
+"{ yunet_model ym | ../face_detection_yunet/face_detection_yunet_2023mar.onnx | Usage: Path to the face detection yunet model, defaults to face_detection_yunet_2023mar.onnx }"
+"{ input i | | Path to input image or video file. "
+"Skip this argument to capture frames from a camera.}"
+"{ backend_target t | 0 | Choose one of the backend-target pairs to run this demo:\n"
+    "0: (default) OpenCV implementation + CPU,\n"
+    "1: CUDA + GPU (CUDA),\n"
+    "2: CUDA + GPU (CUDA FP16),\n"
+    "3: TIM-VX + NPU,\n"
+    "4: CANN + NPU}"
+"{ save s | false | Specify to save results.}"
+"{ vis v | true | Specify to open a window for result visualization.}"
+;
+
+int main(int argc, char** argv)
+{
+    CommandLineParser parser(argc, argv, keys);
+
+    parser.about("Facial Expression Recognition");
+    if (parser.has("help"))
+    {
+        parser.printMessage();
+        return 0;
+    }
+
+    string modelPath = parser.get<string>("model");
+    string yunetModelPath = parser.get<string>("yunet_model");
+    string inputPath = parser.get<string>("input");
+    int backendTarget = parser.get<int>("backend_target");
+    bool saveFlag = parser.get<bool>("save");
+    bool visFlag = parser.get<bool>("vis");
+
+    if (modelPath.empty())
+        CV_Error(Error::StsError, "Model file " + modelPath + " not found");
+
+    if (yunetModelPath.empty())
+        CV_Error(Error::StsError, "Face Detection Model file " + yunetModelPath + " not found");
+
+    YuNet faceDetectionModel(yunetModelPath);
+    FER expressionRecognitionModel(modelPath, backend_target_pairs[backendTarget].first, backend_target_pairs[backendTarget].second);
+
+    VideoCapture cap;
+    if (!inputPath.empty())
+        cap.open(samples::findFile(inputPath));
+    else
+        cap.open(0);
+
+    if (!cap.isOpened())
+        CV_Error(Error::StsError, "Cannot open video or file");
+
+    Mat frame;
+    static const std::string kWinName = "Facial Expression Demo";
+
+    while (waitKey(1) < 0)
+    {
+        cap >> frame;
+
+        if (frame.empty())
+        {
+            if (inputPath.empty())
+                cout << "Frame is empty" << endl;
+            break;
+        }
+
+        faceDetectionModel.setInputSize(frame.size());
+
+        Mat faces = faceDetectionModel.infer(frame);
+        vector<String> expressions;
+
+        for (int i = 0; i < faces.rows; ++i)
+        {
+            Mat face = faces.row(i);
+            String exp = expressionRecognitionModel.infer(frame, face);
+            expressions.push_back(exp);
+
+            int x1 = static_cast<int>(faces.at<float>(i, 0));
+            int y1 = static_cast<int>(faces.at<float>(i, 1));
+            int w = static_cast<int>(faces.at<float>(i, 2));
+            int h = static_cast<int>(faces.at<float>(i, 3));
+            float conf = faces.at<float>(i, 14);
+
+            std::cout << cv::format("%d: x1=%d, y1=%d, w=%d, h=%d, conf=%.4f expression=%s\n", i, x1, y1, w, h, conf, exp.c_str());
+        }
+
+        Mat res_frame = visualize(frame, faces, expressions);
+
+        if (visFlag || inputPath.empty())
+        {
+            imshow(kWinName, res_frame);
+            if (!inputPath.empty())
+                waitKey(0);
+        }
+        if (saveFlag)
+        {
+            cout << "Results are saved to result.jpg" << endl;
+            cv::imwrite("result.jpg", res_frame);
+        }
+    }
+
+    return 0;
+}
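One small hardening note on the demo above: main() uses the parsed `-t`/`--backend_target` value to index `backend_target_pairs` without range-checking it, so a typo such as `-t=9` reads past the end of the vector. A guard along these lines could catch that early; it is a sketch only, and the helper `pickBackendTarget` is not part of the diff.

```cpp
// Illustrative helper, not part of the diff: validate the --backend_target
// index before it is used to select a (backend, target) pair.
#include "opencv2/opencv.hpp"
#include <utility>
#include <vector>

static std::pair<int, int> pickBackendTarget(const std::vector<std::pair<int, int>>& pairs,
                                             int index)
{
    if (index < 0 || index >= static_cast<int>(pairs.size()))
        CV_Error(cv::Error::StsBadArg,
                 cv::format("backend_target must be in [0, %d]", static_cast<int>(pairs.size()) - 1));
    return pairs[index];
}
```

In main(), the FER constructor could then take the pair returned by `pickBackendTarget(backend_target_pairs, backendTarget)` instead of indexing the vector directly.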