diff --git a/modules/text/samples/webcam_demo.cpp b/modules/text/samples/webcam_demo.cpp index 6071b3c69..ba9ba9b37 100644 --- a/modules/text/samples/webcam_demo.cpp +++ b/modules/text/samples/webcam_demo.cpp @@ -1,21 +1,19 @@ /* * webcam-demo.cpp * - * A demo program of End-to-end Scene Text Detection and Recognition. + * A demo program of End-to-end Scene Text Detection and Recognition using webcam or video. * * Created on: Jul 31, 2014 * Author: Lluis Gomez i Bigorda */ #include "opencv2/text.hpp" -#include "opencv2/core/utility.hpp" #include "opencv2/highgui.hpp" #include "opencv2/imgproc.hpp" #include "opencv2/features2d.hpp" #include - using namespace std; using namespace cv; using namespace cv::text; @@ -32,7 +30,7 @@ private: public: Parallel_extractCSER(vector &_channels, vector< vector > &_regions, vector >_er_filter1, vector >_er_filter2) - : channels(_channels),regions(_regions),er_filter1(_er_filter1),er_filter2(_er_filter2){} + : channels(_channels),regions(_regions),er_filter1(_er_filter1),er_filter2(_er_filter2) {} virtual void operator()( const cv::Range &r ) const { @@ -75,34 +73,81 @@ public: Parallel_OCR & operator=(const Parallel_OCR &a); }; - //Discard wrongly recognised strings bool isRepetitive(const string& s); //Draw ER's in an image via floodFill void er_draw(vector &channels, vector > ®ions, vector group, Mat& segmentation); -//Perform text detection and recognition from webcam +const char* keys = +{ + "{@input | 0 | camera index or video file name}" + "{ image i | | specify input image}" +}; + +//Perform text detection and recognition from webcam or video int main(int argc, char* argv[]) { - cout << endl << argv[0] << endl << endl; - cout << "A demo program of End-to-end Scene Text Detection and Recognition using webcam." << endl << endl; - cout << " Usage: " << argv[0] << " [camera_index]" << endl << endl; + CommandLineParser parser(argc, argv, keys); + + cout << "A demo program of End-to-end Scene Text Detection and Recognition using webcam or video." << endl << endl; + cout << " Keys: " << endl; cout << " Press 'r' to switch between MSER/CSER regions." << endl; cout << " Press 'g' to switch between Horizontal and Arbitrary oriented grouping." << endl; cout << " Press 'o' to switch between OCRTesseract/OCRHMMDecoder recognition." << endl; cout << " Press 's' to scale down frame size to 320x240." << endl; cout << " Press 'ESC' to exit." << endl << endl; + parser.printMessage(); + + VideoCapture cap; + Mat frame, image, gray, out_img; + String input = parser.get("@input"); + String image_file_name = parser.get("image"); + if (image_file_name != "") + { + image = imread(image_file_name); + if (image.empty()) + { + cout << "\nunable to open " << image_file_name << "\nprogram terminated!\n"; + return 1; + } + else + { + cout << "\nimage " << image_file_name << " loaded!\n"; + frame = image.clone(); + } + } + else + { + cout << "\nInitializing capturing... "; + if (input.size() == 1 && isdigit(input[0])) + cap.open(input[0] - '0'); + else + cap.open(input); + + if (!cap.isOpened()) + { + cout << "\nCould not initialize capturing!\n"; + return 1; + } + + cout << " Done!" << endl; + + cap.read(frame); + } namedWindow("recognition",WINDOW_NORMAL); + imshow("recognition", frame); + waitKey(1); + bool downsize = false; int REGION_TYPE = 1; int GROUPING_ALGORITHM = 0; int RECOGNITION = 0; - char *region_types_str[2] = {const_cast("ERStats"), const_cast("MSER")}; - char *grouping_algorithms_str[2] = {const_cast("exhaustive_search"), const_cast("multioriented")}; - char *recognitions_str[2] = {const_cast("Tesseract"), const_cast("NM_chain_features + KNN")}; - Mat frame,grey,orig_grey,out_img; + String region_types_str[2] = {"ERStats", "MSER"}; + String grouping_algorithms_str[2] = {"exhaustive_search", "multioriented"}; + String recognitions_str[2] = {"Tesseract", "NM_chain_features + KNN"}; + vector channels; vector > regions(2); //two channels @@ -118,15 +163,13 @@ int main(int argc, char* argv[]) er_filters2.push_back(er_filter2); } - //double t_r = getTickCount(); - //Initialize OCR engine (we initialize 10 instances in order to work several recognitions in parallel) - cout << "Initializing OCR engines ..." << endl; + cout << "Initializing OCR engines ... "; int num_ocrs = 10; vector< Ptr > ocrs; for (int o=0; o > decoders; for (int o=0; o > contours; vector bboxes; - Ptr mser = MSER::create(21,(int)(0.00002*grey.cols*grey.rows),(int)(0.05*grey.cols*grey.rows),1,0.7); - mser->detectRegions(grey, contours, bboxes); + Ptr mser = MSER::create(21, (int)(0.00002*gray.cols*gray.rows), (int)(0.05*gray.cols*gray.rows), 1, 0.7); + mser->detectRegions(gray, contours, bboxes); //Convert the output of MSER to suitable input for the grouping/recognition algorithms if (contours.size() > 0) - MSERsToERStats(grey, contours, regions); - + MSERsToERStats(gray, contours, regions); break; } - case 2: - { - break; - } - } - //cout << "TIME_REGION_DETECTION_ALT = " << ((double)getTickCount() - t_d)*1000/getTickFrequency() << endl; // Detect character groups - //double t_g = getTickCount(); vector< vector > nm_region_groups; vector nm_boxes; switch (GROUPING_ALGORITHM) { - case 0: - { + case 0: // exhaustive_search erGrouping(frame, channels, regions, nm_region_groups, nm_boxes, ERGROUPING_ORIENTATION_HORIZ); break; - } - case 1: - { + case 1: //multioriented erGrouping(frame, channels, regions, nm_region_groups, nm_boxes, ERGROUPING_ORIENTATION_ANY, "./trained_classifier_erGrouping.xml", 0.5); break; } - } - //cout << "TIME_GROUPING_ALT = " << ((double)getTickCount() - t_g)*1000/getTickFrequency() << endl; - - - /*Text Recognition (OCR)*/ - - frame.copyTo(out_img); - int scale = downsize ? 2 : 1; - float scale_img = (float)((600.f/frame.rows)/scale); - float scale_font = (float)(2-scale_img)/1.4f; + int bottom_bar_height= out_img.rows/7 ; + copyMakeBorder(frame, out_img, 0, bottom_bar_height, 0, 0, BORDER_CONSTANT, Scalar(150, 150, 150)); + float scale_font = (float)(bottom_bar_height /85.0); vector words_detection; float min_confidence1 = 0.f, min_confidence2 = 0.f; if (RECOGNITION == 0) { - min_confidence1 = 51.f; min_confidence2 = 60.f; + min_confidence1 = 51.f; + min_confidence2 = 60.f; } vector detections; - //t_r = getTickCount(); - for (int i=0; i<(int)nm_boxes.size(); i++) { rectangle(out_img, nm_boxes[i].tl(), nm_boxes[i].br(), Scalar(255,255,0),3); - Mat group_img = Mat::zeros(frame.rows+2, frame.cols+2, CV_8UC1); er_draw(channels, regions, nm_region_groups[i], group_img); group_img(nm_boxes[i]).copyTo(group_img); @@ -268,27 +269,25 @@ int main(int argc, char* argv[]) // parallel process detections in batches of ocrs.size() (== num_ocrs) for (int i=0; i<(int)detections.size(); i=i+(int)num_ocrs) { - Range r; - if (i+(int)num_ocrs <= (int)detections.size()) - r = Range(i,i+(int)num_ocrs); - else - r = Range(i,(int)detections.size()); + Range r; + if (i+(int)num_ocrs <= (int)detections.size()) + r = Range(i,i+(int)num_ocrs); + else + r = Range(i,(int)detections.size()); - switch(RECOGNITION) - { - case 0: - parallel_for_(r, Parallel_OCR(detections, outputs, boxes, words, confidences, ocrs)); - break; - case 1: - parallel_for_(r, Parallel_OCR(detections, outputs, boxes, words, confidences, decoders)); - break; - } + switch(RECOGNITION) + { + case 0: // Tesseract + parallel_for_(r, Parallel_OCR(detections, outputs, boxes, words, confidences, ocrs)); + break; + case 1: // NM_chain_features + KNN + parallel_for_(r, Parallel_OCR(detections, outputs, boxes, words, confidences, decoders)); + break; + } } - for (int i=0; i<(int)detections.size(); i++) { - outputs[i].erase(remove(outputs[i].begin(), outputs[i].end(), '\n'), outputs[i].end()); //cout << "OCR output = \"" << outputs[i] << "\" length = " << outputs[i].size() << endl; if (outputs[i].size() < 3) @@ -311,56 +310,57 @@ int main(int argc, char* argv[]) rectangle(out_img, boxes[i][j].tl()-Point(3,word_size.height+3), boxes[i][j].tl()+Point(word_size.width,0), Scalar(255,0,255),-1); putText(out_img, words[i][j], boxes[i][j].tl()-Point(1,1), FONT_HERSHEY_SIMPLEX, scale_font, Scalar(255,255,255),(int)(3*scale_font)); } - } - //cout << "TIME_OCR_ALT = " << ((double)getTickCount() - t_r)*1000/getTickFrequency() << endl; - - t_all = ((double)getTickCount() - t_all)*1000/getTickFrequency(); - char buff[100]; - sprintf(buff, "%2.1f Fps. @ %dx%d", (float)(1000/t_all), out_img.cols, out_img.rows); - string fps_info = buff; - rectangle(out_img, Point( out_img.rows-(160/scale),out_img.rows-(70/scale) ), Point(out_img.cols,out_img.rows), Scalar(255,255,255),-1); - putText(out_img, fps_info, Point( 10,out_img.rows-(10/scale) ), FONT_HERSHEY_DUPLEX, scale_font, Scalar(255,0,0)); - putText(out_img, region_types_str[REGION_TYPE], Point( out_img.rows-(150/scale),out_img.rows-(50/scale) ), FONT_HERSHEY_DUPLEX, scale_font, Scalar(255,0,0)); - putText(out_img, grouping_algorithms_str[GROUPING_ALGORITHM], Point( out_img.rows-(150/scale),out_img.rows-(30/scale) ), FONT_HERSHEY_DUPLEX, scale_font, Scalar(255,0,0)); - putText(out_img, recognitions_str[RECOGNITION], Point( out_img.rows-(150/scale),out_img.rows-(10/scale) ), FONT_HERSHEY_DUPLEX, scale_font, Scalar(255,0,0)); - + int text_thickness = 1+(out_img.rows/500); + string fps_info = format("%2.1f Fps. %dx%d", (float)(1000 / t_all), frame.cols, frame.rows); + putText(out_img, fps_info, Point( 10,out_img.rows-5 ), FONT_HERSHEY_DUPLEX, scale_font, Scalar(255,0,0), text_thickness); + putText(out_img, region_types_str[REGION_TYPE], Point((int)(out_img.cols*0.5), out_img.rows - (int)(bottom_bar_height / 1.5)), FONT_HERSHEY_DUPLEX, scale_font, Scalar(255,0,0), text_thickness); + putText(out_img, grouping_algorithms_str[GROUPING_ALGORITHM], Point((int)(out_img.cols*0.5),out_img.rows-((int)(bottom_bar_height /3)+4) ), FONT_HERSHEY_DUPLEX, scale_font, Scalar(255,0,0), text_thickness); + putText(out_img, recognitions_str[RECOGNITION], Point((int)(out_img.cols*0.5),out_img.rows-5 ), FONT_HERSHEY_DUPLEX, scale_font, Scalar(255,0,0), text_thickness); imshow("recognition", out_img); - //imwrite("recognition_alt.jpg", out_img); - int key = waitKey(30); - if (key == 27) //wait for key + + if ((image_file_name == "") && !cap.read(frame)) { - cout << "esc key pressed" << endl; + cout << "Capturing ended! press any key to exit." << endl; + waitKey(); + return 0; + } + + int key = waitKey(30); //wait for a key press + + switch (key) + { + case 27: //ESC + cout << "ESC key pressed and exited." << endl; + return 0; + case 32: //SPACE + imwrite("recognition_alt.jpg", out_img); + break; + case 103: //'g' + GROUPING_ALGORITHM = (GROUPING_ALGORITHM+1)%2; + cout << "Grouping switched to " << grouping_algorithms_str[GROUPING_ALGORITHM] << endl; + break; + case 111: //'o' + RECOGNITION = (RECOGNITION+1)%2; + cout << "OCR switched to " << recognitions_str[RECOGNITION] << endl; + break; + case 114: //'r' + REGION_TYPE = (REGION_TYPE+1)%2; + cout << "Regions switched to " << region_types_str[REGION_TYPE] << endl; + break; + case 115: //'s' + downsize = !downsize; + if (!image.empty()) + { + frame = image.clone(); + } + break; + default: break; } - else - { - switch (key) - { - case 103: //g - GROUPING_ALGORITHM = (GROUPING_ALGORITHM+1)%2; - cout << "Grouping switched to " << grouping_algorithms_str[GROUPING_ALGORITHM] << endl; - break; - case 111: //o - RECOGNITION = (RECOGNITION+1)%2; - cout << "OCR switched to " << recognitions_str[RECOGNITION] << endl; - break; - case 114: //r - REGION_TYPE = (REGION_TYPE+1)%2; - cout << "Regions switched to " << region_types_str[REGION_TYPE] << endl; - break; - case 115: //s - downsize = !downsize; - break; - default: - break; - - } - } - } return 0; @@ -389,11 +389,9 @@ bool isRepetitive(const string& s) return true; } - return false; } - void er_draw(vector &channels, vector > ®ions, vector group, Mat& segmentation) { for (int r=0; r<(int)group.size(); r++)