diff --git a/modules/rgbd/README.md b/modules/rgbd/README.md index 2ccfae29e..2a662c682 100644 --- a/modules/rgbd/README.md +++ b/modules/rgbd/README.md @@ -15,3 +15,5 @@ Note that the KinectFusion algorithm was patented and its use may be restricted * _US8401225B2_ Moving object segmentation using depth images Since OpenCV's license imposes different restrictions on usage please consult a legal before using this algorithm any way. + +That's why you need to set the OPENCV_ENABLE_NONFREE option in CMake to use KinectFusion. diff --git a/modules/rgbd/include/opencv2/rgbd/kinfu.hpp b/modules/rgbd/include/opencv2/rgbd/kinfu.hpp index 31f6de5e6..5bc66731c 100644 --- a/modules/rgbd/include/opencv2/rgbd/kinfu.hpp +++ b/modules/rgbd/include/opencv2/rgbd/kinfu.hpp @@ -28,19 +28,6 @@ struct CV_EXPORTS_W Params */ CV_WRAP static Ptr coarseParams(); - enum PlatformType - { - - PLATFORM_CPU, PLATFORM_GPU - }; - - /** @brief A platform on which to run the algorithm. - * - Currently KinFu supports only one platform which is a CPU. - GPU platform is to be implemented in the future. - */ - PlatformType platform; - /** @brief frame size in pixels */ CV_PROP_RW Size frameSize; @@ -50,8 +37,9 @@ struct CV_EXPORTS_W Params /** @brief pre-scale per 1 meter for input values Typical values are: - * 5000 per 1 meter for the 16-bit PNG files of TUM database - * 1 per 1 meter for the 32-bit float images in the ROS bag files + * 5000 per 1 meter for the 16-bit PNG files of TUM database + * 1000 per 1 meter for Kinect 2 device + * 1 per 1 meter for the 32-bit float images in the ROS bag files */ CV_PROP_RW float depthFactor; @@ -65,13 +53,13 @@ struct CV_EXPORTS_W Params /** @brief Number of pyramid levels for ICP */ CV_PROP_RW int pyramidLevels; - /** @brief Resolution of voxel cube + /** @brief Resolution of voxel space - Number of voxels in each cube edge. + Number of voxels in each dimension. */ - CV_PROP_RW int volumeDims; - /** @brief Size of voxel cube side in meters */ - CV_PROP_RW float volumeSize; + CV_PROP_RW Vec3i volumeDims; + /** @brief Size of voxel in meters */ + CV_PROP_RW float voxelSize; /** @brief Minimal camera movement in meters @@ -84,7 +72,7 @@ struct CV_EXPORTS_W Params /** @brief distance to truncate in meters - Distances that exceed this value will be truncated in voxel cube values. + Distances to surface that exceed this value will be truncated to 1.0. */ CV_PROP_RW float tsdf_trunc_dist; @@ -128,11 +116,19 @@ struct CV_EXPORTS_W Params The output can be obtained as a vector of points and their normals or can be Phong-rendered from given camera pose. - An internal representation of a model is a voxel cube that keeps TSDF values + An internal representation of a model is a voxel cuboid that keeps TSDF values which are a sort of distances to the surface (for details read the @cite kinectfusion article about TSDF). There is no interface to that representation yet. - This implementation is based on (kinfu-remake)[https://github.com/Nerei/kinfu_remake]. + KinFu uses OpenCL acceleration automatically if available. + To enable or disable it explicitly use cv::setUseOptimized() or cv::ocl::setUseOpenCL(). + + This implementation is based on [kinfu-remake](https://github.com/Nerei/kinfu_remake). + + Note that the KinectFusion algorithm was patented and its use may be restricted by + the list of patents mentioned in README.md file in this module directory. + + That's why you need to set the OPENCV_ENABLE_NONFREE option in CMake to use KinectFusion. */ class CV_EXPORTS_W KinFu { @@ -142,11 +138,10 @@ public: /** @brief Get current parameters */ virtual const Params& getParams() const = 0; - virtual void setParams(const Params&) = 0; /** @brief Renders a volume into an image - Renders a 0-surface of TSDF using Phong shading into a CV_8UC3 Mat. + Renders a 0-surface of TSDF using Phong shading into a CV_8UC4 Mat. Light pose is fixed in KinFu params. @param image resulting image @@ -186,21 +181,18 @@ public: */ CV_WRAP virtual void reset() = 0; - /** @brief Get current pose in voxel cube space */ + /** @brief Get current pose in voxel space */ virtual const Affine3f getPose() const = 0; /** @brief Process next depth frame - Integrates depth into voxel cube with respect to its ICP-calculated pose. + Integrates depth into voxel space with respect to its ICP-calculated pose. Input image is converted to CV_32F internally if has another type. @param depth one-channel image which size and depth scale is described in algorithm's parameters @return true if succeded to align new frame with current scene, false if opposite */ CV_WRAP virtual bool update(InputArray depth) = 0; - -private: - class KinFuImpl; }; //! @} diff --git a/modules/rgbd/samples/kinfu_demo.cpp b/modules/rgbd/samples/kinfu_demo.cpp index 55cd69891..3a81f68fa 100644 --- a/modules/rgbd/samples/kinfu_demo.cpp +++ b/modules/rgbd/samples/kinfu_demo.cpp @@ -26,7 +26,7 @@ static vector readDepth(std::string fileList) fstream file(fileList); if(!file.is_open()) - throw std::runtime_error("Failed to open file"); + throw std::runtime_error("Failed to read depth list"); std::string dir; size_t slashIdx = fileList.rfind('/'); @@ -48,47 +48,86 @@ static vector readDepth(std::string fileList) return v; } +struct DepthWriter +{ + DepthWriter(string fileList) : + file(fileList, ios::out), count(0), dir() + { + size_t slashIdx = fileList.rfind('/'); + slashIdx = slashIdx != std::string::npos ? slashIdx : fileList.rfind('\\'); + dir = fileList.substr(0, slashIdx); -const Size kinect2FrameSize(512, 424); -// approximate values, no guarantee to be correct -const float kinect2Focal = 366.1f; -const float kinect2Cx = 258.2f; -const float kinect2Cy = 204.f; + if(!file.is_open()) + throw std::runtime_error("Failed to write depth list"); + + file << "# depth maps saved from device" << endl; + file << "# useless_number filename" << endl; + } + + void append(InputArray _depth) + { + Mat depth = _depth.getMat(); + string depthFname = cv::format("%04d.png", count); + string fullDepthFname = dir + '/' + depthFname; + if(!imwrite(fullDepthFname, depth)) + throw std::runtime_error("Failed to write depth to file " + fullDepthFname); + file << count++ << " " << depthFname << endl; + } + + fstream file; + int count; + string dir; +}; + +namespace Kinect2Params +{ + static const Size frameSize = Size(512, 424); + // approximate values, no guarantee to be correct + static const float focal = 366.1f; + static const float cx = 258.2f; + static const float cy = 204.f; + static const float k1 = 0.12f; + static const float k2 = -0.34f; + static const float k3 = 0.12f; +}; struct DepthSource { public: DepthSource() : - depthFileList(), - frameIdx(0), - vc(), - useKinect2Workarounds(true) + DepthSource("", -1) { } DepthSource(int cam) : - depthFileList(), - frameIdx(), - vc(VideoCaptureAPIs::CAP_OPENNI2 + cam), - useKinect2Workarounds(true) + DepthSource("", cam) { } DepthSource(String fileListName) : - depthFileList(readDepth(fileListName)), + DepthSource(fileListName, -1) + { } + + DepthSource(String fileListName, int cam) : + depthFileList(fileListName.empty() ? vector() : readDepth(fileListName)), frameIdx(0), - vc(), + vc( cam >= 0 ? VideoCapture(VideoCaptureAPIs::CAP_OPENNI2 + cam) : VideoCapture()), + undistortMap1(), + undistortMap2(), useKinect2Workarounds(true) { } - Mat getDepth() + UMat getDepth() { - Mat out; + UMat out; if (!vc.isOpened()) { if (frameIdx < depthFileList.size()) - out = cv::imread(depthFileList[frameIdx++], IMREAD_ANYDEPTH); + { + Mat f = cv::imread(depthFileList[frameIdx++], IMREAD_ANYDEPTH); + f.copyTo(out); + } else { - return Mat(); + return UMat(); } } else @@ -99,8 +138,14 @@ public: // workaround for Kinect 2 if(useKinect2Workarounds) { - out = out(Rect(Point(), kinect2FrameSize)); - cv::flip(out, out, 1); + out = out(Rect(Point(), Kinect2Params::frameSize)); + + UMat outCopy; + // linear remap adds gradient between valid and invalid pixels + // which causes garbage, use nearest instead + remap(out, outCopy, undistortMap1, undistortMap2, cv::INTER_NEAREST); + + cv::flip(outCopy, out, 1); } } if (out.empty()) @@ -125,21 +170,47 @@ public: // it's recommended to calibrate sensor to obtain its intrinsics float fx, fy, cx, cy; - fx = fy = useKinect2Workarounds ? kinect2Focal : focal; - cx = useKinect2Workarounds ? kinect2Cx : w/2 - 0.5f; - cy = useKinect2Workarounds ? kinect2Cy : h/2 - 0.5f; + Size frameSize; + if(useKinect2Workarounds) + { + fx = fy = Kinect2Params::focal; + cx = Kinect2Params::cx; + cy = Kinect2Params::cy; - params.frameSize = useKinect2Workarounds ? kinect2FrameSize : Size(w, h); - params.intr = Matx33f(fx, 0, cx, - 0, fy, cy, - 0, 0, 1); + frameSize = Kinect2Params::frameSize; + } + else + { + fx = fy = focal; + cx = w/2 - 0.5f; + cy = h/2 - 0.5f; + + frameSize = Size(w, h); + } + + Matx33f camMatrix = Matx33f(fx, 0, cx, + 0, fy, cy, + 0, 0, 1); + + params.frameSize = frameSize; + params.intr = camMatrix; params.depthFactor = 1000.f; + + Matx distCoeffs; + distCoeffs(0) = Kinect2Params::k1; + distCoeffs(1) = Kinect2Params::k2; + distCoeffs(4) = Kinect2Params::k3; + if(useKinect2Workarounds) + initUndistortRectifyMap(camMatrix, distCoeffs, cv::noArray(), + camMatrix, frameSize, CV_16SC2, + undistortMap1, undistortMap2); } } vector depthFileList; size_t frameIdx; VideoCapture vc; + UMat undistortMap1, undistortMap2; bool useKinect2Workarounds; }; @@ -163,7 +234,7 @@ void pauseCallback(const viz::MouseEvent& me, void* args) { PauseCallbackArgs pca = *((PauseCallbackArgs*)(args)); viz::Viz3d window(vizWindowName); - Mat rendered; + UMat rendered; pca.kf.render(rendered, window.getViewerPose().matrix); imshow("render", rendered); waitKey(1); @@ -174,10 +245,13 @@ void pauseCallback(const viz::MouseEvent& me, void* args) static const char* keys = { "{help h usage ? | | print this message }" - "{depth | | Path to depth.txt file listing a set of depth images }" + "{depth | | Path to depth.txt file listing a set of depth images }" "{camera | | Index of depth camera to be used as a depth source }" "{coarse | | Run on coarse settings (fast but ugly) or on default (slow but looks better)," " in coarse mode points and normals are displayed }" + "{idle | | Do not run KinFu, just display depth frames }" + "{record | | Write depth frames to specified file list" + " (the same format as for the 'depth' key) }" }; static const std::string message = @@ -189,9 +263,19 @@ static const std::string message = int main(int argc, char **argv) { bool coarse = false; + bool idle = false; + string recordPath; CommandLineParser parser(argc, argv, keys); parser.about(message); + + if(!parser.check()) + { + parser.printMessage(); + parser.printErrors(); + return -1; + } + if(parser.has("help")) { parser.printMessage(); @@ -201,12 +285,13 @@ int main(int argc, char **argv) { coarse = true; } - - if(!parser.check()) + if(parser.has("record")) { - parser.printMessage(); - parser.printErrors(); - return -1; + recordPath = parser.get("record"); + } + if(parser.has("idle")) + { + idle = true; } DepthSource ds; @@ -222,7 +307,13 @@ int main(int argc, char **argv) return -1; } + Ptr depthWriter; + if(!recordPath.empty()) + depthWriter = makePtr(recordPath); + Ptr params; + Ptr kf; + if(coarse) params = Params::coarseParams(); else @@ -231,11 +322,15 @@ int main(int argc, char **argv) // These params can be different for each depth sensor ds.updateParams(*params); - // Scene-specific params should be tuned for each scene individually - //params.volumePose = params.volumePose.translate(Vec3f(0.f, 0.f, 0.5f)); - //params.tsdf_max_weight = 16; + // Enables OpenCL explicitly (by default can be switched-off) + cv::setUseOptimized(true); - Ptr kf = KinFu::create(params); + // Scene-specific params should be tuned for each scene individually + //params->volumePose = params->volumePose.translate(Vec3f(0.f, 0.f, 0.5f)); + //params->tsdf_max_weight = 16; + + if(!idle) + kf = KinFu::create(params); #ifdef HAVE_OPENCV_VIZ cv::viz::Viz3d window(vizWindowName); @@ -243,18 +338,21 @@ int main(int argc, char **argv) bool pause = false; #endif - // TODO: can we use UMats for that? - Mat rendered; - Mat points; - Mat normals; + UMat rendered; + UMat points; + UMat normals; int64 prevTime = getTickCount(); - for(Mat frame = ds.getDepth(); !frame.empty(); frame = ds.getDepth()) + for(UMat frame = ds.getDepth(); !frame.empty(); frame = ds.getDepth()) { + if(depthWriter) + depthWriter->append(frame); + #ifdef HAVE_OPENCV_VIZ if(pause) { + // doesn't happen in idle mode kf->getCloud(points, normals); if(!points.empty() && !normals.empty()) { @@ -263,8 +361,9 @@ int main(int argc, char **argv) window.showWidget("cloud", cloudWidget); window.showWidget("normals", cloudNormals); + Vec3d volSize = kf->getParams().voxelSize*Vec3d(kf->getParams().volumeDims); window.showWidget("cube", viz::WCube(Vec3d::all(0), - Vec3d::all(kf->getParams().volumeSize)), + volSize), kf->getParams().volumePose); PauseCallbackArgs pca(*kf); window.registerMouseCallback(pauseCallback, (void*)&pca); @@ -272,6 +371,8 @@ int main(int argc, char **argv) "Close the window or press Q to resume"), Point())); window.spin(); window.removeWidget("text"); + window.removeWidget("cloud"); + window.removeWidget("normals"); window.registerMouseCallback(0); } @@ -280,41 +381,49 @@ int main(int argc, char **argv) else #endif { - Mat cvt8; - float depthFactor = kf->getParams().depthFactor; + UMat cvt8; + float depthFactor = params->depthFactor; convertScaleAbs(frame, cvt8, 0.25*256. / depthFactor); - imshow("depth", cvt8); + if(!idle) + { + imshow("depth", cvt8); - if(!kf->update(frame)) - { - kf->reset(); - std::cout << "reset" << std::endl; - } -#ifdef HAVE_OPENCV_VIZ - else - { - if(coarse) + if(!kf->update(frame)) { - kf->getCloud(points, normals); - if(!points.empty() && !normals.empty()) - { - viz::WCloud cloudWidget(points, viz::Color::white()); - viz::WCloudNormals cloudNormals(points, normals, /*level*/1, /*scale*/0.05, viz::Color::gray()); - window.showWidget("cloud", cloudWidget); - window.showWidget("normals", cloudNormals); - } + kf->reset(); + std::cout << "reset" << std::endl; } +#ifdef HAVE_OPENCV_VIZ + else + { + if(coarse) + { + kf->getCloud(points, normals); + if(!points.empty() && !normals.empty()) + { + viz::WCloud cloudWidget(points, viz::Color::white()); + viz::WCloudNormals cloudNormals(points, normals, /*level*/1, /*scale*/0.05, viz::Color::gray()); + window.showWidget("cloud", cloudWidget); + window.showWidget("normals", cloudNormals); + } + } - //window.showWidget("worldAxes", viz::WCoordinateSystem()); - window.showWidget("cube", viz::WCube(Vec3d::all(0), - Vec3d::all(kf->getParams().volumeSize)), - kf->getParams().volumePose); - window.setViewerPose(kf->getPose()); - window.spinOnce(1, true); - } + //window.showWidget("worldAxes", viz::WCoordinateSystem()); + Vec3d volSize = kf->getParams().voxelSize*kf->getParams().volumeDims; + window.showWidget("cube", viz::WCube(Vec3d::all(0), + volSize), + kf->getParams().volumePose); + window.setViewerPose(kf->getPose()); + window.spinOnce(1, true); + } #endif - kf->render(rendered); + kf->render(rendered); + } + else + { + rendered = cvt8; + } } int64 newTime = getTickCount(); @@ -325,17 +434,19 @@ int main(int argc, char **argv) imshow("render", rendered); - int c = waitKey(100); + int c = waitKey(1); switch (c) { case 'r': - kf->reset(); + if(!idle) + kf->reset(); break; case 'q': return 0; #ifdef HAVE_OPENCV_VIZ case 'p': - pause = true; + if(!idle) + pause = true; #endif default: break; diff --git a/modules/rgbd/src/fast_icp.cpp b/modules/rgbd/src/fast_icp.cpp index a58205564..8cea96d72 100644 --- a/modules/rgbd/src/fast_icp.cpp +++ b/modules/rgbd/src/fast_icp.cpp @@ -12,52 +12,98 @@ using namespace std; namespace cv { namespace kinfu { +enum +{ + UTSIZE = 27 +}; ICP::ICP(const Intr _intrinsics, const std::vector& _iterations, float _angleThreshold, float _distanceThreshold) : iterations(_iterations), angleThreshold(_angleThreshold), distanceThreshold(_distanceThreshold), intrinsics(_intrinsics) { } -///////// CPU implementation ///////// - -class ICPCPU : public ICP +class ICPImpl : public ICP { public: - ICPCPU(const cv::kinfu::Intr _intrinsics, const std::vector &_iterations, float _angleThreshold, float _distanceThreshold); + ICPImpl(const cv::kinfu::Intr _intrinsics, const std::vector &_iterations, float _angleThreshold, float _distanceThreshold); - virtual bool estimateTransform(cv::Affine3f& transform, cv::Ptr oldFrame, cv::Ptr newFrame) const override; + virtual bool estimateTransform(cv::Affine3f& transform, + InputArray oldPoints, InputArray oldNormals, + InputArray newPoints, InputArray newNormals + ) const override; - virtual ~ICPCPU() { } + template < typename T > + bool estimateTransformT(cv::Affine3f& transform, + const vector& oldPoints, const vector& oldNormals, + const vector& newPoints, const vector& newNormals + ) const; + + virtual ~ICPImpl() { } + + template < typename T > + void getAb(const T& oldPts, const T& oldNrm, const T& newPts, const T& newNrm, + cv::Affine3f pose, int level, cv::Matx66f& A, cv::Vec6f& b) const; private: - void getAb(const Points &oldPts, const Normals &oldNrm, const Points &newPts, const Normals &newNrm, - cv::Affine3f pose, int level, cv::Matx66f& A, cv::Vec6f& b) const; + + mutable vector groupedSumBuffers; + }; - -ICPCPU::ICPCPU(const Intr _intrinsics, const std::vector &_iterations, float _angleThreshold, float _distanceThreshold) : - ICP(_intrinsics, _iterations, _angleThreshold, _distanceThreshold) +ICPImpl::ICPImpl(const Intr _intrinsics, const std::vector &_iterations, float _angleThreshold, float _distanceThreshold) : + ICP(_intrinsics, _iterations, _angleThreshold, _distanceThreshold), + groupedSumBuffers(_iterations.size()) { } -bool ICPCPU::estimateTransform(cv::Affine3f& transform, cv::Ptr _oldFrame, cv::Ptr _newFrame) const + +bool ICPImpl::estimateTransform(cv::Affine3f& transform, + InputArray _oldPoints, InputArray _oldNormals, + InputArray _newPoints, InputArray _newNormals + ) const { - ScopeTime st("icp"); + CV_TRACE_FUNCTION(); - cv::Ptr oldFrame = _oldFrame.dynamicCast(); - cv::Ptr newFrame = _newFrame.dynamicCast(); + CV_Assert(_oldPoints.size() == _oldNormals.size()); + CV_Assert(_newPoints.size() == _newNormals.size()); + CV_Assert(_oldPoints.size() == _newPoints.size()); - const std::vector& oldPoints = oldFrame->points; - const std::vector& oldNormals = oldFrame->normals; - const std::vector& newPoints = newFrame->points; - const std::vector& newNormals = newFrame->normals; +#ifdef HAVE_OPENCL + if(cv::ocl::isOpenCLActivated() && + _oldPoints.isUMatVector() && _oldNormals.isUMatVector() && + _newPoints.isUMatVector() && _newNormals.isUMatVector()) + { + std::vector op, np, on, nn; + _oldPoints.getUMatVector(op); + _newPoints.getUMatVector(np); + _oldNormals.getUMatVector(on); + _newNormals.getUMatVector(nn); + return estimateTransformT(transform, op, on, np, nn); + } +#endif + + std::vector op, on, np, nn; + _oldPoints.getMatVector(op); + _newPoints.getMatVector(np); + _oldNormals.getMatVector(on); + _newNormals.getMatVector(nn); + return estimateTransformT(transform, op, on, np, nn); +} + +template < typename T > +bool ICPImpl::estimateTransformT(cv::Affine3f& transform, + const vector& oldPoints, const vector& oldNormals, + const vector& newPoints, const vector& newNormals + ) const +{ + CV_TRACE_FUNCTION(); transform = Affine3f::Identity(); for(size_t l = 0; l < iterations.size(); l++) { size_t level = iterations.size() - 1 - l; - Points oldPts = oldPoints [level], newPts = newPoints [level]; - Normals oldNrm = oldNormals[level], newNrm = newNormals[level]; + const T& oldPts = oldPoints [level], newPts = newPoints [level]; + const T& oldNrm = oldNormals[level], newNrm = newNormals[level]; for(int iter = 0; iter < iterations[level]; iter++) { @@ -83,10 +129,13 @@ bool ICPCPU::estimateTransform(cv::Affine3f& transform, cv::Ptr _oldFrame return true; } + +///////// CPU implementation ///////// + // 1 any coord to check is enough since we know the generation -#if CV_SIMD128 +#if USE_INTRINSICS static inline bool fastCheck(const v_float32x4& p0, const v_float32x4& p1) { float check = (p0.get0() + p1.get0()); @@ -125,11 +174,6 @@ typedef Matx ABtype; struct GetAbInvoker : ParallelLoopBody { - enum - { - UTSIZE = 27 - }; - GetAbInvoker(ABtype& _globalAb, Mutex& _mtx, const Points& _oldPts, const Normals& _oldNrm, const Points& _newPts, const Normals& _newNrm, Affine3f _pose, Intr::Projector _proj, float _sqDistanceThresh, float _minCos) : @@ -141,7 +185,7 @@ struct GetAbInvoker : ParallelLoopBody virtual void operator ()(const Range& range) const override { -#if CV_SIMD128 +#if USE_INTRINSICS CV_Assert(ptype::channels == 4); const size_t utBufferSize = 9; @@ -447,18 +491,20 @@ struct GetAbInvoker : ParallelLoopBody }; -void ICPCPU::getAb(const Points& oldPts, const Normals& oldNrm, const Points& newPts, const Normals& newNrm, - Affine3f pose, int level, Matx66f &A, Vec6f &b) const +template <> +void ICPImpl::getAb(const Mat& oldPts, const Mat& oldNrm, const Mat& newPts, const Mat& newNrm, + cv::Affine3f pose, int level, cv::Matx66f& A, cv::Vec6f& b) const { - ScopeTime st("icp: get ab", false); + CV_TRACE_FUNCTION(); CV_Assert(oldPts.size() == oldNrm.size()); CV_Assert(newPts.size() == newNrm.size()); ABtype sumAB = ABtype::zeros(); - Mutex mutex; - GetAbInvoker invoker(sumAB, mutex, oldPts, oldNrm, newPts, newNrm, pose, + const Points op(oldPts), on(oldNrm); + const Normals np(newPts), nn(newNrm); + GetAbInvoker invoker(sumAB, mutex, op, on, np, nn, pose, intrinsics.scale(level).makeProjector(), distanceThreshold*distanceThreshold, cos(angleThreshold)); Range range(0, newPts.rows); @@ -480,39 +526,133 @@ void ICPCPU::getAb(const Points& oldPts, const Normals& oldNrm, const Points& ne ///////// GPU implementation ///////// -class ICPGPU : public ICP +#ifdef HAVE_OPENCL + +template <> +void ICPImpl::getAb(const UMat& oldPts, const UMat& oldNrm, const UMat& newPts, const UMat& newNrm, + Affine3f pose, int level, Matx66f &A, Vec6f &b) const { -public: - ICPGPU(const cv::kinfu::Intr _intrinsics, const std::vector &_iterations, float _angleThreshold, float _distanceThreshold); + CV_TRACE_FUNCTION(); - virtual bool estimateTransform(cv::Affine3f& transform, cv::Ptr oldFrame, cv::Ptr newFrame) const override; + Size oldSize = oldPts.size(), newSize = newPts.size(); + CV_Assert(oldSize == oldNrm.size()); + CV_Assert(newSize == newNrm.size()); - virtual ~ICPGPU() { } -}; + // calculate 1x7 vector ab to produce b and upper triangle of A: + // [A|b] = ab*(ab^t) + // and then reduce it across work groups -ICPGPU::ICPGPU(const Intr _intrinsics, const std::vector &_iterations, float _angleThreshold, float _distanceThreshold) : - ICP(_intrinsics, _iterations, _angleThreshold, _distanceThreshold) -{ } + cv::String errorStr; + ocl::ProgramSource source = ocl::rgbd::icp_oclsrc; + cv::String options = "-cl-fast-relaxed-math -cl-mad-enable"; + ocl::Kernel k; + k.create("getAb", source, options, &errorStr); + if(k.empty()) + throw std::runtime_error("Failed to create kernel: " + errorStr); -bool ICPGPU::estimateTransform(cv::Affine3f& /*transform*/, cv::Ptr /*_oldFrame*/, cv::Ptr /*newFrame*/) const -{ - throw std::runtime_error("Not implemented"); + size_t globalSize[2]; + globalSize[0] = (size_t)newPts.cols; + globalSize[1] = (size_t)newPts.rows; + + const ocl::Device& device = ocl::Device::getDefault(); + // workaround for Intel's integrated GPU + size_t wgsLimit = device.isIntel() ? 64 : device.maxWorkGroupSize(); + size_t memSize = device.localMemSize(); + // local memory should keep upperTriangles for all threads in group for reduce + const size_t ltsz = UTSIZE*sizeof(float); + const size_t lcols = 32; + size_t lrows = min(memSize/ltsz, wgsLimit)/lcols; + // round lrows down to 2^n + lrows = roundDownPow2(lrows); + size_t localSize[2] = {lcols, lrows}; + Size ngroups((int)divUp(globalSize[0], (unsigned int)localSize[0]), + (int)divUp(globalSize[1], (unsigned int)localSize[1])); + + // size of local buffer for group-wide reduce + size_t lsz = localSize[0]*localSize[1]*ltsz; + + Intr::Projector proj = intrinsics.scale(level).makeProjector(); + Vec2f fxy(proj.fx, proj.fy), cxy(proj.cx, proj.cy); + + UMat& groupedSumGpu = groupedSumBuffers[level]; + groupedSumGpu.create(Size(ngroups.width*UTSIZE, ngroups.height), + CV_32F); + groupedSumGpu.setTo(0); + + // TODO: optimization possible: + // samplers instead of oldPts/oldNrm (mask needed) + k.args(ocl::KernelArg::ReadOnlyNoSize(oldPts), + ocl::KernelArg::ReadOnlyNoSize(oldNrm), + oldSize, + ocl::KernelArg::ReadOnlyNoSize(newPts), + ocl::KernelArg::ReadOnlyNoSize(newNrm), + newSize, + ocl::KernelArg::Constant(pose.matrix.val, + sizeof(pose.matrix.val)), + fxy.val, cxy.val, + distanceThreshold*distanceThreshold, + cos(angleThreshold), + //TODO: replace by KernelArg::Local(lsz) + ocl::KernelArg(ocl::KernelArg::LOCAL, 0, 1, 1, 0, lsz), + ocl::KernelArg::WriteOnlyNoSize(groupedSumGpu) + ); + + if(!k.run(2, globalSize, localSize, true)) + throw std::runtime_error("Failed to run kernel"); + + float upperTriangle[UTSIZE]; + for(int i = 0; i < UTSIZE; i++) + upperTriangle[i] = 0; + + Mat groupedSumCpu = groupedSumGpu.getMat(ACCESS_READ); + + for(int y = 0; y < ngroups.height; y++) + { + const float* rowr = groupedSumCpu.ptr(y); + for(int x = 0; x < ngroups.width; x++) + { + const float* p = rowr + x*UTSIZE; + for(int j = 0; j < UTSIZE; j++) + { + upperTriangle[j] += p[j]; + } + } + } + groupedSumCpu.release(); + + ABtype sumAB = ABtype::zeros(); + int pos = 0; + for(int i = 0; i < 6; i++) + { + for(int j = i; j < 7; j++) + { + sumAB(i, j) = upperTriangle[pos++]; + } + } + + // splitting AB matrix to A and b + for(int i = 0; i < 6; i++) + { + // augment lower triangle of A by symmetry + for(int j = i; j < 6; j++) + { + A(i, j) = A(j, i) = sumAB(i, j); + } + + b(i) = sumAB(i, 6); + } } -cv::Ptr makeICP(cv::kinfu::Params::PlatformType t, - const cv::kinfu::Intr _intrinsics, const std::vector &_iterations, +#endif + +/// + + +cv::Ptr makeICP(const cv::kinfu::Intr _intrinsics, const std::vector &_iterations, float _angleThreshold, float _distanceThreshold) { - switch (t) - { - case cv::kinfu::Params::PlatformType::PLATFORM_CPU: - return cv::makePtr(_intrinsics, _iterations, _angleThreshold, _distanceThreshold); - case cv::kinfu::Params::PlatformType::PLATFORM_GPU: - return cv::makePtr(_intrinsics, _iterations, _angleThreshold, _distanceThreshold); - default: - return cv::Ptr(); - } + return makePtr(_intrinsics, _iterations, _angleThreshold, _distanceThreshold); } } // namespace kinfu diff --git a/modules/rgbd/src/fast_icp.hpp b/modules/rgbd/src/fast_icp.hpp index d036e9074..7a9f06940 100644 --- a/modules/rgbd/src/fast_icp.hpp +++ b/modules/rgbd/src/fast_icp.hpp @@ -18,7 +18,10 @@ class ICP public: ICP(const cv::kinfu::Intr _intrinsics, const std::vector &_iterations, float _angleThreshold, float _distanceThreshold); - virtual bool estimateTransform(cv::Affine3f& transform, cv::Ptr oldFrame, cv::Ptr newFrame) const = 0; + virtual bool estimateTransform(cv::Affine3f& transform, + InputArray oldPoints, InputArray oldNormals, + InputArray newPoints, InputArray newNormals + ) const = 0; virtual ~ICP() { } @@ -30,8 +33,7 @@ protected: cv::kinfu::Intr intrinsics; }; -cv::Ptr makeICP(cv::kinfu::Params::PlatformType t, - const cv::kinfu::Intr _intrinsics, const std::vector &_iterations, +cv::Ptr makeICP(const cv::kinfu::Intr _intrinsics, const std::vector &_iterations, float _angleThreshold, float _distanceThreshold); } // namespace kinfu diff --git a/modules/rgbd/src/kinfu.cpp b/modules/rgbd/src/kinfu.cpp index 99fa8257c..1257187c2 100644 --- a/modules/rgbd/src/kinfu.cpp +++ b/modules/rgbd/src/kinfu.cpp @@ -12,45 +12,10 @@ namespace cv { namespace kinfu { -class KinFu::KinFuImpl : public KinFu -{ -public: - KinFuImpl(const Params& _params); - virtual ~KinFuImpl(); - - const Params& getParams() const CV_OVERRIDE; - void setParams(const Params&) CV_OVERRIDE; - - void render(OutputArray image, const Matx44f& cameraPose) const CV_OVERRIDE; - - void getCloud(OutputArray points, OutputArray normals) const CV_OVERRIDE; - void getPoints(OutputArray points) const CV_OVERRIDE; - void getNormals(InputArray points, OutputArray normals) const CV_OVERRIDE; - - void reset() CV_OVERRIDE; - - const Affine3f getPose() const CV_OVERRIDE; - - bool update(InputArray depth) CV_OVERRIDE; - -private: - Params params; - - cv::Ptr frameGenerator; - cv::Ptr icp; - cv::Ptr volume; - - int frameCounter; - Affine3f pose; - cv::Ptr frame; -}; - Ptr Params::defaultParams() { Params p; - p.platform = PLATFORM_CPU; - p.frameSize = Size(640, 480); float fx, fy, cx, cy; @@ -88,12 +53,13 @@ Ptr Params::defaultParams() p.tsdf_min_camera_movement = 0.f; //meters, disabled - p.volumeDims = 512; //number of voxels + p.volumeDims = Vec3i::all(512); //number of voxels - p.volumeSize = 3.f; //meters + float volSize = 3.f; + p.voxelSize = volSize/512.f; //meters // default pose of volume cube - p.volumePose = Affine3f().translate(Vec3f(-p.volumeSize/2, -p.volumeSize/2, 0.5f)); + p.volumePose = Affine3f().translate(Vec3f(-volSize/2.f, -volSize/2.f, 0.5f)); p.tsdf_trunc_dist = 0.04f; //meters; p.tsdf_max_weight = 64; //frames @@ -129,78 +95,156 @@ Ptr Params::coarseParams() } p->pyramidLevels = (int)p->icpIterations.size(); - p->volumeDims = 128; //number of voxels + float volSize = 3.f; + p->volumeDims = Vec3i::all(128); //number of voxels + p->voxelSize = volSize/128.f; p->raycast_step_factor = 0.75f; //in voxel sizes return p; } -KinFu::KinFuImpl::KinFuImpl(const Params &_params) : +// T should be Mat or UMat +template< typename T > +class KinFuImpl : public KinFu +{ +public: + KinFuImpl(const Params& _params); + virtual ~KinFuImpl(); + + const Params& getParams() const CV_OVERRIDE; + + void render(OutputArray image, const Matx44f& cameraPose) const CV_OVERRIDE; + + void getCloud(OutputArray points, OutputArray normals) const CV_OVERRIDE; + void getPoints(OutputArray points) const CV_OVERRIDE; + void getNormals(InputArray points, OutputArray normals) const CV_OVERRIDE; + + void reset() CV_OVERRIDE; + + const Affine3f getPose() const CV_OVERRIDE; + + bool update(InputArray depth) CV_OVERRIDE; + + bool updateT(const T& depth); + +private: + Params params; + + cv::Ptr icp; + cv::Ptr volume; + + int frameCounter; + Affine3f pose; + std::vector pyrPoints; + std::vector pyrNormals; +}; + + +template< typename T > +KinFuImpl::KinFuImpl(const Params &_params) : params(_params), - frameGenerator(makeFrameGenerator(params.platform)), - icp(makeICP(params.platform, params.intr, params.icpIterations, params.icpAngleThresh, params.icpDistThresh)), - volume(makeTSDFVolume(params.platform, params.volumeDims, params.volumeSize, params.volumePose, + icp(makeICP(params.intr, params.icpIterations, params.icpAngleThresh, params.icpDistThresh)), + volume(makeTSDFVolume(params.volumeDims, params.voxelSize, params.volumePose, params.tsdf_trunc_dist, params.tsdf_max_weight, params.raycast_step_factor)), - frame() + pyrPoints(), pyrNormals() { reset(); } -void KinFu::KinFuImpl::reset() +template< typename T > +void KinFuImpl::reset() { frameCounter = 0; pose = Affine3f::Identity(); volume->reset(); } -KinFu::KinFuImpl::~KinFuImpl() -{ +template< typename T > +KinFuImpl::~KinFuImpl() +{ } -} - -const Params& KinFu::KinFuImpl::getParams() const +template< typename T > +const Params& KinFuImpl::getParams() const { return params; } -void KinFu::KinFuImpl::setParams(const Params& p) -{ - params = p; -} - -const Affine3f KinFu::KinFuImpl::getPose() const +template< typename T > +const Affine3f KinFuImpl::getPose() const { return pose; } -bool KinFu::KinFuImpl::update(InputArray _depth) -{ - ScopeTime st("kinfu update"); +template<> +bool KinFuImpl::update(InputArray _depth) +{ CV_Assert(!_depth.empty() && _depth.size() == params.frameSize); - cv::Ptr newFrame = (*frameGenerator)(); - (*frameGenerator)(newFrame, _depth, - params.intr, - params.pyramidLevels, - params.depthFactor, - params.bilateral_sigma_depth, - params.bilateral_sigma_spatial, - params.bilateral_kernel_size); + Mat depth; + if(_depth.isUMat()) + { + _depth.copyTo(depth); + return updateT(depth); + } + else + { + return updateT(_depth.getMat()); + } +} + + +template<> +bool KinFuImpl::update(InputArray _depth) +{ + CV_Assert(!_depth.empty() && _depth.size() == params.frameSize); + + UMat depth; + if(!_depth.isUMat()) + { + _depth.copyTo(depth); + return updateT(depth); + } + else + { + return updateT(_depth.getUMat()); + } +} + + +template< typename T > +bool KinFuImpl::updateT(const T& _depth) +{ + CV_TRACE_FUNCTION(); + + T depth; + if(_depth.type() != DEPTH_TYPE) + _depth.convertTo(depth, DEPTH_TYPE); + else + depth = _depth; + + std::vector newPoints, newNormals; + makeFrameFromDepth(depth, newPoints, newNormals, params.intr, + params.pyramidLevels, + params.depthFactor, + params.bilateral_sigma_depth, + params.bilateral_sigma_spatial, + params.bilateral_kernel_size); if(frameCounter == 0) { // use depth instead of distance - volume->integrate(newFrame, params.depthFactor, pose, params.intr); + volume->integrate(depth, params.depthFactor, pose, params.intr); - frame = newFrame; + pyrPoints = newPoints; + pyrNormals = newNormals; } else { Affine3f affine; - bool success = icp->estimateTransform(affine, frame, newFrame); + bool success = icp->estimateTransform(affine, pyrPoints, pyrNormals, newPoints, newNormals); if(!success) return false; @@ -212,12 +256,15 @@ bool KinFu::KinFuImpl::update(InputArray _depth) if((rnorm + tnorm)/2 >= params.tsdf_min_camera_movement) { // use depth instead of distance - volume->integrate(newFrame, params.depthFactor, pose, params.intr); + volume->integrate(depth, params.depthFactor, pose, params.intr); } - // raycast and build a pyramid of points and normals - volume->raycast(pose, params.intr, params.frameSize, - params.pyramidLevels, frameGenerator, frame); + T& points = pyrPoints [0]; + T& normals = pyrNormals[0]; + volume->raycast(pose, params.intr, params.frameSize, points, normals); + // build a pyramid of points and normals + buildPyramidPointsNormals(points, normals, pyrPoints, pyrNormals, + params.pyramidLevels); } frameCounter++; @@ -225,9 +272,10 @@ bool KinFu::KinFuImpl::update(InputArray _depth) } -void KinFu::KinFuImpl::render(OutputArray image, const Matx44f& _cameraPose) const +template< typename T > +void KinFuImpl::render(OutputArray image, const Matx44f& _cameraPose) const { - ScopeTime st("kinfu render"); + CV_TRACE_FUNCTION(); Affine3f cameraPose(_cameraPose); @@ -235,41 +283,59 @@ void KinFu::KinFuImpl::render(OutputArray image, const Matx44f& _cameraPose) con if((cameraPose.rotation() == pose.rotation() && cameraPose.translation() == pose.translation()) || (cameraPose.rotation() == id.rotation() && cameraPose.translation() == id.translation())) { - frame->render(image, 0, params.lightPose); + renderPointsNormals(pyrPoints[0], pyrNormals[0], image, params.lightPose); } else { - // raycast and build a pyramid of points and normals - cv::Ptr f = (*frameGenerator)(); - volume->raycast(cameraPose, params.intr, params.frameSize, - params.pyramidLevels, frameGenerator, f); - f->render(image, 0, params.lightPose); + T points, normals; + volume->raycast(cameraPose, params.intr, params.frameSize, points, normals); + renderPointsNormals(points, normals, image, params.lightPose); } } -void KinFu::KinFuImpl::getCloud(OutputArray p, OutputArray n) const +template< typename T > +void KinFuImpl::getCloud(OutputArray p, OutputArray n) const { volume->fetchPointsNormals(p, n); } -void KinFu::KinFuImpl::getPoints(OutputArray points) const + +template< typename T > +void KinFuImpl::getPoints(OutputArray points) const { volume->fetchPointsNormals(points, noArray()); } -void KinFu::KinFuImpl::getNormals(InputArray points, OutputArray normals) const + +template< typename T > +void KinFuImpl::getNormals(InputArray points, OutputArray normals) const { volume->fetchNormals(points, normals); } // importing class -Ptr KinFu::create(const Ptr& _params) +#ifdef OPENCV_ENABLE_NONFREE + +Ptr KinFu::create(const Ptr& params) { - return makePtr(*_params); +#ifdef HAVE_OPENCL + if(cv::ocl::isOpenCLActivated()) + return makePtr< KinFuImpl >(*params); +#endif + return makePtr< KinFuImpl >(*params); } +#else +Ptr KinFu::create(const Ptr& /*params*/) +{ + CV_Error(Error::StsNotImplemented, + "This algorithm is patented and is excluded in this configuration; " + "Set OPENCV_ENABLE_NONFREE CMake option and rebuild the library"); +} +#endif + KinFu::~KinFu() {} } // namespace kinfu diff --git a/modules/rgbd/src/kinfu_frame.cpp b/modules/rgbd/src/kinfu_frame.cpp index eef81b3cc..2fd7225f4 100644 --- a/modules/rgbd/src/kinfu_frame.cpp +++ b/modules/rgbd/src/kinfu_frame.cpp @@ -10,98 +10,9 @@ namespace cv { namespace kinfu { -struct FrameGeneratorCPU : FrameGenerator -{ -public: - virtual cv::Ptr operator ()() const override; - virtual void operator() (Ptr _frame, InputArray depth, const kinfu::Intr, int levels, float depthFactor, - float sigmaDepth, float sigmaSpatial, int kernelSize) const override; - virtual void operator() (Ptr _frame, InputArray points, InputArray normals, int levels) const override; - - virtual ~FrameGeneratorCPU() {} -}; - -void computePointsNormals(const cv::kinfu::Intr, float depthFactor, const Depth, Points, Normals ); -Depth pyrDownBilateral(const Depth depth, float sigma); -void pyrDownPointsNormals(const Points p, const Normals n, Points& pdown, Normals& ndown); - -cv::Ptr FrameGeneratorCPU::operator ()() const -{ - return makePtr(); -} - -void FrameGeneratorCPU::operator ()(Ptr _frame, InputArray depth, const Intr intr, int levels, float depthFactor, - float sigmaDepth, float sigmaSpatial, int kernelSize) const -{ - ScopeTime st("frameGenerator: from depth"); - - Ptr frame = _frame.dynamicCast(); - - CV_Assert(frame); - - //CV_Assert(depth.type() == CV_16S); - // this should convert CV_16S to CV_32F - frame->depthData = Depth(depth.getMat()); - - // looks like OpenCV's bilateral filter works the same as KinFu's - Depth smooth; - bilateralFilter(frame->depthData, smooth, kernelSize, sigmaDepth*depthFactor, sigmaSpatial); - - // depth truncation is not used by default - //if (p.icp_truncate_depth_dist > 0) kfusion::cuda::depthTruncation(curr_.depth_pyr[0], p.icp_truncate_depth_dist); - - // we don't need depth pyramid outside this method - // if we do, the code is to be refactored - - Depth scaled = smooth; - Size sz = smooth.size(); - frame->points.resize(levels); - frame->normals.resize(levels); - for(int i = 0; i < levels; i++) - { - Points p = frame->points[i]; - Normals n = frame->normals[i]; - p.create(sz); n.create(sz); - - computePointsNormals(intr.scale(i), depthFactor, scaled, p, n); - - frame->points[i] = p; - frame->normals[i] = n; - - if(i < levels - 1) - { - sz.width /= 2; sz.height /= 2; - scaled = pyrDownBilateral(scaled, sigmaDepth*depthFactor); - } - } -} - -void FrameGeneratorCPU::operator ()(Ptr _frame, InputArray _points, InputArray _normals, int levels) const -{ - ScopeTime st("frameGenerator: pyrDown p, n"); - - CV_Assert( _points.type() == DataType::type); - CV_Assert(_normals.type() == DataType::type); - - Ptr frame = _frame.dynamicCast(); - - CV_Assert(frame); - - frame->depthData = Depth(); - frame->points.resize(levels); - frame->normals.resize(levels); - frame->points[0] = _points.getMat(); - frame->normals[0] = _normals.getMat(); - Size sz = _points.size(); - for(int i = 1; i < levels; i++) - { - sz.width /= 2; sz.height /= 2; - frame->points[i].create(sz); - frame->normals[i].create(sz); - pyrDownPointsNormals(frame->points[i-1], frame->normals[i-1], - frame->points[i ], frame->normals[i ]); - } -} +static void computePointsNormals(const cv::kinfu::Intr, float depthFactor, const Depth, Points, Normals ); +static Depth pyrDownBilateral(const Depth depth, float sigma); +static void pyrDownPointsNormals(const Points p, const Normals n, Points& pdown, Normals& ndown); template inline float specPow(float x) @@ -132,7 +43,7 @@ inline float specPow<1>(float x) struct RenderInvoker : ParallelLoopBody { - RenderInvoker(const Points& _points, const Normals& _normals, Mat_& _img, Affine3f _lightPose, Size _sz) : + RenderInvoker(const Points& _points, const Normals& _normals, Mat_& _img, Affine3f _lightPose, Size _sz) : ParallelLoopBody(), points(_points), normals(_normals), @@ -145,7 +56,7 @@ struct RenderInvoker : ParallelLoopBody { for(int y = range.start; y < range.end; y++) { - Vec3b* imgRow = img[y]; + Vec4b* imgRow = img[y]; const ptype* ptsRow = points[y]; const ptype* nrmRow = normals[y]; @@ -154,11 +65,11 @@ struct RenderInvoker : ParallelLoopBody Point3f p = fromPtype(ptsRow[x]); Point3f n = fromPtype(nrmRow[x]); - Vec3b color; + Vec4b color; if(isNaN(p)) { - color = Vec3b(0, 32, 0); + color = Vec4b(0, 32, 0, 0); } else { @@ -178,7 +89,7 @@ struct RenderInvoker : ParallelLoopBody uchar ix = (uchar)((Ax*Ka*Dx + Lx*Kd*Dx*max(0.f, n.dot(l)) + Lx*Ks*Sx*specPow(max(0.f, r.dot(v))))*255.f); - color = Vec3b(ix, ix, ix); + color = Vec4b(ix, ix, ix, 0); } imgRow[x] = color; @@ -188,38 +99,16 @@ struct RenderInvoker : ParallelLoopBody const Points& points; const Normals& normals; - Mat_& img; + Mat_& img; Affine3f lightPose; Size sz; }; -void FrameCPU::render(OutputArray image, int level, Affine3f lightPose) const -{ - ScopeTime st("frame render"); - - CV_Assert(level < (int)points.size()); - CV_Assert(level < (int)normals.size()); - - Size sz = points[level].size(); - image.create(sz, CV_8UC3); - Mat_ img = image.getMat(); - - RenderInvoker ri(points[level], normals[level], img, lightPose, sz); - Range range(0, sz.height); - const int nstripes = -1; - parallel_for_(range, ri, nstripes); -} - - -void FrameCPU::getDepth(OutputArray _depth) const -{ - CV_Assert(!depthData.empty()); - _depth.assign(depthData); -} - void pyrDownPointsNormals(const Points p, const Normals n, Points &pdown, Normals &ndown) { + CV_TRACE_FUNCTION(); + for(int y = 0; y < pdown.rows; y++) { ptype* ptsRow = pdown[y]; @@ -310,6 +199,8 @@ struct PyrDownBilateralInvoker : ParallelLoopBody Depth pyrDownBilateral(const Depth depth, float sigma) { + CV_TRACE_FUNCTION(); + Depth depthDown(depth.rows/2, depth.cols/2); PyrDownBilateralInvoker pdi(depth, depthDown, sigma); @@ -386,6 +277,8 @@ struct ComputePointsNormalsInvoker : ParallelLoopBody void computePointsNormals(const Intr intr, float depthFactor, const Depth depth, Points points, Normals normals) { + CV_TRACE_FUNCTION(); + CV_Assert(!points.empty() && !normals.empty()); CV_Assert(depth.size() == points.size()); CV_Assert(depth.size() == normals.size()); @@ -405,53 +298,399 @@ void computePointsNormals(const Intr intr, float depthFactor, const Depth depth, ///////// GPU implementation ///////// -struct FrameGeneratorGPU : FrameGenerator -{ -public: - virtual cv::Ptr operator ()() const override; - virtual void operator() (Ptr frame, InputArray depth, const kinfu::Intr, int levels, float depthFactor, - float sigmaDepth, float sigmaSpatial, int kernelSize) const override; - virtual void operator() (Ptr frame, InputArray points, InputArray normals, int levels) const override; +#ifdef HAVE_OPENCL - virtual ~FrameGeneratorGPU() {} -}; +static bool ocl_renderPointsNormals(const UMat points, const UMat normals, UMat image, Affine3f lightPose); +static bool ocl_makeFrameFromDepth(const UMat depth, OutputArrayOfArrays points, OutputArrayOfArrays normals, + const Intr intr, int levels, float depthFactor, + float sigmaDepth, float sigmaSpatial, int kernelSize); +static bool ocl_buildPyramidPointsNormals(const UMat points, const UMat normals, + OutputArrayOfArrays pyrPoints, OutputArrayOfArrays pyrNormals, + int levels); -cv::Ptr FrameGeneratorGPU::operator ()() const +static bool computePointsNormalsGpu(const Intr intr, float depthFactor, const UMat& depth, UMat& points, UMat& normals); +static bool pyrDownBilateralGpu(const UMat& depth, UMat& depthDown, float sigma); +static bool customBilateralFilterGpu(const UMat src, UMat& dst, int kernelSize, float sigmaDepth, float sigmaSpatial); +static bool pyrDownPointsNormalsGpu(const UMat p, const UMat n, UMat &pdown, UMat &ndown); + + +bool computePointsNormalsGpu(const Intr intr, float depthFactor, const UMat& depth, + UMat& points, UMat& normals) { - return makePtr(); + CV_TRACE_FUNCTION(); + + CV_Assert(!points.empty() && !normals.empty()); + CV_Assert(depth.size() == points.size()); + CV_Assert(depth.size() == normals.size()); + CV_Assert(depth.type() == DEPTH_TYPE); + CV_Assert(points.type() == POINT_TYPE); + CV_Assert(normals.type() == POINT_TYPE); + + // conversion to meters + float dfac = 1.f/depthFactor; + + Intr::Reprojector reproj = intr.makeReprojector(); + + cv::String errorStr; + cv::String name = "computePointsNormals"; + ocl::ProgramSource source = ocl::rgbd::kinfu_frame_oclsrc; + cv::String options = "-cl-fast-relaxed-math -cl-mad-enable"; + ocl::Kernel k; + k.create(name.c_str(), source, options, &errorStr); + + if(k.empty()) + return false; + + Vec2f fxyinv(reproj.fxinv, reproj.fyinv), cxy(reproj.cx, reproj.cy); + + k.args(ocl::KernelArg::WriteOnlyNoSize(points), + ocl::KernelArg::WriteOnlyNoSize(normals), + ocl::KernelArg::ReadOnly(depth), + fxyinv.val, + cxy.val, + dfac); + + size_t globalSize[2]; + globalSize[0] = (size_t)depth.cols; + globalSize[1] = (size_t)depth.rows; + + return k.run(2, globalSize, NULL, true); } -void FrameGeneratorGPU::operator ()(Ptr /*frame*/, InputArray /*depth*/, const Intr /*intr*/, int /*levels*/, float /*depthFactor*/, - float /*sigmaDepth*/, float /*sigmaSpatial*/, int /*kernelSize*/) const + +bool pyrDownBilateralGpu(const UMat& depth, UMat& depthDown, float sigma) { - throw std::runtime_error("Not implemented"); + CV_TRACE_FUNCTION(); + + depthDown.create(depth.rows/2, depth.cols/2, DEPTH_TYPE); + + cv::String errorStr; + cv::String name = "pyrDownBilateral"; + ocl::ProgramSource source = ocl::rgbd::kinfu_frame_oclsrc; + cv::String options = "-cl-fast-relaxed-math -cl-mad-enable"; + ocl::Kernel k; + k.create(name.c_str(), source, options, &errorStr); + + if(k.empty()) + return false; + + k.args(ocl::KernelArg::ReadOnly(depth), + ocl::KernelArg::WriteOnly(depthDown), + sigma); + + size_t globalSize[2]; + globalSize[0] = (size_t)depthDown.cols; + globalSize[1] = (size_t)depthDown.rows; + + return k.run(2, globalSize, NULL, true); } -void FrameGeneratorGPU::operator ()(Ptr /*frame*/, InputArray /*_points*/, InputArray /*_normals*/, int /*levels*/) const +//TODO: remove it when OpenCV's bilateral processes 32f on GPU +bool customBilateralFilterGpu(const UMat src /* udepth */, UMat& dst /* smooth */, + int kernelSize, float sigmaDepth, float sigmaSpatial) { - throw std::runtime_error("Not implemented"); + CV_TRACE_FUNCTION(); + + Size frameSize = src.size(); + + CV_Assert(frameSize.area() > 0); + CV_Assert(src.type() == DEPTH_TYPE); + + dst.create(frameSize, DEPTH_TYPE); + + cv::String errorStr; + cv::String name = "customBilateral"; + ocl::ProgramSource source = ocl::rgbd::kinfu_frame_oclsrc; + cv::String options = "-cl-fast-relaxed-math -cl-mad-enable"; + ocl::Kernel k; + k.create(name.c_str(), source, options, &errorStr); + + if(k.empty()) + return false; + + k.args(ocl::KernelArg::ReadOnlyNoSize(src), + ocl::KernelArg::WriteOnlyNoSize(dst), + frameSize, + kernelSize, + 0.5f / (sigmaSpatial * sigmaSpatial), + 0.5f / (sigmaDepth * sigmaDepth)); + + size_t globalSize[2]; + globalSize[0] = (size_t)src.cols; + globalSize[1] = (size_t)src.rows; + + return k.run(2, globalSize, NULL, true); } -void FrameGPU::render(OutputArray /* image */, int /*level*/, Affine3f /*lightPose*/) const + +bool pyrDownPointsNormalsGpu(const UMat p, const UMat n, UMat &pdown, UMat &ndown) { - throw std::runtime_error("Not implemented"); + CV_TRACE_FUNCTION(); + + cv::String errorStr; + cv::String name = "pyrDownPointsNormals"; + ocl::ProgramSource source = ocl::rgbd::kinfu_frame_oclsrc; + cv::String options = "-cl-fast-relaxed-math -cl-mad-enable"; + ocl::Kernel k; + k.create(name.c_str(), source, options, &errorStr); + + if(k.empty()) + return false; + + Size downSize = pdown.size(); + + k.args(ocl::KernelArg::ReadOnlyNoSize(p), + ocl::KernelArg::ReadOnlyNoSize(n), + ocl::KernelArg::WriteOnlyNoSize(pdown), + ocl::KernelArg::WriteOnlyNoSize(ndown), + downSize); + + size_t globalSize[2]; + globalSize[0] = (size_t)pdown.cols; + globalSize[1] = (size_t)pdown.rows; + + return k.run(2, globalSize, NULL, true); } -void FrameGPU::getDepth(OutputArray /* depth */) const + +static bool ocl_renderPointsNormals(const UMat points, const UMat normals, + UMat img, Affine3f lightPose) { - throw std::runtime_error("Not implemented"); + CV_TRACE_FUNCTION(); + + cv::String errorStr; + cv::String name = "render"; + ocl::ProgramSource source = ocl::rgbd::kinfu_frame_oclsrc; + cv::String options = "-cl-fast-relaxed-math -cl-mad-enable"; + ocl::Kernel k; + k.create(name.c_str(), source, options, &errorStr); + + if(k.empty()) + return false; + + Vec4f lightPt(lightPose.translation()[0], + lightPose.translation()[1], + lightPose.translation()[2]); + Size frameSize = points.size(); + + k.args(ocl::KernelArg::ReadOnlyNoSize(points), + ocl::KernelArg::ReadOnlyNoSize(normals), + ocl::KernelArg::WriteOnlyNoSize(img), + frameSize, + lightPt.val); + + size_t globalSize[2]; + globalSize[0] = (size_t)points.cols; + globalSize[1] = (size_t)points.rows; + + return k.run(2, globalSize, NULL, true); } -cv::Ptr makeFrameGenerator(cv::kinfu::Params::PlatformType t) + +static bool ocl_makeFrameFromDepth(const UMat depth, OutputArrayOfArrays points, OutputArrayOfArrays normals, + const Intr intr, int levels, float depthFactor, + float sigmaDepth, float sigmaSpatial, int kernelSize) { - switch (t) + CV_TRACE_FUNCTION(); + + // looks like OpenCV's bilateral filter works the same as KinFu's + UMat smooth; + //TODO: fix that + // until 32f isn't implemented in OpenCV, we should use our workarounds + //bilateralFilter(udepth, smooth, kernelSize, sigmaDepth*depthFactor, sigmaSpatial); + if(!customBilateralFilterGpu(depth, smooth, kernelSize, sigmaDepth*depthFactor, sigmaSpatial)) + return false; + + // depth truncation is not used by default + //if (p.icp_truncate_depth_dist > 0) kfusion::cuda::depthTruncation(curr_.depth_pyr[0], p.icp_truncate_depth_dist); + + UMat scaled = smooth; + Size sz = smooth.size(); + points.create(levels, 1, POINT_TYPE); + normals.create(levels, 1, POINT_TYPE); + for(int i = 0; i < levels; i++) { - case cv::kinfu::Params::PlatformType::PLATFORM_CPU: - return cv::makePtr(); - case cv::kinfu::Params::PlatformType::PLATFORM_GPU: - return cv::makePtr(); - default: - return cv::Ptr(); + UMat& p = points.getUMatRef(i); + UMat& n = normals.getUMatRef(i); + p.create(sz, POINT_TYPE); + n.create(sz, POINT_TYPE); + + if(!computePointsNormalsGpu(intr.scale(i), depthFactor, scaled, p, n)) + return false; + + if(i < levels - 1) + { + sz.width /= 2, sz.height /= 2; + UMat halfDepth(sz, DEPTH_TYPE); + pyrDownBilateralGpu(scaled, halfDepth, sigmaDepth*depthFactor); + scaled = halfDepth; + } + } + + return true; +} + + +static bool ocl_buildPyramidPointsNormals(const UMat points, const UMat normals, + OutputArrayOfArrays pyrPoints, OutputArrayOfArrays pyrNormals, + int levels) +{ + CV_TRACE_FUNCTION(); + + pyrPoints .create(levels, 1, POINT_TYPE); + pyrNormals.create(levels, 1, POINT_TYPE); + + pyrPoints .getUMatRef(0) = points; + pyrNormals.getUMatRef(0) = normals; + + Size sz = points.size(); + for(int i = 1; i < levels; i++) + { + UMat p1 = pyrPoints .getUMat(i-1); + UMat n1 = pyrNormals.getUMat(i-1); + + sz.width /= 2; sz.height /= 2; + UMat& p0 = pyrPoints .getUMatRef(i); + UMat& n0 = pyrNormals.getUMatRef(i); + p0.create(sz, POINT_TYPE); + n0.create(sz, POINT_TYPE); + + if(!pyrDownPointsNormalsGpu(p1, n1, p0, n0)) + return false; + } + + return true; +} + +#endif + + +void renderPointsNormals(InputArray _points, InputArray _normals, OutputArray image, Affine3f lightPose) +{ + CV_TRACE_FUNCTION(); + + CV_Assert(_points.size().area() > 0); + CV_Assert(_points.size() == _normals.size()); + + Size sz = _points.size(); + image.create(sz, CV_8UC4); + + CV_OCL_RUN(_points.isUMat() && _normals.isUMat() && image.isUMat(), + ocl_renderPointsNormals(_points.getUMat(), + _normals.getUMat(), + image.getUMat(), lightPose)) + + Points points = _points.getMat(); + Normals normals = _normals.getMat(); + + Mat_ img = image.getMat(); + + RenderInvoker ri(points, normals, img, lightPose, sz); + Range range(0, sz.height); + const int nstripes = -1; + parallel_for_(range, ri, nstripes); +} + + +void makeFrameFromDepth(InputArray _depth, + OutputArray pyrPoints, OutputArray pyrNormals, + const Intr intr, int levels, float depthFactor, + float sigmaDepth, float sigmaSpatial, int kernelSize) +{ + CV_TRACE_FUNCTION(); + + CV_Assert(_depth.type() == DEPTH_TYPE); + + CV_OCL_RUN(_depth.isUMat() && pyrPoints.isUMatVector() && pyrNormals.isUMatVector(), + ocl_makeFrameFromDepth(_depth.getUMat(), pyrPoints, pyrNormals, + intr, levels, depthFactor, + sigmaDepth, sigmaSpatial, kernelSize)); + + int kp = pyrPoints.kind(), kn = pyrNormals.kind(); + CV_Assert(kp == _InputArray::STD_ARRAY_MAT || kp == _InputArray::STD_VECTOR_MAT); + CV_Assert(kn == _InputArray::STD_ARRAY_MAT || kn == _InputArray::STD_VECTOR_MAT); + + Depth depth = _depth.getMat(); + + // looks like OpenCV's bilateral filter works the same as KinFu's + Depth smooth; + + //TODO: remove it when OpenCV's bilateral works properly + patchNaNs(depth); + + bilateralFilter(depth, smooth, kernelSize, sigmaDepth*depthFactor, sigmaSpatial); + + // depth truncation is not used by default + //if (p.icp_truncate_depth_dist > 0) kfusion::cuda::depthTruncation(curr_.depth_pyr[0], p.icp_truncate_depth_dist); + + // we don't need depth pyramid outside this method + // if we do, the code is to be refactored + + Depth scaled = smooth; + Size sz = smooth.size(); + pyrPoints.create(levels, 1, POINT_TYPE); + pyrNormals.create(levels, 1, POINT_TYPE); + for(int i = 0; i < levels; i++) + { + pyrPoints .create(sz, POINT_TYPE, i); + pyrNormals.create(sz, POINT_TYPE, i); + + Points p = pyrPoints. getMatRef(i); + Normals n = pyrNormals.getMatRef(i); + + computePointsNormals(intr.scale(i), depthFactor, scaled, p, n); + + if(i < levels - 1) + { + sz.width /= 2; sz.height /= 2; + scaled = pyrDownBilateral(scaled, sigmaDepth*depthFactor); + } + } +} + + +void buildPyramidPointsNormals(InputArray _points, InputArray _normals, + OutputArrayOfArrays pyrPoints, OutputArrayOfArrays pyrNormals, + int levels) +{ + CV_TRACE_FUNCTION(); + + CV_Assert(_points.type() == POINT_TYPE); + CV_Assert(_points.type() == _normals.type()); + CV_Assert(_points.size() == _normals.size()); + + CV_OCL_RUN(_points.isUMat() && _normals.isUMat() && + pyrPoints.isUMatVector() && pyrNormals.isUMatVector(), + ocl_buildPyramidPointsNormals(_points.getUMat(), _normals.getUMat(), + pyrPoints, pyrNormals, + levels)); + + int kp = pyrPoints.kind(), kn = pyrNormals.kind(); + CV_Assert(kp == _InputArray::STD_ARRAY_MAT || kp == _InputArray::STD_VECTOR_MAT); + CV_Assert(kn == _InputArray::STD_ARRAY_MAT || kn == _InputArray::STD_VECTOR_MAT); + + Mat p0 = _points.getMat(), n0 = _normals.getMat(); + + pyrPoints .create(levels, 1, POINT_TYPE); + pyrNormals.create(levels, 1, POINT_TYPE); + + pyrPoints .getMatRef(0) = p0; + pyrNormals.getMatRef(0) = n0; + + Size sz = _points.size(); + for(int i = 1; i < levels; i++) + { + Points p1 = pyrPoints .getMat(i-1); + Normals n1 = pyrNormals.getMat(i-1); + + sz.width /= 2; sz.height /= 2; + + pyrPoints .create(sz, POINT_TYPE, i); + pyrNormals.create(sz, POINT_TYPE, i); + Points pd = pyrPoints. getMatRef(i); + Normals nd = pyrNormals.getMatRef(i); + + pyrDownPointsNormals(p1, n1, pd, nd); } } diff --git a/modules/rgbd/src/kinfu_frame.hpp b/modules/rgbd/src/kinfu_frame.hpp index 02c281d2a..f9237ffe0 100644 --- a/modules/rgbd/src/kinfu_frame.hpp +++ b/modules/rgbd/src/kinfu_frame.hpp @@ -70,52 +70,24 @@ inline ptype toPtype(const cv::Vec3f& x) return ptype(x[0], x[1], x[2], 0); } +enum +{ + DEPTH_TYPE = DataType::type, + POINT_TYPE = DataType::type +}; + typedef cv::Mat_< ptype > Points; typedef Points Normals; typedef cv::Mat_< depthType > Depth; -struct Frame -{ -public: - virtual void render(cv::OutputArray image, int level, cv::Affine3f lightPose) const = 0; - virtual void getDepth(cv::OutputArray depth) const = 0; - virtual ~Frame() { } -}; - -struct FrameCPU : Frame -{ -public: - FrameCPU() : points(), normals() { } - virtual ~FrameCPU() { } - - virtual void render(cv::OutputArray image, int level, cv::Affine3f lightPose) const override; - virtual void getDepth(cv::OutputArray depth) const override; - - std::vector points; - std::vector normals; - Depth depthData; -}; - -struct FrameGPU : Frame -{ -public: - virtual void render(cv::OutputArray image, int level, cv::Affine3f lightPose) const override; - virtual void getDepth(cv::OutputArray depth) const override; - virtual ~FrameGPU() { } -}; - -struct FrameGenerator -{ -public: - virtual cv::Ptr operator ()() const = 0; - virtual void operator() (cv::Ptr frame, cv::InputArray depth, const cv::kinfu::Intr, int levels, float depthFactor, - float sigmaDepth, float sigmaSpatial, int kernelSize) const = 0; - virtual void operator() (cv::Ptr frame, cv::InputArray points, cv::InputArray normals, int levels) const = 0; - virtual ~FrameGenerator() {} -}; - -cv::Ptr makeFrameGenerator(cv::kinfu::Params::PlatformType t); +void renderPointsNormals(InputArray _points, InputArray _normals, OutputArray image, cv::Affine3f lightPose); +void makeFrameFromDepth(InputArray depth, OutputArray pyrPoints, OutputArray pyrNormals, + const Intr intr, int levels, float depthFactor, + float sigmaDepth, float sigmaSpatial, int kernelSize); +void buildPyramidPointsNormals(InputArray _points, InputArray _normals, + OutputArrayOfArrays pyrPoints, OutputArrayOfArrays pyrNormals, + int levels); } // namespace kinfu } // namespace cv diff --git a/modules/rgbd/src/odometry.cpp b/modules/rgbd/src/odometry.cpp index e6d4d07cd..a81e72ffe 100755 --- a/modules/rgbd/src/odometry.cpp +++ b/modules/rgbd/src/odometry.cpp @@ -1490,21 +1490,10 @@ Size FastICPOdometry::prepareFrameCache(Ptr& frame, int cacheType checkDepth(frame->depth, frame->depth.size()); // mask isn't used by FastICP - - Ptr fg = makeFrameGenerator(Params::PlatformType::PLATFORM_CPU); - Ptr f = (*fg)().dynamicCast(); Intr intr(cameraMatrix); float depthFactor = 1.f; // user should rescale depth manually - (*fg)(f, frame->depth, intr, (int)iterCounts.total(), depthFactor, - sigmaDepth, sigmaSpatial, kernelSize); - - frame->pyramidCloud.clear(); - frame->pyramidNormals.clear(); - for(size_t i = 0; i < f->points.size(); i++) - { - frame->pyramidCloud.push_back(f->points[i]); - frame->pyramidNormals.push_back(f->normals[i]); - } + makeFrameFromDepth(frame->depth, frame->pyramidCloud, frame->pyramidNormals, intr, (int)iterCounts.total(), + depthFactor, sigmaDepth, sigmaSpatial, kernelSize); return frame->depth.size(); } @@ -1526,22 +1515,16 @@ bool FastICPOdometry::computeImpl(const Ptr& srcFrame, { kinfu::Intr intr(cameraMatrix); std::vector iterations = iterCounts; - Ptr icp = kinfu::makeICP(kinfu::Params::PlatformType::PLATFORM_CPU, - intr, + Ptr icp = kinfu::makeICP(intr, iterations, angleThreshold, maxDistDiff); + // KinFu's ICP calculates transformation from new frame to old one (src to dst) Affine3f transform; - Ptr srcF = makePtr(), dstF = makePtr(); - for(size_t i = 0; i < srcFrame->pyramidCloud.size(); i++) - { - srcF-> points.push_back(srcFrame->pyramidCloud [i]); - srcF->normals.push_back(srcFrame->pyramidNormals[i]); - dstF-> points.push_back(dstFrame->pyramidCloud [i]); - dstF->normals.push_back(dstFrame->pyramidNormals[i]); - } - bool result = icp->estimateTransform(transform, dstF, srcF); + bool result = icp->estimateTransform(transform, + dstFrame->pyramidCloud, dstFrame->pyramidNormals, + srcFrame->pyramidCloud, srcFrame->pyramidNormals); Rt.create(Size(4, 4), CV_64FC1); Mat(Matx44d(transform.matrix)).copyTo(Rt.getMat()); diff --git a/modules/rgbd/src/opencl/icp.cl b/modules/rgbd/src/opencl/icp.cl new file mode 100644 index 000000000..2a8448e2c --- /dev/null +++ b/modules/rgbd/src/opencl/icp.cl @@ -0,0 +1,239 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html + +// This code is also subject to the license terms in the LICENSE_KinectFusion.md file found in this module's directory + +#define UTSIZE 27 + +typedef float4 ptype; + +/* + Calculate an upper triangle of Ab matrix then reduce it across workgroup +*/ + +inline void calcAb7(__global const char * oldPointsptr, + int oldPoints_step, int oldPoints_offset, + __global const char * oldNormalsptr, + int oldNormals_step, int oldNormals_offset, + const int2 oldSize, + __global const char * newPointsptr, + int newPoints_step, int newPoints_offset, + __global const char * newNormalsptr, + int newNormals_step, int newNormals_offset, + const int2 newSize, + const float16 poseMatrix, + const float2 fxy, + const float2 cxy, + const float sqDistanceThresh, + const float minCos, + float* ab7 + ) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + + if(x >= newSize.x || y >= newSize.y) + return; + + // coord-independent constants + + const float3 poseRot0 = poseMatrix.s012; + const float3 poseRot1 = poseMatrix.s456; + const float3 poseRot2 = poseMatrix.s89a; + const float3 poseTrans = poseMatrix.s37b; + + const float2 oldEdge = (float2)(oldSize.x - 1, oldSize.y - 1); + + __global const ptype* newPtsRow = (__global const ptype*)(newPointsptr + + newPoints_offset + + y*newPoints_step); + + __global const ptype* newNrmRow = (__global const ptype*)(newNormalsptr + + newNormals_offset + + y*newNormals_step); + + float3 newP = newPtsRow[x].xyz; + float3 newN = newNrmRow[x].xyz; + + if(any(isnan(newP)) || any(isnan(newN))) + return; + + //transform to old coord system + newP = (float3)(dot(newP, poseRot0), + dot(newP, poseRot1), + dot(newP, poseRot2)) + poseTrans; + newN = (float3)(dot(newN, poseRot0), + dot(newN, poseRot1), + dot(newN, poseRot2)); + + //find correspondence by projecting the point + float2 oldCoords = (newP.xy/newP.z)*fxy+cxy; + + if(!(all(oldCoords >= 0.f) && all(oldCoords < oldEdge))) + return; + + // bilinearly interpolate oldPts and oldNrm under oldCoords point + float3 oldP, oldN; + float2 ip = floor(oldCoords); + float2 t = oldCoords - ip; + int xi = ip.x, yi = ip.y; + + __global const ptype* prow0 = (__global const ptype*)(oldPointsptr + + oldPoints_offset + + (yi+0)*oldPoints_step); + __global const ptype* prow1 = (__global const ptype*)(oldPointsptr + + oldPoints_offset + + (yi+1)*oldPoints_step); + float3 p00 = prow0[xi+0].xyz; + float3 p01 = prow0[xi+1].xyz; + float3 p10 = prow1[xi+0].xyz; + float3 p11 = prow1[xi+1].xyz; + + // NaN check is done later + + __global const ptype* nrow0 = (__global const ptype*)(oldNormalsptr + + oldNormals_offset + + (yi+0)*oldNormals_step); + __global const ptype* nrow1 = (__global const ptype*)(oldNormalsptr + + oldNormals_offset + + (yi+1)*oldNormals_step); + + float3 n00 = nrow0[xi+0].xyz; + float3 n01 = nrow0[xi+1].xyz; + float3 n10 = nrow1[xi+0].xyz; + float3 n11 = nrow1[xi+1].xyz; + + // NaN check is done later + + float3 p0 = mix(p00, p01, t.x); + float3 p1 = mix(p10, p11, t.x); + oldP = mix(p0, p1, t.y); + + float3 n0 = mix(n00, n01, t.x); + float3 n1 = mix(n10, n11, t.x); + oldN = mix(n0, n1, t.y); + + if(any(isnan(oldP)) || any(isnan(oldN))) + return; + + //filter by distance + float3 diff = newP - oldP; + if(dot(diff, diff) > sqDistanceThresh) + return; + + //filter by angle + if(fabs(dot(newN, oldN)) < minCos) + return; + + // build point-wise vector ab = [ A | b ] + + float3 VxN = cross(newP, oldN); + float ab[7] = {VxN.x, VxN.y, VxN.z, oldN.x, oldN.y, oldN.z, -dot(oldN, diff)}; + + for(int i = 0; i < 7; i++) + ab7[i] = ab[i]; +} + + +__kernel void getAb(__global const char * oldPointsptr, + int oldPoints_step, int oldPoints_offset, + __global const char * oldNormalsptr, + int oldNormals_step, int oldNormals_offset, + const int2 oldSize, + __global const char * newPointsptr, + int newPoints_step, int newPoints_offset, + __global const char * newNormalsptr, + int newNormals_step, int newNormals_offset, + const int2 newSize, + const float16 poseMatrix, + const float2 fxy, + const float2 cxy, + const float sqDistanceThresh, + const float minCos, + __local float * reducebuf, + __global char* groupedSumptr, + int groupedSum_step, int groupedSum_offset +) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + + if(x >= newSize.x || y >= newSize.y) + return; + + const int gx = get_group_id(0); + const int gy = get_group_id(1); + const int gw = get_num_groups(0); + const int gh = get_num_groups(1); + + const int lx = get_local_id(0); + const int ly = get_local_id(1); + const int lw = get_local_size(0); + const int lh = get_local_size(1); + const int lsz = lw*lh; + const int lid = lx + ly*lw; + + float ab[7]; + for(int i = 0; i < 7; i++) + ab[i] = 0; + + calcAb7(oldPointsptr, + oldPoints_step, oldPoints_offset, + oldNormalsptr, + oldNormals_step, oldNormals_offset, + oldSize, + newPointsptr, + newPoints_step, newPoints_offset, + newNormalsptr, + newNormals_step, newNormals_offset, + newSize, + poseMatrix, + fxy, cxy, + sqDistanceThresh, + minCos, + ab); + + // build point-wise upper-triangle matrix [ab^T * ab] w/o last row + // which is [A^T*A | A^T*b] + // and gather sum + + __local float* upperTriangle = reducebuf + lid*UTSIZE; + + int pos = 0; + for(int i = 0; i < 6; i++) + { + for(int j = i; j < 7; j++) + { + upperTriangle[pos++] = ab[i]*ab[j]; + } + } + + // reduce upperTriangle to local mem + + // maxStep = ctz(lsz), ctz isn't supported on CUDA devices + const int c = clz(lsz & -lsz); + const int maxStep = c ? 31 - c : c; + for(int nstep = 1; nstep <= maxStep; nstep++) + { + if(lid % (1 << nstep) == 0) + { + __local float* rto = reducebuf + UTSIZE*lid; + __local float* rfrom = reducebuf + UTSIZE*(lid+(1 << (nstep-1))); + for(int i = 0; i < UTSIZE; i++) + rto[i] += rfrom[i]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + + // here group sum should be in reducebuf[0...UTSIZE] + if(lid == 0) + { + __global float* groupedRow = (__global float*)(groupedSumptr + + groupedSum_offset + + gy*groupedSum_step); + + for(int i = 0; i < UTSIZE; i++) + groupedRow[gx*UTSIZE + i] = reducebuf[i]; + } +} diff --git a/modules/rgbd/src/opencl/kinfu_frame.cl b/modules/rgbd/src/opencl/kinfu_frame.cl new file mode 100644 index 000000000..07a05dfed --- /dev/null +++ b/modules/rgbd/src/opencl/kinfu_frame.cl @@ -0,0 +1,287 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html + +// This code is also subject to the license terms in the LICENSE_KinectFusion.md file found in this module's directory + +inline float3 reproject(float3 p, float2 fxyinv, float2 cxy) +{ + float2 pp = p.z*(p.xy - cxy)*fxyinv; + return (float3)(pp, p.z); +} + +typedef float4 ptype; + +__kernel void computePointsNormals(__global char * pointsptr, + int points_step, int points_offset, + __global char * normalsptr, + int normals_step, int normals_offset, + __global const char * depthptr, + int depth_step, int depth_offset, + int depth_rows, int depth_cols, + const float2 fxyinv, + const float2 cxy, + const float dfac + ) +{ + int x = get_global_id(0); + int y = get_global_id(1); + + if(x >= depth_cols || y >= depth_rows) + return; + + __global const float* row0 = (__global const float*)(depthptr + depth_offset + + (y+0)*depth_step); + __global const float* row1 = (__global const float*)(depthptr + depth_offset + + (y+1)*depth_step); + + float d00 = row0[x]; + float z00 = d00*dfac; + float3 p00 = (float3)(convert_float2((int2)(x, y)), z00); + float3 v00 = reproject(p00, fxyinv, cxy); + + float3 p = nan((uint)0), n = nan((uint)0); + + if(x < depth_cols - 1 && y < depth_rows - 1) + { + float d01 = row0[x+1]; + float d10 = row1[x]; + + float z01 = d01*dfac; + float z10 = d10*dfac; + + if(z00 != 0 && z01 != 0 && z10 != 0) + { + float3 p01 = (float3)(convert_float2((int2)(x+1, y+0)), z01); + float3 p10 = (float3)(convert_float2((int2)(x+0, y+1)), z10); + float3 v01 = reproject(p01, fxyinv, cxy); + float3 v10 = reproject(p10, fxyinv, cxy); + + float3 vec = cross(v01 - v00, v10 - v00); + n = - normalize(vec); + p = v00; + } + } + + __global float* pts = (__global float*)(pointsptr + points_offset + y*points_step + x*sizeof(ptype)); + __global float* nrm = (__global float*)(normalsptr + normals_offset + y*normals_step + x*sizeof(ptype)); + vstore4((ptype)(p, 0), 0, pts); + vstore4((ptype)(n, 0), 0, nrm); +} + +__kernel void pyrDownBilateral(__global const char * depthptr, + int depth_step, int depth_offset, + int depth_rows, int depth_cols, + __global char * depthDownptr, + int depthDown_step, int depthDown_offset, + int depthDown_rows, int depthDown_cols, + const float sigma + ) +{ + int x = get_global_id(0); + int y = get_global_id(1); + + if(x >= depthDown_cols || y >= depthDown_rows) + return; + + const float sigma3 = sigma*3; + const int D = 5; + + __global const float* srcCenterRow = (__global const float*)(depthptr + depth_offset + + (2*y)*depth_step); + + float center = srcCenterRow[2*x]; + + int sx = max(0, 2*x - D/2), ex = min(2*x - D/2 + D, depth_cols-1); + int sy = max(0, 2*y - D/2), ey = min(2*y - D/2 + D, depth_rows-1); + + float sum = 0; + int count = 0; + + for(int iy = sy; iy < ey; iy++) + { + __global const float* srcRow = (__global const float*)(depthptr + depth_offset + + (iy)*depth_step); + for(int ix = sx; ix < ex; ix++) + { + float val = srcRow[ix]; + if(fabs(val - center) < sigma3) + { + sum += val; count++; + } + } + } + + __global float* downRow = (__global float*)(depthDownptr + depthDown_offset + + y*depthDown_step + x*sizeof(float)); + + *downRow = (count == 0) ? 0 : sum/convert_float(count); +} + +//TODO: remove bilateral when OpenCV performs 32f bilat with OpenCL + +__kernel void customBilateral(__global const char * srcptr, + int src_step, int src_offset, + __global char * dstptr, + int dst_step, int dst_offset, + const int2 frameSize, + const int kernelSize, + const float sigma_spatial2_inv_half, + const float sigma_depth2_inv_half +) +{ + int x = get_global_id(0); + int y = get_global_id(1); + + if(x >= frameSize.x || y >= frameSize.y) + return; + + __global const float* srcCenterRow = (__global const float*)(srcptr + src_offset + + y*src_step); + float value = srcCenterRow[x]; + + int tx = min (x - kernelSize / 2 + kernelSize, frameSize.x - 1); + int ty = min (y - kernelSize / 2 + kernelSize, frameSize.y - 1); + + float sum1 = 0; + float sum2 = 0; + + for (int cy = max (y - kernelSize / 2, 0); cy < ty; ++cy) + { + __global const float* srcRow = (__global const float*)(srcptr + src_offset + + cy*src_step); + for (int cx = max (x - kernelSize / 2, 0); cx < tx; ++cx) + { + float depth = srcRow[cx]; + + float space2 = convert_float((x - cx) * (x - cx) + (y - cy) * (y - cy)); + float color2 = (value - depth) * (value - depth); + + float weight = native_exp (-(space2 * sigma_spatial2_inv_half + + color2 * sigma_depth2_inv_half)); + + sum1 += depth * weight; + sum2 += weight; + } + } + + __global float* dst = (__global float*)(dstptr + dst_offset + + y*dst_step + x*sizeof(float)); + *dst = sum1/sum2; +} + +__kernel void pyrDownPointsNormals(__global const char * pptr, + int p_step, int p_offset, + __global const char * nptr, + int n_step, int n_offset, + __global char * pdownptr, + int pdown_step, int pdown_offset, + __global char * ndownptr, + int ndown_step, int ndown_offset, + const int2 downSize + ) +{ + int x = get_global_id(0); + int y = get_global_id(1); + + if(x >= downSize.x || y >= downSize.y) + return; + + float3 point = nan((uint)0), normal = nan((uint)0); + + __global const ptype* pUpRow0 = (__global const ptype*)(pptr + p_offset + (2*y )*p_step); + __global const ptype* pUpRow1 = (__global const ptype*)(pptr + p_offset + (2*y+1)*p_step); + + float3 d00 = pUpRow0[2*x ].xyz; + float3 d01 = pUpRow0[2*x+1].xyz; + float3 d10 = pUpRow1[2*x ].xyz; + float3 d11 = pUpRow1[2*x+1].xyz; + + if(!(any(isnan(d00)) || any(isnan(d01)) || + any(isnan(d10)) || any(isnan(d11)))) + { + point = (d00 + d01 + d10 + d11)*0.25f; + + __global const ptype* nUpRow0 = (__global const ptype*)(nptr + n_offset + (2*y )*n_step); + __global const ptype* nUpRow1 = (__global const ptype*)(nptr + n_offset + (2*y+1)*n_step); + + float3 n00 = nUpRow0[2*x ].xyz; + float3 n01 = nUpRow0[2*x+1].xyz; + float3 n10 = nUpRow1[2*x ].xyz; + float3 n11 = nUpRow1[2*x+1].xyz; + + normal = (n00 + n01 + n10 + n11)*0.25f; + } + + __global ptype* pts = (__global ptype*)(pdownptr + pdown_offset + y*pdown_step); + __global ptype* nrm = (__global ptype*)(ndownptr + ndown_offset + y*ndown_step); + pts[x] = (ptype)(point, 0); + nrm[x] = (ptype)(normal, 0); +} + +typedef char4 pixelType; + +// 20 is fixed power +float specPow20(float x) +{ + float x2 = x*x; + float x5 = x2*x2*x; + float x10 = x5*x5; + float x20 = x10*x10; + return x20; +} + +__kernel void render(__global const char * pointsptr, + int points_step, int points_offset, + __global const char * normalsptr, + int normals_step, int normals_offset, + __global char * imgptr, + int img_step, int img_offset, + const int2 frameSize, + const float4 lightPt + ) +{ + int x = get_global_id(0); + int y = get_global_id(1); + + if(x >= frameSize.x || y >= frameSize.y) + return; + + __global const ptype* ptsRow = (__global const ptype*)(pointsptr + points_offset + y*points_step + x*sizeof(ptype)); + __global const ptype* nrmRow = (__global const ptype*)(normalsptr + normals_offset + y*normals_step + x*sizeof(ptype)); + + float3 p = (*ptsRow).xyz; + float3 n = (*nrmRow).xyz; + + pixelType color; + + if(any(isnan(p))) + { + color = (pixelType)(0, 32, 0, 0); + } + else + { + const float Ka = 0.3f; //ambient coeff + const float Kd = 0.5f; //diffuse coeff + const float Ks = 0.2f; //specular coeff + //const int sp = 20; //specular power, fixed in specPow20() + + const float Ax = 1.f; //ambient color, can be RGB + const float Dx = 1.f; //diffuse color, can be RGB + const float Sx = 1.f; //specular color, can be RGB + const float Lx = 1.f; //light color + + float3 l = normalize(lightPt.xyz - p); + float3 v = normalize(-p); + float3 r = normalize(2.f*n*dot(n, l) - l); + + float val = (Ax*Ka*Dx + Lx*Kd*Dx*max(0.f, dot(n, l)) + + Lx*Ks*Sx*specPow20(max(0.f, dot(r, v)))); + + uchar ix = convert_uchar(val*255.f); + color = (pixelType)(ix, ix, ix, 0); + } + + __global char* imgRow = (__global char*)(imgptr + img_offset + y*img_step + x*sizeof(pixelType)); + vstore4(color, 0, imgRow); +} diff --git a/modules/rgbd/src/opencl/tsdf.cl b/modules/rgbd/src/opencl/tsdf.cl new file mode 100644 index 000000000..d9bfff4ac --- /dev/null +++ b/modules/rgbd/src/opencl/tsdf.cl @@ -0,0 +1,825 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html + +// This code is also subject to the license terms in the LICENSE_KinectFusion.md file found in this module's directory + +__kernel void integrate(__global const char * depthptr, + int depth_step, int depth_offset, + int depth_rows, int depth_cols, + __global float2 * volumeptr, + const float16 vol2camMatrix, + const float voxelSize, + const int4 volResolution4, + const int4 volDims4, + const float2 fxy, + const float2 cxy, + const float dfac, + const float truncDist, + const int maxWeight) +{ + int x = get_global_id(0); + int y = get_global_id(1); + + const int3 volResolution = volResolution4.xyz; + + if(x >= volResolution.x || y >= volResolution.y) + return; + + // coord-independent constants + const int3 volDims = volDims4.xyz; + const float2 limits = (float2)(depth_cols-1, depth_rows-1); + + const float4 vol2cam0 = vol2camMatrix.s0123; + const float4 vol2cam1 = vol2camMatrix.s4567; + const float4 vol2cam2 = vol2camMatrix.s89ab; + + const float truncDistInv = 1.f/truncDist; + + // optimization of camSpace transformation (vector addition instead of matmul at each z) + float4 inPt = (float4)(x*voxelSize, y*voxelSize, 0, 1); + float3 basePt = (float3)(dot(vol2cam0, inPt), + dot(vol2cam1, inPt), + dot(vol2cam2, inPt)); + + float3 camSpacePt = basePt; + + // zStep == vol2cam*(float3(x, y, 1)*voxelSize) - basePt; + float3 zStep = ((float3)(vol2cam0.z, vol2cam1.z, vol2cam2.z))*voxelSize; + + int volYidx = x*volDims.x + y*volDims.y; + + int startZ, endZ; + if(fabs(zStep.z) > 1e-5) + { + int baseZ = convert_int(-basePt.z / zStep.z); + if(zStep.z > 0) + { + startZ = baseZ; + endZ = volResolution.z; + } + else + { + startZ = 0; + endZ = baseZ; + } + } + else + { + if(basePt.z > 0) + { + startZ = 0; endZ = volResolution.z; + } + else + { + // z loop shouldn't be performed + //startZ = endZ = 0; + return; + } + } + + startZ = max(0, startZ); + endZ = min(volResolution.z, endZ); + + for(int z = startZ; z < endZ; z++) + { + // optimization of the following: + //float3 camSpacePt = vol2cam * ((float3)(x, y, z)*voxelSize); + camSpacePt += zStep; + + if(camSpacePt.z <= 0) + continue; + + float3 camPixVec = camSpacePt / camSpacePt.z; + float2 projected = mad(camPixVec.xy, fxy, cxy); + + float v; + // bilinearly interpolate depth at projected + if(all(projected >= 0) && all(projected < limits)) + { + float2 ip = floor(projected); + int xi = ip.x, yi = ip.y; + + __global const float* row0 = (__global const float*)(depthptr + depth_offset + + (yi+0)*depth_step); + __global const float* row1 = (__global const float*)(depthptr + depth_offset + + (yi+1)*depth_step); + + float v00 = row0[xi+0]; + float v01 = row0[xi+1]; + float v10 = row1[xi+0]; + float v11 = row1[xi+1]; + float4 vv = (float4)(v00, v01, v10, v11); + + // assume correct depth is positive + if(all(vv > 0)) + { + float2 t = projected - ip; + float2 vf = mix(vv.xz, vv.yw, t.x); + v = mix(vf.s0, vf.s1, t.y); + } + else + continue; + } + else + continue; + + if(v == 0) + continue; + + float pixNorm = length(camPixVec); + + // difference between distances of point and of surface to camera + float sdf = pixNorm*(v*dfac - camSpacePt.z); + // possible alternative is: + // float sdf = length(camSpacePt)*(v*dfac/camSpacePt.z - 1.0); + + if(sdf >= -truncDist) + { + float tsdf = fmin(1.0f, sdf * truncDistInv); + int volIdx = volYidx + z*volDims.z; + + float2 voxel = volumeptr[volIdx]; + float value = voxel.s0; + int weight = as_int(voxel.s1); + + // update TSDF + value = (value*weight + tsdf) / (weight + 1); + weight = min(weight + 1, maxWeight); + + voxel.s0 = value; + voxel.s1 = as_float(weight); + volumeptr[volIdx] = voxel; + } + } +} + + +inline float interpolateVoxel(float3 p, __global const float2* volumePtr, + int3 volDims, int8 neighbourCoords) +{ + float3 fip = floor(p); + int3 ip = convert_int3(fip); + float3 t = p - fip; + + int3 cmul = volDims*ip; + int coordBase = cmul.x + cmul.y + cmul.z; + int nco[8]; + vstore8(neighbourCoords + coordBase, 0, nco); + + float vaz[8]; + for(int i = 0; i < 8; i++) + vaz[i] = volumePtr[nco[i]].s0; + + float8 vz = vload8(0, vaz); + + float4 vy = mix(vz.s0246, vz.s1357, t.z); + float2 vx = mix(vy.s02, vy.s13, t.y); + return mix(vx.s0, vx.s1, t.x); +} + +inline float3 getNormalVoxel(float3 p, __global const float2* volumePtr, + int3 volResolution, int3 volDims, int8 neighbourCoords) +{ + if(any(p < 1) || any(p >= convert_float3(volResolution - 2))) + return nan((uint)0); + + float3 fip = floor(p); + int3 ip = convert_int3(fip); + float3 t = p - fip; + + int3 cmul = volDims*ip; + int coordBase = cmul.x + cmul.y + cmul.z; + int nco[8]; + vstore8(neighbourCoords + coordBase, 0, nco); + + int arDims[3]; + vstore3(volDims, 0, arDims); + float an[3]; + for(int c = 0; c < 3; c++) + { + int dim = arDims[c]; + + float vaz[8]; + for(int i = 0; i < 8; i++) + vaz[i] = volumePtr[nco[i] + dim].s0 - + volumePtr[nco[i] - dim].s0; + + float8 vz = vload8(0, vaz); + + float4 vy = mix(vz.s0246, vz.s1357, t.z); + float2 vx = mix(vy.s02, vy.s13, t.y); + + an[c] = mix(vx.s0, vx.s1, t.x); + } + + //gradientDeltaFactor is fixed at 1.0 of voxel size + return fast_normalize(vload3(0, an)); +} + +typedef float4 ptype; + +__kernel void raycast(__global char * pointsptr, + int points_step, int points_offset, + __global char * normalsptr, + int normals_step, int normals_offset, + const int2 frameSize, + __global const float2 * volumeptr, + __global const float * vol2camptr, + __global const float * cam2volptr, + const float2 fixy, + const float2 cxy, + const float4 boxDown4, + const float4 boxUp4, + const float tstep, + const float voxelSize, + const int4 volResolution4, + const int4 volDims4, + const int8 neighbourCoords + ) +{ + int x = get_global_id(0); + int y = get_global_id(1); + + if(x >= frameSize.x || y >= frameSize.y) + return; + + // coordinate-independent constants + + __global const float* cm = cam2volptr; + const float3 camRot0 = vload4(0, cm).xyz; + const float3 camRot1 = vload4(1, cm).xyz; + const float3 camRot2 = vload4(2, cm).xyz; + const float3 camTrans = (float3)(cm[3], cm[7], cm[11]); + + __global const float* vm = vol2camptr; + const float3 volRot0 = vload4(0, vm).xyz; + const float3 volRot1 = vload4(1, vm).xyz; + const float3 volRot2 = vload4(2, vm).xyz; + const float3 volTrans = (float3)(vm[3], vm[7], vm[11]); + + const float3 boxDown = boxDown4.xyz; + const float3 boxUp = boxUp4.xyz; + const int3 volDims = volDims4.xyz; + + const int3 volResolution = volResolution4.xyz; + + const float invVoxelSize = native_recip(voxelSize); + + // kernel itself + + float3 point = nan((uint)0); + float3 normal = nan((uint)0); + + float3 orig = camTrans; + + // get direction through pixel in volume space: + // 1. reproject (x, y) on projecting plane where z = 1.f + float3 planed = (float3)(((float2)(x, y) - cxy)*fixy, 1.f); + + // 2. rotate to volume space + planed = (float3)(dot(planed, camRot0), + dot(planed, camRot1), + dot(planed, camRot2)); + + // 3. normalize + float3 dir = fast_normalize(planed); + + // compute intersection of ray with all six bbox planes + float3 rayinv = native_recip(dir); + float3 tbottom = rayinv*(boxDown - orig); + float3 ttop = rayinv*(boxUp - orig); + + // re-order intersections to find smallest and largest on each axis + float3 minAx = min(ttop, tbottom); + float3 maxAx = max(ttop, tbottom); + + // near clipping plane + const float clip = 0.f; + float tmin = max(max(max(minAx.x, minAx.y), max(minAx.x, minAx.z)), clip); + float tmax = min(min(maxAx.x, maxAx.y), min(maxAx.x, maxAx.z)); + + // precautions against getting coordinates out of bounds + tmin = tmin + tstep; + tmax = tmax - tstep; + + if(tmin < tmax) + { + // interpolation optimized a little + orig *= invVoxelSize; + dir *= invVoxelSize; + + float3 rayStep = dir*tstep; + float3 next = (orig + dir*tmin); + float f = interpolateVoxel(next, volumeptr, volDims, neighbourCoords); + float fnext = f; + + // raymarch + int steps = 0; + int nSteps = floor(native_divide(tmax - tmin, tstep)); + bool stop = false; + for(int i = 0; i < nSteps; i++) + { + // fix for wrong steps counting + if(!stop) + { + next += rayStep; + + // fetch voxel + int3 ip = convert_int3(round(next)); + int3 cmul = ip*volDims; + int idx = cmul.x + cmul.y + cmul.z; + fnext = volumeptr[idx].s0; + + if(fnext != f) + { + fnext = interpolateVoxel(next, volumeptr, volDims, neighbourCoords); + + // when ray crosses a surface + if(signbit(f) != signbit(fnext)) + { + stop = true; continue; + } + + f = fnext; + } + steps++; + } + } + + // if ray penetrates a surface from outside + // linearly interpolate t between two f values + if(f > 0 && fnext < 0) + { + float3 tp = next - rayStep; + float ft = interpolateVoxel(tp, volumeptr, volDims, neighbourCoords); + float ftdt = interpolateVoxel(next, volumeptr, volDims, neighbourCoords); + // float t = tmin + steps*tstep; + // float ts = t - tstep*ft/(ftdt - ft); + float ts = tmin + tstep*(steps - native_divide(ft, ftdt - ft)); + + // avoid division by zero + if(!isnan(ts) && !isinf(ts)) + { + float3 pv = orig + dir*ts; + float3 nv = getNormalVoxel(pv, volumeptr, volResolution, volDims, neighbourCoords); + + if(!any(isnan(nv))) + { + //convert pv and nv to camera space + normal = (float3)(dot(nv, volRot0), + dot(nv, volRot1), + dot(nv, volRot2)); + // interpolation optimized a little + pv *= voxelSize; + point = (float3)(dot(pv, volRot0), + dot(pv, volRot1), + dot(pv, volRot2)) + volTrans; + } + } + } + } + + __global float* pts = (__global float*)(pointsptr + points_offset + y*points_step + x*sizeof(ptype)); + __global float* nrm = (__global float*)(normalsptr + normals_offset + y*normals_step + x*sizeof(ptype)); + vstore4((float4)(point, 0), 0, pts); + vstore4((float4)(normal, 0), 0, nrm); +} + + +__kernel void getNormals(__global const char * pointsptr, + int points_step, int points_offset, + __global char * normalsptr, + int normals_step, int normals_offset, + const int2 frameSize, + __global const float2* volumeptr, + __global const float * volPoseptr, + __global const float * invPoseptr, + const float voxelSizeInv, + const int4 volResolution4, + const int4 volDims4, + const int8 neighbourCoords + ) +{ + int x = get_global_id(0); + int y = get_global_id(1); + + if(x >= frameSize.x || y >= frameSize.y) + return; + + // coordinate-independent constants + + __global const float* vp = volPoseptr; + const float3 volRot0 = vload4(0, vp).xyz; + const float3 volRot1 = vload4(1, vp).xyz; + const float3 volRot2 = vload4(2, vp).xyz; + const float3 volTrans = (float3)(vp[3], vp[7], vp[11]); + + __global const float* iv = invPoseptr; + const float3 invRot0 = vload4(0, iv).xyz; + const float3 invRot1 = vload4(1, iv).xyz; + const float3 invRot2 = vload4(2, iv).xyz; + const float3 invTrans = (float3)(iv[3], iv[7], iv[11]); + + const int3 volResolution = volResolution4.xyz; + const int3 volDims = volDims4.xyz; + + // kernel itself + + __global const ptype* ptsRow = (__global const ptype*)(pointsptr + + points_offset + + y*points_step); + float3 p = ptsRow[x].xyz; + float3 n = nan((uint)0); + if(!any(isnan(p))) + { + float3 voxPt = (float3)(dot(p, invRot0), + dot(p, invRot1), + dot(p, invRot2)) + invTrans; + voxPt = voxPt * voxelSizeInv; + n = getNormalVoxel(voxPt, volumeptr, volResolution, volDims, neighbourCoords); + n = (float3)(dot(n, volRot0), + dot(n, volRot1), + dot(n, volRot2)); + } + + __global float* nrm = (__global float*)(normalsptr + + normals_offset + + y*normals_step + + x*sizeof(ptype)); + + vstore4((float4)(n, 0), 0, nrm); +} + +#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics:enable + +struct CoordReturn +{ + bool result; + float3 point; + float3 normal; +}; + +inline struct CoordReturn coord(int x, int y, int z, float3 V, float v0, int axis, + __global const float2* volumeptr, + int3 volResolution, int3 volDims, + int8 neighbourCoords, + float voxelSize, float voxelSizeInv, + const float3 volRot0, + const float3 volRot1, + const float3 volRot2, + const float3 volTrans, + bool needNormals, + bool scan + ) +{ + struct CoordReturn cr; + + // 0 for x, 1 for y, 2 for z + bool limits = false; + int3 shift; + float Vc = 0.f; + if(axis == 0) + { + shift = (int3)(1, 0, 0); + limits = (x + 1 < volResolution.x); + Vc = V.x; + } + if(axis == 1) + { + shift = (int3)(0, 1, 0); + limits = (y + 1 < volResolution.y); + Vc = V.y; + } + if(axis == 2) + { + shift = (int3)(0, 0, 1); + limits = (z + 1 < volResolution.z); + Vc = V.z; + } + + if(limits) + { + int3 ip = ((int3)(x, y, z)) + shift; + int3 cmul = ip*volDims; + int idx = cmul.x + cmul.y + cmul.z; + float2 voxel = volumeptr[idx].s0; + float vd = voxel.s0; + int weight = as_int(voxel.s1); + + if(weight != 0 && vd != 1.f) + { + if((v0 > 0 && vd < 0) || (v0 < 0 && vd > 0)) + { + // calc actual values or estimate amount of space + if(!scan) + { + // linearly interpolate coordinate + float Vn = Vc + voxelSize; + float dinv = 1.f/(fabs(v0)+fabs(vd)); + float inter = (Vc*fabs(vd) + Vn*fabs(v0))*dinv; + + float3 p = (float3)(shift.x ? inter : V.x, + shift.y ? inter : V.y, + shift.z ? inter : V.z); + + cr.point = (float3)(dot(p, volRot0), + dot(p, volRot1), + dot(p, volRot2)) + volTrans; + + if(needNormals) + { + float3 nv = getNormalVoxel(p * voxelSizeInv, + volumeptr, volResolution, volDims, neighbourCoords); + + cr.normal = (float3)(dot(nv, volRot0), + dot(nv, volRot1), + dot(nv, volRot2)); + } + } + + cr.result = true; + return cr; + } + } + } + + cr.result = false; + return cr; +} + + +__kernel void scanSize(__global const float2* volumeptr, + const int4 volResolution4, + const int4 volDims4, + const int8 neighbourCoords, + __global const float * volPoseptr, + const float voxelSize, + const float voxelSizeInv, + __local int* reducebuf, + __global char* groupedSumptr, + int groupedSum_slicestep, + int groupedSum_step, int groupedSum_offset + ) +{ + const int3 volDims = volDims4.xyz; + const int3 volResolution = volResolution4.xyz; + + int x = get_global_id(0); + int y = get_global_id(1); + int z = get_global_id(2); + + bool validVoxel = true; + if(x >= volResolution.x || y >= volResolution.y || z >= volResolution.z) + validVoxel = false; + + const int gx = get_group_id(0); + const int gy = get_group_id(1); + const int gz = get_group_id(2); + + const int lx = get_local_id(0); + const int ly = get_local_id(1); + const int lz = get_local_id(2); + const int lw = get_local_size(0); + const int lh = get_local_size(1); + const int ld = get_local_size(2); + const int lsz = lw*lh*ld; + const int lid = lx + ly*lw + lz*lw*lh; + + // coordinate-independent constants + + __global const float* vp = volPoseptr; + const float3 volRot0 = vload4(0, vp).xyz; + const float3 volRot1 = vload4(1, vp).xyz; + const float3 volRot2 = vload4(2, vp).xyz; + const float3 volTrans = (float3)(vp[3], vp[7], vp[11]); + + // kernel itself + int npts = 0; + if(validVoxel) + { + int3 ip = (int3)(x, y, z); + int3 cmul = ip*volDims; + int idx = cmul.x + cmul.y + cmul.z; + float2 voxel = volumeptr[idx].s0; + float value = voxel.s0; + int weight = as_int(voxel.s1); + + // if voxel is not empty + if(weight != 0 && value != 1.f) + { + float3 V = (((float3)(x, y, z)) + 0.5f)*voxelSize; + + #pragma unroll + for(int i = 0; i < 3; i++) + { + struct CoordReturn cr; + cr = coord(x, y, z, V, value, i, + volumeptr, volResolution, volDims, + neighbourCoords, + voxelSize, voxelSizeInv, + volRot0, volRot1, volRot2, volTrans, + false, true); + if(cr.result) + { + npts++; + } + } + } + } + + // reducebuf keeps counters for each thread + reducebuf[lid] = npts; + + // reduce counter to local mem + + // maxStep = ctz(lsz), ctz isn't supported on CUDA devices + const int c = clz(lsz & -lsz); + const int maxStep = c ? 31 - c : c; + for(int nstep = 1; nstep <= maxStep; nstep++) + { + if(lid % (1 << nstep) == 0) + { + int rto = lid; + int rfrom = lid + (1 << (nstep-1)); + reducebuf[rto] += reducebuf[rfrom]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + + if(lid == 0) + { + __global int* groupedRow = (__global int*)(groupedSumptr + + groupedSum_offset + + gy*groupedSum_step + + gz*groupedSum_slicestep); + + groupedRow[gx] = reducebuf[0]; + } +} + + +__kernel void fillPtsNrm(__global const float2* volumeptr, + const int4 volResolution4, + const int4 volDims4, + const int8 neighbourCoords, + __global const float * volPoseptr, + const float voxelSize, + const float voxelSizeInv, + const int needNormals, + __local float* localbuf, + volatile __global int* atomicCtr, + __global const char* groupedSumptr, + int groupedSum_slicestep, + int groupedSum_step, int groupedSum_offset, + __global char * pointsptr, + int points_step, int points_offset, + __global char * normalsptr, + int normals_step, int normals_offset + ) +{ + const int3 volDims = volDims4.xyz; + const int3 volResolution = volResolution4.xyz; + + int x = get_global_id(0); + int y = get_global_id(1); + int z = get_global_id(2); + + bool validVoxel = true; + if(x >= volResolution.x || y >= volResolution.y || z >= volResolution.z) + validVoxel = false; + + const int gx = get_group_id(0); + const int gy = get_group_id(1); + const int gz = get_group_id(2); + + __global int* groupedRow = (__global int*)(groupedSumptr + + groupedSum_offset + + gy*groupedSum_step + + gz*groupedSum_slicestep); + + // this group contains 0 pts, skip it + int nptsGroup = groupedRow[gx]; + if(nptsGroup == 0) + return; + + const int lx = get_local_id(0); + const int ly = get_local_id(1); + const int lz = get_local_id(2); + const int lw = get_local_size(0); + const int lh = get_local_size(1); + const int ld = get_local_size(2); + const int lsz = lw*lh*ld; + const int lid = lx + ly*lw + lz*lw*lh; + + // coordinate-independent constants + + __global const float* vp = volPoseptr; + const float3 volRot0 = vload4(0, vp).xyz; + const float3 volRot1 = vload4(1, vp).xyz; + const float3 volRot2 = vload4(2, vp).xyz; + const float3 volTrans = (float3)(vp[3], vp[7], vp[11]); + + // kernel itself + int npts = 0; + float3 parr[3], narr[3]; + if(validVoxel) + { + int3 ip = (int3)(x, y, z); + int3 cmul = ip*volDims; + int idx = cmul.x + cmul.y + cmul.z; + float2 voxel = volumeptr[idx].s0; + float value = voxel.s0; + int weight = as_int(voxel.s1); + + // if voxel is not empty + if(weight != 0 && value != 1.f) + { + float3 V = (((float3)(x, y, z)) + 0.5f)*voxelSize; + + #pragma unroll + for(int i = 0; i < 3; i++) + { + struct CoordReturn cr; + cr = coord(x, y, z, V, value, i, + volumeptr, volResolution, volDims, + neighbourCoords, + voxelSize, voxelSizeInv, + volRot0, volRot1, volRot2, volTrans, + needNormals, false); + + if(cr.result) + { + parr[npts] = cr.point; + narr[npts] = cr.normal; + npts++; + } + } + } + } + + // 4 floats per point or normal + const int elemStep = 4; + + __local float* normAddr; + __local int localCtr; + if(lid == 0) + localCtr = 0; + + // push all pts (and nrm) from private array to local mem + int privateCtr = 0; + barrier(CLK_LOCAL_MEM_FENCE); + privateCtr = atomic_add(&localCtr, npts); + barrier(CLK_LOCAL_MEM_FENCE); + + for(int i = 0; i < npts; i++) + { + __local float* addr = localbuf + (privateCtr+i)*elemStep; + vstore4((float4)(parr[i], 0), 0, addr); + } + + if(needNormals) + { + normAddr = localbuf + localCtr*elemStep; + + for(int i = 0; i < npts; i++) + { + __local float* addr = normAddr + (privateCtr+i)*elemStep; + vstore4((float4)(narr[i], 0), 0, addr); + } + } + + // debugging purposes + if(lid == 0) + { + if(localCtr != nptsGroup) + { + printf("!!! fetchPointsNormals result may be incorrect, npts != localCtr at %3d %3d %3d: %3d vs %3d\n", + gx, gy, gz, localCtr, nptsGroup); + } + } + + // copy local buffer to global mem + __local int whereToWrite; + if(lid == 0) + whereToWrite = atomic_add(atomicCtr, localCtr); + barrier(CLK_GLOBAL_MEM_FENCE); + + event_t ev[2]; + int evn = 0; + // points and normals are 1-column matrices + __global float* pts = (__global float*)(pointsptr + + points_offset + + whereToWrite*points_step); + ev[evn++] = async_work_group_copy(pts, localbuf, localCtr*elemStep, 0); + + if(needNormals) + { + __global float* nrm = (__global float*)(normalsptr + + normals_offset + + whereToWrite*normals_step); + ev[evn++] = async_work_group_copy(nrm, normAddr, localCtr*elemStep, 0); + } + + wait_group_events(evn, ev); +} diff --git a/modules/rgbd/src/precomp.hpp b/modules/rgbd/src/precomp.hpp index a94627e6c..4ed89e14d 100644 --- a/modules/rgbd/src/precomp.hpp +++ b/modules/rgbd/src/precomp.hpp @@ -13,6 +13,8 @@ #include "opencv2/core/utility.hpp" #include "opencv2/core/private.hpp" #include "opencv2/core/hal/intrin.hpp" +#include "opencv2/core/ocl.hpp" +#include "opencl_kernels_rgbd.hpp" #include "opencv2/imgproc.hpp" #include "opencv2/calib3d.hpp" #include "opencv2/rgbd.hpp" diff --git a/modules/rgbd/src/tsdf.cpp b/modules/rgbd/src/tsdf.cpp index 19560e985..b6d178a04 100644 --- a/modules/rgbd/src/tsdf.cpp +++ b/modules/rgbd/src/tsdf.cpp @@ -11,45 +11,28 @@ namespace cv { namespace kinfu { -typedef float volumeType; // can be float16 +// TODO: Optimization possible: +// * volumeType can be FP16 +// * weight can be int16 +typedef float volumeType; struct Voxel { volumeType v; int weight; }; - -} - -template<> class DataType -{ -public: - typedef kinfu::Voxel value_type; - typedef value_type work_type; - typedef value_type channel_type; - typedef value_type vec_type; - enum { generic_type = 0, - depth = CV_64F, - channels = 1, - fmt = (int)'v', - type = CV_MAKETYPE(depth, channels) - }; -}; - -namespace kinfu { +typedef Vec VecT; class TSDFVolumeCPU : public TSDFVolume { - typedef cv::Mat_ Volume; - public: // dimension in voxels, size in meters - TSDFVolumeCPU(int _res, float _size, cv::Affine3f _pose, float _truncDist, int _maxWeight, - float _raycastStepFactor); + TSDFVolumeCPU(Point3i _res, float _voxelSize, cv::Affine3f _pose, float _truncDist, int _maxWeight, + float _raycastStepFactor, bool zFirstMemOrder = true); - virtual void integrate(cv::Ptr depth, float depthFactor, cv::Affine3f cameraPose, cv::kinfu::Intr intrinsics) override; - virtual void raycast(cv::Affine3f cameraPose, cv::kinfu::Intr intrinsics, cv::Size frameSize, int pyramidLevels, - cv::Ptr frameGenerator, cv::Ptr frame) const override; + virtual void integrate(InputArray _depth, float depthFactor, cv::Affine3f cameraPose, cv::kinfu::Intr intrinsics) override; + virtual void raycast(cv::Affine3f cameraPose, cv::kinfu::Intr intrinsics, cv::Size frameSize, + cv::OutputArray points, cv::OutputArray normals) const override; virtual void fetchNormals(cv::InputArray points, cv::OutputArray _normals) const override; virtual void fetchPointsNormals(cv::OutputArray points, cv::OutputArray normals) const override; @@ -59,85 +42,89 @@ public: volumeType interpolateVoxel(cv::Point3f p) const; Point3f getNormalVoxel(cv::Point3f p) const; -#if CV_SIMD128 +#if USE_INTRINSICS volumeType interpolateVoxel(const v_float32x4& p) const; v_float32x4 getNormalVoxel(const v_float32x4& p) const; #endif - // edgeResolution^3 array - // &elem(x, y, z) = data + x*edgeRes^2 + y*edgeRes + z; - Volume volume; - - int neighbourCoords[8]; - int dims[4]; - float voxelSize; - float voxelSizeInv; - float truncDist; - float raycastStepFactor; + // See zFirstMemOrder arg of parent class constructor + // for the array layout info + // Consist of Voxel elements + Mat volume; }; -TSDFVolume::TSDFVolume(int _res, float _size, Affine3f _pose, float /*_truncDist*/, int _maxWeight, - float /*_raycastStepFactor*/) : - edgeSize(_size), - edgeResolution(_res), +TSDFVolume::TSDFVolume(Point3i _res, float _voxelSize, Affine3f _pose, float _truncDist, int _maxWeight, + float _raycastStepFactor, bool zFirstMemOrder) : + voxelSize(_voxelSize), + voxelSizeInv(1.f/_voxelSize), + volResolution(_res), maxWeight(_maxWeight), - pose(_pose) -{ } + pose(_pose), + raycastStepFactor(_raycastStepFactor) +{ + // Unlike original code, this should work with any volume size + // Not only when (x,y,z % 32) == 0 + + volSize = Point3f(volResolution) * voxelSize; + + truncDist = std::max(_truncDist, 2.1f * voxelSize); + + // (xRes*yRes*zRes) array + // Depending on zFirstMemOrder arg: + // &elem(x, y, z) = data + x*zRes*yRes + y*zRes + z; + // &elem(x, y, z) = data + x + y*xRes + z*xRes*yRes; + int xdim, ydim, zdim; + if(zFirstMemOrder) + { + xdim = volResolution.z * volResolution.y; + ydim = volResolution.z; + zdim = 1; + } + else + { + xdim = 1; + ydim = volResolution.x; + zdim = volResolution.x * volResolution.y; + } + + volDims = Vec4i(xdim, ydim, zdim); + neighbourCoords = Vec8i( + volDims.dot(Vec4i(0, 0, 0)), + volDims.dot(Vec4i(0, 0, 1)), + volDims.dot(Vec4i(0, 1, 0)), + volDims.dot(Vec4i(0, 1, 1)), + volDims.dot(Vec4i(1, 0, 0)), + volDims.dot(Vec4i(1, 0, 1)), + volDims.dot(Vec4i(1, 1, 0)), + volDims.dot(Vec4i(1, 1, 1)) + ); +} // dimension in voxels, size in meters -TSDFVolumeCPU::TSDFVolumeCPU(int _res, float _size, cv::Affine3f _pose, float _truncDist, int _maxWeight, - float _raycastStepFactor) : - TSDFVolume(_res, _size, _pose, _truncDist, _maxWeight, _raycastStepFactor) +TSDFVolumeCPU::TSDFVolumeCPU(Point3i _res, float _voxelSize, cv::Affine3f _pose, float _truncDist, int _maxWeight, + float _raycastStepFactor, bool zFirstMemOrder) : + TSDFVolume(_res, _voxelSize, _pose, _truncDist, _maxWeight, _raycastStepFactor, zFirstMemOrder) { - CV_Assert(_res % 32 == 0); - - int xdim = edgeResolution*edgeResolution; - int ydim = edgeResolution; - int steps[4] = { xdim, ydim, 1, 0 }; - for(int i = 0; i < 4; i++) - dims[i] = steps[i]; - int coords[8] = { - xdim*0 + ydim*0 + 1*0, - xdim*0 + ydim*0 + 1*1, - xdim*0 + ydim*1 + 1*0, - xdim*0 + ydim*1 + 1*1, - xdim*1 + ydim*0 + 1*0, - xdim*1 + ydim*0 + 1*1, - xdim*1 + ydim*1 + 1*0, - xdim*1 + ydim*1 + 1*1 - }; - for(int i = 0; i < 8; i++) - neighbourCoords[i] = coords[i]; - - voxelSize = edgeSize/edgeResolution; - voxelSizeInv = edgeResolution/edgeSize; - volume = Volume(1, _res * _res * _res); - - truncDist = std::max (_truncDist, 2.1f * voxelSize); - raycastStepFactor = _raycastStepFactor; + volume = Mat(1, volResolution.x * volResolution.y * volResolution.z, rawType()); reset(); } -struct FillZero -{ - void operator ()(Voxel &v, const int* /*position*/) const - { - v.v = 0; v.weight = 0; - } -}; - // zero volume, leave rest params the same void TSDFVolumeCPU::reset() { - ScopeTime st("tsdf: reset"); + CV_TRACE_FUNCTION(); - volume.forEach(FillZero()); + volume.forEach([](VecT& vv, const int* /* position */) + { + Voxel& v = reinterpret_cast(vv); + v.v = 0; v.weight = 0; + }); } // SIMD version of that code is manually inlined -#if !CV_SIMD128 +#if !USE_INTRINSICS static const bool fixMissingData = false; static inline depthType bilinearDepth(const Depth& m, cv::Point2f pt) @@ -222,31 +209,33 @@ struct IntegrateInvoker : ParallelLoopBody volume(_volume), depth(_depth), proj(intrinsics.makeProjector()), - vol2cam(cameraPose.inv() * volume.pose), - truncDistInv(1.f/volume.truncDist), + vol2cam(cameraPose.inv() * _volume.pose), + truncDistInv(1.f/_volume.truncDist), dfac(1.f/depthFactor) { - volDataStart = volume.volume[0]; + volDataStart = volume.volume.ptr(); } -#if CV_SIMD128 +#if USE_INTRINSICS virtual void operator() (const Range& range) const override { // zStep == vol2cam*(Point3f(x, y, 1)*voxelSize) - basePt; - Point3f zStepPt = Point3f(vol2cam.matrix(0, 2), vol2cam.matrix(1, 2), vol2cam.matrix(2, 2))*volume.voxelSize; + Point3f zStepPt = Point3f(vol2cam.matrix(0, 2), + vol2cam.matrix(1, 2), + vol2cam.matrix(2, 2))*volume.voxelSize; + v_float32x4 zStep(zStepPt.x, zStepPt.y, zStepPt.z, 0); v_float32x4 vfxy(proj.fx, proj.fy, 0.f, 0.f), vcxy(proj.cx, proj.cy, 0.f, 0.f); const v_float32x4 upLimits = v_cvt_f32(v_int32x4(depth.cols-1, depth.rows-1, 0, 0)); - // &elem(x, y, z) = data + x*edgeRes^2 + y*edgeRes + z; for(int x = range.start; x < range.end; x++) { - Voxel* volDataX = volDataStart + x*volume.dims[0]; - for(int y = 0; y < volume.edgeResolution; y++) + Voxel* volDataX = volDataStart + x*volume.volDims[0]; + for(int y = 0; y < volume.volResolution.y; y++) { - Voxel* volDataY = volDataX+y*volume.dims[1]; + Voxel* volDataY = volDataX + y*volume.volDims[1]; // optimization of camSpace transformation (vector addition instead of matmul at each z) - Point3f basePt = vol2cam*Point3f(x*volume.voxelSize, y*volume.voxelSize, 0); + Point3f basePt = vol2cam*(Point3f((float)x, (float)y, 0)*volume.voxelSize); v_float32x4 camSpacePt(basePt.x, basePt.y, basePt.z, 0); int startZ, endZ; @@ -256,7 +245,7 @@ struct IntegrateInvoker : ParallelLoopBody if(zStepPt.z > 0) { startZ = baseZ; - endZ = volume.edgeResolution; + endZ = volume.volResolution.z; } else { @@ -268,7 +257,7 @@ struct IntegrateInvoker : ParallelLoopBody { if(basePt.z > 0) { - startZ = 0; endZ = volume.edgeResolution; + startZ = 0; endZ = volume.volResolution.z; } else { @@ -277,7 +266,7 @@ struct IntegrateInvoker : ParallelLoopBody } } startZ = max(0, startZ); - endZ = min(volume.edgeResolution, endZ); + endZ = min(volume.volResolution.z, endZ); for(int z = startZ; z < endZ; z++) { // optimization of the following: @@ -354,7 +343,7 @@ struct IntegrateInvoker : ParallelLoopBody { volumeType tsdf = fmin(1.f, sdf * truncDistInv); - Voxel& voxel = volDataY[z]; + Voxel& voxel = volDataY[z*volume.volDims[2]]; int& weight = voxel.weight; volumeType& value = voxel.v; @@ -367,20 +356,21 @@ struct IntegrateInvoker : ParallelLoopBody } } #else - virtual void operator() (const Range& range) const + virtual void operator() (const Range& range) const override { - // &elem(x, y, z) = data + x*edgeRes^2 + y*edgeRes + z; for(int x = range.start; x < range.end; x++) { - Voxel* volDataX = volDataStart + x*volume.dims[0]; - for(int y = 0; y < volume.edgeResolution; y++) + Voxel* volDataX = volDataStart + x*volume.volDims[0]; + for(int y = 0; y < volume.volResolution.y; y++) { - Voxel* volDataY = volDataX+y*volume.dims[1]; + Voxel* volDataY = volDataX+y*volume.volDims[1]; // optimization of camSpace transformation (vector addition instead of matmul at each z) - Point3f basePt = vol2cam*Point3f(x*volume.voxelSize, y*volume.voxelSize, 0); + Point3f basePt = vol2cam*(Point3f(x, y, 0)*volume.voxelSize); Point3f camSpacePt = basePt; // zStep == vol2cam*(Point3f(x, y, 1)*voxelSize) - basePt; - Point3f zStep = Point3f(vol2cam.matrix(0, 2), vol2cam.matrix(1, 2), vol2cam.matrix(2, 2))*volume.voxelSize; + Point3f zStep = Point3f(vol2cam.matrix(0, 2), + vol2cam.matrix(1, 2), + vol2cam.matrix(2, 2))*volume.voxelSize; int startZ, endZ; if(abs(zStep.z) > 1e-5) @@ -389,7 +379,7 @@ struct IntegrateInvoker : ParallelLoopBody if(zStep.z > 0) { startZ = baseZ; - endZ = volume.edgeResolution; + endZ = volume.volResolution.z; } else { @@ -401,7 +391,7 @@ struct IntegrateInvoker : ParallelLoopBody { if(basePt.z > 0) { - startZ = 0; endZ = volume.edgeResolution; + startZ = 0; endZ = volume.volResolution.z; } else { @@ -410,7 +400,7 @@ struct IntegrateInvoker : ParallelLoopBody } } startZ = max(0, startZ); - endZ = min(volume.edgeResolution, endZ); + endZ = min(volume.volResolution.z, endZ); for(int z = startZ; z < endZ; z++) { // optimization of the following: @@ -439,7 +429,7 @@ struct IntegrateInvoker : ParallelLoopBody { volumeType tsdf = fmin(1.f, sdf * truncDistInv); - Voxel& voxel = volDataY[z]; + Voxel& voxel = volDataY[z*volume.volDims[2]]; int& weight = voxel.weight; volumeType& value = voxel.v; @@ -463,19 +453,19 @@ struct IntegrateInvoker : ParallelLoopBody }; // use depth instead of distance (optimization) -void TSDFVolumeCPU::integrate(cv::Ptr _depth, float depthFactor, cv::Affine3f cameraPose, Intr intrinsics) +void TSDFVolumeCPU::integrate(InputArray _depth, float depthFactor, cv::Affine3f cameraPose, Intr intrinsics) { - ScopeTime st("tsdf: integrate"); + CV_TRACE_FUNCTION(); - Depth depth; - _depth->getDepth(depth); + CV_Assert(_depth.type() == DEPTH_TYPE); + Depth depth = _depth.getMat(); IntegrateInvoker ii(*this, depth, intrinsics, cameraPose, depthFactor); - Range range(0, edgeResolution); + Range range(0, volResolution.x); parallel_for_(range, ii); } -#if CV_SIMD128 +#if USE_INTRINSICS // all coordinate checks should be done in inclosing cycle inline volumeType TSDFVolumeCPU::interpolateVoxel(Point3f _p) const { @@ -494,14 +484,14 @@ inline volumeType TSDFVolumeCPU::interpolateVoxel(const v_float32x4& p) const t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t))); float tz = t.get0(); - int xdim = dims[0], ydim = dims[1]; - const Voxel* volData = volume[0]; + int xdim = volDims[0], ydim = volDims[1], zdim = volDims[2]; + const Voxel* volData = volume.ptr(); int ix = ip.get0(); ip = v_rotate_right<1>(ip); int iy = ip.get0(); ip = v_rotate_right<1>(ip); int iz = ip.get0(); - int coordBase = ix*xdim + iy*ydim + iz; + int coordBase = ix*xdim + iy*ydim + iz*zdim; volumeType vx[8]; for(int i = 0; i < 8; i++) @@ -523,7 +513,7 @@ inline volumeType TSDFVolumeCPU::interpolateVoxel(const v_float32x4& p) const #else inline volumeType TSDFVolumeCPU::interpolateVoxel(Point3f p) const { - int xdim = dims[0], ydim = dims[1]; + int xdim = volDims[0], ydim = volDims[1], zdim = volDims[2]; int ix = cvFloor(p.x); int iy = cvFloor(p.y); @@ -533,8 +523,8 @@ inline volumeType TSDFVolumeCPU::interpolateVoxel(Point3f p) const float ty = p.y - iy; float tz = p.z - iz; - int coordBase = ix*xdim + iy*ydim + iz; - const Voxel* volData = volume[0]; + int coordBase = ix*xdim + iy*ydim + iz*zdim; + const Voxel* volData = volume.ptr(); volumeType vx[8]; for(int i = 0; i < 8; i++) @@ -552,8 +542,7 @@ inline volumeType TSDFVolumeCPU::interpolateVoxel(Point3f p) const } #endif - -#if CV_SIMD128 +#if USE_INTRINSICS //gradientDeltaFactor is fixed at 1.0 of voxel size inline Point3f TSDFVolumeCPU::getNormalVoxel(Point3f _p) const { @@ -567,7 +556,10 @@ inline Point3f TSDFVolumeCPU::getNormalVoxel(Point3f _p) const inline v_float32x4 TSDFVolumeCPU::getNormalVoxel(const v_float32x4& p) const { if(v_check_any((p < v_float32x4(1.f, 1.f, 1.f, 0.f)) + - (p >= v_setall_f32((float)(edgeResolution-2))))) + (p >= v_float32x4((float)(volResolution.x-2), + (float)(volResolution.y-2), + (float)(volResolution.z-2), 1.f)) + )) return nanv; v_int32x4 ip = v_floor(p); @@ -578,20 +570,20 @@ inline v_float32x4 TSDFVolumeCPU::getNormalVoxel(const v_float32x4& p) const t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t))); float tz = t.get0(); - int xdim = dims[0], ydim = dims[1]; - const Voxel* volData = volume[0]; + const int xdim = volDims[0], ydim = volDims[1], zdim = volDims[2]; + const Voxel* volData = volume.ptr(); int ix = ip.get0(); ip = v_rotate_right<1>(ip); int iy = ip.get0(); ip = v_rotate_right<1>(ip); int iz = ip.get0(); - int coordBase = ix*xdim + iy*ydim + iz; + int coordBase = ix*xdim + iy*ydim + iz*zdim; float CV_DECL_ALIGNED(16) an[4]; an[0] = an[1] = an[2] = an[3] = 0.f; for(int c = 0; c < 3; c++) { - const int dim = dims[c]; + const int dim = volDims[c]; float& nv = an[c]; volumeType vx[8]; @@ -620,12 +612,12 @@ inline v_float32x4 TSDFVolumeCPU::getNormalVoxel(const v_float32x4& p) const #else inline Point3f TSDFVolumeCPU::getNormalVoxel(Point3f p) const { - const int xdim = dims[0], ydim = dims[1]; - const Voxel* volData = volume[0]; + const int xdim = volDims[0], ydim = volDims[1], zdim = volDims[2]; + const Voxel* volData = volume.ptr(); - if(p.x < 1 || p.x >= edgeResolution -2 || - p.y < 1 || p.y >= edgeResolution -2 || - p.z < 1 || p.z >= edgeResolution -2) + if(p.x < 1 || p.x >= volResolution.x - 2 || + p.y < 1 || p.y >= volResolution.y - 2 || + p.z < 1 || p.z >= volResolution.z - 2) return nan3; int ix = cvFloor(p.x); @@ -636,12 +628,12 @@ inline Point3f TSDFVolumeCPU::getNormalVoxel(Point3f p) const float ty = p.y - iy; float tz = p.z - iz; - int coordBase = ix*xdim + iy*ydim + iz; + int coordBase = ix*xdim + iy*ydim + iz*zdim; Vec3f an; for(int c = 0; c < 3; c++) { - const int dim = dims[c]; + const int dim = volDims[c]; float& nv = an[c]; volumeType vx[8]; @@ -677,16 +669,16 @@ struct RaycastInvoker : ParallelLoopBody // We do subtract voxel size to minimize checks after // Note: origin of volume coordinate is placed // in the center of voxel (0,0,0), not in the corner of the voxel! - boxMax(volume.edgeSize - volume.voxelSize, - volume.edgeSize - volume.voxelSize, - volume.edgeSize - volume.voxelSize), + boxMax(volume.volSize - Point3f(volume.voxelSize, + volume.voxelSize, + volume.voxelSize)), boxMin(), cam2vol(volume.pose.inv() * cameraPose), vol2cam(cameraPose.inv() * volume.pose), reproj(intrinsics.makeReprojector()) { } -#if CV_SIMD128 +#if USE_INTRINSICS virtual void operator() (const Range& range) const override { const v_float32x4 vfxy(reproj.fxinv, reproj.fyinv, 0, 0); @@ -701,7 +693,9 @@ struct RaycastInvoker : ParallelLoopBody const v_float32x4 boxDown(boxMin.x, boxMin.y, boxMin.z, 0.f); const v_float32x4 boxUp(boxMax.x, boxMax.y, boxMax.z, 0.f); - const v_float32x4 invVoxelSize = v_setall_f32(volume.voxelSizeInv); + const v_float32x4 invVoxelSize = v_float32x4(volume.voxelSizeInv, + volume.voxelSizeInv, + volume.voxelSizeInv, 1.f); const float (&vm)[16] = vol2cam.matrix.val; const v_float32x4 volRot0(vm[0], vm[4], vm[ 8], 0); @@ -758,7 +752,9 @@ struct RaycastInvoker : ParallelLoopBody orig *= invVoxelSize; dir *= invVoxelSize; - int xdim = volume.dims[0], ydim = volume.dims[1]; + int xdim = volume.volDims[0]; + int ydim = volume.volDims[1]; + int zdim = volume.volDims[2]; v_float32x4 rayStep = dir * v_setall_f32(tstep); v_float32x4 next = (orig + dir * v_setall_f32(tmin)); volumeType f = volume.interpolateVoxel(next), fnext = f; @@ -773,9 +769,9 @@ struct RaycastInvoker : ParallelLoopBody int ix = ip.get0(); ip = v_rotate_right<1>(ip); int iy = ip.get0(); ip = v_rotate_right<1>(ip); int iz = ip.get0(); - int coord = ix*xdim + iy*ydim + iz; + int coord = ix*xdim + iy*ydim + iz*zdim; - fnext = volume.volume(coord).v; + fnext = volume.volume.at(coord).v; if(fnext != f) { fnext = volume.interpolateVoxel(next); @@ -810,7 +806,10 @@ struct RaycastInvoker : ParallelLoopBody //convert pv and nv to camera space normal = v_matmuladd(nv, volRot0, volRot1, volRot2, v_setzero_f32()); // interpolation optimized a little - point = v_matmuladd(pv*v_setall_f32(volume.voxelSize), volRot0, volRot1, volRot2, volTrans); + point = v_matmuladd(pv*v_float32x4(volume.voxelSize, + volume.voxelSize, + volume.voxelSize, 1.f), + volRot0, volRot1, volRot2, volTrans); } } } @@ -822,7 +821,7 @@ struct RaycastInvoker : ParallelLoopBody } } #else - virtual void operator() (const Range& range) const + virtual void operator() (const Range& range) const override { const Point3f camTrans = cam2vol.translation(); const Matx33f camRot = cam2vol.rotation(); @@ -855,15 +854,15 @@ struct RaycastInvoker : ParallelLoopBody float tmin = max(max(max(minAx.x, minAx.y), max(minAx.x, minAx.z)), clip); float tmax = min(min(maxAx.x, maxAx.y), min(maxAx.x, maxAx.z)); + // precautions against getting coordinates out of bounds + tmin = tmin + tstep; + tmax = tmax - tstep; + if(tmin < tmax) { - // precautions against getting coordinates out of bounds - tmin = tmin + tstep; - tmax = tmax - tstep - tstep; - // interpolation optimized a little - orig *= volume.voxelSizeInv; - dir *= volume.voxelSizeInv; + orig = orig*volume.voxelSizeInv; + dir = dir*volume.voxelSizeInv; Point3f rayStep = dir * tstep; Point3f next = (orig + dir * tmin); @@ -875,11 +874,13 @@ struct RaycastInvoker : ParallelLoopBody for(; steps < nSteps; steps++) { next += rayStep; - int xdim = volume.dims[0], ydim = volume.dims[1]; + int xdim = volume.volDims[0]; + int ydim = volume.volDims[1]; + int zdim = volume.volDims[2]; int ix = cvRound(next.x); int iy = cvRound(next.y); int iz = cvRound(next.z); - fnext = volume.volume(ix*xdim + iy*ydim + iz).v; + fnext = volume.volume.at(ix*xdim + iy*ydim + iz*zdim).v; if(fnext != f) { fnext = volume.interpolateVoxel(next); @@ -914,7 +915,7 @@ struct RaycastInvoker : ParallelLoopBody //convert pv and nv to camera space normal = volRot * nv; // interpolation optimized a little - point = vol2cam * (pv * volume.voxelSize); + point = vol2cam * (pv*volume.voxelSize); } } } @@ -942,21 +943,23 @@ struct RaycastInvoker : ParallelLoopBody }; -void TSDFVolumeCPU::raycast(cv::Affine3f cameraPose, Intr intrinsics, Size frameSize, int pyramidLevels, - cv::Ptr frameGenerator, cv::Ptr frame) const +void TSDFVolumeCPU::raycast(cv::Affine3f cameraPose, Intr intrinsics, Size frameSize, + cv::OutputArray _points, cv::OutputArray _normals) const { - ScopeTime st("tsdf: raycast"); + CV_TRACE_FUNCTION(); CV_Assert(frameSize.area() > 0); - Points points(frameSize); - Normals normals(frameSize); + _points.create (frameSize, POINT_TYPE); + _normals.create(frameSize, POINT_TYPE); + + Points points = _points.getMat(); + Normals normals = _normals.getMat(); + + RaycastInvoker ri(points, normals, cameraPose, intrinsics, *this); const int nstripes = -1; - parallel_for_(Range(0, points.rows), RaycastInvoker(points, normals, cameraPose, intrinsics, *this), nstripes); - - // build a pyramid of points and normals - (*frameGenerator)(frame, points, normals, pyramidLevels); + parallel_for_(Range(0, points.rows), ri, nstripes); } @@ -972,41 +975,40 @@ struct FetchPointsNormalsInvoker : ParallelLoopBody nVecs(_nVecs), needNormals(_needNormals) { - volDataStart = vol.volume[0]; + volDataStart = vol.volume.ptr(); } inline void coord(std::vector& points, std::vector& normals, int x, int y, int z, Point3f V, float v0, int axis) const { // 0 for x, 1 for y, 2 for z - const int edgeResolution = vol.edgeResolution; bool limits = false; Point3i shift; float Vc = 0.f; if(axis == 0) { shift = Point3i(1, 0, 0); - limits = (x + 1 < edgeResolution); + limits = (x + 1 < vol.volResolution.x); Vc = V.x; } if(axis == 1) { shift = Point3i(0, 1, 0); - limits = (y + 1 < edgeResolution); + limits = (y + 1 < vol.volResolution.y); Vc = V.y; } if(axis == 2) { shift = Point3i(0, 0, 1); - limits = (z + 1 < edgeResolution); + limits = (z + 1 < vol.volResolution.z); Vc = V.z; } if(limits) { - const Voxel& voxeld = volDataStart[(x+shift.x)*vol.dims[0] + - (y+shift.y)*vol.dims[1] + - (z+shift.z)]; + const Voxel& voxeld = volDataStart[(x+shift.x)*vol.volDims[0] + + (y+shift.y)*vol.volDims[1] + + (z+shift.z)*vol.volDims[2]]; volumeType vd = voxeld.v; if(voxeld.weight != 0 && vd != 1.f) @@ -1024,7 +1026,8 @@ struct FetchPointsNormalsInvoker : ParallelLoopBody { points.push_back(toPtype(vol.pose * p)); if(needNormals) - normals.push_back(toPtype(vol.pose.rotation() * vol.getNormalVoxel(p * vol.voxelSizeInv))); + normals.push_back(toPtype(vol.pose.rotation() * + vol.getNormalVoxel(p*vol.voxelSizeInv))); } } } @@ -1033,22 +1036,20 @@ struct FetchPointsNormalsInvoker : ParallelLoopBody virtual void operator() (const Range& range) const override { - // &elem(x, y, z) = data + x*edgeRes^2 + y*edgeRes + z; std::vector points, normals; for(int x = range.start; x < range.end; x++) { - const Voxel* volDataX = volDataStart + x*vol.dims[0]; - for(int y = 0; y < vol.edgeResolution; y++) + const Voxel* volDataX = volDataStart + x*vol.volDims[0]; + for(int y = 0; y < vol.volResolution.y; y++) { - const Voxel* volDataY = volDataX + y*vol.dims[1]; - for(int z = 0; z < vol.edgeResolution; z++) + const Voxel* volDataY = volDataX + y*vol.volDims[1]; + for(int z = 0; z < vol.volResolution.z; z++) { - const Voxel& voxel0 = volDataY[z]; + const Voxel& voxel0 = volDataY[z*vol.volDims[2]]; volumeType v0 = voxel0.v; if(voxel0.weight != 0 && v0 != 1.f) { - Point3f V = (Point3f((float)x, (float)y, (float)z) + - Point3f(0.5f, 0.5f, 0.5f))*vol.voxelSize; + Point3f V(Point3f((float)x + 0.5f, (float)y + 0.5f, (float)z + 0.5f)*vol.voxelSize); coord(points, normals, x, y, z, V, v0, 0); coord(points, normals, x, y, z, V, v0, 1); @@ -1074,13 +1075,13 @@ struct FetchPointsNormalsInvoker : ParallelLoopBody void TSDFVolumeCPU::fetchPointsNormals(OutputArray _points, OutputArray _normals) const { - ScopeTime st("tsdf: fetch points+normals"); + CV_TRACE_FUNCTION(); if(_points.needed()) { std::vector< std::vector > pVecs, nVecs; FetchPointsNormalsInvoker fi(*this, pVecs, nVecs, _normals.needed()); - Range range(0, edgeResolution); + Range range(0, volResolution.x); const int nstripes = -1; parallel_for_(range, fi, nstripes); std::vector points, normals; @@ -1090,15 +1091,15 @@ void TSDFVolumeCPU::fetchPointsNormals(OutputArray _points, OutputArray _normals normals.insert(normals.end(), nVecs[i].begin(), nVecs[i].end()); } - _points.create((int)points.size(), 1, DataType::type); + _points.create((int)points.size(), 1, POINT_TYPE); if(!points.empty()) - Mat((int)points.size(), 1, DataType::type, &points[0]).copyTo(_points.getMat()); + Mat((int)points.size(), 1, POINT_TYPE, &points[0]).copyTo(_points.getMat()); if(_normals.needed()) { - _normals.create((int)normals.size(), 1, DataType::type); + _normals.create((int)normals.size(), 1, POINT_TYPE); if(!normals.empty()) - Mat((int)normals.size(), 1, DataType::type, &normals[0]).copyTo(_normals.getMat()); + Mat((int)normals.size(), 1, POINT_TYPE, &normals[0]).copyTo(_normals.getMat()); } } } @@ -1115,7 +1116,9 @@ struct PushNormals Point3f n = nan3; if(!isNaN(p)) { - n = vol.pose.rotation() * vol.getNormalVoxel(invPose * p * vol.voxelSizeInv); + Point3f voxPt = (invPose * p); + voxPt = voxPt * vol.voxelSizeInv; + n = vol.pose.rotation() * vol.getNormalVoxel(voxPt); } normals(position[0], position[1]) = toPtype(n); } @@ -1128,12 +1131,12 @@ struct PushNormals void TSDFVolumeCPU::fetchNormals(InputArray _points, OutputArray _normals) const { - ScopeTime st("tsdf: fetch normals"); + CV_TRACE_FUNCTION(); if(_normals.needed()) { Points points = _points.getMat(); - CV_Assert(points.type() == DataType::type); + CV_Assert(points.type() == POINT_TYPE); _normals.createSameSize(_points, _points.type()); Mat_ normals = _normals.getMat(); @@ -1144,76 +1147,336 @@ void TSDFVolumeCPU::fetchNormals(InputArray _points, OutputArray _normals) const ///////// GPU implementation ///////// +#ifdef HAVE_OPENCL + class TSDFVolumeGPU : public TSDFVolume { public: // dimension in voxels, size in meters - TSDFVolumeGPU(int _res, float _size, cv::Affine3f _pose, float _truncDist, int _maxWeight, + TSDFVolumeGPU(Point3i _res, float _voxelSize, cv::Affine3f _pose, float _truncDist, int _maxWeight, float _raycastStepFactor); - virtual void integrate(cv::Ptr depth, float depthFactor, cv::Affine3f cameraPose, cv::kinfu::Intr intrinsics) override; - virtual void raycast(cv::Affine3f cameraPose, cv::kinfu::Intr intrinsics, cv::Size frameSize, int pyramidLevels, - cv::Ptr frameGenerator, cv::Ptr frame) const override; + virtual void integrate(InputArray _depth, float depthFactor, cv::Affine3f cameraPose, cv::kinfu::Intr intrinsics) override; + virtual void raycast(cv::Affine3f cameraPose, cv::kinfu::Intr intrinsics, cv::Size frameSize, + cv::OutputArray _points, cv::OutputArray _normals) const override; virtual void fetchPointsNormals(cv::OutputArray points, cv::OutputArray normals) const override; - virtual void fetchNormals(cv::InputArray points, cv::OutputArray _normals) const override; + virtual void fetchNormals(cv::InputArray points, cv::OutputArray normals) const override; virtual void reset() override; + + // See zFirstMemOrder arg of parent class constructor + // for the array layout info + // Array elem is CV_32FC2, read as (float, int) + // TODO: optimization possible to (fp16, int16), see Voxel definition + UMat volume; }; -TSDFVolumeGPU::TSDFVolumeGPU(int _res, float _size, cv::Affine3f _pose, float _truncDist, int _maxWeight, - float _raycastStepFactor) : - TSDFVolume(_res, _size, _pose, _truncDist, _maxWeight, _raycastStepFactor) -{ } +TSDFVolumeGPU::TSDFVolumeGPU(Point3i _res, float _voxelSize, cv::Affine3f _pose, float _truncDist, int _maxWeight, + float _raycastStepFactor) : + TSDFVolume(_res, _voxelSize, _pose, _truncDist, _maxWeight, _raycastStepFactor, false) +{ + volume = UMat(1, volResolution.x * volResolution.y * volResolution.z, CV_32FC2); + + reset(); +} // zero volume, leave rest params the same void TSDFVolumeGPU::reset() { - throw std::runtime_error("Not implemented"); + CV_TRACE_FUNCTION(); + + volume.setTo(Scalar(0, 0)); } // use depth instead of distance (optimization) -void TSDFVolumeGPU::integrate(cv::Ptr /*depth*/, float /*depthFactor*/, cv::Affine3f /*cameraPose*/, Intr /*intrinsics*/) +void TSDFVolumeGPU::integrate(InputArray _depth, float depthFactor, + cv::Affine3f cameraPose, Intr intrinsics) { - throw std::runtime_error("Not implemented"); + CV_TRACE_FUNCTION(); + + UMat depth = _depth.getUMat(); + + cv::String errorStr; + cv::String name = "integrate"; + ocl::ProgramSource source = ocl::rgbd::tsdf_oclsrc; + cv::String options = "-cl-fast-relaxed-math -cl-mad-enable"; + ocl::Kernel k; + k.create(name.c_str(), source, options, &errorStr); + + if(k.empty()) + throw std::runtime_error("Failed to create kernel: " + errorStr); + + cv::Affine3f vol2cam(cameraPose.inv() * pose); + float dfac = 1.f/depthFactor; + Vec4i volResGpu(volResolution.x, volResolution.y, volResolution.z); + Vec2f fxy(intrinsics.fx, intrinsics.fy), cxy(intrinsics.cx, intrinsics.cy); + + // TODO: optimization possible + // Use sampler for depth (mask needed) + k.args(ocl::KernelArg::ReadOnly(depth), + ocl::KernelArg::PtrReadWrite(volume), + ocl::KernelArg::Constant(vol2cam.matrix.val, + sizeof(vol2cam.matrix.val)), + voxelSize, + volResGpu.val, + volDims.val, + fxy.val, + cxy.val, + dfac, + truncDist, + maxWeight); + + size_t globalSize[2]; + globalSize[0] = (size_t)volResolution.x; + globalSize[1] = (size_t)volResolution.y; + + if(!k.run(2, globalSize, NULL, true)) + throw std::runtime_error("Failed to run kernel"); } -void TSDFVolumeGPU::raycast(cv::Affine3f /*cameraPose*/, Intr /*intrinsics*/, Size /*frameSize*/, int /*pyramidLevels*/, - Ptr /* frameGenerator */, Ptr /* frame */) const +void TSDFVolumeGPU::raycast(cv::Affine3f cameraPose, Intr intrinsics, Size frameSize, + cv::OutputArray _points, cv::OutputArray _normals) const { - throw std::runtime_error("Not implemented"); + CV_TRACE_FUNCTION(); + + CV_Assert(frameSize.area() > 0); + + cv::String errorStr; + cv::String name = "raycast"; + ocl::ProgramSource source = ocl::rgbd::tsdf_oclsrc; + cv::String options = "-cl-fast-relaxed-math -cl-mad-enable"; + ocl::Kernel k; + k.create(name.c_str(), source, options, &errorStr); + + if(k.empty()) + throw std::runtime_error("Failed to create kernel: " + errorStr); + + _points.create (frameSize, CV_32FC4); + _normals.create(frameSize, CV_32FC4); + + UMat points = _points.getUMat(); + UMat normals = _normals.getUMat(); + + UMat vol2camGpu, cam2volGpu; + Affine3f vol2cam = cameraPose.inv() * pose; + Affine3f cam2vol = pose.inv() * cameraPose; + Mat(cam2vol.matrix).copyTo(cam2volGpu); + Mat(vol2cam.matrix).copyTo(vol2camGpu); + Intr::Reprojector r = intrinsics.makeReprojector(); + // We do subtract voxel size to minimize checks after + // Note: origin of volume coordinate is placed + // in the center of voxel (0,0,0), not in the corner of the voxel! + Vec4f boxMin, boxMax(volSize.x - voxelSize, + volSize.y - voxelSize, + volSize.z - voxelSize); + Vec2f finv(r.fxinv, r.fyinv), cxy(r.cx, r.cy); + float tstep = truncDist * raycastStepFactor; + + Vec4i volResGpu(volResolution.x, volResolution.y, volResolution.z); + + k.args(ocl::KernelArg::WriteOnlyNoSize(points), + ocl::KernelArg::WriteOnlyNoSize(normals), + frameSize, + ocl::KernelArg::PtrReadOnly(volume), + ocl::KernelArg::PtrReadOnly(vol2camGpu), + ocl::KernelArg::PtrReadOnly(cam2volGpu), + finv.val, cxy.val, + boxMin.val, boxMax.val, + tstep, + voxelSize, + volResGpu.val, + volDims.val, + neighbourCoords.val); + + size_t globalSize[2]; + globalSize[0] = (size_t)frameSize.width; + globalSize[1] = (size_t)frameSize.height; + + if(!k.run(2, globalSize, NULL, true)) + throw std::runtime_error("Failed to run kernel"); } -void TSDFVolumeGPU::fetchNormals(InputArray /*_points*/, OutputArray /*_normals*/) const +void TSDFVolumeGPU::fetchNormals(InputArray _points, OutputArray _normals) const { - throw std::runtime_error("Not implemented"); + CV_TRACE_FUNCTION(); + + if(_normals.needed()) + { + UMat points = _points.getUMat(); + CV_Assert(points.type() == POINT_TYPE); + + _normals.createSameSize(_points, POINT_TYPE); + UMat normals = _normals.getUMat(); + + cv::String errorStr; + cv::String name = "getNormals"; + ocl::ProgramSource source = ocl::rgbd::tsdf_oclsrc; + cv::String options = "-cl-fast-relaxed-math -cl-mad-enable"; + ocl::Kernel k; + k.create(name.c_str(), source, options, &errorStr); + + if(k.empty()) + throw std::runtime_error("Failed to create kernel: " + errorStr); + + UMat volPoseGpu, invPoseGpu; + Mat(pose .matrix).copyTo(volPoseGpu); + Mat(pose.inv().matrix).copyTo(invPoseGpu); + Vec4i volResGpu(volResolution.x, volResolution.y, volResolution.z); + Size frameSize = points.size(); + + k.args(ocl::KernelArg::ReadOnlyNoSize(points), + ocl::KernelArg::WriteOnlyNoSize(normals), + frameSize, + ocl::KernelArg::PtrReadOnly(volume), + ocl::KernelArg::PtrReadOnly(volPoseGpu), + ocl::KernelArg::PtrReadOnly(invPoseGpu), + voxelSizeInv, + volResGpu.val, + volDims.val, + neighbourCoords.val); + + size_t globalSize[2]; + globalSize[0] = (size_t)points.cols; + globalSize[1] = (size_t)points.rows; + + if(!k.run(2, globalSize, NULL, true)) + throw std::runtime_error("Failed to run kernel"); + } } -void TSDFVolumeGPU::fetchPointsNormals(OutputArray /*points*/, OutputArray /*normals*/) const +void TSDFVolumeGPU::fetchPointsNormals(OutputArray points, OutputArray normals) const { - throw std::runtime_error("Not implemented"); + CV_TRACE_FUNCTION(); + + if(points.needed()) + { + bool needNormals = normals.needed(); + + // 1. scan to count points in each group and allocate output arrays + + ocl::Kernel kscan; + + cv::String errorStr; + ocl::ProgramSource source = ocl::rgbd::tsdf_oclsrc; + cv::String options = "-cl-fast-relaxed-math -cl-mad-enable"; + + kscan.create("scanSize", source, options, &errorStr); + + if(kscan.empty()) + throw std::runtime_error("Failed to create kernel: " + errorStr); + + size_t globalSize[3]; + globalSize[0] = (size_t)volResolution.x; + globalSize[1] = (size_t)volResolution.y; + globalSize[2] = (size_t)volResolution.z; + + const ocl::Device& device = ocl::Device::getDefault(); + size_t wgsLimit = device.maxWorkGroupSize(); + size_t memSize = device.localMemSize(); + // local mem should keep a point (and a normal) for each thread in a group + // use 4 float per each point and normal + size_t elemSize = (sizeof(float)*4)*(needNormals ? 2 : 1); + const size_t lcols = 8; + const size_t lrows = 8; + size_t lplanes = min(memSize/elemSize, wgsLimit)/lcols/lrows; + lplanes = roundDownPow2(lplanes); + size_t localSize[3] = {lcols, lrows, lplanes}; + Vec3i ngroups((int)divUp(globalSize[0], (unsigned int)localSize[0]), + (int)divUp(globalSize[1], (unsigned int)localSize[1]), + (int)divUp(globalSize[2], (unsigned int)localSize[2])); + + const size_t counterSize = sizeof(int); + size_t lsz = localSize[0]*localSize[1]*localSize[2]*counterSize; + + const int gsz[3] = {ngroups[2], ngroups[1], ngroups[0]}; + UMat groupedSum(3, gsz, CV_32S, Scalar(0)); + + UMat volPoseGpu; + Mat(pose.matrix).copyTo(volPoseGpu); + Vec4i volResGpu(volResolution.x, volResolution.y, volResolution.z); + + kscan.args(ocl::KernelArg::PtrReadOnly(volume), + volResGpu.val, + volDims.val, + neighbourCoords.val, + ocl::KernelArg::PtrReadOnly(volPoseGpu), + voxelSize, + voxelSizeInv, + //TODO: replace by KernelArg::Local(lsz) + ocl::KernelArg(ocl::KernelArg::LOCAL, 0, 1, 1, 0, lsz), + ocl::KernelArg::WriteOnlyNoSize(groupedSum)); + + if(!kscan.run(3, globalSize, localSize, true)) + throw std::runtime_error("Failed to run kernel"); + + Mat groupedSumCpu = groupedSum.getMat(ACCESS_READ); + int gpuSum = (int)cv::sum(groupedSumCpu)[0]; + // should be no CPU copies when new kernel is executing + groupedSumCpu.release(); + + // 2. fill output arrays according to per-group points count + + ocl::Kernel kfill; + kfill.create("fillPtsNrm", source, options, &errorStr); + + if(kfill.empty()) + throw std::runtime_error("Failed to create kernel: " + errorStr); + + points.create(gpuSum, 1, POINT_TYPE); + UMat pts = points.getUMat(); + UMat nrm; + if(needNormals) + { + normals.create(gpuSum, 1, POINT_TYPE); + nrm = normals.getUMat(); + } + else + { + // it won't access but empty args are forbidden + nrm = UMat(1, 1, POINT_TYPE); + } + UMat atomicCtr(1, 1, CV_32S, Scalar(0)); + + // mem size to keep pts (and normals optionally) for all work-items in a group + lsz = localSize[0]*localSize[1]*localSize[2]*elemSize; + + kfill.args(ocl::KernelArg::PtrReadOnly(volume), + volResGpu.val, + volDims.val, + neighbourCoords.val, + ocl::KernelArg::PtrReadOnly(volPoseGpu), + voxelSize, + voxelSizeInv, + ((int)needNormals), + //TODO: replace by ::Local(lsz) + ocl::KernelArg(ocl::KernelArg::LOCAL, 0, 1, 1, 0, lsz), + ocl::KernelArg::PtrReadWrite(atomicCtr), + ocl::KernelArg::ReadOnlyNoSize(groupedSum), + ocl::KernelArg::WriteOnlyNoSize(pts), + ocl::KernelArg::WriteOnlyNoSize(nrm) + ); + + if(!kfill.run(3, globalSize, localSize, true)) + throw std::runtime_error("Failed to run kernel"); + } } -cv::Ptr makeTSDFVolume(cv::kinfu::Params::PlatformType t, - int _res, float _size, cv::Affine3f _pose, float _truncDist, int _maxWeight, +#endif + +cv::Ptr makeTSDFVolume(Point3i _res, float _voxelSize, cv::Affine3f _pose, float _truncDist, int _maxWeight, float _raycastStepFactor) { - switch (t) - { - case cv::kinfu::Params::PlatformType::PLATFORM_CPU: - return cv::makePtr(_res, _size, _pose, _truncDist, _maxWeight, - _raycastStepFactor); - case cv::kinfu::Params::PlatformType::PLATFORM_GPU: - return cv::makePtr(_res, _size, _pose, _truncDist, _maxWeight, - _raycastStepFactor); - default: - return cv::Ptr(); - } +#ifdef HAVE_OPENCL + if(cv::ocl::isOpenCLActivated()) + return cv::makePtr(_res, _voxelSize, _pose, _truncDist, _maxWeight, _raycastStepFactor); +#endif + return cv::makePtr(_res, _voxelSize, _pose, _truncDist, _maxWeight, _raycastStepFactor); } } // namespace kinfu diff --git a/modules/rgbd/src/tsdf.hpp b/modules/rgbd/src/tsdf.hpp index f4008ac11..3003f0375 100644 --- a/modules/rgbd/src/tsdf.hpp +++ b/modules/rgbd/src/tsdf.hpp @@ -18,12 +18,12 @@ class TSDFVolume { public: // dimension in voxels, size in meters - TSDFVolume(int _res, float _size, cv::Affine3f _pose, float _truncDist, int _maxWeight, - float _raycastStepFactor); + TSDFVolume(Point3i _res, float _voxelSize, cv::Affine3f _pose, float _truncDist, int _maxWeight, + float _raycastStepFactor, bool zFirstMemOrder = true); - virtual void integrate(cv::Ptr depth, float depthFactor, cv::Affine3f cameraPose, cv::kinfu::Intr intrinsics) = 0; - virtual void raycast(cv::Affine3f cameraPose, cv::kinfu::Intr intrinsics, cv::Size frameSize, int pyramidLevels, - cv::Ptr frameGenerator, cv::Ptr frame) const = 0; + virtual void integrate(InputArray _depth, float depthFactor, cv::Affine3f cameraPose, cv::kinfu::Intr intrinsics) = 0; + virtual void raycast(cv::Affine3f cameraPose, cv::kinfu::Intr intrinsics, cv::Size frameSize, + cv::OutputArray points, cv::OutputArray normals) const = 0; virtual void fetchPointsNormals(cv::OutputArray points, cv::OutputArray normals) const = 0; virtual void fetchNormals(cv::InputArray points, cv::OutputArray _normals) const = 0; @@ -32,14 +32,20 @@ public: virtual ~TSDFVolume() { } - float edgeSize; - int edgeResolution; + float voxelSize; + float voxelSizeInv; + Point3i volResolution; int maxWeight; cv::Affine3f pose; + float raycastStepFactor; + + Point3f volSize; + float truncDist; + Vec4i volDims; + Vec8i neighbourCoords; }; -cv::Ptr makeTSDFVolume(cv::kinfu::Params::PlatformType t, - int _res, float _size, cv::Affine3f _pose, float _truncDist, int _maxWeight, +cv::Ptr makeTSDFVolume(Point3i _res, float _voxelSize, cv::Affine3f _pose, float _truncDist, int _maxWeight, float _raycastStepFactor); } // namespace kinfu diff --git a/modules/rgbd/src/utils.cpp b/modules/rgbd/src/utils.cpp index 92e227d93..78de0aa85 100644 --- a/modules/rgbd/src/utils.cpp +++ b/modules/rgbd/src/utils.cpp @@ -50,38 +50,6 @@ namespace rgbd namespace kinfu { -#if PRINT_TIME - -ScopeTime::ScopeTime(std::string name_, bool _enablePrint) : - name(name_), enablePrint(_enablePrint) -{ - start = (double)cv::getTickCount(); - nested++; -} - -ScopeTime::~ScopeTime() -{ - double time_ms = ((double)cv::getTickCount() - start)*1000.0/cv::getTickFrequency(); - if(enablePrint) - { - std::string spaces(nested, '-'); - std::cout << spaces << "Time(" << name << ") = " << time_ms << " ms" << std::endl; - } - nested--; -} - -int ScopeTime::nested = 0; - -#else - -ScopeTime::ScopeTime(std::string /*name_*/, bool /*_enablePrint = true*/) -{ } - -ScopeTime::~ScopeTime() -{ } - -#endif - } // namespace kinfu } // namespace cv diff --git a/modules/rgbd/src/utils.hpp b/modules/rgbd/src/utils.hpp index 83c6fddaf..53b3195a2 100644 --- a/modules/rgbd/src/utils.hpp +++ b/modules/rgbd/src/utils.hpp @@ -47,14 +47,14 @@ rescaleDepthTemplated(const Mat& in, Mat& out) namespace kinfu { -// Print execution times of each block marked with ScopeTime -#define PRINT_TIME 0 +// One place to turn intrinsics on and off +#define USE_INTRINSICS CV_SIMD128 typedef float depthType; const float qnan = std::numeric_limits::quiet_NaN(); const cv::Vec3f nan3(qnan, qnan, qnan); -#if CV_SIMD128 +#if USE_INTRINSICS const cv::v_float32x4 nanv(qnan, qnan, qnan, qnan); #endif @@ -63,26 +63,13 @@ inline bool isNaN(cv::Point3f p) return (cvIsNaN(p.x) || cvIsNaN(p.y) || cvIsNaN(p.z)); } -#if CV_SIMD128 +#if USE_INTRINSICS static inline bool isNaN(const cv::v_float32x4& p) { return cv::v_check_any(p != p); } #endif -struct ScopeTime -{ - ScopeTime(std::string name_, bool _enablePrint = true); - ~ScopeTime(); - -#if PRINT_TIME - static int nested; - const std::string name; - const bool enablePrint; - double start; -#endif -}; - /** @brief Camera intrinsics */ struct Intr { @@ -143,6 +130,16 @@ struct Intr float fx, fy, cx, cy; }; +inline size_t roundDownPow2(size_t x) +{ + size_t shift = 0; + while(x != 0) + { + shift++; x >>= 1; + } + return (size_t)(1ULL << (shift-1)); +} + } // namespace kinfu } // namespace cv diff --git a/modules/rgbd/test/test_kinfu.cpp b/modules/rgbd/test/test_kinfu.cpp index 16bba6776..2ac5ebaad 100644 --- a/modules/rgbd/test/test_kinfu.cpp +++ b/modules/rgbd/test/test_kinfu.cpp @@ -90,10 +90,18 @@ struct RenderInvoker : ParallelLoopBody float depthFactor; }; -struct CubeSpheresScene +struct Scene +{ + virtual ~Scene() {} + static Ptr create(int nScene, Size sz, Matx33f _intr, float _depthFactor); + virtual Mat depth(Affine3f pose) = 0; + virtual std::vector getPoses() = 0; +}; + +struct CubeSpheresScene : Scene { const int framesPerCycle = 32; - const int nCycles = 1; + const float nCycles = 0.25f; const Affine3f startPose = Affine3f(Vec3f(-0.5f, 0.f, 0.f), Vec3f(2.1f, 1.4f, -2.1f)); CubeSpheresScene(Size sz, Matx33f _intr, float _depthFactor) : @@ -125,7 +133,7 @@ struct CubeSpheresScene return res; } - Mat depth(Affine3f pose) + Mat depth(Affine3f pose) override { Mat_ frame(frameSize); Reprojector reproj(intr); @@ -136,10 +144,10 @@ struct CubeSpheresScene return frame; } - std::vector getPoses() + std::vector getPoses() override { std::vector poses; - for(int i = 0; i < framesPerCycle*nCycles; i++) + for(int i = 0; i < (int)(framesPerCycle*nCycles); i++) { float angle = (float)(CV_2PI*i/framesPerCycle); Affine3f pose; @@ -160,10 +168,10 @@ struct CubeSpheresScene }; -struct RotatingScene +struct RotatingScene : Scene { - const int framesPerCycle = 64; - const int nCycles = 1; + const int framesPerCycle = 32; + const float nCycles = 0.5f; const Affine3f startPose = Affine3f(Vec3f(-1.f, 0.f, 0.f), Vec3f(1.5f, 2.f, -1.5f)); RotatingScene(Size sz, Matx33f _intr, float _depthFactor) : @@ -221,7 +229,7 @@ struct RotatingScene return res; } - Mat depth(Affine3f pose) + Mat depth(Affine3f pose) override { Mat_ frame(frameSize); Reprojector reproj(intr); @@ -232,7 +240,7 @@ struct RotatingScene return frame; } - std::vector getPoses() + std::vector getPoses() override { std::vector poses; for(int i = 0; i < framesPerCycle*nCycles; i++) @@ -258,24 +266,42 @@ struct RotatingScene Mat_ RotatingScene::randTexture(256, 256); +Ptr Scene::create(int nScene, Size sz, Matx33f _intr, float _depthFactor) +{ + if(nScene == 0) + return makePtr(sz, _intr, _depthFactor); + else + return makePtr(sz, _intr, _depthFactor); +} + static const bool display = false; -TEST( KinectFusion, lowDense ) +void flyTest(bool hiDense, bool inequal) { - Ptr params = kinfu::Params::coarseParams(); + Ptr params; + if(hiDense) + params = kinfu::Params::defaultParams(); + else + params = kinfu::Params::coarseParams(); - RotatingScene scene(params->frameSize, params->intr, params->depthFactor); + if(inequal) + { + params->volumeDims[0] += 32; + params->volumeDims[1] -= 32; + } + + Ptr scene = Scene::create(hiDense, params->frameSize, params->intr, params->depthFactor); Ptr kf = kinfu::KinFu::create(params); - std::vector poses = scene.getPoses(); + std::vector poses = scene->getPoses(); Affine3f startPoseGT = poses[0], startPoseKF; Affine3f pose, kfPose; for(size_t i = 0; i < poses.size(); i++) { pose = poses[i]; - Mat depth = scene.depth(pose); + Mat depth = scene->depth(pose); ASSERT_TRUE(kf->update(depth)); @@ -295,46 +321,35 @@ TEST( KinectFusion, lowDense ) } } - ASSERT_LT(cv::norm(kfPose.rvec() - pose.rvec()), 0.01); - ASSERT_LT(cv::norm(kfPose.translation() - pose.translation()), 0.1); + double rvecThreshold = hiDense ? 0.01 : 0.02; + ASSERT_LT(cv::norm(kfPose.rvec() - pose.rvec()), rvecThreshold); + double poseThreshold = hiDense ? 0.03 : 0.1; + ASSERT_LT(cv::norm(kfPose.translation() - pose.translation()), poseThreshold); +} + +TEST( KinectFusion, lowDense ) +{ + flyTest(false, false); } TEST( KinectFusion, highDense ) { - Ptr params = kinfu::Params::defaultParams(); - CubeSpheresScene scene(params->frameSize, params->intr, params->depthFactor); - - Ptr kf = kinfu::KinFu::create(params); - - std::vector poses = scene.getPoses(); - Affine3f startPoseGT = poses[0], startPoseKF; - Affine3f pose, kfPose; - for(size_t i = 0; i < poses.size(); i++) - { - pose = poses[i]; - - Mat depth = scene.depth(pose); - - ASSERT_TRUE(kf->update(depth)); - - kfPose = kf->getPose(); - if(i == 0) - startPoseKF = kfPose; - - pose = ( startPoseGT.inv() * pose )*startPoseKF; - - if(display) - { - imshow("depth", depth*(1.f/params->depthFactor/4.f)); - Mat rendered; - kf->render(rendered); - imshow("render", rendered); - waitKey(10); - } - } - - ASSERT_LT(cv::norm(kfPose.rvec() - pose.rvec()), 0.01); - ASSERT_LT(cv::norm(kfPose.translation() - pose.translation()), 0.03); + flyTest(true, false); } +TEST( KinectFusion, inequal ) +{ + flyTest(false, true); +} + +#ifdef HAVE_OPENCL +TEST( KinectFusion, OCL ) +{ + cv::ocl::setUseOpenCL(false); + flyTest(false, false); + cv::ocl::setUseOpenCL(true); + flyTest(false, false); +} +#endif + }} // namespace diff --git a/modules/rgbd/test/test_odometry.cpp b/modules/rgbd/test/test_odometry.cpp index 3da490c0b..ce7ec1016 100644 --- a/modules/rgbd/test/test_odometry.cpp +++ b/modules/rgbd/test/test_odometry.cpp @@ -235,10 +235,10 @@ void CV_OdometryTest::run(int) // On each iteration an input frame is warped using generated transformation. // Odometry is run on the following pair: the original frame and the warped one. // Comparing a computed transformation with an applied one we compute 2 errors: - // better_1time_count - count of poses which error is less than ground thrush pose, - // better_5times_count - count of poses which error is 5 times less than ground thrush pose. + // better_1time_count - count of poses which error is less than ground truth pose, + // better_5times_count - count of poses which error is 5 times less than ground truth pose. int iterCount = 100; - int better_1time_count = 0; + int better_1time_count = 0; int better_5times_count = 0; for(int iter = 0; iter < iterCount; iter++) { @@ -267,7 +267,6 @@ void CV_OdometryTest::run(int) waitKey(); #endif - // compare rotation double rdiffnorm = cv::norm(rvec - calcRvec), rnorm = cv::norm(rvec); diff --git a/modules/rgbd/test/test_precomp.hpp b/modules/rgbd/test/test_precomp.hpp index b39793a6c..a896e58e4 100644 --- a/modules/rgbd/test/test_precomp.hpp +++ b/modules/rgbd/test/test_precomp.hpp @@ -13,6 +13,10 @@ #include #include +#ifdef HAVE_OPENCL +#include +#endif + namespace opencv_test { using namespace cv::rgbd; }