From b267fc3d1daf766a2f6e615d4e76bf5b5df6a7a8 Mon Sep 17 00:00:00 2001
From: DumDereDum <46279571+DumDereDum@users.noreply.github.com>
Date: Thu, 17 Sep 2020 16:20:58 +0300
Subject: [PATCH] Merge pull request #2629 from DumDereDum:tsdf_optimization

TSDF Optimization

* getMat using fix

* min/max fix

* create WeightType

* create normals test

* bug fix

* complete normals test

* fix makeVolume and rewrite tests

* minor fixes

* add new normal tests

* replace operator() on lambda expressions

* make a valid points test

* minor fixes

* getNormalVoxel fix in tsdf and hashTsdf

* create renderPointsNormals

* replace Affine3f with Matx44f oin make volume

* minor fixes

* minor fix

* tmp

* create function interpolateVoxel for hashTSDF

* tmp

* right interpolation for HashTSDF

* rewrite intrinsics normalize

* minor fix

* rewrite GPU normalize

* start to write perf tests

* make Volume fix

* GPU normalize fix

* minor fix

* create perf test for raycast

* fix LNK2019 problem in perf test

* made all perf tests

* replace all Affine3f with Matx44f

* replace Point3i with Vec3i

* minor fix

* minor fix

* add CV_EXPORT_W

* build fix 1

* build fix 2

* build fix 3

* warning fix

* build test

* win test

* tests without HashTSDF

* create noparallel normals checking

* test without fetch

* test without fetch points normals

* add end line

* revert rotation() in hash_tsdf

* fix matrix multiplication order

* fetch points normals invoker fix

* warning fix

* warning fix

* Docs fix

* Hash push normals fix

* replace operator() with lambda in PushNormals

* warning fix

* create half type and types conversion

* error fix and preparation for CPU optimization

* replace all TsdfType with half

* minor fixes

* minor fix

* raycast fix

* conversion bug fix

* delete cout

* it's alive!

* improve conversion CPU

* warning fix

* warning fix 1

* intrinsics optimization

* minor fixes

* warning fix

* interpolate improve

* start to optimize GPU version

* vectorize tsdfToFloat

* CPU optimization

* GPU optimization

* minor fix

* minor fix

* Docs fix

* minor fixes

* add perf tests HashTSDF

* hashTSDF optimization

* minor fix

* interpolate improvement

* getNormalVoxel improvement

* added a new calculation pixNorm

* tsdfToFloat improve

* add flag USE_INTERPOLATION_IN_GETNORMAL to HashTSDF

* minor fix

* minor fix

* replace int with uchar

* getNormal improve

* minor fix

* minor fixes

* inline _at()

* inline _at()

* vectorize interpolation

* i tried :(

* minor fix

* try to improve _getNormalVoxel

* minor fix

* create new struct for tests

* minor test fix

* minor fix

* minor fix 1

* minor fix 2

* minor fix 3

* minor fix 4

* minor fix 5

* minor fix 6

* minor fix 7

* monor test fix

* monor test fix 1

* integrate improve

* rewrite interpolation in getNormalVoxel in HashTSDF

* intergrate improve tsdf

* minor Docs fix

* change autolock mutex place

* reduce interpolation calls

* new getNormal

* minor fix

* tmp mutex optimization

* rewrite mutex lock

* rewrite Settings structure as class

* minor Docs fix

* new inteprolateVoxel debug

* new inteprolateVoxel

* minor fix

* add new voxelToVolumeUnitIdx

* new integrate with lambda functions

* new integrate minor fix

* minor fix

* minor fix 1

* pixNorm, I try :(

* need to rewrite [][] part

* It's Alive

* omg? it works!

* minor fix

* minor fix 1

* minor fix

* add new environment check

* minor fix

* minor fix 1

* Docs fix

* minor fix 3

* minor fix

* minor fix 1

Co-authored-by: arsaratovtsev <artem.saratovtsev@intel.com>
---
 modules/rgbd/perf/perf_tsdf.cpp |  97 ++++---
 modules/rgbd/src/hash_tsdf.cpp  | 459 +++++++++++++++++++++++---------
 modules/rgbd/src/hash_tsdf.hpp  |   3 +-
 modules/rgbd/src/opencl/tsdf.cl |  76 ++++--
 modules/rgbd/src/tsdf.cpp       | 199 +++++++++-----
 modules/rgbd/src/tsdf.hpp       |  16 +-
 modules/rgbd/test/test_tsdf.cpp |   2 +-
 7 files changed, 590 insertions(+), 262 deletions(-)

diff --git a/modules/rgbd/perf/perf_tsdf.cpp b/modules/rgbd/perf/perf_tsdf.cpp
index 2ccc36851..ebca91587 100644
--- a/modules/rgbd/perf/perf_tsdf.cpp
+++ b/modules/rgbd/perf/perf_tsdf.cpp
@@ -169,25 +169,39 @@ Ptr<Scene> Scene::create(Size sz, Matx33f _intr, float _depthFactor)
     return makePtr<SemisphereScene>(sz, _intr, _depthFactor);
 }
 
+class Settings
+{
+public:
+    Ptr<kinfu::Params> _params;
+    Ptr<kinfu::Volume> volume;
+    Ptr<Scene> scene;
+    std::vector<Affine3f> poses;
+
+    Settings(bool useHashTSDF)
+    {
+        if (useHashTSDF)
+            _params = kinfu::Params::hashTSDFParams(true);
+        else
+            _params = kinfu::Params::coarseParams();
+
+        volume = kinfu::makeVolume(_params->volumeType, _params->voxelSize, _params->volumePose.matrix,
+            _params->raycast_step_factor, _params->tsdf_trunc_dist, _params->tsdf_max_weight,
+            _params->truncateThreshold, _params->volumeDims);
+
+        scene = Scene::create(_params->frameSize, _params->intr, _params->depthFactor);
+        poses = scene->getPoses();
+    }
+};
+
 PERF_TEST(Perf_TSDF, integrate)
 {
-    Ptr<kinfu::Params> _params;
-    _params = kinfu::Params::coarseParams();
-
-    Ptr<kinfu::Volume> volume = kinfu::makeVolume(_params->volumeType, _params->voxelSize, _params->volumePose.matrix,
-        _params->raycast_step_factor, _params->tsdf_trunc_dist, _params->tsdf_max_weight,
-        _params->truncateThreshold, _params->volumeDims);
-
-    Ptr<Scene> scene = Scene::create(_params->frameSize, _params->intr, _params->depthFactor);
-    std::vector<Affine3f> poses = scene->getPoses();
-
-    for (size_t i = 0; i < poses.size(); i++)
+    Settings settings(false);
+    for (size_t i = 0; i < settings.poses.size(); i++)
     {
-        UMat _points, _normals;
-        Matx44f pose = poses[i].matrix;
-        Mat depth = scene->depth(pose);
+        Matx44f pose = settings.poses[i].matrix;
+        Mat depth = settings.scene->depth(pose);
         startTimer();
-        volume->integrate(depth, _params->depthFactor, pose, _params->intr);
+        settings.volume->integrate(depth, settings._params->depthFactor, pose, settings._params->intr);
         stopTimer();
     }
     SANITY_CHECK_NOTHING();
@@ -195,25 +209,48 @@ PERF_TEST(Perf_TSDF, integrate)
 
 PERF_TEST(Perf_TSDF, raycast)
 {
-    Ptr<kinfu::Params> _params;
-    _params = kinfu::Params::coarseParams();
-
-    Ptr<kinfu::Volume> volume = kinfu::makeVolume(_params->volumeType, _params->voxelSize, _params->volumePose.matrix,
-        _params->raycast_step_factor, _params->tsdf_trunc_dist, _params->tsdf_max_weight,
-        _params->truncateThreshold, _params->volumeDims);
-
-    Ptr<Scene> scene = Scene::create(_params->frameSize, _params->intr, _params->depthFactor);
-    std::vector<Affine3f> poses = scene->getPoses();
-
-    for (size_t i = 0; i < poses.size(); i++)
+    Settings settings(false);
+    for (size_t i = 0; i < settings.poses.size(); i++)
     {
         UMat _points, _normals;
-        Matx44f pose = poses[i].matrix;
-        Mat depth = scene->depth(pose);
+        Matx44f pose = settings.poses[i].matrix;
+        Mat depth = settings.scene->depth(pose);
 
-        volume->integrate(depth, _params->depthFactor, pose, _params->intr);
+        settings.volume->integrate(depth, settings._params->depthFactor, pose, settings._params->intr);
         startTimer();
-        volume->raycast(pose, _params->intr, _params->frameSize, _points, _normals);
+        settings.volume->raycast(pose, settings._params->intr, settings._params->frameSize, _points, _normals);
+        stopTimer();
+    }
+    SANITY_CHECK_NOTHING();
+}
+
+PERF_TEST(Perf_HashTSDF, integrate)
+{
+    Settings settings(true);
+
+    for (size_t i = 0; i < settings.poses.size(); i++)
+    {
+        Matx44f pose = settings.poses[i].matrix;
+        Mat depth = settings.scene->depth(pose);
+        startTimer();
+        settings.volume->integrate(depth, settings._params->depthFactor, pose, settings._params->intr);
+        stopTimer();
+    }
+    SANITY_CHECK_NOTHING();
+}
+
+PERF_TEST(Perf_HashTSDF, raycast)
+{
+    Settings settings(true);
+    for (size_t i = 0; i < settings.poses.size(); i++)
+    {
+        UMat _points, _normals;
+        Matx44f pose = settings.poses[i].matrix;
+        Mat depth = settings.scene->depth(pose);
+
+        settings.volume->integrate(depth, settings._params->depthFactor, pose, settings._params->intr);
+        startTimer();
+        settings.volume->raycast(pose, settings._params->intr, settings._params->frameSize, _points, _normals);
         stopTimer();
     }
     SANITY_CHECK_NOTHING();
diff --git a/modules/rgbd/src/hash_tsdf.cpp b/modules/rgbd/src/hash_tsdf.cpp
index 6d45dbfe6..b41450021 100644
--- a/modules/rgbd/src/hash_tsdf.cpp
+++ b/modules/rgbd/src/hash_tsdf.cpp
@@ -16,10 +16,27 @@
 #include "opencv2/core/utils/trace.hpp"
 #include "utils.hpp"
 
+#define USE_INTERPOLATION_IN_GETNORMAL 1
+
+
 namespace cv
 {
 namespace kinfu
 {
+
+static inline TsdfType floatToTsdf(float num)
+{
+    //CV_Assert(-1 < num <= 1);
+    int8_t res = int8_t(num * (-128.f));
+    res = res ? res : (num < 0 ? 1 : -1);
+    return res;
+}
+
+static inline float tsdfToFloat(TsdfType num)
+{
+    return float(num) * (-1.f / 128.f);
+}
+
 HashTSDFVolume::HashTSDFVolume(float _voxelSize, cv::Matx44f _pose, float _raycastStepFactor,
                                float _truncDist, int _maxWeight, float _truncateThreshold,
                                int _volumeUnitRes, bool _zFirstMemOrder)
@@ -48,88 +65,7 @@ void HashTSDFVolumeCPU::reset()
     volumeUnits.clear();
 }
 
-struct AllocateVolumeUnitsInvoker : ParallelLoopBody
-{
-    AllocateVolumeUnitsInvoker(HashTSDFVolumeCPU& _volume, const Depth& _depth, Intr intrinsics,
-                               cv::Matx44f cameraPose, float _depthFactor, int _depthStride = 4)
-        : ParallelLoopBody(),
-          volume(_volume),
-          depth(_depth),
-          reproj(intrinsics.makeReprojector()),
-          cam2vol(_volume.pose.inv() * Affine3f(cameraPose)),
-          depthFactor(1.0f / _depthFactor),
-          depthStride(_depthStride)
-    {
-    }
-
-    virtual void operator()(const Range& range) const override
-    {
-        for (int y = range.start; y < range.end; y += depthStride)
-        {
-            const depthType* depthRow = depth[y];
-            for (int x = 0; x < depth.cols; x += depthStride)
-            {
-                depthType z = depthRow[x] * depthFactor;
-                if (z <= 0)
-                    continue;
-
-                Point3f camPoint = reproj(Point3f((float)x, (float)y, z));
-                Point3f volPoint = cam2vol * camPoint;
-
-                //! Find accessed TSDF volume unit for valid 3D vertex
-                cv::Vec3i lower_bound = volume.volumeToVolumeUnitIdx(
-                    volPoint - cv::Point3f(volume.truncDist, volume.truncDist, volume.truncDist));
-                cv::Vec3i upper_bound = volume.volumeToVolumeUnitIdx(
-                    volPoint + cv::Point3f(volume.truncDist, volume.truncDist, volume.truncDist));
-                VolumeUnitIndexSet localAccessVolUnits;
-                for (int i = lower_bound[0]; i <= upper_bound[0]; i++)
-                    for (int j = lower_bound[1]; j <= upper_bound[1]; j++)
-                        for (int k = lower_bound[2]; k <= lower_bound[2]; k++)
-                        {
-                            const cv::Vec3i tsdf_idx = cv::Vec3i(i, j, k);
-                            if (!localAccessVolUnits.count(tsdf_idx))
-                            {
-                                localAccessVolUnits.emplace(tsdf_idx);
-                            }
-                        }
-                AutoLock al(mutex);
-                for (const auto& tsdf_idx : localAccessVolUnits)
-                {
-                    //! If the insert into the global set passes
-                    if (!volume.volumeUnits.count(tsdf_idx))
-                    {
-                        VolumeUnit volumeUnit;
-                        cv::Point3i volumeDims(volume.volumeUnitResolution,
-                                               volume.volumeUnitResolution,
-                                               volume.volumeUnitResolution);
-
-                        cv::Matx44f subvolumePose =
-                            volume.pose.translate(volume.volumeUnitIdxToVolume(tsdf_idx)).matrix;
-                        volumeUnit.pVolume = cv::makePtr<TSDFVolumeCPU>(
-                            volume.voxelSize, subvolumePose, volume.raycastStepFactor,
-                            volume.truncDist, volume.maxWeight, volumeDims);
-                        //! This volume unit will definitely be required for current integration
-                        volumeUnit.index             = tsdf_idx;
-                        volumeUnit.isActive          = true;
-                        volume.volumeUnits[tsdf_idx] = volumeUnit;
-                    }
-                }
-            }
-        }
-    }
-
-    HashTSDFVolumeCPU& volume;
-    const Depth& depth;
-    const Intr::Reprojector reproj;
-    const cv::Affine3f cam2vol;
-    const float depthFactor;
-    const int depthStride;
-    mutable Mutex mutex;
-};
-
-
-void HashTSDFVolumeCPU::integrate(InputArray _depth, float depthFactor,
-                                  const cv::Matx44f& cameraPose, const Intr& intrinsics)
+void HashTSDFVolumeCPU::integrate(InputArray _depth, float depthFactor, const Matx44f& cameraPose, const Intr& intrinsics)
 {
     CV_TRACE_FUNCTION();
 
@@ -137,11 +73,72 @@ void HashTSDFVolumeCPU::integrate(InputArray _depth, float depthFactor,
     Depth depth = _depth.getMat();
 
     //! Compute volumes to be allocated
-    AllocateVolumeUnitsInvoker allocate_i(*this, depth, intrinsics, cameraPose, depthFactor);
+    const int depthStride = 1;
+    const float invDepthFactor = 1.f / depthFactor;
+    const Intr::Reprojector reproj(intrinsics.makeReprojector());
+    const Affine3f cam2vol(pose.inv() * Affine3f(cameraPose));
+    const Point3f truncPt(truncDist, truncDist, truncDist);
+    VolumeUnitIndexSet newIndices;
+    Mutex mutex;
     Range allocateRange(0, depth.rows);
-    parallel_for_(allocateRange, allocate_i);
+    auto AllocateVolumeUnitsInvoker = [&](const Range& range) {
+        VolumeUnitIndexSet localAccessVolUnits;
+        for (int y = range.start; y < range.end; y += depthStride)
+        {
+            const depthType* depthRow = depth[y];
+            for (int x = 0; x < depth.cols; x += depthStride)
+            {
+                depthType z = depthRow[x] * invDepthFactor;
+                if (z <= 0 || z > this->truncateThreshold)
+                    continue;
+                Point3f camPoint = reproj(Point3f((float)x, (float)y, z));
+                Point3f volPoint = cam2vol * camPoint;
+                //! Find accessed TSDF volume unit for valid 3D vertex
+                Vec3i lower_bound = this->volumeToVolumeUnitIdx(volPoint - truncPt);
+                Vec3i upper_bound = this->volumeToVolumeUnitIdx(volPoint + truncPt);
 
-    //! Get volumes that are in the current camera frame
+                for (int i = lower_bound[0]; i <= upper_bound[0]; i++)
+                    for (int j = lower_bound[1]; j <= upper_bound[1]; j++)
+                        for (int k = lower_bound[2]; k <= lower_bound[2]; k++)
+                        {
+                            const Vec3i tsdf_idx = Vec3i(i, j, k);
+                            if (!localAccessVolUnits.count(tsdf_idx))
+                            {
+                                //! This volume unit will definitely be required for current integration
+                                localAccessVolUnits.emplace(tsdf_idx);
+                            }
+                        }
+            }
+        }
+
+        mutex.lock();
+        for (const auto& tsdf_idx : localAccessVolUnits)
+        {
+            //! If the insert into the global set passes
+            if (!this->volumeUnits.count(tsdf_idx))
+            {
+                // Volume allocation can be performed outside of the lock
+                this->volumeUnits.emplace(tsdf_idx, VolumeUnit());
+                newIndices.emplace(tsdf_idx);
+            }
+        }
+        mutex.unlock();
+    };
+    parallel_for_(allocateRange, AllocateVolumeUnitsInvoker);
+
+    //! Perform the allocation
+    int res = volumeUnitResolution;
+    Point3i volumeDims(res, res, res);
+    for (auto idx : newIndices)
+    {
+        VolumeUnit& vu = volumeUnits[idx];
+        Matx44f subvolumePose = pose.translate(volumeUnitIdxToVolume(idx)).matrix;
+        vu.pVolume = makePtr<TSDFVolumeCPU>(voxelSize, subvolumePose, raycastStepFactor, truncDist, maxWeight, volumeDims);
+        //! This volume unit will definitely be required for current integration
+        vu.isActive = true;
+    }
+
+    //! Get keys for all the allocated volume Units
     std::vector<Vec3i> totalVolUnits;
     for (const auto& keyvalue : volumeUnits)
     {
@@ -156,12 +153,12 @@ void HashTSDFVolumeCPU::integrate(InputArray _depth, float depthFactor,
 
         for (int i = range.start; i < range.end; ++i)
         {
-            cv::Vec3i tsdf_idx         = totalVolUnits[i];
+            Vec3i tsdf_idx = totalVolUnits[i];
             VolumeUnitMap::iterator it = volumeUnits.find(tsdf_idx);
             if (it == volumeUnits.end())
                 return;
 
-            Point3f volumeUnitPos     = volumeUnitIdxToVolume(it->first);
+            Point3f volumeUnitPos = volumeUnitIdxToVolume(it->first);
             Point3f volUnitInCamSpace = vol2cam * volumeUnitPos;
             if (volUnitInCamSpace.z < 0 || volUnitInCamSpace.z > truncateThreshold)
             {
@@ -169,20 +166,19 @@ void HashTSDFVolumeCPU::integrate(InputArray _depth, float depthFactor,
                 return;
             }
             Point2f cameraPoint = proj(volUnitInCamSpace);
-            if (cameraPoint.x >= 0 && cameraPoint.y >= 0 && cameraPoint.x < depth.cols &&
-                cameraPoint.y < depth.rows)
+            if (cameraPoint.x >= 0 && cameraPoint.y >= 0 && cameraPoint.x < depth.cols && cameraPoint.y < depth.rows)
             {
                 assert(it != volumeUnits.end());
                 it->second.isActive = true;
             }
         }
-    });
+        });
 
     //! Integrate the correct volumeUnits
     parallel_for_(Range(0, (int)totalVolUnits.size()), [&](const Range& range) {
         for (int i = range.start; i < range.end; i++)
         {
-            cv::Vec3i tsdf_idx         = totalVolUnits[i];
+            Vec3i tsdf_idx = totalVolUnits[i];
             VolumeUnitMap::iterator it = volumeUnits.find(tsdf_idx);
             if (it == volumeUnits.end())
                 return;
@@ -196,7 +192,7 @@ void HashTSDFVolumeCPU::integrate(InputArray _depth, float depthFactor,
                 volumeUnit.isActive = false;
             }
         }
-    });
+        });
 }
 
 cv::Vec3i HashTSDFVolumeCPU::volumeToVolumeUnitIdx(cv::Point3f p) const
@@ -232,7 +228,7 @@ inline TsdfVoxel HashTSDFVolumeCPU::at(const cv::Vec3i& volumeIdx) const
     if (it == volumeUnits.end())
     {
         TsdfVoxel dummy;
-        dummy.tsdf   = 1.f;
+        dummy.tsdf   = floatToTsdf(1.f);
         dummy.weight = 0;
         return dummy;
     }
@@ -255,7 +251,7 @@ inline TsdfVoxel HashTSDFVolumeCPU::at(const cv::Point3f& point) const
     if (it == volumeUnits.end())
     {
         TsdfVoxel dummy;
-        dummy.tsdf   = 1.f;
+        dummy.tsdf   = floatToTsdf(1.f);
         dummy.weight = 0;
         return dummy;
     }
@@ -269,17 +265,90 @@ inline TsdfVoxel HashTSDFVolumeCPU::at(const cv::Point3f& point) const
     return volumeUnit->at(volUnitLocalIdx);
 }
 
-inline TsdfType HashTSDFVolumeCPU::interpolateVoxel(const cv::Point3f& point) const
+static inline Vec3i voxelToVolumeUnitIdx(const Vec3i& pt, const int vuRes)
 {
-    cv::Point3f neighbourCoords[] = {
-                                Point3f(0, 0, 0),
-                                Point3f(0, 0, 1),
-                                Point3f(0, 1, 0),
-                                Point3f(0, 1, 1),
-                                Point3f(1, 0, 0),
-                                Point3f(1, 0, 1),
-                                Point3f(1, 1, 0),
-                                Point3f(1, 1, 1) };
+    if (!(vuRes & (vuRes - 1)))
+    {
+        // vuRes is a power of 2, let's get this power
+        const int p2 = trailingZeros32(vuRes);
+        return Vec3i(pt[0] >> p2, pt[1] >> p2, pt[2] >> p2);
+    }
+    else
+    {
+        return Vec3i(cvFloor(float(pt[0]) / vuRes),
+            cvFloor(float(pt[1]) / vuRes),
+            cvFloor(float(pt[2]) / vuRes));
+    }
+}
+
+inline TsdfVoxel atVolumeUnit(const Vec3i& point, const Vec3i& volumeUnitIdx, VolumeUnitMap::const_iterator it,
+    VolumeUnitMap::const_iterator vend, int unitRes)
+{
+    if (it == vend)
+    {
+        TsdfVoxel dummy;
+        dummy.tsdf = floatToTsdf(1.f);
+        dummy.weight = 0;
+        return dummy;
+    }
+    Ptr<TSDFVolumeCPU> volumeUnit = std::dynamic_pointer_cast<TSDFVolumeCPU>(it->second.pVolume);
+
+    Vec3i volUnitLocalIdx = point - volumeUnitIdx * unitRes;
+
+    // expanding at(), removing bounds check
+    const TsdfVoxel* volData = volumeUnit->volume.ptr<TsdfVoxel>();
+    Vec4i volDims = volumeUnit->volDims;
+    int coordBase = volUnitLocalIdx[0] * volDims[0] + volUnitLocalIdx[1] * volDims[1] + volUnitLocalIdx[2] * volDims[2];
+    return volData[coordBase];
+}
+
+#if USE_INTRINSICS
+inline float interpolate(float tx, float ty, float tz, float vx[8])
+{
+    v_float32x4 v0246, v1357;
+    v_load_deinterleave(vx, v0246, v1357);
+
+    v_float32x4 vxx = v0246 + v_setall_f32(tz) * (v1357 - v0246);
+
+    v_float32x4 v00_10 = vxx;
+    v_float32x4 v01_11 = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(vxx)));
+
+    v_float32x4 v0_1 = v00_10 + v_setall_f32(ty) * (v01_11 - v00_10);
+    float v0 = v0_1.get0();
+    v0_1 = v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(v0_1)));
+    float v1 = v0_1.get0();
+
+    return v0 + tx * (v1 - v0);
+}
+
+#else
+inline float interpolate(float tx, float ty, float tz, float vx[8])
+{
+    float v00 = vx[0] + tz * (vx[1] - vx[0]);
+    float v01 = vx[2] + tz * (vx[3] - vx[2]);
+    float v10 = vx[4] + tz * (vx[5] - vx[4]);
+    float v11 = vx[6] + tz * (vx[7] - vx[6]);
+
+    float v0 = v00 + ty * (v01 - v00);
+    float v1 = v10 + ty * (v11 - v10);
+
+    return v0 + tx * (v1 - v0);
+}
+#endif
+
+float HashTSDFVolumeCPU::interpolateVoxelPoint(const Point3f& point) const
+{
+    const Vec3i neighbourCoords[] = { {0, 0, 0}, {0, 0, 1}, {0, 1, 0}, {0, 1, 1},
+                                      {1, 0, 0}, {1, 0, 1}, {1, 1, 0}, {1, 1, 1} };
+
+    // A small hash table to reduce a number of find() calls
+    bool queried[8];
+    VolumeUnitMap::const_iterator iterMap[8];
+    for (int i = 0; i < 8; i++)
+    {
+        iterMap[i] = volumeUnits.end();
+        queried[i] = false;
+    }
 
     int ix = cvFloor(point.x);
     int iy = cvFloor(point.y);
@@ -289,45 +358,171 @@ inline TsdfType HashTSDFVolumeCPU::interpolateVoxel(const cv::Point3f& point) co
     float ty = point.y - iy;
     float tz = point.z - iz;
 
-    TsdfType vx[8];
+    Vec3i iv(ix, iy, iz);
+    float vx[8];
     for (int i = 0; i < 8; i++)
-        vx[i] = at(neighbourCoords[i] * voxelSize + point).tsdf;
+    {
+        Vec3i pt = iv + neighbourCoords[i];
 
-    TsdfType v00 = vx[0] + tz * (vx[1] - vx[0]);
-    TsdfType v01 = vx[2] + tz * (vx[3] - vx[2]);
-    TsdfType v10 = vx[4] + tz * (vx[5] - vx[4]);
-    TsdfType v11 = vx[6] + tz * (vx[7] - vx[6]);
+        Vec3i volumeUnitIdx = voxelToVolumeUnitIdx(pt, volumeUnitResolution);
+        int dictIdx = (volumeUnitIdx[0] & 1) + (volumeUnitIdx[1] & 1) * 2 + (volumeUnitIdx[2] & 1) * 4;
+        auto it = iterMap[dictIdx];
+        if (!queried[dictIdx])
+        {
+            it = volumeUnits.find(volumeUnitIdx);
+            iterMap[dictIdx] = it;
+            queried[dictIdx] = true;
+        }
+        //VolumeUnitMap::const_iterator it = volumeUnits.find(volumeUnitIdx);
 
-    TsdfType v0 = v00 + ty * (v01 - v00);
-    TsdfType v1 = v10 + ty * (v11 - v10);
+        vx[i] = atVolumeUnit(pt, volumeUnitIdx, it, volumeUnits.end(), volumeUnitResolution).tsdf;
+    }
 
-    return v0 + tx * (v1 - v0);
+    return interpolate(tx, ty, tz, vx);
+}
+
+inline float HashTSDFVolumeCPU::interpolateVoxel(const cv::Point3f& point) const
+{
+    return interpolateVoxelPoint(point * voxelSizeInv);
 }
 
 inline Point3f HashTSDFVolumeCPU::getNormalVoxel(Point3f point) const
 {
-    Vec3f pointVec(point);
     Vec3f normal = Vec3f(0, 0, 0);
 
-    Vec3f pointPrev = point;
-    Vec3f pointNext = point;
+    Point3f ptVox = point * voxelSizeInv;
+    Vec3i iptVox(cvFloor(ptVox.x), cvFloor(ptVox.y), cvFloor(ptVox.z));
 
+    // A small hash table to reduce a number of find() calls
+    bool queried[8];
+    VolumeUnitMap::const_iterator iterMap[8];
+    for (int i = 0; i < 8; i++)
+    {
+        iterMap[i] = volumeUnits.end();
+        queried[i] = false;
+    }
+
+#if !USE_INTERPOLATION_IN_GETNORMAL
+    const Vec3i offsets[] = { { 1,  0,  0}, {-1,  0,  0}, { 0,  1,  0}, // 0-3
+                              { 0, -1,  0}, { 0,  0,  1}, { 0,  0, -1}  // 4-7
+    };
+    const int nVals = 6;
+
+#else
+    const Vec3i offsets[] = { { 0,  0,  0}, { 0,  0,  1}, { 0,  1,  0}, { 0,  1,  1}, //  0-3
+                              { 1,  0,  0}, { 1,  0,  1}, { 1,  1,  0}, { 1,  1,  1}, //  4-7
+                              {-1,  0,  0}, {-1,  0,  1}, {-1,  1,  0}, {-1,  1,  1}, //  8-11
+                              { 2,  0,  0}, { 2,  0,  1}, { 2,  1,  0}, { 2,  1,  1}, // 12-15
+                              { 0, -1,  0}, { 0, -1,  1}, { 1, -1,  0}, { 1, -1,  1}, // 16-19
+                              { 0,  2,  0}, { 0,  2,  1}, { 1,  2,  0}, { 1,  2,  1}, // 20-23
+                              { 0,  0, -1}, { 0,  1, -1}, { 1,  0, -1}, { 1,  1, -1}, // 24-27
+                              { 0,  0,  2}, { 0,  1,  2}, { 1,  0,  2}, { 1,  1,  2}, // 28-31
+    };
+    const int nVals = 32;
+#endif
+
+    float vals[nVals];
+    for (int i = 0; i < nVals; i++)
+    {
+        Vec3i pt = iptVox + offsets[i];
+
+        Vec3i volumeUnitIdx = voxelToVolumeUnitIdx(pt, volumeUnitResolution);
+
+        int dictIdx = (volumeUnitIdx[0] & 1) + (volumeUnitIdx[1] & 1) * 2 + (volumeUnitIdx[2] & 1) * 4;
+        auto it = iterMap[dictIdx];
+        if (!queried[dictIdx])
+        {
+            it = volumeUnits.find(volumeUnitIdx);
+            iterMap[dictIdx] = it;
+            queried[dictIdx] = true;
+        }
+        //VolumeUnitMap::const_iterator it = volumeUnits.find(volumeUnitIdx);
+
+        vals[i] = tsdfToFloat(atVolumeUnit(pt, volumeUnitIdx, it, volumeUnits.end(), volumeUnitResolution).tsdf);
+    }
+
+#if !USE_INTERPOLATION_IN_GETNORMAL
     for (int c = 0; c < 3; c++)
     {
-        pointPrev[c] -= voxelSize;
-        pointNext[c] += voxelSize;
-
-        //normal[c] = at(Point3f(pointNext)).tsdf - at(Point3f(pointPrev)).tsdf;
-        normal[c] = interpolateVoxel(Point3f(pointNext)) - interpolateVoxel(Point3f(pointPrev));
-
-        pointPrev[c] = pointVec[c];
-        pointNext[c] = pointVec[c];
+        normal[c] = vals[c * 2] - vals[c * 2 + 1];
     }
-    //std::cout << normal << std::endl;
+#else
+
+    float cxv[8], cyv[8], czv[8];
+
+    // How these numbers were obtained:
+    // 1. Take the basic interpolation sequence:
+    // 000, 001, 010, 011, 100, 101, 110, 111
+    // where each digit corresponds to shift by x, y, z axis respectively.
+    // 2. Add +1 for next or -1 for prev to each coordinate to corresponding axis
+    // 3. Search corresponding values in offsets
+    const int idxxp[8] = {  8,  9, 10, 11,  0,  1,  2,  3 };
+    const int idxxn[8] = {  4,  5,  6,  7, 12, 13, 14, 15 };
+    const int idxyp[8] = { 16, 17,  0,  1, 18, 19,  4,  5 };
+    const int idxyn[8] = {  2,  3, 20, 21,  6,  7, 22, 23 };
+    const int idxzp[8] = { 24,  0, 25,  2, 26,  4, 27,  6 };
+    const int idxzn[8] = {  1, 28,  3, 29,  5, 30,  7, 31 };
+
+#if !USE_INTRINSICS
+    for (int i = 0; i < 8; i++)
+    {
+        cxv[i] = vals[idxxn[i]] - vals[idxxp[i]];
+        cyv[i] = vals[idxyn[i]] - vals[idxyp[i]];
+        czv[i] = vals[idxzn[i]] - vals[idxzp[i]];
+    }
+#else
+
+# if CV_SIMD >= 32
+    v_float32x8 cxp = v_lut(vals, idxxp);
+    v_float32x8 cxn = v_lut(vals, idxxn);
+
+    v_float32x8 cyp = v_lut(vals, idxyp);
+    v_float32x8 cyn = v_lut(vals, idxyn);
+
+    v_float32x8 czp = v_lut(vals, idxzp);
+    v_float32x8 czn = v_lut(vals, idxzn);
+
+    v_float32x8 vcxv = cxn - cxp;
+    v_float32x8 vcyv = cyn - cyp;
+    v_float32x8 vczv = czn - czp;
+
+    v_store(cxv, vcxv);
+    v_store(cyv, vcyv);
+    v_store(czv, vczv);
+# else
+    v_float32x4 cxp0 = v_lut(vals, idxxp + 0); v_float32x4 cxp1 = v_lut(vals, idxxp + 4);
+    v_float32x4 cxn0 = v_lut(vals, idxxn + 0); v_float32x4 cxn1 = v_lut(vals, idxxn + 4);
+
+    v_float32x4 cyp0 = v_lut(vals, idxyp + 0); v_float32x4 cyp1 = v_lut(vals, idxyp + 4);
+    v_float32x4 cyn0 = v_lut(vals, idxyn + 0); v_float32x4 cyn1 = v_lut(vals, idxyn + 4);
+
+    v_float32x4 czp0 = v_lut(vals, idxzp + 0); v_float32x4 czp1 = v_lut(vals, idxzp + 4);
+    v_float32x4 czn0 = v_lut(vals, idxzn + 0); v_float32x4 czn1 = v_lut(vals, idxzn + 4);
+
+    v_float32x4 cxv0 = cxn0 - cxp0; v_float32x4 cxv1 = cxn1 - cxp1;
+    v_float32x4 cyv0 = cyn0 - cyp0; v_float32x4 cyv1 = cyn1 - cyp1;
+    v_float32x4 czv0 = czn0 - czp0; v_float32x4 czv1 = czn1 - czp1;
+
+    v_store(cxv + 0, cxv0); v_store(cxv + 4, cxv1);
+    v_store(cyv + 0, cyv0); v_store(cyv + 4, cyv1);
+    v_store(czv + 0, czv0); v_store(czv + 4, czv1);
+#endif
+
+#endif
+
+    float tx = ptVox.x - iptVox[0];
+    float ty = ptVox.y - iptVox[1];
+    float tz = ptVox.z - iptVox[2];
+
+    normal[0] = interpolate(tx, ty, tz, cxv);
+    normal[1] = interpolate(tx, ty, tz, cyv);
+    normal[2] = interpolate(tx, ty, tz, czv);
+#endif
+
     float nv = sqrt(normal[0] * normal[0] +
-                     normal[1] * normal[1] +
-                     normal[2] * normal[2]);
-    return nv < 0.0001f ? nan3 : normal/nv;
+                    normal[1] * normal[1] +
+                    normal[2] * normal[2]);
+    return nv < 0.0001f ? nan3 : normal / nv;
 }
 
 struct HashRaycastInvoker : ParallelLoopBody
@@ -377,7 +572,7 @@ struct HashRaycastInvoker : ParallelLoopBody
                               std::numeric_limits<int>::min());
 
                 float tprev       = tcurr;
-                TsdfType prevTsdf = volume.truncDist;
+                float prevTsdf = volume.truncDist;
                 cv::Ptr<TSDFVolumeCPU> currVolumeUnit;
                 while (tcurr < tmax)
                 {
@@ -386,7 +581,7 @@ struct HashRaycastInvoker : ParallelLoopBody
 
                     VolumeUnitMap::const_iterator it = volume.volumeUnits.find(currVolumeUnitIdx);
 
-                    TsdfType currTsdf = prevTsdf;
+                    float currTsdf = prevTsdf;
                     int currWeight    = 0;
                     float stepSize    = 0.5f * blockSize;
                     cv::Vec3i volUnitLocalIdx;
@@ -402,7 +597,7 @@ struct HashRaycastInvoker : ParallelLoopBody
 
                         //! TODO: Figure out voxel interpolation
                         TsdfVoxel currVoxel = currVolumeUnit->at(volUnitLocalIdx);
-                        currTsdf            = currVoxel.tsdf;
+                        currTsdf            = tsdfToFloat(currVoxel.tsdf);
                         currWeight          = currVoxel.weight;
                         stepSize            = tstep;
                     }
@@ -500,7 +695,7 @@ struct HashFetchPointsNormalsInvoker : ParallelLoopBody
                             cv::Vec3i voxelIdx(x, y, z);
                             TsdfVoxel voxel = volumeUnit->at(voxelIdx);
 
-                            if (voxel.tsdf != 1.f && voxel.weight != 0)
+                            if (voxel.tsdf != -128 && voxel.weight != 0)
                             {
                                 Point3f point = base_point + volume.voxelCoordToVolume(voxelIdx);
                                 localPoints.push_back(toPtype(point));
diff --git a/modules/rgbd/src/hash_tsdf.hpp b/modules/rgbd/src/hash_tsdf.hpp
index 74fc668cb..bac5ea9a4 100644
--- a/modules/rgbd/src/hash_tsdf.hpp
+++ b/modules/rgbd/src/hash_tsdf.hpp
@@ -89,7 +89,8 @@ class HashTSDFVolumeCPU : public HashTSDFVolume
     //! 1m)
     virtual TsdfVoxel at(const cv::Point3f& point) const;
 
-    inline TsdfType interpolateVoxel(const cv::Point3f& point) const;
+    float interpolateVoxelPoint(const Point3f& point) const;
+    inline float interpolateVoxel(const cv::Point3f& point) const;
     Point3f getNormalVoxel(cv::Point3f p) const;
 
     //! Utility functions for coordinate transformations
diff --git a/modules/rgbd/src/opencl/tsdf.cl b/modules/rgbd/src/opencl/tsdf.cl
index 418d0c1c2..b1349be6d 100644
--- a/modules/rgbd/src/opencl/tsdf.cl
+++ b/modules/rgbd/src/opencl/tsdf.cl
@@ -4,10 +4,33 @@
 
 // This code is also subject to the license terms in the LICENSE_KinectFusion.md file found in this module's directory
 
+typedef __INT8_TYPE__ int8_t;
+
+typedef int8_t TsdfType;
+typedef uchar WeightType;
+
+struct TsdfVoxel
+{
+    TsdfType tsdf;
+    WeightType weight;
+};
+
+static inline TsdfType floatToTsdf(float num)
+{
+    int8_t res = (int8_t) ( (num * (-128)) );
+    res = res ? res : (num < 0 ? 1 : -1);
+    return res;
+}
+
+static inline float tsdfToFloat(TsdfType num)
+{
+    return ( (float) num ) / (-128);
+}
+
 __kernel void integrate(__global const char * depthptr,
                         int depth_step, int depth_offset,
                         int depth_rows, int depth_cols,
-                        __global float2 * volumeptr,
+                        __global struct TsdfVoxel * volumeptr,
                         const float16 vol2camMatrix,
                         const float voxelSize,
                         const int4 volResolution4,
@@ -139,23 +162,23 @@ __kernel void integrate(__global const char * depthptr,
             float tsdf = fmin(1.0f, sdf * truncDistInv);
             int volIdx = volYidx + z*volDims.z;
 
-            float2 voxel = volumeptr[volIdx];
-            float value  = voxel.s0;
-            int weight = as_int(voxel.s1);
+            struct TsdfVoxel voxel = volumeptr[volIdx];
+            float value  = tsdfToFloat(voxel.tsdf);
+            int weight = voxel.weight;
 
             // update TSDF
             value = (value*weight + tsdf) / (weight + 1);
             weight = min(weight + 1, maxWeight);
 
-            voxel.s0 = value;
-            voxel.s1 = as_float(weight);
+            voxel.tsdf = floatToTsdf(value);
+            voxel.weight = weight;
             volumeptr[volIdx] = voxel;
         }
     }
 }
 
 
-inline float interpolateVoxel(float3 p, __global const float2* volumePtr,
+inline float interpolateVoxel(float3 p, __global const struct TsdfVoxel* volumePtr,
                               int3 volDims, int8 neighbourCoords)
 {
     float3 fip = floor(p);
@@ -169,7 +192,7 @@ inline float interpolateVoxel(float3 p, __global const float2* volumePtr,
 
     float vaz[8];
     for(int i = 0; i < 8; i++)
-        vaz[i] = volumePtr[nco[i]].s0;
+        vaz[i] = tsdfToFloat(volumePtr[nco[i]].tsdf);
 
     float8 vz = vload8(0, vaz);
 
@@ -178,7 +201,7 @@ inline float interpolateVoxel(float3 p, __global const float2* volumePtr,
     return mix(vx.s0, vx.s1, t.x);
 }
 
-inline float3 getNormalVoxel(float3 p, __global const float2* volumePtr,
+inline float3 getNormalVoxel(float3 p, __global const struct TsdfVoxel* volumePtr,
                              int3 volResolution, int3 volDims, int8 neighbourCoords)
 {
     if(any(p < 1) || any(p >= convert_float3(volResolution - 2)))
@@ -202,8 +225,8 @@ inline float3 getNormalVoxel(float3 p, __global const float2* volumePtr,
 
         float vaz[8];
         for(int i = 0; i < 8; i++)
-            vaz[i] = volumePtr[nco[i] + dim].s0 -
-                     volumePtr[nco[i] - dim].s0;
+            vaz[i] = tsdfToFloat(volumePtr[nco[i] + dim].tsdf -
+                                 volumePtr[nco[i] - dim].tsdf);
 
         float8 vz = vload8(0, vaz);
 
@@ -227,7 +250,7 @@ __kernel void raycast(__global char * pointsptr,
                       __global char * normalsptr,
                       int normals_step, int normals_offset,
                       const int2 frameSize,
-                      __global const float2 * volumeptr,
+                      __global const struct TsdfVoxel * volumeptr,
                       __global const float * vol2camptr,
                       __global const float * cam2volptr,
                       const float2 fixy,
@@ -332,7 +355,7 @@ __kernel void raycast(__global char * pointsptr,
                 int3 ip = convert_int3(round(next));
                 int3 cmul = ip*volDims;
                 int idx = cmul.x + cmul.y + cmul.z;
-                fnext = volumeptr[idx].s0;
+                fnext = tsdfToFloat(volumeptr[idx].tsdf);
 
                 if(fnext != f)
                 {
@@ -395,7 +418,7 @@ __kernel void getNormals(__global const char * pointsptr,
                          __global char * normalsptr,
                          int normals_step, int normals_offset,
                          const int2 frameSize,
-                         __global const float2* volumeptr,
+                         __global const struct TsdfVoxel* volumeptr,
                          __global const float * volPoseptr,
                          __global const float * invPoseptr,
                          const float voxelSizeInv,
@@ -464,7 +487,7 @@ struct CoordReturn
 };
 
 inline struct CoordReturn coord(int x, int y, int z, float3 V, float v0, int axis,
-                                __global const float2* volumeptr,
+                                __global const struct TsdfVoxel* volumeptr,
                                 int3 volResolution, int3 volDims,
                                 int8 neighbourCoords,
                                 float voxelSize, float voxelSizeInv,
@@ -506,9 +529,10 @@ inline struct CoordReturn coord(int x, int y, int z, float3 V, float v0, int axi
         int3 ip = ((int3)(x, y, z)) + shift;
         int3 cmul = ip*volDims;
         int idx = cmul.x + cmul.y + cmul.z;
-        float2 voxel = volumeptr[idx].s0;
-        float vd  = voxel.s0;
-        int weight = as_int(voxel.s1);
+
+        struct TsdfVoxel voxel = volumeptr[idx];
+        float vd  = tsdfToFloat(voxel.tsdf);
+        int weight = voxel.weight;
 
         if(weight != 0 && vd != 1.f)
         {
@@ -552,7 +576,7 @@ inline struct CoordReturn coord(int x, int y, int z, float3 V, float v0, int axi
 }
 
 
-__kernel void scanSize(__global const float2* volumeptr,
+__kernel void scanSize(__global const struct TsdfVoxel* volumeptr,
                        const int4 volResolution4,
                        const int4 volDims4,
                        const int8 neighbourCoords,
@@ -604,9 +628,9 @@ __kernel void scanSize(__global const float2* volumeptr,
         int3 ip = (int3)(x, y, z);
         int3 cmul = ip*volDims;
         int idx = cmul.x + cmul.y + cmul.z;
-        float2 voxel = volumeptr[idx].s0;
-        float value  = voxel.s0;
-        int weight = as_int(voxel.s1);
+        struct TsdfVoxel voxel = volumeptr[idx];
+        float value  = tsdfToFloat(voxel.tsdf);
+        int weight = voxel.weight;
 
         // if voxel is not empty
         if(weight != 0 && value != 1.f)
@@ -662,7 +686,7 @@ __kernel void scanSize(__global const float2* volumeptr,
 }
 
 
-__kernel void fillPtsNrm(__global const float2* volumeptr,
+__kernel void fillPtsNrm(__global const struct TsdfVoxel* volumeptr,
                          const int4 volResolution4,
                          const int4 volDims4,
                          const int8 neighbourCoords,
@@ -731,9 +755,9 @@ __kernel void fillPtsNrm(__global const float2* volumeptr,
         int3 ip = (int3)(x, y, z);
         int3 cmul = ip*volDims;
         int idx = cmul.x + cmul.y + cmul.z;
-        float2 voxel = volumeptr[idx].s0;
-        float value  = voxel.s0;
-        int weight = as_int(voxel.s1);
+        struct TsdfVoxel voxel = volumeptr[idx];
+        float value  = tsdfToFloat(voxel.tsdf);
+        int weight = voxel.weight;
 
         // if voxel is not empty
         if(weight != 0 && value != 1.f)
diff --git a/modules/rgbd/src/tsdf.cpp b/modules/rgbd/src/tsdf.cpp
index 99e038ad3..91005fb57 100644
--- a/modules/rgbd/src/tsdf.cpp
+++ b/modules/rgbd/src/tsdf.cpp
@@ -12,12 +12,32 @@ namespace cv {
 
 namespace kinfu {
 
+static inline v_float32x4 tsdfToFloat_INTR(const v_int32x4& num)
+{
+    v_float32x4 num128 = v_setall_f32(-1.f / 128.f);
+    return v_cvt_f32(num) * num128;
+}
+
+static inline TsdfType floatToTsdf(float num)
+{
+    //CV_Assert(-1 < num <= 1);
+    int8_t res = int8_t(num * (-128.f));
+    res = res ? res : (num < 0 ? 1 : -1);
+    return res;
+}
+
+static inline float tsdfToFloat(TsdfType num)
+{
+    return float(num) * (-1.f / 128.f);
+}
+
 TSDFVolume::TSDFVolume(float _voxelSize, Matx44f _pose, float _raycastStepFactor, float _truncDist,
                        int _maxWeight, Point3i _resolution, bool zFirstMemOrder)
     : Volume(_voxelSize, _pose, _raycastStepFactor),
       volResolution(_resolution),
-      maxWeight(_maxWeight)
+      maxWeight( WeightType(_maxWeight) )
 {
+    CV_Assert(_maxWeight < 255);
     // Unlike original code, this should work with any volume size
     // Not only when (x,y,z % 32) == 0
     volSize   = Point3f(volResolution) * voxelSize;
@@ -56,7 +76,7 @@ TSDFVolume::TSDFVolume(float _voxelSize, Matx44f _pose, float _raycastStepFactor
 
 // dimension in voxels, size in meters
 TSDFVolumeCPU::TSDFVolumeCPU(float _voxelSize, cv::Matx44f _pose, float _raycastStepFactor,
-                             float _truncDist, WeightType _maxWeight, Vec3i _resolution,
+                             float _truncDist, int _maxWeight, Vec3i _resolution,
                              bool zFirstMemOrder)
     : TSDFVolume(_voxelSize, _pose, _raycastStepFactor, _truncDist, _maxWeight, _resolution,
                  zFirstMemOrder)
@@ -74,7 +94,7 @@ void TSDFVolumeCPU::reset()
     volume.forEach<VecTsdfVoxel>([](VecTsdfVoxel& vv, const int* /* position */)
     {
         TsdfVoxel& v = reinterpret_cast<TsdfVoxel&>(vv);
-        v.tsdf = 0.0f; v.weight = 0;
+        v.tsdf = floatToTsdf(0.0f); v.weight = 0;
     });
 }
 
@@ -86,7 +106,7 @@ TsdfVoxel TSDFVolumeCPU::at(const cv::Vec3i& volumeIdx) const
         (volumeIdx[2] >= volResolution.z || volumeIdx[2] < 0))
     {
         TsdfVoxel dummy;
-        dummy.tsdf   = 1.0f;
+        dummy.tsdf   = floatToTsdf(1.0f);
         dummy.weight = 0;
         return dummy;
     }
@@ -150,7 +170,7 @@ static inline depthType bilinearDepth(const Depth& m, cv::Point2f pt)
             if(b10) return v10;
             if(b11) return v11;
         }
-        else if(nz == 2)
+        if(nz == 2)
         {
             if(b00 && b10) v01 = v00, v11 = v10;
             if(b01 && b11) v00 = v01, v10 = v11;
@@ -159,7 +179,7 @@ static inline depthType bilinearDepth(const Depth& m, cv::Point2f pt)
             if(b00 && b11) v01 = v10 = (v00 + v11)*0.5f;
             if(b01 && b10) v00 = v11 = (v01 + v10)*0.5f;
         }
-        else if(nz == 3)
+        if(nz == 3)
         {
             if(!b00) v00 = v10 + v01 - v11;
             if(!b01) v01 = v00 + v11 - v10;
@@ -175,17 +195,21 @@ static inline depthType bilinearDepth(const Depth& m, cv::Point2f pt)
 }
 #endif
 
+
+
 struct IntegrateInvoker : ParallelLoopBody
 {
-    IntegrateInvoker(TSDFVolumeCPU& _volume, const Depth& _depth, const Intr& intrinsics, const cv::Matx44f& cameraPose,
-                     float depthFactor) :
+    IntegrateInvoker(TSDFVolumeCPU& _volume, const Depth& _depth, const Intr& intrinsics,
+                    const cv::Matx44f& cameraPose, float depthFactor, Mat _pixNorms) :
         ParallelLoopBody(),
         volume(_volume),
         depth(_depth),
+        intr(intrinsics),
         proj(intrinsics.makeProjector()),
         vol2cam(Affine3f(cameraPose.inv()) * _volume.pose),
         truncDistInv(1.f/_volume.truncDist),
-        dfac(1.f/depthFactor)
+        dfac(1.f/depthFactor),
+        pixNorms(_pixNorms)
     {
         volDataStart = volume.volume.ptr<TsdfVoxel>();
     }
@@ -309,20 +333,20 @@ struct IntegrateInvoker : ParallelLoopBody
                     // norm(camPixVec) produces double which is too slow
                     float pixNorm = sqrt(v_reduce_sum(camPixVec*camPixVec));
                     // difference between distances of point and of surface to camera
-                    TsdfType sdf = pixNorm*(v*dfac - zCamSpace);
+                    float sdf = pixNorm*(v*dfac - zCamSpace);
                     // possible alternative is:
                     // kftype sdf = norm(camSpacePt)*(v*dfac/camSpacePt.z - 1);
                     if(sdf >= -volume.truncDist)
                     {
-                        TsdfType tsdf = fmin(1.f, sdf * truncDistInv);
+                        TsdfType tsdf = floatToTsdf(fmin(1.f, sdf * truncDistInv));
 
                         TsdfVoxel& voxel = volDataY[z*volume.volDims[2]];
                         WeightType& weight      = voxel.weight;
                         TsdfType& value  = voxel.tsdf;
 
                         // update TSDF
-                        value  = (value*weight+tsdf) / (weight + 1);
-                        weight = min(weight + 1, volume.maxWeight);
+                        value  = floatToTsdf((tsdfToFloat(value)*weight+ tsdfToFloat(tsdf)) / (weight + 1));
+                        weight = (weight + 1) < volume.maxWeight ? (weight + 1) : volume.maxWeight;
                     }
                 }
             }
@@ -338,7 +362,7 @@ struct IntegrateInvoker : ParallelLoopBody
             {
                 TsdfVoxel* volDataY = volDataX+y*volume.volDims[1];
                 // optimization of camSpace transformation (vector addition instead of matmul at each z)
-                Point3f basePt = vol2cam*(Point3f(x, y, 0)*volume.voxelSize);
+                Point3f basePt = vol2cam*(Point3f(float(x), float(y), 0.0f)*volume.voxelSize);
                 Point3f camSpacePt = basePt;
                 // zStep == vol2cam*(Point3f(x, y, 1)*voxelSize) - basePt;
                 // zStep == vol2cam*[Point3f(x, y, 1) - Point3f(x, y, 0)]*voxelSize
@@ -348,7 +372,7 @@ struct IntegrateInvoker : ParallelLoopBody
                 int startZ, endZ;
                 if(abs(zStep.z) > 1e-5)
                 {
-                    int baseZ = -basePt.z / zStep.z;
+                    int baseZ = int(-basePt.z / zStep.z);
                     if(zStep.z > 0)
                     {
                         startZ = baseZ;
@@ -375,11 +399,13 @@ struct IntegrateInvoker : ParallelLoopBody
                 }
                 startZ = max(0, startZ);
                 endZ   = min(volume.volResolution.z, endZ);
-                for(int z = startZ; z < endZ; z++)
+
+                for (int z = startZ; z < endZ; z++)
                 {
                     // optimization of the following:
                     //Point3f volPt = Point3f(x, y, z)*volume.voxelSize;
                     //Point3f camSpacePt = vol2cam * volPt;
+
                     camSpacePt += zStep;
                     if(camSpacePt.z <= 0)
                         continue;
@@ -388,27 +414,31 @@ struct IntegrateInvoker : ParallelLoopBody
                     Point2f projected = proj(camSpacePt, camPixVec);
 
                     depthType v = bilinearDepth(depth, projected);
-                    if(v == 0)
+                    if (v == 0) {
                         continue;
+                    }
+
+                    int _u = projected.x;
+                    int _v = projected.y;
+                    if (!(_u >= 0 && _u < depth.rows && _v >= 0 && _v < depth.cols))
+                        continue;
+                    float pixNorm = pixNorms.at<float>(_u, _v);
 
-                    // norm(camPixVec) produces double which is too slow
-                    float pixNorm = sqrt(camPixVec.dot(camPixVec));
                     // difference between distances of point and of surface to camera
-                    TsdfType sdf = pixNorm*(v*dfac - camSpacePt.z);
-
+                    float sdf = pixNorm*(v*dfac - camSpacePt.z);
                     // possible alternative is:
                     // kftype sdf = norm(camSpacePt)*(v*dfac/camSpacePt.z - 1);
                     if(sdf >= -volume.truncDist)
                     {
-                        TsdfType tsdf = fmin(1.f, sdf * truncDistInv);
+                        TsdfType tsdf = floatToTsdf(fmin(1.f, sdf * truncDistInv));
 
                         TsdfVoxel& voxel = volDataY[z*volume.volDims[2]];
                         WeightType& weight      = voxel.weight;
                         TsdfType& value  = voxel.tsdf;
 
                         // update TSDF
-                        value  = (value*weight+tsdf) / (weight + 1);
-                        weight = min(weight + 1, volume.maxWeight);
+                        value  = floatToTsdf((tsdfToFloat(value)*weight+ tsdfToFloat(tsdf)) / (weight + 1));
+                        weight = min(int(weight + 1), int(volume.maxWeight));
                     }
                 }
             }
@@ -418,13 +448,39 @@ struct IntegrateInvoker : ParallelLoopBody
 
     TSDFVolumeCPU& volume;
     const Depth& depth;
+    const Intr& intr;
     const Intr::Projector proj;
     const cv::Affine3f vol2cam;
     const float truncDistInv;
     const float dfac;
     TsdfVoxel* volDataStart;
+    Mat pixNorms;
 };
 
+static cv::Mat preCalculationPixNorm(Depth depth, const Intr& intrinsics)
+{
+    int height = depth.rows;
+    int widht = depth.cols;
+    Point2f fl(intrinsics.fx, intrinsics.fy);
+    Point2f pp(intrinsics.cx, intrinsics.cy);
+    Mat pixNorm (height, widht, CV_32F);
+    std::vector<float> x(widht);
+    std::vector<float> y(height);
+    for (int i = 0; i < widht; i++)
+        x[i] = (i - pp.x) / fl.x;
+    for (int i = 0; i < height; i++)
+        y[i] = (i - pp.y) / fl.y;
+
+    for (int i = 0; i < height; i++)
+    {
+        for (int j = 0; j < widht; j++)
+        {
+            pixNorm.at<float>(i, j) = sqrtf(x[j] * x[j] + y[i] * y[i] + 1.0f);
+        }
+    }
+    return pixNorm;
+}
+
 // use depth instead of distance (optimization)
 void TSDFVolumeCPU::integrate(InputArray _depth, float depthFactor, const cv::Matx44f& cameraPose,
                               const Intr& intrinsics)
@@ -434,20 +490,31 @@ void TSDFVolumeCPU::integrate(InputArray _depth, float depthFactor, const cv::Ma
     CV_Assert(_depth.type() == DEPTH_TYPE);
     CV_Assert(!_depth.empty());
     Depth depth = _depth.getMat();
-    IntegrateInvoker ii(*this, depth, intrinsics, cameraPose, depthFactor);
+    if (!(frameParams[0] == depth.rows  && frameParams[1] == depth.cols &&
+        frameParams[2] == intrinsics.fx && frameParams[3] == intrinsics.fy &&
+        frameParams[4] == intrinsics.cx && frameParams[5] == intrinsics.cy))
+    {
+        frameParams[0] = (float)depth.rows; frameParams[1] = (float)depth.cols;
+        frameParams[2] = intrinsics.fx;     frameParams[3] = intrinsics.fy;
+        frameParams[4] = intrinsics.cx;     frameParams[5] = intrinsics.cy;
+
+        pixNorms = preCalculationPixNorm(depth, intrinsics);
+    }
+
+    IntegrateInvoker ii(*this, depth, intrinsics, cameraPose, depthFactor, pixNorms);
     Range range(0, volResolution.x);
     parallel_for_(range, ii);
 }
 
 #if USE_INTRINSICS
 // all coordinate checks should be done in inclosing cycle
-inline TsdfType TSDFVolumeCPU::interpolateVoxel(Point3f _p) const
+inline float TSDFVolumeCPU::interpolateVoxel(Point3f _p) const
 {
     v_float32x4 p(_p.x, _p.y, _p.z, 0);
     return interpolateVoxel(p);
 }
 
-inline TsdfType TSDFVolumeCPU::interpolateVoxel(const v_float32x4& p) const
+inline float TSDFVolumeCPU::interpolateVoxel(const v_float32x4& p) const
 {
     // tx, ty, tz = floor(p)
     v_int32x4 ip  = v_floor(p);
@@ -473,7 +540,8 @@ inline TsdfType TSDFVolumeCPU::interpolateVoxel(const v_float32x4& p) const
     for(int i = 0; i < 8; i++)
         vx[i] = volData[neighbourCoords[i] + coordBase].tsdf;
 
-    v_float32x4 v0246(vx[0], vx[2], vx[4], vx[6]), v1357(vx[1], vx[3], vx[5], vx[7]);
+    v_float32x4 v0246 = tsdfToFloat_INTR(v_int32x4(vx[0], vx[2], vx[4], vx[6]));
+    v_float32x4 v1357 = tsdfToFloat_INTR(v_int32x4(vx[1], vx[3], vx[5], vx[7]));
     v_float32x4 vxx = v0246 + v_setall_f32(tz)*(v1357 - v0246);
 
     v_float32x4 v00_10 = vxx;
@@ -487,7 +555,7 @@ inline TsdfType TSDFVolumeCPU::interpolateVoxel(const v_float32x4& p) const
     return v0 + tx*(v1 - v0);
 }
 #else
-inline TsdfType TSDFVolumeCPU::interpolateVoxel(Point3f p) const
+inline float TSDFVolumeCPU::interpolateVoxel(Point3f p) const
 {
     int xdim = volDims[0], ydim = volDims[1], zdim = volDims[2];
 
@@ -502,19 +570,20 @@ inline TsdfType TSDFVolumeCPU::interpolateVoxel(Point3f p) const
     int coordBase = ix*xdim + iy*ydim + iz*zdim;
     const TsdfVoxel* volData = volume.ptr<TsdfVoxel>();
 
-    TsdfType vx[8];
-    for(int i = 0; i < 8; i++)
-        vx[i] = volData[neighbourCoords[i] + coordBase].tsdf;
+    float vx[8];
+    for (int i = 0; i < 8; i++)
+        vx[i] = tsdfToFloat(volData[neighbourCoords[i] + coordBase].tsdf);
 
-    TsdfType v00 = vx[0] + tz*(vx[1] - vx[0]);
-    TsdfType v01 = vx[2] + tz*(vx[3] - vx[2]);
-    TsdfType v10 = vx[4] + tz*(vx[5] - vx[4]);
-    TsdfType v11 = vx[6] + tz*(vx[7] - vx[6]);
+    float v00 = vx[0] + tz*(vx[1] - vx[0]);
+    float v01 = vx[2] + tz*(vx[3] - vx[2]);
+    float v10 = vx[4] + tz*(vx[5] - vx[4]);
+    float v11 = vx[6] + tz*(vx[7] - vx[6]);
 
-    TsdfType v0 = v00 + ty*(v01 - v00);
-    TsdfType v1 = v10 + ty*(v11 - v10);
+    float v0 = v00 + ty*(v01 - v00);
+    float v1 = v10 + ty*(v11 - v10);
 
     return v0 + tx*(v1 - v0);
+
 }
 #endif
 
@@ -567,7 +636,8 @@ inline v_float32x4 TSDFVolumeCPU::getNormalVoxel(const v_float32x4& p) const
             vx[i] = volData[neighbourCoords[i] + coordBase + 1*dim].tsdf -
                     volData[neighbourCoords[i] + coordBase - 1*dim].tsdf;
 
-        v_float32x4 v0246(vx[0], vx[2], vx[4], vx[6]), v1357(vx[1], vx[3], vx[5], vx[7]);
+        v_float32x4 v0246 = tsdfToFloat_INTR(v_int32x4(vx[0], vx[2], vx[4], vx[6]));
+        v_float32x4 v1357 = tsdfToFloat_INTR(v_int32x4(vx[1], vx[3], vx[5], vx[7]));
         v_float32x4 vxx = v0246 + v_setall_f32(tz)*(v1357 - v0246);
 
         v_float32x4 v00_10 = vxx;
@@ -613,25 +683,25 @@ inline Point3f TSDFVolumeCPU::getNormalVoxel(Point3f p) const
         const int dim = volDims[c];
         float& nv = an[c];
 
-        TsdfType vx[8];
+        float vx[8];
         for(int i = 0; i < 8; i++)
-            vx[i] = volData[neighbourCoords[i] + coordBase + 1*dim].tsdf -
-                    volData[neighbourCoords[i] + coordBase - 1*dim].tsdf;
+            vx[i] = tsdfToFloat(volData[neighbourCoords[i] + coordBase + 1*dim].tsdf -
+                                volData[neighbourCoords[i] + coordBase - 1*dim].tsdf);
 
-        TsdfType v00 = vx[0] + tz*(vx[1] - vx[0]);
-        TsdfType v01 = vx[2] + tz*(vx[3] - vx[2]);
-        TsdfType v10 = vx[4] + tz*(vx[5] - vx[4]);
-        TsdfType v11 = vx[6] + tz*(vx[7] - vx[6]);
+        float v00 = vx[0] + tz*(vx[1] - vx[0]);
+        float v01 = vx[2] + tz*(vx[3] - vx[2]);
+        float v10 = vx[4] + tz*(vx[5] - vx[4]);
+        float v11 = vx[6] + tz*(vx[7] - vx[6]);
 
-        TsdfType v0 = v00 + ty*(v01 - v00);
-        TsdfType v1 = v10 + ty*(v11 - v10);
+        float v0 = v00 + ty*(v01 - v00);
+        float v1 = v10 + ty*(v11 - v10);
 
         nv = v0 + tx*(v1 - v0);
     }
 
     float nv = sqrt(an[0] * an[0] +
-        an[1] * an[1] +
-        an[2] * an[2]);
+                    an[1] * an[1] +
+                    an[2] * an[2]);
     return nv < 0.0001f ? nan3 : an / nv;
 }
 #endif
@@ -736,7 +806,7 @@ struct RaycastInvoker : ParallelLoopBody
                     int zdim            = volume.volDims[2];
                     v_float32x4 rayStep = dir * v_setall_f32(tstep);
                     v_float32x4 next    = (orig + dir * v_setall_f32(tmin));
-                    TsdfType f = volume.interpolateVoxel(next), fnext = f;
+                    float f = volume.interpolateVoxel(next), fnext = f;
 
                     //raymarch
                     int steps = 0;
@@ -750,7 +820,7 @@ struct RaycastInvoker : ParallelLoopBody
                         int iz = ip.get0();
                         int coord = ix*xdim + iy*ydim + iz*zdim;
 
-                        fnext = volume.volume.at<TsdfVoxel>(coord).tsdf;
+                        fnext = tsdfToFloat(volume.volume.at<TsdfVoxel>(coord).tsdf);
                         if(fnext != f)
                         {
                             fnext = volume.interpolateVoxel(next);
@@ -768,8 +838,8 @@ struct RaycastInvoker : ParallelLoopBody
                     if(f > 0.f && fnext < 0.f)
                     {
                         v_float32x4 tp = next - rayStep;
-                        TsdfType ft    = volume.interpolateVoxel(tp);
-                        TsdfType ftdt  = volume.interpolateVoxel(next);
+                        float ft    = volume.interpolateVoxel(tp);
+                        float ftdt  = volume.interpolateVoxel(next);
                         float ts = tmin + tstep*(steps - ft/(ftdt - ft));
 
                         // avoid division by zero
@@ -815,7 +885,7 @@ struct RaycastInvoker : ParallelLoopBody
 
                 Point3f orig = camTrans;
                 // direction through pixel in volume space
-                Point3f dir = normalize(Vec3f(camRot * reproj(Point3f(x, y, 1.f))));
+                Point3f dir = normalize(Vec3f(camRot * reproj(Point3f(float(x), float(y), 1.f))));
 
                 // compute intersection of ray with all six bbox planes
                 Vec3f rayinv(1.f/dir.x, 1.f/dir.y, 1.f/dir.z);
@@ -845,11 +915,11 @@ struct RaycastInvoker : ParallelLoopBody
 
                     Point3f rayStep = dir * tstep;
                     Point3f next = (orig + dir * tmin);
-                    TsdfType f = volume.interpolateVoxel(next), fnext = f;
+                    float f = volume.interpolateVoxel(next), fnext = f;
 
                     //raymarch
                     int steps = 0;
-                    int nSteps = floor((tmax - tmin)/tstep);
+                    int nSteps = int(floor((tmax - tmin)/tstep));
                     for(; steps < nSteps; steps++)
                     {
                         next += rayStep;
@@ -859,11 +929,10 @@ struct RaycastInvoker : ParallelLoopBody
                         int ix = cvRound(next.x);
                         int iy = cvRound(next.y);
                         int iz = cvRound(next.z);
-                        fnext = volume.volume.at<TsdfVoxel>(ix*xdim + iy*ydim + iz*zdim).tsdf;
+                        fnext = tsdfToFloat(volume.volume.at<TsdfVoxel>(ix*xdim + iy*ydim + iz*zdim).tsdf);
                         if(fnext != f)
                         {
                             fnext = volume.interpolateVoxel(next);
-
                             // when ray crosses a surface
                             if(std::signbit(f) != std::signbit(fnext))
                                 break;
@@ -871,14 +940,13 @@ struct RaycastInvoker : ParallelLoopBody
                             f = fnext;
                         }
                     }
-
                     // if ray penetrates a surface from outside
                     // linearly interpolate t between two f values
                     if(f > 0.f && fnext < 0.f)
                     {
                         Point3f tp    = next - rayStep;
-                        TsdfType ft   = volume.interpolateVoxel(tp);
-                        TsdfType ftdt = volume.interpolateVoxel(next);
+                        float ft   = volume.interpolateVoxel(tp);
+                        float ftdt = volume.interpolateVoxel(next);
                         // float t = tmin + steps*tstep;
                         // float ts = t - tstep*ft/(ftdt - ft);
                         float ts = tmin + tstep*(steps - ft/(ftdt - ft));
@@ -899,7 +967,6 @@ struct RaycastInvoker : ParallelLoopBody
                         }
                     }
                 }
-
                 ptsRow[x] = toPtype(point);
                 nrmRow[x] = toPtype(normal);
             }
@@ -988,7 +1055,7 @@ struct FetchPointsNormalsInvoker : ParallelLoopBody
             const TsdfVoxel& voxeld = volDataStart[(x+shift.x)*vol.volDims[0] +
                                                    (y+shift.y)*vol.volDims[1] +
                                                    (z+shift.z)*vol.volDims[2]];
-            TsdfType vd = voxeld.tsdf;
+            float vd = tsdfToFloat(voxeld.tsdf);
 
             if(voxeld.weight != 0 && vd != 1.f)
             {
@@ -1025,7 +1092,7 @@ struct FetchPointsNormalsInvoker : ParallelLoopBody
                 for(int z = 0; z < vol.volResolution.z; z++)
                 {
                     const TsdfVoxel& voxel0 = volDataY[z*vol.volDims[2]];
-                    TsdfType v0             = voxel0.tsdf;
+                    float v0             = tsdfToFloat(voxel0.tsdf);
                     if(voxel0.weight != 0 && v0 != 1.f)
                     {
                         Point3f V(Point3f((float)x + 0.5f, (float)y + 0.5f, (float)z + 0.5f)*vol.voxelSize);
@@ -1121,7 +1188,7 @@ TSDFVolumeGPU::TSDFVolumeGPU(float _voxelSize, cv::Matx44f _pose, float _raycast
                              Point3i _resolution) :
     TSDFVolume(_voxelSize, _pose, _raycastStepFactor, _truncDist, _maxWeight, _resolution, false)
 {
-    volume = UMat(1, volResolution.x * volResolution.y * volResolution.z, CV_32FC2);
+    volume = UMat(1, volResolution.x * volResolution.y * volResolution.z, CV_8UC2);
 
     reset();
 }
diff --git a/modules/rgbd/src/tsdf.hpp b/modules/rgbd/src/tsdf.hpp
index f4e7df321..4646c41dd 100644
--- a/modules/rgbd/src/tsdf.hpp
+++ b/modules/rgbd/src/tsdf.hpp
@@ -19,14 +19,16 @@ namespace kinfu
 // TODO: Optimization possible:
 // * TsdfType can be FP16
 // * WeightType can be uint16
-typedef float TsdfType;
-typedef int WeightType;
+
+typedef int8_t TsdfType;
+typedef uchar WeightType;
 
 struct TsdfVoxel
 {
     TsdfType tsdf;
     WeightType weight;
 };
+
 typedef Vec<uchar, sizeof(TsdfVoxel)> VecTsdfVoxel;
 
 class TSDFVolume : public Volume
@@ -38,6 +40,7 @@ class TSDFVolume : public Volume
     virtual ~TSDFVolume() = default;
 
    public:
+
     Point3i volResolution;
     WeightType maxWeight;
 
@@ -52,7 +55,7 @@ class TSDFVolumeCPU : public TSDFVolume
    public:
     // dimension in voxels, size in meters
     TSDFVolumeCPU(float _voxelSize, cv::Matx44f _pose, float _raycastStepFactor, float _truncDist,
-                  WeightType _maxWeight, Vec3i _resolution, bool zFirstMemOrder = true);
+                  int _maxWeight, Vec3i _resolution, bool zFirstMemOrder = true);
 
     virtual void integrate(InputArray _depth, float depthFactor, const cv::Matx44f& cameraPose,
                            const cv::kinfu::Intr& intrinsics) override;
@@ -66,14 +69,15 @@ class TSDFVolumeCPU : public TSDFVolume
     virtual void reset() override;
     virtual TsdfVoxel at(const cv::Vec3i& volumeIdx) const;
 
-    TsdfType interpolateVoxel(cv::Point3f p) const;
+    float interpolateVoxel(cv::Point3f p) const;
     Point3f getNormalVoxel(cv::Point3f p) const;
 
 #if USE_INTRINSICS
-    TsdfType interpolateVoxel(const v_float32x4& p) const;
+    float interpolateVoxel(const v_float32x4& p) const;
     v_float32x4 getNormalVoxel(const v_float32x4& p) const;
 #endif
-
+    Vec6f frameParams;
+    Mat pixNorms;
     // See zFirstMemOrder arg of parent class constructor
     // for the array layout info
     // Consist of Voxel elements
diff --git a/modules/rgbd/test/test_tsdf.cpp b/modules/rgbd/test/test_tsdf.cpp
index bf327b01a..99da95796 100644
--- a/modules/rgbd/test/test_tsdf.cpp
+++ b/modules/rgbd/test/test_tsdf.cpp
@@ -97,7 +97,7 @@ struct SemisphereScene : Scene
 {
     const int framesPerCycle = 72;
     const float nCycles = 0.25f;
-    const Affine3f startPose = Affine3f(Vec3f(0.f, 0.f, 0.f), Vec3f(1.5f, 0.3f, -1.5f));
+    const Affine3f startPose = Affine3f(Vec3f(0.f, 0.f, 0.f), Vec3f(1.5f, 0.3f, -2.3f));
 
     Size frameSize;
     Matx33f intr;