diff --git a/examples_tests b/examples_tests
index e5d5ae2ca9..1dec4de5e5 160000
--- a/examples_tests
+++ b/examples_tests
@@ -1 +1 @@
-Subproject commit e5d5ae2ca9137a6966d00aa039f3e6dae7c23fb9
+Subproject commit 1dec4de5e5e92040150bf529ec311183efff3c8c
diff --git a/include/matrix3x4SIMD.h b/include/matrix3x4SIMD.h
deleted file mode 100644
index d52f305cec..0000000000
--- a/include/matrix3x4SIMD.h
+++ /dev/null
@@ -1,263 +0,0 @@
-// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O.
-// This file is part of the "Nabla Engine".
-// For conditions of distribution and use, see copyright notice in nabla.h
-
-#ifndef __NBL_MATRIX3X4SIMD_H_INCLUDED__
-#define __NBL_MATRIX3X4SIMD_H_INCLUDED__
-
-#include "vectorSIMD.h"
-#include "quaternion.h"
-
-namespace nbl::core
-{
-
-class matrix4x3;
-
-#define _NBL_MATRIX_ALIGNMENT _NBL_SIMD_ALIGNMENT
-static_assert(_NBL_MATRIX_ALIGNMENT>=_NBL_VECTOR_ALIGNMENT,"Matrix must be equally or more aligned than vector!");
-
-//! Equivalent of GLSL's mat4x3
-class matrix3x4SIMD// : private AllocationOverrideBase<_NBL_MATRIX_ALIGNMENT> EBO inheritance problem w.r.t `rows[3]`
-{
-	public:
-		_NBL_STATIC_INLINE_CONSTEXPR uint32_t VectorCount = 3u;
-		vectorSIMDf rows[VectorCount];
-
-		explicit matrix3x4SIMD(	const vectorSIMDf& _r0 = vectorSIMDf(1.f, 0.f, 0.f, 0.f),
-								const vectorSIMDf& _r1 = vectorSIMDf(0.f, 1.f, 0.f, 0.f),
-								const vectorSIMDf& _r2 = vectorSIMDf(0.f, 0.f, 1.f, 0.f)) : rows{_r0, _r1, _r2}
-		{
-		}
-
-		matrix3x4SIMD(	float _a00, float _a01, float _a02, float _a03,
-						float _a10, float _a11, float _a12, float _a13,
-						float _a20, float _a21, float _a22, float _a23)
-								: matrix3x4SIMD(vectorSIMDf(_a00, _a01, _a02, _a03),
-												vectorSIMDf(_a10, _a11, _a12, _a13),
-												vectorSIMDf(_a20, _a21, _a22, _a23))
-		{
-		}
-
-		explicit matrix3x4SIMD(const float* const _data)
-		{
-			if (!_data)
-				return;
-			for (size_t i = 0u; i < VectorCount; ++i)
-				rows[i] = vectorSIMDf(_data + 4*i);
-		}
-		matrix3x4SIMD(const float* const _data, bool ALIGNED)
-		{
-			if (!_data)
-				return;
-			for (size_t i = 0u; i < VectorCount; ++i)
-				rows[i] = vectorSIMDf(_data + 4*i, ALIGNED);
-		}
-
-		float* pointer() { return rows[0].pointer; }
-		const float* pointer() const { return rows[0].pointer; }
-
-		inline matrix3x4SIMD& set(const matrix4x3& _retarded);
-		inline matrix4x3 getAsRetardedIrrlichtMatrix() const;
-
-		static inline matrix3x4SIMD concatenateBFollowedByA(const matrix3x4SIMD& _a, const matrix3x4SIMD& _b);
-
-		static inline matrix3x4SIMD concatenateBFollowedByAPrecisely(const matrix3x4SIMD& _a, const matrix3x4SIMD& _b);
-
-		inline matrix3x4SIMD& concatenateAfter(const matrix3x4SIMD& _other)
-		{
-			return *this = concatenateBFollowedByA(*this, _other);
-		}
-
-		inline matrix3x4SIMD& concatenateBefore(const matrix3x4SIMD& _other)
-		{
-			return *this = concatenateBFollowedByA(_other, *this);
-		}
-
-		inline matrix3x4SIMD& concatenateAfterPrecisely(const matrix3x4SIMD& _other)
-		{
-			return *this = concatenateBFollowedByAPrecisely(*this, _other);
-		}
-
-		inline matrix3x4SIMD& concatenateBeforePrecisely(const matrix3x4SIMD& _other)
-		{
-			return *this = concatenateBFollowedByAPrecisely(_other, *this);
-		}
-
-		inline bool operator==(const matrix3x4SIMD& _other)
-		{
-			return !(*this != _other);
-		}
-
-		inline bool operator!=(const matrix3x4SIMD& _other);
-
-
-		inline matrix3x4SIMD operator-() const
-		{
-			matrix3x4SIMD retval;
-			retval.rows[0] = -rows[0];
-			retval.rows[1] = -rows[1];
-			retval.rows[2] = -rows[2];
-			return retval;
-		}
-
-
-		inline matrix3x4SIMD& operator+=(const matrix3x4SIMD& _other);
-		inline matrix3x4SIMD operator+(const matrix3x4SIMD& _other) const
-		{
-			matrix3x4SIMD retval(*this);
-			return retval += _other;
-		}
-
-		inline matrix3x4SIMD& operator-=(const matrix3x4SIMD& _other);
-		inline matrix3x4SIMD operator-(const matrix3x4SIMD& _other) const
-		{
-			matrix3x4SIMD retval(*this);
-			return retval -= _other;
-		}
-
-		inline matrix3x4SIMD& operator*=(float _scalar);
-		inline matrix3x4SIMD operator*(float _scalar) const
-		{
-			matrix3x4SIMD retval(*this);
-			return retval *= _scalar;
-		}
-
-		inline matrix3x4SIMD& setTranslation(const vectorSIMDf& _translation)
-		{
-			// no faster way of doing it?
-			rows[0].w = _translation.x;
-			rows[1].w = _translation.y;
-			rows[2].w = _translation.z;
-			return *this;
-		}
-		inline vectorSIMDf getTranslation() const;
-		inline vectorSIMDf getTranslation3D() const;
-
-		inline matrix3x4SIMD& setScale(const vectorSIMDf& _scale);
-
-		inline vectorSIMDf getScale() const;
-
-		inline void transformVect(vectorSIMDf& _out, const vectorSIMDf& _in) const;
-		inline void transformVect(vectorSIMDf& _in_out) const
-		{
-			transformVect(_in_out, _in_out);
-		}
-
-		inline void pseudoMulWith4x1(vectorSIMDf& _out, const vectorSIMDf& _in) const;
-		inline void pseudoMulWith4x1(vectorSIMDf& _in_out) const
-		{
-			pseudoMulWith4x1(_in_out,_in_out);
-		}
-
-		inline void mulSub3x3WithNx1(vectorSIMDf& _out, const vectorSIMDf& _in) const;
-		inline void mulSub3x3WithNx1(vectorSIMDf& _in_out) const
-		{
-			mulSub3x3WithNx1(_in_out, _in_out);
-		}
-
-		inline static matrix3x4SIMD buildCameraLookAtMatrixLH(
-			const vectorSIMDf& position,
-			const vectorSIMDf& target,
-			const vectorSIMDf& upVector);
-		inline static matrix3x4SIMD buildCameraLookAtMatrixRH(
-			const vectorSIMDf& position,
-			const vectorSIMDf& target,
-			const vectorSIMDf& upVector);
-
-		inline matrix3x4SIMD& setRotation(const quaternion& _quat);
-
-		inline matrix3x4SIMD& setScaleRotationAndTranslation(	const vectorSIMDf& _scale,
-																const quaternion& _quat,
-																const vectorSIMDf& _translation);
-
-		inline vectorSIMDf getPseudoDeterminant() const
-		{
-			vectorSIMDf tmp;
-			return determinant_helper(tmp);
-		}
-
-		inline bool getInverse(matrix3x4SIMD& _out) const;
-		bool makeInverse()
-		{
-			matrix3x4SIMD tmp;
-
-			if (getInverse(tmp))
-			{
-				*this = tmp;
-				return true;
-			}
-			return false;
-		}
-
-		//
-		inline bool getSub3x3InverseTranspose(matrix3x4SIMD& _out) const;
-
-		//
-		inline bool getSub3x3InverseTransposePacked(float outRows[9]) const
-		{
-			matrix3x4SIMD tmp;
-			if (!getSub3x3InverseTranspose(tmp))
-				return false;
-
-			float* _out = outRows;
-			for (auto i=0; i<3; i++)
-			{
-				const auto& row = tmp.rows[i];
-				for (auto j=0; j<3; j++)
-					*(_out++) = row[j];
-			}
-
-			return true;
-		}
-
-		//
-		inline core::matrix3x4SIMD getSub3x3TransposeCofactors() const;
-
-		//
-		inline void setTransformationCenter(const vectorSIMDf& _center, const vectorSIMDf& _translation);
-
-		//
-		static inline matrix3x4SIMD buildAxisAlignedBillboard(
-			const vectorSIMDf& camPos,
-			const vectorSIMDf& center,
-			const vectorSIMDf& translation,
-			const vectorSIMDf& axis,
-			const vectorSIMDf& from);
-
-
-		//
-		float& operator()(size_t _i, size_t _j) { return rows[_i].pointer[_j]; }
-		const float& operator()(size_t _i, size_t _j) const { return rows[_i].pointer[_j]; }
-
-		//
-		inline const vectorSIMDf& operator[](size_t _rown) const { return rows[_rown]; }
-		inline vectorSIMDf& operator[](size_t _rown) { return rows[_rown]; }
-
-	private:
-		static inline vectorSIMDf doJob(const __m128& a, const matrix3x4SIMD& _mtx);
-
-		// really need that dvec<2> or wider
-		inline __m128d halfRowAsDouble(size_t _n, bool _0) const;
-		static inline __m128d doJob_d(const __m128d& _a0, const __m128d& _a1, const matrix3x4SIMD& _mtx, bool _xyHalf);
-
-		vectorSIMDf determinant_helper(vectorSIMDf& r1crossr2) const
-		{
-			r1crossr2 = core::cross(rows[1], rows[2]);
-			return core::dot(rows[0], r1crossr2);
-		}
-};
-
-inline matrix3x4SIMD concatenateBFollowedByA(const matrix3x4SIMD& _a, const matrix3x4SIMD& _b)
-{
-    return matrix3x4SIMD::concatenateBFollowedByA(_a, _b);
-}
-/*
-inline matrix3x4SIMD concatenateBFollowedByAPrecisely(const matrix3x4SIMD& _a, const matrix3x4SIMD& _b)
-{
-    return matrix3x4SIMD::concatenateBFollowedByAPrecisely(_a, _b);
-}
-*/
-
-}
-
-#endif
diff --git a/include/matrix3x4SIMD_impl.h b/include/matrix3x4SIMD_impl.h
deleted file mode 100644
index 0e9022efd0..0000000000
--- a/include/matrix3x4SIMD_impl.h
+++ /dev/null
@@ -1,470 +0,0 @@
-// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O.
-// This file is part of the "Nabla Engine".
-// For conditions of distribution and use, see copyright notice in nabla.h
-
-#ifndef _NBL_MATRIX3X4SIMD_IMPL_H_INCLUDED_
-#define _NBL_MATRIX3X4SIMD_IMPL_H_INCLUDED_
-
-#include "matrix3x4SIMD.h"
-#include "nbl/core/math/glslFunctions.tcc"
-
-namespace nbl::core
-{
-
-// TODO: move to another implementation header
-inline quaternion::quaternion(const matrix3x4SIMD& m)
-{
-	const vectorSIMDf one(1.f);
-	auto Qx  = m.rows[0].xxxx()^vectorSIMDu32(0,0,0x80000000u,0x80000000u);
-	auto Qy  = m.rows[1].yyyy()^vectorSIMDu32(0,0x80000000u,0,0x80000000u);
-	auto Qz  = m.rows[2].zzzz()^vectorSIMDu32(0,0x80000000u,0x80000000u,0);
-
-	auto tmp = one+Qx+Qy+Qz;
-	auto invscales = inversesqrt(tmp)*0.5f;
-	auto scales = tmp*invscales*0.5f;
-
-	// TODO: speed this up
-	if (tmp.x > 0.0f)
-	{
-		X = (m(2, 1) - m(1, 2)) * invscales.x;
-		Y = (m(0, 2) - m(2, 0)) * invscales.x;
-		Z = (m(1, 0) - m(0, 1)) * invscales.x;
-		W = scales.x;
-	}
-	else
-	{
-		if (tmp.y>0.f)
-		{
-			X = scales.y;
-			Y = (m(0, 1) + m(1, 0)) * invscales.y;
-			Z = (m(2, 0) + m(0, 2)) * invscales.y;
-			W = (m(2, 1) - m(1, 2)) * invscales.y;
-		}
-		else if (tmp.z>0.f)
-		{
-			X = (m(0, 1) + m(1, 0)) * invscales.z;
-			Y = scales.z;
-			Z = (m(1, 2) + m(2, 1)) * invscales.z;
-			W = (m(0, 2) - m(2, 0)) * invscales.z;
-		}
-		else
-		{
-			X = (m(0, 2) + m(2, 0)) * invscales.w;
-			Y = (m(1, 2) + m(2, 1)) * invscales.w;
-			Z = scales.w;
-			W = (m(1, 0) - m(0, 1)) * invscales.w;
-		}
-	}
-
-	*this = normalize(*this);
-}
-
-inline bool matrix3x4SIMD::operator!=(const matrix3x4SIMD& _other)
-{
-	for (size_t i = 0u; i < VectorCount; ++i)
-		if ((rows[i] != _other.rows[i]).any())
-			return true;
-	return false;
-}
-
-inline matrix3x4SIMD& matrix3x4SIMD::operator+=(const matrix3x4SIMD& _other)
-{
-	for (size_t i = 0u; i < VectorCount; ++i)
-		rows[i] += _other.rows[i];
-	return *this;
-}
-inline matrix3x4SIMD& matrix3x4SIMD::operator-=(const matrix3x4SIMD& _other)
-{
-	for (size_t i = 0u; i < VectorCount; ++i)
-		rows[i] -= _other.rows[i];
-	return *this;
-}
-inline matrix3x4SIMD& matrix3x4SIMD::operator*=(float _scalar)
-{
-	for (size_t i = 0u; i < VectorCount; ++i)
-		rows[i] *= _scalar;
-	return *this;
-}
-
-#ifdef __NBL_COMPILE_WITH_SSE3
-#define BROADCAST32(fpx) _MM_SHUFFLE(fpx, fpx, fpx, fpx)
-#define BUILD_XORMASKF(_x_, _y_, _z_, _w_) _mm_setr_epi32(_x_ ? 0x80000000u:0x0u, _y_ ? 0x80000000u:0x0u, _z_ ? 0x80000000u:0x0u, _w_ ? 0x80000000u:0x0u)
-#define BUILD_MASKF(_x_, _y_, _z_, _w_) _mm_setr_epi32(_x_*0xffffffff, _y_*0xffffffff, _z_*0xffffffff, _w_*0xffffffff)
-
-inline matrix3x4SIMD matrix3x4SIMD::concatenateBFollowedByA(const matrix3x4SIMD& _a, const matrix3x4SIMD& _b)
-{
-#ifdef _NBL_DEBUG
-	assert(is_aligned_to(&_a, _NBL_SIMD_ALIGNMENT));
-	assert(is_aligned_to(&_b, _NBL_SIMD_ALIGNMENT));
-#endif // _NBL_DEBUG
-	__m128 r0 = _a.rows[0].getAsRegister();
-	__m128 r1 = _a.rows[1].getAsRegister();
-	__m128 r2 = _a.rows[2].getAsRegister();
-
-	matrix3x4SIMD out;
-	out.rows[0] = matrix3x4SIMD::doJob(r0, _b);
-	out.rows[1] = matrix3x4SIMD::doJob(r1, _b);
-	out.rows[2] = matrix3x4SIMD::doJob(r2, _b);
-
-	return out;
-}
-
-inline matrix3x4SIMD matrix3x4SIMD::concatenateBFollowedByAPrecisely(const matrix3x4SIMD& _a, const matrix3x4SIMD& _b)
-{
-	__m128d r00 = _a.halfRowAsDouble(0u, true);
-	__m128d r01 = _a.halfRowAsDouble(0u, false);
-	__m128d r10 = _a.halfRowAsDouble(1u, true);
-	__m128d r11 = _a.halfRowAsDouble(1u, false);
-	__m128d r20 = _a.halfRowAsDouble(2u, true);
-	__m128d r21 = _a.halfRowAsDouble(2u, false);
-
-	matrix3x4SIMD out;
-
-	const __m128i mask0011 = BUILD_MASKF(0, 0, 1, 1);
-
-	__m128 second = _mm_cvtpd_ps(matrix3x4SIMD::doJob_d(r00, r01, _b, false));
-	out.rows[0] = vectorSIMDf(_mm_cvtpd_ps(matrix3x4SIMD::doJob_d(r00, r01, _b, true))) | _mm_castps_si128((vectorSIMDf(_mm_movelh_ps(second, second)) & mask0011).getAsRegister());
-
-	second = _mm_cvtpd_ps(matrix3x4SIMD::doJob_d(r10, r11, _b, false));
-	out.rows[1] = vectorSIMDf(_mm_cvtpd_ps(matrix3x4SIMD::doJob_d(r10, r11, _b, true))) | _mm_castps_si128((vectorSIMDf(_mm_movelh_ps(second, second)) & mask0011).getAsRegister());
-
-	second = _mm_cvtpd_ps(matrix3x4SIMD::doJob_d(r20, r21, _b, false));
-	out.rows[2] = vectorSIMDf(_mm_cvtpd_ps(matrix3x4SIMD::doJob_d(r20, r21, _b, true))) | _mm_castps_si128((vectorSIMDf(_mm_movelh_ps(second, second)) & mask0011).getAsRegister());
-
-	return out;
-}
-
-inline vectorSIMDf matrix3x4SIMD::getTranslation() const
-{
-	__m128 xmm0 = _mm_unpackhi_ps(rows[0].getAsRegister(), rows[1].getAsRegister()); // (0z,1z,0w,1w)
-	__m128 xmm1 = _mm_unpackhi_ps(rows[2].getAsRegister(), _mm_setr_ps(0.f, 0.f, 0.f, 1.f)); // (2z,3z,2w,3w)
-	__m128 xmm2 = _mm_movehl_ps(xmm1, xmm0);// (0w,1w,2w,3w)
-
-	return xmm2;
-}
-inline vectorSIMDf matrix3x4SIMD::getTranslation3D() const
-{
-	__m128 xmm0 = _mm_unpackhi_ps(rows[0].getAsRegister(), rows[1].getAsRegister()); // (0z,1z,0w,1w)
-	__m128 xmm1 = _mm_unpackhi_ps(rows[2].getAsRegister(), _mm_setzero_ps()); // (2z,0,2w,0)
-	__m128 xmm2 = _mm_movehl_ps(xmm1, xmm0);// (0w,1w,2w,0)
-
-	return xmm2;
-}
-
-inline matrix3x4SIMD& matrix3x4SIMD::setScale(const core::vectorSIMDf& _scale)
-{
-	const vectorSIMDu32 mask0001 = vectorSIMDu32(BUILD_MASKF(0, 0, 0, 1));
-	const vectorSIMDu32 mask0010 = vectorSIMDu32(BUILD_MASKF(0, 0, 1, 0));
-	const vectorSIMDu32 mask0100 = vectorSIMDu32(BUILD_MASKF(0, 1, 0, 0));
-	const vectorSIMDu32 mask1000 = vectorSIMDu32(BUILD_MASKF(1, 0, 0, 0));
-
-	const vectorSIMDu32& scaleAlias = reinterpret_cast<const vectorSIMDu32&>(_scale);
-
-	vectorSIMDu32& rowAlias0 = reinterpret_cast<vectorSIMDu32&>(rows[0]);
-	vectorSIMDu32& rowAlias1 = reinterpret_cast<vectorSIMDu32&>(rows[1]);
-	vectorSIMDu32& rowAlias2 = reinterpret_cast<vectorSIMDu32&>(rows[2]);
-	rowAlias0 = (scaleAlias & reinterpret_cast<const vectorSIMDf&>(mask1000)) | (rowAlias0 & reinterpret_cast<const vectorSIMDf&>(mask0001));
-	rowAlias1 = (scaleAlias & reinterpret_cast<const vectorSIMDf&>(mask0100)) | (rowAlias1 & reinterpret_cast<const vectorSIMDf&>(mask0001));
-	rowAlias2 = (scaleAlias & reinterpret_cast<const vectorSIMDf&>(mask0010)) | (rowAlias2 & reinterpret_cast<const vectorSIMDf&>(mask0001));
-
-	return *this;
-}
-
-inline core::vectorSIMDf matrix3x4SIMD::getScale() const
-{
-	// xmm4-7 will now become columuns of B
-	__m128 xmm4 = rows[0].getAsRegister();
-	__m128 xmm5 = rows[1].getAsRegister();
-	__m128 xmm6 = rows[2].getAsRegister();
-	__m128 xmm7 = _mm_setzero_ps();
-	// g==0
-	__m128 xmm0 = _mm_unpacklo_ps(xmm4, xmm5);
-	__m128 xmm1 = _mm_unpacklo_ps(xmm6, xmm7); // (2x,g,2y,g)
-	__m128 xmm2 = _mm_unpackhi_ps(xmm4, xmm5);
-	__m128 xmm3 = _mm_unpackhi_ps(xmm6, xmm7); // (2z,g,2w,g)
-	xmm4 = _mm_movelh_ps(xmm1, xmm0); //(0x,1x,2x,g)
-	xmm5 = _mm_movehl_ps(xmm1, xmm0);
-	xmm6 = _mm_movelh_ps(xmm3, xmm2); //(0z,1z,2z,g)
-
-	// See http://www.robertblum.com/articles/2005/02/14/decomposing-matrices
-	// We have to do the full calculation.
-	xmm0 = _mm_mul_ps(xmm4, xmm4);// column 0 squared
-	xmm1 = _mm_mul_ps(xmm5, xmm5);// column 1 squared
-	xmm2 = _mm_mul_ps(xmm6, xmm6);// column 2 squared
-	xmm4 = _mm_hadd_ps(xmm0, xmm1);
-	xmm5 = _mm_hadd_ps(xmm2, xmm7);
-	xmm6 = _mm_hadd_ps(xmm4, xmm5);
-
-	return _mm_sqrt_ps(xmm6);
-}
-
-inline void matrix3x4SIMD::transformVect(vectorSIMDf& _out, const vectorSIMDf& _in) const
-{
-	vectorSIMDf r0 = rows[0] * _in,
-		r1 = rows[1] * _in,
-		r2 = rows[2] * _in;
-
-	_out =
-		_mm_hadd_ps(
-			_mm_hadd_ps(r0.getAsRegister(), r1.getAsRegister()),
-			_mm_hadd_ps(r2.getAsRegister(), _mm_set1_ps(0.25f))
-		);
-}
-
-inline void matrix3x4SIMD::pseudoMulWith4x1(vectorSIMDf& _out, const vectorSIMDf& _in) const
-{
-	__m128i mask1110 = BUILD_MASKF(1, 1, 1, 0);
-	_out = (_in & mask1110) | _mm_castps_si128(vectorSIMDf(0.f, 0.f, 0.f, 1.f).getAsRegister());
-	transformVect(_out);
-}
-
-inline void matrix3x4SIMD::mulSub3x3WithNx1(vectorSIMDf& _out, const vectorSIMDf& _in) const
-{
-	auto maskedIn = _in & BUILD_MASKF(1, 1, 1, 0);
-	vectorSIMDf r0 = rows[0] * maskedIn,
-		r1 = rows[1] * maskedIn,
-		r2 = rows[2] * maskedIn;
-
-	_out =
-		_mm_hadd_ps(
-			_mm_hadd_ps(r0.getAsRegister(), r1.getAsRegister()),
-			_mm_hadd_ps(r2.getAsRegister(), _mm_setzero_ps())
-		);
-}
-
-
-inline matrix3x4SIMD matrix3x4SIMD::buildCameraLookAtMatrixLH(
-	const core::vectorSIMDf& position,
-	const core::vectorSIMDf& target,
-	const core::vectorSIMDf& upVector)
-{
-	const core::vectorSIMDf zaxis = core::normalize(target - position);
-	const core::vectorSIMDf xaxis = core::normalize(core::cross(upVector, zaxis));
-	const core::vectorSIMDf yaxis = core::cross(zaxis, xaxis);
-
-	matrix3x4SIMD r;
-	r.rows[0] = xaxis;
-	r.rows[1] = yaxis;
-	r.rows[2] = zaxis;
-	r.rows[0].w = -dot(xaxis, position)[0];
-	r.rows[1].w = -dot(yaxis, position)[0];
-	r.rows[2].w = -dot(zaxis, position)[0];
-
-	return r;
-}
-inline matrix3x4SIMD matrix3x4SIMD::buildCameraLookAtMatrixRH(
-	const core::vectorSIMDf& position,
-	const core::vectorSIMDf& target,
-	const core::vectorSIMDf& upVector)
-{
-	const core::vectorSIMDf zaxis = core::normalize(position - target);
-	const core::vectorSIMDf xaxis = core::normalize(core::cross(upVector, zaxis));
-	const core::vectorSIMDf yaxis = core::cross(zaxis, xaxis);
-
-	matrix3x4SIMD r;
-	r.rows[0] = xaxis;
-	r.rows[1] = yaxis;
-	r.rows[2] = zaxis;
-	r.rows[0].w = -dot(xaxis, position)[0];
-	r.rows[1].w = -dot(yaxis, position)[0];
-	r.rows[2].w = -dot(zaxis, position)[0];
-
-	return r;
-}
-
-inline matrix3x4SIMD& matrix3x4SIMD::setRotation(const core::quaternion& _quat)
-{
-	const vectorSIMDu32 mask0001 = vectorSIMDu32(BUILD_MASKF(0, 0, 0, 1));
-	const __m128i mask1110 = BUILD_MASKF(1, 1, 1, 0);
-
-	const core::vectorSIMDf& quat = reinterpret_cast<const core::vectorSIMDf&>(_quat);
-	rows[0] = ((quat.yyyy() * ((quat.yxwx() & mask1110) * vectorSIMDf(2.f))) + (quat.zzzz() * (quat.zwxx() & mask1110) * vectorSIMDf(2.f, -2.f, 2.f, 0.f))) | (reinterpret_cast<const vectorSIMDu32&>(rows[0]) & (mask0001));
-	rows[0].x = 1.f - rows[0].x;
-
-	rows[1] = ((quat.zzzz() * ((quat.wzyx() & mask1110) * vectorSIMDf(2.f))) + (quat.xxxx() * (quat.yxwx() & mask1110) * vectorSIMDf(2.f, 2.f, -2.f, 0.f))) | (reinterpret_cast<const vectorSIMDu32&>(rows[1]) & (mask0001));
-	rows[1].y = 1.f - rows[1].y;
-
-	rows[2] = ((quat.xxxx() * ((quat.zwxx() & mask1110) * vectorSIMDf(2.f))) + (quat.yyyy() * (quat.wzyx() & mask1110) * vectorSIMDf(-2.f, 2.f, 2.f, 0.f))) | (reinterpret_cast<const vectorSIMDu32&>(rows[2]) & (mask0001));
-	rows[2].z = 1.f - rows[2].z;
-
-	return *this;
-}
-
-inline matrix3x4SIMD& matrix3x4SIMD::setScaleRotationAndTranslation(const vectorSIMDf& _scale, const core::quaternion& _quat, const vectorSIMDf& _translation)
-{
-	const __m128i mask1110 = BUILD_MASKF(1, 1, 1, 0);
-
-	const vectorSIMDf& quat = reinterpret_cast<const vectorSIMDf&>(_quat);
-	const vectorSIMDf dblScale = (_scale * 2.f) & mask1110;
-
-	vectorSIMDf mlt = dblScale ^ BUILD_XORMASKF(0, 1, 0, 0);
-	rows[0] = ((quat.yyyy() * ((quat.yxwx() & mask1110) * dblScale)) + (quat.zzzz() * (quat.zwxx() & mask1110) * mlt));
-	rows[0].x = _scale.x - rows[0].x;
-
-	mlt = dblScale ^ BUILD_XORMASKF(0, 0, 1, 0);
-	rows[1] = ((quat.zzzz() * ((quat.wzyx() & mask1110) * dblScale)) + (quat.xxxx() * (quat.yxwx() & mask1110) * mlt));
-	rows[1].y = _scale.y - rows[1].y;
-
-	mlt = dblScale ^ BUILD_XORMASKF(1, 0, 0, 0);
-	rows[2] = ((quat.xxxx() * ((quat.zwxx() & mask1110) * dblScale)) + (quat.yyyy() * (quat.wzyx() & mask1110) * mlt));
-	rows[2].z = _scale.z - rows[2].z;
-
-	setTranslation(_translation);
-
-	return *this;
-}
-
-
-inline bool matrix3x4SIMD::getInverse(matrix3x4SIMD& _out) const //! SUBOPTIMAL - OPTIMIZE!
-{
-	auto translation = getTranslation();
-	// `tmp` will have columns in its `rows`
-	core::matrix4SIMD tmp;
-	auto* cols = tmp.rows;
-	if (!getSub3x3InverseTranspose(reinterpret_cast<core::matrix3x4SIMD&>(tmp)))
-		return false;
-
-	// find inverse post-translation
-	cols[3] = -cols[0]*translation.xxxx()-cols[1]*translation.yyyy()-cols[2]*translation.zzzz();
-
-	// columns into rows
-	_out = transpose(tmp).extractSub3x4();
-
-	return true;
-}
-
-inline bool matrix3x4SIMD::getSub3x3InverseTranspose(core::matrix3x4SIMD& _out) const
-{
-	vectorSIMDf r1crossr2;
-	const vectorSIMDf d = determinant_helper(r1crossr2);
-	if (core::iszero(d.x, FLT_MIN))
-		return false;
-	auto rcp = core::reciprocal(d);
-
-	// matrix of cofactors * 1/det
-	_out = getSub3x3TransposeCofactors();
-	_out.rows[0] *= rcp;
-	_out.rows[1] *= rcp;
-	_out.rows[2] *= rcp;
-
-	return true;
-}
-
-inline core::matrix3x4SIMD matrix3x4SIMD::getSub3x3TransposeCofactors() const
-{
-	core::matrix3x4SIMD _out;
-	_out.rows[0] = core::cross(rows[1], rows[2]);
-	_out.rows[1] = core::cross(rows[2], rows[0]);
-	_out.rows[2] = core::cross(rows[0], rows[1]);
-	return _out;
-}
-
-// TODO: Double check this!-
-inline void matrix3x4SIMD::setTransformationCenter(const core::vectorSIMDf& _center, const core::vectorSIMDf& _translation)
-{
-	core::vectorSIMDf r0 = rows[0] * _center;
-	core::vectorSIMDf r1 = rows[1] * _center;
-	core::vectorSIMDf r2 = rows[2] * _center;
-	core::vectorSIMDf r3(0.f, 0.f, 0.f, 1.f);
-
-	__m128 col3 = _mm_hadd_ps(_mm_hadd_ps(r0.getAsRegister(), r1.getAsRegister()), _mm_hadd_ps(r2.getAsRegister(), r3.getAsRegister()));
-	const vectorSIMDf vcol3 = _center - _translation - col3;
-
-	for (size_t i = 0u; i < VectorCount; ++i)
-		rows[i].w = vcol3.pointer[i];
-}
-
-
-// TODO: Double check this!
-inline matrix3x4SIMD matrix3x4SIMD::buildAxisAlignedBillboard(
-	const core::vectorSIMDf& camPos,
-	const core::vectorSIMDf& center,
-	const core::vectorSIMDf& translation,
-	const core::vectorSIMDf& axis,
-	const core::vectorSIMDf& from)
-{
-	// axis of rotation
-	const core::vectorSIMDf up = core::normalize(axis);
-	const core::vectorSIMDf forward = core::normalize(camPos - center);
-	const core::vectorSIMDf right = core::normalize(core::cross(up, forward));
-
-	// correct look vector
-	const core::vectorSIMDf look = core::cross(right, up);
-
-	// rotate from to
-	// axis multiplication by sin
-	const core::vectorSIMDf vs = core::cross(look, from);
-
-	// cosinus angle
-	const core::vectorSIMDf ca = core::cross(from, look);
-
-	const core::vectorSIMDf vt(up * (core::vectorSIMDf(1.f) - ca));
-	const core::vectorSIMDf wt = vt * up.yzxx();
-	const core::vectorSIMDf vtuppca = vt * up + ca;
-
-	matrix3x4SIMD mat;
-	core::vectorSIMDf& row0 = mat.rows[0];
-	core::vectorSIMDf& row1 = mat.rows[1];
-	core::vectorSIMDf& row2 = mat.rows[2];
-
-	row0 = vtuppca & BUILD_MASKF(1, 0, 0, 0);
-	row1 = vtuppca & BUILD_MASKF(0, 1, 0, 0);
-	row2 = vtuppca & BUILD_MASKF(0, 0, 1, 0);
-
-	row0 += (wt.xxzx() + vs.xzyx() * core::vectorSIMDf(1.f, 1.f, -1.f, 1.f)) & BUILD_MASKF(0, 1, 1, 0);
-	row1 += (wt.xxyx() + vs.zxxx() * core::vectorSIMDf(-1.f, 1.f, 1.f, 1.f)) & BUILD_MASKF(1, 0, 1, 0);
-	row2 += (wt.zyxx() + vs.yxxx() * core::vectorSIMDf(1.f, -1.f, 1.f, 1.f)) & BUILD_MASKF(1, 1, 0, 0);
-
-	mat.setTransformationCenter(center, translation);
-	return mat;
-}
-
-
-
-inline vectorSIMDf matrix3x4SIMD::doJob(const __m128& a, const matrix3x4SIMD& _mtx)
-{
-	__m128 r0 = _mtx.rows[0].getAsRegister();
-	__m128 r1 = _mtx.rows[1].getAsRegister();
-	__m128 r2 = _mtx.rows[2].getAsRegister();
-
-	const __m128i mask = _mm_setr_epi32(0, 0, 0, 0xffffffff);
-
-	vectorSIMDf res;
-	res = _mm_mul_ps(_mm_shuffle_ps(a, a, BROADCAST32(0)), r0);
-	res += _mm_mul_ps(_mm_shuffle_ps(a, a, BROADCAST32(1)), r1);
-	res += _mm_mul_ps(_mm_shuffle_ps(a, a, BROADCAST32(2)), r2);
-	res += vectorSIMDf(a) & mask; // always 0 0 0 a3 -- no shuffle needed
-	return res;
-	}
-
-inline __m128d matrix3x4SIMD::halfRowAsDouble(size_t _n, bool _0) const
-{
-	return _mm_cvtps_pd(_0 ? rows[_n].xyxx().getAsRegister() : rows[_n].zwxx().getAsRegister());
-}
-inline __m128d matrix3x4SIMD::doJob_d(const __m128d& _a0, const __m128d& _a1, const matrix3x4SIMD& _mtx, bool _xyHalf)
-{
-	__m128d r0 = _mtx.halfRowAsDouble(0u, _xyHalf);
-	__m128d r1 = _mtx.halfRowAsDouble(1u, _xyHalf);
-	__m128d r2 = _mtx.halfRowAsDouble(2u, _xyHalf);
-
-	const __m128d mask01 = _mm_castsi128_pd(_mm_setr_epi32(0, 0, 0xffffffff, 0xffffffff));
-
-	__m128d res;
-	res = _mm_mul_pd(_mm_shuffle_pd(_a0, _a0, 0), r0);
-	res = _mm_add_pd(res, _mm_mul_pd(_mm_shuffle_pd(_a0, _a0, 3), r1));
-	res = _mm_add_pd(res, _mm_mul_pd(_mm_shuffle_pd(_a1, _a1, 0), r2));
-	if (!_xyHalf)
-		res = _mm_add_pd(res, _mm_and_pd(_a1, mask01));
-	return res;
-}
-
-#undef BUILD_MASKF
-#undef BUILD_XORMASKF
-#undef BROADCAST32
-#else
-#error "no implementation"
-#endif
-
-} // nbl::core
-
-#endif
diff --git a/include/matrix4SIMD.h b/include/matrix4SIMD.h
deleted file mode 100644
index 03126c61f7..0000000000
--- a/include/matrix4SIMD.h
+++ /dev/null
@@ -1,385 +0,0 @@
-// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O.
-// This file is part of the "Nabla Engine".
-// For conditions of distribution and use, see copyright notice in nabla.h
-
-#ifndef __NBL_MATRIX4SIMD_H_INCLUDED__
-#define __NBL_MATRIX4SIMD_H_INCLUDED__
-
-#include "matrix3x4SIMD.h"
-
-namespace nbl
-{
-namespace core
-{
-
-template<typename T>
-class aabbox3d;
-
-
-class matrix4SIMD// : public AlignedBase<_NBL_SIMD_ALIGNMENT> don't inherit from AlignedBase (which is empty) because member `rows[4]` inherits from it as well
-{
-	public:
-		_NBL_STATIC_INLINE_CONSTEXPR uint32_t VectorCount = 4u;
-		vectorSIMDf rows[VectorCount];
-
-		inline explicit matrix4SIMD(const vectorSIMDf& _r0 = vectorSIMDf(1.f, 0.f, 0.f, 0.f),
-									const vectorSIMDf& _r1 = vectorSIMDf(0.f, 1.f, 0.f, 0.f),
-									const vectorSIMDf& _r2 = vectorSIMDf(0.f, 0.f, 1.f, 0.f),
-									const vectorSIMDf& _r3 = vectorSIMDf(0.f, 0.f, 0.f, 1.f))
-											: rows{ _r0, _r1, _r2, _r3 }
-		{
-		}
-
-		inline matrix4SIMD(	float _a00, float _a01, float _a02, float _a03,
-							float _a10, float _a11, float _a12, float _a13,
-							float _a20, float _a21, float _a22, float _a23,
-							float _a30, float _a31, float _a32, float _a33)
-									: matrix4SIMD(	vectorSIMDf(_a00, _a01, _a02, _a03),
-													vectorSIMDf(_a10, _a11, _a12, _a13),
-													vectorSIMDf(_a20, _a21, _a22, _a23),
-													vectorSIMDf(_a30, _a31, _a32, _a33))
-		{
-		}
-
-		inline explicit matrix4SIMD(const float* const _data)
-		{
-			if (!_data)
-				return;
-			for (size_t i = 0u; i < VectorCount; ++i)
-				rows[i] = vectorSIMDf(_data + 4 * i);
-		}
-		inline matrix4SIMD(const float* const _data, bool ALIGNED)
-		{
-			if (!_data)
-				return;
-			for (size_t i = 0u; i < VectorCount; ++i)
-				rows[i] = vectorSIMDf(_data + 4 * i, ALIGNED);
-		}
-
-		inline explicit matrix4SIMD(const matrix3x4SIMD& smallMat)
-		{
-			*reinterpret_cast<matrix3x4SIMD*>(this) = smallMat;
-			rows[3].set(0.f,0.f,0.f,1.f);
-		}
-
-		inline matrix3x4SIMD extractSub3x4() const
-		{
-			return matrix3x4SIMD(rows[0],rows[1],rows[2]);
-		}
-
-		//! Access by row
-		inline const vectorSIMDf& getRow(size_t _rown) const{ return rows[_rown]; }
-		inline vectorSIMDf& getRow(size_t _rown) { return rows[_rown]; }
-
-		//! Access by element
-		inline float operator()(size_t _i, size_t _j) const { return rows[_i].pointer[_j]; }
-		inline float& operator()(size_t _i, size_t _j) { return rows[_i].pointer[_j]; }
-
-		//! Access for memory
-		inline const float* pointer() const {return rows[0].pointer;}
-		inline float* pointer() {return rows[0].pointer;}
-
-
-		inline bool operator==(const matrix4SIMD& _other) const
-		{
-			return !(*this != _other);
-		}
-		inline bool operator!=(const matrix4SIMD& _other) const;
-
-		inline matrix4SIMD& operator+=(const matrix4SIMD& _other);
-		inline matrix4SIMD operator+(const matrix4SIMD& _other) const
-		{
-			matrix4SIMD r{*this};
-			return r += _other;
-		}
-
-		inline matrix4SIMD& operator-=(const matrix4SIMD& _other);
-		inline matrix4SIMD operator-(const matrix4SIMD& _other) const
-		{
-			matrix4SIMD r{*this};
-			return r -= _other;
-		}
-
-		inline matrix4SIMD& operator*=(float _scalar);
-		inline matrix4SIMD operator*(float _scalar) const
-		{
-			matrix4SIMD r{*this};
-			return r *= _scalar;
-		}
-
-		static inline matrix4SIMD concatenateBFollowedByA(const matrix4SIMD& _a, const matrix4SIMD& _b);
-		static inline matrix4SIMD concatenateBFollowedByAPrecisely(const matrix4SIMD& _a, const matrix4SIMD& _b);
-
-		inline bool isIdentity() const
-		{
-			return *this == matrix4SIMD();
-		}
-		inline bool isIdentity(float _tolerance) const;
-
-		inline bool isOrthogonal() const
-		{
-			return concatenateBFollowedByA(transpose(*this), *this).isIdentity();
-		}
-		inline bool isOrthogonal(float _tolerance) const
-		{
-			return concatenateBFollowedByA(transpose(*this), *this).isIdentity(_tolerance);
-		}
-
-		inline matrix4SIMD& setScale(const core::vectorSIMDf& _scale);
-		inline matrix4SIMD& setScale(float _scale)
-		{
-			return setScale(vectorSIMDf(_scale));
-		}
-
-		inline void setTranslation(const float* _t)
-		{
-			for (size_t i = 0u; i < 3u; ++i)
-				rows[i].w = _t[i];
-		}
-		//! Takes into account only x,y,z components of _t
-		inline void setTranslation(const vectorSIMDf& _t)
-		{
-			setTranslation(_t.pointer);
-		}
-		inline void setTranslation(const vector3d<float>& _t)
-		{
-			setTranslation(&_t.X);
-		}
-
-		//! Returns last column of the matrix.
-		inline vectorSIMDf getTranslation() const;
-
-		//! Returns translation part of the matrix (w component is always 0).
-		inline vectorSIMDf getTranslation3D() const;
-		
-		enum class E_MATRIX_INVERSE_PRECISION
-		{
-		  EMIP_FAST_RECIPROCAL,
-		  EMIP_32BIT,
-		  EMIP_64BBIT
-		};
-
-		template<E_MATRIX_INVERSE_PRECISION precision = E_MATRIX_INVERSE_PRECISION::EMIP_FAST_RECIPROCAL>
-		inline bool getInverseTransform(matrix4SIMD& _out) const
-		{
-			if constexpr (precision == E_MATRIX_INVERSE_PRECISION::EMIP_64BBIT)
-			{
-				double a = rows[0][0], b = rows[0][1], c = rows[0][2], d = rows[0][3];
-				double e = rows[1][0], f = rows[1][1], g = rows[1][2], h = rows[1][3];
-				double i = rows[2][0], j = rows[2][1], k = rows[2][2], l = rows[2][3];
-				double m = rows[3][0], n = rows[3][1], o = rows[3][2], p = rows[3][3];
-
-				double kp_lo = k * p - l * o;
-				double jp_ln = j * p - l * n;
-				double jo_kn = j * o - k * n;
-				double ip_lm = i * p - l * m;
-				double io_km = i * o - k * m;
-				double in_jm = i * n - j * m;
-
-				double a11 = +(f * kp_lo - g * jp_ln + h * jo_kn);
-				double a12 = -(e * kp_lo - g * ip_lm + h * io_km);
-				double a13 = +(e * jp_ln - f * ip_lm + h * in_jm);
-				double a14 = -(e * jo_kn - f * io_km + g * in_jm);
-
-				double det = a * a11 + b * a12 + c * a13 + d * a14;
-
-				if (core::iszero(det, DBL_MIN))
-					return false;
-
-				double invDet = 1.0 / det;
-
-				_out.rows[0][0] = a11 * invDet;
-				_out.rows[1][0] = a12 * invDet;
-				_out.rows[2][0] = a13 * invDet;
-				_out.rows[3][0] = a14 * invDet;
-
-				_out.rows[0][1] = -(b * kp_lo - c * jp_ln + d * jo_kn) * invDet;
-				_out.rows[1][1] = +(a * kp_lo - c * ip_lm + d * io_km) * invDet;
-				_out.rows[2][1] = -(a * jp_ln - b * ip_lm + d * in_jm) * invDet;
-				_out.rows[3][1] = +(a * jo_kn - b * io_km + c * in_jm) * invDet;
-
-				double gp_ho = g * p - h * o;
-				double fp_hn = f * p - h * n;
-				double fo_gn = f * o - g * n;
-				double ep_hm = e * p - h * m;
-				double eo_gm = e * o - g * m;
-				double en_fm = e * n - f * m;
-
-				_out.rows[0][2] = +(b * gp_ho - c * fp_hn + d * fo_gn) * invDet;
-				_out.rows[1][2] = -(a * gp_ho - c * ep_hm + d * eo_gm) * invDet;
-				_out.rows[2][2] = +(a * fp_hn - b * ep_hm + d * en_fm) * invDet;
-				_out.rows[3][2] = -(a * fo_gn - b * eo_gm + c * en_fm) * invDet;
-
-				double gl_hk = g * l - h * k;
-				double fl_hj = f * l - h * j;
-				double fk_gj = f * k - g * j;
-				double el_hi = e * l - h * i;
-				double ek_gi = e * k - g * i;
-				double ej_fi = e * j - f * i;
-
-				_out.rows[0][3] = -(b * gl_hk - c * fl_hj + d * fk_gj) * invDet;
-				_out.rows[1][3] = +(a * gl_hk - c * el_hi + d * ek_gi) * invDet;
-				_out.rows[2][3] = -(a * fl_hj - b * el_hi + d * ej_fi) * invDet;
-				_out.rows[3][3] = +(a * fk_gj - b * ek_gi + c * ej_fi) * invDet;
-
-				return true;
-			}
-			else
-			{
-				auto mat2mul = [](vectorSIMDf _A, vectorSIMDf _B)
-				{
-					return _A*_B.xwxw()+_A.yxwz()*_B.zyzy();
-				};
-				auto mat2adjmul = [](vectorSIMDf _A, vectorSIMDf _B)
-				{
-					return _A.wwxx()*_B-_A.yyzz()*_B.zwxy();
-				};
-				auto mat2muladj = [](vectorSIMDf _A, vectorSIMDf _B)
-				{
-					return _A*_B.wxwx()-_A.yxwz()*_B.zyzy();
-				};
-
-				vectorSIMDf A = _mm_movelh_ps(rows[0].getAsRegister(), rows[1].getAsRegister());
-				vectorSIMDf B = _mm_movehl_ps(rows[1].getAsRegister(), rows[0].getAsRegister());
-				vectorSIMDf C = _mm_movelh_ps(rows[2].getAsRegister(), rows[3].getAsRegister());
-				vectorSIMDf D = _mm_movehl_ps(rows[3].getAsRegister(), rows[2].getAsRegister());
-
-				vectorSIMDf allDets =	vectorSIMDf(_mm_shuffle_ps(rows[0].getAsRegister(),rows[2].getAsRegister(),_MM_SHUFFLE(2,0,2,0)))*
-										vectorSIMDf(_mm_shuffle_ps(rows[1].getAsRegister(),rows[3].getAsRegister(),_MM_SHUFFLE(3,1,3,1)))
-									-
-										vectorSIMDf(_mm_shuffle_ps(rows[0].getAsRegister(),rows[2].getAsRegister(),_MM_SHUFFLE(3,1,3,1)))*
-										vectorSIMDf(_mm_shuffle_ps(rows[1].getAsRegister(),rows[3].getAsRegister(),_MM_SHUFFLE(2,0,2,0)));
-
-				auto detA = allDets.xxxx();
-				auto detB = allDets.yyyy();
-				auto detC = allDets.zzzz();
-				auto detD = allDets.wwww();
-
-				// https://lxjk.github.io/2017/09/03/Fast-4x4-Matrix-Inverse-with-SSE-SIMD-Explained.html
-				auto D_C = mat2adjmul(D, C);
-				// A#B
-				auto A_B = mat2adjmul(A, B);
-				// X# = |D|A - B(D#C)
-				auto X_ = detD*A - mat2mul(B, D_C);
-				// W# = |A|D - C(A#B)
-				auto W_ = detA*D - mat2mul(C, A_B);
-
-				// |M| = |A|*|D| + ... (continue later)
-				auto detM = detA*detD;
-
-				// Y# = |B|C - D(A#B)#
-				auto Y_ = detB*C - mat2muladj(D, A_B);
-				// Z# = |C|B - A(D#C)#
-				auto Z_ = detC*B -  mat2muladj(A, D_C);
-
-				// |M| = |A|*|D| + |B|*|C| ... (continue later)
-				detM += detB*detC;
-
-				// tr((A#B)(D#C))
-				__m128 tr = (A_B*D_C.xzyw()).getAsRegister();
-				tr = _mm_hadd_ps(tr, tr);
-				tr = _mm_hadd_ps(tr, tr);
-				// |M| = |A|*|D| + |B|*|C| - tr((A#B)(D#C)
-				detM -= tr;
-
-				if (core::iszero(detM.x, FLT_MIN))
-					return false;
-
-				vectorSIMDf rDetM;
-
-				// (1/|M|, -1/|M|, -1/|M|, 1/|M|)
-				if constexpr (precision == E_MATRIX_INVERSE_PRECISION::EMIP_FAST_RECIPROCAL)
-					rDetM = vectorSIMDf(1.f, -1.f, -1.f, 1.f)*core::reciprocal(detM);
-				else if constexpr (precision == E_MATRIX_INVERSE_PRECISION::EMIP_32BIT)
-					rDetM = vectorSIMDf(1.f, -1.f, -1.f, 1.f).preciseDivision(detM);
-
-				X_ *= rDetM;
-				Y_ *= rDetM;
-				Z_ *= rDetM;
-				W_ *= rDetM;
-
-				// apply adjugate and store, here we combine adjugate shuffle and store shuffle
-				_out.rows[0] = _mm_shuffle_ps(X_.getAsRegister(), Y_.getAsRegister(), _MM_SHUFFLE(1, 3, 1, 3));
-				_out.rows[1] = _mm_shuffle_ps(X_.getAsRegister(), Y_.getAsRegister(), _MM_SHUFFLE(0, 2, 0, 2));
-				_out.rows[2] = _mm_shuffle_ps(Z_.getAsRegister(), W_.getAsRegister(), _MM_SHUFFLE(1, 3, 1, 3));
-				_out.rows[3] = _mm_shuffle_ps(Z_.getAsRegister(), W_.getAsRegister(), _MM_SHUFFLE(0, 2, 0, 2));
-
-				return true;
-			}
-		}
-
-		inline vectorSIMDf sub3x3TransformVect(const vectorSIMDf& _in) const;
-
-		inline void transformVect(vectorSIMDf& _out, const vectorSIMDf& _in) const;
-		inline void transformVect(vectorSIMDf& _vector) const
-		{
-			transformVect(_vector, _vector);
-		}
-
-		inline void translateVect(vectorSIMDf& _vect) const
-		{
-			_vect += getTranslation();
-		}
-
-		bool isBoxInFrustum(const aabbox3d<float>& bbox);
-
-		bool perspectiveTransformVect(core::vectorSIMDf& inOutVec)
-		{
-			transformVect(inOutVec);
-			const bool inFront = inOutVec[3] > 0.f;
-			inOutVec /= inOutVec.wwww();
-			return inFront;
-		}
-
-		core::vector2di fragCoordTransformVect(const core::vectorSIMDf& _in, const core::dimension2du& viewportDimensions)
-		{
-			core::vectorSIMDf pos(_in);
-			pos.w = 1.f;
-			if (perspectiveTransformVect(pos))
-				core::vector2di(-0x80000000, -0x80000000);
-
-			pos[0] *= 0.5f;
-			pos[1] *= 0.5f;
-			pos[0] += 0.5f;
-			pos[1] += 0.5f;
-
-			return core::vector2di(pos[0] * float(viewportDimensions.Width), pos[1] * float(viewportDimensions.Height));
-		}
-
-		static inline matrix4SIMD buildProjectionMatrixPerspectiveFovRH(float fieldOfViewRadians, float aspectRatio, float zNear, float zFar);
-		static inline matrix4SIMD buildProjectionMatrixPerspectiveFovLH(float fieldOfViewRadians, float aspectRatio, float zNear, float zFar);
-
-		static inline matrix4SIMD buildProjectionMatrixOrthoRH(float widthOfViewVolume, float heightOfViewVolume, float zNear, float zFar);
-		static inline matrix4SIMD buildProjectionMatrixOrthoLH(float widthOfViewVolume, float heightOfViewVolume, float zNear, float zFar);
-
-		//! Access by row
-		inline const vectorSIMDf& operator[](size_t _rown) const { return rows[_rown]; }
-		//! Access by row
-		inline vectorSIMDf& operator[](size_t _rown) { return rows[_rown]; }
-
-	private:
-		//! TODO: implement a dvec<2>
-		inline __m128d halfRowAsDouble(size_t _n, bool _firstHalf) const;
-		static inline __m128d concat64_helper(const __m128d& _a0, const __m128d& _a1, const matrix4SIMD& _mtx, bool _firstHalf);
-};
-
-inline matrix4SIMD operator*(float _scalar, const matrix4SIMD& _mtx)
-{
-    return _mtx * _scalar;
-}
-
-inline matrix4SIMD concatenateBFollowedByA(const matrix4SIMD& _a, const matrix4SIMD& _b)
-{
-    return matrix4SIMD::concatenateBFollowedByA(_a, _b);
-}
-/*
-inline matrix4SIMD concatenateBFollowedByAPrecisely(const matrix4SIMD& _a, const matrix4SIMD& _b)
-{
-    return matrix4SIMD::concatenateBFollowedByAPrecisely(_a, _b);
-}
-*/
-
-
-}} // nbl::core
-
-#endif
diff --git a/include/matrix4SIMD_impl.h b/include/matrix4SIMD_impl.h
deleted file mode 100644
index 02484e7a4c..0000000000
--- a/include/matrix4SIMD_impl.h
+++ /dev/null
@@ -1,299 +0,0 @@
-// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O.
-// This file is part of the "Nabla Engine".
-// For conditions of distribution and use, see copyright notice in nabla.h
-
-#ifndef __NBL_MATRIX4SIMD_IMPL_H_INCLUDED__
-#define __NBL_MATRIX4SIMD_IMPL_H_INCLUDED__
-
-#include "matrix4SIMD.h"
-#include "nbl/core/math/glslFunctions.tcc"
-#include "aabbox3d.h"
-
-namespace nbl
-{
-namespace core
-{
-
-
-inline bool matrix4SIMD::operator!=(const matrix4SIMD& _other) const
-{
-	for (size_t i = 0u; i < VectorCount; ++i)
-		if ((rows[i] != _other.rows[i]).any())
-			return true;
-	return false;
-}
-
-inline matrix4SIMD& matrix4SIMD::operator+=(const matrix4SIMD& _other)
-{
-	for (size_t i = 0u; i < VectorCount; ++i)
-		rows[i] += _other.rows[i];
-	return *this;
-}
-
-inline matrix4SIMD& matrix4SIMD::operator-=(const matrix4SIMD& _other)
-{
-	for (size_t i = 0u; i < VectorCount; ++i)
-		rows[i] -= _other.rows[i];
-	return *this;
-}
-
-inline matrix4SIMD& matrix4SIMD::operator*=(float _scalar)
-{
-	for (size_t i = 0u; i < VectorCount; ++i)
-		rows[i] *= _scalar;
-	return *this;
-}
-
-inline bool matrix4SIMD::isIdentity(float _tolerance) const
-{
-	return core::equals<matrix4SIMD>(*this, matrix4SIMD(), core::ROUNDING_ERROR<matrix4SIMD>());
-}
-
-#ifdef __NBL_COMPILE_WITH_SSE3
-#define BROADCAST32(fpx) _MM_SHUFFLE(fpx, fpx, fpx, fpx)
-#define BUILD_MASKF(_x_, _y_, _z_, _w_) _mm_setr_epi32(_x_*0xffffffff, _y_*0xffffffff, _z_*0xffffffff, _w_*0xffffffff)
-inline matrix4SIMD matrix4SIMD::concatenateBFollowedByA(const matrix4SIMD& _a, const matrix4SIMD& _b)
-{
-	auto calcRow = [](const __m128& _row, const matrix4SIMD& _mtx)
-	{
-		__m128 r0 = _mtx.rows[0].getAsRegister();
-		__m128 r1 = _mtx.rows[1].getAsRegister();
-		__m128 r2 = _mtx.rows[2].getAsRegister();
-		__m128 r3 = _mtx.rows[3].getAsRegister();
-
-		__m128 res;
-		res = _mm_mul_ps(_mm_shuffle_ps(_row, _row, BROADCAST32(0)), r0);
-		res = _mm_add_ps(res, _mm_mul_ps(_mm_shuffle_ps(_row, _row, BROADCAST32(1)), r1));
-		res = _mm_add_ps(res, _mm_mul_ps(_mm_shuffle_ps(_row, _row, BROADCAST32(2)), r2));
-		res = _mm_add_ps(res, _mm_mul_ps(_mm_shuffle_ps(_row, _row, BROADCAST32(3)), r3));
-		return res;
-	};
-
-	matrix4SIMD r;
-	for (size_t i = 0u; i < 4u; ++i)
-		r.rows[i] = calcRow(_a.rows[i].getAsRegister(), _b);
-
-	return r;
-}
-inline matrix4SIMD matrix4SIMD::concatenateBFollowedByAPrecisely(const matrix4SIMD& _a, const matrix4SIMD& _b)
-{
-	matrix4SIMD out;
-
-	__m128i mask0011 = BUILD_MASKF(0, 0, 1, 1);
-	__m128 second;
-
-	{
-	__m128d r00 = _a.halfRowAsDouble(0u, true);
-	__m128d r01 = _a.halfRowAsDouble(0u, false);
-	second = _mm_cvtpd_ps(concat64_helper(r00, r01, _b, false));
-	out.rows[0] = vectorSIMDf(_mm_cvtpd_ps(concat64_helper(r00, r01, _b, true))) | _mm_castps_si128((vectorSIMDf(_mm_movelh_ps(second, second)) & mask0011).getAsRegister());
-	}
-
-	{
-	__m128d r10 = _a.halfRowAsDouble(1u, true);
-	__m128d r11 = _a.halfRowAsDouble(1u, false);
-	second = _mm_cvtpd_ps(concat64_helper(r10, r11, _b, false));
-	out.rows[1] = vectorSIMDf(_mm_cvtpd_ps(concat64_helper(r10, r11, _b, true))) | _mm_castps_si128((vectorSIMDf(_mm_movelh_ps(second, second)) & mask0011).getAsRegister());
-	}
-
-	{
-	__m128d r20 = _a.halfRowAsDouble(2u, true);
-	__m128d r21 = _a.halfRowAsDouble(2u, false);
-	second = _mm_cvtpd_ps(concat64_helper(r20, r21, _b, false));
-	out.rows[2] = vectorSIMDf(_mm_cvtpd_ps(concat64_helper(r20, r21, _b, true))) | _mm_castps_si128((vectorSIMDf(_mm_movelh_ps(second, second)) & mask0011).getAsRegister());
-	}
-
-	{
-	__m128d r30 = _a.halfRowAsDouble(3u, true);
-	__m128d r31 = _a.halfRowAsDouble(3u, false);
-	second = _mm_cvtpd_ps(concat64_helper(r30, r31, _b, false));
-	out.rows[3] = vectorSIMDf(_mm_cvtpd_ps(concat64_helper(r30, r31, _b, true))) | _mm_castps_si128((vectorSIMDf(_mm_movelh_ps(second, second)) & mask0011).getAsRegister());
-	}
-
-	return out;
-}
-
-inline matrix4SIMD& matrix4SIMD::setScale(const core::vectorSIMDf& _scale)
-{
-	const __m128i mask0001 = BUILD_MASKF(0, 0, 0, 1);
-
-	rows[0] = (_scale & BUILD_MASKF(1, 0, 0, 0)) | _mm_castps_si128((rows[0] & mask0001).getAsRegister());
-	rows[1] = (_scale & BUILD_MASKF(0, 1, 0, 0)) | _mm_castps_si128((rows[1] & mask0001).getAsRegister());
-	rows[2] = (_scale & BUILD_MASKF(0, 0, 1, 0)) | _mm_castps_si128((rows[2] & mask0001).getAsRegister());
-	rows[3] = vectorSIMDf(0.f, 0.f, 0.f, 1.f);
-
-	return *this;
-}
-
-//! Returns last column of the matrix.
-inline vectorSIMDf matrix4SIMD::getTranslation() const
-{
-	__m128 tmp1 = _mm_unpackhi_ps(rows[0].getAsRegister(), rows[1].getAsRegister()); // (0z,1z,0w,1w)
-	__m128 tmp2 = _mm_unpackhi_ps(rows[2].getAsRegister(), rows[3].getAsRegister()); // (2z,3z,2w,3w)
-	__m128 col3 = _mm_movehl_ps(tmp1, tmp2);// (0w,1w,2w,3w)
-
-	return col3;
-}
-//! Returns translation part of the matrix (w component is always 0).
-inline vectorSIMDf matrix4SIMD::getTranslation3D() const
-{
-	__m128 tmp1 = _mm_unpackhi_ps(rows[0].getAsRegister(), rows[1].getAsRegister()); // (0z,1z,0w,1w)
-	__m128 tmp2 = _mm_unpackhi_ps(rows[2].getAsRegister(), _mm_setzero_ps()); // (2z,0,2w,0)
-	__m128 transl = _mm_movehl_ps(tmp1, tmp2);// (0w,1w,2w,0)
-
-	return transl;
-}
-
-inline vectorSIMDf matrix4SIMD::sub3x3TransformVect(const vectorSIMDf& _in) const
-{
-	matrix4SIMD cp{*this};
-	vectorSIMDf out = _in & BUILD_MASKF(1, 1, 1, 0);
-	transformVect(out);
-	return out;
-}
-
-inline void matrix4SIMD::transformVect(vectorSIMDf& _out, const vectorSIMDf& _in) const
-{
-	vectorSIMDf r[4];
-	for (size_t i = 0u; i < VectorCount; ++i)
-		r[i] = rows[i] * _in;
-
-	_out = _mm_hadd_ps(
-		_mm_hadd_ps(r[0].getAsRegister(), r[1].getAsRegister()),
-		_mm_hadd_ps(r[2].getAsRegister(), r[3].getAsRegister())
-	);
-}
-
-inline matrix4SIMD matrix4SIMD::buildProjectionMatrixPerspectiveFovRH(float fieldOfViewRadians, float aspectRatio, float zNear, float zFar)
-{
-	const float h = core::reciprocal<float>(tanf(fieldOfViewRadians*0.5f));
-	_NBL_DEBUG_BREAK_IF(aspectRatio == 0.f); //division by zero
-	const float w = h / aspectRatio;
-
-	_NBL_DEBUG_BREAK_IF(zNear == zFar); //division by zero
-
-	matrix4SIMD m;
-	m.rows[0] = vectorSIMDf(w, 0.f, 0.f, 0.f);
-	m.rows[1] = vectorSIMDf(0.f, -h, 0.f, 0.f);
-	m.rows[2] = vectorSIMDf(0.f, 0.f, -zFar/(zFar-zNear), -zNear*zFar/(zFar-zNear));
-	m.rows[3] = vectorSIMDf(0.f, 0.f, -1.f, 0.f);
-
-	return m;
-}
-inline matrix4SIMD matrix4SIMD::buildProjectionMatrixPerspectiveFovLH(float fieldOfViewRadians, float aspectRatio, float zNear, float zFar)
-{
-	const float h = core::reciprocal<float>(tanf(fieldOfViewRadians*0.5f));
-	_NBL_DEBUG_BREAK_IF(aspectRatio == 0.f); //division by zero
-	const float w = h / aspectRatio;
-
-	_NBL_DEBUG_BREAK_IF(zNear == zFar); //division by zero
-
-	matrix4SIMD m;
-	m.rows[0] = vectorSIMDf(w, 0.f, 0.f, 0.f);
-	m.rows[1] = vectorSIMDf(0.f, -h, 0.f, 0.f);
-	m.rows[2] = vectorSIMDf(0.f, 0.f, zFar/(zFar-zNear), -zNear*zFar/(zFar-zNear));
-	m.rows[3] = vectorSIMDf(0.f, 0.f, 1.f, 0.f);
-
-	return m;
-}
-
-inline matrix4SIMD matrix4SIMD::buildProjectionMatrixOrthoRH(float widthOfViewVolume, float heightOfViewVolume, float zNear, float zFar)
-{
-	_NBL_DEBUG_BREAK_IF(widthOfViewVolume == 0.f); //division by zero
-	_NBL_DEBUG_BREAK_IF(heightOfViewVolume == 0.f); //division by zero
-	_NBL_DEBUG_BREAK_IF(zNear == zFar); //division by zero
-
-	matrix4SIMD m;
-	m.rows[0] = vectorSIMDf(2.f/widthOfViewVolume, 0.f, 0.f, 0.f);
-	m.rows[1] = vectorSIMDf(0.f, -2.f/heightOfViewVolume, 0.f, 0.f);
-	m.rows[2] = vectorSIMDf(0.f, 0.f, -1.f/(zFar-zNear), -zNear/(zFar-zNear));
-	m.rows[3] = vectorSIMDf(0.f, 0.f, 0.f, 1.f);
-
-	return m;
-}
-inline matrix4SIMD matrix4SIMD::buildProjectionMatrixOrthoLH(float widthOfViewVolume, float heightOfViewVolume, float zNear, float zFar)
-{
-	_NBL_DEBUG_BREAK_IF(widthOfViewVolume == 0.f); //division by zero
-	_NBL_DEBUG_BREAK_IF(heightOfViewVolume == 0.f); //division by zero
-	_NBL_DEBUG_BREAK_IF(zNear == zFar); //division by zero
-
-	matrix4SIMD m;
-	m.rows[0] = vectorSIMDf(2.f/widthOfViewVolume, 0.f, 0.f, 0.f);
-	m.rows[1] = vectorSIMDf(0.f, -2.f/heightOfViewVolume, 0.f, 0.f);
-	m.rows[2] = vectorSIMDf(0.f, 0.f, 1.f/(zFar-zNear), -zNear/(zFar-zNear));
-	m.rows[3] = vectorSIMDf(0.f, 0.f, 0.f, 1.f);
-
-	return m;
-}
-
-
-
-inline __m128d matrix4SIMD::halfRowAsDouble(size_t _n, bool _firstHalf) const
-{
-	return _mm_cvtps_pd(_firstHalf ? rows[_n].xyxx().getAsRegister() : rows[_n].zwxx().getAsRegister());
-}
-inline __m128d matrix4SIMD::concat64_helper(const __m128d& _a0, const __m128d& _a1, const matrix4SIMD& _mtx, bool _firstHalf)
-{
-	__m128d r0 = _mtx.halfRowAsDouble(0u, _firstHalf);
-	__m128d r1 = _mtx.halfRowAsDouble(1u, _firstHalf);
-	__m128d r2 = _mtx.halfRowAsDouble(2u, _firstHalf);
-	__m128d r3 = _mtx.halfRowAsDouble(3u, _firstHalf);
-
-	//const __m128d mask01 = _mm_castsi128_pd(_mm_setr_epi32(0, 0, 0xffffffff, 0xffffffff));
-
-	__m128d res;
-	res = _mm_mul_pd(_mm_shuffle_pd(_a0, _a0, 0), r0);
-	res = _mm_add_pd(res, _mm_mul_pd(_mm_shuffle_pd(_a0, _a0, 3/*0b11*/), r1));
-	res = _mm_add_pd(res, _mm_mul_pd(_mm_shuffle_pd(_a1, _a1, 0), r2));
-	res = _mm_add_pd(res, _mm_mul_pd(_mm_shuffle_pd(_a1, _a1, 3/*0b11*/), r3));
-	return res;
-}
-
-#undef BUILD_MASKF
-#undef BROADCAST32
-#else
-#error "no implementation"
-#endif
-
-inline bool matrix4SIMD::isBoxInFrustum(const aabbox3d<float>& bbox)
-{
-	vectorSIMDf MinEdge, MaxEdge;
-	MinEdge.set(bbox.MinEdge);
-	MaxEdge.set(bbox.MaxEdge);
-	MinEdge.w = 1.f;
-	MaxEdge.w = 1.f;
-
-
-	auto getClosestDP = [&MinEdge,&MaxEdge](const vectorSIMDf& toDot) -> float
-	{
-		return dot(mix(MaxEdge,MinEdge,toDot<vectorSIMDf(0.f)),toDot)[0];
-	};
-
-	// near plane
-	if (getClosestDP(rows[3])<=0.f)
-		return false;
-
-	// x max
-	if (getClosestDP(rows[3]+rows[0])<=0.f)
-		return false;
-	// y max
-	if (getClosestDP(rows[3]+rows[1])<=0.f)
-		return false;
-	// x min
-	if (getClosestDP(rows[3]-rows[0])<=0.f)
-		return false;
-	// y min
-	if (getClosestDP(rows[3]-rows[1])<=0.f)
-		return false;
-
-	// far plane
-	if (getClosestDP(rows[3]+rows[2])<=0.f)
-		return false;
-
-	return true;
-}
-
-}
-} // nbl::core
-
-#endif
diff --git a/include/nabla.h b/include/nabla.h
index dac3155e5e..fa231e3db7 100644
--- a/include/nabla.h
+++ b/include/nabla.h
@@ -53,9 +53,7 @@
 #include "vector3d.h"
 #include "vectorSIMD.h"
 #include "line3d.h"
-#include "matrix4SIMD.h"
 #include "position2d.h"
-#include "quaternion.h"
 #include "rect.h"
 #include "dimension2d.h"
 
diff --git a/include/nbl/asset/IAccelerationStructure.h b/include/nbl/asset/IAccelerationStructure.h
index 755c81e477..6d64a2b769 100644
--- a/include/nbl/asset/IAccelerationStructure.h
+++ b/include/nbl/asset/IAccelerationStructure.h
@@ -204,7 +204,7 @@ class ITopLevelAccelerationStructure : public IDescriptor, public IAccelerationS
 			FORCE_OPACITY_MICROMAP_2_STATE_BIT = 0x1u<<4u,
 			FORCE_DISABLE_OPACITY_MICROMAPS_BIT = 0x1u<<5u,
 		};
-		// Note: `core::matrix3x4SIMD` is equvalent to VkTransformMatrixKHR, 4x3 row_major matrix
+		// Note: `hlsl::float32_t3x4` is equvalent to VkTransformMatrixKHR, 4x3 row_major matrix
 		template<typename _blas_ref_t>
 		struct Instance final
 		{
@@ -221,18 +221,18 @@ class ITopLevelAccelerationStructure : public IDescriptor, public IAccelerationS
 		template<typename blas_ref_t>
 		struct StaticInstance final
 		{
-			core::matrix3x4SIMD	transform = core::matrix3x4SIMD();
+			hlsl::float32_t3x4	transform = hlsl::float32_t3x4();
 			Instance<blas_ref_t> base = {};
 		};
 		template<typename blas_ref_t>
 		struct MatrixMotionInstance final
 		{
-			core::matrix3x4SIMD transform[2] = {core::matrix3x4SIMD(),core::matrix3x4SIMD()};
+			hlsl::float32_t3x4 transform[2] = {hlsl::float32_t3x4(),hlsl::float32_t3x4()};
 			Instance<blas_ref_t> base = {};
 		};
 		struct SRT
 		{
-			// TODO: some operators to convert back and forth from `core::matrix3x4SIMD
+			// TODO: some operators to convert back and forth from `hlsl::float32_t3x4
 
 			float    sx;
 			float    a;
diff --git a/include/nbl/asset/IAnimationLibrary.h b/include/nbl/asset/IAnimationLibrary.h
index 9665349103..3ab87e5d32 100644
--- a/include/nbl/asset/IAnimationLibrary.h
+++ b/include/nbl/asset/IAnimationLibrary.h
@@ -34,7 +34,7 @@ class IAnimationLibrary : public virtual core::IReferenceCounted
 					translation[2] = translation[1] = translation[0] = 0.f;
 					quat = core::vectorSIMDu32(128u,128u,128u,255u); // should be (0,0,0,1) encoded
 				}
-				Keyframe(const core::vectorSIMDf& _scale, const core::quaternion& _quat, const CQuantQuaternionCache* quantCache, const core::vectorSIMDf& _translation)
+				Keyframe(const core::vectorSIMDf& _scale, const hlsl::math::quaternion<float>& _quat, const CQuantQuaternionCache* quantCache, const core::vectorSIMDf& _translation)
 				{
 					std::copy(_translation.pointer,_translation.pointer+3,translation);
 					quat = quantCache->template quantize<decltype(quat)>(_quat);
@@ -42,13 +42,13 @@ class IAnimationLibrary : public virtual core::IReferenceCounted
 					//scale = ;
 				}
 
-				inline core::quaternion getRotation() const
+				inline hlsl::math::quaternion<float> getRotation() const
 				{
 					const void* _pix[4] = {&quat,nullptr,nullptr,nullptr};
 					double out[4];
 					decodePixels<EF_R8G8B8A8_SNORM,double>(_pix,out,0u,0u);
 					auto q = core::normalize(core::vectorSIMDf(out[0],out[1],out[2],out[3]));
-					return reinterpret_cast<const core::quaternion*>(&q)[0];
+					return reinterpret_cast<const hlsl::math::quaternion<float>*>(&q)[0];
 				}
 
 				inline core::vectorSIMDf getScale() const
diff --git a/include/nbl/asset/ICPUSkeleton.h b/include/nbl/asset/ICPUSkeleton.h
index 1049798268..7418e46ce3 100644
--- a/include/nbl/asset/ICPUSkeleton.h
+++ b/include/nbl/asset/ICPUSkeleton.h
@@ -42,15 +42,15 @@ class ICPUSkeleton final : public ISkeleton<ICPUBuffer>, public IAsset
 		}
 
 		//!
-		inline const core::matrix3x4SIMD& getDefaultTransformMatrix(base_t::joint_id_t jointID) const
+		inline const hlsl::float32_t3x4& getDefaultTransformMatrix(base_t::joint_id_t jointID) const
 		{
 			const uint8_t* ptr = reinterpret_cast<const uint8_t*>(m_defaultTransforms.buffer->getPointer());
-			return reinterpret_cast<const core::matrix3x4SIMD*>(ptr+m_defaultTransforms.offset)[jointID];
+			return reinterpret_cast<const hlsl::float32_t3x4*>(ptr+m_defaultTransforms.offset)[jointID];
 		}
-		inline core::matrix3x4SIMD& getDefaultTransformMatrix(base_t::joint_id_t jointID)
+		inline hlsl::float32_t3x4& getDefaultTransformMatrix(base_t::joint_id_t jointID)
 		{
 			assert(isMutable());
-			return const_cast<core::matrix3x4SIMD&>(const_cast<const ICPUSkeleton*>(this)->getDefaultTransformMatrix(jointID));
+			return const_cast<hlsl::float32_t3x4&>(const_cast<const ICPUSkeleton*>(this)->getDefaultTransformMatrix(jointID));
 		}
 
 		//!
diff --git a/include/nbl/asset/ISkeleton.h b/include/nbl/asset/ISkeleton.h
index 7960ca4eef..03ba3af4ea 100644
--- a/include/nbl/asset/ISkeleton.h
+++ b/include/nbl/asset/ISkeleton.h
@@ -62,7 +62,7 @@ class ISkeleton : public virtual core::IReferenceCounted
 				return;
 
 			assert(m_parentJointIDs.buffer->getSize()>=m_parentJointIDs.offset+sizeof(joint_id_t)*m_jointCount);
-			assert(m_defaultTransforms.buffer->getSize()>=m_defaultTransforms.offset+sizeof(core::matrix3x4SIMD)*m_jointCount);
+			assert(m_defaultTransforms.buffer->getSize()>=m_defaultTransforms.offset+sizeof(hlsl::float32_t3x4)*m_jointCount);
 		}
 		virtual ~ISkeleton()
 		{
diff --git a/include/nbl/asset/utils/CQuantQuaternionCache.h b/include/nbl/asset/utils/CQuantQuaternionCache.h
index 8e46dffb0a..dc8d18545a 100644
--- a/include/nbl/asset/utils/CQuantQuaternionCache.h
+++ b/include/nbl/asset/utils/CQuantQuaternionCache.h
@@ -60,7 +60,7 @@ class CQuantQuaternionCache : public CDirQuantCacheBase<impl::Projection,impl::Q
 
 	public:
 		template<E_FORMAT CacheFormat>
-		value_type_t<CacheFormat> quantize(const core::quaternion& quat)
+		value_type_t<CacheFormat> quantize(const hlsl::math::quaternion<float>& quat)
 		{
 			return Base::quantize<4u,CacheFormat>(reinterpret_cast<const core::vectorSIMDf&>(quat));
 		}
diff --git a/include/nbl/builtin/hlsl/cpp_compat/impl/intrinsics_impl.hlsl b/include/nbl/builtin/hlsl/cpp_compat/impl/intrinsics_impl.hlsl
index cd89ce45d1..87922bcb51 100644
--- a/include/nbl/builtin/hlsl/cpp_compat/impl/intrinsics_impl.hlsl
+++ b/include/nbl/builtin/hlsl/cpp_compat/impl/intrinsics_impl.hlsl
@@ -772,6 +772,17 @@ struct mul_helper<LhsT, RhsT NBL_PARTIAL_REQ_BOT(concepts::Matrix<LhsT> && conce
 	}
 };
 
+template<typename LhsT, typename RhsT>
+NBL_PARTIAL_REQ_TOP((concepts::Matrix<LhsT> && concepts::Scalar<RhsT>) || (concepts::Scalar<LhsT> && concepts::Matrix<RhsT>))
+struct mul_helper<LhsT, RhsT NBL_PARTIAL_REQ_BOT((concepts::Matrix<LhsT> && concepts::Scalar<RhsT>) || (concepts::Scalar<LhsT> && concepts::Matrix<RhsT>)) >
+{
+	using return_t = hlsl::conditional_t<hlsl::is_matrix_v<LhsT>, LhsT, RhsT>;
+	static inline return_t __call(LhsT lhs, RhsT rhs)
+	{
+		return mul(lhs, rhs);
+	}
+};
+
 #define AUTO_SPECIALIZE_HELPER_FOR_VECTOR(HELPER_NAME, REQUIREMENT, RETURN_TYPE)\
 template<typename T>\
 NBL_PARTIAL_REQ_TOP(REQUIREMENT)\
diff --git a/include/nbl/builtin/hlsl/cpp_compat/matrix.hlsl b/include/nbl/builtin/hlsl/cpp_compat/matrix.hlsl
index 1ee5edf275..b704eef834 100644
--- a/include/nbl/builtin/hlsl/cpp_compat/matrix.hlsl
+++ b/include/nbl/builtin/hlsl/cpp_compat/matrix.hlsl
@@ -44,6 +44,16 @@ struct matrix final : private glm::mat<N,M,T>
     {
         return glm::operator*(reinterpret_cast<Base const&>(rhs), lhs);
     }
+    template<typename ScalarT>
+    inline friend matrix mul(const ScalarT lhs, matrix const& rhs)
+    {
+        return matrix(glm::operator*(lhs, reinterpret_cast<Base const&>(rhs)));
+    }
+    template<typename ScalarT>
+    inline friend matrix mul(matrix const& lhs, const ScalarT rhs)
+    {
+        return matrix(glm::operator*(reinterpret_cast<Base const&>(lhs), rhs));
+    }
 };
 #endif
 
diff --git a/include/nbl/builtin/hlsl/macros.h b/include/nbl/builtin/hlsl/macros.h
index 944f06cdc9..70838c93d8 100644
--- a/include/nbl/builtin/hlsl/macros.h
+++ b/include/nbl/builtin/hlsl/macros.h
@@ -36,8 +36,10 @@ inline auto functionAlias(Args&&... args) -> decltype(origFunctionName(std::forw
 
 #ifdef __HLSL_VERSION
 #define NBL_UNROLL [[unroll]]
+#define NBL_UNROLL_LIMITED(LIMIT) [unroll(LIMIT)]
 #else
 #define NBL_UNROLL
+#define NBL_UNROLL_LIMITED(LIMIT)
 #endif
 
 #ifdef __HLSL_VERSION // cause DXC is insane
diff --git a/include/nbl/builtin/hlsl/math/linalg/matrix_utils/transformation_matrix_utils.hlsl b/include/nbl/builtin/hlsl/math/linalg/matrix_utils/transformation_matrix_utils.hlsl
new file mode 100644
index 0000000000..934a6adf44
--- /dev/null
+++ b/include/nbl/builtin/hlsl/math/linalg/matrix_utils/transformation_matrix_utils.hlsl
@@ -0,0 +1,97 @@
+#ifndef _NBL_BUILTIN_HLSL_MATH_LINALG_MATRIX_UTILS_TRANSFORMATION_MATRIX_UTILS_INCLUDED_
+#define _NBL_BUILTIN_HLSL_MATH_LINALG_MATRIX_UTILS_TRANSFORMATION_MATRIX_UTILS_INCLUDED_
+#include <nbl/builtin/hlsl/math/quaternions.hlsl>
+// TODO: remove this header when deleting vectorSIMDf.hlsl
+#ifndef __HLSL_VERSION
+#include <nbl/core/math/glslFunctions.h>
+#include "vectorSIMD.h"
+#endif
+#include <nbl/builtin/hlsl/matrix_utils/matrix_traits.hlsl>
+#include <nbl/builtin/hlsl/macros.h>
+
+namespace nbl
+{
+namespace hlsl
+{
+namespace math
+{
+namespace linalg
+{
+
+template<typename MatT>
+MatT diagonal(typename matrix_traits<MatT>::scalar_type diagonal = 1)
+{
+	MatT output;
+	output[0][1] = 124;
+	using RowT = matrix_traits<MatT>::row_type;
+
+	NBL_UNROLL for (uint32_t i = 0; i < matrix_traits<MatT>::RowCount; ++i)
+	{
+		output[i] = promote<RowT>(0.0);
+		if (matrix_traits<MatT>::ColumnCount > i)
+			output[i][i] = diagonal;
+	}
+
+	return output;
+}
+
+template<typename MatT>
+MatT identity()
+{
+	return diagonal<MatT>(1);
+}
+
+template <uint16_t NOut, uint16_t MOut, uint16_t NIn, uint16_t MIn, typename T NBL_FUNC_REQUIRES(NOut <= NIn && MOut <= MIn && NOut != 0 && MOut != 0)
+inline matrix<T, NOut, MOut> truncate(const NBL_CONST_REF_ARG(matrix<T, NIn, MIn>) inMatrix)
+{
+	matrix<T, NOut, MOut> retval;
+
+	for (uint16_t i = 0; i < NOut; ++i)
+		for (uint16_t j = 0; j < MOut; ++j)
+			retval[i][j] = inMatrix[i][j];
+
+	return retval;
+}
+
+template<typename T, int N>
+inline matrix<T, 3, 3> getSub3x3(NBL_CONST_REF_ARG(matrix<T, N, 4>) mat)
+{
+	return matrix<T, 3, 3>(mat);
+}
+
+}
+}
+
+namespace impl
+{
+	/**
+	 * @brief Enables type-safe casting between matrices of identical dimensions
+	 *        but different scalar types.
+	 */
+	template<typename ScalarTo, typename ScalarFrom, uint16_t N, uint16_t M>
+	struct static_cast_helper<matrix<ScalarTo, N, M>, matrix<ScalarFrom, N, M>, void>
+	{
+		using To = matrix<ScalarTo, N, M>;
+		using From = matrix<ScalarFrom, N, M>;
+
+		static inline To cast(From mat)
+		{
+			To retval;
+
+			NBL_UNROLL for (int i = 0; i < N; ++i)
+			{
+				NBL_UNROLL for (int j = 0; j < M; ++j)
+				{
+					retval[i][j] = hlsl::_static_cast<ScalarTo>(mat[i][j]);
+				}
+			}
+
+			return retval;
+		}
+	};
+}
+
+}
+}
+
+#endif
\ No newline at end of file
diff --git a/include/nbl/builtin/hlsl/math/linalg/transform.hlsl b/include/nbl/builtin/hlsl/math/linalg/transform.hlsl
index 59ff142150..e13a333ade 100644
--- a/include/nbl/builtin/hlsl/math/linalg/transform.hlsl
+++ b/include/nbl/builtin/hlsl/math/linalg/transform.hlsl
@@ -88,12 +88,62 @@ matrix<T, NOut, MOut> promote_affine(const matrix<T, NIn, MIn> inMatrix)
     NBL_UNROLL for (uint32_t row_i = NIn; row_i < NOut; row_i++)
     {
         retval[row_i] = promote<out_row_t>(0.0);
-        if (row_i >= MIn && row_i < MOut)
+        if (row_i < MOut)
             retval[row_i][row_i] = T(1.0);
     }
     return retval;
 }
 
+// /Arek: glm:: for normalize till dot product is fixed (ambiguity with glm namespace + linker issues)
+template<typename T>
+inline matrix<T, 3, 4> lhLookAt(
+    const vector<T, 3>& position,
+    const vector<T, 3>& target,
+    const vector<T, 3>& upVector)
+{
+    const vector<T, 3> zaxis = hlsl::normalize(target - position);
+    const vector<T, 3> xaxis = hlsl::normalize(hlsl::cross(upVector, zaxis));
+    const vector<T, 3> yaxis = hlsl::cross(zaxis, xaxis);
+
+    matrix<T, 3, 4> r;
+    r[0] = vector<T, 4>(xaxis, -hlsl::dot(xaxis, position));
+    r[1] = vector<T, 4>(yaxis, -hlsl::dot(yaxis, position));
+    r[2] = vector<T, 4>(zaxis, -hlsl::dot(zaxis, position));
+
+    return r;
+}
+
+template<typename T>
+inline matrix<T, 3, 4> rhLookAt(
+    const vector<T, 3>& position,
+    const vector<T, 3>& target,
+    const vector<T, 3>& upVector)
+{
+    const vector<T, 3> zaxis = hlsl::normalize(position - target);
+    const vector<T, 3> xaxis = hlsl::normalize(hlsl::cross(upVector, zaxis));
+    const vector<T, 3> yaxis = hlsl::cross(zaxis, xaxis);
+
+    matrix<T, 3, 4> r;
+    r[0] = vector<T, 4>(xaxis, -hlsl::dot(xaxis, position));
+    r[1] = vector<T, 4>(yaxis, -hlsl::dot(yaxis, position));
+    r[2] = vector<T, 4>(zaxis, -hlsl::dot(zaxis, position));
+
+    return r;
+}
+
+template<typename T, int16_t N, int16_t M, int16_t VecN>
+inline void setTranslation(NBL_REF_ARG(matrix<T, N, M>) outMat, NBL_CONST_REF_ARG(vector<T, VecN>) translation)
+{
+    // TODO: not sure if it will be compatible with hlsl
+    static_assert(M > 0 && N > 0);
+    static_assert(M >= VecN);
+
+    NBL_CONSTEXPR int16_t indexOfTheLastRowComponent = M - 1;
+
+    for(int i = 0; i < VecN; ++i)
+        outMat[i][indexOfTheLastRowComponent] = translation[i];
+}
+
 }
 }
 }
diff --git a/include/nbl/builtin/hlsl/math/quaternions.hlsl b/include/nbl/builtin/hlsl/math/quaternions.hlsl
new file mode 100644
index 0000000000..833601b4b8
--- /dev/null
+++ b/include/nbl/builtin/hlsl/math/quaternions.hlsl
@@ -0,0 +1,305 @@
+// Copyright (C) 2018-2025 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+#ifndef _NBL_BUILTIN_HLSL_MATH_QUATERNIONS_INCLUDED_
+#define _NBL_BUILTIN_HLSL_MATH_QUATERNIONS_INCLUDED_
+
+#include "nbl/builtin/hlsl/cpp_compat.hlsl"
+#include "nbl/builtin/hlsl/tgmath.hlsl"
+
+namespace nbl
+{
+namespace hlsl
+{
+namespace math
+{
+
+template<typename T>
+struct truncated_quaternion
+{
+    using this_t = truncated_quaternion<T>;
+    using scalar_type = T;
+    using data_type = vector<T, 3>;
+
+    static this_t create()
+    {
+        this_t q;
+        q.data = data_type(0.0, 0.0, 0.0);
+        return q;
+    }
+
+    data_type data;
+};
+
+template <typename T>
+struct quaternion
+{
+    using this_t = quaternion<T>;
+    using scalar_type = T;
+    using data_type = vector<T, 4>;
+    using vector3_type = vector<T, 3>;
+    using matrix_type = matrix<T, 3, 3>;
+
+    using AsUint = typename unsigned_integer_of_size<sizeof(scalar_type)>::type;
+
+    static this_t create()
+    {
+        this_t q;
+        q.data = data_type(0.0, 0.0, 0.0, 1.0);
+        return q;
+    }
+    
+    static this_t create(scalar_type x, scalar_type y, scalar_type z, scalar_type w)
+    {
+        this_t q;
+        q.data = data_type(x, y, z, w);
+        return q;
+    }
+
+    static this_t create(NBL_CONST_REF_ARG(this_t) other)
+    {
+        return other;
+    }
+
+    // angle: Rotation angle expressed in radians.
+    // axis: Rotation axis, must be normalized.
+    static this_t create(scalar_type angle, const vector3_type axis)
+    {
+        this_t q;
+        const scalar_type sinTheta = hlsl::sin(angle * 0.5);
+        const scalar_type cosTheta = hlsl::cos(angle * 0.5);
+        q.data = data_type(axis * sinTheta, cosTheta);
+        return q;
+    }
+
+
+    static this_t create(scalar_type pitch, scalar_type yaw, scalar_type roll)
+    {
+        const scalar_type rollDiv2 = roll * scalar_type(0.5);
+        const scalar_type sr = hlsl::sin(rollDiv2);
+        const scalar_type cr = hlsl::cos(rollDiv2);
+
+        const scalar_type pitchDiv2 = pitch * scalar_type(0.5);
+        const scalar_type sp = hlsl::sin(pitchDiv2);
+        const scalar_type cp = hlsl::cos(pitchDiv2);
+
+        const scalar_type yawDiv2 = yaw * scalar_type(0.5);
+        const scalar_type sy = hlsl::sin(yawDiv2);
+        const scalar_type cy = hlsl::cos(yawDiv2);
+
+        this_t output;
+        output.data[0] = cr * sp * cy + sr * cp * sy; // x
+        output.data[1] = cr * cp * sy - sr * sp * cy; // y
+        output.data[2] = sr * cp * cy - cr * sp * sy; // z
+        output.data[3] = cr * cp * cy + sr * sp * sy; // w
+
+        return output;
+    }
+
+    static this_t create(NBL_CONST_REF_ARG(matrix_type) m)
+    {
+        const scalar_type m00 = m[0][0], m11 = m[1][1], m22 = m[2][2];
+        const scalar_type neg_m00 = bit_cast<scalar_type>(bit_cast<AsUint>(m00)^0x80000000u);
+        const scalar_type neg_m11 = bit_cast<scalar_type>(bit_cast<AsUint>(m11)^0x80000000u);
+        const scalar_type neg_m22 = bit_cast<scalar_type>(bit_cast<AsUint>(m22)^0x80000000u);
+        const data_type Qx = data_type(m00, m00, neg_m00, neg_m00);
+        const data_type Qy = data_type(m11, neg_m11, m11, neg_m11);
+        const data_type Qz = data_type(m22, neg_m22, neg_m22, m22);
+
+        const data_type tmp = hlsl::promote<data_type>(1.0) + Qx + Qy + Qz;
+        const data_type invscales = hlsl::promote<data_type>(0.5) / hlsl::sqrt(tmp);
+        const data_type scales = tmp * invscales * hlsl::promote<data_type>(0.5);
+
+        // TODO: speed this up
+        this_t retval;
+        if (tmp.x > scalar_type(0.0))
+        {
+            retval.data.x = (m[2][1] - m[1][2]) * invscales.x;
+            retval.data.y = (m[0][2] - m[2][0]) * invscales.x;
+            retval.data.z = (m[1][0] - m[0][1]) * invscales.x;
+            retval.data.w = scales.x;
+        }
+        else
+        {
+            if (tmp.y > scalar_type(0.0))
+            {
+                retval.data.x = scales.y;
+                retval.data.y = (m[0][1] + m[1][0]) * invscales.y;
+                retval.data.z = (m[2][0] + m[0][2]) * invscales.y;
+                retval.data.w = (m[2][1] - m[1][2]) * invscales.y;
+            }
+            else if (tmp.z > scalar_type(0.0))
+            {
+                retval.data.x = (m[0][1] + m[1][0]) * invscales.z;
+                retval.data.y = scales.z;
+                retval.data.z = (m[0][2] - m[2][0]) * invscales.z;
+                retval.data.w = (m[1][2] + m[2][1]) * invscales.z;
+            }
+            else
+            {
+                retval.data.x = (m[0][2] + m[2][0]) * invscales.w;
+                retval.data.y = (m[1][2] + m[2][1]) * invscales.w;
+                retval.data.z = scales.w;
+                retval.data.w = (m[1][0] - m[0][1]) * invscales.w;
+            }
+        }
+
+        retval.data = hlsl::normalize(retval.data);
+        return retval;
+    }
+
+    static this_t create(NBL_CONST_REF_ARG(truncated_quaternion<T>) first3Components)
+    {
+        this_t retval;
+        retval.data.xyz = first3Components.data;
+        retval.data.w = hlsl::sqrt(scalar_type(1.0) - hlsl::dot(first3Components.data, first3Components.data));
+        return retval;
+    }
+
+    this_t operator*(scalar_type scalar)
+    {
+        this_t output;
+        output.data = data * scalar;
+        return output;
+    }
+
+    this_t operator*(NBL_CONST_REF_ARG(this_t) other)
+    {
+        return this_t::create(
+            data.w * other.data.w - data.x * other.x - data.y * other.data.y - data.z * other.data.z,
+            data.w * other.data.x + data.x * other.w + data.y * other.data.z - data.z * other.data.y,
+            data.w * other.data.y - data.x * other.z + data.y * other.data.w + data.z * other.data.x,
+            data.w * other.data.z + data.x * other.y - data.y * other.data.x + data.z * other.data.w
+        );
+    }
+
+    static this_t lerp(const this_t start, const this_t end, const scalar_type fraction, const scalar_type totalPseudoAngle)
+    {
+        const AsUint negationMask = hlsl::bit_cast<AsUint>(totalPseudoAngle) & AsUint(0x80000000u);
+        const data_type adjEnd = hlsl::bit_cast<scalar_type>(hlsl::bit_cast<AsUint>(end.data) ^ negationMask);
+
+        this_t retval;
+        retval.data = hlsl::mix(start.data, adjEnd, fraction);
+        return retval;
+    }
+
+    static this_t lerp(const this_t start, const this_t end, const scalar_type fraction)
+    {
+        return lerp(start, end, fraction, hlsl::dot(start.data, end.data));
+    }
+
+    static scalar_type __adj_interpolant(const scalar_type angle, const scalar_type fraction, const scalar_type interpolantPrecalcTerm2, const scalar_type interpolantPrecalcTerm3)
+    {
+        const scalar_type A = scalar_type(1.0904) + angle * (scalar_type(-3.2452) + angle * (scalar_type(3.55645) - angle * scalar_type(1.43519)));
+        const scalar_type B = scalar_type(0.848013) + angle * (scalar_type(-1.06021) + angle * scalar_type(0.215638));
+        const scalar_type k = A * interpolantPrecalcTerm2 + B;
+        return fraction + interpolantPrecalcTerm3 * k;
+    }
+
+    static this_t flerp(const this_t start, const this_t end, const scalar_type fraction)
+    {
+        const scalar_type pseudoAngle = hlsl::dot(start.data,end.data);
+        const scalar_type interpolantPrecalcTerm = fraction - scalar_type(0.5);
+        const scalar_type interpolantPrecalcTerm3 = fraction * interpolantPrecalcTerm * (fraction - scalar_type(1.0));
+        const scalar_type adjFrac = __adj_interpolant(hlsl::abs(pseudoAngle),fraction,interpolantPrecalcTerm*interpolantPrecalcTerm,interpolantPrecalcTerm3);
+        
+        this_t retval = lerp(start,end,adjFrac,pseudoAngle);
+        retval.data = hlsl::normalize(retval.data);
+        return retval;
+    }
+
+    vector3_type transformVector(const vector3_type v)
+    {
+        scalar_type scale = hlsl::length(data);
+        vector3_type direction = data.xyz;
+        return v * scale + hlsl::cross(direction, v * data.w + hlsl::cross(direction, v)) * scalar_type(2.0);
+    }
+
+    matrix_type constructMatrix() NBL_CONST_MEMBER_FUNC
+    {
+        matrix_type mat;
+        mat[0] = data.yzx * data.ywz + data.zxy * data.zyw * vector3_type( 1.0, 1.0,-1.0);
+        mat[1] = data.yzx * data.xzw + data.zxy * data.wxz * vector3_type(-1.0, 1.0, 1.0);
+        mat[2] = data.yzx * data.wyx + data.zxy * data.xwy * vector3_type( 1.0,-1.0, 1.0);
+        mat[0][0] = scalar_type(0.5) - mat[0][0];
+        mat[1][1] = scalar_type(0.5) - mat[1][1];
+        mat[2][2] = scalar_type(0.5) - mat[2][2];
+        mat = nbl::hlsl::mul(mat, scalar_type(2.0));
+        return hlsl::transpose(mat);    // TODO: double check transpose?
+    }
+
+    static vector3_type slerp_delta(const vector3_type start, const vector3_type preScaledWaypoint, scalar_type cosAngleFromStart)
+    {
+        vector3_type planeNormal = hlsl::cross(start,preScaledWaypoint);
+    
+        cosAngleFromStart *= scalar_type(0.5);
+        const scalar_type sinAngle = hlsl::sqrt(scalar_type(0.5) - cosAngleFromStart);
+        const scalar_type cosAngle = hlsl::sqrt(scalar_type(0.5) + cosAngleFromStart);
+        
+        planeNormal *= sinAngle;
+        const vector3_type precompPart = hlsl::cross(planeNormal, start) * scalar_type(2.0);
+
+        return precompPart * cosAngle + hlsl::cross(planeNormal, precompPart);
+    }
+
+    this_t inverse()
+    {
+        this_t retval;
+        retval.data.x = bit_cast<scalar_type>(bit_cast<AsUint>(data.x)^0x80000000u);
+        retval.data.y = bit_cast<scalar_type>(bit_cast<AsUint>(data.y)^0x80000000u);
+        retval.data.z = bit_cast<scalar_type>(bit_cast<AsUint>(data.z)^0x80000000u);
+        retval.data.w = data.w;
+        return retval;
+    }
+
+    static this_t normalize(NBL_CONST_REF_ARG(this_t) q)
+    {
+        this_t retval;
+        retval.data = hlsl::normalize(q.data);
+        return retval;
+    }
+
+    data_type data;
+};
+
+}
+
+namespace impl
+{
+
+template<typename T>
+struct static_cast_helper<math::quaternion<T>, math::truncated_quaternion<T> >
+{
+    static inline math::quaternion<T> cast(math::truncated_quaternion<T> q)
+    {
+        return math::quaternion<T>::create(q);
+    }
+};
+
+template<typename T>
+struct static_cast_helper<math::truncated_quaternion<T>, math::quaternion<T> >
+{
+    static inline math::truncated_quaternion<T> cast(math::quaternion<T> q)
+    {
+        math::truncated_quaternion<T> t;
+        t.data.x = t.data.x;
+        t.data.y = t.data.y;
+        t.data.z = t.data.z;
+        return t;
+    }
+};
+
+template<typename T>
+struct static_cast_helper<matrix<T,3,3>, math::quaternion<T> >
+{
+    static inline matrix<T,3,3> cast(math::quaternion<T> q)
+    {
+        return q.constructMatrix();
+    }
+};
+}
+
+}
+}
+
+#endif
diff --git a/include/nbl/builtin/hlsl/math/thin_lens_projection.hlsl b/include/nbl/builtin/hlsl/math/thin_lens_projection.hlsl
new file mode 100644
index 0000000000..70c46fdb37
--- /dev/null
+++ b/include/nbl/builtin/hlsl/math/thin_lens_projection.hlsl
@@ -0,0 +1,88 @@
+#ifndef _NBL_BUILTIN_HLSL_MATH_THIN_LENS_PROJECTION_INCLUDED_
+#define _NBL_BUILTIN_HLSL_MATH_THIN_LENS_PROJECTION_INCLUDED_
+
+#include <nbl/builtin/hlsl/cpp_compat.hlsl>
+#include <nbl/builtin/hlsl/concepts.hlsl>
+
+namespace nbl
+{
+namespace hlsl
+{
+namespace math
+{
+namespace thin_lens
+{
+
+template<typename FloatingPoint NBL_FUNC_REQUIRES(concepts::FloatingPoint<FloatingPoint>)
+inline matrix<FloatingPoint, 4, 4> rhPerspectiveFovMatrix(FloatingPoint fieldOfViewRadians, FloatingPoint aspectRatio, FloatingPoint zNear, FloatingPoint zFar)
+{
+	const FloatingPoint h = core::reciprocal<FloatingPoint>(tan(fieldOfViewRadians * 0.5f));
+	_NBL_DEBUG_BREAK_IF(aspectRatio == 0.f); //division by zero
+	const float w = h / aspectRatio;
+
+	_NBL_DEBUG_BREAK_IF(zNear == zFar); //division by zero
+
+	matrix<FloatingPoint, 4, 4> m;
+	m[0] = vector<FloatingPoint, 4>(w, 0.f, 0.f, 0.f);
+	m[1] = vector<FloatingPoint, 4>(0.f, -h, 0.f, 0.f);
+	m[2] = vector<FloatingPoint, 4>(0.f, 0.f, -zFar / (zFar - zNear), -zNear * zFar / (zFar - zNear));
+	m[3] = vector<FloatingPoint, 4>(0.f, 0.f, -1.f, 0.f);
+
+	return m;
+}
+template<typename FloatingPoint NBL_FUNC_REQUIRES(concepts::FloatingPoint<FloatingPoint>)
+inline matrix<FloatingPoint, 4, 4> lhPerspectiveFovMatrix(FloatingPoint fieldOfViewRadians, FloatingPoint aspectRatio, FloatingPoint zNear, FloatingPoint zFar)
+{
+	const FloatingPoint h = core::reciprocal<FloatingPoint>(tan(fieldOfViewRadians * 0.5f));
+	_NBL_DEBUG_BREAK_IF(aspectRatio == 0.f); //division by zero
+	const float w = h / aspectRatio;
+
+	_NBL_DEBUG_BREAK_IF(zNear == zFar); //division by zero
+
+	matrix<FloatingPoint, 4, 4> m;
+	m[0] = vector<FloatingPoint, 4>(w, 0.f, 0.f, 0.f);
+	m[1] = vector<FloatingPoint, 4>(0.f, -h, 0.f, 0.f);
+	m[2] = vector<FloatingPoint, 4>(0.f, 0.f, zFar / (zFar - zNear), -zNear * zFar / (zFar - zNear));
+	m[3] = vector<FloatingPoint, 4>(0.f, 0.f, 1.f, 0.f);
+
+	return m;
+}
+
+template<typename FloatingPoint  NBL_FUNC_REQUIRES(concepts::FloatingPoint<FloatingPoint>)
+inline matrix<FloatingPoint, 4, 4> rhProjectionOrthoMatrix(FloatingPoint widthOfViewVolume, FloatingPoint heightOfViewVolume, FloatingPoint zNear, FloatingPoint zFar)
+{
+	_NBL_DEBUG_BREAK_IF(widthOfViewVolume == 0.f); //division by zero
+	_NBL_DEBUG_BREAK_IF(heightOfViewVolume == 0.f); //division by zero
+	_NBL_DEBUG_BREAK_IF(zNear == zFar); //division by zero
+
+	matrix<FloatingPoint, 4, 4> m;
+	m[0] = vector<FloatingPoint, 4>(2.f / widthOfViewVolume, 0.f, 0.f, 0.f);
+	m[1] = vector<FloatingPoint, 4>(0.f, -2.f / heightOfViewVolume, 0.f, 0.f);
+	m[2] = vector<FloatingPoint, 4>(0.f, 0.f, -1.f / (zFar - zNear), -zNear / (zFar - zNear));
+	m[3] = vector<FloatingPoint, 4>(0.f, 0.f, 0.f, 1.f);
+
+	return m;
+}
+
+template<typename FloatingPoint NBL_FUNC_REQUIRES(concepts::FloatingPoint<FloatingPoint>)
+inline matrix<FloatingPoint, 4, 4> lhProjectionOrthoMatrix(FloatingPoint widthOfViewVolume, FloatingPoint heightOfViewVolume, FloatingPoint zNear, FloatingPoint zFar)
+{
+	_NBL_DEBUG_BREAK_IF(widthOfViewVolume == 0.f); //division by zero
+	_NBL_DEBUG_BREAK_IF(heightOfViewVolume == 0.f); //division by zero
+	_NBL_DEBUG_BREAK_IF(zNear == zFar); //division by zero
+
+	matrix<FloatingPoint, 4, 4> m;
+	m[0] = vector<FloatingPoint, 4>(2.f / widthOfViewVolume, 0.f, 0.f, 0.f);
+	m[1] = vector<FloatingPoint, 4>(0.f, -2.f / heightOfViewVolume, 0.f, 0.f);
+	m[2] = vector<FloatingPoint, 4>(0.f, 0.f, 1.f / (zFar - zNear), -zNear / (zFar - zNear));
+	m[3] = vector<FloatingPoint, 4>(0.f, 0.f, 0.f, 1.f);
+
+	return m;
+}
+
+}
+}
+}
+}
+
+#endif
\ No newline at end of file
diff --git a/include/nbl/builtin/hlsl/matrix_utils/matrix_traits.hlsl b/include/nbl/builtin/hlsl/matrix_utils/matrix_traits.hlsl
index f9c031c8e7..f554be7abe 100644
--- a/include/nbl/builtin/hlsl/matrix_utils/matrix_traits.hlsl
+++ b/include/nbl/builtin/hlsl/matrix_utils/matrix_traits.hlsl
@@ -21,48 +21,18 @@ struct matrix_traits
     NBL_CONSTEXPR_STATIC_INLINE bool IsMatrix = false;
 };
 
-// i choose to implement it this way because of this DXC bug: https://github.com/microsoft/DirectXShaderCompiler/issues/7007
-#define DEFINE_MATRIX_TRAITS_TEMPLATE_SPECIALIZATION(ROW_COUNT, COLUMN_COUNT) \
-template<typename T> \
-struct matrix_traits<matrix<T, ROW_COUNT, COLUMN_COUNT> > \
-{ \
-    using scalar_type = T; \
-    using row_type = vector<T, COLUMN_COUNT>; \
-    using transposed_type = matrix<T, COLUMN_COUNT, ROW_COUNT>; \
-    NBL_CONSTEXPR_STATIC_INLINE uint32_t RowCount = ROW_COUNT; \
-    NBL_CONSTEXPR_STATIC_INLINE uint32_t ColumnCount = COLUMN_COUNT; \
-    NBL_CONSTEXPR_STATIC_INLINE bool Square = RowCount == ColumnCount; \
-    NBL_CONSTEXPR_STATIC_INLINE bool IsMatrix = true; \
-};
-
-DEFINE_MATRIX_TRAITS_TEMPLATE_SPECIALIZATION(1, 2)
-DEFINE_MATRIX_TRAITS_TEMPLATE_SPECIALIZATION(1, 3)
-DEFINE_MATRIX_TRAITS_TEMPLATE_SPECIALIZATION(1, 4)
-DEFINE_MATRIX_TRAITS_TEMPLATE_SPECIALIZATION(2, 1)
-DEFINE_MATRIX_TRAITS_TEMPLATE_SPECIALIZATION(2, 2)
-DEFINE_MATRIX_TRAITS_TEMPLATE_SPECIALIZATION(2, 3)
-DEFINE_MATRIX_TRAITS_TEMPLATE_SPECIALIZATION(2, 4)
-DEFINE_MATRIX_TRAITS_TEMPLATE_SPECIALIZATION(3, 1)
-DEFINE_MATRIX_TRAITS_TEMPLATE_SPECIALIZATION(3, 2)
-DEFINE_MATRIX_TRAITS_TEMPLATE_SPECIALIZATION(3, 3)
-DEFINE_MATRIX_TRAITS_TEMPLATE_SPECIALIZATION(3, 4)
-DEFINE_MATRIX_TRAITS_TEMPLATE_SPECIALIZATION(4, 1)
-DEFINE_MATRIX_TRAITS_TEMPLATE_SPECIALIZATION(4, 2)
-DEFINE_MATRIX_TRAITS_TEMPLATE_SPECIALIZATION(4, 3)
-DEFINE_MATRIX_TRAITS_TEMPLATE_SPECIALIZATION(4, 4)
-
-#undef DEFINE_MATRIX_TRAITS_TEMPLATE_SPECIALIZATION
-
 // TODO: when this bug: https://github.com/microsoft/DirectXShaderCompiler/issues/7007 is fixed, uncomment and delete template specializations
-/*template<typename T, uint32_t N, uint32_t M>
+template<typename T, uint32_t N, uint32_t M>
 struct matrix_traits<matrix<T,N,M> >
 {
     using scalar_type = T;
-    NBL_CONSTEXPR_STATIC_INLINE uint32_t RowCount = ROW_COUNT;
-    NBL_CONSTEXPR_STATIC_INLINE uint32_t ColumnCount = COLUMN_COUNT;
+    using row_type = vector<T, M>;
+    using transposed_type = matrix<T, M, N>;
+    NBL_CONSTEXPR_STATIC_INLINE uint32_t RowCount = N;
+    NBL_CONSTEXPR_STATIC_INLINE uint32_t ColumnCount = M;
     NBL_CONSTEXPR_STATIC_INLINE bool Square = RowCount == ColumnCount;
+    NBL_CONSTEXPR_STATIC_INLINE bool IsMatrix = true;
 };
-*/
 
 }
 }
diff --git a/include/nbl/core/declarations.h b/include/nbl/core/declarations.h
index 9aa708a793..466ea988aa 100644
--- a/include/nbl/core/declarations.h
+++ b/include/nbl/core/declarations.h
@@ -50,7 +50,6 @@
 #include "nbl/core/math/colorutil.h"
 #include "nbl/core/math/rational.h"
 #include "nbl/core/math/plane3dSIMD.h"
-#include "nbl/core/math/matrixutil.h"
 // memory
 #include "nbl/core/memory/memory.h"
 #include "nbl/core/memory/new_delete.h"
diff --git a/include/nbl/core/definitions.h b/include/nbl/core/definitions.h
index c08af6ad74..5913c2c8f2 100644
--- a/include/nbl/core/definitions.h
+++ b/include/nbl/core/definitions.h
@@ -15,8 +15,4 @@
 #include "nbl/core/math/floatutil.tcc"
 #include "nbl/core/math/glslFunctions.tcc"
 
-// implementations [deprecated]
-#include "matrix3x4SIMD_impl.h"
-#include "matrix4SIMD_impl.h"
-
 #endif
\ No newline at end of file
diff --git a/include/nbl/core/math/floatutil.tcc b/include/nbl/core/math/floatutil.tcc
index 71c8bd2da7..f20db5dec2 100644
--- a/include/nbl/core/math/floatutil.tcc
+++ b/include/nbl/core/math/floatutil.tcc
@@ -5,9 +5,8 @@
 #ifndef __NBL_CORE_FLOAT_UTIL_TCC_INCLUDED__
 #define __NBL_CORE_FLOAT_UTIL_TCC_INCLUDED__
 
-
+#include "vectorSIMD.h"
 #include "nbl/core/math/floatutil.h"
-#include "matrix4SIMD.h"
 
 namespace nbl
 {
@@ -29,16 +28,6 @@ NBL_FORCE_INLINE vectorSIMDf ROUNDING_ERROR<vectorSIMDf>()
 {
 	return vectorSIMDf(ROUNDING_ERROR<float>());
 }
-template<>
-NBL_FORCE_INLINE matrix3x4SIMD ROUNDING_ERROR<matrix3x4SIMD>()
-{
-	return matrix3x4SIMD(ROUNDING_ERROR<vectorSIMDf>(),ROUNDING_ERROR<vectorSIMDf>(),ROUNDING_ERROR<vectorSIMDf>());
-}
-template<>
-NBL_FORCE_INLINE matrix4SIMD ROUNDING_ERROR<matrix4SIMD>()
-{
-	return matrix4SIMD(ROUNDING_ERROR<vectorSIMDf>(),ROUNDING_ERROR<vectorSIMDf>(),ROUNDING_ERROR<vectorSIMDf>(),ROUNDING_ERROR<vectorSIMDf>());
-}
 template<typename T>
 NBL_FORCE_INLINE T ROUNDING_ERROR()
 {
diff --git a/include/nbl/core/math/glslFunctions.tcc b/include/nbl/core/math/glslFunctions.tcc
index 205585965b..b8326b41d1 100644
--- a/include/nbl/core/math/glslFunctions.tcc
+++ b/include/nbl/core/math/glslFunctions.tcc
@@ -8,7 +8,6 @@
 #include "nbl/core/declarations.h"
 
 #include "nbl/core/math/floatutil.tcc"
-#include "matrix4SIMD.h"
 
 #include <cmath>
 #include <numeric>
@@ -280,21 +279,6 @@ NBL_FORCE_INLINE vectorSIMDf cross<vectorSIMDf>(const vectorSIMDf& a, const vect
 #endif
 }
 
-template<>
-NBL_FORCE_INLINE matrix4SIMD transpose(const matrix4SIMD& m)
-{
-	core::matrix4SIMD retval;
-	__m128 a0 = m.rows[0].getAsRegister(), a1 = m.rows[1].getAsRegister(), a2 = m.rows[2].getAsRegister(), a3 = m.rows[3].getAsRegister();
-	_MM_TRANSPOSE4_PS(a0, a1, a2, a3);
-	retval.rows[0] = a0;
-	retval.rows[1] = a1;
-	retval.rows[2] = a2;
-	retval.rows[3] = a3;
-	return retval;
-}
-
-
-
 template<>
 NBL_FORCE_INLINE bool equals<vectorSIMDf>(const vectorSIMDf& a, const vectorSIMDf& b, const vectorSIMDf& tolerance)
 {
@@ -307,22 +291,6 @@ NBL_FORCE_INLINE bool equals(const core::vector3df& a, const core::vector3df& b,
 	auto la = a-tolerance;
 	return ha.X>=b.X&&ha.Y>=b.Y&&ha.Z>=b.Z && la.X<=b.X&&la.Y<=b.Y&&la.Z<=b.Z;
 }
-template<>
-NBL_FORCE_INLINE bool equals<matrix4SIMD>(const matrix4SIMD& a, const matrix4SIMD& b, const matrix4SIMD& tolerance)
-{
-	for (size_t i = 0u; i<matrix4SIMD::VectorCount; ++i)
-		if (!equals<vectorSIMDf>(a.rows[i], b.rows[i], tolerance.rows[i]))
-			return false;
-	return true;
-}
-template<>
-NBL_FORCE_INLINE bool equals<matrix3x4SIMD>(const matrix3x4SIMD& a, const matrix3x4SIMD& b, const matrix3x4SIMD& tolerance)
-{
-	for (size_t i = 0u; i<matrix3x4SIMD::VectorCount; ++i)
-		if (!equals<vectorSIMDf>(a.rows[i], b.rows[i], tolerance[i]))
-			return false;
-	return true;
-}
 template<typename T>
 NBL_FORCE_INLINE bool equals(const T& a, const T& b, const T& tolerance)
 {
diff --git a/include/nbl/core/math/matrixutil.h b/include/nbl/core/math/matrixutil.h
deleted file mode 100644
index afe7955c9b..0000000000
--- a/include/nbl/core/math/matrixutil.h
+++ /dev/null
@@ -1,29 +0,0 @@
-// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O.
-// This file is part of the "Nabla Engine".
-// For conditions of distribution and use, see copyright notice in nabla.h
-
-#ifndef _NBL_MATRIX_UTIL_H_INCLUDED_
-#define _NBL_MATRIX_UTIL_H_INCLUDED_
-
-#include "matrix4SIMD.h"
-#include "matrix3x4SIMD.h"
-
-namespace nbl::core
-{
-
-
-//! TODO: OPTIMIZE THIS, DON'T PROMOTE THE MATRIX IF DON'T HAVE TO
-inline matrix4SIMD concatenateBFollowedByA(const matrix4SIMD& _a, const matrix3x4SIMD& _b)
-{
-    return concatenateBFollowedByA(_a, matrix4SIMD(_b));
-}
-/*
-inline matrix4SIMD concatenateBFollowedByAPrecisely(const matrix4SIMD& _a, const matrix3x4SIMD& _b)
-{
-    return concatenateBFollowedByAPrecisely(_a, matrix4SIMD(_b));
-}
-*/
-
-}
-
-#endif
diff --git a/include/nbl/core/math/plane3dSIMD.h b/include/nbl/core/math/plane3dSIMD.h
index 891ed1300c..edad0a1287 100644
--- a/include/nbl/core/math/plane3dSIMD.h
+++ b/include/nbl/core/math/plane3dSIMD.h
@@ -3,11 +3,13 @@
 // For conditions of distribution and use, see copyright notice in nabla.h
 // See the original file in irrlicht source for authors
 
+#include "vectorSIMD.h"
+#include <nbl/builtin/hlsl/math/linalg/matrix_utils/transformation_matrix_utils.hlsl>
+#include <nbl/builtin/hlsl/math/linalg/transform.hlsl>
+
 #ifndef __NBL_CORE_PLANE_3D_H_INCLUDED__
 #define __NBL_CORE_PLANE_3D_H_INCLUDED__
 
-#include "matrix3x4SIMD.h"
-
 namespace nbl
 {
 namespace core
@@ -99,14 +101,21 @@ class plane3dSIMDf : private vectorSIMDf
 		}
 
 		//!
-        static inline plane3dSIMDf transform(const plane3dSIMDf& _in, const matrix3x4SIMD& _mat)
+        static inline plane3dSIMDf transform(const plane3dSIMDf& _in, const hlsl::float32_t3x4& _mat)
         {
-            matrix3x4SIMD inv;
-            _mat.getInverse(inv);
+			hlsl::float32_t4x4 inv = hlsl::inverse(hlsl::math::linalg::promote_affine<4, 4, 3, 4>(_mat));
 
             vectorSIMDf normal(_in.getNormal());
             // transform by inverse transpose
-            return plane3dSIMDf(inv.rows[0]*normal.xxxx()+inv.rows[1]*normal.yyyy()+inv.rows[2]*normal.zzzz()+(normal.wwww()&BUILD_MASKF(0,0,0,1)));
+			hlsl::float32_t4 planeEq = inv[0] * hlsl::float32_t4(normal.x, normal.x, normal.x, normal.x) +
+				inv[1] * hlsl::float32_t4(normal.y, normal.y, normal.y, normal.y) +
+				inv[2] * hlsl::float32_t4(normal.z, normal.z, normal.z, normal.z) +
+				(hlsl::float32_t4(0, 0, 0, normal.w));
+			vectorSIMDf planeEqSIMD;
+			for (int i = 0; i < 4; ++i)
+				planeEqSIMD[i] = planeEq[i];
+
+			return plane3dSIMDf(planeEqSIMD);
 		    #undef BUILD_MASKF
         }
 
diff --git a/include/nbl/ext/Bullet/BulletUtility.h b/include/nbl/ext/Bullet/BulletUtility.h
index 507adbceda..450c20c50d 100644
--- a/include/nbl/ext/Bullet/BulletUtility.h
+++ b/include/nbl/ext/Bullet/BulletUtility.h
@@ -64,8 +64,8 @@ namespace Bullet3
         return convert<const btVector4&, const core::vectorSIMDf&>(vec);
     }
 
-    inline core::matrix3x4SIMD convertbtTransform(const btTransform &trans) {
-        core::matrix3x4SIMD mat;
+    inline hlsl::float32_t3x4 convertbtTransform(const btTransform &trans) {
+        hlsl::float32_t3x4 mat;
 
         for (uint32_t i = 0; i < 3u; ++i) {
             mat.rows[i] = frombtVec3(trans.getBasis().getRow(i));
@@ -75,7 +75,7 @@ namespace Bullet3
         return mat;
     }
 
-    inline btTransform convertMatrixSIMD(const core::matrix3x4SIMD &mat) {
+    inline btTransform convertMatrixSIMD(const hlsl::float32_t3x4 &mat) {
         btTransform transform;
         
         //Calling makeSafe3D on rows erases translation so save it
diff --git a/include/nbl/ext/Bullet/CPhysicsWorld.h b/include/nbl/ext/Bullet/CPhysicsWorld.h
index d6529a2565..cfaf70d6d6 100644
--- a/include/nbl/ext/Bullet/CPhysicsWorld.h
+++ b/include/nbl/ext/Bullet/CPhysicsWorld.h
@@ -24,7 +24,7 @@ class CPhysicsWorld : public core::IReferenceCounted
         struct RigidBodyData
         {
             btCollisionShape *shape;
-            core::matrix3x4SIMD trans;
+            hlsl::float32_t3x4 trans;
             core::vectorSIMDf inertia;
             float mass;
         };
diff --git a/include/nbl/ext/DebugDraw/CDraw3DLine.h b/include/nbl/ext/DebugDraw/CDraw3DLine.h
index 68cd64e9c1..86b874f9d1 100644
--- a/include/nbl/ext/DebugDraw/CDraw3DLine.h
+++ b/include/nbl/ext/DebugDraw/CDraw3DLine.h
@@ -33,7 +33,7 @@ class CDraw3DLine : public core::IReferenceCounted
 		}
 
 
-		void setData(const core::matrix4SIMD& viewProjMat, const core::vector<std::pair<S3DLineVertex, S3DLineVertex>>& linesData)
+		void setData(const hlsl::float32_t4x4& viewProjMat, const core::vector<std::pair<S3DLineVertex, S3DLineVertex>>& linesData)
 		{
 			m_viewProj = viewProjMat;
 			m_lines = linesData;
@@ -45,7 +45,7 @@ class CDraw3DLine : public core::IReferenceCounted
 			m_lines.clear();
 		}
 
-		void setLine(const core::matrix4SIMD& viewProjMat,
+		void setLine(const hlsl::float32_t4x4& viewProjMat,
 			float fromX, float fromY, float fromZ,
 			float toX, float toY, float toZ,
 			float r, float g, float b, float a
@@ -54,7 +54,7 @@ class CDraw3DLine : public core::IReferenceCounted
 			m_lines = core::vector<std::pair<S3DLineVertex, S3DLineVertex>>{ std::pair(S3DLineVertex{{ fromX, fromY, fromZ }, { r, g, b, a }}, S3DLineVertex{{ toX, toY, toZ }, { r, g, b, a }}) };
 		}
 
-		void addLine(const core::matrix4SIMD& viewProjMat,
+		void addLine(const hlsl::float32_t4x4& viewProjMat,
 			float fromX, float fromY, float fromZ,
 			float toX, float toY, float toZ,
 			float r, float g, float b, float a
@@ -73,7 +73,7 @@ class CDraw3DLine : public core::IReferenceCounted
 			m_lines.insert(m_lines.end(), linesData.begin(), linesData.end());
 		}
 
-		void setViewProjMatrix(const core::matrix4SIMD& viewProjMat)
+		void setViewProjMatrix(const hlsl::float32_t4x4& viewProjMat)
 		{
 			m_viewProj = viewProjMat;
 		}
@@ -91,7 +91,7 @@ class CDraw3DLine : public core::IReferenceCounted
 		*/
 		void recordToCommandBuffer(video::IGPUCommandBuffer* cmdBuffer, video::IGPUGraphicsPipeline* graphics_pipeline);
 
-		inline void addBox(const core::aabbox3df& box, float r, float g, float b, float a, const core::matrix3x4SIMD& tform=core::matrix3x4SIMD())
+		inline void addBox(const core::aabbox3df& box, float r, float g, float b, float a, const hlsl::float32_t3x4& tform=hlsl::float32_t3x4())
 		{
 			auto addLine = [&](auto s, auto e) -> void
 			{
@@ -128,7 +128,7 @@ class CDraw3DLine : public core::IReferenceCounted
 		core::smart_refctd_ptr<video::ILogicalDevice> m_device;
 		core::smart_refctd_ptr<video::IGPUBuffer> m_linesBuffer =  nullptr;
 		core::smart_refctd_ptr<video::IGPURenderpassIndependentPipeline> m_rpindependent_pipeline;
-		core::matrix4SIMD m_viewProj;
+		hlsl::float32_t4x4 m_viewProj;
 		core::vector<std::pair<S3DLineVertex, S3DLineVertex>> m_lines;
         const uint32_t alignments[1] = { sizeof(S3DLineVertex) };
 };
diff --git a/include/nbl/scene/ISkinInstanceCache.h b/include/nbl/scene/ISkinInstanceCache.h
index 6cb18160d4..2eb83b4aac 100644
--- a/include/nbl/scene/ISkinInstanceCache.h
+++ b/include/nbl/scene/ISkinInstanceCache.h
@@ -19,7 +19,7 @@ class ISkinInstanceCache : public virtual core::IReferenceCounted
 
 		// main pseudo-pool properties
 		using joint_t = ITransformTree::node_t;
-		using skinning_matrix_t = core::matrix3x4SIMD;
+		using skinning_matrix_t = hlsl::float32_t3x4;
 		using recomputed_stamp_t = ITransformTree::recomputed_stamp_t;
 		using inverse_bind_pose_offset_t = uint32_t;
 
@@ -35,7 +35,7 @@ class ISkinInstanceCache : public virtual core::IReferenceCounted
 		static inline constexpr uint32_t inverse_bind_pose_offset_prop_ix = 3u;
 
 		// for the inverse bind pose pool
-		using inverse_bind_pose_t = core::matrix3x4SIMD;
+		using inverse_bind_pose_t = hlsl::float32_t3x4;
 		static inline constexpr uint32_t inverse_bind_pose_prop_ix = 0u;
 		
 
diff --git a/include/nbl/scene/ISkinInstanceCacheManager.h b/include/nbl/scene/ISkinInstanceCacheManager.h
index 5a5e3f5881..474a8a3eaa 100644
--- a/include/nbl/scene/ISkinInstanceCacheManager.h
+++ b/include/nbl/scene/ISkinInstanceCacheManager.h
@@ -466,7 +466,7 @@ class ISkinInstanceCacheManager : public virtual core::IReferenceCounted
 		}
 		struct DebugPushConstants
 		{
-			core::matrix4SIMD viewProjectionMatrix;
+			hlsl::float32_t4x4 viewProjectionMatrix;
 			core::vector4df_SIMD lineColor;
 			core::vector3df aabbColor;
 			uint32_t skinCount;
diff --git a/include/nbl/video/IGPUAccelerationStructure.h b/include/nbl/video/IGPUAccelerationStructure.h
index 1bb4fb0c66..3c10a255a2 100644
--- a/include/nbl/video/IGPUAccelerationStructure.h
+++ b/include/nbl/video/IGPUAccelerationStructure.h
@@ -272,7 +272,7 @@ class IGPUBottomLevelAccelerationStructure : public asset::IBottomLevelAccelerat
 							// https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#VUID-vkCmdBuildAccelerationStructuresIndirectKHR-pInfos-03809
 							// https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#VUID-vkCmdBuildAccelerationStructuresIndirectKHR-pInfos-03810
 							// https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#VUID-vkBuildAccelerationStructuresKHR-pInfos-03773
-							if (Base::invalidInputBuffer(geometry.transform,buildRangeInfo.transformByteOffset,1u,sizeof(core::matrix3x4SIMD),sizeof(core::vectorSIMDf)))
+							if (Base::invalidInputBuffer(geometry.transform,buildRangeInfo.transformByteOffset,1u,sizeof(hlsl::float32_t3x4),sizeof(core::vectorSIMDf)))
 								return false;
 						}
 						else
@@ -622,7 +622,7 @@ class IGPUTopLevelAccelerationStructure : public asset::ITopLevelAccelerationStr
 				inline PolymorphicInstance(const PolymorphicInstance<blas_ref_t>&) = default;
 				inline PolymorphicInstance(PolymorphicInstance<blas_ref_t>&&) = default;
 
-				// I made all these assignment operators because of the `core::matrix3x4SIMD` alignment and keeping `type` correct at all times
+				// I made all these assignment operators because of the `hlsl::float32_t3x4` alignment and keeping `type` correct at all times
 				inline PolymorphicInstance<blas_ref_t>& operator=(const StaticInstance<blas_ref_t>& _static)
 				{
 					type = INSTANCE_TYPE::STATIC;
@@ -657,7 +657,7 @@ class IGPUTopLevelAccelerationStructure : public asset::ITopLevelAccelerationStr
 				static_assert(std::is_same_v<std::underlying_type_t<INSTANCE_TYPE>,uint32_t>);
 				// these must be 0 as per vulkan spec
 				uint32_t reservedMotionFlags = 0u;
-				// I don't do an actual union because the preceeding members don't play nicely with alignment of `core::matrix3x4SIMD` and Vulkan requires this struct to be packed
+				// I don't do an actual union because the preceeding members don't play nicely with alignment of `hlsl::float32_t3x4` and Vulkan requires this struct to be packed
 				SRTMotionInstance<blas_ref_t> largestUnionMember = {};
 				static_assert(alignof(SRTMotionInstance<blas_ref_t>)==8ull);
 
diff --git a/include/quaternion.h b/include/quaternion.h
deleted file mode 100644
index c1867235db..0000000000
--- a/include/quaternion.h
+++ /dev/null
@@ -1,462 +0,0 @@
-// Copyright (C) 2019 - DevSH Graphics Programming Sp. z O.O.
-// This file is part of the "Nabla Engine" and was originally part of the "Irrlicht Engine"
-// For conditions of distribution and use, see copyright notice in nabla.h
-// See the original file in irrlicht source for authors
-
-#ifndef __NBL_QUATERNION_H_INCLUDED__
-#define __NBL_QUATERNION_H_INCLUDED__
-
-
-#include "vectorSIMD.h"
-
-#include "nbl/core/math/glslFunctions.h"
-
-
-namespace nbl
-{
-namespace core
-{
-
-class matrix3x4SIMD;
-
-
-//! Quaternion class for representing rotations.
-/** It provides cheap combinations and avoids gimbal locks.
-Also useful for interpolations. */
-class quaternion : private vectorSIMDf
-{
-	public:
-		//! Default Constructor
-		inline quaternion() : vectorSIMDf(0,0,0,1) {}
-
-		inline quaternion(const quaternion& other) : vectorSIMDf(static_cast<const vectorSIMDf&>(other)) {}
-
-		inline quaternion(const float* data) : vectorSIMDf(data) {}
-
-		//! Constructor
-		inline quaternion(const float& x, const float& y, const float& z, const float& w) : vectorSIMDf(x,y,z,w) { }
-
-		//! Constructor which converts euler angles (radians) to a quaternion
-		inline quaternion(const float& pitch, const float& yaw, const float& roll) {set(pitch,yaw,roll);}
-
-		//! Constructor which converts a matrix to a quaternion
-		explicit quaternion(const matrix3x4SIMD& m);
-
-      		inline float* getPointer() {return pointer;}
-
-		//! Equalilty operator
-		inline vector4db_SIMD operator==(const quaternion& other) const {return vectorSIMDf::operator==(other);}
-
-		//! inequality operator
-		inline vector4db_SIMD operator!=(const quaternion& other) const {return vectorSIMDf::operator!=(other);}
-
-		//! Assignment operator
-		inline quaternion& operator=(const quaternion& other) {return reinterpret_cast<quaternion&>(vectorSIMDf::operator=(other));}
-
-		//! Multiplication operator with scalar
-		inline quaternion operator*(const float& s) const
-		{
-		    quaternion tmp;
-		    reinterpret_cast<vectorSIMDf&>(tmp) = reinterpret_cast<const vectorSIMDf*>(this)->operator*(s);
-		    return tmp;
-		}
-
-		//! Multiplication operator with scalar
-		inline quaternion& operator*=(const float& s)
-		{
-		    *this = (*this)*s;
-		    return *this;
-		}
-
-		//! Multiplication operator
-		inline quaternion& operator*=(const quaternion& other)
-		{
-		    *this = (*this)*other;
-		    return *this;
-		}
-
-		//! Multiplication operator
-		//http://momchil-velikov.blogspot.fr/2013/10/fast-sse-quternion-multiplication.html
-		inline quaternion operator*(const quaternion& other) const
-        {
-            __m128 xyzw = vectorSIMDf::getAsRegister();
-            __m128 abcd = reinterpret_cast<const vectorSIMDf&>(other).getAsRegister();
-
-          __m128 t0 = FAST_FLOAT_SHUFFLE(abcd, _MM_SHUFFLE (3, 3, 3, 3)); /* 1, 0.5 */
-          __m128 t1 = FAST_FLOAT_SHUFFLE(xyzw, _MM_SHUFFLE (2, 3, 0, 1)); /* 1, 0.5 */
-
-          __m128 t3 = FAST_FLOAT_SHUFFLE(abcd, _MM_SHUFFLE (0, 0, 0, 0)); /* 1, 0.5 */
-          __m128 t4 = FAST_FLOAT_SHUFFLE(xyzw, _MM_SHUFFLE (1, 0, 3, 2)); /* 1, 0.5 */
-
-          __m128 t5 = FAST_FLOAT_SHUFFLE(abcd, _MM_SHUFFLE (1, 1, 1, 1)); /* 1, 0.5 */
-          __m128 t6 = FAST_FLOAT_SHUFFLE(xyzw, _MM_SHUFFLE (2, 0, 3, 1)); /* 1, 0.5 */
-
-          /* [d,d,d,d]*[z,w,x,y] = [dz,dw,dx,dy] */
-          __m128 m0 = _mm_mul_ps (t0, t1); /* 5/4, 1 */
-
-          /* [a,a,a,a]*[y,x,w,z] = [ay,ax,aw,az]*/
-          __m128 m1 = _mm_mul_ps (t3, t4); /* 5/4, 1 */
-
-          /* [b,b,b,b]*[z,x,w,y] = [bz,bx,bw,by]*/
-          __m128 m2 = _mm_mul_ps (t5, t6); /* 5/4, 1 */
-
-          /* [c,c,c,c]*[w,z,x,y] = [cw,cz,cx,cy] */
-          __m128 t7 = FAST_FLOAT_SHUFFLE(abcd, _MM_SHUFFLE (2, 2, 2, 2)); /* 1, 0.5 */
-          __m128 t8 = FAST_FLOAT_SHUFFLE(xyzw, _MM_SHUFFLE (3, 2, 0, 1)); /* 1, 0.5 */
-
-          __m128 m3 = _mm_mul_ps (t7, t8); /* 5/4, 1 */
-
-          /* 1 */
-          /* [dz,dw,dx,dy]+-[ay,ax,aw,az] = [dz+ay,dw-ax,dx+aw,dy-az] */
-          __m128 e = _mm_addsub_ps (m0, m1); /* 3, 1 */
-
-          /* 2 */
-          /* [dx+aw,dz+ay,dy-az,dw-ax] */
-          e = FAST_FLOAT_SHUFFLE(e, _MM_SHUFFLE (1, 3, 0, 2)); /* 1, 0.5 */
-
-          /* [dx+aw,dz+ay,dy-az,dw-ax]+-[bz,bx,bw,by] = [dx+aw+bz,dz+ay-bx,dy-az+bw,dw-ax-by]*/
-          e = _mm_addsub_ps (e, m2); /* 3, 1 */
-
-          /* 2 */
-          /* [dz+ay-bx,dw-ax-by,dy-az+bw,dx+aw+bz] */
-          e = FAST_FLOAT_SHUFFLE(e, _MM_SHUFFLE (2, 0, 1, 3)); /* 1, 0.5 */
-
-          /* [dz+ay-bx,dw-ax-by,dy-az+bw,dx+aw+bz]+-[cw,cz,cx,cy]
-             = [dz+ay-bx+cw,dw-ax-by-cz,dy-az+bw+cx,dx+aw+bz-cy] */
-          e = _mm_addsub_ps (e, m3); /* 3, 1 */
-
-          /* 2 */
-          /* [dw-ax-by-cz,dz+ay-bx+cw,dy-az+bw+cx,dx+aw+bz-cy] */
-          quaternion tmp;
-          reinterpret_cast<vectorSIMDf&>(tmp) = FAST_FLOAT_SHUFFLE(e, _MM_SHUFFLE (2, 3, 1, 0)); /* 1, 0.5 */
-          return tmp;
-        }
-
-        inline vectorSIMDf transformVect(const vectorSIMDf& vec)
-        {
-            vectorSIMDf direction = *reinterpret_cast<const vectorSIMDf*>(this);
-            vectorSIMDf scale = core::length(direction);
-            direction.makeSafe3D();
-
-            return scale*vec+cross(direction,vec*W+cross(direction,vec))*2.f;
-        }
-
-		//! Sets new quaternion
-		inline quaternion& set(const vectorSIMDf& xyzw)
-		{
-		    *this = reinterpret_cast<const quaternion&>(xyzw);
-		    return *this;
-		}
-
-		//! Sets new quaternion based on euler angles (radians)
-		inline quaternion& set(const float& roll, const float& pitch, const float& yaw);
-
-		//! Sets new quaternion from other quaternion
-		inline quaternion& set(const quaternion& quat)
-		{
-		    *this = quat;
-		    return *this;
-		}
-
-		//! Inverts this quaternion
-		inline void makeInverse()
-		{
-		    reinterpret_cast<vectorSIMDf&>(*this) ^= _mm_set_epi32(0x0u,0x80000000u,0x80000000u,0x80000000u);
-		}
-
-		//! Fills an angle (radians) around an axis (unit vector)
-		void toAngleAxis(float& angle, vector3df_SIMD& axis) const;
-
-		//! Output this quaternion to an euler angle (radians)
-		void toEuler(vector3df_SIMD& euler) const;
-
-		//! Set quaternion to identity
-		inline void makeIdentity() {vectorSIMDf::set(0,0,0,1);}
-
-
-        vectorSIMDf& getData() {return *((vectorSIMDf*)this);}
-
-//statics
-        inline static quaternion normalize(const quaternion& in)
-        {
-            quaternion tmp;
-            reinterpret_cast<vectorSIMDf&>(tmp) = core::normalize(reinterpret_cast<const vectorSIMDf&>(in));
-            return tmp;
-        }
-
-        //! Helper func
-		static quaternion lerp(const quaternion &q1, const quaternion &q2, const float& interpolant, const bool& wrongDoubleCover);
-
-		//! Set this quaternion to the linear interpolation between two quaternions
-		/** \param q1 First quaternion to be interpolated.
-		\param q2 Second quaternion to be interpolated.
-		\param interpolant Progress of interpolation. For interpolant=0 the result is
-		q1, for interpolant=1 the result is q2. Otherwise interpolation
-		between q1 and q2.
-		*/
-		static quaternion lerp(const quaternion &q1, const quaternion &q2, const float& interpolant);
-
-        //! Helper func
-		static inline void flerp_interpolant_terms(float& interpolantPrecalcTerm2, float& interpolantPrecalcTerm3, const float& interpolant)
-        {
-            interpolantPrecalcTerm2 = (interpolant - 0.5f) * (interpolant - 0.5f);
-            interpolantPrecalcTerm3 = interpolant * (interpolant - 0.5f) * (interpolant - 1.f);
-        }
-
-		static float flerp_adjustedinterpolant(const float& angle, const float& interpolant, const float& interpolantPrecalcTerm2, const float& interpolantPrecalcTerm3);
-
-		//! Set this quaternion to the approximate slerp between two quaternions
-		/** \param q1 First quaternion to be interpolated.
-		\param q2 Second quaternion to be interpolated.
-		\param interpolant Progress of interpolation. For interpolant=0 the result is
-		q1, for interpolant=1 the result is q2. Otherwise interpolation
-		between q1 and q2.
-		*/
-		static quaternion flerp(const quaternion &q1, const quaternion &q2, const float& interpolant);
-
-		//! Set this quaternion to the result of the spherical interpolation between two quaternions
-		/** \param q1 First quaternion to be interpolated.
-		\param q2 Second quaternion to be interpolated.
-		\param time Progress of interpolation. For interpolant=0 the result is
-		q1, for interpolant=1 the result is q2. Otherwise interpolation
-		between q1 and q2.
-		\param threshold To avoid inaccuracies the
-		interpolation switches to linear interpolation at some point.
-		This value defines how much of the interpolation will
-		be calculated with lerp.
-		*/
-		static quaternion slerp(const quaternion& q1, const quaternion& q2,
-				const float& interpolant, const float& threshold=.05f);
-
-		inline static quaternion fromEuler(const vector3df_SIMD& euler)
-		{
-		    quaternion tmp;
-		    tmp.set(euler.X,euler.Y,euler.Z);
-		    return tmp;
-        }
-
-		inline static quaternion fromEuler(const vector3df& euler)
-		{
-		    quaternion tmp;
-		    tmp.set(euler.X,euler.Y,euler.Z);
-		    return tmp;
-        }
-
-		//! Set quaternion to represent a rotation from one vector to another.
-		static quaternion rotationFromTo(const vector3df_SIMD& from, const vector3df_SIMD& to);
-
-		//! Create quaternion from rotation angle and rotation axis.
-		/** Axis must be unit length.
-		The quaternion representing the rotation is
-		q = cos(A/2)+sin(A/2)*(x*i+y*j+z*k).
-		\param angle Rotation Angle in radians.
-		\param axis Rotation axis. */
-		static quaternion fromAngleAxis(const float& angle, const vector3df_SIMD& axis);
-};
-static_assert(sizeof(quaternion) == sizeof(vectorSIMDf), "Quaternion not same size as vec4");
-
-
-// set this quaternion to the result of the linear interpolation between two quaternions
-inline quaternion quaternion::lerp(const quaternion &q1, const quaternion &q2, const float& interpolant, const bool& wrongDoubleCover)
-{
-    vectorSIMDf retval;
-	if (wrongDoubleCover)
-        retval = mix<vectorSIMDf>(reinterpret_cast<const vectorSIMDf&>(q1),-reinterpret_cast<const vectorSIMDf&>(q2),vectorSIMDf(interpolant));
-    else
-        retval = mix<vectorSIMDf>(reinterpret_cast<const vectorSIMDf&>(q1), reinterpret_cast<const vectorSIMDf&>(q2),vectorSIMDf(interpolant));
-    return reinterpret_cast<const quaternion&>(retval);
-}
-
-// set this quaternion to the result of the linear interpolation between two quaternions
-inline quaternion quaternion::lerp(const quaternion &q1, const quaternion &q2, const float& interpolant)
-{
-	const float angle = dot<vectorSIMDf>(q1,q2)[0];
-    return lerp(q1,q2,interpolant,angle < 0.0f);
-}
-
-// Arseny Kapoulkine
-inline float quaternion::flerp_adjustedinterpolant(const float& angle, const float& interpolant, const float& interpolantPrecalcTerm2, const float& interpolantPrecalcTerm3)
-{
-    float A = 1.0904f + angle * (-3.2452f + angle * (3.55645f - angle * 1.43519f));
-    float B = 0.848013f + angle * (-1.06021f + angle * 0.215638f);
-    float k = A * interpolantPrecalcTerm2 + B;
-    float ot = interpolant + interpolantPrecalcTerm3 * k;
-    return ot;
-}
-
-// set this quaternion to the result of an approximate slerp
-inline quaternion quaternion::flerp(const quaternion &q1, const quaternion &q2, const float& interpolant)
-{
-	const float angle = dot<vectorSIMDf>(q1,q2)[0];
-    return lerp(q1,q2,flerp_adjustedinterpolant(fabsf(angle),interpolant,(interpolant - 0.5f) * (interpolant - 0.5f),interpolant * (interpolant - 0.5f) * (interpolant - 1.f)),angle < 0.0f);
-}
-
-
-// set this quaternion to the result of the interpolation between two quaternions
-inline quaternion quaternion::slerp(const quaternion &q1, const quaternion &q2, const float& interpolant, const float& threshold)
-{
-	float angle = dot<vectorSIMDf>(q1,q2)[0];
-
-	// make sure we use the short rotation
-	bool wrongDoubleCover = angle < 0.0f;
-	if (wrongDoubleCover)
-		angle *= -1.f;
-
-	if (angle <= (1.f-threshold)) // spherical interpolation
-	{ // acosf + sinf
-        vectorSIMDf retval;
-
-		const float sinARcp  = inversesqrt(1.f-angle*angle);
-		const float sinAt = sinf(acosf(angle) * interpolant); // could this line be optimized?
-		//1sqrt 3min/add 5mul from now on
-		const float sinAt_over_sinA = sinAt*sinARcp;
-
-        const float scale = core::sqrt(1.f-sinAt*sinAt)-angle*sinAt_over_sinA; //cosAt-cos(A)sin(tA)/sin(A) = (sin(A)cos(tA)-cos(A)sin(tA))/sin(A)
-        if (wrongDoubleCover) // make sure we use the short rotation
-            retval = reinterpret_cast<const vectorSIMDf&>(q1)*scale - reinterpret_cast<const vectorSIMDf&>(q2)*sinAt_over_sinA;
-        else
-            retval = reinterpret_cast<const vectorSIMDf&>(q1)*scale + reinterpret_cast<const vectorSIMDf&>(q2)*sinAt_over_sinA;
-
-        return reinterpret_cast<const quaternion&>(retval);
-	}
-	else
-        return normalize(lerp(q1,q2,interpolant,wrongDoubleCover));
-}
-
-
-#if !NBL_TEST_BROKEN_QUATERNION_USE
-//! axis must be unit length, angle in radians
-inline quaternion quaternion::fromAngleAxis(const float& angle, const vector3df_SIMD& axis)
-{
-	const float fHalfAngle = 0.5f*angle;
-	const float fSin = sinf(fHalfAngle);
-	quaternion retval;
-	reinterpret_cast<vectorSIMDf&>(retval) = axis*fSin;
-	reinterpret_cast<vectorSIMDf&>(retval).W = cosf(fHalfAngle);
-	return retval;
-}
-
-
-inline void quaternion::toAngleAxis(float& angle, vector3df_SIMD &axis) const
-{
-    vectorSIMDf scale = core::length(*reinterpret_cast<const vectorSIMDf*>(this));
-
-	if (scale.X==0.f)
-	{
-		angle = 0.0f;
-		axis.X = 0.0f;
-		axis.Y = 1.0f;
-		axis.Z = 0.0f;
-	}
-	else
-	{
-	    axis = reinterpret_cast<const vectorSIMDf*>(this)->operator/(scale);
-		angle = 2.f * acosf(axis.W);
-
-	    axis.makeSafe3D();
-	}
-}
-
-inline void quaternion::toEuler(vector3df_SIMD& euler) const
-{
-	vectorSIMDf sqr = *reinterpret_cast<const vectorSIMDf*>(this);
-	sqr *= sqr;
-	const double test = 2.0 * (Y*W - X*Z);
-
-	if (core::equals(test, 1.0, 0.000001))
-	{
-		// heading = rotation about z-axis
-		euler.Z = (float) (-2.0*atan2(X, W));
-		// bank = rotation about x-axis
-		euler.X = 0;
-		// attitude = rotation about y-axis
-		euler.Y = core::HALF_PI<float>();
-	}
-	else if (core::equals(test, -1.0, 0.000001))
-	{
-		// heading = rotation about z-axis
-		euler.Z = (float) (2.0*atan2(X, W));
-		// bank = rotation about x-axis
-		euler.X = 0;
-		// attitude = rotation about y-axis
-		euler.Y = -core::HALF_PI<float>();
-	}
-	else
-	{
-		// heading = rotation about z-axis
-		euler.Z = (float) atan2(2.0 * (X*Y +Z*W),(sqr.X - sqr.Y - sqr.Z + sqr.W));
-		// bank = rotation about x-axis
-		euler.X = (float) atan2(2.0 * (Y*Z +X*W),(-sqr.X - sqr.Y + sqr.Z + sqr.W));
-		// attitude = rotation about y-axis
-		euler.Y = (float) asin( core::clamp(test, -1.0, 1.0) );
-	}
-}
-
-inline quaternion quaternion::rotationFromTo(const vector3df_SIMD& from, const vector3df_SIMD& to)
-{
-	// Based on Stan Melax's article in Game Programming Gems
-	// Copy, since cannot modify local
-	vector3df_SIMD v0 = from;
-	vector3df_SIMD v1 = to;
-	v0 = core::normalize(v0);
-	v1 = core::normalize(v1);
-
-	const vectorSIMDf dddd = core::dot(v0,v1);
-	quaternion tmp;
-	if (dddd.X >= 1.0f) // If dot == 1, vectors are the same
-	{
-		return tmp;
-	}
-	else if (dddd.X <= -1.0f) // exactly opposite
-	{
-		vector3df_SIMD axis(1.0f, 0.f, 0.f);
-		axis = cross(axis,v0);
-		if (length(axis)[0]==0.f)
-		{
-			axis.set(0.f,1.f,0.f);
-			axis = cross(axis,v0);
-		}
-		// same as fromAngleAxis(PI, axis).normalize();
-		reinterpret_cast<vectorSIMDf&>(tmp) = axis;
-		return normalize(tmp);
-	}
-
-    vectorSIMDf s = core::sqrt(vectorSIMDf(2.f,2.f,2.f,0.f)+dddd*2.f);
-	reinterpret_cast<vectorSIMDf&>(tmp) = cross(v0,v1)*reciprocal_approxim(s);
-	tmp.W = s.X*0.5f;
-    return normalize(tmp);
-}
-#endif
-
-// sets new quaternion based on euler angles
-inline quaternion& quaternion::set(const float& roll, const float& pitch, const float& yaw)
-{
-	float angle;
-
-	angle = roll * 0.5f;
-	const float sr = sinf(angle);
-	const float cr = cosf(angle);
-
-	angle = pitch * 0.5f;
-	const float sp = sinf(angle);
-	const float cp = cos(angle);
-
-	angle = yaw * 0.5f;
-	const float sy = sinf(angle);
-	const float cy = cosf(angle);
-
-	const float cpcy = cp * cy;
-	const float spcy = sp * cy;
-	const float cpsy = cp * sy;
-	const float spsy = sp * sy;
-
-    *reinterpret_cast<vectorSIMDf*>(this) = vectorSIMDf(sr,cr,cr,cr)*vectorSIMDf(cpcy,spcy,cpsy,cpcy)+vectorSIMDf(-cr,sr,-sr,sr)*vectorSIMDf(spsy,cpsy,spcy,spsy);
-
-	return *this;
-}
-
-} // end namespace core
-} // end namespace nbl
-
-#endif
-
diff --git a/include/vectorSIMD.h b/include/vectorSIMD.h
index 9b09f95c97..6144dc446f 100644
--- a/include/vectorSIMD.h
+++ b/include/vectorSIMD.h
@@ -887,6 +887,42 @@ namespace core
 		}
 	};
 
+	// temporary solution until vectorSIMD gets deleted
+	inline hlsl::float32_t4 convertToHLSLVector(const vectorSIMDf& vec)
+	{
+		hlsl::float32_t4 retval;
+		retval.x = vec.x;
+		retval.y = vec.y;
+		retval.z = vec.z;
+		retval.w = vec.w;
+
+		return retval;
+	}
+
+	// temporary solution until vectorSIMD gets deleted
+	inline vectorSIMDf constructVecorSIMDFromHLSLVector(const hlsl::float32_t4& vec)
+	{
+		vectorSIMDf retval;
+		retval.x = vec.x;
+		retval.y = vec.y;
+		retval.z = vec.z;
+		retval.w = vec.w;
+
+		return retval;
+	}
+
+	// temporary solution until vectorSIMD gets deleted
+	inline vectorSIMDf constructVecorSIMDFromHLSLVector(const hlsl::float32_t3& vec)
+	{
+		vectorSIMDf retval;
+		retval.x = vec.x;
+		retval.y = vec.y;
+		retval.z = vec.z;
+		retval.w = 0.0f;
+
+		return retval;
+	}
+
 } // end namespace core
 } // end namespace nbl
 
diff --git a/src/nbl/builtin/CMakeLists.txt b/src/nbl/builtin/CMakeLists.txt
index e8798499f9..a92311a5c9 100644
--- a/src/nbl/builtin/CMakeLists.txt
+++ b/src/nbl/builtin/CMakeLists.txt
@@ -158,7 +158,6 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/ieee754/impl.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/array_accessors.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/vector_utils/vector_traits.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/matrix_utils/matrix_traits.hlsl")
-
 #spirv intrinsics
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/spirv_intrinsics/core.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/spirv_intrinsics/fragment_shader_pixel_interlock.hlsl")
@@ -219,18 +218,21 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/format.hlsl")
 #linear algebra
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/math/linalg/fast_affine.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/math/linalg/transform.hlsl")
+LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/math/linalg/matrix_utils/transformation_matrix_utils.hlsl")
 # TODO: rename `equations` to `polynomials` probably
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/math/functions.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/math/geometry.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/math/intutil.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/math/polar.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/math/angle_adding.hlsl")
+LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/math/quaternions.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/math/equations/quadratic.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/math/equations/cubic.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/math/equations/quartic.hlsl")
 #extra math
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/math/quadrature/gauss_legendre/gauss_legendre.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/math/quadrature/gauss_legendre/impl.hlsl")
+LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/math/thin_lens_projection.hlsl")
 #acceleration structures
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/acceleration_structures.hlsl")
 #colorspace