Skip to content
...@@ -10,7 +10,7 @@ ...@@ -10,7 +10,7 @@
#ifndef EIGEN_PARALLELIZER_H #ifndef EIGEN_PARALLELIZER_H
#define EIGEN_PARALLELIZER_H #define EIGEN_PARALLELIZER_H
namespace Eigen { namespace Eigen {
namespace internal { namespace internal {
...@@ -49,8 +49,8 @@ inline void initParallel() ...@@ -49,8 +49,8 @@ inline void initParallel()
{ {
int nbt; int nbt;
internal::manage_multi_threading(GetAction, &nbt); internal::manage_multi_threading(GetAction, &nbt);
std::ptrdiff_t l1, l2; std::ptrdiff_t l1, l2, l3;
internal::manage_caching_sizes(GetAction, &l1, &l2); internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
} }
/** \returns the max number of threads reserved for Eigen /** \returns the max number of threads reserved for Eigen
...@@ -73,17 +73,17 @@ namespace internal { ...@@ -73,17 +73,17 @@ namespace internal {
template<typename Index> struct GemmParallelInfo template<typename Index> struct GemmParallelInfo
{ {
GemmParallelInfo() : sync(-1), users(0), rhs_start(0), rhs_length(0) {} GemmParallelInfo() : sync(-1), users(0), lhs_start(0), lhs_length(0) {}
int volatile sync; Index volatile sync;
int volatile users; int volatile users;
Index rhs_start; Index lhs_start;
Index rhs_length; Index lhs_length;
}; };
template<bool Condition, typename Functor, typename Index> template<bool Condition, typename Functor, typename Index>
void parallelize_gemm(const Functor& func, Index rows, Index cols, bool transpose) void parallelize_gemm(const Functor& func, Index rows, Index cols, Index depth, bool transpose)
{ {
// TODO when EIGEN_USE_BLAS is defined, // TODO when EIGEN_USE_BLAS is defined,
// we should still enable OMP for other scalar types // we should still enable OMP for other scalar types
...@@ -92,6 +92,7 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, bool transpos ...@@ -92,6 +92,7 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, bool transpos
// the matrix product when multithreading is enabled. This is a temporary // the matrix product when multithreading is enabled. This is a temporary
// fix to support row-major destination matrices. This whole // fix to support row-major destination matrices. This whole
// parallelizer mechanism has to be redisigned anyway. // parallelizer mechanism has to be redisigned anyway.
EIGEN_UNUSED_VARIABLE(depth);
EIGEN_UNUSED_VARIABLE(transpose); EIGEN_UNUSED_VARIABLE(transpose);
func(0,rows, 0,cols); func(0,rows, 0,cols);
#else #else
...@@ -102,56 +103,56 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, bool transpos ...@@ -102,56 +103,56 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, bool transpos
// - we are not already in a parallel code // - we are not already in a parallel code
// - the sizes are large enough // - the sizes are large enough
// 1- are we already in a parallel session? // compute the maximal number of threads from the size of the product:
// FIXME omp_get_num_threads()>1 only works for openmp, what if the user does not use openmp? // This first heuristic takes into account that the product kernel is fully optimized when working with nr columns at once.
if((!Condition) || (omp_get_num_threads()>1)) Index size = transpose ? rows : cols;
return func(0,rows, 0,cols); Index pb_max_threads = std::max<Index>(1,size / Functor::Traits::nr);
Index size = transpose ? cols : rows; // compute the maximal number of threads from the total amount of work:
double work = static_cast<double>(rows) * static_cast<double>(cols) *
static_cast<double>(depth);
double kMinTaskSize = 50000; // FIXME improve this heuristic.
pb_max_threads = std::max<Index>(1, std::min<Index>(pb_max_threads, work / kMinTaskSize));
// 2- compute the maximal number of threads from the size of the product: // compute the number of threads we are going to use
// FIXME this has to be fine tuned Index threads = std::min<Index>(nbThreads(), pb_max_threads);
Index max_threads = std::max<Index>(1,size / 32);
// 3 - compute the number of threads we are going to use // if multi-threading is explicitely disabled, not useful, or if we already are in a parallel session,
Index threads = std::min<Index>(nbThreads(), max_threads); // then abort multi-threading
// FIXME omp_get_num_threads()>1 only works for openmp, what if the user does not use openmp?
if(threads==1) if((!Condition) || (threads==1) || (omp_get_num_threads()>1))
return func(0,rows, 0,cols); return func(0,rows, 0,cols);
Eigen::initParallel(); Eigen::initParallel();
func.initParallelSession(); func.initParallelSession(threads);
if(transpose) if(transpose)
std::swap(rows,cols); std::swap(rows,cols);
GemmParallelInfo<Index>* info = new GemmParallelInfo<Index>[threads]; ei_declare_aligned_stack_constructed_variable(GemmParallelInfo<Index>,info,threads,0);
#pragma omp parallel num_threads(threads) #pragma omp parallel num_threads(threads)
{ {
Index i = omp_get_thread_num(); Index i = omp_get_thread_num();
// Note that the actual number of threads might be lower than the number of request ones. // Note that the actual number of threads might be lower than the number of request ones.
Index actual_threads = omp_get_num_threads(); Index actual_threads = omp_get_num_threads();
Index blockCols = (cols / actual_threads) & ~Index(0x3); Index blockCols = (cols / actual_threads) & ~Index(0x3);
Index blockRows = (rows / actual_threads) & ~Index(0x7); Index blockRows = (rows / actual_threads);
blockRows = (blockRows/Functor::Traits::mr)*Functor::Traits::mr;
Index r0 = i*blockRows; Index r0 = i*blockRows;
Index actualBlockRows = (i+1==actual_threads) ? rows-r0 : blockRows; Index actualBlockRows = (i+1==actual_threads) ? rows-r0 : blockRows;
Index c0 = i*blockCols; Index c0 = i*blockCols;
Index actualBlockCols = (i+1==actual_threads) ? cols-c0 : blockCols; Index actualBlockCols = (i+1==actual_threads) ? cols-c0 : blockCols;
info[i].rhs_start = c0; info[i].lhs_start = r0;
info[i].rhs_length = actualBlockCols; info[i].lhs_length = actualBlockRows;
if(transpose) if(transpose) func(c0, actualBlockCols, 0, rows, info);
func(0, cols, r0, actualBlockRows, info); else func(0, rows, c0, actualBlockCols, info);
else
func(r0, actualBlockRows, 0,cols, info);
} }
delete[] info;
#endif #endif
} }
......
...@@ -15,7 +15,7 @@ namespace Eigen { ...@@ -15,7 +15,7 @@ namespace Eigen {
namespace internal { namespace internal {
// pack a selfadjoint block diagonal for use with the gebp_kernel // pack a selfadjoint block diagonal for use with the gebp_kernel
template<typename Scalar, typename Index, int Pack1, int Pack2, int StorageOrder> template<typename Scalar, typename Index, int Pack1, int Pack2_dummy, int StorageOrder>
struct symm_pack_lhs struct symm_pack_lhs
{ {
template<int BlockRows> inline template<int BlockRows> inline
...@@ -45,25 +45,32 @@ struct symm_pack_lhs ...@@ -45,25 +45,32 @@ struct symm_pack_lhs
} }
void operator()(Scalar* blockA, const Scalar* _lhs, Index lhsStride, Index cols, Index rows) void operator()(Scalar* blockA, const Scalar* _lhs, Index lhsStride, Index cols, Index rows)
{ {
enum { PacketSize = packet_traits<Scalar>::size };
const_blas_data_mapper<Scalar,Index,StorageOrder> lhs(_lhs,lhsStride); const_blas_data_mapper<Scalar,Index,StorageOrder> lhs(_lhs,lhsStride);
Index count = 0; Index count = 0;
Index peeled_mc = (rows/Pack1)*Pack1; //Index peeled_mc3 = (rows/Pack1)*Pack1;
for(Index i=0; i<peeled_mc; i+=Pack1)
{ const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
pack<Pack1>(blockA, lhs, cols, i, count); const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
} const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0;
if(rows-peeled_mc>=Pack2) if(Pack1>=3*PacketSize)
{ for(Index i=0; i<peeled_mc3; i+=3*PacketSize)
pack<Pack2>(blockA, lhs, cols, peeled_mc, count); pack<3*PacketSize>(blockA, lhs, cols, i, count);
peeled_mc += Pack2;
} if(Pack1>=2*PacketSize)
for(Index i=peeled_mc3; i<peeled_mc2; i+=2*PacketSize)
pack<2*PacketSize>(blockA, lhs, cols, i, count);
if(Pack1>=1*PacketSize)
for(Index i=peeled_mc2; i<peeled_mc1; i+=1*PacketSize)
pack<1*PacketSize>(blockA, lhs, cols, i, count);
// do the same with mr==1 // do the same with mr==1
for(Index i=peeled_mc; i<rows; i++) for(Index i=peeled_mc1; i<rows; i++)
{ {
for(Index k=0; k<i; k++) for(Index k=0; k<i; k++)
blockA[count++] = lhs(i, k); // normal blockA[count++] = lhs(i, k); // normal
blockA[count++] = numext::real(lhs(i, i)); // real (diagonal) blockA[count++] = numext::real(lhs(i, i)); // real (diagonal)
...@@ -82,7 +89,8 @@ struct symm_pack_rhs ...@@ -82,7 +89,8 @@ struct symm_pack_rhs
Index end_k = k2 + rows; Index end_k = k2 + rows;
Index count = 0; Index count = 0;
const_blas_data_mapper<Scalar,Index,StorageOrder> rhs(_rhs,rhsStride); const_blas_data_mapper<Scalar,Index,StorageOrder> rhs(_rhs,rhsStride);
Index packet_cols = (cols/nr)*nr; Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
// first part: normal case // first part: normal case
for(Index j2=0; j2<k2; j2+=nr) for(Index j2=0; j2<k2; j2+=nr)
...@@ -91,79 +99,151 @@ struct symm_pack_rhs ...@@ -91,79 +99,151 @@ struct symm_pack_rhs
{ {
blockB[count+0] = rhs(k,j2+0); blockB[count+0] = rhs(k,j2+0);
blockB[count+1] = rhs(k,j2+1); blockB[count+1] = rhs(k,j2+1);
if (nr==4) if (nr>=4)
{ {
blockB[count+2] = rhs(k,j2+2); blockB[count+2] = rhs(k,j2+2);
blockB[count+3] = rhs(k,j2+3); blockB[count+3] = rhs(k,j2+3);
} }
if (nr>=8)
{
blockB[count+4] = rhs(k,j2+4);
blockB[count+5] = rhs(k,j2+5);
blockB[count+6] = rhs(k,j2+6);
blockB[count+7] = rhs(k,j2+7);
}
count += nr; count += nr;
} }
} }
// second part: diagonal block // second part: diagonal block
for(Index j2=k2; j2<(std::min)(k2+rows,packet_cols); j2+=nr) Index end8 = nr>=8 ? (std::min)(k2+rows,packet_cols8) : k2;
if(nr>=8)
{ {
// again we can split vertically in three different parts (transpose, symmetric, normal) for(Index j2=k2; j2<end8; j2+=8)
// transpose
for(Index k=k2; k<j2; k++)
{ {
blockB[count+0] = numext::conj(rhs(j2+0,k)); // again we can split vertically in three different parts (transpose, symmetric, normal)
blockB[count+1] = numext::conj(rhs(j2+1,k)); // transpose
if (nr==4) for(Index k=k2; k<j2; k++)
{ {
blockB[count+0] = numext::conj(rhs(j2+0,k));
blockB[count+1] = numext::conj(rhs(j2+1,k));
blockB[count+2] = numext::conj(rhs(j2+2,k)); blockB[count+2] = numext::conj(rhs(j2+2,k));
blockB[count+3] = numext::conj(rhs(j2+3,k)); blockB[count+3] = numext::conj(rhs(j2+3,k));
blockB[count+4] = numext::conj(rhs(j2+4,k));
blockB[count+5] = numext::conj(rhs(j2+5,k));
blockB[count+6] = numext::conj(rhs(j2+6,k));
blockB[count+7] = numext::conj(rhs(j2+7,k));
count += 8;
} }
count += nr; // symmetric
} Index h = 0;
// symmetric for(Index k=j2; k<j2+8; k++)
Index h = 0; {
for(Index k=j2; k<j2+nr; k++) // normal
{ for (Index w=0 ; w<h; ++w)
// normal blockB[count+w] = rhs(k,j2+w);
for (Index w=0 ; w<h; ++w)
blockB[count+w] = rhs(k,j2+w);
blockB[count+h] = numext::real(rhs(k,k)); blockB[count+h] = numext::real(rhs(k,k));
// transpose // transpose
for (Index w=h+1 ; w<nr; ++w) for (Index w=h+1 ; w<8; ++w)
blockB[count+w] = numext::conj(rhs(j2+w,k)); blockB[count+w] = numext::conj(rhs(j2+w,k));
count += nr; count += 8;
++h; ++h;
}
// normal
for(Index k=j2+8; k<end_k; k++)
{
blockB[count+0] = rhs(k,j2+0);
blockB[count+1] = rhs(k,j2+1);
blockB[count+2] = rhs(k,j2+2);
blockB[count+3] = rhs(k,j2+3);
blockB[count+4] = rhs(k,j2+4);
blockB[count+5] = rhs(k,j2+5);
blockB[count+6] = rhs(k,j2+6);
blockB[count+7] = rhs(k,j2+7);
count += 8;
}
} }
// normal }
for(Index k=j2+nr; k<end_k; k++) if(nr>=4)
{
for(Index j2=end8; j2<(std::min)(k2+rows,packet_cols4); j2+=4)
{ {
blockB[count+0] = rhs(k,j2+0); // again we can split vertically in three different parts (transpose, symmetric, normal)
blockB[count+1] = rhs(k,j2+1); // transpose
if (nr==4) for(Index k=k2; k<j2; k++)
{
blockB[count+0] = numext::conj(rhs(j2+0,k));
blockB[count+1] = numext::conj(rhs(j2+1,k));
blockB[count+2] = numext::conj(rhs(j2+2,k));
blockB[count+3] = numext::conj(rhs(j2+3,k));
count += 4;
}
// symmetric
Index h = 0;
for(Index k=j2; k<j2+4; k++)
{ {
// normal
for (Index w=0 ; w<h; ++w)
blockB[count+w] = rhs(k,j2+w);
blockB[count+h] = numext::real(rhs(k,k));
// transpose
for (Index w=h+1 ; w<4; ++w)
blockB[count+w] = numext::conj(rhs(j2+w,k));
count += 4;
++h;
}
// normal
for(Index k=j2+4; k<end_k; k++)
{
blockB[count+0] = rhs(k,j2+0);
blockB[count+1] = rhs(k,j2+1);
blockB[count+2] = rhs(k,j2+2); blockB[count+2] = rhs(k,j2+2);
blockB[count+3] = rhs(k,j2+3); blockB[count+3] = rhs(k,j2+3);
count += 4;
} }
count += nr;
} }
} }
// third part: transposed // third part: transposed
for(Index j2=k2+rows; j2<packet_cols; j2+=nr) if(nr>=8)
{ {
for(Index k=k2; k<end_k; k++) for(Index j2=k2+rows; j2<packet_cols8; j2+=8)
{ {
blockB[count+0] = numext::conj(rhs(j2+0,k)); for(Index k=k2; k<end_k; k++)
blockB[count+1] = numext::conj(rhs(j2+1,k));
if (nr==4)
{ {
blockB[count+0] = numext::conj(rhs(j2+0,k));
blockB[count+1] = numext::conj(rhs(j2+1,k));
blockB[count+2] = numext::conj(rhs(j2+2,k)); blockB[count+2] = numext::conj(rhs(j2+2,k));
blockB[count+3] = numext::conj(rhs(j2+3,k)); blockB[count+3] = numext::conj(rhs(j2+3,k));
blockB[count+4] = numext::conj(rhs(j2+4,k));
blockB[count+5] = numext::conj(rhs(j2+5,k));
blockB[count+6] = numext::conj(rhs(j2+6,k));
blockB[count+7] = numext::conj(rhs(j2+7,k));
count += 8;
}
}
}
if(nr>=4)
{
for(Index j2=(std::max)(packet_cols8,k2+rows); j2<packet_cols4; j2+=4)
{
for(Index k=k2; k<end_k; k++)
{
blockB[count+0] = numext::conj(rhs(j2+0,k));
blockB[count+1] = numext::conj(rhs(j2+1,k));
blockB[count+2] = numext::conj(rhs(j2+2,k));
blockB[count+3] = numext::conj(rhs(j2+3,k));
count += 4;
} }
count += nr;
} }
} }
// copy the remaining columns one at a time (=> the same with nr==1) // copy the remaining columns one at a time (=> the same with nr==1)
for(Index j2=packet_cols; j2<cols; ++j2) for(Index j2=packet_cols4; j2<cols; ++j2)
{ {
// transpose // transpose
Index half = (std::min)(end_k,j2); Index half = (std::min)(end_k,j2);
...@@ -211,7 +291,7 @@ struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,LhsSelfAdjoint,Co ...@@ -211,7 +291,7 @@ struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,LhsSelfAdjoint,Co
const Scalar* lhs, Index lhsStride, const Scalar* lhs, Index lhsStride,
const Scalar* rhs, Index rhsStride, const Scalar* rhs, Index rhsStride,
Scalar* res, Index resStride, Scalar* res, Index resStride,
const Scalar& alpha) const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
{ {
product_selfadjoint_matrix<Scalar, Index, product_selfadjoint_matrix<Scalar, Index,
EIGEN_LOGICAL_XOR(RhsSelfAdjoint,RhsStorageOrder==RowMajor) ? ColMajor : RowMajor, EIGEN_LOGICAL_XOR(RhsSelfAdjoint,RhsStorageOrder==RowMajor) ? ColMajor : RowMajor,
...@@ -219,7 +299,7 @@ struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,LhsSelfAdjoint,Co ...@@ -219,7 +299,7 @@ struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,LhsSelfAdjoint,Co
EIGEN_LOGICAL_XOR(LhsSelfAdjoint,LhsStorageOrder==RowMajor) ? ColMajor : RowMajor, EIGEN_LOGICAL_XOR(LhsSelfAdjoint,LhsStorageOrder==RowMajor) ? ColMajor : RowMajor,
LhsSelfAdjoint, NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(LhsSelfAdjoint,ConjugateLhs), LhsSelfAdjoint, NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(LhsSelfAdjoint,ConjugateLhs),
ColMajor> ColMajor>
::run(cols, rows, rhs, rhsStride, lhs, lhsStride, res, resStride, alpha); ::run(cols, rows, rhs, rhsStride, lhs, lhsStride, res, resStride, alpha, blocking);
} }
}; };
...@@ -234,7 +314,7 @@ struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,true,ConjugateLhs ...@@ -234,7 +314,7 @@ struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,true,ConjugateLhs
const Scalar* _lhs, Index lhsStride, const Scalar* _lhs, Index lhsStride,
const Scalar* _rhs, Index rhsStride, const Scalar* _rhs, Index rhsStride,
Scalar* res, Index resStride, Scalar* res, Index resStride,
const Scalar& alpha); const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking);
}; };
template <typename Scalar, typename Index, template <typename Scalar, typename Index,
...@@ -244,33 +324,35 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t ...@@ -244,33 +324,35 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t
Index rows, Index cols, Index rows, Index cols,
const Scalar* _lhs, Index lhsStride, const Scalar* _lhs, Index lhsStride,
const Scalar* _rhs, Index rhsStride, const Scalar* _rhs, Index rhsStride,
Scalar* res, Index resStride, Scalar* _res, Index resStride,
const Scalar& alpha) const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
{ {
Index size = rows; Index size = rows;
const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
const_blas_data_mapper<Scalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
typedef gebp_traits<Scalar,Scalar> Traits; typedef gebp_traits<Scalar,Scalar> Traits;
Index kc = size; // cache block size along the K direction typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper;
Index mc = rows; // cache block size along the M direction typedef const_blas_data_mapper<Scalar, Index, (LhsStorageOrder == RowMajor) ? ColMajor : RowMajor> LhsTransposeMapper;
Index nc = cols; // cache block size along the N direction typedef const_blas_data_mapper<Scalar, Index, RhsStorageOrder> RhsMapper;
computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc); typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
// kc must smaller than mc LhsMapper lhs(_lhs,lhsStride);
LhsTransposeMapper lhs_transpose(_lhs,lhsStride);
RhsMapper rhs(_rhs,rhsStride);
ResMapper res(_res, resStride);
Index kc = blocking.kc(); // cache block size along the K direction
Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction
// kc must be smaller than mc
kc = (std::min)(kc,mc); kc = (std::min)(kc,mc);
std::size_t sizeA = kc*mc;
std::size_t sizeB = kc*cols;
ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
std::size_t sizeW = kc*Traits::WorkSpaceFactor; gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
std::size_t sizeB = sizeW + kc*cols;
ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0);
ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0);
Scalar* blockB = allocatedBlockB + sizeW;
gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
symm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs; symm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs; gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder> pack_rhs;
gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder==RowMajor?ColMajor:RowMajor, true> pack_lhs_transposed; gemm_pack_lhs<Scalar, Index, LhsTransposeMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder==RowMajor?ColMajor:RowMajor, true> pack_lhs_transposed;
for(Index k2=0; k2<size; k2+=kc) for(Index k2=0; k2<size; k2+=kc)
{ {
...@@ -279,7 +361,7 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t ...@@ -279,7 +361,7 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t
// we have selected one row panel of rhs and one column panel of lhs // we have selected one row panel of rhs and one column panel of lhs
// pack rhs's panel into a sequential chunk of memory // pack rhs's panel into a sequential chunk of memory
// and expand each coeff to a constant packet for further reuse // and expand each coeff to a constant packet for further reuse
pack_rhs(blockB, &rhs(k2,0), rhsStride, actual_kc, cols); pack_rhs(blockB, rhs.getSubMapper(k2,0), actual_kc, cols);
// the select lhs's panel has to be split in three different parts: // the select lhs's panel has to be split in three different parts:
// 1 - the transposed panel above the diagonal block => transposed packed copy // 1 - the transposed panel above the diagonal block => transposed packed copy
...@@ -289,9 +371,9 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t ...@@ -289,9 +371,9 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t
{ {
const Index actual_mc = (std::min)(i2+mc,k2)-i2; const Index actual_mc = (std::min)(i2+mc,k2)-i2;
// transposed packed copy // transposed packed copy
pack_lhs_transposed(blockA, &lhs(k2, i2), lhsStride, actual_kc, actual_mc); pack_lhs_transposed(blockA, lhs_transpose.getSubMapper(i2, k2), actual_kc, actual_mc);
gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha); gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha);
} }
// the block diagonal // the block diagonal
{ {
...@@ -299,16 +381,16 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t ...@@ -299,16 +381,16 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t
// symmetric packed copy // symmetric packed copy
pack_lhs(blockA, &lhs(k2,k2), lhsStride, actual_kc, actual_mc); pack_lhs(blockA, &lhs(k2,k2), lhsStride, actual_kc, actual_mc);
gebp_kernel(res+k2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha); gebp_kernel(res.getSubMapper(k2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha);
} }
for(Index i2=k2+kc; i2<size; i2+=mc) for(Index i2=k2+kc; i2<size; i2+=mc)
{ {
const Index actual_mc = (std::min)(i2+mc,size)-i2; const Index actual_mc = (std::min)(i2+mc,size)-i2;
gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder,false>() gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder,false>()
(blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc); (blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc);
gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha); gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha);
} }
} }
} }
...@@ -325,7 +407,7 @@ struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,false,ConjugateLh ...@@ -325,7 +407,7 @@ struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,false,ConjugateLh
const Scalar* _lhs, Index lhsStride, const Scalar* _lhs, Index lhsStride,
const Scalar* _rhs, Index rhsStride, const Scalar* _rhs, Index rhsStride,
Scalar* res, Index resStride, Scalar* res, Index resStride,
const Scalar& alpha); const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking);
}; };
template <typename Scalar, typename Index, template <typename Scalar, typename Index,
...@@ -335,27 +417,27 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,f ...@@ -335,27 +417,27 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,f
Index rows, Index cols, Index rows, Index cols,
const Scalar* _lhs, Index lhsStride, const Scalar* _lhs, Index lhsStride,
const Scalar* _rhs, Index rhsStride, const Scalar* _rhs, Index rhsStride,
Scalar* res, Index resStride, Scalar* _res, Index resStride,
const Scalar& alpha) const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
{ {
Index size = cols; Index size = cols;
const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
typedef gebp_traits<Scalar,Scalar> Traits; typedef gebp_traits<Scalar,Scalar> Traits;
Index kc = size; // cache block size along the K direction typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper;
Index mc = rows; // cache block size along the M direction typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
Index nc = cols; // cache block size along the N direction LhsMapper lhs(_lhs,lhsStride);
computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc); ResMapper res(_res,resStride);
std::size_t sizeW = kc*Traits::WorkSpaceFactor;
std::size_t sizeB = sizeW + kc*cols; Index kc = blocking.kc(); // cache block size along the K direction
ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0); Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction
ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0); std::size_t sizeA = kc*mc;
Scalar* blockB = allocatedBlockB + sizeW; std::size_t sizeB = kc*cols;
ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel; ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
symm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs; symm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
for(Index k2=0; k2<size; k2+=kc) for(Index k2=0; k2<size; k2+=kc)
...@@ -368,9 +450,9 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,f ...@@ -368,9 +450,9 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,f
for(Index i2=0; i2<rows; i2+=mc) for(Index i2=0; i2<rows; i2+=mc)
{ {
const Index actual_mc = (std::min)(i2+mc,rows)-i2; const Index actual_mc = (std::min)(i2+mc,rows)-i2;
pack_lhs(blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc); pack_lhs(blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc);
gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha); gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha);
} }
} }
} }
...@@ -382,55 +464,58 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,f ...@@ -382,55 +464,58 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,f
***************************************************************************/ ***************************************************************************/
namespace internal { namespace internal {
template<typename Lhs, int LhsMode, typename Rhs, int RhsMode> template<typename Lhs, int LhsMode, typename Rhs, int RhsMode>
struct traits<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false> > struct selfadjoint_product_impl<Lhs,LhsMode,false,Rhs,RhsMode,false>
: traits<ProductBase<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false>, Lhs, Rhs> >
{};
}
template<typename Lhs, int LhsMode, typename Rhs, int RhsMode>
struct SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false>
: public ProductBase<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false>, Lhs, Rhs >
{ {
EIGEN_PRODUCT_PUBLIC_INTERFACE(SelfadjointProductMatrix) typedef typename Product<Lhs,Rhs>::Scalar Scalar;
SelfadjointProductMatrix(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {} typedef internal::blas_traits<Lhs> LhsBlasTraits;
typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
typedef internal::blas_traits<Rhs> RhsBlasTraits;
typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
enum { enum {
LhsIsUpper = (LhsMode&(Upper|Lower))==Upper, LhsIsUpper = (LhsMode&(Upper|Lower))==Upper,
LhsIsSelfAdjoint = (LhsMode&SelfAdjoint)==SelfAdjoint, LhsIsSelfAdjoint = (LhsMode&SelfAdjoint)==SelfAdjoint,
RhsIsUpper = (RhsMode&(Upper|Lower))==Upper, RhsIsUpper = (RhsMode&(Upper|Lower))==Upper,
RhsIsSelfAdjoint = (RhsMode&SelfAdjoint)==SelfAdjoint RhsIsSelfAdjoint = (RhsMode&SelfAdjoint)==SelfAdjoint
}; };
template<typename Dest> void scaleAndAddTo(Dest& dst, const Scalar& alpha) const template<typename Dest>
static void run(Dest &dst, const Lhs &a_lhs, const Rhs &a_rhs, const Scalar& alpha)
{ {
eigen_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols()); eigen_assert(dst.rows()==a_lhs.rows() && dst.cols()==a_rhs.cols());
typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(m_lhs); typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(a_lhs);
typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(m_rhs); typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(a_rhs);
Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs) Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(a_lhs)
* RhsBlasTraits::extractScalarFactor(m_rhs); * RhsBlasTraits::extractScalarFactor(a_rhs);
typedef internal::gemm_blocking_space<(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor,Scalar,Scalar,
Lhs::MaxRowsAtCompileTime, Rhs::MaxColsAtCompileTime, Lhs::MaxColsAtCompileTime,1> BlockingType;
BlockingType blocking(lhs.rows(), rhs.cols(), lhs.cols(), 1, false);
internal::product_selfadjoint_matrix<Scalar, Index, internal::product_selfadjoint_matrix<Scalar, Index,
EIGEN_LOGICAL_XOR(LhsIsUpper, EIGEN_LOGICAL_XOR(LhsIsUpper,internal::traits<Lhs>::Flags &RowMajorBit) ? RowMajor : ColMajor, LhsIsSelfAdjoint,
internal::traits<Lhs>::Flags &RowMajorBit) ? RowMajor : ColMajor, LhsIsSelfAdjoint,
NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(LhsIsUpper,bool(LhsBlasTraits::NeedToConjugate)), NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(LhsIsUpper,bool(LhsBlasTraits::NeedToConjugate)),
EIGEN_LOGICAL_XOR(RhsIsUpper, EIGEN_LOGICAL_XOR(RhsIsUpper,internal::traits<Rhs>::Flags &RowMajorBit) ? RowMajor : ColMajor, RhsIsSelfAdjoint,
internal::traits<Rhs>::Flags &RowMajorBit) ? RowMajor : ColMajor, RhsIsSelfAdjoint,
NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(RhsIsUpper,bool(RhsBlasTraits::NeedToConjugate)), NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(RhsIsUpper,bool(RhsBlasTraits::NeedToConjugate)),
internal::traits<Dest>::Flags&RowMajorBit ? RowMajor : ColMajor> internal::traits<Dest>::Flags&RowMajorBit ? RowMajor : ColMajor>
::run( ::run(
lhs.rows(), rhs.cols(), // sizes lhs.rows(), rhs.cols(), // sizes
&lhs.coeffRef(0,0), lhs.outerStride(), // lhs info &lhs.coeffRef(0,0), lhs.outerStride(), // lhs info
&rhs.coeffRef(0,0), rhs.outerStride(), // rhs info &rhs.coeffRef(0,0), rhs.outerStride(), // rhs info
&dst.coeffRef(0,0), dst.outerStride(), // result info &dst.coeffRef(0,0), dst.outerStride(), // result info
actualAlpha // alpha actualAlpha, blocking // alpha
); );
} }
}; };
} // end namespace internal
} // end namespace Eigen } // end namespace Eigen
#endif // EIGEN_SELFADJOINT_MATRIX_MATRIX_H #endif // EIGEN_SELFADJOINT_MATRIX_MATRIX_H