// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_SELECT_H
#define EIGEN_SELECT_H
namespace Eigen {
/** \class Select
* \ingroup Core_Module
*
* \brief Expression of a coefficient-wise version of the C++ ternary operator ?:
*
* \param ConditionMatrixType the type of the \em condition expression which must be a boolean matrix
* \param ThenMatrixType the type of the \em then expression
* \param ElseMatrixType the type of the \em else expression
*
* This class represents an expression of a coefficient-wise version of the C++ ternary operator ?:.
* It is the return type of DenseBase::select() and most of the time this is the only way it is used.
*
* \sa DenseBase::select(const DenseBase<ThenDerived>&, const DenseBase<ElseDerived>&) const
*/
namespace internal {
template<typename ConditionMatrixType, typename ThenMatrixType, typename ElseMatrixType>
struct traits<Select<ConditionMatrixType, ThenMatrixType, ElseMatrixType> >
: traits<ThenMatrixType>
{
typedef typename traits<ThenMatrixType>::Scalar Scalar;
typedef Dense StorageKind;
typedef typename traits<ThenMatrixType>::XprKind XprKind;
typedef typename ConditionMatrixType::Nested ConditionMatrixNested;
typedef typename ThenMatrixType::Nested ThenMatrixNested;
typedef typename ElseMatrixType::Nested ElseMatrixNested;
enum {
RowsAtCompileTime = ConditionMatrixType::RowsAtCompileTime,
ColsAtCompileTime = ConditionMatrixType::ColsAtCompileTime,
MaxRowsAtCompileTime = ConditionMatrixType::MaxRowsAtCompileTime,
MaxColsAtCompileTime = ConditionMatrixType::MaxColsAtCompileTime,
Flags = (unsigned int)ThenMatrixType::Flags & ElseMatrixType::Flags & RowMajorBit
};
};
}
template<typename ConditionMatrixType, typename ThenMatrixType, typename ElseMatrixType>
class Select : public internal::dense_xpr_base< Select<ConditionMatrixType, ThenMatrixType, ElseMatrixType> >::type,
internal::no_assignment_operator
{
public:
typedef typename internal::dense_xpr_base<Select>::type Base;
EIGEN_DENSE_PUBLIC_INTERFACE(Select)
inline EIGEN_DEVICE_FUNC
Select(const ConditionMatrixType& a_conditionMatrix,
const ThenMatrixType& a_thenMatrix,
const ElseMatrixType& a_elseMatrix)
: m_condition(a_conditionMatrix), m_then(a_thenMatrix), m_else(a_elseMatrix)
{
eigen_assert(m_condition.rows() == m_then.rows() && m_condition.rows() == m_else.rows());
eigen_assert(m_condition.cols() == m_then.cols() && m_condition.cols() == m_else.cols());
}
inline EIGEN_DEVICE_FUNC Index rows() const { return m_condition.rows(); }
inline EIGEN_DEVICE_FUNC Index cols() const { return m_condition.cols(); }
inline EIGEN_DEVICE_FUNC
const Scalar coeff(Index i, Index j) const
{
if (m_condition.coeff(i,j))
return m_then.coeff(i,j);
else
return m_else.coeff(i,j);
}
inline EIGEN_DEVICE_FUNC
const Scalar coeff(Index i) const
{
if (m_condition.coeff(i))
return m_then.coeff(i);
else
return m_else.coeff(i);
}
inline EIGEN_DEVICE_FUNC const ConditionMatrixType& conditionMatrix() const
{
return m_condition;
}
inline EIGEN_DEVICE_FUNC const ThenMatrixType& thenMatrix() const
{
return m_then;
}
inline EIGEN_DEVICE_FUNC const ElseMatrixType& elseMatrix() const
{
return m_else;
}
protected:
typename ConditionMatrixType::Nested m_condition;
typename ThenMatrixType::Nested m_then;
typename ElseMatrixType::Nested m_else;
};
/** \returns a matrix where each coefficient (i,j) is equal to \a thenMatrix(i,j)
* if \c *this(i,j), and \a elseMatrix(i,j) otherwise.
*
* Example: \include MatrixBase_select.cpp
* Output: \verbinclude MatrixBase_select.out
*
* \sa class Select
*/
template<typename Derived>
template<typename ThenDerived,typename ElseDerived>
inline const Select<Derived,ThenDerived,ElseDerived>
DenseBase<Derived>::select(const DenseBase<ThenDerived>& thenMatrix,
const DenseBase<ElseDerived>& elseMatrix) const
{
return Select<Derived,ThenDerived,ElseDerived>(derived(), thenMatrix.derived(), elseMatrix.derived());
}
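// A minimal usage sketch of select() (editorial addition; values are
// hypothetical and assume <Eigen/Dense> is included). Coefficients where the
// condition holds come from the 'then' operand, the others from 'else':
//
//   Eigen::MatrixXi m(3,3);
//   m << 1, 2, 3,
//        4, 5, 6,
//        7, 8, 9;
//   m = (m.array() >= 5).select(-m, m);
//   // m is now:  1  2  3
//   //            4 -5 -6
//   //           -7 -8 -9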
/** Version of DenseBase::select(const DenseBase&, const DenseBase&) with
* the \em else expression being a scalar value.
*
* \sa DenseBase::select(const DenseBase<ThenDerived>&, const DenseBase<ElseDerived>&) const, class Select
*/
template<typename Derived>
template<typename ThenDerived>
inline const Select<Derived,ThenDerived, typename ThenDerived::ConstantReturnType>
DenseBase<Derived>::select(const DenseBase<ThenDerived>& thenMatrix,
const typename ThenDerived::Scalar& elseScalar) const
{
return Select<Derived,ThenDerived,typename ThenDerived::ConstantReturnType>(
derived(), thenMatrix.derived(), ThenDerived::Constant(rows(),cols(),elseScalar));
}
/** Version of DenseBase::select(const DenseBase&, const DenseBase&) with
* the \em then expression being a scalar value.
*
* \sa DenseBase::select(const DenseBase<ThenDerived>&, const DenseBase<ElseDerived>&) const, class Select
*/
template<typename Derived>
template<typename ElseDerived>
inline const Select<Derived, typename ElseDerived::ConstantReturnType, ElseDerived >
DenseBase<Derived>::select(const typename ElseDerived::Scalar& thenScalar,
const DenseBase<ElseDerived>& elseMatrix) const
{
return Select<Derived,typename ElseDerived::ConstantReturnType,ElseDerived>(
derived(), ElseDerived::Constant(rows(),cols(),thenScalar), elseMatrix.derived());
}
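// A hedged sketch of the scalar overloads above (editorial; hypothetical
// values). A scalar 'then' or 'else' operand is expanded to a constant
// expression of matching size:
//
//   Eigen::MatrixXd m = Eigen::MatrixXd::Random(3,3);
//   Eigen::MatrixXd r = (m.array() >= 0).select(m, 0.0);  // clamp negatives to zero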
} // end namespace Eigen
#endif // EIGEN_SELECT_H
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_SELFADJOINTMATRIX_H
#define EIGEN_SELFADJOINTMATRIX_H
namespace Eigen {
/** \class SelfAdjointView
* \ingroup Core_Module
*
*
* \brief Expression of a selfadjoint matrix from a triangular part of a dense matrix
*
* \param MatrixType the type of the dense matrix storing the coefficients
* \param TriangularPart can be either \c #Lower or \c #Upper
*
* This class is an expression of a selfadjoint matrix from a triangular part of a matrix
* with given dense storage of the coefficients. It is the return type of MatrixBase::selfadjointView()
* and most of the time this is the only way that it is used.
*
* \sa class TriangularBase, MatrixBase::selfadjointView()
*/
namespace internal {
template<typename MatrixType, unsigned int UpLo>
struct traits<SelfAdjointView<MatrixType, UpLo> > : traits<MatrixType>
{
typedef typename ref_selector<MatrixType>::non_const_type MatrixTypeNested;
typedef typename remove_all<MatrixTypeNested>::type MatrixTypeNestedCleaned;
typedef MatrixType ExpressionType;
typedef typename MatrixType::PlainObject FullMatrixType;
enum {
Mode = UpLo | SelfAdjoint,
FlagsLvalueBit = is_lvalue<MatrixType>::value ? LvalueBit : 0,
Flags = MatrixTypeNestedCleaned::Flags & (HereditaryBits|FlagsLvalueBit)
& (~(PacketAccessBit | DirectAccessBit | LinearAccessBit)) // FIXME these flags should be preserved
};
};
}
template<typename _MatrixType, unsigned int UpLo> class SelfAdjointView
: public TriangularBase<SelfAdjointView<_MatrixType, UpLo> >
{
public:
typedef _MatrixType MatrixType;
typedef TriangularBase<SelfAdjointView> Base;
typedef typename internal::traits<SelfAdjointView>::MatrixTypeNested MatrixTypeNested;
typedef typename internal::traits<SelfAdjointView>::MatrixTypeNestedCleaned MatrixTypeNestedCleaned;
typedef MatrixTypeNestedCleaned NestedExpression;
/** \brief The type of coefficients in this matrix */
typedef typename internal::traits<SelfAdjointView>::Scalar Scalar;
typedef typename MatrixType::StorageIndex StorageIndex;
typedef typename internal::remove_all<typename MatrixType::ConjugateReturnType>::type MatrixConjugateReturnType;
enum {
Mode = internal::traits<SelfAdjointView>::Mode,
Flags = internal::traits<SelfAdjointView>::Flags,
TransposeMode = ((Mode & Upper) ? Lower : 0) | ((Mode & Lower) ? Upper : 0)
};
typedef typename MatrixType::PlainObject PlainObject;
EIGEN_DEVICE_FUNC
explicit inline SelfAdjointView(MatrixType& matrix) : m_matrix(matrix)
{}
EIGEN_DEVICE_FUNC
inline Index rows() const { return m_matrix.rows(); }
EIGEN_DEVICE_FUNC
inline Index cols() const { return m_matrix.cols(); }
EIGEN_DEVICE_FUNC
inline Index outerStride() const { return m_matrix.outerStride(); }
EIGEN_DEVICE_FUNC
inline Index innerStride() const { return m_matrix.innerStride(); }
/** \sa MatrixBase::coeff()
* \warning the coordinates must fit into the referenced triangular part
*/
EIGEN_DEVICE_FUNC
inline Scalar coeff(Index row, Index col) const
{
Base::check_coordinates_internal(row, col);
return m_matrix.coeff(row, col);
}
/** \sa MatrixBase::coeffRef()
* \warning the coordinates must fit into the referenced triangular part
*/
EIGEN_DEVICE_FUNC
inline Scalar& coeffRef(Index row, Index col)
{
EIGEN_STATIC_ASSERT_LVALUE(SelfAdjointView);
Base::check_coordinates_internal(row, col);
return m_matrix.coeffRef(row, col);
}
/** \internal */
EIGEN_DEVICE_FUNC
const MatrixTypeNestedCleaned& _expression() const { return m_matrix; }
EIGEN_DEVICE_FUNC
const MatrixTypeNestedCleaned& nestedExpression() const { return m_matrix; }
EIGEN_DEVICE_FUNC
MatrixTypeNestedCleaned& nestedExpression() { return m_matrix; }
/** Efficient selfadjoint matrix times vector/matrix product */
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
const Product<SelfAdjointView,OtherDerived>
operator*(const MatrixBase<OtherDerived>& rhs) const
{
return Product<SelfAdjointView,OtherDerived>(*this, rhs.derived());
}
/** Efficient vector/matrix times selfadjoint matrix product */
template<typename OtherDerived> friend
EIGEN_DEVICE_FUNC
const Product<OtherDerived,SelfAdjointView>
operator*(const MatrixBase<OtherDerived>& lhs, const SelfAdjointView& rhs)
{
return Product<OtherDerived,SelfAdjointView>(lhs.derived(),rhs);
}
friend EIGEN_DEVICE_FUNC
const SelfAdjointView<const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar,MatrixType,product),UpLo>
operator*(const Scalar& s, const SelfAdjointView& mat)
{
return (s*mat.nestedExpression()).template selfadjointView<UpLo>();
}
/** Perform a symmetric rank 2 update of the selfadjoint matrix \c *this:
* \f$ this = this + \alpha u v^* + conj(\alpha) v u^* \f$
* \returns a reference to \c *this
*
* The vectors \a u and \a v \b must be column vectors, however they can be
* adjoint expressions without any overhead. Only the meaningful triangular
* part of the matrix is updated, the rest is left unchanged.
*
* \sa rankUpdate(const MatrixBase<DerivedU>&, Scalar)
*/
template<typename DerivedU, typename DerivedV>
EIGEN_DEVICE_FUNC
SelfAdjointView& rankUpdate(const MatrixBase<DerivedU>& u, const MatrixBase<DerivedV>& v, const Scalar& alpha = Scalar(1));
/** Perform a symmetric rank K update of the selfadjoint matrix \c *this:
* \f$ this = this + \alpha ( u u^* ) \f$ where \a u is a vector or matrix.
*
* \returns a reference to \c *this
*
* Note that to perform \f$ this = this + \alpha ( u^* u ) \f$ you can simply
* call this function with u.adjoint().
*
* \sa rankUpdate(const MatrixBase<DerivedU>&, const MatrixBase<DerivedV>&, Scalar)
*/
template<typename DerivedU>
EIGEN_DEVICE_FUNC
SelfAdjointView& rankUpdate(const MatrixBase<DerivedU>& u, const Scalar& alpha = Scalar(1));
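// A hedged usage sketch of the two rank updates declared above (editorial;
// hypothetical sizes and values). Only the selected triangle is written:
//
//   Eigen::MatrixXd A = Eigen::MatrixXd::Zero(3,3);
//   Eigen::Vector3d u(1,0,2), v(0,1,1);
//   A.selfadjointView<Eigen::Lower>().rankUpdate(u, 2.0);     // A += 2*u*u^T
//   A.selfadjointView<Eigen::Lower>().rankUpdate(u, v, 1.0);  // A += u*v^T + v*u^T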
/** \returns an expression of a triangular view extracted from the current selfadjoint view of a given triangular part
*
* The parameter \a TriMode can have the following values: \c #Upper, \c #StrictlyUpper, \c #UnitUpper,
* \c #Lower, \c #StrictlyLower, \c #UnitLower.
*
* If \c TriMode references the same triangular part as \c *this, then this method simply returns a \c TriangularView of the nested expression;
* otherwise, the nested expression is first adjointed, thus returning a \c TriangularView<typename MatrixType::AdjointReturnType> object.
*
* \sa MatrixBase::triangularView(), class TriangularView
*/
template<unsigned int TriMode>
EIGEN_DEVICE_FUNC
typename internal::conditional<(TriMode&(Upper|Lower))==(UpLo&(Upper|Lower)),
TriangularView<MatrixType,TriMode>,
TriangularView<typename MatrixType::AdjointReturnType,TriMode> >::type
triangularView() const
{
typename internal::conditional<(TriMode&(Upper|Lower))==(UpLo&(Upper|Lower)), MatrixType&, typename MatrixType::ConstTransposeReturnType>::type tmp1(m_matrix);
typename internal::conditional<(TriMode&(Upper|Lower))==(UpLo&(Upper|Lower)), MatrixType&, typename MatrixType::AdjointReturnType>::type tmp2(tmp1);
return typename internal::conditional<(TriMode&(Upper|Lower))==(UpLo&(Upper|Lower)),
TriangularView<MatrixType,TriMode>,
TriangularView<typename MatrixType::AdjointReturnType,TriMode> >::type(tmp2);
}
typedef SelfAdjointView<const MatrixConjugateReturnType,Mode> ConjugateReturnType;
/** \sa MatrixBase::conjugate() const */
EIGEN_DEVICE_FUNC
inline const ConjugateReturnType conjugate() const
{ return ConjugateReturnType(m_matrix.conjugate()); }
typedef SelfAdjointView<const typename MatrixType::AdjointReturnType,TransposeMode> AdjointReturnType;
/** \sa MatrixBase::adjoint() const */
EIGEN_DEVICE_FUNC
inline const AdjointReturnType adjoint() const
{ return AdjointReturnType(m_matrix.adjoint()); }
typedef SelfAdjointView<typename MatrixType::TransposeReturnType,TransposeMode> TransposeReturnType;
/** \sa MatrixBase::transpose() */
EIGEN_DEVICE_FUNC
inline TransposeReturnType transpose()
{
EIGEN_STATIC_ASSERT_LVALUE(MatrixType)
typename MatrixType::TransposeReturnType tmp(m_matrix);
return TransposeReturnType(tmp);
}
typedef SelfAdjointView<const typename MatrixType::ConstTransposeReturnType,TransposeMode> ConstTransposeReturnType;
/** \sa MatrixBase::transpose() const */
EIGEN_DEVICE_FUNC
inline const ConstTransposeReturnType transpose() const
{
return ConstTransposeReturnType(m_matrix.transpose());
}
/** \returns a const expression of the main diagonal of the matrix \c *this
*
* This method simply returns the diagonal of the nested expression, thus bypassing the SelfAdjointView decorator.
*
* \sa MatrixBase::diagonal(), class Diagonal */
EIGEN_DEVICE_FUNC
typename MatrixType::ConstDiagonalReturnType diagonal() const
{
return typename MatrixType::ConstDiagonalReturnType(m_matrix);
}
/////////// Cholesky module ///////////
const LLT<PlainObject, UpLo> llt() const;
const LDLT<PlainObject, UpLo> ldlt() const;
/////////// Eigenvalue module ///////////
/** Real part of #Scalar */
typedef typename NumTraits<Scalar>::Real RealScalar;
/** Return type of eigenvalues() */
typedef Matrix<RealScalar, internal::traits<MatrixType>::ColsAtCompileTime, 1> EigenvaluesReturnType;
EIGEN_DEVICE_FUNC
EigenvaluesReturnType eigenvalues() const;
EIGEN_DEVICE_FUNC
RealScalar operatorNorm() const;
protected:
MatrixTypeNested m_matrix;
};
// template<typename OtherDerived, typename MatrixType, unsigned int UpLo>
// internal::selfadjoint_matrix_product_returntype<OtherDerived,SelfAdjointView<MatrixType,UpLo> >
// operator*(const MatrixBase<OtherDerived>& lhs, const SelfAdjointView<MatrixType,UpLo>& rhs)
// {
// return internal::matrix_selfadjoint_product_returntype<OtherDerived,SelfAdjointView<MatrixType,UpLo> >(lhs.derived(),rhs);
// }
// selfadjoint to dense matrix
namespace internal {
// TODO currently a selfadjoint expression has the form SelfAdjointView<.,.>
// in the future selfadjoint-ness should be defined by the expression traits
// such that Transpose<SelfAdjointView<.,.> > is valid. (currently TriangularBase::transpose() is overloaded to make it work)
template<typename MatrixType, unsigned int Mode>
struct evaluator_traits<SelfAdjointView<MatrixType,Mode> >
{
typedef typename storage_kind_to_evaluator_kind<typename MatrixType::StorageKind>::Kind Kind;
typedef SelfAdjointShape Shape;
};
template<int UpLo, int SetOpposite, typename DstEvaluatorTypeT, typename SrcEvaluatorTypeT, typename Functor, int Version>
class triangular_dense_assignment_kernel<UpLo,SelfAdjoint,SetOpposite,DstEvaluatorTypeT,SrcEvaluatorTypeT,Functor,Version>
: public generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT, Functor, Version>
{
protected:
typedef generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT, Functor, Version> Base;
typedef typename Base::DstXprType DstXprType;
typedef typename Base::SrcXprType SrcXprType;
using Base::m_dst;
using Base::m_src;
using Base::m_functor;
public:
typedef typename Base::DstEvaluatorType DstEvaluatorType;
typedef typename Base::SrcEvaluatorType SrcEvaluatorType;
typedef typename Base::Scalar Scalar;
typedef typename Base::AssignmentTraits AssignmentTraits;
EIGEN_DEVICE_FUNC triangular_dense_assignment_kernel(DstEvaluatorType &dst, const SrcEvaluatorType &src, const Functor &func, DstXprType& dstExpr)
: Base(dst, src, func, dstExpr)
{}
EIGEN_DEVICE_FUNC void assignCoeff(Index row, Index col)
{
eigen_internal_assert(row!=col);
Scalar tmp = m_src.coeff(row,col);
m_functor.assignCoeff(m_dst.coeffRef(row,col), tmp);
m_functor.assignCoeff(m_dst.coeffRef(col,row), numext::conj(tmp));
}
EIGEN_DEVICE_FUNC void assignDiagonalCoeff(Index id)
{
Base::assignCoeff(id,id);
}
EIGEN_DEVICE_FUNC void assignOppositeCoeff(Index, Index)
{ eigen_internal_assert(false && "should never be called"); }
};
} // end namespace internal
/***************************************************************************
* Implementation of MatrixBase methods
***************************************************************************/
/** This is the const version of MatrixBase::selfadjointView() */
template<typename Derived>
template<unsigned int UpLo>
typename MatrixBase<Derived>::template ConstSelfAdjointViewReturnType<UpLo>::Type
MatrixBase<Derived>::selfadjointView() const
{
return typename ConstSelfAdjointViewReturnType<UpLo>::Type(derived());
}
/** \returns an expression of a symmetric/self-adjoint view extracted from the upper or lower triangular part of the current matrix
*
* The parameter \a UpLo can be either \c #Upper or \c #Lower
*
* Example: \include MatrixBase_selfadjointView.cpp
* Output: \verbinclude MatrixBase_selfadjointView.out
*
* \sa class SelfAdjointView
*/
template<typename Derived>
template<unsigned int UpLo>
typename MatrixBase<Derived>::template SelfAdjointViewReturnType<UpLo>::Type
MatrixBase<Derived>::selfadjointView()
{
return typename SelfAdjointViewReturnType<UpLo>::Type(derived());
}
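// A hedged usage sketch (editorial; hypothetical values). Only the referenced
// triangle of the source matrix is read; the opposite one is inferred:
//
//   Eigen::Matrix2d a;
//   a << 1, 2,
//        9, 4;                                              // the 9 is never read
//   Eigen::Matrix2d s = a.selfadjointView<Eigen::Upper>();  // s = [1 2; 2 4]
//   Eigen::Vector2d y = a.selfadjointView<Eigen::Upper>() * Eigen::Vector2d(1,1);
//   // y = [3; 6], using the optimized selfadjoint product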
} // end namespace Eigen
#endif // EIGEN_SELFADJOINTMATRIX_H
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2009-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_SELFCWISEBINARYOP_H
#define EIGEN_SELFCWISEBINARYOP_H
namespace Eigen {
// TODO generalize the scalar type of 'other'
template<typename Derived>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator*=(const Scalar& other)
{
typedef typename Derived::PlainObject PlainObject;
internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::mul_assign_op<Scalar,Scalar>());
return derived();
}
template<typename Derived>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& ArrayBase<Derived>::operator+=(const Scalar& other)
{
typedef typename Derived::PlainObject PlainObject;
internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::add_assign_op<Scalar,Scalar>());
return derived();
}
template<typename Derived>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& ArrayBase<Derived>::operator-=(const Scalar& other)
{
typedef typename Derived::PlainObject PlainObject;
internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::sub_assign_op<Scalar,Scalar>());
return derived();
}
template<typename Derived>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator/=(const Scalar& other)
{
typedef typename Derived::PlainObject PlainObject;
internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::div_assign_op<Scalar,Scalar>());
return derived();
}
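// A hedged sketch of the compound scalar operators defined above (editorial;
// hypothetical values). Note that operator+= and operator-= with a scalar are
// only provided on ArrayBase, while *= and /= live on DenseBase:
//
//   Eigen::ArrayXd a = Eigen::ArrayXd::LinSpaced(4, 0.0, 3.0);  // {0,1,2,3}
//   a += 1.0;  // {1,2,3,4}
//   a *= 2.0;  // {2,4,6,8}
//   a /= 4.0;  // {0.5,1,1.5,2}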
} // end namespace Eigen
#endif // EIGEN_SELFCWISEBINARYOP_H
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_SOLVE_H
#define EIGEN_SOLVE_H
namespace Eigen {
template<typename Decomposition, typename RhsType, typename StorageKind> class SolveImpl;
/** \class Solve
* \ingroup Core_Module
*
* \brief Pseudo expression representing a solving operation
*
* \tparam Decomposition the type of the matrix or decomposition object
* \tparam RhsType the type of the right-hand side
*
* This class represents an expression of A.solve(B)
* and most of the time this is the only way it is used.
*
*/
namespace internal {
// this solve_traits class makes it possible to determine the evaluation type with respect to storage kind (Dense vs Sparse)
template<typename Decomposition, typename RhsType,typename StorageKind> struct solve_traits;
template<typename Decomposition, typename RhsType>
struct solve_traits<Decomposition,RhsType,Dense>
{
typedef typename make_proper_matrix_type<typename RhsType::Scalar,
Decomposition::ColsAtCompileTime,
RhsType::ColsAtCompileTime,
RhsType::PlainObject::Options,
Decomposition::MaxColsAtCompileTime,
RhsType::MaxColsAtCompileTime>::type PlainObject;
};
template<typename Decomposition, typename RhsType>
struct traits<Solve<Decomposition, RhsType> >
: traits<typename solve_traits<Decomposition,RhsType,typename internal::traits<RhsType>::StorageKind>::PlainObject>
{
typedef typename solve_traits<Decomposition,RhsType,typename internal::traits<RhsType>::StorageKind>::PlainObject PlainObject;
typedef typename promote_index_type<typename Decomposition::StorageIndex, typename RhsType::StorageIndex>::type StorageIndex;
typedef traits<PlainObject> BaseTraits;
enum {
Flags = BaseTraits::Flags & RowMajorBit,
CoeffReadCost = HugeCost
};
};
}
template<typename Decomposition, typename RhsType>
class Solve : public SolveImpl<Decomposition,RhsType,typename internal::traits<RhsType>::StorageKind>
{
public:
typedef typename internal::traits<Solve>::PlainObject PlainObject;
typedef typename internal::traits<Solve>::StorageIndex StorageIndex;
Solve(const Decomposition &dec, const RhsType &rhs)
: m_dec(dec), m_rhs(rhs)
{}
EIGEN_DEVICE_FUNC Index rows() const { return m_dec.cols(); }
EIGEN_DEVICE_FUNC Index cols() const { return m_rhs.cols(); }
EIGEN_DEVICE_FUNC const Decomposition& dec() const { return m_dec; }
EIGEN_DEVICE_FUNC const RhsType& rhs() const { return m_rhs; }
protected:
const Decomposition &m_dec;
const RhsType &m_rhs;
};
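// A hedged sketch of how a Solve expression typically arises (editorial;
// assumes <Eigen/Dense> and an invertible A). The object returned by solve()
// is this lazy expression; the evaluator below runs the actual solve when it
// is assigned to a plain matrix:
//
//   Eigen::MatrixXd A = Eigen::MatrixXd::Random(4,4);
//   Eigen::VectorXd b = Eigen::VectorXd::Random(4);
//   Eigen::PartialPivLU<Eigen::MatrixXd> lu(A);
//   Eigen::VectorXd x = lu.solve(b);  // Solve<...> expression, evaluated here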
// Specialization of the Solve expression for dense results
template<typename Decomposition, typename RhsType>
class SolveImpl<Decomposition,RhsType,Dense>
: public MatrixBase<Solve<Decomposition,RhsType> >
{
typedef Solve<Decomposition,RhsType> Derived;
public:
typedef MatrixBase<Solve<Decomposition,RhsType> > Base;
EIGEN_DENSE_PUBLIC_INTERFACE(Derived)
private:
Scalar coeff(Index row, Index col) const;
Scalar coeff(Index i) const;
};
// Generic API dispatcher
template<typename Decomposition, typename RhsType, typename StorageKind>
class SolveImpl : public internal::generic_xpr_base<Solve<Decomposition,RhsType>, MatrixXpr, StorageKind>::type
{
public:
typedef typename internal::generic_xpr_base<Solve<Decomposition,RhsType>, MatrixXpr, StorageKind>::type Base;
};
namespace internal {
// Evaluator of Solve -> eval into a temporary
template<typename Decomposition, typename RhsType>
struct evaluator<Solve<Decomposition,RhsType> >
: public evaluator<typename Solve<Decomposition,RhsType>::PlainObject>
{
typedef Solve<Decomposition,RhsType> SolveType;
typedef typename SolveType::PlainObject PlainObject;
typedef evaluator<PlainObject> Base;
enum { Flags = Base::Flags | EvalBeforeNestingBit };
EIGEN_DEVICE_FUNC explicit evaluator(const SolveType& solve)
: m_result(solve.rows(), solve.cols())
{
::new (static_cast<Base*>(this)) Base(m_result);
solve.dec()._solve_impl(solve.rhs(), m_result);
}
protected:
PlainObject m_result;
};
// Specialization for "dst = dec.solve(rhs)"
// NOTE we need to specialize it for Dense2Dense to avoid an ambiguous specialization error, and a Sparse2Sparse specialization must exist somewhere
template<typename DstXprType, typename DecType, typename RhsType, typename Scalar>
struct Assignment<DstXprType, Solve<DecType,RhsType>, internal::assign_op<Scalar,Scalar>, Dense2Dense>
{
typedef Solve<DecType,RhsType> SrcXprType;
static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &)
{
Index dstRows = src.rows();
Index dstCols = src.cols();
if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
dst.resize(dstRows, dstCols);
src.dec()._solve_impl(src.rhs(), dst);
}
};
// Specialization for "dst = dec.transpose().solve(rhs)"
template<typename DstXprType, typename DecType, typename RhsType, typename Scalar>
struct Assignment<DstXprType, Solve<Transpose<const DecType>,RhsType>, internal::assign_op<Scalar,Scalar>, Dense2Dense>
{
typedef Solve<Transpose<const DecType>,RhsType> SrcXprType;
static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &)
{
Index dstRows = src.rows();
Index dstCols = src.cols();
if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
dst.resize(dstRows, dstCols);
src.dec().nestedExpression().template _solve_impl_transposed<false>(src.rhs(), dst);
}
};
// Specialization for "dst = dec.adjoint().solve(rhs)"
template<typename DstXprType, typename DecType, typename RhsType, typename Scalar>
struct Assignment<DstXprType, Solve<CwiseUnaryOp<internal::scalar_conjugate_op<typename DecType::Scalar>, const Transpose<const DecType> >,RhsType>,
internal::assign_op<Scalar,Scalar>, Dense2Dense>
{
typedef Solve<CwiseUnaryOp<internal::scalar_conjugate_op<typename DecType::Scalar>, const Transpose<const DecType> >,RhsType> SrcXprType;
static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &)
{
Index dstRows = src.rows();
Index dstCols = src.cols();
if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
dst.resize(dstRows, dstCols);
src.dec().nestedExpression().nestedExpression().template _solve_impl_transposed<true>(src.rhs(), dst);
}
};
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_SOLVE_H
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_SOLVETRIANGULAR_H
#define EIGEN_SOLVETRIANGULAR_H
namespace Eigen {
namespace internal {
// Forward declarations:
// The following two routines are implemented in the products/TriangularSolver*.h files
template<typename LhsScalar, typename RhsScalar, typename Index, int Side, int Mode, bool Conjugate, int StorageOrder>
struct triangular_solve_vector;
template <typename Scalar, typename Index, int Side, int Mode, bool Conjugate, int TriStorageOrder, int OtherStorageOrder>
struct triangular_solve_matrix;
// small helper struct extracting some traits of the underlying solver operation
template<typename Lhs, typename Rhs, int Side>
class trsolve_traits
{
private:
enum {
RhsIsVectorAtCompileTime = (Side==OnTheLeft ? Rhs::ColsAtCompileTime : Rhs::RowsAtCompileTime)==1
};
public:
enum {
Unrolling = (RhsIsVectorAtCompileTime && Rhs::SizeAtCompileTime != Dynamic && Rhs::SizeAtCompileTime <= 8)
? CompleteUnrolling : NoUnrolling,
RhsVectors = RhsIsVectorAtCompileTime ? 1 : Dynamic
};
};
template<typename Lhs, typename Rhs,
int Side, // can be OnTheLeft/OnTheRight
int Mode, // can be Upper/Lower | UnitDiag
int Unrolling = trsolve_traits<Lhs,Rhs,Side>::Unrolling,
int RhsVectors = trsolve_traits<Lhs,Rhs,Side>::RhsVectors
>
struct triangular_solver_selector;
template<typename Lhs, typename Rhs, int Side, int Mode>
struct triangular_solver_selector<Lhs,Rhs,Side,Mode,NoUnrolling,1>
{
typedef typename Lhs::Scalar LhsScalar;
typedef typename Rhs::Scalar RhsScalar;
typedef blas_traits<Lhs> LhsProductTraits;
typedef typename LhsProductTraits::ExtractType ActualLhsType;
typedef Map<Matrix<RhsScalar,Dynamic,1>, Aligned> MappedRhs;
static void run(const Lhs& lhs, Rhs& rhs)
{
ActualLhsType actualLhs = LhsProductTraits::extract(lhs);
// FIXME find a way to allow an inner stride if packet_traits<Scalar>::size==1
bool useRhsDirectly = Rhs::InnerStrideAtCompileTime==1 || rhs.innerStride()==1;
ei_declare_aligned_stack_constructed_variable(RhsScalar,actualRhs,rhs.size(),
(useRhsDirectly ? rhs.data() : 0));
if(!useRhsDirectly)
MappedRhs(actualRhs,rhs.size()) = rhs;
triangular_solve_vector<LhsScalar, RhsScalar, Index, Side, Mode, LhsProductTraits::NeedToConjugate,
(int(Lhs::Flags) & RowMajorBit) ? RowMajor : ColMajor>
::run(actualLhs.cols(), actualLhs.data(), actualLhs.outerStride(), actualRhs);
if(!useRhsDirectly)
rhs = MappedRhs(actualRhs, rhs.size());
}
};
// the rhs is a matrix
template<typename Lhs, typename Rhs, int Side, int Mode>
struct triangular_solver_selector<Lhs,Rhs,Side,Mode,NoUnrolling,Dynamic>
{
typedef typename Rhs::Scalar Scalar;
typedef blas_traits<Lhs> LhsProductTraits;
typedef typename LhsProductTraits::DirectLinearAccessType ActualLhsType;
static void run(const Lhs& lhs, Rhs& rhs)
{
typename internal::add_const_on_value_type<ActualLhsType>::type actualLhs = LhsProductTraits::extract(lhs);
const Index size = lhs.rows();
const Index othersize = Side==OnTheLeft? rhs.cols() : rhs.rows();
typedef internal::gemm_blocking_space<(Rhs::Flags&RowMajorBit) ? RowMajor : ColMajor,Scalar,Scalar,
Rhs::MaxRowsAtCompileTime, Rhs::MaxColsAtCompileTime, Lhs::MaxRowsAtCompileTime,4> BlockingType;
BlockingType blocking(rhs.rows(), rhs.cols(), size, 1, false);
triangular_solve_matrix<Scalar,Index,Side,Mode,LhsProductTraits::NeedToConjugate,(int(Lhs::Flags) & RowMajorBit) ? RowMajor : ColMajor,
(Rhs::Flags&RowMajorBit) ? RowMajor : ColMajor>
::run(size, othersize, &actualLhs.coeffRef(0,0), actualLhs.outerStride(), &rhs.coeffRef(0,0), rhs.outerStride(), blocking);
}
};
/***************************************************************************
* meta-unrolling implementation
***************************************************************************/
template<typename Lhs, typename Rhs, int Mode, int LoopIndex, int Size,
bool Stop = LoopIndex==Size>
struct triangular_solver_unroller;
template<typename Lhs, typename Rhs, int Mode, int LoopIndex, int Size>
struct triangular_solver_unroller<Lhs,Rhs,Mode,LoopIndex,Size,false> {
enum {
IsLower = ((Mode&Lower)==Lower),
DiagIndex = IsLower ? LoopIndex : Size - LoopIndex - 1,
StartIndex = IsLower ? 0 : DiagIndex+1
};
static void run(const Lhs& lhs, Rhs& rhs)
{
if (LoopIndex>0)
rhs.coeffRef(DiagIndex) -= lhs.row(DiagIndex).template segment<LoopIndex>(StartIndex).transpose()
.cwiseProduct(rhs.template segment<LoopIndex>(StartIndex)).sum();
if(!(Mode & UnitDiag))
rhs.coeffRef(DiagIndex) /= lhs.coeff(DiagIndex,DiagIndex);
triangular_solver_unroller<Lhs,Rhs,Mode,LoopIndex+1,Size>::run(lhs,rhs);
}
};
template<typename Lhs, typename Rhs, int Mode, int LoopIndex, int Size>
struct triangular_solver_unroller<Lhs,Rhs,Mode,LoopIndex,Size,true> {
static void run(const Lhs&, Rhs&) {}
};
template<typename Lhs, typename Rhs, int Mode>
struct triangular_solver_selector<Lhs,Rhs,OnTheLeft,Mode,CompleteUnrolling,1> {
static void run(const Lhs& lhs, Rhs& rhs)
{ triangular_solver_unroller<Lhs,Rhs,Mode,0,Rhs::SizeAtCompileTime>::run(lhs,rhs); }
};
template<typename Lhs, typename Rhs, int Mode>
struct triangular_solver_selector<Lhs,Rhs,OnTheRight,Mode,CompleteUnrolling,1> {
static void run(const Lhs& lhs, Rhs& rhs)
{
Transpose<const Lhs> trLhs(lhs);
Transpose<Rhs> trRhs(rhs);
triangular_solver_unroller<Transpose<const Lhs>,Transpose<Rhs>,
((Mode&Upper)==Upper ? Lower : Upper) | (Mode&UnitDiag),
0,Rhs::SizeAtCompileTime>::run(trLhs,trRhs);
}
};
} // end namespace internal
/***************************************************************************
* TriangularView methods
***************************************************************************/
#ifndef EIGEN_PARSED_BY_DOXYGEN
template<typename MatrixType, unsigned int Mode>
template<int Side, typename OtherDerived>
void TriangularViewImpl<MatrixType,Mode,Dense>::solveInPlace(const MatrixBase<OtherDerived>& _other) const
{
OtherDerived& other = _other.const_cast_derived();
eigen_assert( derived().cols() == derived().rows() && ((Side==OnTheLeft && derived().cols() == other.rows()) || (Side==OnTheRight && derived().cols() == other.cols())) );
eigen_assert((!(Mode & ZeroDiag)) && bool(Mode & (Upper|Lower)));
enum { copy = (internal::traits<OtherDerived>::Flags & RowMajorBit) && OtherDerived::IsVectorAtCompileTime && OtherDerived::SizeAtCompileTime!=1};
typedef typename internal::conditional<copy,
typename internal::plain_matrix_type_column_major<OtherDerived>::type, OtherDerived&>::type OtherCopy;
OtherCopy otherCopy(other);
internal::triangular_solver_selector<MatrixType, typename internal::remove_reference<OtherCopy>::type,
Side, Mode>::run(derived().nestedExpression(), otherCopy);
if (copy)
other = otherCopy;
}
template<typename Derived, unsigned int Mode>
template<int Side, typename Other>
const internal::triangular_solve_retval<Side,TriangularView<Derived,Mode>,Other>
TriangularViewImpl<Derived,Mode,Dense>::solve(const MatrixBase<Other>& other) const
{
return internal::triangular_solve_retval<Side,TriangularViewType,Other>(derived(), other.derived());
}
#endif
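// A hedged usage sketch of the two entry points above (editorial; hypothetical
// values). solve() returns the solution, solveInPlace() overwrites its argument:
//
//   Eigen::Matrix3d L = Eigen::Matrix3d::Identity();
//   L(1,0) = 2; L(2,0) = 1; L(2,1) = 3;
//   Eigen::Vector3d b(1, 2, 3);
//   Eigen::Vector3d x = L.triangularView<Eigen::Lower>().solve(b);  // solves L*x = b
//   L.triangularView<Eigen::Lower>().solveInPlace(b);               // b <- L^{-1} b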
namespace internal {
template<int Side, typename TriangularType, typename Rhs>
struct traits<triangular_solve_retval<Side, TriangularType, Rhs> >
{
typedef typename internal::plain_matrix_type_column_major<Rhs>::type ReturnType;
};
template<int Side, typename TriangularType, typename Rhs> struct triangular_solve_retval
: public ReturnByValue<triangular_solve_retval<Side, TriangularType, Rhs> >
{
typedef typename remove_all<typename Rhs::Nested>::type RhsNestedCleaned;
typedef ReturnByValue<triangular_solve_retval> Base;
triangular_solve_retval(const TriangularType& tri, const Rhs& rhs)
: m_triangularMatrix(tri), m_rhs(rhs)
{}
inline Index rows() const { return m_rhs.rows(); }
inline Index cols() const { return m_rhs.cols(); }
template<typename Dest> inline void evalTo(Dest& dst) const
{
if(!is_same_dense(dst,m_rhs))
dst = m_rhs;
m_triangularMatrix.template solveInPlace<Side>(dst);
}
protected:
const TriangularType& m_triangularMatrix;
typename Rhs::Nested m_rhs;
};
} // namespace internal
} // end namespace Eigen
#endif // EIGEN_SOLVETRIANGULAR_H
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2015 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_SOLVERBASE_H
#define EIGEN_SOLVERBASE_H
namespace Eigen {
namespace internal {
} // end namespace internal
/** \class SolverBase
* \brief A base class for matrix decomposition and solvers
*
* \tparam Derived the actual type of the decomposition/solver.
*
* Any matrix decomposition inheriting this base class provides the following API:
*
* \code
* MatrixType A, b, x;
* DecompositionType dec(A);
* x = dec.solve(b); // solve A * x = b
* x = dec.transpose().solve(b); // solve A^T * x = b
* x = dec.adjoint().solve(b); // solve A' * x = b
* \endcode
*
* \warning Currently, any other usage of transpose() and adjoint() is not supported and will produce a compilation error.
*
* \sa class PartialPivLU, class FullPivLU
*/
template<typename Derived>
class SolverBase : public EigenBase<Derived>
{
public:
typedef EigenBase<Derived> Base;
typedef typename internal::traits<Derived>::Scalar Scalar;
typedef Scalar CoeffReturnType;
enum {
RowsAtCompileTime = internal::traits<Derived>::RowsAtCompileTime,
ColsAtCompileTime = internal::traits<Derived>::ColsAtCompileTime,
SizeAtCompileTime = (internal::size_at_compile_time<internal::traits<Derived>::RowsAtCompileTime,
internal::traits<Derived>::ColsAtCompileTime>::ret),
MaxRowsAtCompileTime = internal::traits<Derived>::MaxRowsAtCompileTime,
MaxColsAtCompileTime = internal::traits<Derived>::MaxColsAtCompileTime,
MaxSizeAtCompileTime = (internal::size_at_compile_time<internal::traits<Derived>::MaxRowsAtCompileTime,
internal::traits<Derived>::MaxColsAtCompileTime>::ret),
IsVectorAtCompileTime = internal::traits<Derived>::MaxRowsAtCompileTime == 1
|| internal::traits<Derived>::MaxColsAtCompileTime == 1
};
/** Default constructor */
SolverBase()
{}
~SolverBase()
{}
using Base::derived;
/** \returns an expression of the solution x of \f$ A x = b \f$ using the current decomposition of A.
*/
template<typename Rhs>
inline const Solve<Derived, Rhs>
solve(const MatrixBase<Rhs>& b) const
{
eigen_assert(derived().rows()==b.rows() && "solve(): invalid number of rows of the right hand side matrix b");
return Solve<Derived, Rhs>(derived(), b.derived());
}
/** \internal the return type of transpose() */
typedef typename internal::add_const<Transpose<const Derived> >::type ConstTransposeReturnType;
/** \returns an expression of the transposed of the factored matrix.
*
* A typical usage is to solve for the transposed problem A^T x = b:
* \code x = dec.transpose().solve(b); \endcode
*
* \sa adjoint(), solve()
*/
inline ConstTransposeReturnType transpose() const
{
return ConstTransposeReturnType(derived());
}
/** \internal the return type of adjoint() */
typedef typename internal::conditional<NumTraits<Scalar>::IsComplex,
CwiseUnaryOp<internal::scalar_conjugate_op<Scalar>, ConstTransposeReturnType>,
ConstTransposeReturnType
>::type AdjointReturnType;
/** \returns an expression of the adjoint of the factored matrix
*
* A typical usage is to solve for the adjoint problem A' x = b:
* \code x = dec.adjoint().solve(b); \endcode
*
* For real scalar types, this function is equivalent to transpose().
*
* \sa transpose(), solve()
*/
inline AdjointReturnType adjoint() const
{
return AdjointReturnType(derived().transpose());
}
protected:
};
namespace internal {
template<typename Derived>
struct generic_xpr_base<Derived, MatrixXpr, SolverStorage>
{
typedef SolverBase<Derived> type;
};
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_SOLVERBASE_H
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_STABLENORM_H
#define EIGEN_STABLENORM_H
namespace Eigen {
namespace internal {
template<typename ExpressionType, typename Scalar>
inline void stable_norm_kernel(const ExpressionType& bl, Scalar& ssq, Scalar& scale, Scalar& invScale)
{
Scalar maxCoeff = bl.cwiseAbs().maxCoeff();
if(maxCoeff>scale)
{
ssq = ssq * numext::abs2(scale/maxCoeff);
Scalar tmp = Scalar(1)/maxCoeff;
if(tmp > NumTraits<Scalar>::highest())
{
invScale = NumTraits<Scalar>::highest();
scale = Scalar(1)/invScale;
}
else if(maxCoeff>NumTraits<Scalar>::highest()) // we got a INF
{
invScale = Scalar(1);
scale = maxCoeff;
}
else
{
scale = maxCoeff;
invScale = tmp;
}
}
else if(maxCoeff!=maxCoeff) // we got a NaN
{
scale = maxCoeff;
}
// TODO if the maxCoeff is much much smaller than the current scale,
// then we can neglect this sub vector
if(scale>Scalar(0)) // if scale==0, then bl is 0
ssq += (bl*invScale).squaredNorm();
}
template<typename Derived>
inline typename NumTraits<typename traits<Derived>::Scalar>::Real
blueNorm_impl(const EigenBase<Derived>& _vec)
{
typedef typename Derived::RealScalar RealScalar;
using std::pow;
using std::sqrt;
using std::abs;
const Derived& vec(_vec.derived());
static bool initialized = false;
static RealScalar b1, b2, s1m, s2m, rbig, relerr;
if(!initialized)
{
int ibeta, it, iemin, iemax, iexp;
RealScalar eps;
// This program calculates the machine-dependent constants
// b1, b2, s1m, s2m, relerr
// from the "basic" machine-dependent numbers
// ibeta, it, iemin, iemax, rbig.
// The following define the basic machine-dependent constants;
// the original Fortran obtained them from the PORT subprograms
// "i1mach" and "r1mach", while here std::numeric_limits provides them.
ibeta = std::numeric_limits<RealScalar>::radix; // base for floating-point numbers
it = std::numeric_limits<RealScalar>::digits; // number of base-beta digits in mantissa
iemin = std::numeric_limits<RealScalar>::min_exponent; // minimum exponent
iemax = std::numeric_limits<RealScalar>::max_exponent; // maximum exponent
rbig = (std::numeric_limits<RealScalar>::max)(); // largest floating-point number
iexp = -((1-iemin)/2);
b1 = RealScalar(pow(RealScalar(ibeta),RealScalar(iexp))); // lower boundary of midrange
iexp = (iemax + 1 - it)/2;
b2 = RealScalar(pow(RealScalar(ibeta),RealScalar(iexp))); // upper boundary of midrange
iexp = (2-iemin)/2;
s1m = RealScalar(pow(RealScalar(ibeta),RealScalar(iexp))); // scaling factor for lower range
iexp = - ((iemax+it)/2);
s2m = RealScalar(pow(RealScalar(ibeta),RealScalar(iexp))); // scaling factor for upper range
eps = RealScalar(pow(double(ibeta), 1-it));
relerr = sqrt(eps); // tolerance for neglecting asml
initialized = true;
}
Index n = vec.size();
RealScalar ab2 = b2 / RealScalar(n);
RealScalar asml = RealScalar(0);
RealScalar amed = RealScalar(0);
RealScalar abig = RealScalar(0);
for(typename Derived::InnerIterator it(vec, 0); it; ++it)
{
RealScalar ax = abs(it.value());
if(ax > ab2) abig += numext::abs2(ax*s2m);
else if(ax < b1) asml += numext::abs2(ax*s1m);
else amed += numext::abs2(ax);
}
if(amed!=amed)
return amed; // we got a NaN
if(abig > RealScalar(0))
{
abig = sqrt(abig);
if(abig > rbig) // overflow, or *this contains INF values
return abig; // return INF
if(amed > RealScalar(0))
{
abig = abig/s2m;
amed = sqrt(amed);
}
else
return abig/s2m;
}
else if(asml > RealScalar(0))
{
if (amed > RealScalar(0))
{
abig = sqrt(amed);
amed = sqrt(asml) / s1m;
}
else
return sqrt(asml)/s1m;
}
else
return sqrt(amed);
asml = numext::mini(abig, amed);
abig = numext::maxi(abig, amed);
if(asml <= abig*relerr)
return abig;
else
return abig * sqrt(RealScalar(1) + numext::abs2(asml/abig));
}
} // end namespace internal
/** \returns the \em l2 norm of \c *this avoiding underflow and overflow.
* This version uses a blockwise two-pass algorithm:
*  1 - find the coefficient with the largest absolute value, \c s
*  2 - compute \f$ s \Vert \frac{*this}{s} \Vert \f$ in a standard way
*
* For architecture/scalar types supporting vectorization, this version
* is faster than blueNorm(). Otherwise the blueNorm() is much faster.
*
* \sa norm(), blueNorm(), hypotNorm()
*/
template<typename Derived>
inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real
MatrixBase<Derived>::stableNorm() const
{
using std::sqrt;
using std::abs;
const Index blockSize = 4096;
RealScalar scale(0);
RealScalar invScale(1);
RealScalar ssq(0); // sum of square
typedef typename internal::nested_eval<Derived,2>::type DerivedCopy;
typedef typename internal::remove_all<DerivedCopy>::type DerivedCopyClean;
DerivedCopy copy(derived());
enum {
CanAlign = ( (int(DerivedCopyClean::Flags)&DirectAccessBit)
|| (int(internal::evaluator<DerivedCopyClean>::Alignment)>0) // FIXME Alignment)>0 might not be enough
) && (blockSize*sizeof(Scalar)*2<EIGEN_STACK_ALLOCATION_LIMIT)
&& (EIGEN_MAX_STATIC_ALIGN_BYTES>0) // if we cannot allocate on the stack, then let's not bother about this optimization
};
typedef typename internal::conditional<CanAlign, Ref<const Matrix<Scalar,Dynamic,1,0,blockSize,1>, internal::evaluator<DerivedCopyClean>::Alignment>,
typename DerivedCopyClean::ConstSegmentReturnType>::type SegmentWrapper;
Index n = size();
if(n==1)
return abs(this->coeff(0));
Index bi = internal::first_default_aligned(copy);
if (bi>0)
internal::stable_norm_kernel(copy.head(bi), ssq, scale, invScale);
for (; bi<n; bi+=blockSize)
internal::stable_norm_kernel(SegmentWrapper(copy.segment(bi,numext::mini(blockSize, n - bi))), ssq, scale, invScale);
return scale * sqrt(ssq);
}
/** \returns the \em l2 norm of \c *this using Blue's algorithm.
* See: J. L. Blue, "A Portable Fortran Program to Find the Euclidean Norm of a Vector",
* ACM TOMS, Vol 4, Issue 1, 1978.
*
* For architecture/scalar types without vectorization, this version
* is much faster than stableNorm(). Otherwise the stableNorm() is faster.
*
* \sa norm(), stableNorm(), hypotNorm()
*/
template<typename Derived>
inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real
MatrixBase<Derived>::blueNorm() const
{
return internal::blueNorm_impl(*this);
}
/** \returns the \em l2 norm of \c *this avoiding underflow and overflow.
* This version uses a concatenation of hypot() calls, and it is very slow.
*
* \sa norm(), stableNorm()
*/
template<typename Derived>
inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real
MatrixBase<Derived>::hypotNorm() const
{
return this->cwiseAbs().redux(internal::scalar_hypot_op<RealScalar>());
}
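// A hedged sketch contrasting the robust norms above (editorial; the value is
// hypothetical, chosen so that plain norm() overflows in single precision):
//
//   Eigen::VectorXf v = Eigen::VectorXf::Constant(4, 1e20f);
//   // v.norm() squares 1e20f, overflowing to inf
//   float s  = v.stableNorm();  // ~2e20f, blockwise rescaling
//   float bn = v.blueNorm();    // same value, Blue's algorithm
//   float hn = v.hypotNorm();   // same value, repeated hypot(), slowest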
} // end namespace Eigen
#endif // EIGEN_STABLENORM_H
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2010 Benoit Jacob <jacob.benoit.1@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_STRIDE_H
#define EIGEN_STRIDE_H
namespace Eigen {
/** \class Stride
* \ingroup Core_Module
*
* \brief Holds strides information for Map
*
* This class holds the strides information for mapping arrays with strides with class Map.
*
* It holds two values: the inner stride and the outer stride.
*
* The inner stride is the pointer increment between two consecutive entries within a given row of a
* row-major matrix or within a given column of a column-major matrix.
*
* The outer stride is the pointer increment between two consecutive rows of a row-major matrix or
* between two consecutive columns of a column-major matrix.
*
* These two values can be passed either at compile-time as template parameters, or at runtime as
* arguments to the constructor.
*
* Accordingly, this class takes two template parameters:
* \tparam _OuterStrideAtCompileTime the outer stride, or Dynamic if you want to specify it at runtime.
* \tparam _InnerStrideAtCompileTime the inner stride, or Dynamic if you want to specify it at runtime.
*
* Here is an example:
* \include Map_general_stride.cpp
* Output: \verbinclude Map_general_stride.out
*
* \sa class InnerStride, class OuterStride, \ref TopicStorageOrders
*/
template<int _OuterStrideAtCompileTime, int _InnerStrideAtCompileTime>
class Stride
{
public:
typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
enum {
InnerStrideAtCompileTime = _InnerStrideAtCompileTime,
OuterStrideAtCompileTime = _OuterStrideAtCompileTime
};
/** Default constructor, for use when strides are fixed at compile time */
EIGEN_DEVICE_FUNC
Stride()
: m_outer(OuterStrideAtCompileTime), m_inner(InnerStrideAtCompileTime)
{
eigen_assert(InnerStrideAtCompileTime != Dynamic && OuterStrideAtCompileTime != Dynamic);
}
/** Constructor allowing to pass the strides at runtime */
EIGEN_DEVICE_FUNC
Stride(Index outerStride, Index innerStride)
: m_outer(outerStride), m_inner(innerStride)
{
eigen_assert(innerStride>=0 && outerStride>=0);
}
/** Copy constructor */
EIGEN_DEVICE_FUNC
Stride(const Stride& other)
: m_outer(other.outer()), m_inner(other.inner())
{}
/** \returns the outer stride */
EIGEN_DEVICE_FUNC
inline Index outer() const { return m_outer.value(); }
/** \returns the inner stride */
EIGEN_DEVICE_FUNC
inline Index inner() const { return m_inner.value(); }
protected:
internal::variable_if_dynamic<Index, OuterStrideAtCompileTime> m_outer;
internal::variable_if_dynamic<Index, InnerStrideAtCompileTime> m_inner;
};
/** \brief Convenience specialization of Stride to specify only an inner stride
* See class Map for some examples */
template<int Value>
class InnerStride : public Stride<0, Value>
{
typedef Stride<0, Value> Base;
public:
EIGEN_DEVICE_FUNC InnerStride() : Base() {}
EIGEN_DEVICE_FUNC InnerStride(Index v) : Base(0, v) {} // FIXME making this explicit could break valid code
};
/** \brief Convenience specialization of Stride to specify only an outer stride
* See class Map for some examples */
template<int Value>
class OuterStride : public Stride<Value, 0>
{
typedef Stride<Value, 0> Base;
public:
EIGEN_DEVICE_FUNC OuterStride() : Base() {}
EIGEN_DEVICE_FUNC OuterStride(Index v) : Base(v,0) {} // FIXME making this explicit could break valid code
};
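// A hedged sketch of Stride used with Map (editorial; hypothetical buffer).
// Here the outer stride is dynamic (4 at runtime) and the inner stride is
// fixed at compile time (2):
//
//   int data[8] = {0, 1, 2, 3, 4, 5, 6, 7};
//   Eigen::Map<Eigen::MatrixXi, 0, Eigen::Stride<Eigen::Dynamic, 2> >
//       m(data, 2, 2, Eigen::Stride<Eigen::Dynamic, 2>(4, 2));
//   // column-major: m(i,j) == data[4*j + 2*i], so m == [0 4; 2 6]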
} // end namespace Eigen
#endif // EIGEN_STRIDE_H
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_SWAP_H
#define EIGEN_SWAP_H
namespace Eigen {
namespace internal {
// Overload the default assignPacket behavior to swap packets between source and destination
template<typename DstEvaluatorTypeT, typename SrcEvaluatorTypeT>
class generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT, swap_assign_op<typename DstEvaluatorTypeT::Scalar>, Specialized>
: public generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT, swap_assign_op<typename DstEvaluatorTypeT::Scalar>, BuiltIn>
{
protected:
typedef generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT, swap_assign_op<typename DstEvaluatorTypeT::Scalar>, BuiltIn> Base;
using Base::m_dst;
using Base::m_src;
using Base::m_functor;
public:
typedef typename Base::Scalar Scalar;
typedef typename Base::DstXprType DstXprType;
typedef swap_assign_op<Scalar> Functor;
EIGEN_DEVICE_FUNC generic_dense_assignment_kernel(DstEvaluatorTypeT &dst, const SrcEvaluatorTypeT &src, const Functor &func, DstXprType& dstExpr)
: Base(dst, src, func, dstExpr)
{}
template<int StoreMode, int LoadMode, typename PacketType>
void assignPacket(Index row, Index col)
{
PacketType tmp = m_src.template packet<LoadMode,PacketType>(row,col);
const_cast<SrcEvaluatorTypeT&>(m_src).template writePacket<LoadMode>(row,col, m_dst.template packet<StoreMode,PacketType>(row,col));
m_dst.template writePacket<StoreMode>(row,col,tmp);
}
template<int StoreMode, int LoadMode, typename PacketType>
void assignPacket(Index index)
{
PacketType tmp = m_src.template packet<LoadMode,PacketType>(index);
const_cast<SrcEvaluatorTypeT&>(m_src).template writePacket<LoadMode>(index, m_dst.template packet<StoreMode,PacketType>(index));
m_dst.template writePacket<StoreMode>(index,tmp);
}
// TODO find a simple way not to have to copy/paste this function from generic_dense_assignment_kernel, by simple I mean no CRTP (Gael)
template<int StoreMode, int LoadMode, typename PacketType>
void assignPacketByOuterInner(Index outer, Index inner)
{
Index row = Base::rowIndexByOuterInner(outer, inner);
Index col = Base::colIndexByOuterInner(outer, inner);
assignPacket<StoreMode,LoadMode,PacketType>(row, col);
}
};
} // namespace internal
} // end namespace Eigen
#endif // EIGEN_SWAP_H
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
// Copyright (C) 2009-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_TRANSPOSE_H
#define EIGEN_TRANSPOSE_H
namespace Eigen {
namespace internal {
template<typename MatrixType>
struct traits<Transpose<MatrixType> > : public traits<MatrixType>
{
typedef typename ref_selector<MatrixType>::type MatrixTypeNested;
typedef typename remove_reference<MatrixTypeNested>::type MatrixTypeNestedPlain;
enum {
RowsAtCompileTime = MatrixType::ColsAtCompileTime,
ColsAtCompileTime = MatrixType::RowsAtCompileTime,
MaxRowsAtCompileTime = MatrixType::MaxColsAtCompileTime,
MaxColsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
FlagsLvalueBit = is_lvalue<MatrixType>::value ? LvalueBit : 0,
Flags0 = traits<MatrixTypeNestedPlain>::Flags & ~(LvalueBit | NestByRefBit),
Flags1 = Flags0 | FlagsLvalueBit,
Flags = Flags1 ^ RowMajorBit,
InnerStrideAtCompileTime = inner_stride_at_compile_time<MatrixType>::ret,
OuterStrideAtCompileTime = outer_stride_at_compile_time<MatrixType>::ret
};
};
}
template<typename MatrixType, typename StorageKind> class TransposeImpl;
/** \class Transpose
* \ingroup Core_Module
*
* \brief Expression of the transpose of a matrix
*
* \tparam MatrixType the type of the object of which we are taking the transpose
*
* This class represents an expression of the transpose of a matrix.
* It is the return type of MatrixBase::transpose() and MatrixBase::adjoint()
* and most of the time this is the only way it is used.
*
* \sa MatrixBase::transpose(), MatrixBase::adjoint()
*/
template<typename MatrixType> class Transpose
: public TransposeImpl<MatrixType,typename internal::traits<MatrixType>::StorageKind>
{
public:
typedef typename internal::ref_selector<MatrixType>::non_const_type MatrixTypeNested;
typedef typename TransposeImpl<MatrixType,typename internal::traits<MatrixType>::StorageKind>::Base Base;
EIGEN_GENERIC_PUBLIC_INTERFACE(Transpose)
typedef typename internal::remove_all<MatrixType>::type NestedExpression;
EIGEN_DEVICE_FUNC
explicit inline Transpose(MatrixType& matrix) : m_matrix(matrix) {}
EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Transpose)
EIGEN_DEVICE_FUNC inline Index rows() const { return m_matrix.cols(); }
EIGEN_DEVICE_FUNC inline Index cols() const { return m_matrix.rows(); }
/** \returns the nested expression */
EIGEN_DEVICE_FUNC
const typename internal::remove_all<MatrixTypeNested>::type&
nestedExpression() const { return m_matrix; }
/** \returns the nested expression */
EIGEN_DEVICE_FUNC
typename internal::remove_reference<MatrixTypeNested>::type&
nestedExpression() { return m_matrix; }
/** \internal */
void resize(Index nrows, Index ncols) {
m_matrix.resize(ncols,nrows);
}
protected:
typename internal::ref_selector<MatrixType>::non_const_type m_matrix;
};
namespace internal {
template<typename MatrixType, bool HasDirectAccess = has_direct_access<MatrixType>::ret>
struct TransposeImpl_base
{
typedef typename dense_xpr_base<Transpose<MatrixType> >::type type;
};
template<typename MatrixType>
struct TransposeImpl_base<MatrixType, false>
{
typedef typename dense_xpr_base<Transpose<MatrixType> >::type type;
};
} // end namespace internal
// Generic API dispatcher
template<typename XprType, typename StorageKind>
class TransposeImpl
: public internal::generic_xpr_base<Transpose<XprType> >::type
{
public:
typedef typename internal::generic_xpr_base<Transpose<XprType> >::type Base;
};
template<typename MatrixType> class TransposeImpl<MatrixType,Dense>
: public internal::TransposeImpl_base<MatrixType>::type
{
public:
typedef typename internal::TransposeImpl_base<MatrixType>::type Base;
using Base::coeffRef;
EIGEN_DENSE_PUBLIC_INTERFACE(Transpose<MatrixType>)
EIGEN_INHERIT_ASSIGNMENT_OPERATORS(TransposeImpl)
EIGEN_DEVICE_FUNC inline Index innerStride() const { return derived().nestedExpression().innerStride(); }
EIGEN_DEVICE_FUNC inline Index outerStride() const { return derived().nestedExpression().outerStride(); }
typedef typename internal::conditional<
internal::is_lvalue<MatrixType>::value,
Scalar,
const Scalar
>::type ScalarWithConstIfNotLvalue;
EIGEN_DEVICE_FUNC inline ScalarWithConstIfNotLvalue* data() { return derived().nestedExpression().data(); }
EIGEN_DEVICE_FUNC inline const Scalar* data() const { return derived().nestedExpression().data(); }
// FIXME: shall we keep the const version of coeffRef?
EIGEN_DEVICE_FUNC
inline const Scalar& coeffRef(Index rowId, Index colId) const
{
return derived().nestedExpression().coeffRef(colId, rowId);
}
EIGEN_DEVICE_FUNC
inline const Scalar& coeffRef(Index index) const
{
return derived().nestedExpression().coeffRef(index);
}
};
/** \returns an expression of the transpose of *this.
*
* Example: \include MatrixBase_transpose.cpp
* Output: \verbinclude MatrixBase_transpose.out
*
* \warning If you want to replace a matrix by its own transpose, do \b NOT do this:
* \code
* m = m.transpose(); // bug!!! caused by aliasing effect
* \endcode
* Instead, use the transposeInPlace() method:
* \code
* m.transposeInPlace();
* \endcode
* which gives Eigen good opportunities for optimization, or alternatively you can do:
* \code
* m = m.transpose().eval();
* \endcode
*
* \sa transposeInPlace(), adjoint() */
template<typename Derived>
inline Transpose<Derived>
DenseBase<Derived>::transpose()
{
return TransposeReturnType(derived());
}
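// Illustrative usage (editor's sketch, not part of the original sources):
// taking a transpose safely, given the aliasing pitfall documented above.
//   Eigen::Matrix2i m;
//   m << 1, 2,
//        3, 4;
//   Eigen::Matrix2i t = m.transpose();  // fine: t is a distinct matrix
//   m = m.transpose().eval();           // fine: eval() breaks the aliasing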
/** This is the const version of transpose().
*
* Make sure you read the warning for transpose() !
*
* \sa transposeInPlace(), adjoint() */
template<typename Derived>
inline typename DenseBase<Derived>::ConstTransposeReturnType
DenseBase<Derived>::transpose() const
{
return ConstTransposeReturnType(derived());
}
/** \returns an expression of the adjoint (i.e. conjugate transpose) of *this.
*
* Example: \include MatrixBase_adjoint.cpp
* Output: \verbinclude MatrixBase_adjoint.out
*
* \warning If you want to replace a matrix by its own adjoint, do \b NOT do this:
* \code
* m = m.adjoint(); // bug!!! caused by aliasing effect
* \endcode
* Instead, use the adjointInPlace() method:
* \code
* m.adjointInPlace();
* \endcode
* which gives Eigen good opportunities for optimization, or alternatively you can do:
* \code
* m = m.adjoint().eval();
* \endcode
*
* \sa adjointInPlace(), transpose(), conjugate(), class Transpose, class internal::scalar_conjugate_op */
template<typename Derived>
inline const typename MatrixBase<Derived>::AdjointReturnType
MatrixBase<Derived>::adjoint() const
{
return AdjointReturnType(this->transpose());
}
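// Illustrative usage (editor's sketch): on a complex matrix, adjoint() is the
// conjugate transpose; on a real matrix it coincides with transpose().
//   Eigen::Matrix2cf c;
//   c << std::complex<float>(1,2), std::complex<float>(0,1),
//        std::complex<float>(3,0), std::complex<float>(4,-1);
//   Eigen::Matrix2cf a = c.adjoint();   // a(i,j) == std::conj(c(j,i))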
/***************************************************************************
* "in place" transpose implementation
***************************************************************************/
namespace internal {
template<typename MatrixType,
bool IsSquare = (MatrixType::RowsAtCompileTime == MatrixType::ColsAtCompileTime) && MatrixType::RowsAtCompileTime!=Dynamic,
bool MatchPacketSize =
(int(MatrixType::RowsAtCompileTime) == int(internal::packet_traits<typename MatrixType::Scalar>::size))
&& (internal::evaluator<MatrixType>::Flags&PacketAccessBit) >
struct inplace_transpose_selector;
template<typename MatrixType>
struct inplace_transpose_selector<MatrixType,true,false> { // square matrix
static void run(MatrixType& m) {
m.matrix().template triangularView<StrictlyUpper>().swap(m.matrix().transpose());
}
};
// TODO: vectorized path is currently limited to LargestPacketSize x LargestPacketSize cases only.
template<typename MatrixType>
struct inplace_transpose_selector<MatrixType,true,true> { // PacketSize x PacketSize
static void run(MatrixType& m) {
typedef typename MatrixType::Scalar Scalar;
typedef typename internal::packet_traits<typename MatrixType::Scalar>::type Packet;
const Index PacketSize = internal::packet_traits<Scalar>::size;
const Index Alignment = internal::evaluator<MatrixType>::Alignment;
PacketBlock<Packet> A;
for (Index i=0; i<PacketSize; ++i)
A.packet[i] = m.template packetByOuterInner<Alignment>(i,0);
internal::ptranspose(A);
for (Index i=0; i<PacketSize; ++i)
m.template writePacket<Alignment>(m.rowIndexByOuterInner(i,0), m.colIndexByOuterInner(i,0), A.packet[i]);
}
};
template<typename MatrixType,bool MatchPacketSize>
struct inplace_transpose_selector<MatrixType,false,MatchPacketSize> { // non square matrix
static void run(MatrixType& m) {
if (m.rows()==m.cols())
m.matrix().template triangularView<StrictlyUpper>().swap(m.matrix().transpose());
else
m = m.transpose().eval();
}
};
} // end namespace internal
/** This is the "in place" version of transpose(): it replaces \c *this by its own transpose.
* Thus, doing
* \code
* m.transposeInPlace();
* \endcode
* has the same effect on m as doing
* \code
* m = m.transpose().eval();
* \endcode
* and is faster and safer because, in the latter line of code, forgetting the eval() results
* in a bug caused by \ref TopicAliasing "aliasing".
*
* Notice however that this method is only useful if you want to replace a matrix by its own transpose.
* If you just need the transpose of a matrix, use transpose().
*
* \note if the matrix is not square, then \c *this must be a resizable matrix.
* This excludes (non-square) fixed-size matrices, block-expressions and maps.
*
* \sa transpose(), adjoint(), adjointInPlace() */
template<typename Derived>
inline void DenseBase<Derived>::transposeInPlace()
{
eigen_assert((rows() == cols() || (RowsAtCompileTime == Dynamic && ColsAtCompileTime == Dynamic))
&& "transposeInPlace() called on a non-square non-resizable matrix");
internal::inplace_transpose_selector<Derived>::run(derived());
}
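// Illustrative usage (editor's sketch): on a resizable non-square matrix,
// transposeInPlace() also resizes, here from 2x3 to 3x2.
//   Eigen::MatrixXd m(2,3);
//   m << 1, 2, 3,
//        4, 5, 6;
//   m.transposeInPlace();               // m is now the 3x2 transpose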
/***************************************************************************
* "in place" adjoint implementation
***************************************************************************/
/** This is the "in place" version of adjoint(): it replaces \c *this by its own transpose.
* Thus, doing
* \code
* m.adjointInPlace();
* \endcode
* has the same effect on m as doing
* \code
* m = m.adjoint().eval();
* \endcode
* and is faster and safer because, in the latter line of code, forgetting the eval() results
* in a bug caused by aliasing.
*
* Notice however that this method is only useful if you want to replace a matrix by its own adjoint.
* If you just need the adjoint of a matrix, use adjoint().
*
* \note if the matrix is not square, then \c *this must be a resizable matrix.
* This excludes (non-square) fixed-size matrices, block-expressions and maps.
*
* \sa transpose(), adjoint(), transposeInPlace() */
template<typename Derived>
inline void MatrixBase<Derived>::adjointInPlace()
{
derived() = adjoint().eval();
}
#ifndef EIGEN_NO_DEBUG
// The following is to detect aliasing problems in most common cases.
namespace internal {
template<bool DestIsTransposed, typename OtherDerived>
struct check_transpose_aliasing_compile_time_selector
{
enum { ret = bool(blas_traits<OtherDerived>::IsTransposed) != DestIsTransposed };
};
template<bool DestIsTransposed, typename BinOp, typename DerivedA, typename DerivedB>
struct check_transpose_aliasing_compile_time_selector<DestIsTransposed,CwiseBinaryOp<BinOp,DerivedA,DerivedB> >
{
enum { ret = bool(blas_traits<DerivedA>::IsTransposed) != DestIsTransposed
|| bool(blas_traits<DerivedB>::IsTransposed) != DestIsTransposed
};
};
template<typename Scalar, bool DestIsTransposed, typename OtherDerived>
struct check_transpose_aliasing_run_time_selector
{
static bool run(const Scalar* dest, const OtherDerived& src)
{
return (bool(blas_traits<OtherDerived>::IsTransposed) != DestIsTransposed) && (dest!=0 && dest==(const Scalar*)extract_data(src));
}
};
template<typename Scalar, bool DestIsTransposed, typename BinOp, typename DerivedA, typename DerivedB>
struct check_transpose_aliasing_run_time_selector<Scalar,DestIsTransposed,CwiseBinaryOp<BinOp,DerivedA,DerivedB> >
{
static bool run(const Scalar* dest, const CwiseBinaryOp<BinOp,DerivedA,DerivedB>& src)
{
return ((blas_traits<DerivedA>::IsTransposed != DestIsTransposed) && (dest!=0 && dest==(const Scalar*)extract_data(src.lhs())))
|| ((blas_traits<DerivedB>::IsTransposed != DestIsTransposed) && (dest!=0 && dest==(const Scalar*)extract_data(src.rhs())));
}
};
// The following selector, checkTransposeAliasing_impl, dispatches on MightHaveTransposeAliasing
// because when the condition controlling the assert is known at compile time, ICC emits a warning.
// This is actually a good warning: in expressions that don't involve any transposition, the condition
// is known at compile time to be false, and exploiting that fact we avoid generating the code of the
// assert again and again for all the expressions that don't need it.
template<typename Derived, typename OtherDerived,
bool MightHaveTransposeAliasing
= check_transpose_aliasing_compile_time_selector
<blas_traits<Derived>::IsTransposed,OtherDerived>::ret
>
struct checkTransposeAliasing_impl
{
static void run(const Derived& dst, const OtherDerived& other)
{
eigen_assert((!check_transpose_aliasing_run_time_selector
<typename Derived::Scalar,blas_traits<Derived>::IsTransposed,OtherDerived>
::run(extract_data(dst), other))
&& "aliasing detected during transposition, use transposeInPlace() "
"or evaluate the rhs into a temporary using .eval()");
}
};
template<typename Derived, typename OtherDerived>
struct checkTransposeAliasing_impl<Derived, OtherDerived, false>
{
static void run(const Derived&, const OtherDerived&)
{
}
};
template<typename Dst, typename Src>
void check_for_aliasing(const Dst &dst, const Src &src)
{
internal::checkTransposeAliasing_impl<Dst, Src>::run(dst, src);
}
} // end namespace internal
#endif // EIGEN_NO_DEBUG
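// Editor's note (sketch): with assertions enabled (EIGEN_NO_DEBUG not defined),
// the check above turns the classic aliasing mistake into a loud failure:
//   Eigen::MatrixXd a = Eigen::MatrixXd::Random(3,3);
//   a = a.transpose();  // asserts: "aliasing detected during transposition, ..."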
} // end namespace Eigen
#endif // EIGEN_TRANSPOSE_H
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2010-2011 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_TRANSPOSITIONS_H
#define EIGEN_TRANSPOSITIONS_H
namespace Eigen {
template<typename Derived>
class TranspositionsBase
{
typedef internal::traits<Derived> Traits;
public:
typedef typename Traits::IndicesType IndicesType;
typedef typename IndicesType::Scalar StorageIndex;
typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
Derived& derived() { return *static_cast<Derived*>(this); }
const Derived& derived() const { return *static_cast<const Derived*>(this); }
/** Copies the \a other transpositions into \c *this */
template<typename OtherDerived>
Derived& operator=(const TranspositionsBase<OtherDerived>& other)
{
indices() = other.indices();
return derived();
}
#ifndef EIGEN_PARSED_BY_DOXYGEN
/** This is a special case of the templated operator=. Its purpose is to
* prevent a default operator= from hiding the templated operator=.
*/
Derived& operator=(const TranspositionsBase& other)
{
indices() = other.indices();
return derived();
}
#endif
/** \returns the number of transpositions */
Index size() const { return indices().size(); }
/** \returns the number of rows of the equivalent permutation matrix */
Index rows() const { return indices().size(); }
/** \returns the number of columns of the equivalent permutation matrix */
Index cols() const { return indices().size(); }
/** Direct access to the underlying index vector */
inline const StorageIndex& coeff(Index i) const { return indices().coeff(i); }
/** Direct access to the underlying index vector */
inline StorageIndex& coeffRef(Index i) { return indices().coeffRef(i); }
/** Direct access to the underlying index vector */
inline const StorageIndex& operator()(Index i) const { return indices()(i); }
/** Direct access to the underlying index vector */
inline StorageIndex& operator()(Index i) { return indices()(i); }
/** Direct access to the underlying index vector */
inline const StorageIndex& operator[](Index i) const { return indices()(i); }
/** Direct access to the underlying index vector */
inline StorageIndex& operator[](Index i) { return indices()(i); }
/** const version of indices(). */
const IndicesType& indices() const { return derived().indices(); }
/** \returns a reference to the stored array representing the transpositions. */
IndicesType& indices() { return derived().indices(); }
/** Resizes to given size. */
inline void resize(Index newSize)
{
indices().resize(newSize);
}
/** Sets \c *this to represent the identity transformation */
void setIdentity()
{
for(StorageIndex i = 0; i < indices().size(); ++i)
coeffRef(i) = i;
}
// FIXME: do we want such methods?
// might be useful when the target matrix expression is complex, e.g.:
// object.matrix().block(..,..,..,..) = trans * object.matrix().block(..,..,..,..);
/*
template<typename MatrixType>
void applyForwardToRows(MatrixType& mat) const
{
for(Index k=0 ; k<size() ; ++k)
if(m_indices(k)!=k)
mat.row(k).swap(mat.row(m_indices(k)));
}
template<typename MatrixType>
void applyBackwardToRows(MatrixType& mat) const
{
for(Index k=size()-1 ; k>=0 ; --k)
if(m_indices(k)!=k)
mat.row(k).swap(mat.row(m_indices(k)));
}
*/
/** \returns the inverse transformation */
inline Transpose<TranspositionsBase> inverse() const
{ return Transpose<TranspositionsBase>(derived()); }
/** \returns the transpose transformation */
inline Transpose<TranspositionsBase> transpose() const
{ return Transpose<TranspositionsBase>(derived()); }
protected:
};
namespace internal {
template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename _StorageIndex>
struct traits<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,_StorageIndex> >
: traits<PermutationMatrix<SizeAtCompileTime,MaxSizeAtCompileTime,_StorageIndex> >
{
typedef Matrix<_StorageIndex, SizeAtCompileTime, 1, 0, MaxSizeAtCompileTime, 1> IndicesType;
typedef TranspositionsStorage StorageKind;
};
}
/** \class Transpositions
* \ingroup Core_Module
*
* \brief Represents a sequence of transpositions (row/column interchange)
*
* \tparam SizeAtCompileTime the number of transpositions, or Dynamic
* \tparam MaxSizeAtCompileTime the maximum number of transpositions, or Dynamic. This optional parameter defaults to SizeAtCompileTime. Most of the time, you should not have to specify it.
*
* This class represents a permutation transformation as a sequence of \em n transpositions
* \f$[T_{n-1} \ldots T_{i} \ldots T_{0}]\f$. It is internally stored as a vector of integers \c indices.
* Each transposition \f$ T_{i} \f$ applied on the left of a matrix (\f$ T_{i} M\f$) interchanges
* the rows \c i and \c indices[i] of the matrix \c M.
* A transposition applied on the right (e.g., \f$ M T_{i}\f$) yields a column interchange.
*
* Compared to the class PermutationMatrix, such a sequence of transpositions is what is
* computed during a decomposition with pivoting, and it is faster when applying the permutation in-place.
*
* To apply a sequence of transpositions to a matrix, simply use the operator * as in the following example:
* \code
* Transpositions<Dynamic> tr;
* MatrixXf mat;
* mat = tr * mat;
* \endcode
* In this example, we detect that the matrix appears on both sides, and so the transpositions
* are applied in-place without any temporary or extra copy.
*
* \sa class PermutationMatrix
*/
template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename _StorageIndex>
class Transpositions : public TranspositionsBase<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,_StorageIndex> >
{
typedef internal::traits<Transpositions> Traits;
public:
typedef TranspositionsBase<Transpositions> Base;
typedef typename Traits::IndicesType IndicesType;
typedef typename IndicesType::Scalar StorageIndex;
inline Transpositions() {}
/** Copy constructor. */
template<typename OtherDerived>
inline Transpositions(const TranspositionsBase<OtherDerived>& other)
: m_indices(other.indices()) {}
#ifndef EIGEN_PARSED_BY_DOXYGEN
/** Standard copy constructor. Defined only to prevent a default copy constructor
* from hiding the other templated constructor */
inline Transpositions(const Transpositions& other) : m_indices(other.indices()) {}
#endif
/** Generic constructor from expression of the transposition indices. */
template<typename Other>
explicit inline Transpositions(const MatrixBase<Other>& indices) : m_indices(indices)
{}
/** Copies the \a other transpositions into \c *this */
template<typename OtherDerived>
Transpositions& operator=(const TranspositionsBase<OtherDerived>& other)
{
return Base::operator=(other);
}
#ifndef EIGEN_PARSED_BY_DOXYGEN
/** This is a special case of the templated operator=. Its purpose is to
* prevent a default operator= from hiding the templated operator=.
*/
Transpositions& operator=(const Transpositions& other)
{
m_indices = other.m_indices;
return *this;
}
#endif
/** Constructs an uninitialized sequence of transpositions of given size.
*/
inline Transpositions(Index size) : m_indices(size)
{}
/** const version of indices(). */
const IndicesType& indices() const { return m_indices; }
/** \returns a reference to the stored array representing the transpositions. */
IndicesType& indices() { return m_indices; }
protected:
IndicesType m_indices;
};
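// Illustrative usage (editor's sketch): build a small transposition sequence
// and apply it to the rows of a matrix, as pivoting decompositions do.
//   Eigen::Transpositions<Eigen::Dynamic> tr(3);
//   tr.setIdentity();
//   tr[0] = 2;                                       // T_0 swaps rows 0 and 2
//   Eigen::MatrixXd mat = Eigen::MatrixXd::Random(3,3);
//   mat = tr * mat;                                  // applied in-place, no temporary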
namespace internal {
template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename _StorageIndex, int _PacketAccess>
struct traits<Map<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,_StorageIndex>,_PacketAccess> >
: traits<PermutationMatrix<SizeAtCompileTime,MaxSizeAtCompileTime,_StorageIndex> >
{
typedef Map<const Matrix<_StorageIndex,SizeAtCompileTime,1,0,MaxSizeAtCompileTime,1>, _PacketAccess> IndicesType;
typedef _StorageIndex StorageIndex;
typedef TranspositionsStorage StorageKind;
};
}
template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename _StorageIndex, int PacketAccess>
class Map<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,_StorageIndex>,PacketAccess>
: public TranspositionsBase<Map<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,_StorageIndex>,PacketAccess> >
{
typedef internal::traits<Map> Traits;
public:
typedef TranspositionsBase<Map> Base;
typedef typename Traits::IndicesType IndicesType;
typedef typename IndicesType::Scalar StorageIndex;
explicit inline Map(const StorageIndex* indicesPtr)
: m_indices(indicesPtr)
{}
inline Map(const StorageIndex* indicesPtr, Index size)
: m_indices(indicesPtr,size)
{}
/** Copies the \a other transpositions into \c *this */
template<typename OtherDerived>
Map& operator=(const TranspositionsBase<OtherDerived>& other)
{
return Base::operator=(other);
}
#ifndef EIGEN_PARSED_BY_DOXYGEN
/** This is a special case of the templated operator=. Its purpose is to
* prevent a default operator= from hiding the templated operator=.
*/
Map& operator=(const Map& other)
{
m_indices = other.m_indices;
return *this;
}
#endif
/** const version of indices(). */
const IndicesType& indices() const { return m_indices; }
/** \returns a reference to the stored array representing the transpositions. */
IndicesType& indices() { return m_indices; }
protected:
IndicesType m_indices;
};
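// Illustrative usage (editor's sketch): view an existing index array as a
// transposition sequence without copying it.
//   int indices[3] = {2, 1, 2};
//   Eigen::Map<Eigen::Transpositions<Eigen::Dynamic> > tr(indices, 3);
//   Eigen::MatrixXd mat = Eigen::MatrixXd::Random(3,3);
//   mat = tr * mat;                     // swaps rows 0 and 2, leaves the rest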
namespace internal {
template<typename _IndicesType>
struct traits<TranspositionsWrapper<_IndicesType> >
: traits<PermutationWrapper<_IndicesType> >
{
typedef TranspositionsStorage StorageKind;
};
}
template<typename _IndicesType>
class TranspositionsWrapper
: public TranspositionsBase<TranspositionsWrapper<_IndicesType> >
{
typedef internal::traits<TranspositionsWrapper> Traits;
public:
typedef TranspositionsBase<TranspositionsWrapper> Base;
typedef typename Traits::IndicesType IndicesType;
typedef typename IndicesType::Scalar StorageIndex;
explicit inline TranspositionsWrapper(IndicesType& indices)
: m_indices(indices)
{}
/** Copies the \a other transpositions into \c *this */
template<typename OtherDerived>
TranspositionsWrapper& operator=(const TranspositionsBase<OtherDerived>& other)
{
return Base::operator=(other);
}
#ifndef EIGEN_PARSED_BY_DOXYGEN
/** This is a special case of the templated operator=. Its purpose is to
* prevent a default operator= from hiding the templated operator=.
*/
TranspositionsWrapper& operator=(const TranspositionsWrapper& other)
{
m_indices = other.m_indices;
return *this;
}
#endif
/** const version of indices(). */
const IndicesType& indices() const { return m_indices; }
/** \returns a reference to the stored array representing the transpositions. */
IndicesType& indices() { return m_indices; }
protected:
typename IndicesType::Nested m_indices;
};
/** \returns the \a matrix with the \a transpositions applied to the columns.
*/
template<typename MatrixDerived, typename TranspositionsDerived>
EIGEN_DEVICE_FUNC
const Product<MatrixDerived, TranspositionsDerived, AliasFreeProduct>
operator*(const MatrixBase<MatrixDerived> &matrix,
const TranspositionsBase<TranspositionsDerived>& transpositions)
{
return Product<MatrixDerived, TranspositionsDerived, AliasFreeProduct>
(matrix.derived(), transpositions.derived());
}
/** \returns the \a matrix with the \a transpositions applied to the rows.
*/
template<typename TranspositionsDerived, typename MatrixDerived>
EIGEN_DEVICE_FUNC
const Product<TranspositionsDerived, MatrixDerived, AliasFreeProduct>
operator*(const TranspositionsBase<TranspositionsDerived> &transpositions,
const MatrixBase<MatrixDerived>& matrix)
{
return Product<TranspositionsDerived, MatrixDerived, AliasFreeProduct>
(transpositions.derived(), matrix.derived());
}
// Template partial specialization for transposed/inverse transpositions
namespace internal {
template<typename Derived>
struct traits<Transpose<TranspositionsBase<Derived> > >
: traits<Derived>
{};
} // end namespace internal
template<typename TranspositionsDerived>
class Transpose<TranspositionsBase<TranspositionsDerived> >
{
typedef TranspositionsDerived TranspositionType;
typedef typename TranspositionType::IndicesType IndicesType;
public:
explicit Transpose(const TranspositionType& t) : m_transpositions(t) {}
Index size() const { return m_transpositions.size(); }
Index rows() const { return m_transpositions.size(); }
Index cols() const { return m_transpositions.size(); }
/** \returns the \a matrix with the inverse transpositions applied to the columns.
*/
template<typename OtherDerived> friend
const Product<OtherDerived, Transpose, AliasFreeProduct>
operator*(const MatrixBase<OtherDerived>& matrix, const Transpose& trt)
{
return Product<OtherDerived, Transpose, AliasFreeProduct>(matrix.derived(), trt);
}
/** \returns the \a matrix with the inverse transpositions applied to the rows.
*/
template<typename OtherDerived>
const Product<Transpose, OtherDerived, AliasFreeProduct>
operator*(const MatrixBase<OtherDerived>& matrix) const
{
return Product<Transpose, OtherDerived, AliasFreeProduct>(*this, matrix.derived());
}
const TranspositionType& nestedExpression() const { return m_transpositions; }
protected:
const TranspositionType& m_transpositions;
};
} // end namespace Eigen
#endif // EIGEN_TRANSPOSITIONS_H
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008 Benoit Jacob <jacob.benoit.1@gmail.com>
// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_TRIANGULARMATRIX_H
#define EIGEN_TRIANGULARMATRIX_H
namespace Eigen {
namespace internal {
template<int Side, typename TriangularType, typename Rhs> struct triangular_solve_retval;
}
/** \class TriangularBase
* \ingroup Core_Module
*
* \brief Base class for triangular part in a matrix
*/
template<typename Derived> class TriangularBase : public EigenBase<Derived>
{
public:
enum {
Mode = internal::traits<Derived>::Mode,
RowsAtCompileTime = internal::traits<Derived>::RowsAtCompileTime,
ColsAtCompileTime = internal::traits<Derived>::ColsAtCompileTime,
MaxRowsAtCompileTime = internal::traits<Derived>::MaxRowsAtCompileTime,
MaxColsAtCompileTime = internal::traits<Derived>::MaxColsAtCompileTime,
SizeAtCompileTime = (internal::size_at_compile_time<internal::traits<Derived>::RowsAtCompileTime,
internal::traits<Derived>::ColsAtCompileTime>::ret),
/**< This is equal to the number of coefficients, i.e. the number of
* rows times the number of columns, or to \a Dynamic if this is not
* known at compile-time. \sa RowsAtCompileTime, ColsAtCompileTime */
MaxSizeAtCompileTime = (internal::size_at_compile_time<internal::traits<Derived>::MaxRowsAtCompileTime,
internal::traits<Derived>::MaxColsAtCompileTime>::ret)
};
typedef typename internal::traits<Derived>::Scalar Scalar;
typedef typename internal::traits<Derived>::StorageKind StorageKind;
typedef typename internal::traits<Derived>::StorageIndex StorageIndex;
typedef typename internal::traits<Derived>::FullMatrixType DenseMatrixType;
typedef DenseMatrixType DenseType;
typedef Derived const& Nested;
EIGEN_DEVICE_FUNC
inline TriangularBase() { eigen_assert(!((Mode&UnitDiag) && (Mode&ZeroDiag))); }
EIGEN_DEVICE_FUNC
inline Index rows() const { return derived().rows(); }
EIGEN_DEVICE_FUNC
inline Index cols() const { return derived().cols(); }
EIGEN_DEVICE_FUNC
inline Index outerStride() const { return derived().outerStride(); }
EIGEN_DEVICE_FUNC
inline Index innerStride() const { return derived().innerStride(); }
// dummy resize function
void resize(Index rows, Index cols)
{
EIGEN_UNUSED_VARIABLE(rows);
EIGEN_UNUSED_VARIABLE(cols);
eigen_assert(rows==this->rows() && cols==this->cols());
}
EIGEN_DEVICE_FUNC
inline Scalar coeff(Index row, Index col) const { return derived().coeff(row,col); }
EIGEN_DEVICE_FUNC
inline Scalar& coeffRef(Index row, Index col) { return derived().coeffRef(row,col); }
/** \see MatrixBase::copyCoeff(row,col)
*/
template<typename Other>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE void copyCoeff(Index row, Index col, Other& other)
{
derived().coeffRef(row, col) = other.coeff(row, col);
}
EIGEN_DEVICE_FUNC
inline Scalar operator()(Index row, Index col) const
{
check_coordinates(row, col);
return coeff(row,col);
}
EIGEN_DEVICE_FUNC
inline Scalar& operator()(Index row, Index col)
{
check_coordinates(row, col);
return coeffRef(row,col);
}
#ifndef EIGEN_PARSED_BY_DOXYGEN
EIGEN_DEVICE_FUNC
inline const Derived& derived() const { return *static_cast<const Derived*>(this); }
EIGEN_DEVICE_FUNC
inline Derived& derived() { return *static_cast<Derived*>(this); }
#endif // not EIGEN_PARSED_BY_DOXYGEN
template<typename DenseDerived>
EIGEN_DEVICE_FUNC
void evalTo(MatrixBase<DenseDerived> &other) const;
template<typename DenseDerived>
EIGEN_DEVICE_FUNC
void evalToLazy(MatrixBase<DenseDerived> &other) const;
EIGEN_DEVICE_FUNC
DenseMatrixType toDenseMatrix() const
{
DenseMatrixType res(rows(), cols());
evalToLazy(res);
return res;
}
protected:
void check_coordinates(Index row, Index col) const
{
EIGEN_ONLY_USED_FOR_DEBUG(row);
EIGEN_ONLY_USED_FOR_DEBUG(col);
eigen_assert(col>=0 && col<cols() && row>=0 && row<rows());
const int mode = int(Mode) & ~SelfAdjoint;
EIGEN_ONLY_USED_FOR_DEBUG(mode);
eigen_assert((mode==Upper && col>=row)
|| (mode==Lower && col<=row)
|| ((mode==StrictlyUpper || mode==UnitUpper) && col>row)
|| ((mode==StrictlyLower || mode==UnitLower) && col<row));
}
#ifdef EIGEN_INTERNAL_DEBUGGING
void check_coordinates_internal(Index row, Index col) const
{
check_coordinates(row, col);
}
#else
void check_coordinates_internal(Index , Index ) const {}
#endif
};
/** \class TriangularView
* \ingroup Core_Module
*
* \brief Expression of a triangular part in a matrix
*
* \param MatrixType the type of the object in which we are taking the triangular part
* \param Mode the kind of triangular matrix expression to construct. Can be #Upper,
* #Lower, #UnitUpper, #UnitLower, #StrictlyUpper, or #StrictlyLower.
* This is in fact a bit field; it must have either #Upper or #Lower,
* and additionally it may have #UnitDiag or #ZeroDiag or neither.
*
* This class represents a triangular part of a matrix, not necessarily square. Strictly speaking, for rectangular
* matrices one should speak of "trapezoid" parts. This class is the return type
* of MatrixBase::triangularView() and SparseMatrixBase::triangularView(), and most of the time this is the only way it is used.
*
* \sa MatrixBase::triangularView()
*/
namespace internal {
template<typename MatrixType, unsigned int _Mode>
struct traits<TriangularView<MatrixType, _Mode> > : traits<MatrixType>
{
typedef typename ref_selector<MatrixType>::non_const_type MatrixTypeNested;
typedef typename remove_reference<MatrixTypeNested>::type MatrixTypeNestedNonRef;
typedef typename remove_all<MatrixTypeNested>::type MatrixTypeNestedCleaned;
typedef typename MatrixType::PlainObject FullMatrixType;
typedef MatrixType ExpressionType;
enum {
Mode = _Mode,
FlagsLvalueBit = is_lvalue<MatrixType>::value ? LvalueBit : 0,
Flags = (MatrixTypeNestedCleaned::Flags & (HereditaryBits | FlagsLvalueBit) & (~(PacketAccessBit | DirectAccessBit | LinearAccessBit)))
};
};
}
template<typename _MatrixType, unsigned int _Mode, typename StorageKind> class TriangularViewImpl;
template<typename _MatrixType, unsigned int _Mode> class TriangularView
: public TriangularViewImpl<_MatrixType, _Mode, typename internal::traits<_MatrixType>::StorageKind >
{
public:
typedef TriangularViewImpl<_MatrixType, _Mode, typename internal::traits<_MatrixType>::StorageKind > Base;
typedef typename internal::traits<TriangularView>::Scalar Scalar;
typedef _MatrixType MatrixType;
protected:
typedef typename internal::traits<TriangularView>::MatrixTypeNested MatrixTypeNested;
typedef typename internal::traits<TriangularView>::MatrixTypeNestedNonRef MatrixTypeNestedNonRef;
typedef typename internal::remove_all<typename MatrixType::ConjugateReturnType>::type MatrixConjugateReturnType;
public:
typedef typename internal::traits<TriangularView>::StorageKind StorageKind;
typedef typename internal::traits<TriangularView>::MatrixTypeNestedCleaned NestedExpression;
enum {
Mode = _Mode,
Flags = internal::traits<TriangularView>::Flags,
TransposeMode = (Mode & Upper ? Lower : 0)
| (Mode & Lower ? Upper : 0)
| (Mode & (UnitDiag))
| (Mode & (ZeroDiag)),
IsVectorAtCompileTime = false
};
EIGEN_DEVICE_FUNC
explicit inline TriangularView(MatrixType& matrix) : m_matrix(matrix)
{}
using Base::operator=;
TriangularView& operator=(const TriangularView &other)
{ return Base::operator=(other); }
/** \copydoc EigenBase::rows() */
EIGEN_DEVICE_FUNC
inline Index rows() const { return m_matrix.rows(); }
/** \copydoc EigenBase::cols() */
EIGEN_DEVICE_FUNC
inline Index cols() const { return m_matrix.cols(); }
/** \returns a const reference to the nested expression */
EIGEN_DEVICE_FUNC
const NestedExpression& nestedExpression() const { return m_matrix; }
/** \returns a reference to the nested expression */
EIGEN_DEVICE_FUNC
NestedExpression& nestedExpression() { return m_matrix; }
typedef TriangularView<const MatrixConjugateReturnType,Mode> ConjugateReturnType;
/** \sa MatrixBase::conjugate() const */
EIGEN_DEVICE_FUNC
inline const ConjugateReturnType conjugate() const
{ return ConjugateReturnType(m_matrix.conjugate()); }
typedef TriangularView<const typename MatrixType::AdjointReturnType,TransposeMode> AdjointReturnType;
/** \sa MatrixBase::adjoint() const */
EIGEN_DEVICE_FUNC
inline const AdjointReturnType adjoint() const
{ return AdjointReturnType(m_matrix.adjoint()); }
typedef TriangularView<typename MatrixType::TransposeReturnType,TransposeMode> TransposeReturnType;
/** \sa MatrixBase::transpose() */
EIGEN_DEVICE_FUNC
inline TransposeReturnType transpose()
{
EIGEN_STATIC_ASSERT_LVALUE(MatrixType)
typename MatrixType::TransposeReturnType tmp(m_matrix);
return TransposeReturnType(tmp);
}
typedef TriangularView<const typename MatrixType::ConstTransposeReturnType,TransposeMode> ConstTransposeReturnType;
/** \sa MatrixBase::transpose() const */
EIGEN_DEVICE_FUNC
inline const ConstTransposeReturnType transpose() const
{
return ConstTransposeReturnType(m_matrix.transpose());
}
template<typename Other>
EIGEN_DEVICE_FUNC
inline const Solve<TriangularView, Other>
solve(const MatrixBase<Other>& other) const
{ return Solve<TriangularView, Other>(*this, other.derived()); }
// workaround MSVC ICE
#if EIGEN_COMP_MSVC
template<int Side, typename Other>
EIGEN_DEVICE_FUNC
inline const internal::triangular_solve_retval<Side,TriangularView, Other>
solve(const MatrixBase<Other>& other) const
{ return Base::template solve<Side>(other); }
#else
using Base::solve;
#endif
/** \returns a selfadjoint view of the referenced triangular part which must be either \c #Upper or \c #Lower.
*
* This is a shortcut for \code this->nestedExpression().selfadjointView<(*this)::Mode>() \endcode
* \sa MatrixBase::selfadjointView() */
EIGEN_DEVICE_FUNC
SelfAdjointView<MatrixTypeNestedNonRef,Mode> selfadjointView()
{
EIGEN_STATIC_ASSERT((Mode&(UnitDiag|ZeroDiag))==0,PROGRAMMING_ERROR);
return SelfAdjointView<MatrixTypeNestedNonRef,Mode>(m_matrix);
}
/** This is the const version of selfadjointView() */
EIGEN_DEVICE_FUNC
const SelfAdjointView<MatrixTypeNestedNonRef,Mode> selfadjointView() const
{
EIGEN_STATIC_ASSERT((Mode&(UnitDiag|ZeroDiag))==0,PROGRAMMING_ERROR);
return SelfAdjointView<MatrixTypeNestedNonRef,Mode>(m_matrix);
}
/** \returns the determinant of the triangular matrix
* \sa MatrixBase::determinant() */
EIGEN_DEVICE_FUNC
Scalar determinant() const
{
if (Mode & UnitDiag)
return 1;
else if (Mode & ZeroDiag)
return 0;
else
return m_matrix.diagonal().prod();
}
protected:
MatrixTypeNested m_matrix;
};
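// Illustrative usage (editor's sketch): extract a triangular view and query it.
//   Eigen::Matrix3d m = Eigen::Matrix3d::Random();
//   double d = m.triangularView<Eigen::Upper>().determinant(); // product of the diagonal
//   Eigen::Matrix3d full = m.triangularView<Eigen::Upper>();   // dense copy, lower part zeroed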
/** \ingroup Core_Module
*
* \brief Base class for a triangular part in a \b dense matrix
*
* This class is an abstract base class of class TriangularView, and objects of type TriangularViewImpl cannot be instantiated.
* It extends class TriangularView with additional methods that are available for dense expressions only.
*
* \sa class TriangularView, MatrixBase::triangularView()
*/
template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_MatrixType,_Mode,Dense>
: public TriangularBase<TriangularView<_MatrixType, _Mode> >
{
public:
typedef TriangularView<_MatrixType, _Mode> TriangularViewType;
typedef TriangularBase<TriangularViewType> Base;
typedef typename internal::traits<TriangularViewType>::Scalar Scalar;
typedef _MatrixType MatrixType;
typedef typename MatrixType::PlainObject DenseMatrixType;
typedef DenseMatrixType PlainObject;
public:
using Base::evalToLazy;
using Base::derived;
typedef typename internal::traits<TriangularViewType>::StorageKind StorageKind;
enum {
Mode = _Mode,
Flags = internal::traits<TriangularViewType>::Flags
};
/** \returns the outer-stride of the underlying dense matrix
* \sa DenseCoeffsBase::outerStride() */
EIGEN_DEVICE_FUNC
inline Index outerStride() const { return derived().nestedExpression().outerStride(); }
/** \returns the inner-stride of the underlying dense matrix
* \sa DenseCoeffsBase::innerStride() */
EIGEN_DEVICE_FUNC
inline Index innerStride() const { return derived().nestedExpression().innerStride(); }
/** \sa MatrixBase::operator+=() */
template<typename Other>
EIGEN_DEVICE_FUNC
TriangularViewType& operator+=(const DenseBase<Other>& other) {
internal::call_assignment_no_alias(derived(), other.derived(), internal::add_assign_op<Scalar,typename Other::Scalar>());
return derived();
}
/** \sa MatrixBase::operator-=() */
template<typename Other>
EIGEN_DEVICE_FUNC
TriangularViewType& operator-=(const DenseBase<Other>& other) {
internal::call_assignment_no_alias(derived(), other.derived(), internal::sub_assign_op<Scalar,typename Other::Scalar>());
return derived();
}
/** \sa MatrixBase::operator*=() */
EIGEN_DEVICE_FUNC
TriangularViewType& operator*=(const typename internal::traits<MatrixType>::Scalar& other) { return *this = derived().nestedExpression() * other; }
/** \sa DenseBase::operator/=() */
EIGEN_DEVICE_FUNC
TriangularViewType& operator/=(const typename internal::traits<MatrixType>::Scalar& other) { return *this = derived().nestedExpression() / other; }
/** \sa MatrixBase::fill() */
EIGEN_DEVICE_FUNC
void fill(const Scalar& value) { setConstant(value); }
/** \sa MatrixBase::setConstant() */
EIGEN_DEVICE_FUNC
TriangularViewType& setConstant(const Scalar& value)
{ return *this = MatrixType::Constant(derived().rows(), derived().cols(), value); }
/** \sa MatrixBase::setZero() */
EIGEN_DEVICE_FUNC
TriangularViewType& setZero() { return setConstant(Scalar(0)); }
/** \sa MatrixBase::setOnes() */
EIGEN_DEVICE_FUNC
TriangularViewType& setOnes() { return setConstant(Scalar(1)); }
/** \sa MatrixBase::coeff()
* \warning the coordinates must fit into the referenced triangular part
*/
EIGEN_DEVICE_FUNC
inline Scalar coeff(Index row, Index col) const
{
Base::check_coordinates_internal(row, col);
return derived().nestedExpression().coeff(row, col);
}
/** \sa MatrixBase::coeffRef()
* \warning the coordinates must fit into the referenced triangular part
*/
EIGEN_DEVICE_FUNC
inline Scalar& coeffRef(Index row, Index col)
{
EIGEN_STATIC_ASSERT_LVALUE(TriangularViewType);
Base::check_coordinates_internal(row, col);
return derived().nestedExpression().coeffRef(row, col);
}
/** Assigns a triangular matrix to a triangular part of a dense matrix */
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
TriangularViewType& operator=(const TriangularBase<OtherDerived>& other);
/** Shortcut for \code *this = other.triangularView<(*this)::Mode>() \endcode */
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
TriangularViewType& operator=(const MatrixBase<OtherDerived>& other);
#ifndef EIGEN_PARSED_BY_DOXYGEN
EIGEN_DEVICE_FUNC
TriangularViewType& operator=(const TriangularViewImpl& other)
{ return *this = other.derived().nestedExpression(); }
/** \deprecated */
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
void lazyAssign(const TriangularBase<OtherDerived>& other);
/** \deprecated */
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
void lazyAssign(const MatrixBase<OtherDerived>& other);
#endif
/** Efficient triangular matrix times vector/matrix product */
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
const Product<TriangularViewType,OtherDerived>
operator*(const MatrixBase<OtherDerived>& rhs) const
{
return Product<TriangularViewType,OtherDerived>(derived(), rhs.derived());
}
/** Efficient vector/matrix times triangular matrix product */
template<typename OtherDerived> friend
EIGEN_DEVICE_FUNC
const Product<OtherDerived,TriangularViewType>
operator*(const MatrixBase<OtherDerived>& lhs, const TriangularViewImpl& rhs)
{
return Product<OtherDerived,TriangularViewType>(lhs.derived(),rhs.derived());
}
/** \returns the product of the inverse of \c *this with \a other, \a *this being triangular.
*
* This function computes the inverse-matrix matrix product inverse(\c *this) * \a other if
* \a Side==OnTheLeft (the default), or the right-inverse-multiply \a other * inverse(\c *this) if
* \a Side==OnTheRight.
*
* Note that the template parameter \c Side can be omitted, in which case \c Side==OnTheLeft
*
* The matrix \c *this must be triangular and invertible (i.e., all the coefficients of the
* diagonal must be non-zero). It works as a backward (resp. forward) substitution if \c *this
* is an upper (resp. lower) triangular matrix.
*
* Example: \include Triangular_solve.cpp
* Output: \verbinclude Triangular_solve.out
*
* This function returns an expression of the inverse-multiply and can work in-place if it is assigned
* to the same matrix or vector \a other.
*
* For users coming from BLAS, this function (and more specifically solveInPlace()) offers
* all the operations supported by the \c *TRSV and \c *TRSM BLAS routines.
*
* \sa TriangularView::solveInPlace()
*/
template<int Side, typename Other>
EIGEN_DEVICE_FUNC
inline const internal::triangular_solve_retval<Side,TriangularViewType, Other>
solve(const MatrixBase<Other>& other) const;
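// Illustrative usage (editor's sketch, assuming the diagonal of A is non-zero):
// forward substitution with a lower triangular view, solving L*x = b.
//   Eigen::Matrix3d A = Eigen::Matrix3d::Random();
//   Eigen::Vector3d b = Eigen::Vector3d::Random();
//   Eigen::Vector3d x = A.triangularView<Eigen::Lower>().solve(b);
//   A.triangularView<Eigen::Lower>().solveInPlace(b);  // same result, written into b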
/** "in-place" version of TriangularView::solve() where the result is written in \a other
*
* \warning The parameter is only marked 'const' to make the C++ compiler accept a temporary expression here.
* This function will const_cast it, so constness isn't honored here.
*
* Note that the template parameter \c Side can be omitted, in which case \c Side==OnTheLeft
*
* See TriangularView::solve() for the details.
*/
template<int Side, typename OtherDerived>
EIGEN_DEVICE_FUNC
void solveInPlace(const MatrixBase<OtherDerived>& other) const;
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
void solveInPlace(const MatrixBase<OtherDerived>& other) const
{ return solveInPlace<OnTheLeft>(other); }
/** Swaps the coefficients of the common triangular parts of two matrices */
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
#ifdef EIGEN_PARSED_BY_DOXYGEN
void swap(TriangularBase<OtherDerived> &other)
#else
void swap(TriangularBase<OtherDerived> const & other)
#endif
{
EIGEN_STATIC_ASSERT_LVALUE(OtherDerived);
call_assignment(derived(), other.const_cast_derived(), internal::swap_assign_op<Scalar>());
}
/** \deprecated
* Shortcut for \code (*this).swap(other.triangularView<(*this)::Mode>()) \endcode */
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
void swap(MatrixBase<OtherDerived> const & other)
{
EIGEN_STATIC_ASSERT_LVALUE(OtherDerived);
call_assignment(derived(), other.const_cast_derived(), internal::swap_assign_op<Scalar>());
}
template<typename RhsType, typename DstType>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE void _solve_impl(const RhsType &rhs, DstType &dst) const {
if(!internal::is_same_dense(dst,rhs))
dst = rhs;
this->solveInPlace(dst);
}
template<typename ProductType>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE TriangularViewType& _assignProduct(const ProductType& prod, const Scalar& alpha, bool beta);
};
/***************************************************************************
* Implementation of triangular evaluation/assignment
***************************************************************************/
#ifndef EIGEN_PARSED_BY_DOXYGEN
// FIXME should we keep that possibility
template<typename MatrixType, unsigned int Mode>
template<typename OtherDerived>
inline TriangularView<MatrixType, Mode>&
TriangularViewImpl<MatrixType, Mode, Dense>::operator=(const MatrixBase<OtherDerived>& other)
{
internal::call_assignment_no_alias(derived(), other.derived(), internal::assign_op<Scalar,typename OtherDerived::Scalar>());
return derived();
}
// FIXME should we keep that possibility
template<typename MatrixType, unsigned int Mode>
template<typename OtherDerived>
void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(const MatrixBase<OtherDerived>& other)
{
internal::call_assignment_no_alias(derived(), other.template triangularView<Mode>());
}
template<typename MatrixType, unsigned int Mode>
template<typename OtherDerived>
inline TriangularView<MatrixType, Mode>&
TriangularViewImpl<MatrixType, Mode, Dense>::operator=(const TriangularBase<OtherDerived>& other)
{
eigen_assert(Mode == int(OtherDerived::Mode));
internal::call_assignment(derived(), other.derived());
return derived();
}
template<typename MatrixType, unsigned int Mode>
template<typename OtherDerived>
void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(const TriangularBase<OtherDerived>& other)
{
eigen_assert(Mode == int(OtherDerived::Mode));
internal::call_assignment_no_alias(derived(), other.derived());
}
#endif
/***************************************************************************
* Implementation of TriangularBase methods
***************************************************************************/
/** Assigns a triangular or selfadjoint matrix to a dense matrix.
* If the matrix is triangular, the opposite part is set to zero. */
template<typename Derived>
template<typename DenseDerived>
void TriangularBase<Derived>::evalTo(MatrixBase<DenseDerived> &other) const
{
evalToLazy(other.derived());
}
/***************************************************************************
* Implementation of TriangularView methods
***************************************************************************/
/***************************************************************************
* Implementation of MatrixBase methods
***************************************************************************/
/**
* \returns an expression of a triangular view extracted from the current matrix
*
* The parameter \a Mode can have the following values: \c #Upper, \c #StrictlyUpper, \c #UnitUpper,
* \c #Lower, \c #StrictlyLower, \c #UnitLower.
*
* Example: \include MatrixBase_triangularView.cpp
* Output: \verbinclude MatrixBase_triangularView.out
*
* \sa class TriangularView
*/
template<typename Derived>
template<unsigned int Mode>
typename MatrixBase<Derived>::template TriangularViewReturnType<Mode>::Type
MatrixBase<Derived>::triangularView()
{
return typename TriangularViewReturnType<Mode>::Type(derived());
}
/** This is the const version of MatrixBase::triangularView() */
template<typename Derived>
template<unsigned int Mode>
typename MatrixBase<Derived>::template ConstTriangularViewReturnType<Mode>::Type
MatrixBase<Derived>::triangularView() const
{
return typename ConstTriangularViewReturnType<Mode>::Type(derived());
}
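// Illustrative usage (editor's sketch): triangular views are writable, so a
// single triangular part of a matrix can be assigned to in isolation.
//   Eigen::Matrix3d m = Eigen::Matrix3d::Zero();
//   m.triangularView<Eigen::StrictlyLower>().setConstant(7.0); // diagonal left untouched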
/** \returns true if *this is approximately equal to an upper triangular matrix,
* within the precision given by \a prec.
*
* \sa isLowerTriangular()
*/
template<typename Derived>
bool MatrixBase<Derived>::isUpperTriangular(const RealScalar& prec) const
{
RealScalar maxAbsOnUpperPart = static_cast<RealScalar>(-1);
for(Index j = 0; j < cols(); ++j)
{
Index maxi = numext::mini(j, rows()-1);
for(Index i = 0; i <= maxi; ++i)
{
RealScalar absValue = numext::abs(coeff(i,j));
if(absValue > maxAbsOnUpperPart) maxAbsOnUpperPart = absValue;
}
}
RealScalar threshold = maxAbsOnUpperPart * prec;
for(Index j = 0; j < cols(); ++j)
for(Index i = j+1; i < rows(); ++i)
if(numext::abs(coeff(i, j)) > threshold) return false;
return true;
}
/** \returns true if *this is approximately equal to a lower triangular matrix,
* within the precision given by \a prec.
*
* \sa isUpperTriangular()
*/
template<typename Derived>
bool MatrixBase<Derived>::isLowerTriangular(const RealScalar& prec) const
{
RealScalar maxAbsOnLowerPart = static_cast<RealScalar>(-1);
for(Index j = 0; j < cols(); ++j)
for(Index i = j; i < rows(); ++i)
{
RealScalar absValue = numext::abs(coeff(i,j));
if(absValue > maxAbsOnLowerPart) maxAbsOnLowerPart = absValue;
}
RealScalar threshold = maxAbsOnLowerPart * prec;
for(Index j = 1; j < cols(); ++j)
{
Index maxi = numext::mini(j, rows()-1);
for(Index i = 0; i < maxi; ++i)
if(numext::abs(coeff(i, j)) > threshold) return false;
}
return true;
}
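// Illustrative usage (editor's sketch): both predicates compare against a
// threshold that scales with the largest entry of the inspected part, so the
// tests are invariant under a global rescaling of the matrix.
//   Eigen::Matrix3d u = Eigen::Vector3d(1, 2, 3).asDiagonal();
//   bool up  = u.isUpperTriangular();   // true
//   bool low = u.isLowerTriangular();   // true: a diagonal matrix is both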
/***************************************************************************
****************************************************************************
* Evaluators and Assignment of triangular expressions
***************************************************************************
***************************************************************************/
namespace internal {
// TODO currently a triangular expression has the form TriangularView<.,.>
// in the future triangular-ness should be defined by the expression traits
// such that Transpose<TriangularView<.,.> > is valid. (currently TriangularBase::transpose() is overloaded to make it work)
template<typename MatrixType, unsigned int Mode>
struct evaluator_traits<TriangularView<MatrixType,Mode> >
{
typedef typename storage_kind_to_evaluator_kind<typename MatrixType::StorageKind>::Kind Kind;
typedef typename glue_shapes<typename evaluator_traits<MatrixType>::Shape, TriangularShape>::type Shape;
};
template<typename MatrixType, unsigned int Mode>
struct unary_evaluator<TriangularView<MatrixType,Mode>, IndexBased>
: evaluator<typename internal::remove_all<MatrixType>::type>
{
typedef TriangularView<MatrixType,Mode> XprType;
typedef evaluator<typename internal::remove_all<MatrixType>::type> Base;
unary_evaluator(const XprType &xpr) : Base(xpr.nestedExpression()) {}
};
// Additional assignment kinds:
struct Triangular2Triangular {};
struct Triangular2Dense {};
struct Dense2Triangular {};
template<typename Kernel, unsigned int Mode, int UnrollCount, bool ClearOpposite> struct triangular_assignment_loop;
/** \internal Specialization of the dense assignment kernel for triangular matrices.
* The main difference is that the triangular, diagonal, and opposite parts are processed through three different functions.
* \tparam UpLo must be either Lower or Upper
* \tparam Mode must be either 0, UnitDiag, ZeroDiag, or SelfAdjoint
*/
template<int UpLo, int Mode, int SetOpposite, typename DstEvaluatorTypeT, typename SrcEvaluatorTypeT, typename Functor, int Version = Specialized>
class triangular_dense_assignment_kernel : public generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT, Functor, Version>
{
protected:
typedef generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT, Functor, Version> Base;
typedef typename Base::DstXprType DstXprType;
typedef typename Base::SrcXprType SrcXprType;
using Base::m_dst;
using Base::m_src;
using Base::m_functor;
public:
typedef typename Base::DstEvaluatorType DstEvaluatorType;
typedef typename Base::SrcEvaluatorType SrcEvaluatorType;
typedef typename Base::Scalar Scalar;
typedef typename Base::AssignmentTraits AssignmentTraits;
EIGEN_DEVICE_FUNC triangular_dense_assignment_kernel(DstEvaluatorType &dst, const SrcEvaluatorType &src, const Functor &func, DstXprType& dstExpr)
: Base(dst, src, func, dstExpr)
{}
#ifdef EIGEN_INTERNAL_DEBUGGING
EIGEN_DEVICE_FUNC void assignCoeff(Index row, Index col)
{
eigen_internal_assert(row!=col);
Base::assignCoeff(row,col);
}
#else
using Base::assignCoeff;
#endif
EIGEN_DEVICE_FUNC void assignDiagonalCoeff(Index id)
{
if(Mode==UnitDiag && SetOpposite) m_functor.assignCoeff(m_dst.coeffRef(id,id), Scalar(1));
else if(Mode==ZeroDiag && SetOpposite) m_functor.assignCoeff(m_dst.coeffRef(id,id), Scalar(0));
else if(Mode==0) Base::assignCoeff(id,id);
}
EIGEN_DEVICE_FUNC void assignOppositeCoeff(Index row, Index col)
{
eigen_internal_assert(row!=col);
if(SetOpposite)
m_functor.assignCoeff(m_dst.coeffRef(row,col), Scalar(0));
}
};
template<int Mode, bool SetOpposite, typename DstXprType, typename SrcXprType, typename Functor>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void call_triangular_assignment_loop(DstXprType& dst, const SrcXprType& src, const Functor &func)
{
typedef evaluator<DstXprType> DstEvaluatorType;
typedef evaluator<SrcXprType> SrcEvaluatorType;
SrcEvaluatorType srcEvaluator(src);
Index dstRows = src.rows();
Index dstCols = src.cols();
if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
dst.resize(dstRows, dstCols);
DstEvaluatorType dstEvaluator(dst);
typedef triangular_dense_assignment_kernel< Mode&(Lower|Upper),Mode&(UnitDiag|ZeroDiag|SelfAdjoint),SetOpposite,
DstEvaluatorType,SrcEvaluatorType,Functor> Kernel;
Kernel kernel(dstEvaluator, srcEvaluator, func, dst.const_cast_derived());
enum {
unroll = DstXprType::SizeAtCompileTime != Dynamic
&& SrcEvaluatorType::CoeffReadCost < HugeCost
&& DstXprType::SizeAtCompileTime * (DstEvaluatorType::CoeffReadCost+SrcEvaluatorType::CoeffReadCost) / 2 <= EIGEN_UNROLLING_LIMIT
};
triangular_assignment_loop<Kernel, Mode, unroll ? int(DstXprType::SizeAtCompileTime) : Dynamic, SetOpposite>::run(kernel);
}
template<int Mode, bool SetOpposite, typename DstXprType, typename SrcXprType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void call_triangular_assignment_loop(DstXprType& dst, const SrcXprType& src)
{
call_triangular_assignment_loop<Mode,SetOpposite>(dst, src, internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar>());
}
template<> struct AssignmentKind<TriangularShape,TriangularShape> { typedef Triangular2Triangular Kind; };
template<> struct AssignmentKind<DenseShape,TriangularShape> { typedef Triangular2Dense Kind; };
template<> struct AssignmentKind<TriangularShape,DenseShape> { typedef Dense2Triangular Kind; };
template< typename DstXprType, typename SrcXprType, typename Functor>
struct Assignment<DstXprType, SrcXprType, Functor, Triangular2Triangular>
{
EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const Functor &func)
{
eigen_assert(int(DstXprType::Mode) == int(SrcXprType::Mode));
call_triangular_assignment_loop<DstXprType::Mode, false>(dst, src, func);
}
};
template< typename DstXprType, typename SrcXprType, typename Functor>
struct Assignment<DstXprType, SrcXprType, Functor, Triangular2Dense>
{
EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const Functor &func)
{
call_triangular_assignment_loop<SrcXprType::Mode, (SrcXprType::Mode&SelfAdjoint)==0>(dst, src, func);
}
};
template< typename DstXprType, typename SrcXprType, typename Functor>
struct Assignment<DstXprType, SrcXprType, Functor, Dense2Triangular>
{
EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const Functor &func)
{
call_triangular_assignment_loop<DstXprType::Mode, false>(dst, src, func);
}
};
template<typename Kernel, unsigned int Mode, int UnrollCount, bool SetOpposite>
struct triangular_assignment_loop
{
// FIXME: this is not very clean, perhaps this information should be provided by the kernel?
typedef typename Kernel::DstEvaluatorType DstEvaluatorType;
typedef typename DstEvaluatorType::XprType DstXprType;
enum {
col = (UnrollCount-1) / DstXprType::RowsAtCompileTime,
row = (UnrollCount-1) % DstXprType::RowsAtCompileTime
};
typedef typename Kernel::Scalar Scalar;
EIGEN_DEVICE_FUNC
static inline void run(Kernel &kernel)
{
triangular_assignment_loop<Kernel, Mode, UnrollCount-1, SetOpposite>::run(kernel);
if(row==col)
kernel.assignDiagonalCoeff(row);
else if( ((Mode&Lower) && row>col) || ((Mode&Upper) && row<col) )
kernel.assignCoeff(row,col);
else if(SetOpposite)
kernel.assignOppositeCoeff(row,col);
}
};
// prevent buggy user code from causing an infinite recursion
template<typename Kernel, unsigned int Mode, bool SetOpposite>
struct triangular_assignment_loop<Kernel, Mode, 0, SetOpposite>
{
EIGEN_DEVICE_FUNC
static inline void run(Kernel &) {}
};
// TODO: experiment with a recursive assignment procedure splitting the current
// triangular part into one rectangular and two triangular parts.
template<typename Kernel, unsigned int Mode, bool SetOpposite>
struct triangular_assignment_loop<Kernel, Mode, Dynamic, SetOpposite>
{
typedef typename Kernel::Scalar Scalar;
EIGEN_DEVICE_FUNC
static inline void run(Kernel &kernel)
{
for(Index j = 0; j < kernel.cols(); ++j)
{
Index maxi = numext::mini(j, kernel.rows());
Index i = 0;
if (((Mode&Lower) && SetOpposite) || (Mode&Upper))
{
for(; i < maxi; ++i)
if(Mode&Upper) kernel.assignCoeff(i, j);
else kernel.assignOppositeCoeff(i, j);
}
else
i = maxi;
if(i<kernel.rows()) // then i==j
kernel.assignDiagonalCoeff(i++);
if (((Mode&Upper) && SetOpposite) || (Mode&Lower))
{
for(; i < kernel.rows(); ++i)
if(Mode&Lower) kernel.assignCoeff(i, j);
else kernel.assignOppositeCoeff(i, j);
}
}
}
};
} // end namespace internal
/** Assigns a triangular or selfadjoint matrix to a dense matrix.
* If the matrix is triangular, the opposite part is set to zero. */
template<typename Derived>
template<typename DenseDerived>
void TriangularBase<Derived>::evalToLazy(MatrixBase<DenseDerived> &other) const
{
other.derived().resize(this->rows(), this->cols());
internal::call_triangular_assignment_loop<Derived::Mode,(Derived::Mode&SelfAdjoint)==0 /* SetOpposite */>(other.derived(), derived().nestedExpression());
}
namespace internal {
// Triangular = Product
template< typename DstXprType, typename Lhs, typename Rhs, typename Scalar>
struct Assignment<DstXprType, Product<Lhs,Rhs,DefaultProduct>, internal::assign_op<Scalar,typename Product<Lhs,Rhs,DefaultProduct>::Scalar>, Dense2Triangular>
{
typedef Product<Lhs,Rhs,DefaultProduct> SrcXprType;
static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,typename SrcXprType::Scalar> &)
{
Index dstRows = src.rows();
Index dstCols = src.cols();
if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
dst.resize(dstRows, dstCols);
dst._assignProduct(src, 1, 0);
}
};
// Triangular += Product
template< typename DstXprType, typename Lhs, typename Rhs, typename Scalar>
struct Assignment<DstXprType, Product<Lhs,Rhs,DefaultProduct>, internal::add_assign_op<Scalar,typename Product<Lhs,Rhs,DefaultProduct>::Scalar>, Dense2Triangular>
{
typedef Product<Lhs,Rhs,DefaultProduct> SrcXprType;
static void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<Scalar,typename SrcXprType::Scalar> &)
{
dst._assignProduct(src, 1, 1);
}
};
// Triangular -= Product
template< typename DstXprType, typename Lhs, typename Rhs, typename Scalar>
struct Assignment<DstXprType, Product<Lhs,Rhs,DefaultProduct>, internal::sub_assign_op<Scalar,typename Product<Lhs,Rhs,DefaultProduct>::Scalar>, Dense2Triangular>
{
typedef Product<Lhs,Rhs,DefaultProduct> SrcXprType;
static void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<Scalar,typename SrcXprType::Scalar> &)
{
dst._assignProduct(src, -1, 1);
}
};
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_TRIANGULARMATRIX_H
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_VECTORBLOCK_H
#define EIGEN_VECTORBLOCK_H
namespace Eigen {
namespace internal {
template<typename VectorType, int Size>
struct traits<VectorBlock<VectorType, Size> >
: public traits<Block<VectorType,
traits<VectorType>::Flags & RowMajorBit ? 1 : Size,
traits<VectorType>::Flags & RowMajorBit ? Size : 1> >
{
};
}
/** \class VectorBlock
* \ingroup Core_Module
*
* \brief Expression of a fixed-size or dynamic-size sub-vector
*
* \tparam VectorType the type of the object in which we are taking a sub-vector
* \tparam Size size of the sub-vector we are taking at compile time (optional)
*
* This class represents an expression of either a fixed-size or dynamic-size sub-vector.
* It is the return type of DenseBase::segment(Index,Index) and DenseBase::segment<int>(Index) and
* most of the time this is the only way it is used.
*
* However, if you want to directly manipulate sub-vector expressions,
* for instance if you want to write a function returning such an expression, you
* will need to use this class.
*
* Here is an example illustrating the dynamic case:
* \include class_VectorBlock.cpp
* Output: \verbinclude class_VectorBlock.out
*
* \note Even though this expression has dynamic size, in the case where \a VectorType
* has fixed size, this expression inherits a fixed maximal size which means that evaluating
* it does not cause a dynamic memory allocation.
*
* Here is an example illustrating the fixed-size case:
* \include class_FixedVectorBlock.cpp
* Output: \verbinclude class_FixedVectorBlock.out
*
* \sa class Block, DenseBase::segment(Index,Index,Index,Index), DenseBase::segment(Index,Index)
*/
template<typename VectorType, int Size> class VectorBlock
: public Block<VectorType,
internal::traits<VectorType>::Flags & RowMajorBit ? 1 : Size,
internal::traits<VectorType>::Flags & RowMajorBit ? Size : 1>
{
typedef Block<VectorType,
internal::traits<VectorType>::Flags & RowMajorBit ? 1 : Size,
internal::traits<VectorType>::Flags & RowMajorBit ? Size : 1> Base;
enum {
IsColVector = !(internal::traits<VectorType>::Flags & RowMajorBit)
};
public:
EIGEN_DENSE_PUBLIC_INTERFACE(VectorBlock)
using Base::operator=;
/** Dynamic-size constructor
*/
EIGEN_DEVICE_FUNC
inline VectorBlock(VectorType& vector, Index start, Index size)
: Base(vector,
IsColVector ? start : 0, IsColVector ? 0 : start,
IsColVector ? size : 1, IsColVector ? 1 : size)
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(VectorBlock);
}
/** Fixed-size constructor
*/
EIGEN_DEVICE_FUNC
inline VectorBlock(VectorType& vector, Index start)
: Base(vector, IsColVector ? start : 0, IsColVector ? 0 : start)
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(VectorBlock);
}
};
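// Illustrative usage sketch (not part of the original source); the vector
// name is hypothetical. VectorBlock is normally created through
// DenseBase::segment():
//
//   Eigen::VectorXd v = Eigen::VectorXd::LinSpaced(6, 0.0, 5.0);
//   v.segment(1, 3).setZero();  // dynamic-size sub-vector v(1..3)
//   v.segment<2>(4) *= 2.0;     // fixed-size sub-vector of length 2 starting at index 4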
} // end namespace Eigen
#endif // EIGEN_VECTORBLOCK_H
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_PARTIAL_REDUX_H
#define EIGEN_PARTIAL_REDUX_H
namespace Eigen {
/** \class PartialReduxExpr
* \ingroup Core_Module
*
* \brief Generic expression of a partially reduxed matrix
*
* \tparam MatrixType the type of the matrix to which we are applying the redux operation
* \tparam MemberOp type of the member functor
* \tparam Direction indicates the direction of the redux (#Vertical or #Horizontal)
*
* This class represents an expression of a partial redux operator of a matrix.
* It is the return type of some VectorwiseOp functions,
* and most of the time this is the only way it is used.
*
* \sa class VectorwiseOp
*/
template< typename MatrixType, typename MemberOp, int Direction>
class PartialReduxExpr;
namespace internal {
template<typename MatrixType, typename MemberOp, int Direction>
struct traits<PartialReduxExpr<MatrixType, MemberOp, Direction> >
: traits<MatrixType>
{
typedef typename MemberOp::result_type Scalar;
typedef typename traits<MatrixType>::StorageKind StorageKind;
typedef typename traits<MatrixType>::XprKind XprKind;
typedef typename MatrixType::Scalar InputScalar;
enum {
RowsAtCompileTime = Direction==Vertical ? 1 : MatrixType::RowsAtCompileTime,
ColsAtCompileTime = Direction==Horizontal ? 1 : MatrixType::ColsAtCompileTime,
MaxRowsAtCompileTime = Direction==Vertical ? 1 : MatrixType::MaxRowsAtCompileTime,
MaxColsAtCompileTime = Direction==Horizontal ? 1 : MatrixType::MaxColsAtCompileTime,
Flags = RowsAtCompileTime == 1 ? RowMajorBit : 0,
TraversalSize = Direction==Vertical ? MatrixType::RowsAtCompileTime : MatrixType::ColsAtCompileTime
};
};
}
template< typename MatrixType, typename MemberOp, int Direction>
class PartialReduxExpr : public internal::dense_xpr_base< PartialReduxExpr<MatrixType, MemberOp, Direction> >::type,
internal::no_assignment_operator
{
public:
typedef typename internal::dense_xpr_base<PartialReduxExpr>::type Base;
EIGEN_DENSE_PUBLIC_INTERFACE(PartialReduxExpr)
EIGEN_DEVICE_FUNC
explicit PartialReduxExpr(const MatrixType& mat, const MemberOp& func = MemberOp())
: m_matrix(mat), m_functor(func) {}
EIGEN_DEVICE_FUNC
Index rows() const { return (Direction==Vertical ? 1 : m_matrix.rows()); }
EIGEN_DEVICE_FUNC
Index cols() const { return (Direction==Horizontal ? 1 : m_matrix.cols()); }
EIGEN_DEVICE_FUNC
typename MatrixType::Nested nestedExpression() const { return m_matrix; }
EIGEN_DEVICE_FUNC
const MemberOp& functor() const { return m_functor; }
protected:
typename MatrixType::Nested m_matrix;
const MemberOp m_functor;
};
#define EIGEN_MEMBER_FUNCTOR(MEMBER,COST) \
template <typename ResultType> \
struct member_##MEMBER { \
EIGEN_EMPTY_STRUCT_CTOR(member_##MEMBER) \
typedef ResultType result_type; \
template<typename Scalar, int Size> struct Cost \
{ enum { value = COST }; }; \
template<typename XprType> \
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
ResultType operator()(const XprType& mat) const \
{ return mat.MEMBER(); } \
}
namespace internal {
EIGEN_MEMBER_FUNCTOR(squaredNorm, Size * NumTraits<Scalar>::MulCost + (Size-1)*NumTraits<Scalar>::AddCost);
EIGEN_MEMBER_FUNCTOR(norm, (Size+5) * NumTraits<Scalar>::MulCost + (Size-1)*NumTraits<Scalar>::AddCost);
EIGEN_MEMBER_FUNCTOR(stableNorm, (Size+5) * NumTraits<Scalar>::MulCost + (Size-1)*NumTraits<Scalar>::AddCost);
EIGEN_MEMBER_FUNCTOR(blueNorm, (Size+5) * NumTraits<Scalar>::MulCost + (Size-1)*NumTraits<Scalar>::AddCost);
EIGEN_MEMBER_FUNCTOR(hypotNorm, (Size-1) * functor_traits<scalar_hypot_op<Scalar> >::Cost );
EIGEN_MEMBER_FUNCTOR(sum, (Size-1)*NumTraits<Scalar>::AddCost);
EIGEN_MEMBER_FUNCTOR(mean, (Size-1)*NumTraits<Scalar>::AddCost + NumTraits<Scalar>::MulCost);
EIGEN_MEMBER_FUNCTOR(minCoeff, (Size-1)*NumTraits<Scalar>::AddCost);
EIGEN_MEMBER_FUNCTOR(maxCoeff, (Size-1)*NumTraits<Scalar>::AddCost);
EIGEN_MEMBER_FUNCTOR(all, (Size-1)*NumTraits<Scalar>::AddCost);
EIGEN_MEMBER_FUNCTOR(any, (Size-1)*NumTraits<Scalar>::AddCost);
EIGEN_MEMBER_FUNCTOR(count, (Size-1)*NumTraits<Scalar>::AddCost);
EIGEN_MEMBER_FUNCTOR(prod, (Size-1)*NumTraits<Scalar>::MulCost);
template <int p, typename ResultType>
struct member_lpnorm {
typedef ResultType result_type;
template<typename Scalar, int Size> struct Cost
{ enum { value = (Size+5) * NumTraits<Scalar>::MulCost + (Size-1)*NumTraits<Scalar>::AddCost }; };
EIGEN_DEVICE_FUNC member_lpnorm() {}
template<typename XprType>
EIGEN_DEVICE_FUNC inline ResultType operator()(const XprType& mat) const
{ return mat.template lpNorm<p>(); }
};
template <typename BinaryOp, typename Scalar>
struct member_redux {
typedef typename result_of<
BinaryOp(const Scalar&,const Scalar&)
>::type result_type;
template<typename _Scalar, int Size> struct Cost
{ enum { value = (Size-1) * functor_traits<BinaryOp>::Cost }; };
EIGEN_DEVICE_FUNC explicit member_redux(const BinaryOp func) : m_functor(func) {}
template<typename Derived>
EIGEN_DEVICE_FUNC inline result_type operator()(const DenseBase<Derived>& mat) const
{ return mat.redux(m_functor); }
const BinaryOp m_functor;
};
}
/** \class VectorwiseOp
* \ingroup Core_Module
*
* \brief Pseudo expression providing partial reduction operations
*
* \tparam ExpressionType the type of the object on which to do partial reductions
* \tparam Direction indicates the direction of the redux (#Vertical or #Horizontal)
*
* This class represents a pseudo expression with partial reduction features.
* It is the return type of DenseBase::colwise() and DenseBase::rowwise()
* and most of the time this is the only way it is used.
*
* Example: \include MatrixBase_colwise.cpp
* Output: \verbinclude MatrixBase_colwise.out
*
* \sa DenseBase::colwise(), DenseBase::rowwise(), class PartialReduxExpr
*/
template<typename ExpressionType, int Direction> class VectorwiseOp
{
public:
typedef typename ExpressionType::Scalar Scalar;
typedef typename ExpressionType::RealScalar RealScalar;
typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
typedef typename internal::ref_selector<ExpressionType>::non_const_type ExpressionTypeNested;
typedef typename internal::remove_all<ExpressionTypeNested>::type ExpressionTypeNestedCleaned;
template<template<typename _Scalar> class Functor,
typename Scalar_=Scalar> struct ReturnType
{
typedef PartialReduxExpr<ExpressionType,
Functor<Scalar_>,
Direction
> Type;
};
template<typename BinaryOp> struct ReduxReturnType
{
typedef PartialReduxExpr<ExpressionType,
internal::member_redux<BinaryOp,Scalar>,
Direction
> Type;
};
enum {
isVertical = (Direction==Vertical) ? 1 : 0,
isHorizontal = (Direction==Horizontal) ? 1 : 0
};
protected:
typedef typename internal::conditional<isVertical,
typename ExpressionType::ColXpr,
typename ExpressionType::RowXpr>::type SubVector;
/** \internal
* \returns the i-th subvector according to the \c Direction */
EIGEN_DEVICE_FUNC
SubVector subVector(Index i)
{
return SubVector(m_matrix.derived(),i);
}
/** \internal
* \returns the number of subvectors in the direction \c Direction */
EIGEN_DEVICE_FUNC
Index subVectors() const
{ return isVertical?m_matrix.cols():m_matrix.rows(); }
template<typename OtherDerived> struct ExtendedType {
typedef Replicate<OtherDerived,
isVertical ? 1 : ExpressionType::RowsAtCompileTime,
isHorizontal ? 1 : ExpressionType::ColsAtCompileTime> Type;
};
/** \internal
* Replicates a vector to match the size of \c *this */
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
typename ExtendedType<OtherDerived>::Type
extendedTo(const DenseBase<OtherDerived>& other) const
{
EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(isVertical, OtherDerived::MaxColsAtCompileTime==1),
YOU_PASSED_A_ROW_VECTOR_BUT_A_COLUMN_VECTOR_WAS_EXPECTED)
EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(isHorizontal, OtherDerived::MaxRowsAtCompileTime==1),
YOU_PASSED_A_COLUMN_VECTOR_BUT_A_ROW_VECTOR_WAS_EXPECTED)
return typename ExtendedType<OtherDerived>::Type
(other.derived(),
isVertical ? 1 : m_matrix.rows(),
isHorizontal ? 1 : m_matrix.cols());
}
template<typename OtherDerived> struct OppositeExtendedType {
typedef Replicate<OtherDerived,
isHorizontal ? 1 : ExpressionType::RowsAtCompileTime,
isVertical ? 1 : ExpressionType::ColsAtCompileTime> Type;
};
/** \internal
* Replicates a vector in the opposite direction to match the size of \c *this */
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
typename OppositeExtendedType<OtherDerived>::Type
extendedToOpposite(const DenseBase<OtherDerived>& other) const
{
EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(isHorizontal, OtherDerived::MaxColsAtCompileTime==1),
YOU_PASSED_A_ROW_VECTOR_BUT_A_COLUMN_VECTOR_WAS_EXPECTED)
EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(isVertical, OtherDerived::MaxRowsAtCompileTime==1),
YOU_PASSED_A_COLUMN_VECTOR_BUT_A_ROW_VECTOR_WAS_EXPECTED)
return typename OppositeExtendedType<OtherDerived>::Type
(other.derived(),
isHorizontal ? 1 : m_matrix.rows(),
isVertical ? 1 : m_matrix.cols());
}
public:
EIGEN_DEVICE_FUNC
explicit inline VectorwiseOp(ExpressionType& matrix) : m_matrix(matrix) {}
/** \internal */
EIGEN_DEVICE_FUNC
inline const ExpressionType& _expression() const { return m_matrix; }
/** \returns a row or column vector expression of \c *this reduxed by \a func
*
* The template parameter \a BinaryOp is the type of the functor
* of the custom redux operator. Note that \a func must be an associative operator.
*
* \sa class VectorwiseOp, DenseBase::colwise(), DenseBase::rowwise()
*/
template<typename BinaryOp>
EIGEN_DEVICE_FUNC
const typename ReduxReturnType<BinaryOp>::Type
redux(const BinaryOp& func = BinaryOp()) const
{ return typename ReduxReturnType<BinaryOp>::Type(_expression(), internal::member_redux<BinaryOp,Scalar>(func)); }
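// Illustrative usage sketch (not part of the original source); names are
// hypothetical. Any associative binary functor works; with std::plus this is
// equivalent to colwise().sum():
//
//   #include <functional>
//   Eigen::MatrixXf m(2,3);
//   m << 1, 2, 3,
//        4, 5, 6;
//   Eigen::RowVector3f colSums = m.colwise().redux(std::plus<float>());  // [5, 7, 9]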
typedef typename ReturnType<internal::member_minCoeff>::Type MinCoeffReturnType;
typedef typename ReturnType<internal::member_maxCoeff>::Type MaxCoeffReturnType;
typedef typename ReturnType<internal::member_squaredNorm,RealScalar>::Type SquaredNormReturnType;
typedef typename ReturnType<internal::member_norm,RealScalar>::Type NormReturnType;
typedef typename ReturnType<internal::member_blueNorm,RealScalar>::Type BlueNormReturnType;
typedef typename ReturnType<internal::member_stableNorm,RealScalar>::Type StableNormReturnType;
typedef typename ReturnType<internal::member_hypotNorm,RealScalar>::Type HypotNormReturnType;
typedef typename ReturnType<internal::member_sum>::Type SumReturnType;
typedef typename ReturnType<internal::member_mean>::Type MeanReturnType;
typedef typename ReturnType<internal::member_all>::Type AllReturnType;
typedef typename ReturnType<internal::member_any>::Type AnyReturnType;
typedef PartialReduxExpr<ExpressionType, internal::member_count<Index>, Direction> CountReturnType;
typedef typename ReturnType<internal::member_prod>::Type ProdReturnType;
typedef Reverse<const ExpressionType, Direction> ConstReverseReturnType;
typedef Reverse<ExpressionType, Direction> ReverseReturnType;
template<int p> struct LpNormReturnType {
typedef PartialReduxExpr<ExpressionType, internal::member_lpnorm<p,RealScalar>,Direction> Type;
};
/** \returns a row (or column) vector expression of the smallest coefficient
* of each column (or row) of the referenced expression.
*
* \warning the result is undefined if \c *this contains NaN.
*
* Example: \include PartialRedux_minCoeff.cpp
* Output: \verbinclude PartialRedux_minCoeff.out
*
* \sa DenseBase::minCoeff() */
EIGEN_DEVICE_FUNC
const MinCoeffReturnType minCoeff() const
{ return MinCoeffReturnType(_expression()); }
/** \returns a row (or column) vector expression of the largest coefficient
* of each column (or row) of the referenced expression.
*
* \warning the result is undefined if \c *this contains NaN.
*
* Example: \include PartialRedux_maxCoeff.cpp
* Output: \verbinclude PartialRedux_maxCoeff.out
*
* \sa DenseBase::maxCoeff() */
EIGEN_DEVICE_FUNC
const MaxCoeffReturnType maxCoeff() const
{ return MaxCoeffReturnType(_expression()); }
/** \returns a row (or column) vector expression of the squared norm
* of each column (or row) of the referenced expression.
* This is a vector with real entries, even if the original matrix has complex entries.
*
* Example: \include PartialRedux_squaredNorm.cpp
* Output: \verbinclude PartialRedux_squaredNorm.out
*
* \sa DenseBase::squaredNorm() */
EIGEN_DEVICE_FUNC
const SquaredNormReturnType squaredNorm() const
{ return SquaredNormReturnType(_expression()); }
/** \returns a row (or column) vector expression of the norm
* of each column (or row) of the referenced expression.
* This is a vector with real entries, even if the original matrix has complex entries.
*
* Example: \include PartialRedux_norm.cpp
* Output: \verbinclude PartialRedux_norm.out
*
* \sa DenseBase::norm() */
EIGEN_DEVICE_FUNC
const NormReturnType norm() const
{ return NormReturnType(_expression()); }
/** \returns a row (or column) vector expression of the Lp norm
* of each column (or row) of the referenced expression, where
* the exponent \a p is given as a template parameter.
* This is a vector with real entries, even if the original matrix has complex entries.
*
* \sa MatrixBase::lpNorm() */
template<int p>
EIGEN_DEVICE_FUNC
const typename LpNormReturnType<p>::Type lpNorm() const
{ return typename LpNormReturnType<p>::Type(_expression()); }
/** \returns a row (or column) vector expression of the norm
* of each column (or row) of the referenced expression, using
* Blue's algorithm.
* This is a vector with real entries, even if the original matrix has complex entries.
*
* \sa DenseBase::blueNorm() */
EIGEN_DEVICE_FUNC
const BlueNormReturnType blueNorm() const
{ return BlueNormReturnType(_expression()); }
/** \returns a row (or column) vector expression of the norm
* of each column (or row) of the referenced expression, avoiding
* underflow and overflow.
* This is a vector with real entries, even if the original matrix has complex entries.
*
* \sa DenseBase::stableNorm() */
EIGEN_DEVICE_FUNC
const StableNormReturnType stableNorm() const
{ return StableNormReturnType(_expression()); }
/** \returns a row (or column) vector expression of the norm
* of each column (or row) of the referenced expression, avoiding
* underflow and overflow using a concatenation of hypot() calls.
* This is a vector with real entries, even if the original matrix has complex entries.
*
* \sa DenseBase::hypotNorm() */
EIGEN_DEVICE_FUNC
const HypotNormReturnType hypotNorm() const
{ return HypotNormReturnType(_expression()); }
/** \returns a row (or column) vector expression of the sum
* of each column (or row) of the referenced expression.
*
* Example: \include PartialRedux_sum.cpp
* Output: \verbinclude PartialRedux_sum.out
*
* \sa DenseBase::sum() */
EIGEN_DEVICE_FUNC
const SumReturnType sum() const
{ return SumReturnType(_expression()); }
/** \returns a row (or column) vector expression of the mean
* of each column (or row) of the referenced expression.
*
* \sa DenseBase::mean() */
EIGEN_DEVICE_FUNC
const MeanReturnType mean() const
{ return MeanReturnType(_expression()); }
/** \returns a row (or column) vector expression representing
* whether \b all coefficients of each respective column (or row) are \c true.
* This expression can be assigned to a vector with entries of type \c bool.
*
* \sa DenseBase::all() */
EIGEN_DEVICE_FUNC
const AllReturnType all() const
{ return AllReturnType(_expression()); }
/** \returns a row (or column) vector expression representing
* whether \b at \b least one coefficient of each respective column (or row) is \c true.
* This expression can be assigned to a vector with entries of type \c bool.
*
* \sa DenseBase::any() */
EIGEN_DEVICE_FUNC
const AnyReturnType any() const
{ return AnyReturnType(_expression()); }
/** \returns a row (or column) vector expression representing
* the number of \c true coefficients of each respective column (or row).
* This expression can be assigned to a vector whose entries have the same type as is used to
* index entries of the original matrix; for dense matrices, this is \c std::ptrdiff_t .
*
* Example: \include PartialRedux_count.cpp
* Output: \verbinclude PartialRedux_count.out
*
* \sa DenseBase::count() */
EIGEN_DEVICE_FUNC
const CountReturnType count() const
{ return CountReturnType(_expression()); }
/** \returns a row (or column) vector expression of the product
* of each column (or row) of the referenced expression.
*
* Example: \include PartialRedux_prod.cpp
* Output: \verbinclude PartialRedux_prod.out
*
* \sa DenseBase::prod() */
EIGEN_DEVICE_FUNC
const ProdReturnType prod() const
{ return ProdReturnType(_expression()); }
/** \returns a matrix expression
* in which each column (or row) is reversed.
*
* Example: \include Vectorwise_reverse.cpp
* Output: \verbinclude Vectorwise_reverse.out
*
* \sa DenseBase::reverse() */
EIGEN_DEVICE_FUNC
const ConstReverseReturnType reverse() const
{ return ConstReverseReturnType( _expression() ); }
/** \returns a writable matrix expression
* in which each column (or row) is reversed.
*
* \sa reverse() const */
EIGEN_DEVICE_FUNC
ReverseReturnType reverse()
{ return ReverseReturnType( _expression() ); }
typedef Replicate<ExpressionType,(isVertical?Dynamic:1),(isHorizontal?Dynamic:1)> ReplicateReturnType;
EIGEN_DEVICE_FUNC
const ReplicateReturnType replicate(Index factor) const;
/**
* \return an expression of the replication of each column (or row) of \c *this
*
* Example: \include DirectionWise_replicate.cpp
* Output: \verbinclude DirectionWise_replicate.out
*
* \sa VectorwiseOp::replicate(Index), DenseBase::replicate(), class Replicate
*/
// NOTE implemented here because of sunstudio's compilation errors
// isVertical*Factor+isHorizontal instead of (isVertical?Factor:1) to handle CUDA bug with ternary operator
template<int Factor> const Replicate<ExpressionType,isVertical*Factor+isHorizontal,isHorizontal*Factor+isVertical>
EIGEN_DEVICE_FUNC
replicate(Index factor = Factor) const
{
return Replicate<ExpressionType,(isVertical?Factor:1),(isHorizontal?Factor:1)>
(_expression(),isVertical?factor:1,isHorizontal?factor:1);
}
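// Illustrative usage sketch (not part of the original source); the vector
// name is hypothetical:
//
//   Eigen::Vector3i v(1, 2, 3);
//   Eigen::MatrixXi r = v.rowwise().replicate(4);  // 3x4: each row repeated 4 times
//   // r = [1 1 1 1; 2 2 2 2; 3 3 3 3]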
/////////// Arithmetic operators ///////////
/** Copies the vector \a other to each subvector of \c *this */
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
ExpressionType& operator=(const DenseBase<OtherDerived>& other)
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
//eigen_assert((m_matrix.isNull()) == (other.isNull())); FIXME
return const_cast<ExpressionType&>(m_matrix = extendedTo(other.derived()));
}
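// Illustrative usage sketch (not part of the original source); names are
// hypothetical:
//
//   Eigen::MatrixXd m(3,4);
//   Eigen::Vector3d v(1.0, 2.0, 3.0);
//   m.colwise() = v;  // every column of m is now a copy of v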
/** Adds the vector \a other to each subvector of \c *this */
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
ExpressionType& operator+=(const DenseBase<OtherDerived>& other)
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
return const_cast<ExpressionType&>(m_matrix += extendedTo(other.derived()));
}
/** Subtracts the vector \a other from each subvector of \c *this */
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
ExpressionType& operator-=(const DenseBase<OtherDerived>& other)
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
return const_cast<ExpressionType&>(m_matrix -= extendedTo(other.derived()));
}
/** Multiplies each subvector of \c *this by the vector \a other */
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
ExpressionType& operator*=(const DenseBase<OtherDerived>& other)
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
EIGEN_STATIC_ASSERT_ARRAYXPR(ExpressionType)
EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
m_matrix *= extendedTo(other.derived());
return const_cast<ExpressionType&>(m_matrix);
}
/** Divides each subvector of \c *this by the vector \a other */
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
ExpressionType& operator/=(const DenseBase<OtherDerived>& other)
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
EIGEN_STATIC_ASSERT_ARRAYXPR(ExpressionType)
EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
m_matrix /= extendedTo(other.derived());
return const_cast<ExpressionType&>(m_matrix);
}
/** Returns the expression of the sum of the vector \a other and each subvector of \c *this */
template<typename OtherDerived> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
CwiseBinaryOp<internal::scalar_sum_op<Scalar,typename OtherDerived::Scalar>,
const ExpressionTypeNestedCleaned,
const typename ExtendedType<OtherDerived>::Type>
operator+(const DenseBase<OtherDerived>& other) const
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
return m_matrix + extendedTo(other.derived());
}
/** Returns the expression of the difference between each subvector of \c *this and the vector \a other */
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
CwiseBinaryOp<internal::scalar_difference_op<Scalar,typename OtherDerived::Scalar>,
const ExpressionTypeNestedCleaned,
const typename ExtendedType<OtherDerived>::Type>
operator-(const DenseBase<OtherDerived>& other) const
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
return m_matrix - extendedTo(other.derived());
}
/** Returns the expression where each subvector is the product of the vector \a other
* by the corresponding subvector of \c *this */
template<typename OtherDerived> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
CwiseBinaryOp<internal::scalar_product_op<Scalar>,
const ExpressionTypeNestedCleaned,
const typename ExtendedType<OtherDerived>::Type>
operator*(const DenseBase<OtherDerived>& other) const
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
EIGEN_STATIC_ASSERT_ARRAYXPR(ExpressionType)
EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
return m_matrix * extendedTo(other.derived());
}
/** Returns the expression where each subvector is the quotient of the corresponding
* subvector of \c *this by the vector \a other */
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
CwiseBinaryOp<internal::scalar_quotient_op<Scalar>,
const ExpressionTypeNestedCleaned,
const typename ExtendedType<OtherDerived>::Type>
operator/(const DenseBase<OtherDerived>& other) const
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
EIGEN_STATIC_ASSERT_ARRAYXPR(ExpressionType)
EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
return m_matrix / extendedTo(other.derived());
}
/** \returns an expression in which each column (or row) of the referenced matrix is normalized.
* The referenced matrix is \b not modified.
* \sa MatrixBase::normalized(), normalize()
*/
EIGEN_DEVICE_FUNC
CwiseBinaryOp<internal::scalar_quotient_op<Scalar>,
const ExpressionTypeNestedCleaned,
const typename OppositeExtendedType<typename ReturnType<internal::member_norm,RealScalar>::Type>::Type>
normalized() const { return m_matrix.cwiseQuotient(extendedToOpposite(this->norm())); }
/** Normalizes each column (or row) of the referenced matrix in place.
* \sa MatrixBase::normalize(), normalized()
*/
EIGEN_DEVICE_FUNC void normalize() {
m_matrix = this->normalized();
}
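// Illustrative usage sketch (not part of the original source); names are
// hypothetical:
//
//   Eigen::MatrixXd m = Eigen::MatrixXd::Random(3,5);
//   m.colwise().normalize();                      // in place: each column now has unit norm
//   Eigen::MatrixXd n = m.rowwise().normalized(); // non-mutating row-wise variant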
EIGEN_DEVICE_FUNC inline void reverseInPlace();
/////////// Geometry module ///////////
typedef Homogeneous<ExpressionType,Direction> HomogeneousReturnType;
EIGEN_DEVICE_FUNC
HomogeneousReturnType homogeneous() const;
typedef typename ExpressionType::PlainObject CrossReturnType;
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
const CrossReturnType cross(const MatrixBase<OtherDerived>& other) const;
enum {
HNormalized_Size = Direction==Vertical ? internal::traits<ExpressionType>::RowsAtCompileTime
: internal::traits<ExpressionType>::ColsAtCompileTime,
HNormalized_SizeMinusOne = HNormalized_Size==Dynamic ? Dynamic : HNormalized_Size-1
};
typedef Block<const ExpressionType,
Direction==Vertical ? int(HNormalized_SizeMinusOne)
: int(internal::traits<ExpressionType>::RowsAtCompileTime),
Direction==Horizontal ? int(HNormalized_SizeMinusOne)
: int(internal::traits<ExpressionType>::ColsAtCompileTime)>
HNormalized_Block;
typedef Block<const ExpressionType,
Direction==Vertical ? 1 : int(internal::traits<ExpressionType>::RowsAtCompileTime),
Direction==Horizontal ? 1 : int(internal::traits<ExpressionType>::ColsAtCompileTime)>
HNormalized_Factors;
typedef CwiseBinaryOp<internal::scalar_quotient_op<typename internal::traits<ExpressionType>::Scalar>,
const HNormalized_Block,
const Replicate<HNormalized_Factors,
Direction==Vertical ? HNormalized_SizeMinusOne : 1,
Direction==Horizontal ? HNormalized_SizeMinusOne : 1> >
HNormalizedReturnType;
EIGEN_DEVICE_FUNC
const HNormalizedReturnType hnormalized() const;
protected:
ExpressionTypeNested m_matrix;
};
//const colwise moved to DenseBase.h due to CUDA compiler bug
/** \returns a writable VectorwiseOp wrapper of *this providing additional partial reduction operations
*
* \sa rowwise(), class VectorwiseOp, \ref TutorialReductionsVisitorsBroadcasting
*/
template<typename Derived>
inline typename DenseBase<Derived>::ColwiseReturnType
DenseBase<Derived>::colwise()
{
return ColwiseReturnType(derived());
}
//const rowwise moved to DenseBase.h due to CUDA compiler bug
/** \returns a writable VectorwiseOp wrapper of *this providing additional partial reduction operations
*
* \sa colwise(), class VectorwiseOp, \ref TutorialReductionsVisitorsBroadcasting
*/
template<typename Derived>
inline typename DenseBase<Derived>::RowwiseReturnType
DenseBase<Derived>::rowwise()
{
return RowwiseReturnType(derived());
}
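// Illustrative usage sketch (not part of the original source); names are
// hypothetical:
//
//   Eigen::MatrixXf m(2,2);
//   m << 1, 2,
//        3, 4;
//   Eigen::RowVector2f colSums = m.colwise().sum();      // [4, 6]
//   Eigen::Vector2f    rowMaxs = m.rowwise().maxCoeff(); // [2, 4]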
} // end namespace Eigen
#endif // EIGEN_PARTIAL_REDUX_H
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_VISITOR_H
#define EIGEN_VISITOR_H
namespace Eigen {
namespace internal {
template<typename Visitor, typename Derived, int UnrollCount>
struct visitor_impl
{
enum {
col = (UnrollCount-1) / Derived::RowsAtCompileTime,
row = (UnrollCount-1) % Derived::RowsAtCompileTime
};
EIGEN_DEVICE_FUNC
static inline void run(const Derived &mat, Visitor& visitor)
{
visitor_impl<Visitor, Derived, UnrollCount-1>::run(mat, visitor);
visitor(mat.coeff(row, col), row, col);
}
};
template<typename Visitor, typename Derived>
struct visitor_impl<Visitor, Derived, 1>
{
EIGEN_DEVICE_FUNC
static inline void run(const Derived &mat, Visitor& visitor)
{
return visitor.init(mat.coeff(0, 0), 0, 0);
}
};
template<typename Visitor, typename Derived>
struct visitor_impl<Visitor, Derived, Dynamic>
{
EIGEN_DEVICE_FUNC
static inline void run(const Derived& mat, Visitor& visitor)
{
visitor.init(mat.coeff(0,0), 0, 0);
for(Index i = 1; i < mat.rows(); ++i)
visitor(mat.coeff(i, 0), i, 0);
for(Index j = 1; j < mat.cols(); ++j)
for(Index i = 0; i < mat.rows(); ++i)
visitor(mat.coeff(i, j), i, j);
}
};
// evaluator adaptor
template<typename XprType>
class visitor_evaluator
{
public:
EIGEN_DEVICE_FUNC
explicit visitor_evaluator(const XprType &xpr) : m_evaluator(xpr), m_xpr(xpr) {}
typedef typename XprType::Scalar Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
enum {
RowsAtCompileTime = XprType::RowsAtCompileTime,
CoeffReadCost = internal::evaluator<XprType>::CoeffReadCost
};
EIGEN_DEVICE_FUNC Index rows() const { return m_xpr.rows(); }
EIGEN_DEVICE_FUNC Index cols() const { return m_xpr.cols(); }
EIGEN_DEVICE_FUNC Index size() const { return m_xpr.size(); }
EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
{ return m_evaluator.coeff(row, col); }
protected:
internal::evaluator<XprType> m_evaluator;
const XprType &m_xpr;
};
} // end namespace internal
/** Applies the visitor \a visitor to all the coefficients of the matrix or vector.
*
* The template parameter \a Visitor is the type of the visitor and provides the following interface:
* \code
* struct MyVisitor {
* // called for the first coefficient
* void init(const Scalar& value, Index i, Index j);
* // called for all other coefficients
* void operator() (const Scalar& value, Index i, Index j);
* };
* \endcode
*
* \note compared to one or two \em for \em loops, visitors offer automatic
* unrolling for small fixed-size matrices.
*
* \sa minCoeff(Index*,Index*), maxCoeff(Index*,Index*), DenseBase::redux()
*/
template<typename Derived>
template<typename Visitor>
EIGEN_DEVICE_FUNC
void DenseBase<Derived>::visit(Visitor& visitor) const
{
typedef typename internal::visitor_evaluator<Derived> ThisEvaluator;
ThisEvaluator thisEval(derived());
enum {
unroll = SizeAtCompileTime != Dynamic
&& SizeAtCompileTime * ThisEvaluator::CoeffReadCost + (SizeAtCompileTime-1) * internal::functor_traits<Visitor>::Cost <= EIGEN_UNROLLING_LIMIT
};
return internal::visitor_impl<Visitor, ThisEvaluator, unroll ? int(SizeAtCompileTime) : Dynamic>::run(thisEval, visitor);
}
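// Illustrative usage sketch (not part of the original source); the visitor
// type and names are hypothetical. Any type providing init() and operator()
// as documented above works:
//
//   struct NegCounter {
//     int count;
//     void init(const double& v, Eigen::Index, Eigen::Index) { count = v < 0.0 ? 1 : 0; }
//     void operator()(const double& v, Eigen::Index, Eigen::Index) { if (v < 0.0) ++count; }
//   };
//   Eigen::MatrixXd m = Eigen::MatrixXd::Random(4,4);
//   NegCounter counter;
//   m.visit(counter);  // counter.count == number of negative coefficients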
namespace internal {
/** \internal
* \brief Base class to implement min and max visitors
*/
template <typename Derived>
struct coeff_visitor
{
typedef typename Derived::Scalar Scalar;
Index row, col;
Scalar res;
EIGEN_DEVICE_FUNC
inline void init(const Scalar& value, Index i, Index j)
{
res = value;
row = i;
col = j;
}
};
/** \internal
* \brief Visitor computing the min coefficient with its value and coordinates
*
* \sa DenseBase::minCoeff(Index*, Index*)
*/
template <typename Derived>
struct min_coeff_visitor : coeff_visitor<Derived>
{
typedef typename Derived::Scalar Scalar;
EIGEN_DEVICE_FUNC
void operator() (const Scalar& value, Index i, Index j)
{
if(value < this->res)
{
this->res = value;
this->row = i;
this->col = j;
}
}
};
template<typename Scalar>
struct functor_traits<min_coeff_visitor<Scalar> > {
enum {
Cost = NumTraits<Scalar>::AddCost
};
};
/** \internal
* \brief Visitor computing the max coefficient with its value and coordinates
*
* \sa DenseBase::maxCoeff(Index*, Index*)
*/
template <typename Derived>
struct max_coeff_visitor : coeff_visitor<Derived>
{
typedef typename Derived::Scalar Scalar;
EIGEN_DEVICE_FUNC
void operator() (const Scalar& value, Index i, Index j)
{
if(value > this->res)
{
this->res = value;
this->row = i;
this->col = j;
}
}
};
template<typename Scalar>
struct functor_traits<max_coeff_visitor<Scalar> > {
enum {
Cost = NumTraits<Scalar>::AddCost
};
};
} // end namespace internal
/** \fn DenseBase<Derived>::minCoeff(IndexType* rowId, IndexType* colId) const
* \returns the minimum of all coefficients of *this and stores its location in *rowId and *colId.
* \warning the result is undefined if \c *this contains NaN.
*
* \sa DenseBase::minCoeff(Index*), DenseBase::maxCoeff(Index*,Index*), DenseBase::visit(), DenseBase::minCoeff()
*/
template<typename Derived>
template<typename IndexType>
EIGEN_DEVICE_FUNC
typename internal::traits<Derived>::Scalar
DenseBase<Derived>::minCoeff(IndexType* rowId, IndexType* colId) const
{
internal::min_coeff_visitor<Derived> minVisitor;
this->visit(minVisitor);
*rowId = minVisitor.row;
if (colId) *colId = minVisitor.col;
return minVisitor.res;
}
/** \returns the minimum of all coefficients of *this and stores its location in *index.
* \warning the result is undefined if \c *this contains NaN.
*
* \sa DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::maxCoeff(IndexType*,IndexType*), DenseBase::visit(), DenseBase::minCoeff()
*/
template<typename Derived>
template<typename IndexType>
EIGEN_DEVICE_FUNC
typename internal::traits<Derived>::Scalar
DenseBase<Derived>::minCoeff(IndexType* index) const
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
internal::min_coeff_visitor<Derived> minVisitor;
this->visit(minVisitor);
*index = IndexType((RowsAtCompileTime==1) ? minVisitor.col : minVisitor.row);
return minVisitor.res;
}
/** \fn DenseBase<Derived>::maxCoeff(IndexType* rowId, IndexType* colId) const
* \returns the maximum of all coefficients of *this and stores its location in *rowId and *colId.
* \warning the result is undefined if \c *this contains NaN.
*
* \sa DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::visit(), DenseBase::maxCoeff()
*/
template<typename Derived>
template<typename IndexType>
EIGEN_DEVICE_FUNC
typename internal::traits<Derived>::Scalar
DenseBase<Derived>::maxCoeff(IndexType* rowId, IndexType* colId) const
{
internal::max_coeff_visitor<Derived> maxVisitor;
this->visit(maxVisitor);
*rowId = maxVisitor.row;
if (colId) *colId = maxVisitor.col;
return maxVisitor.res;
}
/** \returns the maximum of all coefficients of *this and stores its location in *index.
* \warning the result is undefined if \c *this contains NaN.
*
* \sa DenseBase::maxCoeff(IndexType*,IndexType*), DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::visit(), DenseBase::maxCoeff()
*/
template<typename Derived>
template<typename IndexType>
EIGEN_DEVICE_FUNC
typename internal::traits<Derived>::Scalar
DenseBase<Derived>::maxCoeff(IndexType* index) const
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
internal::max_coeff_visitor<Derived> maxVisitor;
this->visit(maxVisitor);
*index = IndexType((RowsAtCompileTime==1) ? maxVisitor.col : maxVisitor.row);
return maxVisitor.res;
}
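// Illustrative usage sketch (not part of the original source); names are
// hypothetical:
//
//   Eigen::MatrixXf m(2,2);
//   m << 3, 1,
//        4, 2;
//   Eigen::Index r, c;
//   float mn = m.minCoeff(&r, &c);  // mn == 1, r == 0, c == 1
//   Eigen::VectorXf v(3);
//   v << 5, -1, 7;
//   Eigen::Index i;
//   float mx = v.maxCoeff(&i);      // mx == 7, i == 2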
} // end namespace Eigen
#endif // EIGEN_VISITOR_H
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner (benoit.steiner.goog@gmail.com)
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_COMPLEX_AVX_H
#define EIGEN_COMPLEX_AVX_H
namespace Eigen {
namespace internal {
//---------- float ----------
struct Packet4cf
{
EIGEN_STRONG_INLINE Packet4cf() {}
EIGEN_STRONG_INLINE explicit Packet4cf(const __m256& a) : v(a) {}
__m256 v;
};
template<> struct packet_traits<std::complex<float> > : default_packet_traits
{
typedef Packet4cf type;
typedef Packet2cf half;
enum {
Vectorizable = 1,
AlignedOnScalar = 1,
size = 4,
HasHalfPacket = 1,
HasAdd = 1,
HasSub = 1,
HasMul = 1,
HasDiv = 1,
HasNegate = 1,
HasAbs = 0,
HasAbs2 = 0,
HasMin = 0,
HasMax = 0,
HasSetLinear = 0
};
};
template<> struct unpacket_traits<Packet4cf> { typedef std::complex<float> type; enum {size=4, alignment=Aligned32}; typedef Packet2cf half; };
template<> EIGEN_STRONG_INLINE Packet4cf padd<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_add_ps(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet4cf psub<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_sub_ps(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet4cf pnegate(const Packet4cf& a)
{
return Packet4cf(pnegate(a.v));
}
template<> EIGEN_STRONG_INLINE Packet4cf pconj(const Packet4cf& a)
{
const __m256 mask = _mm256_castsi256_ps(_mm256_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000));
return Packet4cf(_mm256_xor_ps(a.v,mask));
}
template<> EIGEN_STRONG_INLINE Packet4cf pmul<Packet4cf>(const Packet4cf& a, const Packet4cf& b)
{
__m256 tmp1 = _mm256_mul_ps(_mm256_moveldup_ps(a.v), b.v);
__m256 tmp2 = _mm256_mul_ps(_mm256_movehdup_ps(a.v), _mm256_permute_ps(b.v, _MM_SHUFFLE(2,3,0,1)));
__m256 result = _mm256_addsub_ps(tmp1, tmp2);
return Packet4cf(result);
}
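// The three steps above implement the scalar identity
//   (ar + i*ai)*(br + i*bi) = (ar*br - ai*bi) + i*(ar*bi + ai*br)
// on interleaved [re, im, ...] lanes: moveldup duplicates ar into both lanes,
// movehdup duplicates ai, the permute swaps the re/im of b, and addsub
// subtracts in the even (real) lanes while adding in the odd (imaginary) ones.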
template<> EIGEN_STRONG_INLINE Packet4cf pand <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_and_ps(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet4cf por <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_or_ps(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet4cf pxor <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_xor_ps(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet4cf pandnot<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_andnot_ps(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet4cf pload <Packet4cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet4cf(pload<Packet8f>(&numext::real_ref(*from))); }
template<> EIGEN_STRONG_INLINE Packet4cf ploadu<Packet4cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet4cf(ploadu<Packet8f>(&numext::real_ref(*from))); }
template<> EIGEN_STRONG_INLINE Packet4cf pset1<Packet4cf>(const std::complex<float>& from)
{
return Packet4cf(_mm256_castpd_ps(_mm256_broadcast_sd((const double*)(const void*)&from)));
}
template<> EIGEN_STRONG_INLINE Packet4cf ploaddup<Packet4cf>(const std::complex<float>* from)
{
// FIXME The following might be optimized using _mm256_movedup_pd
Packet2cf a = ploaddup<Packet2cf>(from);
Packet2cf b = ploaddup<Packet2cf>(from+1);
return Packet4cf(_mm256_insertf128_ps(_mm256_castps128_ps256(a.v), b.v, 1));
}
template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float>* to, const Packet4cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), from.v); }
template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float>* to, const Packet4cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), from.v); }
template<> EIGEN_DEVICE_FUNC inline Packet4cf pgather<std::complex<float>, Packet4cf>(const std::complex<float>* from, Index stride)
{
return Packet4cf(_mm256_set_ps(std::imag(from[3*stride]), std::real(from[3*stride]),
std::imag(from[2*stride]), std::real(from[2*stride]),
std::imag(from[1*stride]), std::real(from[1*stride]),
std::imag(from[0*stride]), std::real(from[0*stride])));
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet4cf>(std::complex<float>* to, const Packet4cf& from, Index stride)
{
__m128 low = _mm256_extractf128_ps(from.v, 0);
to[stride*0] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(low, low, 0)),
_mm_cvtss_f32(_mm_shuffle_ps(low, low, 1)));
to[stride*1] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(low, low, 2)),
_mm_cvtss_f32(_mm_shuffle_ps(low, low, 3)));
__m128 high = _mm256_extractf128_ps(from.v, 1);
to[stride*2] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(high, high, 0)),
_mm_cvtss_f32(_mm_shuffle_ps(high, high, 1)));
to[stride*3] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(high, high, 2)),
_mm_cvtss_f32(_mm_shuffle_ps(high, high, 3)));
}
template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet4cf>(const Packet4cf& a)
{
return pfirst(Packet2cf(_mm256_castps256_ps128(a.v)));
}
template<> EIGEN_STRONG_INLINE Packet4cf preverse(const Packet4cf& a) {
__m128 low = _mm256_extractf128_ps(a.v, 0);
__m128 high = _mm256_extractf128_ps(a.v, 1);
__m128d lowd = _mm_castps_pd(low);
__m128d highd = _mm_castps_pd(high);
low = _mm_castpd_ps(_mm_shuffle_pd(lowd,lowd,0x1));
high = _mm_castpd_ps(_mm_shuffle_pd(highd,highd,0x1));
__m256 result = _mm256_setzero_ps();
result = _mm256_insertf128_ps(result, low, 1);
result = _mm256_insertf128_ps(result, high, 0);
return Packet4cf(result);
}
template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet4cf>(const Packet4cf& a)
{
return predux(padd(Packet2cf(_mm256_extractf128_ps(a.v,0)),
Packet2cf(_mm256_extractf128_ps(a.v,1))));
}
template<> EIGEN_STRONG_INLINE Packet4cf preduxp<Packet4cf>(const Packet4cf* vecs)
{
Packet8f t0 = _mm256_shuffle_ps(vecs[0].v, vecs[0].v, _MM_SHUFFLE(3, 1, 2 ,0));
Packet8f t1 = _mm256_shuffle_ps(vecs[1].v, vecs[1].v, _MM_SHUFFLE(3, 1, 2 ,0));
t0 = _mm256_hadd_ps(t0,t1);
Packet8f t2 = _mm256_shuffle_ps(vecs[2].v, vecs[2].v, _MM_SHUFFLE(3, 1, 2 ,0));
Packet8f t3 = _mm256_shuffle_ps(vecs[3].v, vecs[3].v, _MM_SHUFFLE(3, 1, 2 ,0));
t2 = _mm256_hadd_ps(t2,t3);
t1 = _mm256_permute2f128_ps(t0,t2, 0 + (2<<4));
t3 = _mm256_permute2f128_ps(t0,t2, 1 + (3<<4));
return Packet4cf(_mm256_add_ps(t1,t3));
}
template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet4cf>(const Packet4cf& a)
{
return predux_mul(pmul(Packet2cf(_mm256_extractf128_ps(a.v, 0)),
Packet2cf(_mm256_extractf128_ps(a.v, 1))));
}
template<int Offset>
struct palign_impl<Offset,Packet4cf>
{
static EIGEN_STRONG_INLINE void run(Packet4cf& first, const Packet4cf& second)
{
if (Offset==0) return;
palign_impl<Offset*2,Packet8f>::run(first.v, second.v);
}
};
template<> struct conj_helper<Packet4cf, Packet4cf, false,true>
{
EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const
{ return padd(pmul(x,y),c); }
EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const
{
return internal::pmul(a, pconj(b));
}
};
template<> struct conj_helper<Packet4cf, Packet4cf, true,false>
{
EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const
{ return padd(pmul(x,y),c); }
EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const
{
return internal::pmul(pconj(a), b);
}
};
template<> struct conj_helper<Packet4cf, Packet4cf, true,true>
{
EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const
{ return padd(pmul(x,y),c); }
EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const
{
return pconj(internal::pmul(a, b));
}
};
template<> struct conj_helper<Packet8f, Packet4cf, false,false>
{
EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet8f& x, const Packet4cf& y, const Packet4cf& c) const
{ return padd(c, pmul(x,y)); }
EIGEN_STRONG_INLINE Packet4cf pmul(const Packet8f& x, const Packet4cf& y) const
{ return Packet4cf(Eigen::internal::pmul(x, y.v)); }
};
template<> struct conj_helper<Packet4cf, Packet8f, false,false>
{
EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet8f& y, const Packet4cf& c) const
{ return padd(c, pmul(x,y)); }
EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& x, const Packet8f& y) const
{ return Packet4cf(Eigen::internal::pmul(x.v, y)); }
};
template<> EIGEN_STRONG_INLINE Packet4cf pdiv<Packet4cf>(const Packet4cf& a, const Packet4cf& b)
{
Packet4cf num = pmul(a, pconj(b));
__m256 tmp = _mm256_mul_ps(b.v, b.v);
__m256 tmp2 = _mm256_shuffle_ps(tmp,tmp,0xB1);
__m256 denom = _mm256_add_ps(tmp, tmp2);
return Packet4cf(_mm256_div_ps(num.v, denom));
}
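// The quotient above follows a/b = (a * conj(b)) / |b|^2: pmul(a, pconj(b))
// forms the numerator, and tmp + tmp2 places br^2 + bi^2 in both the real and
// imaginary lane of each complex entry before the final element-wise division.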
template<> EIGEN_STRONG_INLINE Packet4cf pcplxflip<Packet4cf>(const Packet4cf& x)
{
return Packet4cf(_mm256_shuffle_ps(x.v, x.v, _MM_SHUFFLE(2, 3, 0 ,1)));
}
//---------- double ----------
struct Packet2cd
{
EIGEN_STRONG_INLINE Packet2cd() {}
EIGEN_STRONG_INLINE explicit Packet2cd(const __m256d& a) : v(a) {}
__m256d v;
};
template<> struct packet_traits<std::complex<double> > : default_packet_traits
{
typedef Packet2cd type;
typedef Packet1cd half;
enum {
Vectorizable = 1,
AlignedOnScalar = 0,
size = 2,
HasHalfPacket = 1,
HasAdd = 1,
HasSub = 1,
HasMul = 1,
HasDiv = 1,
HasNegate = 1,
HasAbs = 0,
HasAbs2 = 0,
HasMin = 0,
HasMax = 0,
HasSetLinear = 0
};
};
template<> struct unpacket_traits<Packet2cd> { typedef std::complex<double> type; enum {size=2, alignment=Aligned32}; typedef Packet1cd half; };
template<> EIGEN_STRONG_INLINE Packet2cd padd<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_add_pd(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cd psub<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_sub_pd(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cd pnegate(const Packet2cd& a) { return Packet2cd(pnegate(a.v)); }
template<> EIGEN_STRONG_INLINE Packet2cd pconj(const Packet2cd& a)
{
const __m256d mask = _mm256_castsi256_pd(_mm256_set_epi32(0x80000000,0x0,0x0,0x0,0x80000000,0x0,0x0,0x0));
return Packet2cd(_mm256_xor_pd(a.v,mask));
}
template<> EIGEN_STRONG_INLINE Packet2cd pmul<Packet2cd>(const Packet2cd& a, const Packet2cd& b)
{
__m256d tmp1 = _mm256_shuffle_pd(a.v,a.v,0x0);
__m256d even = _mm256_mul_pd(tmp1, b.v);
__m256d tmp2 = _mm256_shuffle_pd(a.v,a.v,0xF);
__m256d tmp3 = _mm256_shuffle_pd(b.v,b.v,0x5);
__m256d odd = _mm256_mul_pd(tmp2, tmp3);
return Packet2cd(_mm256_addsub_pd(even, odd));
}
template<> EIGEN_STRONG_INLINE Packet2cd pand <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_and_pd(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cd por <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_or_pd(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cd pxor <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_xor_pd(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cd pandnot<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_andnot_pd(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cd pload <Packet2cd>(const std::complex<double>* from)
{ EIGEN_DEBUG_ALIGNED_LOAD return Packet2cd(pload<Packet4d>((const double*)from)); }
template<> EIGEN_STRONG_INLINE Packet2cd ploadu<Packet2cd>(const std::complex<double>* from)
{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cd(ploadu<Packet4d>((const double*)from)); }
template<> EIGEN_STRONG_INLINE Packet2cd pset1<Packet2cd>(const std::complex<double>& from)
{
// If casting to a __m128d* is not actually safe, we can still fall back to this (much slower) version:
// return Packet2cd(_mm256_loadu2_m128d((const double*)&from,(const double*)&from));
return Packet2cd(_mm256_broadcast_pd((const __m128d*)(const void*)&from));
}
template<> EIGEN_STRONG_INLINE Packet2cd ploaddup<Packet2cd>(const std::complex<double>* from) { return pset1<Packet2cd>(*from); }
template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> * to, const Packet2cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); }
template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> * to, const Packet2cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); }
template<> EIGEN_DEVICE_FUNC inline Packet2cd pgather<std::complex<double>, Packet2cd>(const std::complex<double>* from, Index stride)
{
return Packet2cd(_mm256_set_pd(std::imag(from[1*stride]), std::real(from[1*stride]),
std::imag(from[0*stride]), std::real(from[0*stride])));
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet2cd>(std::complex<double>* to, const Packet2cd& from, Index stride)
{
__m128d low = _mm256_extractf128_pd(from.v, 0);
to[stride*0] = std::complex<double>(_mm_cvtsd_f64(low), _mm_cvtsd_f64(_mm_shuffle_pd(low, low, 1)));
__m128d high = _mm256_extractf128_pd(from.v, 1);
to[stride*1] = std::complex<double>(_mm_cvtsd_f64(high), _mm_cvtsd_f64(_mm_shuffle_pd(high, high, 1)));
}
template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet2cd>(const Packet2cd& a)
{
__m128d low = _mm256_extractf128_pd(a.v, 0);
EIGEN_ALIGN16 double res[2];
_mm_store_pd(res, low);
return std::complex<double>(res[0],res[1]);
}
template<> EIGEN_STRONG_INLINE Packet2cd preverse(const Packet2cd& a) {
__m256d result = _mm256_permute2f128_pd(a.v, a.v, 1);
return Packet2cd(result);
}
template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet2cd>(const Packet2cd& a)
{
return predux(padd(Packet1cd(_mm256_extractf128_pd(a.v,0)),
Packet1cd(_mm256_extractf128_pd(a.v,1))));
}
template<> EIGEN_STRONG_INLINE Packet2cd preduxp<Packet2cd>(const Packet2cd* vecs)
{
Packet4d t0 = _mm256_permute2f128_pd(vecs[0].v,vecs[1].v, 0 + (2<<4));
Packet4d t1 = _mm256_permute2f128_pd(vecs[0].v,vecs[1].v, 1 + (3<<4));
return Packet2cd(_mm256_add_pd(t0,t1));
}
template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet2cd>(const Packet2cd& a)
{
return predux(pmul(Packet1cd(_mm256_extractf128_pd(a.v,0)),
Packet1cd(_mm256_extractf128_pd(a.v,1))));
}
template<int Offset>
struct palign_impl<Offset,Packet2cd>
{
static EIGEN_STRONG_INLINE void run(Packet2cd& first, const Packet2cd& second)
{
if (Offset==0) return;
palign_impl<Offset*2,Packet4d>::run(first.v, second.v);
}
};
template<> struct conj_helper<Packet2cd, Packet2cd, false,true>
{
EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const
{ return padd(pmul(x,y),c); }
EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const
{
return internal::pmul(a, pconj(b));
}
};
template<> struct conj_helper<Packet2cd, Packet2cd, true,false>
{
EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const
{ return padd(pmul(x,y),c); }
EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const
{
return internal::pmul(pconj(a), b);
}
};
template<> struct conj_helper<Packet2cd, Packet2cd, true,true>
{
EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const
{ return padd(pmul(x,y),c); }
EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const
{
return pconj(internal::pmul(a, b));
}
};
template<> struct conj_helper<Packet4d, Packet2cd, false,false>
{
EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet4d& x, const Packet2cd& y, const Packet2cd& c) const
{ return padd(c, pmul(x,y)); }
EIGEN_STRONG_INLINE Packet2cd pmul(const Packet4d& x, const Packet2cd& y) const
{ return Packet2cd(Eigen::internal::pmul(x, y.v)); }
};
template<> struct conj_helper<Packet2cd, Packet4d, false,false>
{
EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet4d& y, const Packet2cd& c) const
{ return padd(c, pmul(x,y)); }
EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& x, const Packet4d& y) const
{ return Packet2cd(Eigen::internal::pmul(x.v, y)); }
};
template<> EIGEN_STRONG_INLINE Packet2cd pdiv<Packet2cd>(const Packet2cd& a, const Packet2cd& b)
{
Packet2cd num = pmul(a, pconj(b));
__m256d tmp = _mm256_mul_pd(b.v, b.v);
__m256d denom = _mm256_hadd_pd(tmp, tmp);
return Packet2cd(_mm256_div_pd(num.v, denom));
}
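// Same conjugate-multiply / squared-modulus identity as the Packet4cf pdiv
// above; here _mm256_hadd_pd(tmp, tmp) yields br^2 + bi^2 in both lanes of
// each 128-bit half.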
template<> EIGEN_STRONG_INLINE Packet2cd pcplxflip<Packet2cd>(const Packet2cd& x)
{
return Packet2cd(_mm256_shuffle_pd(x.v, x.v, 0x5));
}
EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet4cf,4>& kernel) {
__m256d P0 = _mm256_castps_pd(kernel.packet[0].v);
__m256d P1 = _mm256_castps_pd(kernel.packet[1].v);
__m256d P2 = _mm256_castps_pd(kernel.packet[2].v);
__m256d P3 = _mm256_castps_pd(kernel.packet[3].v);
__m256d T0 = _mm256_shuffle_pd(P0, P1, 15);
__m256d T1 = _mm256_shuffle_pd(P0, P1, 0);
__m256d T2 = _mm256_shuffle_pd(P2, P3, 15);
__m256d T3 = _mm256_shuffle_pd(P2, P3, 0);
kernel.packet[1].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T0, T2, 32));
kernel.packet[3].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T0, T2, 49));
kernel.packet[0].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T1, T3, 32));
kernel.packet[2].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T1, T3, 49));
}
EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet2cd,2>& kernel) {
__m256d tmp = _mm256_permute2f128_pd(kernel.packet[0].v, kernel.packet[1].v, 0+(2<<4));
kernel.packet[1].v = _mm256_permute2f128_pd(kernel.packet[0].v, kernel.packet[1].v, 1+(3<<4));
kernel.packet[0].v = tmp;
}
template<> EIGEN_STRONG_INLINE Packet4cf pinsertfirst(const Packet4cf& a, std::complex<float> b)
{
return Packet4cf(_mm256_blend_ps(a.v,pset1<Packet4cf>(b).v,1|2));
}
template<> EIGEN_STRONG_INLINE Packet2cd pinsertfirst(const Packet2cd& a, std::complex<double> b)
{
return Packet2cd(_mm256_blend_pd(a.v,pset1<Packet2cd>(b).v,1|2));
}
template<> EIGEN_STRONG_INLINE Packet4cf pinsertlast(const Packet4cf& a, std::complex<float> b)
{
return Packet4cf(_mm256_blend_ps(a.v,pset1<Packet4cf>(b).v,(1<<7)|(1<<6)));
}
template<> EIGEN_STRONG_INLINE Packet2cd pinsertlast(const Packet2cd& a, std::complex<double> b)
{
return Packet2cd(_mm256_blend_pd(a.v,pset1<Packet2cd>(b).v,(1<<3)|(1<<2)));
}
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_COMPLEX_AVX_H
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Pedro Gonnet (pedro.gonnet@gmail.com)
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_MATH_FUNCTIONS_AVX_H
#define EIGEN_MATH_FUNCTIONS_AVX_H
/* The sin, cos, exp, and log functions of this file are loosely derived from
* Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
*/
namespace Eigen {
namespace internal {
inline Packet8i pshiftleft(Packet8i v, int n)
{
#ifdef EIGEN_VECTORIZE_AVX2
return _mm256_slli_epi32(v, n);
#else
__m128i lo = _mm_slli_epi32(_mm256_extractf128_si256(v, 0), n);
__m128i hi = _mm_slli_epi32(_mm256_extractf128_si256(v, 1), n);
return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
#endif
}
inline Packet8f pshiftright(Packet8f v, int n)
{
#ifdef EIGEN_VECTORIZE_AVX2
return _mm256_cvtepi32_ps(_mm256_srli_epi32(_mm256_castps_si256(v), n));
#else
__m128i lo = _mm_srli_epi32(_mm256_extractf128_si256(_mm256_castps_si256(v), 0), n);
__m128i hi = _mm_srli_epi32(_mm256_extractf128_si256(_mm256_castps_si256(v), 1), n);
return _mm256_cvtepi32_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1));
#endif
}
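// Note: pshiftright reinterprets the float lanes as 32-bit integers, shifts
// them right, and converts the integer result back to float *values*; the
// logarithm below relies on this to turn the raw IEEE exponent bits into a
// usable floating-point exponent.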
// Sine function
// Computes sin(x) by wrapping x to the interval [-Pi/4,3*Pi/4] and
// evaluating interpolants in [-Pi/4,Pi/4] or [Pi/4,3*Pi/4]. The interpolants
// are (anti-)symmetric and thus have only odd/even coefficients
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
psin<Packet8f>(const Packet8f& _x) {
Packet8f x = _x;
// Some useful values.
_EIGEN_DECLARE_CONST_Packet8i(one, 1);
_EIGEN_DECLARE_CONST_Packet8f(one, 1.0f);
_EIGEN_DECLARE_CONST_Packet8f(two, 2.0f);
_EIGEN_DECLARE_CONST_Packet8f(one_over_four, 0.25f);
_EIGEN_DECLARE_CONST_Packet8f(one_over_pi, 3.183098861837907e-01f);
_EIGEN_DECLARE_CONST_Packet8f(neg_pi_first, -3.140625000000000e+00f);
_EIGEN_DECLARE_CONST_Packet8f(neg_pi_second, -9.670257568359375e-04f);
_EIGEN_DECLARE_CONST_Packet8f(neg_pi_third, -6.278329571784980e-07f);
_EIGEN_DECLARE_CONST_Packet8f(four_over_pi, 1.273239544735163e+00f);
// Map x from [-Pi/4,3*Pi/4] to z in [-1,3] and subtract the shifted period.
Packet8f z = pmul(x, p8f_one_over_pi);
Packet8f shift = _mm256_floor_ps(padd(z, p8f_one_over_four));
x = pmadd(shift, p8f_neg_pi_first, x);
x = pmadd(shift, p8f_neg_pi_second, x);
x = pmadd(shift, p8f_neg_pi_third, x);
z = pmul(x, p8f_four_over_pi);
// Make a mask for the entries that need flipping, i.e. wherever the shift
// is odd.
Packet8i shift_ints = _mm256_cvtps_epi32(shift);
Packet8i shift_isodd = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(shift_ints), _mm256_castsi256_ps(p8i_one)));
Packet8i sign_flip_mask = pshiftleft(shift_isodd, 31);
// Create a mask for which interpolant to use, i.e. if z > 1, then the mask
// is set to ones for that entry.
Packet8f ival_mask = _mm256_cmp_ps(z, p8f_one, _CMP_GT_OQ);
// Evaluate the polynomial for the interval [1,3] in z.
_EIGEN_DECLARE_CONST_Packet8f(coeff_right_0, 9.999999724233232e-01f);
_EIGEN_DECLARE_CONST_Packet8f(coeff_right_2, -3.084242535619928e-01f);
_EIGEN_DECLARE_CONST_Packet8f(coeff_right_4, 1.584991525700324e-02f);
_EIGEN_DECLARE_CONST_Packet8f(coeff_right_6, -3.188805084631342e-04f);
Packet8f z_minus_two = psub(z, p8f_two);
Packet8f z_minus_two2 = pmul(z_minus_two, z_minus_two);
Packet8f right = pmadd(p8f_coeff_right_6, z_minus_two2, p8f_coeff_right_4);
right = pmadd(right, z_minus_two2, p8f_coeff_right_2);
right = pmadd(right, z_minus_two2, p8f_coeff_right_0);
// Evaluate the polynomial for the interval [-1,1] in z.
_EIGEN_DECLARE_CONST_Packet8f(coeff_left_1, 7.853981525427295e-01f);
_EIGEN_DECLARE_CONST_Packet8f(coeff_left_3, -8.074536727092352e-02f);
_EIGEN_DECLARE_CONST_Packet8f(coeff_left_5, 2.489871967827018e-03f);
_EIGEN_DECLARE_CONST_Packet8f(coeff_left_7, -3.587725841214251e-05f);
Packet8f z2 = pmul(z, z);
Packet8f left = pmadd(p8f_coeff_left_7, z2, p8f_coeff_left_5);
left = pmadd(left, z2, p8f_coeff_left_3);
left = pmadd(left, z2, p8f_coeff_left_1);
left = pmul(left, z);
// Assemble the results, i.e. select the left and right polynomials.
left = _mm256_andnot_ps(ival_mask, left);
right = _mm256_and_ps(ival_mask, right);
Packet8f res = _mm256_or_ps(left, right);
// Flip the sign on the odd intervals and return the result.
res = _mm256_xor_ps(res, _mm256_castsi256_ps(sign_flip_mask));
return res;
}
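// Editor's sketch (not part of Eigen): the scalar computation each lane of
// psin performs, as a reference when validating the packet code. The three
// neg_pi_* constants split Pi so the argument reduction keeps extra precision.
//
//   float sin_ref(float x) {
//     float shift = std::floor(x * 0.3183098861837907f + 0.25f); // x/Pi + 1/4
//     x -= shift * 3.140625f;                  // subtract shift*Pi in 3 parts
//     x -= shift * 9.670257568359375e-4f;
//     x -= shift * 6.278329571784980e-7f;
//     float z = x * 1.273239544735163f;        // 4*x/Pi, now in [-1,3]
//     float r = z > 1.0f ? right_poly(z - 2.0f) : left_poly(z);
//     return (int(shift) & 1) ? -r : r;        // flip sign on odd periods
//   }
//
// left_poly/right_poly stand for the two interpolants evaluated above; they
// are hypothetical helpers, not functions defined in this file.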
// Natural logarithm
// Computes log(x) as log(2^e * m) = C*e + log(m), where the constant C = log(2)
// and m is in the range [sqrt(1/2),sqrt(2)). In this range, the logarithm can
// be easily approximated by a polynomial centered on m=1 for stability.
// TODO(gonnet): Further reduce the interval allowing for lower-degree
// polynomial interpolants -> ... -> profit!
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
plog<Packet8f>(const Packet8f& _x) {
Packet8f x = _x;
_EIGEN_DECLARE_CONST_Packet8f(1, 1.0f);
_EIGEN_DECLARE_CONST_Packet8f(half, 0.5f);
_EIGEN_DECLARE_CONST_Packet8f(126f, 126.0f);
_EIGEN_DECLARE_CONST_Packet8f_FROM_INT(inv_mant_mask, ~0x7f800000);
// The smallest non denormalized float number.
_EIGEN_DECLARE_CONST_Packet8f_FROM_INT(min_norm_pos, 0x00800000);
_EIGEN_DECLARE_CONST_Packet8f_FROM_INT(minus_inf, 0xff800000);
// Polynomial coefficients.
_EIGEN_DECLARE_CONST_Packet8f(cephes_SQRTHF, 0.707106781186547524f);
_EIGEN_DECLARE_CONST_Packet8f(cephes_log_p0, 7.0376836292E-2f);
_EIGEN_DECLARE_CONST_Packet8f(cephes_log_p1, -1.1514610310E-1f);
_EIGEN_DECLARE_CONST_Packet8f(cephes_log_p2, 1.1676998740E-1f);
_EIGEN_DECLARE_CONST_Packet8f(cephes_log_p3, -1.2420140846E-1f);
_EIGEN_DECLARE_CONST_Packet8f(cephes_log_p4, +1.4249322787E-1f);
_EIGEN_DECLARE_CONST_Packet8f(cephes_log_p5, -1.6668057665E-1f);
_EIGEN_DECLARE_CONST_Packet8f(cephes_log_p6, +2.0000714765E-1f);
_EIGEN_DECLARE_CONST_Packet8f(cephes_log_p7, -2.4999993993E-1f);
_EIGEN_DECLARE_CONST_Packet8f(cephes_log_p8, +3.3333331174E-1f);
_EIGEN_DECLARE_CONST_Packet8f(cephes_log_q1, -2.12194440e-4f);
_EIGEN_DECLARE_CONST_Packet8f(cephes_log_q2, 0.693359375f);
Packet8f invalid_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_NGE_UQ); // not greater-or-equal is true if x is NaN or x < 0
Packet8f iszero_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_EQ_OQ);
// Truncate input values to the minimum positive normal.
x = pmax(x, p8f_min_norm_pos);
Packet8f emm0 = pshiftright(x,23);
Packet8f e = _mm256_sub_ps(emm0, p8f_126f);
// Set the exponents to -1, i.e. the entries of x are now in the range [0.5,1).
x = _mm256_and_ps(x, p8f_inv_mant_mask);
x = _mm256_or_ps(x, p8f_half);
// part2: Shift the inputs from the range [0.5,1) to [sqrt(1/2),sqrt(2))
// and shift by -1. The values are then centered around 0, which improves
// the stability of the polynomial evaluation.
// if( x < SQRTHF ) {
// e -= 1;
// x = x + x - 1.0;
// } else { x = x - 1.0; }
Packet8f mask = _mm256_cmp_ps(x, p8f_cephes_SQRTHF, _CMP_LT_OQ);
Packet8f tmp = _mm256_and_ps(x, mask);
x = psub(x, p8f_1);
e = psub(e, _mm256_and_ps(p8f_1, mask));
x = padd(x, tmp);
Packet8f x2 = pmul(x, x);
Packet8f x3 = pmul(x2, x);
// Evaluate the polynomial approximant of degree 8 in three parts, probably
// to improve instruction-level parallelism.
Packet8f y, y1, y2;
y = pmadd(p8f_cephes_log_p0, x, p8f_cephes_log_p1);
y1 = pmadd(p8f_cephes_log_p3, x, p8f_cephes_log_p4);
y2 = pmadd(p8f_cephes_log_p6, x, p8f_cephes_log_p7);
y = pmadd(y, x, p8f_cephes_log_p2);
y1 = pmadd(y1, x, p8f_cephes_log_p5);
y2 = pmadd(y2, x, p8f_cephes_log_p8);
y = pmadd(y, x3, y1);
y = pmadd(y, x3, y2);
y = pmul(y, x3);
// Add the logarithm of the exponent back to the result of the interpolation.
y1 = pmul(e, p8f_cephes_log_q1);
tmp = pmul(x2, p8f_half);
y = padd(y, y1);
x = psub(x, tmp);
y2 = pmul(e, p8f_cephes_log_q2);
x = padd(x, y);
x = padd(x, y2);
// Filter out invalid inputs, i.e. negative arg will be NAN, 0 will be -INF.
return _mm256_or_ps(
_mm256_andnot_ps(iszero_mask, _mm256_or_ps(x, invalid_mask)),
_mm256_and_ps(iszero_mask, p8f_minus_inf));
}
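// Editor's sketch (not part of Eigen): the per-lane decomposition used above,
// written with scalar frexp for reference (the mask logic for x <= 0 and NaN
// is omitted here).
//
//   float log_ref(float x) {
//     int e;
//     float m = std::frexp(x, &e);                  // x = m * 2^e, m in [0.5,1)
//     if (m < 0.7071067811865476f) { m += m; --e; } // m in [sqrt(1/2),sqrt(2))
//     m -= 1.0f;                                    // center on 0 for stability
//     return cephes_poly(m) + float(e) * 0.693147180559945f;
//   }
//
// cephes_poly stands for the full log(1+m) approximant evaluated above (the
// degree-8 polynomial times m^3, plus m - 0.5*m^2), with the log(2) term
// applied as the split correction q1 + q2; it is a hypothetical helper, not
// defined in this file.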
// Exponential function. Works by writing "x = m*log(2) + r" where
// "m = floor(x/log(2)+1/2)" and "r" is the remainder. The result is then
// "exp(x) = 2^m*exp(r)" where exp(r) is in the range [-1,1).
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
pexp<Packet8f>(const Packet8f& _x) {
_EIGEN_DECLARE_CONST_Packet8f(1, 1.0f);
_EIGEN_DECLARE_CONST_Packet8f(half, 0.5f);
_EIGEN_DECLARE_CONST_Packet8f(127, 127.0f);
_EIGEN_DECLARE_CONST_Packet8f(exp_hi, 88.3762626647950f);
_EIGEN_DECLARE_CONST_Packet8f(exp_lo, -88.3762626647949f);
_EIGEN_DECLARE_CONST_Packet8f(cephes_LOG2EF, 1.44269504088896341f);
_EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p0, 1.9875691500E-4f);
_EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p1, 1.3981999507E-3f);
_EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p2, 8.3334519073E-3f);
_EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p3, 4.1665795894E-2f);
_EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p4, 1.6666665459E-1f);
_EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p5, 5.0000001201E-1f);
// Clamp x.
Packet8f x = pmax(pmin(_x, p8f_exp_hi), p8f_exp_lo);
// Express exp(x) as exp(m*ln(2) + r), start by extracting
// m = floor(x/ln(2) + 0.5).
Packet8f m = _mm256_floor_ps(pmadd(x, p8f_cephes_LOG2EF, p8f_half));
// Get r = x - m*ln(2). If no FMA instructions are available, m*ln(2) is
// subtracted out in two parts, m*C1+m*C2 = m*ln(2), to avoid accumulating
// truncation errors. Note that we don't use the "pmadd" function here to
// ensure that a precision-preserving FMA instruction is used.
#ifdef EIGEN_VECTORIZE_FMA
_EIGEN_DECLARE_CONST_Packet8f(nln2, -0.6931471805599453f);
Packet8f r = _mm256_fmadd_ps(m, p8f_nln2, x);
#else
_EIGEN_DECLARE_CONST_Packet8f(cephes_exp_C1, 0.693359375f);
_EIGEN_DECLARE_CONST_Packet8f(cephes_exp_C2, -2.12194440e-4f);
Packet8f r = psub(x, pmul(m, p8f_cephes_exp_C1));
r = psub(r, pmul(m, p8f_cephes_exp_C2));
#endif
Packet8f r2 = pmul(r, r);
// TODO(gonnet): Split into odd/even polynomials and try to exploit
// instruction-level parallelism.
Packet8f y = p8f_cephes_exp_p0;
y = pmadd(y, r, p8f_cephes_exp_p1);
y = pmadd(y, r, p8f_cephes_exp_p2);
y = pmadd(y, r, p8f_cephes_exp_p3);
y = pmadd(y, r, p8f_cephes_exp_p4);
y = pmadd(y, r, p8f_cephes_exp_p5);
y = pmadd(y, r2, r);
y = padd(y, p8f_1);
// Build emm0 = 2^m.
Packet8i emm0 = _mm256_cvttps_epi32(padd(m, p8f_127));
emm0 = pshiftleft(emm0, 23);
// Return 2^m * exp(r).
return pmax(pmul(y, _mm256_castsi256_ps(emm0)), _x);
}
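// Editor's illustration (not from the Eigen sources): applying pexp to a
// contiguous buffer eight floats at a time; in/out and n are hypothetical
// float pointers and a length assumed to be a multiple of 8.
//
//   for (Index i = 0; i < n; i += 8)
//     pstoreu(out + i, pexp<Packet8f>(ploadu<Packet8f>(in + i)));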
// Hyperbolic Tangent function.
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
ptanh<Packet8f>(const Packet8f& x) {
return internal::generic_fast_tanh_float(x);
}
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4d
pexp<Packet4d>(const Packet4d& _x) {
Packet4d x = _x;
_EIGEN_DECLARE_CONST_Packet4d(1, 1.0);
_EIGEN_DECLARE_CONST_Packet4d(2, 2.0);
_EIGEN_DECLARE_CONST_Packet4d(half, 0.5);
_EIGEN_DECLARE_CONST_Packet4d(exp_hi, 709.437);
_EIGEN_DECLARE_CONST_Packet4d(exp_lo, -709.436139303);
_EIGEN_DECLARE_CONST_Packet4d(cephes_LOG2EF, 1.4426950408889634073599);
_EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p0, 1.26177193074810590878e-4);
_EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p1, 3.02994407707441961300e-2);
_EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p2, 9.99999999999999999910e-1);
_EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q0, 3.00198505138664455042e-6);
_EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q1, 2.52448340349684104192e-3);
_EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q2, 2.27265548208155028766e-1);
_EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q3, 2.00000000000000000009e0);
_EIGEN_DECLARE_CONST_Packet4d(cephes_exp_C1, 0.693145751953125);
_EIGEN_DECLARE_CONST_Packet4d(cephes_exp_C2, 1.42860682030941723212e-6);
_EIGEN_DECLARE_CONST_Packet4i(1023, 1023);
Packet4d tmp, fx;
// clamp x
x = pmax(pmin(x, p4d_exp_hi), p4d_exp_lo);
// Express exp(x) as exp(g + n*log(2)).
fx = pmadd(p4d_cephes_LOG2EF, x, p4d_half);
// Get the nearest integer to x/log(2), i.e. the "n" described above.
fx = _mm256_floor_pd(fx);
// Get the remainder modulo log(2), i.e. the "g" described above. Subtract
// n*log(2) out in two steps, i.e. as n*C1 + n*C2 with C1+C2 = log(2), to get
// the last digits right.
tmp = pmul(fx, p4d_cephes_exp_C1);
Packet4d z = pmul(fx, p4d_cephes_exp_C2);
x = psub(x, tmp);
x = psub(x, z);
Packet4d x2 = pmul(x, x);
// Evaluate the numerator polynomial of the rational interpolant.
Packet4d px = p4d_cephes_exp_p0;
px = pmadd(px, x2, p4d_cephes_exp_p1);
px = pmadd(px, x2, p4d_cephes_exp_p2);
px = pmul(px, x);
// Evaluate the denominator polynomial of the rational interpolant.
Packet4d qx = p4d_cephes_exp_q0;
qx = pmadd(qx, x2, p4d_cephes_exp_q1);
qx = pmadd(qx, x2, p4d_cephes_exp_q2);
qx = pmadd(qx, x2, p4d_cephes_exp_q3);
// I don't really get this bit, copied from the SSE2 routines, so...
// TODO(gonnet): Figure out what is going on here, perhaps find a better
// rational interpolant?
x = _mm256_div_pd(px, psub(qx, px));
x = pmadd(p4d_2, x, p4d_1);
// Build e=2^n by constructing the exponents in a 128-bit vector and
// shifting them to where they belong in double-precision values.
__m128i emm0 = _mm256_cvtpd_epi32(fx);
emm0 = _mm_add_epi32(emm0, p4i_1023);
emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(3, 1, 2, 0));
__m128i lo = _mm_slli_epi64(emm0, 52);
__m128i hi = _mm_slli_epi64(_mm_srli_epi64(emm0, 32), 52);
__m256i e = _mm256_insertf128_si256(_mm256_setzero_si256(), lo, 0);
e = _mm256_insertf128_si256(e, hi, 1);
// Construct the result 2^n * exp(g) = e * x. The max is used to catch
// non-finite values in the input.
return pmax(pmul(x, _mm256_castsi256_pd(e)), _x);
}
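// Editor's note on the exponent trick above: 2^n is materialized by adding
// the IEEE-754 double bias (1023) to each n and shifting the result into the
// exponent field (bits 62..52) of every 64-bit lane. The shuffle and the two
// 64-bit shifts are needed because AVX1 lacks 256-bit integer arithmetic, so
// the four 32-bit exponents are widened through two 128-bit halves.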
// Functions for sqrt.
// The EIGEN_FAST_MATH version uses the _mm_rsqrt_ps approximation and one step
// of Newton's method, at a cost of 1-2 bits of precision as opposed to the
// exact solution. It does not handle +inf or denormalized numbers correctly.
// The main advantage of this approach is not just speed, but also the fact that
// it can be inlined and pipelined with other computations, further reducing its
// effective latency. This is similar to Quake3's fast inverse square root.
// For details, see: http://www.beyond3d.com/content/articles/8/
#if EIGEN_FAST_MATH
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
psqrt<Packet8f>(const Packet8f& _x) {
Packet8f half = pmul(_x, pset1<Packet8f>(.5f));
Packet8f denormal_mask = _mm256_and_ps(
_mm256_cmp_ps(_x, pset1<Packet8f>((std::numeric_limits<float>::min)()),
_CMP_LT_OQ),
_mm256_cmp_ps(_x, _mm256_setzero_ps(), _CMP_GE_OQ));
// Compute approximate reciprocal sqrt.
Packet8f x = _mm256_rsqrt_ps(_x);
// Do a single step of Newton's iteration.
x = pmul(x, psub(pset1<Packet8f>(1.5f), pmul(half, pmul(x,x))));
// Flush results for denormals to zero.
return _mm256_andnot_ps(denormal_mask, pmul(_x,x));
}
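// Editor's note: the Newton step above is the standard refinement
//   y' = y * (1.5 - 0.5 * x * y * y)
// for y ~= 1/sqrt(x), which roughly doubles the accuracy of the ~12-bit
// _mm256_rsqrt_ps estimate; the final pmul(_x, x) then recovers
// sqrt(x) = x * (1/sqrt(x)).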
#else
template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet8f psqrt<Packet8f>(const Packet8f& x) {
return _mm256_sqrt_ps(x);
}
#endif
template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet4d psqrt<Packet4d>(const Packet4d& x) {
return _mm256_sqrt_pd(x);
}
#if EIGEN_FAST_MATH
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet8f prsqrt<Packet8f>(const Packet8f& _x) {
_EIGEN_DECLARE_CONST_Packet8f_FROM_INT(inf, 0x7f800000);
_EIGEN_DECLARE_CONST_Packet8f_FROM_INT(nan, 0x7fc00000);
_EIGEN_DECLARE_CONST_Packet8f(one_point_five, 1.5f);
_EIGEN_DECLARE_CONST_Packet8f(minus_half, -0.5f);
_EIGEN_DECLARE_CONST_Packet8f_FROM_INT(flt_min, 0x00800000);
Packet8f neg_half = pmul(_x, p8f_minus_half);
// select only the inverse sqrt of positive normal inputs (denormals are
// flushed to zero and cause infs as well).
Packet8f le_zero_mask = _mm256_cmp_ps(_x, p8f_flt_min, _CMP_LT_OQ);
Packet8f x = _mm256_andnot_ps(le_zero_mask, _mm256_rsqrt_ps(_x));
// Fill in NaNs and Infs for the negative/zero entries.
Packet8f neg_mask = _mm256_cmp_ps(_x, _mm256_setzero_ps(), _CMP_LT_OQ);
Packet8f zero_mask = _mm256_andnot_ps(neg_mask, le_zero_mask);
Packet8f infs_and_nans = _mm256_or_ps(_mm256_and_ps(neg_mask, p8f_nan),
_mm256_and_ps(zero_mask, p8f_inf));
// Do a single step of Newton's iteration.
x = pmul(x, pmadd(neg_half, pmul(x, x), p8f_one_point_five));
// Insert NaNs and Infs in all the right places.
return _mm256_or_ps(x, infs_and_nans);
}
#else
template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet8f prsqrt<Packet8f>(const Packet8f& x) {
_EIGEN_DECLARE_CONST_Packet8f(one, 1.0f);
return _mm256_div_ps(p8f_one, _mm256_sqrt_ps(x));
}
#endif
template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet4d prsqrt<Packet4d>(const Packet4d& x) {
_EIGEN_DECLARE_CONST_Packet4d(one, 1.0);
return _mm256_div_pd(p4d_one, _mm256_sqrt_pd(x));
}
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_MATH_FUNCTIONS_AVX_H
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner (benoit.steiner.goog@gmail.com)
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_PACKET_MATH_AVX_H
#define EIGEN_PACKET_MATH_AVX_H
namespace Eigen {
namespace internal {
#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
#endif
#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*))
#endif
#ifdef __FMA__
#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#endif
#endif
typedef __m256 Packet8f;
typedef __m256i Packet8i;
typedef __m256d Packet4d;
template<> struct is_arithmetic<__m256> { enum { value = true }; };
template<> struct is_arithmetic<__m256i> { enum { value = true }; };
template<> struct is_arithmetic<__m256d> { enum { value = true }; };
#define _EIGEN_DECLARE_CONST_Packet8f(NAME,X) \
const Packet8f p8f_##NAME = pset1<Packet8f>(X)
#define _EIGEN_DECLARE_CONST_Packet4d(NAME,X) \
const Packet4d p4d_##NAME = pset1<Packet4d>(X)
#define _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(NAME,X) \
const Packet8f p8f_##NAME = _mm256_castsi256_ps(pset1<Packet8i>(X))
#define _EIGEN_DECLARE_CONST_Packet8i(NAME,X) \
const Packet8i p8i_##NAME = pset1<Packet8i>(X)
// Use the packet_traits defined in AVX512/PacketMath.h instead if we're going
// to leverage AVX512 instructions.
#ifndef EIGEN_VECTORIZE_AVX512
template<> struct packet_traits<float> : default_packet_traits
{
typedef Packet8f type;
typedef Packet4f half;
enum {
Vectorizable = 1,
AlignedOnScalar = 1,
size=8,
HasHalfPacket = 1,
HasDiv = 1,
HasSin = EIGEN_FAST_MATH,
HasCos = 0,
HasLog = 1,
HasExp = 1,
HasSqrt = 1,
HasRsqrt = 1,
HasTanh = EIGEN_FAST_MATH,
HasBlend = 1,
HasRound = 1,
HasFloor = 1,
HasCeil = 1
};
};
template<> struct packet_traits<double> : default_packet_traits
{
typedef Packet4d type;
typedef Packet2d half;
enum {
Vectorizable = 1,
AlignedOnScalar = 1,
size=4,
HasHalfPacket = 1,
HasDiv = 1,
HasExp = 1,
HasSqrt = 1,
HasRsqrt = 1,
HasBlend = 1,
HasRound = 1,
HasFloor = 1,
HasCeil = 1
};
};
#endif
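// Editor's sketch (not part of Eigen): generic code consumes these traits to
// pick the packet type and width for a scalar type, along the lines of:
//
//   typedef packet_traits<float>::type Packet;           // Packet8f under AVX
//   const Index PacketSize = packet_traits<float>::size; // 8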
template<> struct scalar_div_cost<float,true> { enum { value = 14 }; };
template<> struct scalar_div_cost<double,true> { enum { value = 16 }; };
/* Proper support for integers is only provided by AVX2. In the meantime, we'll
use SSE instructions and packets to deal with integers.
template<> struct packet_traits<int> : default_packet_traits
{
typedef Packet8i type;
enum {
Vectorizable = 1,
AlignedOnScalar = 1,
size=8
};
};
*/
template<> struct unpacket_traits<Packet8f> { typedef float type; typedef Packet4f half; enum {size=8, alignment=Aligned32}; };
template<> struct unpacket_traits<Packet4d> { typedef double type; typedef Packet2d half; enum {size=4, alignment=Aligned32}; };
template<> struct unpacket_traits<Packet8i> { typedef int type; typedef Packet4i half; enum {size=8, alignment=Aligned32}; };
template<> EIGEN_STRONG_INLINE Packet8f pset1<Packet8f>(const float& from) { return _mm256_set1_ps(from); }
template<> EIGEN_STRONG_INLINE Packet4d pset1<Packet4d>(const double& from) { return _mm256_set1_pd(from); }
template<> EIGEN_STRONG_INLINE Packet8i pset1<Packet8i>(const int& from) { return _mm256_set1_epi32(from); }
template<> EIGEN_STRONG_INLINE Packet8f pload1<Packet8f>(const float* from) { return _mm256_broadcast_ss(from); }
template<> EIGEN_STRONG_INLINE Packet4d pload1<Packet4d>(const double* from) { return _mm256_broadcast_sd(from); }
template<> EIGEN_STRONG_INLINE Packet8f plset<Packet8f>(const float& a) { return _mm256_add_ps(_mm256_set1_ps(a), _mm256_set_ps(7.0,6.0,5.0,4.0,3.0,2.0,1.0,0.0)); }
template<> EIGEN_STRONG_INLINE Packet4d plset<Packet4d>(const double& a) { return _mm256_add_pd(_mm256_set1_pd(a), _mm256_set_pd(3.0,2.0,1.0,0.0)); }
template<> EIGEN_STRONG_INLINE Packet8f padd<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_add_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d padd<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_add_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet8f psub<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_sub_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d psub<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_sub_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet8f pnegate(const Packet8f& a)
{
return _mm256_sub_ps(_mm256_set1_ps(0.0),a);
}
template<> EIGEN_STRONG_INLINE Packet4d pnegate(const Packet4d& a)
{
return _mm256_sub_pd(_mm256_set1_pd(0.0),a);
}
template<> EIGEN_STRONG_INLINE Packet8f pconj(const Packet8f& a) { return a; }
template<> EIGEN_STRONG_INLINE Packet4d pconj(const Packet4d& a) { return a; }
template<> EIGEN_STRONG_INLINE Packet8i pconj(const Packet8i& a) { return a; }
template<> EIGEN_STRONG_INLINE Packet8f pmul<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_mul_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d pmul<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_mul_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet8f pdiv<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_div_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d pdiv<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_div_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet8i pdiv<Packet8i>(const Packet8i& /*a*/, const Packet8i& /*b*/)
{ eigen_assert(false && "packet integer division is not supported by AVX");
return pset1<Packet8i>(0);
}
#ifdef __FMA__
template<> EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f& b, const Packet8f& c) {
#if ( EIGEN_COMP_GNUC_STRICT || (EIGEN_COMP_CLANG && (EIGEN_COMP_CLANG<308)) )
// clang stupidly generates a vfmadd213ps instruction plus some vmovaps on registers,
// and gcc stupidly generates a vfmadd132ps instruction,
// so let's enforce it to generate a vfmadd231ps instruction since the most common use case is to accumulate
// the result of the product.
Packet8f res = c;
__asm__("vfmadd231ps %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b));
return res;
#else
return _mm256_fmadd_ps(a,b,c);
#endif
}
template<> EIGEN_STRONG_INLINE Packet4d pmadd(const Packet4d& a, const Packet4d& b, const Packet4d& c) {
#if ( EIGEN_COMP_GNUC_STRICT || (EIGEN_COMP_CLANG && (EIGEN_COMP_CLANG<308)) )
// see above
Packet4d res = c;
__asm__("vfmadd231pd %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b));
return res;
#else
return _mm256_fmadd_pd(a,b,c);
#endif
}
#endif
template<> EIGEN_STRONG_INLINE Packet8f pmin<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_min_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d pmin<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_min_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet8f pmax<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_max_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d pmax<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_max_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet8f pround<Packet8f>(const Packet8f& a) { return _mm256_round_ps(a, _MM_FROUND_CUR_DIRECTION); }
template<> EIGEN_STRONG_INLINE Packet4d pround<Packet4d>(const Packet4d& a) { return _mm256_round_pd(a, _MM_FROUND_CUR_DIRECTION); }
template<> EIGEN_STRONG_INLINE Packet8f pceil<Packet8f>(const Packet8f& a) { return _mm256_ceil_ps(a); }
template<> EIGEN_STRONG_INLINE Packet4d pceil<Packet4d>(const Packet4d& a) { return _mm256_ceil_pd(a); }
template<> EIGEN_STRONG_INLINE Packet8f pfloor<Packet8f>(const Packet8f& a) { return _mm256_floor_ps(a); }
template<> EIGEN_STRONG_INLINE Packet4d pfloor<Packet4d>(const Packet4d& a) { return _mm256_floor_pd(a); }
template<> EIGEN_STRONG_INLINE Packet8f pand<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_and_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d pand<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_and_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet8f por<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_or_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d por<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_or_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet8f pxor<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_xor_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d pxor<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_xor_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet8f pandnot<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_andnot_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d pandnot<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_andnot_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet8f pload<Packet8f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_ps(from); }
template<> EIGEN_STRONG_INLINE Packet4d pload<Packet4d>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_pd(from); }
template<> EIGEN_STRONG_INLINE Packet8i pload<Packet8i>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(reinterpret_cast<const __m256i*>(from)); }
template<> EIGEN_STRONG_INLINE Packet8f ploadu<Packet8f>(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_ps(from); }
template<> EIGEN_STRONG_INLINE Packet4d ploadu<Packet4d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_pd(from); }
template<> EIGEN_STRONG_INLINE Packet8i ploadu<Packet8i>(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from)); }
// Loads 4 floats from memory and returns the packet {a0, a0, a1, a1, a2, a2, a3, a3}
template<> EIGEN_STRONG_INLINE Packet8f ploaddup<Packet8f>(const float* from)
{
// TODO try to find a way to avoid the need for a temporary register
// Packet8f tmp = _mm256_castps128_ps256(_mm_loadu_ps(from));
// tmp = _mm256_insertf128_ps(tmp, _mm_movehl_ps(_mm256_castps256_ps128(tmp),_mm256_castps256_ps128(tmp)), 1);
// return _mm256_unpacklo_ps(tmp,tmp);
// _mm256_insertf128_ps is very slow on Haswell, thus:
Packet8f tmp = _mm256_broadcast_ps((const __m128*)(const void*)from);
// mimic an "in-place" permutation of the lower 128 bits using a blend
tmp = _mm256_blend_ps(tmp,_mm256_castps128_ps256(_mm_permute_ps( _mm256_castps256_ps128(tmp), _MM_SHUFFLE(1,0,1,0))), 15);
// then a single permutation of the full register puts everything in shape:
return _mm256_permute_ps(tmp, _MM_SHUFFLE(3,3,2,2));
}
// Loads 2 doubles from memory and returns the packet {a0, a0, a1, a1}
template<> EIGEN_STRONG_INLINE Packet4d ploaddup<Packet4d>(const double* from)
{
Packet4d tmp = _mm256_broadcast_pd((const __m128d*)(const void*)from);
return _mm256_permute_pd(tmp, 3<<2);
}
// Loads 2 floats from memory and returns the packet {a0, a0, a0, a0, a1, a1, a1, a1}
template<> EIGEN_STRONG_INLINE Packet8f ploadquad<Packet8f>(const float* from)
{
Packet8f tmp = _mm256_castps128_ps256(_mm_broadcast_ss(from));
return _mm256_insertf128_ps(tmp, _mm_broadcast_ss(from+1), 1);
}
template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet8f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_ps(to, from); }
template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet4d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_pd(to, from); }
template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet8i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from); }
template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet8f& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_ps(to, from); }
template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet4d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_pd(to, from); }
template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet8i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from); }
// NOTE: leverage _mm256_i32gather_ps and _mm256_i32gather_pd if AVX2 instructions are available
// NOTE: for the record the following seems to be slower: return _mm256_i32gather_ps(from, _mm256_set1_epi32(stride), 4);
template<> EIGEN_DEVICE_FUNC inline Packet8f pgather<float, Packet8f>(const float* from, Index stride)
{
return _mm256_set_ps(from[7*stride], from[6*stride], from[5*stride], from[4*stride],
from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
}
template<> EIGEN_DEVICE_FUNC inline Packet4d pgather<double, Packet4d>(const double* from, Index stride)
{
return _mm256_set_pd(from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet8f>(float* to, const Packet8f& from, Index stride)
{
__m128 low = _mm256_extractf128_ps(from, 0);
to[stride*0] = _mm_cvtss_f32(low);
to[stride*1] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 1));
to[stride*2] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 2));
to[stride*3] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 3));
__m128 high = _mm256_extractf128_ps(from, 1);
to[stride*4] = _mm_cvtss_f32(high);
to[stride*5] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 1));
to[stride*6] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 2));
to[stride*7] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 3));
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet4d>(double* to, const Packet4d& from, Index stride)
{
__m128d low = _mm256_extractf128_pd(from, 0);
to[stride*0] = _mm_cvtsd_f64(low);
to[stride*1] = _mm_cvtsd_f64(_mm_shuffle_pd(low, low, 1));
__m128d high = _mm256_extractf128_pd(from, 1);
to[stride*2] = _mm_cvtsd_f64(high);
to[stride*3] = _mm_cvtsd_f64(_mm_shuffle_pd(high, high, 1));
}
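// Editor's illustration (not from the Eigen sources): gather/scatter let a
// kernel vectorize along a strided dimension; mat, i and lda are hypothetical,
// e.g. a column-major matrix, a row index, and the leading dimension:
//
//   Packet4d r = pgather<double, Packet4d>(mat + i, lda);  // mat(i, 0..3)
//   pscatter<double, Packet4d>(mat + i, pmul(r, pset1<Packet4d>(2.0)), lda);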
template<> EIGEN_STRONG_INLINE void pstore1<Packet8f>(float* to, const float& a)
{
Packet8f pa = pset1<Packet8f>(a);
pstore(to, pa);
}
template<> EIGEN_STRONG_INLINE void pstore1<Packet4d>(double* to, const double& a)
{
Packet4d pa = pset1<Packet4d>(a);
pstore(to, pa);
}
template<> EIGEN_STRONG_INLINE void pstore1<Packet8i>(int* to, const int& a)
{
Packet8i pa = pset1<Packet8i>(a);
pstore(to, pa);
}
#ifndef EIGEN_VECTORIZE_AVX512
template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
#endif
template<> EIGEN_STRONG_INLINE float pfirst<Packet8f>(const Packet8f& a) {
return _mm_cvtss_f32(_mm256_castps256_ps128(a));
}
template<> EIGEN_STRONG_INLINE double pfirst<Packet4d>(const Packet4d& a) {
return _mm_cvtsd_f64(_mm256_castpd256_pd128(a));
}
template<> EIGEN_STRONG_INLINE int pfirst<Packet8i>(const Packet8i& a) {
return _mm_cvtsi128_si32(_mm256_castsi256_si128(a));
}
template<> EIGEN_STRONG_INLINE Packet8f preverse(const Packet8f& a)
{
__m256 tmp = _mm256_shuffle_ps(a,a,0x1b);
return _mm256_permute2f128_ps(tmp, tmp, 1);
}
template<> EIGEN_STRONG_INLINE Packet4d preverse(const Packet4d& a)
{
__m256d tmp = _mm256_shuffle_pd(a,a,5);
return _mm256_permute2f128_pd(tmp, tmp, 1);
// Equivalent alternative, kept as a comment for reference:
// __m256d swap_halves = _mm256_permute2f128_pd(a,a,1);
// return _mm256_permute_pd(swap_halves,5);
}
// pabs should be ok
template<> EIGEN_STRONG_INLINE Packet8f pabs(const Packet8f& a)
{
const Packet8f mask = _mm256_castsi256_ps(_mm256_setr_epi32(0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF));
return _mm256_and_ps(a,mask);
}
template<> EIGEN_STRONG_INLINE Packet4d pabs(const Packet4d& a)
{
const Packet4d mask = _mm256_castsi256_pd(_mm256_setr_epi32(0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF));
return _mm256_and_pd(a,mask);
}
// preduxp should be ok
// FIXME: why is this ok? why isn't the simple implementation working as expected?
template<> EIGEN_STRONG_INLINE Packet8f preduxp<Packet8f>(const Packet8f* vecs)
{
__m256 hsum1 = _mm256_hadd_ps(vecs[0], vecs[1]);
__m256 hsum2 = _mm256_hadd_ps(vecs[2], vecs[3]);
__m256 hsum3 = _mm256_hadd_ps(vecs[4], vecs[5]);
__m256 hsum4 = _mm256_hadd_ps(vecs[6], vecs[7]);
__m256 hsum5 = _mm256_hadd_ps(hsum1, hsum1);
__m256 hsum6 = _mm256_hadd_ps(hsum2, hsum2);
__m256 hsum7 = _mm256_hadd_ps(hsum3, hsum3);
__m256 hsum8 = _mm256_hadd_ps(hsum4, hsum4);
__m256 perm1 = _mm256_permute2f128_ps(hsum5, hsum5, 0x23);
__m256 perm2 = _mm256_permute2f128_ps(hsum6, hsum6, 0x23);
__m256 perm3 = _mm256_permute2f128_ps(hsum7, hsum7, 0x23);
__m256 perm4 = _mm256_permute2f128_ps(hsum8, hsum8, 0x23);
__m256 sum1 = _mm256_add_ps(perm1, hsum5);
__m256 sum2 = _mm256_add_ps(perm2, hsum6);
__m256 sum3 = _mm256_add_ps(perm3, hsum7);
__m256 sum4 = _mm256_add_ps(perm4, hsum8);
__m256 blend1 = _mm256_blend_ps(sum1, sum2, 0xcc);
__m256 blend2 = _mm256_blend_ps(sum3, sum4, 0xcc);
__m256 final = _mm256_blend_ps(blend1, blend2, 0xf0);
return final;
}
template<> EIGEN_STRONG_INLINE Packet4d preduxp<Packet4d>(const Packet4d* vecs)
{
Packet4d tmp0, tmp1;
tmp0 = _mm256_hadd_pd(vecs[0], vecs[1]);
tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1));
tmp1 = _mm256_hadd_pd(vecs[2], vecs[3]);
tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1));
return _mm256_blend_pd(tmp0, tmp1, 0xC);
}
template<> EIGEN_STRONG_INLINE float predux<Packet8f>(const Packet8f& a)
{
return predux(Packet4f(_mm_add_ps(_mm256_castps256_ps128(a),_mm256_extractf128_ps(a,1))));
}
template<> EIGEN_STRONG_INLINE double predux<Packet4d>(const Packet4d& a)
{
return predux(Packet2d(_mm_add_pd(_mm256_castpd256_pd128(a),_mm256_extractf128_pd(a,1))));
}
template<> EIGEN_STRONG_INLINE Packet4f predux_downto4<Packet8f>(const Packet8f& a)
{
return _mm_add_ps(_mm256_castps256_ps128(a),_mm256_extractf128_ps(a,1));
}
template<> EIGEN_STRONG_INLINE float predux_mul<Packet8f>(const Packet8f& a)
{
Packet8f tmp;
tmp = _mm256_mul_ps(a, _mm256_permute2f128_ps(a,a,1));
tmp = _mm256_mul_ps(tmp, _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2)));
return pfirst(_mm256_mul_ps(tmp, _mm256_shuffle_ps(tmp,tmp,1)));
}
template<> EIGEN_STRONG_INLINE double predux_mul<Packet4d>(const Packet4d& a)
{
Packet4d tmp;
tmp = _mm256_mul_pd(a, _mm256_permute2f128_pd(a,a,1));
return pfirst(_mm256_mul_pd(tmp, _mm256_shuffle_pd(tmp,tmp,1)));
}
template<> EIGEN_STRONG_INLINE float predux_min<Packet8f>(const Packet8f& a)
{
Packet8f tmp = _mm256_min_ps(a, _mm256_permute2f128_ps(a,a,1));
tmp = _mm256_min_ps(tmp, _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2)));
return pfirst(_mm256_min_ps(tmp, _mm256_shuffle_ps(tmp,tmp,1)));
}
template<> EIGEN_STRONG_INLINE double predux_min<Packet4d>(const Packet4d& a)
{
Packet4d tmp = _mm256_min_pd(a, _mm256_permute2f128_pd(a,a,1));
return pfirst(_mm256_min_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1)));
}
template<> EIGEN_STRONG_INLINE float predux_max<Packet8f>(const Packet8f& a)
{
Packet8f tmp = _mm256_max_ps(a, _mm256_permute2f128_ps(a,a,1));
tmp = _mm256_max_ps(tmp, _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2)));
return pfirst(_mm256_max_ps(tmp, _mm256_shuffle_ps(tmp,tmp,1)));
}
template<> EIGEN_STRONG_INLINE double predux_max<Packet4d>(const Packet4d& a)
{
Packet4d tmp = _mm256_max_pd(a, _mm256_permute2f128_pd(a,a,1));
return pfirst(_mm256_max_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1)));
}
template<int Offset>
struct palign_impl<Offset,Packet8f>
{
static EIGEN_STRONG_INLINE void run(Packet8f& first, const Packet8f& second)
{
if (Offset==1)
{
first = _mm256_blend_ps(first, second, 1);
Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(0,3,2,1));
Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1);
first = _mm256_blend_ps(tmp1, tmp2, 0x88);
}
else if (Offset==2)
{
first = _mm256_blend_ps(first, second, 3);
Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(1,0,3,2));
Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1);
first = _mm256_blend_ps(tmp1, tmp2, 0xcc);
}
else if (Offset==3)
{
first = _mm256_blend_ps(first, second, 7);
Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(2,1,0,3));
Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1);
first = _mm256_blend_ps(tmp1, tmp2, 0xee);
}
else if (Offset==4)
{
first = _mm256_blend_ps(first, second, 15);
Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(3,2,1,0));
Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1);
first = _mm256_permute_ps(tmp2, _MM_SHUFFLE(3,2,1,0));
}
else if (Offset==5)
{
first = _mm256_blend_ps(first, second, 31);
first = _mm256_permute2f128_ps(first, first, 1);
Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(0,3,2,1));
first = _mm256_permute2f128_ps(tmp, tmp, 1);
first = _mm256_blend_ps(tmp, first, 0x88);
}
else if (Offset==6)
{
first = _mm256_blend_ps(first, second, 63);
first = _mm256_permute2f128_ps(first, first, 1);
Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(1,0,3,2));
first = _mm256_permute2f128_ps(tmp, tmp, 1);
first = _mm256_blend_ps(tmp, first, 0xcc);
}
else if (Offset==7)
{
first = _mm256_blend_ps(first, second, 127);
first = _mm256_permute2f128_ps(first, first, 1);
Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(2,1,0,3));
first = _mm256_permute2f128_ps(tmp, tmp, 1);
first = _mm256_blend_ps(tmp, first, 0xee);
}
}
};
template<int Offset>
struct palign_impl<Offset,Packet4d>
{
static EIGEN_STRONG_INLINE void run(Packet4d& first, const Packet4d& second)
{
if (Offset==1)
{
first = _mm256_blend_pd(first, second, 1);
__m256d tmp = _mm256_permute_pd(first, 5);
first = _mm256_permute2f128_pd(tmp, tmp, 1);
first = _mm256_blend_pd(tmp, first, 0xA);
}
else if (Offset==2)
{
first = _mm256_blend_pd(first, second, 3);
first = _mm256_permute2f128_pd(first, first, 1);
}
else if (Offset==3)
{
first = _mm256_blend_pd(first, second, 7);
__m256d tmp = _mm256_permute_pd(first, 5);
first = _mm256_permute2f128_pd(tmp, tmp, 1);
first = _mm256_blend_pd(tmp, first, 5);
}
}
};
EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet8f,8>& kernel) {
__m256 T0 = _mm256_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
__m256 T1 = _mm256_unpackhi_ps(kernel.packet[0], kernel.packet[1]);
__m256 T2 = _mm256_unpacklo_ps(kernel.packet[2], kernel.packet[3]);
__m256 T3 = _mm256_unpackhi_ps(kernel.packet[2], kernel.packet[3]);
__m256 T4 = _mm256_unpacklo_ps(kernel.packet[4], kernel.packet[5]);
__m256 T5 = _mm256_unpackhi_ps(kernel.packet[4], kernel.packet[5]);
__m256 T6 = _mm256_unpacklo_ps(kernel.packet[6], kernel.packet[7]);
__m256 T7 = _mm256_unpackhi_ps(kernel.packet[6], kernel.packet[7]);
__m256 S0 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(1,0,1,0));
__m256 S1 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(3,2,3,2));
__m256 S2 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(1,0,1,0));
__m256 S3 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(3,2,3,2));
__m256 S4 = _mm256_shuffle_ps(T4,T6,_MM_SHUFFLE(1,0,1,0));
__m256 S5 = _mm256_shuffle_ps(T4,T6,_MM_SHUFFLE(3,2,3,2));
__m256 S6 = _mm256_shuffle_ps(T5,T7,_MM_SHUFFLE(1,0,1,0));
__m256 S7 = _mm256_shuffle_ps(T5,T7,_MM_SHUFFLE(3,2,3,2));
kernel.packet[0] = _mm256_permute2f128_ps(S0, S4, 0x20);
kernel.packet[1] = _mm256_permute2f128_ps(S1, S5, 0x20);
kernel.packet[2] = _mm256_permute2f128_ps(S2, S6, 0x20);
kernel.packet[3] = _mm256_permute2f128_ps(S3, S7, 0x20);
kernel.packet[4] = _mm256_permute2f128_ps(S0, S4, 0x31);
kernel.packet[5] = _mm256_permute2f128_ps(S1, S5, 0x31);
kernel.packet[6] = _mm256_permute2f128_ps(S2, S6, 0x31);
kernel.packet[7] = _mm256_permute2f128_ps(S3, S7, 0x31);
}
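// Editor's illustration (not part of Eigen): transposing an 8x8 float tile in
// registers, as the blocking kernels do when repacking data; tile is a
// hypothetical float* to 64 contiguous elements.
//
//   PacketBlock<Packet8f,8> block;
//   for (int i = 0; i < 8; ++i) block.packet[i] = ploadu<Packet8f>(tile + 8*i);
//   ptranspose(block);   // block.packet[j] now holds former column j
//   for (int i = 0; i < 8; ++i) pstoreu(tile + 8*i, block.packet[i]);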
EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet8f,4>& kernel) {
__m256 T0 = _mm256_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
__m256 T1 = _mm256_unpackhi_ps(kernel.packet[0], kernel.packet[1]);
__m256 T2 = _mm256_unpacklo_ps(kernel.packet[2], kernel.packet[3]);
__m256 T3 = _mm256_unpackhi_ps(kernel.packet[2], kernel.packet[3]);
__m256 S0 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(1,0,1,0));
__m256 S1 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(3,2,3,2));
__m256 S2 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(1,0,1,0));
__m256 S3 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(3,2,3,2));
kernel.packet[0] = _mm256_permute2f128_ps(S0, S1, 0x20);
kernel.packet[1] = _mm256_permute2f128_ps(S2, S3, 0x20);
kernel.packet[2] = _mm256_permute2f128_ps(S0, S1, 0x31);
kernel.packet[3] = _mm256_permute2f128_ps(S2, S3, 0x31);
}
EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet4d,4>& kernel) {
__m256d T0 = _mm256_shuffle_pd(kernel.packet[0], kernel.packet[1], 15);
__m256d T1 = _mm256_shuffle_pd(kernel.packet[0], kernel.packet[1], 0);
__m256d T2 = _mm256_shuffle_pd(kernel.packet[2], kernel.packet[3], 15);
__m256d T3 = _mm256_shuffle_pd(kernel.packet[2], kernel.packet[3], 0);
kernel.packet[1] = _mm256_permute2f128_pd(T0, T2, 32);
kernel.packet[3] = _mm256_permute2f128_pd(T0, T2, 49);
kernel.packet[0] = _mm256_permute2f128_pd(T1, T3, 32);
kernel.packet[2] = _mm256_permute2f128_pd(T1, T3, 49);
}
template<> EIGEN_STRONG_INLINE Packet8f pblend(const Selector<8>& ifPacket, const Packet8f& thenPacket, const Packet8f& elsePacket) {
const __m256 zero = _mm256_setzero_ps();
const __m256 select = _mm256_set_ps(ifPacket.select[7], ifPacket.select[6], ifPacket.select[5], ifPacket.select[4], ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
__m256 false_mask = _mm256_cmp_ps(select, zero, _CMP_EQ_UQ);
return _mm256_blendv_ps(thenPacket, elsePacket, false_mask);
}
template<> EIGEN_STRONG_INLINE Packet4d pblend(const Selector<4>& ifPacket, const Packet4d& thenPacket, const Packet4d& elsePacket) {
const __m256d zero = _mm256_setzero_pd();
const __m256d select = _mm256_set_pd(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
__m256d false_mask = _mm256_cmp_pd(select, zero, _CMP_EQ_UQ);
return _mm256_blendv_pd(thenPacket, elsePacket, false_mask);
}
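// Editor's sketch (not from the Eigen sources): pblend keeps lanes of
// thenPacket where the selector entry is non-zero and takes elsePacket
// elsewhere, e.g.
//
//   Selector<4> sel = {{true, false, false, true}};
//   Packet4d r = pblend(sel, pset1<Packet4d>(1.0), pset1<Packet4d>(-1.0));
//   // r = {1.0, -1.0, -1.0, 1.0}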
template<> EIGEN_STRONG_INLINE Packet8f pinsertfirst(const Packet8f& a, float b)
{
return _mm256_blend_ps(a,pset1<Packet8f>(b),1);
}
template<> EIGEN_STRONG_INLINE Packet4d pinsertfirst(const Packet4d& a, double b)
{
return _mm256_blend_pd(a,pset1<Packet4d>(b),1);
}
template<> EIGEN_STRONG_INLINE Packet8f pinsertlast(const Packet8f& a, float b)
{
return _mm256_blend_ps(a,pset1<Packet8f>(b),(1<<7));
}
template<> EIGEN_STRONG_INLINE Packet4d pinsertlast(const Packet4d& a, double b)
{
return _mm256_blend_pd(a,pset1<Packet4d>(b),(1<<3));
}
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_PACKET_MATH_AVX_H
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_TYPE_CASTING_AVX_H
#define EIGEN_TYPE_CASTING_AVX_H
namespace Eigen {
namespace internal {
// For now we use SSE to handle integers, so we can't use AVX instructions to cast
// from int to float.
template <>
struct type_casting_traits<float, int> {
enum {
VectorizedCast = 0,
SrcCoeffRatio = 1,
TgtCoeffRatio = 1
};
};
template <>
struct type_casting_traits<int, float> {
enum {
VectorizedCast = 0,
SrcCoeffRatio = 1,
TgtCoeffRatio = 1
};
};
template<> EIGEN_STRONG_INLINE Packet8i pcast<Packet8f, Packet8i>(const Packet8f& a) {
return _mm256_cvtps_epi32(a);
}
template<> EIGEN_STRONG_INLINE Packet8f pcast<Packet8i, Packet8f>(const Packet8i& a) {
return _mm256_cvtepi32_ps(a);
}
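// Editor's illustration (not part of Eigen): pcast converts lane-wise, e.g.
// rounding eight floats to the nearest 32-bit integers (ties to even, per
// _mm256_cvtps_epi32):
//
//   Packet8i r = pcast<Packet8f, Packet8i>(pset1<Packet8f>(2.5f)); // all 2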
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_TYPE_CASTING_AVX_H
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016 Pedro Gonnet (pedro.gonnet@gmail.com)
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef THIRD_PARTY_EIGEN3_EIGEN_SRC_CORE_ARCH_AVX512_MATHFUNCTIONS_H_
#define THIRD_PARTY_EIGEN3_EIGEN_SRC_CORE_ARCH_AVX512_MATHFUNCTIONS_H_
namespace Eigen {
namespace internal {
// Disable the code for older versions of gcc that don't support many of the required AVX512 intrinsics.
#if EIGEN_GNUC_AT_LEAST(5, 3)
#define _EIGEN_DECLARE_CONST_Packet16f(NAME, X) \
const Packet16f p16f_##NAME = pset1<Packet16f>(X)
#define _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(NAME, X) \
const Packet16f p16f_##NAME = (__m512)pset1<Packet16i>(X)
#define _EIGEN_DECLARE_CONST_Packet8d(NAME, X) \
const Packet8d p8d_##NAME = pset1<Packet8d>(X)
#define _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(NAME, X) \
const Packet8d p8d_##NAME = _mm512_castsi512_pd(_mm512_set1_epi64(X))
// Natural logarithm
// Computes log(x) as log(2^e * m) = C*e + log(m), where the constant C = log(2)
// and m is in the range [sqrt(1/2),sqrt(2)). In this range, the logarithm can
// be easily approximated by a polynomial centered on m=1 for stability.
#if defined(EIGEN_VECTORIZE_AVX512DQ)
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
plog<Packet16f>(const Packet16f& _x) {
Packet16f x = _x;
_EIGEN_DECLARE_CONST_Packet16f(1, 1.0f);
_EIGEN_DECLARE_CONST_Packet16f(half, 0.5f);
_EIGEN_DECLARE_CONST_Packet16f(126f, 126.0f);
_EIGEN_DECLARE_CONST_Packet16f_FROM_INT(inv_mant_mask, ~0x7f800000);
// The smallest non denormalized float number.
_EIGEN_DECLARE_CONST_Packet16f_FROM_INT(min_norm_pos, 0x00800000);
_EIGEN_DECLARE_CONST_Packet16f_FROM_INT(minus_inf, 0xff800000);
_EIGEN_DECLARE_CONST_Packet16f_FROM_INT(nan, 0x7fc00000);
// Polynomial coefficients.
_EIGEN_DECLARE_CONST_Packet16f(cephes_SQRTHF, 0.707106781186547524f);
_EIGEN_DECLARE_CONST_Packet16f(cephes_log_p0, 7.0376836292E-2f);
_EIGEN_DECLARE_CONST_Packet16f(cephes_log_p1, -1.1514610310E-1f);
_EIGEN_DECLARE_CONST_Packet16f(cephes_log_p2, 1.1676998740E-1f);
_EIGEN_DECLARE_CONST_Packet16f(cephes_log_p3, -1.2420140846E-1f);
_EIGEN_DECLARE_CONST_Packet16f(cephes_log_p4, +1.4249322787E-1f);
_EIGEN_DECLARE_CONST_Packet16f(cephes_log_p5, -1.6668057665E-1f);
_EIGEN_DECLARE_CONST_Packet16f(cephes_log_p6, +2.0000714765E-1f);
_EIGEN_DECLARE_CONST_Packet16f(cephes_log_p7, -2.4999993993E-1f);
_EIGEN_DECLARE_CONST_Packet16f(cephes_log_p8, +3.3333331174E-1f);
_EIGEN_DECLARE_CONST_Packet16f(cephes_log_q1, -2.12194440e-4f);
_EIGEN_DECLARE_CONST_Packet16f(cephes_log_q2, 0.693359375f);
// invalid_mask is set to true when x is NaN
__mmask16 invalid_mask =
_mm512_cmp_ps_mask(x, _mm512_setzero_ps(), _CMP_NGE_UQ);
__mmask16 iszero_mask =
_mm512_cmp_ps_mask(x, _mm512_setzero_ps(), _CMP_EQ_UQ);
// Truncate input values to the minimum positive normal.
x = pmax(x, p16f_min_norm_pos);
// Extract the shifted exponents.
Packet16f emm0 = _mm512_cvtepi32_ps(_mm512_srli_epi32((__m512i)x, 23));
Packet16f e = _mm512_sub_ps(emm0, p16f_126f);
// Set the exponents to -1, i.e. the entries of x are now in the range [0.5,1).
x = _mm512_and_ps(x, p16f_inv_mant_mask);
x = _mm512_or_ps(x, p16f_half);
// part2: Shift the inputs from the range [0.5,1) to [sqrt(1/2),sqrt(2))
// and shift by -1. The values are then centered around 0, which improves
// the stability of the polynomial evaluation.
// if( x < SQRTHF ) {
// e -= 1;
// x = x + x - 1.0;
// } else { x = x - 1.0; }
__mmask16 mask = _mm512_cmp_ps_mask(x, p16f_cephes_SQRTHF, _CMP_LT_OQ);
Packet16f tmp = _mm512_mask_blend_ps(mask, x, _mm512_setzero_ps());
x = psub(x, p16f_1);
e = psub(e, _mm512_mask_blend_ps(mask, p16f_1, _mm512_setzero_ps()));
x = padd(x, tmp);
Packet16f x2 = pmul(x, x);
Packet16f x3 = pmul(x2, x);
// Evaluate the polynomial approximant of degree 8 in three parts, probably
// to improve instruction-level parallelism.
Packet16f y, y1, y2;
y = pmadd(p16f_cephes_log_p0, x, p16f_cephes_log_p1);
y1 = pmadd(p16f_cephes_log_p3, x, p16f_cephes_log_p4);
y2 = pmadd(p16f_cephes_log_p6, x, p16f_cephes_log_p7);
y = pmadd(y, x, p16f_cephes_log_p2);
y1 = pmadd(y1, x, p16f_cephes_log_p5);
y2 = pmadd(y2, x, p16f_cephes_log_p8);
y = pmadd(y, x3, y1);
y = pmadd(y, x3, y2);
y = pmul(y, x3);
// Add the logarithm of the exponent back to the result of the interpolation.
y1 = pmul(e, p16f_cephes_log_q1);
tmp = pmul(x2, p16f_half);
y = padd(y, y1);
x = psub(x, tmp);
y2 = pmul(e, p16f_cephes_log_q2);
x = padd(x, y);
x = padd(x, y2);
// Filter out invalid inputs, i.e. negative arg will be NAN, 0 will be -INF.
return _mm512_mask_blend_ps(iszero_mask, p16f_minus_inf,
_mm512_mask_blend_ps(invalid_mask, p16f_nan, x));
}
#endif
// Exponential function. Works by writing "x = m*log(2) + r" where
// "m = floor(x/log(2)+1/2)" and "r" is the remainder. The result is then
// "exp(x) = 2^m*exp(r)" where exp(r) is in the range [-1,1).
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
pexp<Packet16f>(const Packet16f& _x) {
_EIGEN_DECLARE_CONST_Packet16f(1, 1.0f);
_EIGEN_DECLARE_CONST_Packet16f(half, 0.5f);
_EIGEN_DECLARE_CONST_Packet16f(127, 127.0f);
_EIGEN_DECLARE_CONST_Packet16f(exp_hi, 88.3762626647950f);
_EIGEN_DECLARE_CONST_Packet16f(exp_lo, -88.3762626647949f);
_EIGEN_DECLARE_CONST_Packet16f(cephes_LOG2EF, 1.44269504088896341f);
_EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p0, 1.9875691500E-4f);
_EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p1, 1.3981999507E-3f);
_EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p2, 8.3334519073E-3f);
_EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p3, 4.1665795894E-2f);
_EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p4, 1.6666665459E-1f);
_EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p5, 5.0000001201E-1f);
// Clamp x.
Packet16f x = pmax(pmin(_x, p16f_exp_hi), p16f_exp_lo);
// Express exp(x) as exp(m*ln(2) + r), start by extracting
// m = floor(x/ln(2) + 0.5).
Packet16f m = _mm512_floor_ps(pmadd(x, p16f_cephes_LOG2EF, p16f_half));
// Get r = x - m*ln(2). Note that we can do this without losing more than one
// ulp precision due to the FMA instruction.
_EIGEN_DECLARE_CONST_Packet16f(nln2, -0.6931471805599453f);
Packet16f r = _mm512_fmadd_ps(m, p16f_nln2, x);
Packet16f r2 = pmul(r, r);
// TODO(gonnet): Split into odd/even polynomials and try to exploit
// instruction-level parallelism.
Packet16f y = p16f_cephes_exp_p0;
y = pmadd(y, r, p16f_cephes_exp_p1);
y = pmadd(y, r, p16f_cephes_exp_p2);
y = pmadd(y, r, p16f_cephes_exp_p3);
y = pmadd(y, r, p16f_cephes_exp_p4);
y = pmadd(y, r, p16f_cephes_exp_p5);
y = pmadd(y, r2, r);
y = padd(y, p16f_1);
// Build emm0 = 2^m.
Packet16i emm0 = _mm512_cvttps_epi32(padd(m, p16f_127));
emm0 = _mm512_slli_epi32(emm0, 23);
// Return 2^m * exp(r).
return pmax(pmul(y, _mm512_castsi512_ps(emm0)), _x);
}
/*template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d
pexp<Packet8d>(const Packet8d& _x) {
Packet8d x = _x;
_EIGEN_DECLARE_CONST_Packet8d(1, 1.0);
_EIGEN_DECLARE_CONST_Packet8d(2, 2.0);
_EIGEN_DECLARE_CONST_Packet8d(exp_hi, 709.437);
_EIGEN_DECLARE_CONST_Packet8d(exp_lo, -709.436139303);
_EIGEN_DECLARE_CONST_Packet8d(cephes_LOG2EF, 1.4426950408889634073599);
_EIGEN_DECLARE_CONST_Packet8d(cephes_exp_p0, 1.26177193074810590878e-4);
_EIGEN_DECLARE_CONST_Packet8d(cephes_exp_p1, 3.02994407707441961300e-2);
_EIGEN_DECLARE_CONST_Packet8d(cephes_exp_p2, 9.99999999999999999910e-1);
_EIGEN_DECLARE_CONST_Packet8d(cephes_exp_q0, 3.00198505138664455042e-6);
_EIGEN_DECLARE_CONST_Packet8d(cephes_exp_q1, 2.52448340349684104192e-3);
_EIGEN_DECLARE_CONST_Packet8d(cephes_exp_q2, 2.27265548208155028766e-1);
_EIGEN_DECLARE_CONST_Packet8d(cephes_exp_q3, 2.00000000000000000009e0);
_EIGEN_DECLARE_CONST_Packet8d(cephes_exp_C1, 0.693145751953125);
_EIGEN_DECLARE_CONST_Packet8d(cephes_exp_C2, 1.42860682030941723212e-6);
// clamp x
x = pmax(pmin(x, p8d_exp_hi), p8d_exp_lo);
// Express exp(x) as exp(g + n*log(2)).
const Packet8d n =
_mm512_mul_round_pd(p8d_cephes_LOG2EF, x, _MM_FROUND_TO_NEAREST_INT);
// Get the remainder modulo log(2), i.e. the "g" described above. Subtract
// n*log(2) out in two steps, i.e. n*C1 + n*C2, C1+C2=log2 to get the last
// digits right.
const Packet8d nC1 = pmul(n, p8d_cephes_exp_C1);
const Packet8d nC2 = pmul(n, p8d_cephes_exp_C2);
x = psub(x, nC1);
x = psub(x, nC2);
const Packet8d x2 = pmul(x, x);
// Evaluate the numerator polynomial of the rational interpolant.
Packet8d px = p8d_cephes_exp_p0;
px = pmadd(px, x2, p8d_cephes_exp_p1);
px = pmadd(px, x2, p8d_cephes_exp_p2);
px = pmul(px, x);
// Evaluate the denominator polynomial of the rational interpolant.
Packet8d qx = p8d_cephes_exp_q0;
qx = pmadd(qx, x2, p8d_cephes_exp_q1);
qx = pmadd(qx, x2, p8d_cephes_exp_q2);
qx = pmadd(qx, x2, p8d_cephes_exp_q3);
// I don't really get this bit, copied from the SSE2 routines, so...
// TODO(gonnet): Figure out what is going on here, perhaps find a better
// rational interpolant?
x = _mm512_div_pd(px, psub(qx, px));
x = pmadd(p8d_2, x, p8d_1);
// Build e=2^n.
const Packet8d e = _mm512_castsi512_pd(_mm512_slli_epi64(
_mm512_add_epi64(_mm512_cvtpd_epi64(n), _mm512_set1_epi64(1023)), 52));
// Construct the result 2^n * exp(g) = e * x. The max is used to catch
// non-finite values in the input.
return pmax(pmul(x, e), _x);
}*/
// Functions for sqrt.
// The EIGEN_FAST_MATH version uses the _mm_rsqrt_ps approximation and one step
// of Newton's method, at a cost of 1-2 bits of precision as opposed to the
// exact solution. The main advantage of this approach is not just speed, but
// also the fact that it can be inlined and pipelined with other computations,
// further reducing its effective latency.
#if EIGEN_FAST_MATH
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
psqrt<Packet16f>(const Packet16f& _x) {
_EIGEN_DECLARE_CONST_Packet16f(one_point_five, 1.5f);
_EIGEN_DECLARE_CONST_Packet16f(minus_half, -0.5f);
_EIGEN_DECLARE_CONST_Packet16f_FROM_INT(flt_min, 0x00800000);
Packet16f neg_half = pmul(_x, p16f_minus_half);
// select only the inverse sqrt of positive normal inputs (denormals are
// flushed to zero and cause infs as well).
__mmask16 non_zero_mask = _mm512_cmp_ps_mask(_x, p16f_flt_min, _CMP_GE_OQ);
Packet16f x = _mm512_mask_blend_ps(non_zero_mask, _mm512_rsqrt14_ps(_x),
_mm512_setzero_ps());
// Do a single step of Newton's iteration.
x = pmul(x, pmadd(neg_half, pmul(x, x), p16f_one_point_five));
// Multiply the original _x by its reciprocal square root to extract the
// square root.
return pmul(_x, x);
}
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d
psqrt<Packet8d>(const Packet8d& _x) {
_EIGEN_DECLARE_CONST_Packet8d(one_point_five, 1.5);
_EIGEN_DECLARE_CONST_Packet8d(minus_half, -0.5);
_EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(dbl_min, 0x0010000000000000LL);
Packet8d neg_half = pmul(_x, p8d_minus_half);
// select only the inverse sqrt of positive normal inputs (denormals are
// flushed to zero and cause infs as well).
__mmask8 non_zero_mask = _mm512_cmp_pd_mask(_x, p8d_dbl_min, _CMP_GE_OQ);
Packet8d x = _mm512_mask_blend_pd(non_zero_mask, _mm512_rsqrt14_pd(_x),
_mm512_setzero_pd());
// Do a first step of Newton's iteration.
x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
// Do a second step of Newton's iteration.
x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
// Multiply the original _x by its reciprocal square root to extract the
// square root.
return pmul(_x, x);
}
#else
template <>
EIGEN_STRONG_INLINE Packet16f psqrt<Packet16f>(const Packet16f& x) {
return _mm512_sqrt_ps(x);
}
template <>
EIGEN_STRONG_INLINE Packet8d psqrt<Packet8d>(const Packet8d& x) {
return _mm512_sqrt_pd(x);
}
#endif
// Functions for rsqrt.
// Almost identical to the sqrt routine, just leave out the last multiplication
// and fill in NaN/Inf where needed. Note that this function only exists as an
// iterative version for doubles since there is no instruction for directly
// computing the reciprocal square root in AVX-512.
#if EIGEN_FAST_MATH
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
prsqrt<Packet16f>(const Packet16f& _x) {
_EIGEN_DECLARE_CONST_Packet16f_FROM_INT(inf, 0x7f800000);
_EIGEN_DECLARE_CONST_Packet16f_FROM_INT(nan, 0x7fc00000);
_EIGEN_DECLARE_CONST_Packet16f(one_point_five, 1.5f);
_EIGEN_DECLARE_CONST_Packet16f(minus_half, -0.5f);
_EIGEN_DECLARE_CONST_Packet16f_FROM_INT(flt_min, 0x00800000);
Packet16f neg_half = pmul(_x, p16f_minus_half);
// select only the inverse sqrt of positive normal inputs (denormals are
// flushed to zero and cause infs as well).
__mmask16 le_zero_mask = _mm512_cmp_ps_mask(_x, p16f_flt_min, _CMP_LT_OQ);
Packet16f x = _mm512_mask_blend_ps(le_zero_mask, _mm512_setzero_ps(),
_mm512_rsqrt14_ps(_x));
// Fill in NaNs and Infs for the negative/zero entries.
__mmask16 neg_mask = _mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_LT_OQ);
Packet16f infs_and_nans = _mm512_mask_blend_ps(
neg_mask, p16f_nan,
_mm512_mask_blend_ps(le_zero_mask, p16f_inf, _mm512_setzero_ps()));
// Do a single step of Newton's iteration.
x = pmul(x, pmadd(neg_half, pmul(x, x), p16f_one_point_five));
// Insert NaNs and Infs in all the right places.
return _mm512_mask_blend_ps(le_zero_mask, infs_and_nans, x);
}
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d
prsqrt<Packet8d>(const Packet8d& _x) {
_EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(inf, 0x7ff0000000000000LL);
_EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(nan, 0x7ff1000000000000LL);
_EIGEN_DECLARE_CONST_Packet8d(one_point_five, 1.5);
_EIGEN_DECLARE_CONST_Packet8d(minus_half, -0.5);
_EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(dbl_min, 0x0010000000000000LL);
Packet8d neg_half = pmul(_x, p8d_minus_half);
// select only the inverse sqrt of positive normal inputs (denormals are
// flushed to zero and cause infs as well).
__mmask8 le_zero_mask = _mm512_cmp_pd_mask(_x, p8d_dbl_min, _CMP_LT_OQ);
Packet8d x = _mm512_mask_blend_pd(le_zero_mask, _mm512_setzero_pd(),
_mm512_rsqrt14_pd(_x));
// Fill in NaNs and Infs for the negative/zero entries.
__mmask8 neg_mask = _mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_LT_OQ);
Packet8d infs_and_nans = _mm512_mask_blend_pd(
neg_mask, p8d_nan,
_mm512_mask_blend_pd(le_zero_mask, p8d_inf, _mm512_setzero_pd()));
// Do a first step of Newton's iteration.
x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
// Do a second step of Newton's iteration.
x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
// Insert NaNs and Infs in all the right places.
return _mm512_mask_blend_pd(le_zero_mask, infs_and_nans, x);
}
#else
template <>
EIGEN_STRONG_INLINE Packet16f prsqrt<Packet16f>(const Packet16f& x) {
return _mm512_rsqrt28_ps(x);
}
#endif
#endif
} // end namespace internal
} // end namespace Eigen
#endif // THIRD_PARTY_EIGEN3_EIGEN_SRC_CORE_ARCH_AVX512_MATHFUNCTIONS_H_