external/eigen3/Eigen/src/Core/arch/ZVector/PacketMath.h
0 → 100755
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016 Konstantinos Margaritis <markos@freevec.org>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_PACKET_MATH_ZVECTOR_H
#define EIGEN_PACKET_MATH_ZVECTOR_H
#include <stdint.h>

namespace Eigen {

namespace internal {
#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 4
#endif
#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#endif
#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
#endif
#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16
#endif
typedef __vector int                 Packet4i;
typedef __vector unsigned int        Packet4ui;
typedef __vector __bool int          Packet4bi;
typedef __vector short int           Packet8i;
typedef __vector unsigned char       Packet16uc;
typedef __vector double              Packet2d;
typedef __vector unsigned long long  Packet2ul;
typedef __vector long long           Packet2l;

typedef struct {
  Packet2d v4f[2];
} Packet4f;

typedef union {
  int32_t   i[4];
  uint32_t  ui[4];
  int64_t   l[2];
  uint64_t  ul[2];
  double    d[2];
  Packet4i  v4i;
  Packet4ui v4ui;
  Packet2l  v2l;
  Packet2ul v2ul;
  Packet2d  v2d;
} Packet;
// We don't want to write the same code all the time, but we need to reuse the constants
// and it doesn't really work to declare them global, so we define macros instead
#define _EIGEN_DECLARE_CONST_FAST_Packet4i(NAME,X) \
Packet4i p4i_##NAME = reinterpret_cast<Packet4i>(vec_splat_s32(X))
#define _EIGEN_DECLARE_CONST_FAST_Packet2d(NAME,X) \
Packet2d p2d_##NAME = reinterpret_cast<Packet2d>(vec_splat_s64(X))
#define _EIGEN_DECLARE_CONST_FAST_Packet2l(NAME,X) \
Packet2l p2l_##NAME = reinterpret_cast<Packet2l>(vec_splat_s64(X))
#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
Packet4i p4i_##NAME = pset1<Packet4i>(X)
#define _EIGEN_DECLARE_CONST_Packet2d(NAME,X) \
Packet2d p2d_##NAME = pset1<Packet2d>(X)
#define _EIGEN_DECLARE_CONST_Packet2l(NAME,X) \
Packet2l p2l_##NAME = pset1<Packet2l>(X)
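// Usage sketch for the macros above (MINUS1 and HALF are hypothetical names, not
// constants defined in this file): the FAST variants splat a small immediate with a
// single vector instruction, the plain variants go through pset1<>().
//
//   _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1, -1);
//     // -> Packet4i p4i_MINUS1 = reinterpret_cast<Packet4i>(vec_splat_s32(-1));
//   _EIGEN_DECLARE_CONST_Packet2d(HALF, 0.5);
//     // -> Packet2d p2d_HALF = pset1<Packet2d>(0.5);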
// These constants are endian-agnostic
//static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0,}
static _EIGEN_DECLARE_CONST_FAST_Packet4i(ONE, 1); //{ 1, 1, 1, 1}
static _EIGEN_DECLARE_CONST_FAST_Packet2d(ZERO, 0);
static _EIGEN_DECLARE_CONST_FAST_Packet2l(ZERO, 0);
static _EIGEN_DECLARE_CONST_FAST_Packet2l(ONE, 1);

static Packet2d p2d_ONE   = { 1.0, 1.0 };
static Packet2d p2d_ZERO_ = { -0.0, -0.0 };

static Packet4i p4i_COUNTDOWN = { 0, 1, 2, 3 };
static Packet4f p4f_COUNTDOWN = { 0.0, 1.0, 2.0, 3.0 };
static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet16uc>(p2d_ZERO), reinterpret_cast<Packet16uc>(p2d_ONE), 8));

static Packet16uc p16uc_PSET64_HI      = { 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 };
static Packet16uc p16uc_DUPLICATE32_HI = { 0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7 };
// Mask alignment
#define _EIGEN_MASK_ALIGNMENT 0xfffffffffffffff0
#define _EIGEN_ALIGNED_PTR(x) ((std::ptrdiff_t)(x) & _EIGEN_MASK_ALIGNMENT)
// Handle endianness properly while loading constants
// Define global static constants:
static Packet16uc p16uc_FORWARD   = { 0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15 };
static Packet16uc p16uc_REVERSE32 = { 12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3 };
static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };

static Packet16uc p16uc_PSET32_WODD  = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8); //{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
static Packet16uc p16uc_PSET32_WEVEN = vec_sld(p16uc_DUPLICATE32_HI, (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8); //{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
/*static Packet16uc p16uc_HALF64_0_16 = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 3), 8); //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};
static Packet16uc p16uc_PSET64_HI = (Packet16uc) vec_mergeh((Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN); //{ 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 };*/
static Packet16uc p16uc_PSET64_LO = (Packet16uc) vec_mergel((Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN); //{ 8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15 };
/*static Packet16uc p16uc_TRANSPOSE64_HI = vec_add(p16uc_PSET64_HI, p16uc_HALF64_0_16); //{ 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
static Packet16uc p16uc_TRANSPOSE64_LO = vec_add(p16uc_PSET64_LO, p16uc_HALF64_0_16); //{ 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31};*/
static Packet16uc p16uc_TRANSPOSE64_HI = { 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23 };
static Packet16uc p16uc_TRANSPOSE64_LO = { 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31 };
//static Packet16uc p16uc_COMPLEX32_REV = vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8); //{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 };
//static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8); //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
#define EIGEN_ZVECTOR_PREFETCH(ADDR) __builtin_prefetch(ADDR);
#else
#define EIGEN_ZVECTOR_PREFETCH(ADDR) asm( " pfd [%[addr]]\n" :: [addr] "r" (ADDR) : "cc" );
#endif
template<> struct packet_traits<int> : default_packet_traits
{
  typedef Packet4i type;
  typedef Packet4i half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 4,
    HasHalfPacket = 0,

    HasAdd   = 1,
    HasSub   = 1,
    HasMul   = 1,
    HasDiv   = 1,
    HasBlend = 1
  };
};

template<> struct packet_traits<float> : default_packet_traits
{
  typedef Packet4f type;
  typedef Packet4f half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 4,
    HasHalfPacket = 0,

    HasAdd    = 1,
    HasSub    = 1,
    HasMul    = 1,
    HasDiv    = 1,
    HasMin    = 1,
    HasMax    = 1,
    HasAbs    = 1,
    HasSin    = 0,
    HasCos    = 0,
    HasLog    = 0,
    HasExp    = 1,
    HasSqrt   = 1,
    HasRsqrt  = 1,
    HasRound  = 1,
    HasFloor  = 1,
    HasCeil   = 1,
    HasNegate = 1,
    HasBlend  = 1
  };
};

template<> struct packet_traits<double> : default_packet_traits
{
  typedef Packet2d type;
  typedef Packet2d half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 2,
    HasHalfPacket = 1,

    HasAdd    = 1,
    HasSub    = 1,
    HasMul    = 1,
    HasDiv    = 1,
    HasMin    = 1,
    HasMax    = 1,
    HasAbs    = 1,
    HasSin    = 0,
    HasCos    = 0,
    HasLog    = 0,
    HasExp    = 1,
    HasSqrt   = 1,
    HasRsqrt  = 1,
    HasRound  = 1,
    HasFloor  = 1,
    HasCeil   = 1,
    HasNegate = 1,
    HasBlend  = 1
  };
};

template<> struct unpacket_traits<Packet4i> { typedef int    type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; };
template<> struct unpacket_traits<Packet4f> { typedef float  type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; };
template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; };
/* Forward declaration */
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4f,4>& kernel);
inline std::ostream & operator <<(std::ostream & s, const Packet4i & v)
{
  Packet vt;
  vt.v4i = v;
  s << vt.i[0] << ", " << vt.i[1] << ", " << vt.i[2] << ", " << vt.i[3];
  return s;
}

inline std::ostream & operator <<(std::ostream & s, const Packet4ui & v)
{
  Packet vt;
  vt.v4ui = v;
  s << vt.ui[0] << ", " << vt.ui[1] << ", " << vt.ui[2] << ", " << vt.ui[3];
  return s;
}

inline std::ostream & operator <<(std::ostream & s, const Packet2l & v)
{
  Packet vt;
  vt.v2l = v;
  s << vt.l[0] << ", " << vt.l[1];
  return s;
}

inline std::ostream & operator <<(std::ostream & s, const Packet2ul & v)
{
  Packet vt;
  vt.v2ul = v;
  s << vt.ul[0] << ", " << vt.ul[1];
  return s;
}

inline std::ostream & operator <<(std::ostream & s, const Packet2d & v)
{
  Packet vt;
  vt.v2d = v;
  s << vt.d[0] << ", " << vt.d[1];
  return s;
}
/* Helper function to simulate a vec_splat_packet4f */
template<int element> EIGEN_STRONG_INLINE Packet4f vec_splat_packet4f(const Packet4f& from)
{
  Packet4f splat;
  switch (element) {
  case 0:
    splat.v4f[0] = vec_splat(from.v4f[0], 0);
    splat.v4f[1] = splat.v4f[0];
    break;
  case 1:
    splat.v4f[0] = vec_splat(from.v4f[0], 1);
    splat.v4f[1] = splat.v4f[0];
    break;
  case 2:
    splat.v4f[0] = vec_splat(from.v4f[1], 0);
    splat.v4f[1] = splat.v4f[0];
    break;
  case 3:
    splat.v4f[0] = vec_splat(from.v4f[1], 1);
    splat.v4f[1] = splat.v4f[0];
    break;
  }
  return splat;
}
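// Illustrative sketch (hypothetical data): with the emulated layout
// Packet4f = { v4f[0] = {f0,f1}, v4f[1] = {f2,f3} }, splatting element 2 copies f2
// into all four lanes.
//
//   Packet4f x = pload<Packet4f>(data);     // {f0, f1, f2, f3}
//   Packet4f s = vec_splat_packet4f<2>(x);  // {f2, f2, f2, f2}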
template<int Offset>
struct palign_impl<Offset,Packet4i>
{
  static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second)
  {
    switch (Offset % 4) {
    case 1:
      first = vec_sld(first, second, 4); break;
    case 2:
      first = vec_sld(first, second, 8); break;
    case 3:
      first = vec_sld(first, second, 12); break;
    }
  }
};
/* This is a tricky one, we have to translate float alignment to vector elements of sizeof double */
template<int Offset>
struct palign_impl<Offset,Packet4f>
{
  static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second)
  {
    switch (Offset % 4) {
    case 1:
      first.v4f[0] = vec_sld(first.v4f[0], first.v4f[1], 8);
      first.v4f[1] = vec_sld(first.v4f[1], second.v4f[0], 8);
      break;
    case 2:
      first.v4f[0] = first.v4f[1];
      first.v4f[1] = second.v4f[0];
      break;
    case 3:
      first.v4f[0] = vec_sld(first.v4f[1],  second.v4f[0], 8);
      first.v4f[1] = vec_sld(second.v4f[0], second.v4f[1], 8);
      break;
    }
  }
};
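// Usage sketch (hypothetical data; the generic palign<Offset>() helper forwards to
// palign_impl<Offset,Packet>::run): shifting a float packet by one scalar.
//
//   Packet4f first  = pload<Packet4f>(data);      // {f0, f1, f2, f3}
//   Packet4f second = pload<Packet4f>(data + 4);  // {f4, f5, f6, f7}
//   palign_impl<1, Packet4f>::run(first, second); // first == {f1, f2, f3, f4}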
template<int Offset>
struct palign_impl<Offset,Packet2d>
{
  static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second)
  {
    if (Offset == 1)
      first = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(first), reinterpret_cast<Packet4i>(second), 8));
  }
};
template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from)
{
  // FIXME: No intrinsic yet
  EIGEN_DEBUG_ALIGNED_LOAD
  Packet *vfrom;
  vfrom = (Packet *) from;
  return vfrom->v4i;
}

template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from)
{
  // FIXME: No intrinsic yet
  EIGEN_DEBUG_ALIGNED_LOAD
  Packet4f vfrom;
  vfrom.v4f[0] = vec_ld2f(&from[0]);
  vfrom.v4f[1] = vec_ld2f(&from[2]);
  return vfrom;
}

template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from)
{
  // FIXME: No intrinsic yet
  EIGEN_DEBUG_ALIGNED_LOAD
  Packet *vfrom;
  vfrom = (Packet *) from;
  return vfrom->v2d;
}

template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from)
{
  // FIXME: No intrinsic yet
  EIGEN_DEBUG_ALIGNED_STORE
  Packet *vto;
  vto = (Packet *) to;
  vto->v4i = from;
}

template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from)
{
  // FIXME: No intrinsic yet
  EIGEN_DEBUG_ALIGNED_STORE
  vec_st2f(from.v4f[0], &to[0]);
  vec_st2f(from.v4f[1], &to[2]);
}

template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from)
{
  // FIXME: No intrinsic yet
  EIGEN_DEBUG_ALIGNED_STORE
  Packet *vto;
  vto = (Packet *) to;
  vto->v2d = from;
}

template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from)
{
  return vec_splats(from);
}

template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from)
{
  return vec_splats(from);
}

template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from)
{
  Packet4f to;
  to.v4f[0] = pset1<Packet2d>(static_cast<const double&>(from));
  to.v4f[1] = to.v4f[0];
  return to;
}

template<> EIGEN_STRONG_INLINE void
pbroadcast4<Packet4i>(const int *a,
                      Packet4i& a0, Packet4i& a1, Packet4i& a2, Packet4i& a3)
{
  a3 = pload<Packet4i>(a);
  a0 = vec_splat(a3, 0);
  a1 = vec_splat(a3, 1);
  a2 = vec_splat(a3, 2);
  a3 = vec_splat(a3, 3);
}

template<> EIGEN_STRONG_INLINE void
pbroadcast4<Packet4f>(const float *a,
                      Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
{
  a3 = pload<Packet4f>(a);
  a0 = vec_splat_packet4f<0>(a3);
  a1 = vec_splat_packet4f<1>(a3);
  a2 = vec_splat_packet4f<2>(a3);
  a3 = vec_splat_packet4f<3>(a3);
}

template<> EIGEN_STRONG_INLINE void
pbroadcast4<Packet2d>(const double *a,
                      Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3)
{
  a1 = pload<Packet2d>(a);
  a0 = vec_splat(a1, 0);
  a1 = vec_splat(a1, 1);
  a3 = pload<Packet2d>(a+2);
  a2 = vec_splat(a3, 0);
  a3 = vec_splat(a3, 1);
}
template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride)
{
  int EIGEN_ALIGN16 ai[4];
  ai[0] = from[0*stride];
  ai[1] = from[1*stride];
  ai[2] = from[2*stride];
  ai[3] = from[3*stride];
  return pload<Packet4i>(ai);
}

template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
{
  float EIGEN_ALIGN16 ai[4];
  ai[0] = from[0*stride];
  ai[1] = from[1*stride];
  ai[2] = from[2*stride];
  ai[3] = from[3*stride];
  return pload<Packet4f>(ai);
}

template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride)
{
  double EIGEN_ALIGN16 af[2];
  af[0] = from[0*stride];
  af[1] = from[1*stride];
  return pload<Packet2d>(af);
}

template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride)
{
  int EIGEN_ALIGN16 ai[4];
  pstore<int>((int *)ai, from);
  to[0*stride] = ai[0];
  to[1*stride] = ai[1];
  to[2*stride] = ai[2];
  to[3*stride] = ai[3];
}

template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
{
  float EIGEN_ALIGN16 ai[4];
  pstore<float>((float *)ai, from);
  to[0*stride] = ai[0];
  to[1*stride] = ai[1];
  to[2*stride] = ai[2];
  to[3*stride] = ai[3];
}

template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
{
  double EIGEN_ALIGN16 af[2];
  pstore<double>(af, from);
  to[0*stride] = af[0];
  to[1*stride] = af[1];
}
template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return (a + b); }
template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b)
{
  Packet4f c;
  c.v4f[0] = a.v4f[0] + b.v4f[0];
  c.v4f[1] = a.v4f[1] + b.v4f[1];
  return c;
}
template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return (a + b); }

template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return (a - b); }
template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b)
{
  Packet4f c;
  c.v4f[0] = a.v4f[0] - b.v4f[0];
  c.v4f[1] = a.v4f[1] - b.v4f[1];
  return c;
}
template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return (a - b); }

template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) { return (a * b); }
template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b)
{
  Packet4f c;
  c.v4f[0] = a.v4f[0] * b.v4f[0];
  c.v4f[1] = a.v4f[1] * b.v4f[1];
  return c;
}
template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return (a * b); }

template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& a, const Packet4i& b) { return (a / b); }
template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
{
  Packet4f c;
  c.v4f[0] = a.v4f[0] / b.v4f[0];
  c.v4f[1] = a.v4f[1] / b.v4f[1];
  return c;
}
template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return (a / b); }
template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return (-a); }
template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a)
{
  Packet4f c;
  c.v4f[0] = -a.v4f[0];
  c.v4f[1] = -a.v4f[1];
  return c;
}
template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return (-a); }

template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }

template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd<Packet4i>(pmul<Packet4i>(a, b), c); }
template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c)
{
  Packet4f res;
  res.v4f[0] = vec_madd(a.v4f[0], b.v4f[0], c.v4f[0]);
  res.v4f[1] = vec_madd(a.v4f[1], b.v4f[1], c.v4f[1]);
  return res;
}
template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_madd(a, b, c); }
template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a)    { return padd<Packet4i>(pset1<Packet4i>(a), p4i_COUNTDOWN); }
template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a)  { return padd<Packet4f>(pset1<Packet4f>(a), p4f_COUNTDOWN); }
template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) { return padd<Packet2d>(pset1<Packet2d>(a), p2d_COUNTDOWN); }

template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); }
template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_min(a, b); }
template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b)
{
  Packet4f res;
  res.v4f[0] = pmin(a.v4f[0], b.v4f[0]);
  res.v4f[1] = pmin(a.v4f[1], b.v4f[1]);
  return res;
}

template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); }
template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_max(a, b); }
template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b)
{
  Packet4f res;
  res.v4f[0] = pmax(a.v4f[0], b.v4f[0]);
  res.v4f[1] = pmax(a.v4f[1], b.v4f[1]);
  return res;
}
template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, b); }
template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, b); }
template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b)
{
  Packet4f res;
  res.v4f[0] = pand(a.v4f[0], b.v4f[0]);
  res.v4f[1] = pand(a.v4f[1], b.v4f[1]);
  return res;
}

template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_or(a, b); }
template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_or(a, b); }
template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b)
{
  Packet4f res;
  res.v4f[0] = por(a.v4f[0], b.v4f[0]);
  res.v4f[1] = por(a.v4f[1], b.v4f[1]);
  return res;
}
template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_xor(a, b); }
template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_xor(a, b); }
template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b)
{
  Packet4f res;
  res.v4f[0] = pxor(a.v4f[0], b.v4f[0]);
  res.v4f[1] = pxor(a.v4f[1], b.v4f[1]);
  return res;
}
template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return pand<Packet4i>(a, vec_nor(b, b)); }
template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, vec_nor(b, b)); }
template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b)
{
  Packet4f res;
  res.v4f[0] = pandnot(a.v4f[0], b.v4f[0]);
  res.v4f[1] = pandnot(a.v4f[1], b.v4f[1]);
  return res;
}
template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a)
{
  Packet4f res;
  res.v4f[0] = vec_round(a.v4f[0]);
  res.v4f[1] = vec_round(a.v4f[1]);
  return res;
}
template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) { return vec_round(a); }

template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a)
{
  Packet4f res;
  res.v4f[0] = vec_ceil(a.v4f[0]);
  res.v4f[1] = vec_ceil(a.v4f[1]);
  return res;
}
template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) { return vec_ceil(a); }

template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a)
{
  Packet4f res;
  res.v4f[0] = vec_floor(a.v4f[0]);
  res.v4f[1] = vec_floor(a.v4f[1]);
  return res;
}
template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { return vec_floor(a); }

template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)    { return pload<Packet4i>(from); }
template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)  { return pload<Packet4f>(from); }
template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) { return pload<Packet2d>(from); }
template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from)
{
  Packet4i p = pload<Packet4i>(from);
  return vec_perm(p, p, p16uc_DUPLICATE32_HI);
}

template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
{
  Packet4f p = pload<Packet4f>(from);
  p.v4f[1] = vec_splat(p.v4f[0], 1);
  p.v4f[0] = vec_splat(p.v4f[0], 0);
  return p;
}

template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from)
{
  Packet2d p = pload<Packet2d>(from);
  return vec_perm(p, p, p16uc_PSET64_HI);
}

template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from)       { pstore<int>(to, from); }
template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from)   { pstore<float>(to, from); }
template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) { pstore<double>(to, from); }

template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr)       { EIGEN_ZVECTOR_PREFETCH(addr); }
template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr)   { EIGEN_ZVECTOR_PREFETCH(addr); }
template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { int    EIGEN_ALIGN16 x[4]; pstore(x, a); return x[0]; }
template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { float  EIGEN_ALIGN16 x[2]; vec_st2f(a.v4f[0], &x[0]); return x[0]; }
template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { double EIGEN_ALIGN16 x[2]; pstore(x, a); return x[0]; }

template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a)
{
  return reinterpret_cast<Packet4i>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32));
}

template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
{
  return reinterpret_cast<Packet2d>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE64));
}

template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
{
  Packet4f rev;
  rev.v4f[0] = preverse<Packet2d>(a.v4f[1]);
  rev.v4f[1] = preverse<Packet2d>(a.v4f[0]);
  return rev;
}
template<> EIGEN_STRONG_INLINE Packet4i pabs<Packet4i>(const Packet4i& a) { return vec_abs(a); }
template<> EIGEN_STRONG_INLINE Packet2d pabs<Packet2d>(const Packet2d& a) { return vec_abs(a); }
template<> EIGEN_STRONG_INLINE Packet4f pabs<Packet4f>(const Packet4f& a)
{
  Packet4f res;
  res.v4f[0] = pabs(a.v4f[0]);
  res.v4f[1] = pabs(a.v4f[1]);
  return res;
}

template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
{
  Packet4i b, sum;
  b   = vec_sld(a, a, 8);
  sum = padd<Packet4i>(a, b);
  b   = vec_sld(sum, sum, 4);
  sum = padd<Packet4i>(sum, b);
  return pfirst(sum);
}

template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
{
  Packet2d b, sum;
  b   = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8));
  sum = padd<Packet2d>(a, b);
  return pfirst(sum);
}

template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
{
  Packet2d sum;
  sum = padd<Packet2d>(a.v4f[0], a.v4f[1]);
  double first = predux<Packet2d>(sum);
  return static_cast<float>(first);
}
template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
{
  Packet4i v[4], sum[4];

  // It's easier and faster to transpose then add as columns
  // Check: http://www.freevec.org/function/matrix_4x4_transpose_floats for explanation
  // Do the transpose, first set of moves
  v[0] = vec_mergeh(vecs[0], vecs[2]);
  v[1] = vec_mergel(vecs[0], vecs[2]);
  v[2] = vec_mergeh(vecs[1], vecs[3]);
  v[3] = vec_mergel(vecs[1], vecs[3]);
  // Get the resulting vectors
  sum[0] = vec_mergeh(v[0], v[2]);
  sum[1] = vec_mergel(v[0], v[2]);
  sum[2] = vec_mergeh(v[1], v[3]);
  sum[3] = vec_mergel(v[1], v[3]);

  // Now do the summation:
  // Lines 0+1
  sum[0] = padd<Packet4i>(sum[0], sum[1]);
  // Lines 2+3
  sum[1] = padd<Packet4i>(sum[2], sum[3]);
  // Add the results
  sum[0] = padd<Packet4i>(sum[0], sum[1]);

  return sum[0];
}
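// What preduxp computes, sketched with hypothetical inputs: lane i of the result is
// the horizontal sum of vecs[i], so transposing first turns four horizontal sums
// into three plain vertical padd calls.
//
//   Packet4i vecs[4];  // vecs[0] = {1,2,3,4}, vecs[1] = {10,20,30,40}, ...
//   Packet4i r = preduxp<Packet4i>(vecs);
//   // r == { 10, 100, sum(vecs[2]), sum(vecs[3]) }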
template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
{
  Packet2d v[2], sum;
  v[0] = padd<Packet2d>(vecs[0], reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(vecs[0]), reinterpret_cast<Packet4ui>(vecs[0]), 8)));
  v[1] = padd<Packet2d>(vecs[1], reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(vecs[1]), reinterpret_cast<Packet4ui>(vecs[1]), 8)));

  sum = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(v[0]), reinterpret_cast<Packet4ui>(v[1]), 8));

  return sum;
}

template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
{
  PacketBlock<Packet4f,4> transpose;
  transpose.packet[0] = vecs[0];
  transpose.packet[1] = vecs[1];
  transpose.packet[2] = vecs[2];
  transpose.packet[3] = vecs[3];
  ptranspose(transpose);

  Packet4f sum = padd(transpose.packet[0], transpose.packet[1]);
  sum = padd(sum, transpose.packet[2]);
  sum = padd(sum, transpose.packet[3]);
  return sum;
}
// Other reduction functions:
// mul
template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
{
  EIGEN_ALIGN16 int aux[4];
  pstore(aux, a);
  return aux[0] * aux[1] * aux[2] * aux[3];
}

template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
{
  return pfirst(pmul(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8))));
}

template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
{
  // Return predux_mul<Packet2d> of the subvectors product
  return static_cast<float>(pfirst(predux_mul(pmul(a.v4f[0], a.v4f[1]))));
}
// min
template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
{
  Packet4i b, res;
  b   = pmin<Packet4i>(a, vec_sld(a, a, 8));
  res = pmin<Packet4i>(b, vec_sld(b, b, 4));
  return pfirst(res);
}

template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a)
{
  return pfirst(pmin<Packet2d>(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8))));
}

template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
{
  Packet2d b, res;
  b   = pmin<Packet2d>(a.v4f[0], a.v4f[1]);
  res = pmin<Packet2d>(b, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(b), reinterpret_cast<Packet4i>(b), 8)));
  return static_cast<float>(pfirst(res));
}
// max
template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
{
  Packet4i b, res;
  b   = pmax<Packet4i>(a, vec_sld(a, a, 8));
  res = pmax<Packet4i>(b, vec_sld(b, b, 4));
  return pfirst(res);
}

template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
{
  return pfirst(pmax<Packet2d>(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8))));
}

template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
{
  Packet2d b, res;
  b   = pmax<Packet2d>(a.v4f[0], a.v4f[1]);
  res = pmax<Packet2d>(b, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(b), reinterpret_cast<Packet4i>(b), 8)));
  return static_cast<float>(pfirst(res));
}
EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet4i,4>& kernel) {
  Packet4i t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
  Packet4i t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
  Packet4i t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
  Packet4i t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
  kernel.packet[0] = vec_mergeh(t0, t2);
  kernel.packet[1] = vec_mergel(t0, t2);
  kernel.packet[2] = vec_mergeh(t1, t3);
  kernel.packet[3] = vec_mergel(t1, t3);
}

EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet2d,2>& kernel) {
  Packet2d t0 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_HI);
  Packet2d t1 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_LO);
  kernel.packet[0] = t0;
  kernel.packet[1] = t1;
}
/* Split the Packet4f PacketBlock into 4 Packet2d PacketBlocks and transpose each one */
EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet4f,4>& kernel) {
  PacketBlock<Packet2d,2> t0, t1, t2, t3;

  // copy top-left 2x2 Packet2d block
  t0.packet[0] = kernel.packet[0].v4f[0];
  t0.packet[1] = kernel.packet[1].v4f[0];

  // copy top-right 2x2 Packet2d block
  t1.packet[0] = kernel.packet[0].v4f[1];
  t1.packet[1] = kernel.packet[1].v4f[1];

  // copy bottom-left 2x2 Packet2d block
  t2.packet[0] = kernel.packet[2].v4f[0];
  t2.packet[1] = kernel.packet[3].v4f[0];

  // copy bottom-right 2x2 Packet2d block
  t3.packet[0] = kernel.packet[2].v4f[1];
  t3.packet[1] = kernel.packet[3].v4f[1];

  // Transpose all 2x2 blocks
  ptranspose(t0);
  ptranspose(t1);
  ptranspose(t2);
  ptranspose(t3);

  // Copy back transposed blocks, but exchange t1 and t2 due to transposition
  kernel.packet[0].v4f[0] = t0.packet[0];
  kernel.packet[0].v4f[1] = t2.packet[0];
  kernel.packet[1].v4f[0] = t0.packet[1];
  kernel.packet[1].v4f[1] = t2.packet[1];
  kernel.packet[2].v4f[0] = t1.packet[0];
  kernel.packet[2].v4f[1] = t3.packet[0];
  kernel.packet[3].v4f[0] = t1.packet[1];
  kernel.packet[3].v4f[1] = t3.packet[1];
}
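// Sketch of the block decomposition above (values hypothetical): viewing the four
// Packet4f rows as a 2x2 grid of 2x2 double blocks,
//
//   [ A B ]                  [ A^T C^T ]
//   [ C D ]   transposes to  [ B^T D^T ]
//
// which is why t1 (top-right, B) and t2 (bottom-left, C) swap places when the
// transposed blocks are copied back into the kernel.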
template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) {
  Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] };
  Packet4ui mask = vec_cmpeq(select, reinterpret_cast<Packet4ui>(p4i_ONE));
  return vec_sel(elsePacket, thenPacket, mask);
}

template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) {
  Packet2ul select_hi = { ifPacket.select[0], ifPacket.select[1] };
  Packet2ul select_lo = { ifPacket.select[2], ifPacket.select[3] };
  Packet2ul mask_hi = vec_cmpeq(select_hi, reinterpret_cast<Packet2ul>(p2l_ONE));
  Packet2ul mask_lo = vec_cmpeq(select_lo, reinterpret_cast<Packet2ul>(p2l_ONE));
  Packet4f result;
  result.v4f[0] = vec_sel(elsePacket.v4f[0], thenPacket.v4f[0], mask_hi);
  result.v4f[1] = vec_sel(elsePacket.v4f[1], thenPacket.v4f[1], mask_lo);
  return result;
}

template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) {
  Packet2ul select = { ifPacket.select[0], ifPacket.select[1] };
  Packet2ul mask = vec_cmpeq(select, reinterpret_cast<Packet2ul>(p2l_ONE));
  return vec_sel(elsePacket, thenPacket, mask);
}

} // end namespace internal

} // end namespace Eigen

#endif // EIGEN_PACKET_MATH_ZVECTOR_H
external/eigen3/Eigen/src/Core/functors/AssignmentFunctors.h
0 → 100644
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_ASSIGNMENT_FUNCTORS_H
#define EIGEN_ASSIGNMENT_FUNCTORS_H
namespace Eigen {

namespace internal {
/** \internal
* \brief Template functor for scalar/packet assignment
*
*/
template<typename DstScalar,typename SrcScalar> struct assign_op {

  EIGEN_EMPTY_STRUCT_CTOR(assign_op)
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(DstScalar& a, const SrcScalar& b) const { a = b; }

  template<int Alignment, typename Packet>
  EIGEN_STRONG_INLINE void assignPacket(DstScalar* a, const Packet& b) const
  { internal::pstoret<DstScalar,Packet,Alignment>(a,b); }
};

// Empty overload for void type (used by PermutationMatrix)
template<typename DstScalar> struct assign_op<DstScalar,void> {};
template<typename DstScalar,typename SrcScalar>
struct functor_traits<assign_op<DstScalar,SrcScalar> > {
  enum {
    Cost = NumTraits<DstScalar>::ReadCost,
    PacketAccess = is_same<DstScalar,SrcScalar>::value && packet_traits<DstScalar>::Vectorizable && packet_traits<SrcScalar>::Vectorizable
  };
};
/** \internal
* \brief Template functor for scalar/packet assignment with addition
*
*/
template<typename DstScalar,typename SrcScalar> struct add_assign_op {

  EIGEN_EMPTY_STRUCT_CTOR(add_assign_op)
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(DstScalar& a, const SrcScalar& b) const { a += b; }

  template<int Alignment, typename Packet>
  EIGEN_STRONG_INLINE void assignPacket(DstScalar* a, const Packet& b) const
  { internal::pstoret<DstScalar,Packet,Alignment>(a,internal::padd(internal::ploadt<Packet,Alignment>(a),b)); }
};

template<typename DstScalar,typename SrcScalar>
struct functor_traits<add_assign_op<DstScalar,SrcScalar> > {
  enum {
    Cost = NumTraits<DstScalar>::ReadCost + NumTraits<DstScalar>::AddCost,
    PacketAccess = is_same<DstScalar,SrcScalar>::value && packet_traits<DstScalar>::HasAdd
  };
};
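// Usage sketch (this is not Eigen's actual assignment kernel, which lives in the
// evaluator machinery; dst, src and n are hypothetical raw float buffers): an
// assignment functor is driven through a packet body plus a scalar tail.
//
//   typedef packet_traits<float>::type PacketType;
//   const Index PS = packet_traits<float>::size;
//   add_assign_op<float,float> func;
//   Index i = 0;
//   for(; i + PS <= n; i += PS)    // vectorized body: dst[i..i+PS) += src[i..i+PS)
//     func.assignPacket<Aligned16>(dst + i, internal::ploadt<PacketType,Aligned16>(src + i));
//   for(; i < n; ++i)              // scalar tail
//     func.assignCoeff(dst[i], src[i]);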
/** \internal
* \brief Template functor for scalar/packet assignment with subtraction
*
*/
template<typename DstScalar,typename SrcScalar> struct sub_assign_op {

  EIGEN_EMPTY_STRUCT_CTOR(sub_assign_op)
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(DstScalar& a, const SrcScalar& b) const { a -= b; }

  template<int Alignment, typename Packet>
  EIGEN_STRONG_INLINE void assignPacket(DstScalar* a, const Packet& b) const
  { internal::pstoret<DstScalar,Packet,Alignment>(a,internal::psub(internal::ploadt<Packet,Alignment>(a),b)); }
};

template<typename DstScalar,typename SrcScalar>
struct functor_traits<sub_assign_op<DstScalar,SrcScalar> > {
  enum {
    Cost = NumTraits<DstScalar>::ReadCost + NumTraits<DstScalar>::AddCost,
    PacketAccess = is_same<DstScalar,SrcScalar>::value && packet_traits<DstScalar>::HasSub
  };
};
/** \internal
* \brief Template functor for scalar/packet assignment with multiplication
*
*/
template<typename DstScalar, typename SrcScalar=DstScalar> struct mul_assign_op {

  EIGEN_EMPTY_STRUCT_CTOR(mul_assign_op)
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(DstScalar& a, const SrcScalar& b) const { a *= b; }

  template<int Alignment, typename Packet>
  EIGEN_STRONG_INLINE void assignPacket(DstScalar* a, const Packet& b) const
  { internal::pstoret<DstScalar,Packet,Alignment>(a,internal::pmul(internal::ploadt<Packet,Alignment>(a),b)); }
};

template<typename DstScalar, typename SrcScalar>
struct functor_traits<mul_assign_op<DstScalar,SrcScalar> > {
  enum {
    Cost = NumTraits<DstScalar>::ReadCost + NumTraits<DstScalar>::MulCost,
    PacketAccess = is_same<DstScalar,SrcScalar>::value && packet_traits<DstScalar>::HasMul
  };
};
/** \internal
* \brief Template functor for scalar/packet assignment with division
*
*/
template<typename DstScalar, typename SrcScalar=DstScalar> struct div_assign_op {

  EIGEN_EMPTY_STRUCT_CTOR(div_assign_op)
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(DstScalar& a, const SrcScalar& b) const { a /= b; }

  template<int Alignment, typename Packet>
  EIGEN_STRONG_INLINE void assignPacket(DstScalar* a, const Packet& b) const
  { internal::pstoret<DstScalar,Packet,Alignment>(a,internal::pdiv(internal::ploadt<Packet,Alignment>(a),b)); }
};

template<typename DstScalar, typename SrcScalar>
struct functor_traits<div_assign_op<DstScalar,SrcScalar> > {
  enum {
    Cost = NumTraits<DstScalar>::ReadCost + NumTraits<DstScalar>::MulCost,
    PacketAccess = is_same<DstScalar,SrcScalar>::value && packet_traits<DstScalar>::HasDiv
  };
};
/** \internal
  * \brief Template functor for scalar/packet assignment with swapping
  *
  * It works as follows. For a non-vectorized evaluation loop, we have:
  *   for(i) func(A.coeffRef(i), B.coeff(i));
  * where B is a SwapWrapper expression. The trick is to make SwapWrapper::coeff behave like a non-const coeffRef.
  * Actually, SwapWrapper might not even be needed: since B has to be writable even when it is a plain expression,
  * B.coeff already returns a const reference to the underlying scalar value.
  *
  * The case of a vectorized loop is more tricky:
  *   for(i,j) func.assignPacket<A_Align>(&A.coeffRef(i,j), B.packet<B_Align>(i,j));
  * Here, B must be a SwapWrapper whose packet function actually returns a proxy object holding a Scalar*,
  * the actual alignment and the Packet type.
  */
template<typename Scalar> struct swap_assign_op {

  EIGEN_EMPTY_STRUCT_CTOR(swap_assign_op)
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Scalar& a, const Scalar& b) const
  {
#ifdef __CUDACC__
    // FIXME is there some kind of cuda::swap?
    Scalar t = b; const_cast<Scalar&>(b) = a; a = t;
#else
    using std::swap;
    swap(a, const_cast<Scalar&>(b));
#endif
  }
};

template<typename Scalar>
struct functor_traits<swap_assign_op<Scalar> > {
  enum {
    Cost = 3 * NumTraits<Scalar>::ReadCost,
    PacketAccess = packet_traits<Scalar>::Vectorizable
  };
};
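// Scalar-path sketch of the swap functor (a, b and n are hypothetical buffers; the
// real driver is the swap() expression machinery, not a hand-written loop):
//
//   swap_assign_op<float> func;
//   for(Index i = 0; i < n; ++i)
//     func.assignCoeff(a[i], b[i]);  // afterwards a and b have exchanged their contents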
} // namespace internal

} // namespace Eigen

#endif // EIGEN_ASSIGNMENT_FUNCTORS_H
external/eigen3/Eigen/src/Core/functors/BinaryFunctors.h
0 → 100644
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_BINARY_FUNCTORS_H
#define EIGEN_BINARY_FUNCTORS_H
namespace Eigen {

namespace internal {
//---------- associative binary functors ----------
template<typename Arg1, typename Arg2>
struct binary_op_base
{
  typedef Arg1 first_argument_type;
  typedef Arg2 second_argument_type;
};
/** \internal
* \brief Template functor to compute the sum of two scalars
*
* \sa class CwiseBinaryOp, MatrixBase::operator+, class VectorwiseOp, DenseBase::sum()
*/
template<typename LhsScalar,typename RhsScalar>
struct scalar_sum_op : binary_op_base<LhsScalar,RhsScalar>
{
  typedef typename ScalarBinaryOpTraits<LhsScalar,RhsScalar,scalar_sum_op>::ReturnType result_type;
#ifndef EIGEN_SCALAR_BINARY_OP_PLUGIN
  EIGEN_EMPTY_STRUCT_CTOR(scalar_sum_op)
#else
  scalar_sum_op() {
    EIGEN_SCALAR_BINARY_OP_PLUGIN
  }
#endif
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a + b; }
  template<typename Packet>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
  { return internal::padd(a,b); }
  template<typename Packet>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type predux(const Packet& a) const
  { return internal::predux(a); }
};

template<typename LhsScalar,typename RhsScalar>
struct functor_traits<scalar_sum_op<LhsScalar,RhsScalar> > {
  enum {
    Cost = (NumTraits<LhsScalar>::AddCost+NumTraits<RhsScalar>::AddCost)/2, // rough estimate!
    PacketAccess = is_same<LhsScalar,RhsScalar>::value && packet_traits<LhsScalar>::HasAdd && packet_traits<RhsScalar>::HasAdd
    // TODO vectorize mixed sum
  };
};
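// Sketch of the functor protocol shared by the binary functors in this file
// (hypothetical values; PacketType is whatever packet_traits selects for float,
// assumed to hold 4 lanes here): the scalar call, the packet call and the
// horizontal reduction are consistent with each other.
//
//   typedef packet_traits<float>::type PacketType;
//   scalar_sum_op<float,float> op;
//   float s = op(1.f, 2.f);                                          // 3
//   PacketType pa = pset1<PacketType>(1.f), pb = pset1<PacketType>(2.f);
//   PacketType ps = op.packetOp(pa, pb);                             // {3,3,3,3}
//   float r = op.predux(ps);                                         // 12 with 4 lanes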
/** \internal
* \brief Template specialization to deprecate the summation of boolean expressions.
* This is required to solve Bug 426.
* \sa DenseBase::count(), DenseBase::any(), ArrayBase::cast(), MatrixBase::cast()
*/
template<> struct scalar_sum_op<bool,bool> : scalar_sum_op<int,int> {
  EIGEN_DEPRECATED
  scalar_sum_op() {}
};
/** \internal
* \brief Template functor to compute the product of two scalars
*
* \sa class CwiseBinaryOp, Cwise::operator*(), class VectorwiseOp, MatrixBase::redux()
*/
template<typename LhsScalar,typename RhsScalar>
struct scalar_product_op : binary_op_base<LhsScalar,RhsScalar>
{
  typedef typename ScalarBinaryOpTraits<LhsScalar,RhsScalar,scalar_product_op>::ReturnType result_type;
#ifndef EIGEN_SCALAR_BINARY_OP_PLUGIN
  EIGEN_EMPTY_STRUCT_CTOR(scalar_product_op)
#else
  scalar_product_op() {
    EIGEN_SCALAR_BINARY_OP_PLUGIN
  }
#endif
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a * b; }
  template<typename Packet>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
  { return internal::pmul(a,b); }
  template<typename Packet>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type predux(const Packet& a) const
  { return internal::predux_mul(a); }
};

template<typename LhsScalar,typename RhsScalar>
struct functor_traits<scalar_product_op<LhsScalar,RhsScalar> > {
  enum {
    Cost = (NumTraits<LhsScalar>::MulCost + NumTraits<RhsScalar>::MulCost)/2, // rough estimate!
    PacketAccess = is_same<LhsScalar,RhsScalar>::value && packet_traits<LhsScalar>::HasMul && packet_traits<RhsScalar>::HasMul
    // TODO vectorize mixed product
  };
};
/** \internal
* \brief Template functor to compute the conjugate product of two scalars
*
* This is a short cut for conj(x) * y which is needed for optimization purposes; in Eigen2 support mode, this becomes x * conj(y)
*/
template<typename LhsScalar,typename RhsScalar>
struct scalar_conj_product_op : binary_op_base<LhsScalar,RhsScalar>
{
  enum {
    Conj = NumTraits<LhsScalar>::IsComplex
  };

  typedef typename ScalarBinaryOpTraits<LhsScalar,RhsScalar,scalar_conj_product_op>::ReturnType result_type;

  EIGEN_EMPTY_STRUCT_CTOR(scalar_conj_product_op)
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const
  { return conj_helper<LhsScalar,RhsScalar,Conj,false>().pmul(a,b); }

  template<typename Packet>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
  { return conj_helper<Packet,Packet,Conj,false>().pmul(a,b); }
};

template<typename LhsScalar,typename RhsScalar>
struct functor_traits<scalar_conj_product_op<LhsScalar,RhsScalar> > {
  enum {
    Cost = NumTraits<LhsScalar>::MulCost,
    PacketAccess = internal::is_same<LhsScalar, RhsScalar>::value && packet_traits<LhsScalar>::HasMul
  };
};
/** \internal
* \brief Template functor to compute the min of two scalars
*
* \sa class CwiseBinaryOp, MatrixBase::cwiseMin, class VectorwiseOp, MatrixBase::minCoeff()
*/
template<typename LhsScalar,typename RhsScalar>
struct scalar_min_op : binary_op_base<LhsScalar,RhsScalar>
{
  typedef typename ScalarBinaryOpTraits<LhsScalar,RhsScalar,scalar_min_op>::ReturnType result_type;
  EIGEN_EMPTY_STRUCT_CTOR(scalar_min_op)
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return numext::mini(a, b); }
  template<typename Packet>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
  { return internal::pmin(a,b); }
  template<typename Packet>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type predux(const Packet& a) const
  { return internal::predux_min(a); }
};

template<typename LhsScalar,typename RhsScalar>
struct functor_traits<scalar_min_op<LhsScalar,RhsScalar> > {
  enum {
    Cost = (NumTraits<LhsScalar>::AddCost+NumTraits<RhsScalar>::AddCost)/2,
    PacketAccess = internal::is_same<LhsScalar, RhsScalar>::value && packet_traits<LhsScalar>::HasMin
  };
};
/** \internal
* \brief Template functor to compute the max of two scalars
*
* \sa class CwiseBinaryOp, MatrixBase::cwiseMax, class VectorwiseOp, MatrixBase::maxCoeff()
*/
template<typename LhsScalar,typename RhsScalar>
struct scalar_max_op : binary_op_base<LhsScalar,RhsScalar>
{
  typedef typename ScalarBinaryOpTraits<LhsScalar,RhsScalar,scalar_max_op>::ReturnType result_type;
  EIGEN_EMPTY_STRUCT_CTOR(scalar_max_op)
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return numext::maxi(a, b); }
  template<typename Packet>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
  { return internal::pmax(a,b); }
  template<typename Packet>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type predux(const Packet& a) const
  { return internal::predux_max(a); }
};

template<typename LhsScalar,typename RhsScalar>
struct functor_traits<scalar_max_op<LhsScalar,RhsScalar> > {
  enum {
    Cost = (NumTraits<LhsScalar>::AddCost+NumTraits<RhsScalar>::AddCost)/2,
    PacketAccess = internal::is_same<LhsScalar, RhsScalar>::value && packet_traits<LhsScalar>::HasMax
  };
};
/** \internal
* \brief Template functors for comparison of two scalars
* \todo Implement packet-comparisons
*/
template<typename LhsScalar, typename RhsScalar, ComparisonName cmp> struct scalar_cmp_op;

template<typename LhsScalar, typename RhsScalar, ComparisonName cmp>
struct functor_traits<scalar_cmp_op<LhsScalar,RhsScalar, cmp> > {
  enum {
    Cost = (NumTraits<LhsScalar>::AddCost+NumTraits<RhsScalar>::AddCost)/2,
    PacketAccess = false
  };
};

template<ComparisonName Cmp, typename LhsScalar, typename RhsScalar>
struct result_of<scalar_cmp_op<LhsScalar, RhsScalar, Cmp>(LhsScalar,RhsScalar)> {
  typedef bool type;
};
template<typename LhsScalar, typename RhsScalar>
struct scalar_cmp_op<LhsScalar,RhsScalar, cmp_EQ> : binary_op_base<LhsScalar,RhsScalar>
{
  typedef bool result_type;
  EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op)
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const LhsScalar& a, const RhsScalar& b) const { return a==b; }
};

template<typename LhsScalar, typename RhsScalar>
struct scalar_cmp_op<LhsScalar,RhsScalar, cmp_LT> : binary_op_base<LhsScalar,RhsScalar>
{
  typedef bool result_type;
  EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op)
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const LhsScalar& a, const RhsScalar& b) const { return a<b; }
};

template<typename LhsScalar, typename RhsScalar>
struct scalar_cmp_op<LhsScalar,RhsScalar, cmp_LE> : binary_op_base<LhsScalar,RhsScalar>
{
  typedef bool result_type;
  EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op)
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const LhsScalar& a, const RhsScalar& b) const { return a<=b; }
};

template<typename LhsScalar, typename RhsScalar>
struct scalar_cmp_op<LhsScalar,RhsScalar, cmp_GT> : binary_op_base<LhsScalar,RhsScalar>
{
  typedef bool result_type;
  EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op)
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const LhsScalar& a, const RhsScalar& b) const { return a>b; }
};

template<typename LhsScalar, typename RhsScalar>
struct scalar_cmp_op<LhsScalar,RhsScalar, cmp_GE> : binary_op_base<LhsScalar,RhsScalar>
{
  typedef bool result_type;
  EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op)
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const LhsScalar& a, const RhsScalar& b) const { return a>=b; }
};

template<typename LhsScalar, typename RhsScalar>
struct scalar_cmp_op<LhsScalar,RhsScalar, cmp_UNORD> : binary_op_base<LhsScalar,RhsScalar>
{
  typedef bool result_type;
  EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op)
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const LhsScalar& a, const RhsScalar& b) const { return !(a<=b || b<=a); }
};

template<typename LhsScalar, typename RhsScalar>
struct scalar_cmp_op<LhsScalar,RhsScalar, cmp_NEQ> : binary_op_base<LhsScalar,RhsScalar>
{
  typedef bool result_type;
  EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op)
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const LhsScalar& a, const RhsScalar& b) const { return a!=b; }
};
/** \internal
* \brief Template functor to compute the hypot of two scalars
*
* \sa MatrixBase::stableNorm(), class Redux
*/
template<typename Scalar>
struct scalar_hypot_op<Scalar,Scalar> : binary_op_base<Scalar,Scalar>
{
  EIGEN_EMPTY_STRUCT_CTOR(scalar_hypot_op)
  // typedef typename NumTraits<Scalar>::Real result_type;
  EIGEN_DEVICE_FUNC
  EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& _x, const Scalar& _y) const
  {
    EIGEN_USING_STD_MATH(sqrt)
    Scalar p, qp;
    if(_x>_y)
    {
      p = _x;
      qp = _y / p;
    }
    else
    {
      p = _y;
      qp = _x / p;
    }
    return p * sqrt(Scalar(1) + qp*qp);
  }
};

template<typename Scalar>
struct functor_traits<scalar_hypot_op<Scalar,Scalar> > {
  enum
  {
    Cost = 3 * NumTraits<Scalar>::AddCost +
           2 * NumTraits<Scalar>::MulCost +
           2 * scalar_div_cost<Scalar,false>::value,
    PacketAccess = false
  };
};
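// The branch in scalar_hypot_op above implements the overflow-avoiding form
//   hypot(x,y) = p * sqrt(1 + (q/p)^2)   with p the larger and q the smaller operand
// (the code divides by the larger value instead of squaring both operands directly).
// Worked example: x=3, y=4 gives p=4, qp=0.75 and 4*sqrt(1+0.5625) = 5.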
/** \internal
* \brief Template functor to compute the pow of two scalars
*/
template<typename Scalar, typename Exponent>
struct scalar_pow_op : binary_op_base<Scalar,Exponent>
{
  typedef typename ScalarBinaryOpTraits<Scalar,Exponent,scalar_pow_op>::ReturnType result_type;
#ifndef EIGEN_SCALAR_BINARY_OP_PLUGIN
  EIGEN_EMPTY_STRUCT_CTOR(scalar_pow_op)
#else
  scalar_pow_op() {
    typedef Scalar LhsScalar;
    typedef Exponent RhsScalar;
    EIGEN_SCALAR_BINARY_OP_PLUGIN
  }
#endif
  EIGEN_DEVICE_FUNC
  inline result_type operator() (const Scalar& a, const Exponent& b) const { return numext::pow(a, b); }
};

template<typename Scalar, typename Exponent>
struct functor_traits<scalar_pow_op<Scalar,Exponent> > {
  enum {
    Cost = 5 * NumTraits<Scalar>::MulCost,
    PacketAccess = false
  };
};
//---------- non associative binary functors ----------
/** \internal
* \brief Template functor to compute the difference of two scalars
*
* \sa class CwiseBinaryOp, MatrixBase::operator-
*/
template<typename LhsScalar,typename RhsScalar>
struct scalar_difference_op : binary_op_base<LhsScalar,RhsScalar>
{
  typedef typename ScalarBinaryOpTraits<LhsScalar,RhsScalar,scalar_difference_op>::ReturnType result_type;
#ifndef EIGEN_SCALAR_BINARY_OP_PLUGIN
  EIGEN_EMPTY_STRUCT_CTOR(scalar_difference_op)
#else
  scalar_difference_op() {
    EIGEN_SCALAR_BINARY_OP_PLUGIN
  }
#endif
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a - b; }
  template<typename Packet>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
  { return internal::psub(a,b); }
};

template<typename LhsScalar,typename RhsScalar>
struct functor_traits<scalar_difference_op<LhsScalar,RhsScalar> > {
  enum {
    Cost = (NumTraits<LhsScalar>::AddCost+NumTraits<RhsScalar>::AddCost)/2,
    PacketAccess = is_same<LhsScalar,RhsScalar>::value && packet_traits<LhsScalar>::HasSub && packet_traits<RhsScalar>::HasSub
  };
};
/** \internal
* \brief Template functor to compute the quotient of two scalars
*
* \sa class CwiseBinaryOp, Cwise::operator/()
*/
template<typename LhsScalar,typename RhsScalar>
struct scalar_quotient_op : binary_op_base<LhsScalar,RhsScalar>
{
  typedef typename ScalarBinaryOpTraits<LhsScalar,RhsScalar,scalar_quotient_op>::ReturnType result_type;
#ifndef EIGEN_SCALAR_BINARY_OP_PLUGIN
  EIGEN_EMPTY_STRUCT_CTOR(scalar_quotient_op)
#else
  scalar_quotient_op() { EIGEN_SCALAR_BINARY_OP_PLUGIN }
#endif
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a / b; }
  template<typename Packet>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
  { return internal::pdiv(a,b); }
};
template<typename LhsScalar,typename RhsScalar>
struct functor_traits<scalar_quotient_op<LhsScalar,RhsScalar> > {
  typedef typename scalar_quotient_op<LhsScalar,RhsScalar>::result_type result_type;
  enum {
    PacketAccess = is_same<LhsScalar,RhsScalar>::value && packet_traits<LhsScalar>::HasDiv && packet_traits<RhsScalar>::HasDiv,
    Cost = scalar_div_cost<result_type,PacketAccess>::value
  };
};
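scalar_quotient_op backs the coefficient-wise division entry points. A small sketch using only the public API (values arbitrary):

#include <Eigen/Core>
#include <iostream>

int main() {
  Eigen::ArrayXf num(3), den(3);
  num << 1, 4, 9;
  den << 2, 4, 3;
  std::cout << (num / den) << std::endl;        // Array operator/ dispatches to scalar_quotient_op
  Eigen::Vector3f v(1, 4, 9), w(2, 4, 3);
  std::cout << v.cwiseQuotient(w) << std::endl; // same functor on matrices
}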
/** \internal
* \brief Template functor to compute the and of two booleans
*
* \sa class CwiseBinaryOp, ArrayBase::operator&&
*/
struct scalar_boolean_and_op {
  EIGEN_EMPTY_STRUCT_CTOR(scalar_boolean_and_op)
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator() (const bool& a, const bool& b) const { return a && b; }
};
template<> struct functor_traits<scalar_boolean_and_op> {
  enum { Cost = NumTraits<bool>::AddCost, PacketAccess = false };
};

/** \internal
  * \brief Template functor to compute the or of two booleans
  *
  * \sa class CwiseBinaryOp, ArrayBase::operator||
  */
struct scalar_boolean_or_op {
  EIGEN_EMPTY_STRUCT_CTOR(scalar_boolean_or_op)
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator() (const bool& a, const bool& b) const { return a || b; }
};
template<> struct functor_traits<scalar_boolean_or_op> {
  enum { Cost = NumTraits<bool>::AddCost, PacketAccess = false };
};

/** \internal
  * \brief Template functor to compute the xor of two booleans
  *
  * \sa class CwiseBinaryOp, ArrayBase::operator^
  */
struct scalar_boolean_xor_op {
  EIGEN_EMPTY_STRUCT_CTOR(scalar_boolean_xor_op)
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator() (const bool& a, const bool& b) const { return a ^ b; }
};
template<> struct functor_traits<scalar_boolean_xor_op> {
  enum { Cost = NumTraits<bool>::AddCost, PacketAccess = false };
};
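These boolean functors are what ArrayBase's logical operators dispatch to once a comparison has produced a boolean array. A quick sketch:

#include <Eigen/Core>
#include <iostream>

int main() {
  Eigen::ArrayXf a(3); a << -1.f, 0.5f, 2.f;
  std::cout << ((a > 0.f) && (a < 1.f)) << std::endl; // scalar_boolean_and_op, coefficient-wise
  std::cout << ((a > 0.f) || (a < 1.f)) << std::endl; // scalar_boolean_or_op
}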
//---------- binary functors bound to a constant, thus appearing as a unary functor ----------
// The following two classes permits to turn any binary functor into a unary one with one argument bound to a constant value.
// They are analogues to std::binder1st/binder2nd but with the following differences:
// - they are compatible with packetOp
// - they are portable across C++ versions (the std::binder* are deprecated in C++11)
template<typename BinaryOp> struct bind1st_op : BinaryOp {

  typedef typename BinaryOp::first_argument_type  first_argument_type;
  typedef typename BinaryOp::second_argument_type second_argument_type;
  typedef typename BinaryOp::result_type          result_type;

  bind1st_op(const first_argument_type &val) : m_value(val) {}

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const second_argument_type& b) const { return BinaryOp::operator()(m_value,b); }

  template<typename Packet>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& b) const
  { return BinaryOp::packetOp(internal::pset1<Packet>(m_value), b); }

  first_argument_type m_value;
};
template<typename BinaryOp> struct functor_traits<bind1st_op<BinaryOp> > : functor_traits<BinaryOp> {};


template<typename BinaryOp> struct bind2nd_op : BinaryOp {

  typedef typename BinaryOp::first_argument_type  first_argument_type;
  typedef typename BinaryOp::second_argument_type second_argument_type;
  typedef typename BinaryOp::result_type          result_type;

  bind2nd_op(const second_argument_type &val) : m_value(val) {}

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const first_argument_type& a) const { return BinaryOp::operator()(a,m_value); }

  template<typename Packet>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const
  { return BinaryOp::packetOp(a, internal::pset1<Packet>(m_value)); }

  second_argument_type m_value;
};
template<typename BinaryOp> struct functor_traits<bind2nd_op<BinaryOp> > : functor_traits<BinaryOp> {};

} // end namespace internal

} // end namespace Eigen

#endif // EIGEN_BINARY_FUNCTORS_H
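The binder classes above turn a binary functor into a unary one while keeping packet support. A minimal sketch of the mechanism, applied through the public unaryExpr API; note that bind2nd_op lives in Eigen::internal, so this is only an illustration of how the pieces fit together, not a recommended user-facing pattern (a - 10 does the same thing directly):

#include <Eigen/Core>
#include <iostream>

int main() {
  using namespace Eigen;
  ArrayXf a(4); a << 1, 2, 3, 4;
  // Bind the second argument of the difference functor to 10, i.e. x -> x - 10.
  internal::bind2nd_op<internal::scalar_difference_op<float,float> > minus10(10.f);
  std::cout << a.unaryExpr(minus10) << std::endl;
}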
external/eigen3/Eigen/src/Core/functors/NullaryFunctors.h
0 → 100644
View file @
a394b22a
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008-2016 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_NULLARY_FUNCTORS_H
#define EIGEN_NULLARY_FUNCTORS_H
namespace Eigen {

namespace internal {

template<typename Scalar>
struct scalar_constant_op {
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_constant_op(const scalar_constant_op& other) : m_other(other.m_other) { }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_constant_op(const Scalar& other) : m_other(other) { }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() () const { return m_other; }
  template<typename PacketType>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PacketType packetOp() const { return internal::pset1<PacketType>(m_other); }
  const Scalar m_other;
};
template<typename Scalar>
struct functor_traits<scalar_constant_op<Scalar> > {
  enum { Cost = 0 /* as the constant value should be loaded in register only once for the whole expression */,
         PacketAccess = packet_traits<Scalar>::Vectorizable, IsRepeatable = true };
};

template<typename Scalar> struct scalar_identity_op {
  EIGEN_EMPTY_STRUCT_CTOR(scalar_identity_op)
  template<typename IndexType>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (IndexType row, IndexType col) const { return row==col ? Scalar(1) : Scalar(0); }
};
template<typename Scalar>
struct functor_traits<scalar_identity_op<Scalar> >
{ enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = false, IsRepeatable = true }; };

template <typename Scalar, typename Packet, bool IsInteger> struct linspaced_op_impl;

template <typename Scalar, typename Packet>
struct linspaced_op_impl<Scalar,Packet,/*IsInteger*/false>
{
  linspaced_op_impl(const Scalar& low, const Scalar& high, Index num_steps) :
    m_low(low), m_high(high), m_size1(num_steps==1 ? 1 : num_steps-1),
    m_step(num_steps==1 ? Scalar() : (high-low)/Scalar(num_steps-1)),
    m_flip(numext::abs(high)<numext::abs(low))
  {}

  template<typename IndexType>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (IndexType i) const {
    typedef typename NumTraits<Scalar>::Real RealScalar;
    if(m_flip)
      return (i==0)? m_low : (m_high - RealScalar(m_size1-i)*m_step);
    else
      return (i==m_size1)? m_high : (m_low + RealScalar(i)*m_step);
  }

  template<typename IndexType>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(IndexType i) const
  {
    // Principle:
    // [low, ..., low] + ( [step, ..., step] * ( [i, ..., i] + [0, ..., size] ) )
    if(m_flip)
    {
      Packet pi = plset<Packet>(Scalar(i-m_size1));
      Packet res = padd(pset1<Packet>(m_high), pmul(pset1<Packet>(m_step), pi));
      if(i==0)
        res = pinsertfirst(res, m_low);
      return res;
    }
    else
    {
      Packet pi = plset<Packet>(Scalar(i));
      Packet res = padd(pset1<Packet>(m_low), pmul(pset1<Packet>(m_step), pi));
      if(i==m_size1-unpacket_traits<Packet>::size+1)
        res = pinsertlast(res, m_high);
      return res;
    }
  }

  const Scalar m_low;
  const Scalar m_high;
  const Index  m_size1;
  const Scalar m_step;
  const bool   m_flip;
};

template <typename Scalar, typename Packet>
struct linspaced_op_impl<Scalar,Packet,/*IsInteger*/true>
{
  linspaced_op_impl(const Scalar& low, const Scalar& high, Index num_steps) :
    m_low(low),
    m_multiplier((high-low)/convert_index<Scalar>(num_steps<=1 ? 1 : num_steps-1)),
    m_divisor(convert_index<Scalar>((high>=low?num_steps:-num_steps)+(high-low))/((numext::abs(high-low)+1)==0?1:(numext::abs(high-low)+1))),
    m_use_divisor(num_steps>1 && (numext::abs(high-low)+1)<num_steps)
  {}

  template<typename IndexType>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  const Scalar operator() (IndexType i) const
  {
    if(m_use_divisor) return m_low + convert_index<Scalar>(i)/m_divisor;
    else              return m_low + convert_index<Scalar>(i)*m_multiplier;
  }

  const Scalar m_low;
  const Scalar m_multiplier;
  const Scalar m_divisor;
  const bool   m_use_divisor;
};

// ----- Linspace functor ----------------------------------------------------------------

// Forward declaration (we default to random access which does not really give
// us a speed gain when using packet access but it allows to use the functor in
// nested expressions).
template <typename Scalar, typename PacketType> struct linspaced_op;
template <typename Scalar, typename PacketType> struct functor_traits< linspaced_op<Scalar,PacketType> >
{
  enum
  {
    Cost = 1,
    PacketAccess = (!NumTraits<Scalar>::IsInteger) && packet_traits<Scalar>::HasSetLinear && packet_traits<Scalar>::HasBlend,
                   /*&& ((!NumTraits<Scalar>::IsInteger) || packet_traits<Scalar>::HasDiv),*/ // <- vectorization for integer is currently disabled
    IsRepeatable = true
  };
};
template <typename Scalar, typename PacketType> struct linspaced_op
{
  linspaced_op(const Scalar& low, const Scalar& high, Index num_steps)
    : impl((num_steps==1 ? high : low),high,num_steps)
  {}

  template<typename IndexType>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (IndexType i) const { return impl(i); }

  template<typename Packet,typename IndexType>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(IndexType i) const { return impl.packetOp(i); }

  // This proxy object handles the actual required temporaries and the different
  // implementations (integer vs. floating point).
  const linspaced_op_impl<Scalar,PacketType,NumTraits<Scalar>::IsInteger> impl;
};

// Linear access is automatically determined from the operator() prototypes available for the given functor.
// If it exposes an operator()(i,j), then we assume the i and j coefficients are required independently
// and linear access is not possible. In all other cases, linear access is enabled.
// Users should not have to deal with this structure.
template<typename Functor> struct functor_has_linear_access { enum { ret = !has_binary_operator<Functor>::value }; };

// For unreliable compilers, let's specialize the has_*ary_operator
// helpers so that at least built-in nullary functors work fine.
#if !( (EIGEN_COMP_MSVC>1600) || (EIGEN_GNUC_AT_LEAST(4,8)) || (EIGEN_COMP_ICC>=1600))
template<typename Scalar,typename IndexType>
struct has_nullary_operator<scalar_constant_op<Scalar>,IndexType> { enum { value = 1 }; };
template<typename Scalar,typename IndexType>
struct has_unary_operator<scalar_constant_op<Scalar>,IndexType> { enum { value = 0 }; };
template<typename Scalar,typename IndexType>
struct has_binary_operator<scalar_constant_op<Scalar>,IndexType> { enum { value = 0 }; };

template<typename Scalar,typename IndexType>
struct has_nullary_operator<scalar_identity_op<Scalar>,IndexType> { enum { value = 0 }; };
template<typename Scalar,typename IndexType>
struct has_unary_operator<scalar_identity_op<Scalar>,IndexType> { enum { value = 0 }; };
template<typename Scalar,typename IndexType>
struct has_binary_operator<scalar_identity_op<Scalar>,IndexType> { enum { value = 1 }; };

template<typename Scalar, typename PacketType,typename IndexType>
struct has_nullary_operator<linspaced_op<Scalar,PacketType>,IndexType> { enum { value = 0 }; };
template<typename Scalar, typename PacketType,typename IndexType>
struct has_unary_operator<linspaced_op<Scalar,PacketType>,IndexType> { enum { value = 1 }; };
template<typename Scalar, typename PacketType,typename IndexType>
struct has_binary_operator<linspaced_op<Scalar,PacketType>,IndexType> { enum { value = 0 }; };

template<typename Scalar,typename IndexType>
struct has_nullary_operator<scalar_random_op<Scalar>,IndexType> { enum { value = 1 }; };
template<typename Scalar,typename IndexType>
struct has_unary_operator<scalar_random_op<Scalar>,IndexType> { enum { value = 0 }; };
template<typename Scalar,typename IndexType>
struct has_binary_operator<scalar_random_op<Scalar>,IndexType> { enum { value = 0 }; };
#endif

} // end namespace internal

} // end namespace Eigen

#endif // EIGEN_NULLARY_FUNCTORS_H
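linspaced_op is the functor behind the public LinSpaced factories. A short usage sketch:

#include <Eigen/Core>
#include <iostream>

int main() {
  // 5 evenly spaced values from 0 to 1; the last coefficient is exactly `high`,
  // matching the (i == m_size1) special case handled in linspaced_op_impl above.
  Eigen::VectorXd v = Eigen::VectorXd::LinSpaced(5, 0.0, 1.0);
  std::cout << v.transpose() << std::endl; // 0 0.25 0.5 0.75 1
}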
external/eigen3/Eigen/src/Core/functors/StlFunctors.h
0 → 100644
View file @
a394b22a
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_STL_FUNCTORS_H
#define EIGEN_STL_FUNCTORS_H
namespace Eigen {

namespace internal {

// default functor traits for STL functors:

template<typename T>
struct functor_traits<std::multiplies<T> >
{ enum { Cost = NumTraits<T>::MulCost, PacketAccess = false }; };

template<typename T>
struct functor_traits<std::divides<T> >
{ enum { Cost = NumTraits<T>::MulCost, PacketAccess = false }; };

template<typename T>
struct functor_traits<std::plus<T> >
{ enum { Cost = NumTraits<T>::AddCost, PacketAccess = false }; };

template<typename T>
struct functor_traits<std::minus<T> >
{ enum { Cost = NumTraits<T>::AddCost, PacketAccess = false }; };

template<typename T>
struct functor_traits<std::negate<T> >
{ enum { Cost = NumTraits<T>::AddCost, PacketAccess = false }; };

template<typename T>
struct functor_traits<std::logical_or<T> >
{ enum { Cost = 1, PacketAccess = false }; };

template<typename T>
struct functor_traits<std::logical_and<T> >
{ enum { Cost = 1, PacketAccess = false }; };

template<typename T>
struct functor_traits<std::logical_not<T> >
{ enum { Cost = 1, PacketAccess = false }; };

template<typename T>
struct functor_traits<std::greater<T> >
{ enum { Cost = 1, PacketAccess = false }; };

template<typename T>
struct functor_traits<std::less<T> >
{ enum { Cost = 1, PacketAccess = false }; };

template<typename T>
struct functor_traits<std::greater_equal<T> >
{ enum { Cost = 1, PacketAccess = false }; };

template<typename T>
struct functor_traits<std::less_equal<T> >
{ enum { Cost = 1, PacketAccess = false }; };

template<typename T>
struct functor_traits<std::equal_to<T> >
{ enum { Cost = 1, PacketAccess = false }; };

template<typename T>
struct functor_traits<std::not_equal_to<T> >
{ enum { Cost = 1, PacketAccess = false }; };

#if (__cplusplus < 201103L) && (EIGEN_COMP_MSVC <= 1900)
// std::binder* are deprecated since c++11 and will be removed in c++17
template<typename T>
struct functor_traits<std::binder2nd<T> >
{ enum { Cost = functor_traits<T>::Cost, PacketAccess = false }; };

template<typename T>
struct functor_traits<std::binder1st<T> >
{ enum { Cost = functor_traits<T>::Cost, PacketAccess = false }; };
#endif

template<typename T>
struct functor_traits<std::unary_negate<T> >
{ enum { Cost = 1 + functor_traits<T>::Cost, PacketAccess = false }; };

template<typename T>
struct functor_traits<std::binary_negate<T> >
{ enum { Cost = 1 + functor_traits<T>::Cost, PacketAccess = false }; };

#ifdef EIGEN_STDEXT_SUPPORT

template<typename T0,typename T1>
struct functor_traits<std::project1st<T0,T1> >
{ enum { Cost = 0, PacketAccess = false }; };

template<typename T0,typename T1>
struct functor_traits<std::project2nd<T0,T1> >
{ enum { Cost = 0, PacketAccess = false }; };

template<typename T0,typename T1>
struct functor_traits<std::select2nd<std::pair<T0,T1> > >
{ enum { Cost = 0, PacketAccess = false }; };

template<typename T0,typename T1>
struct functor_traits<std::select1st<std::pair<T0,T1> > >
{ enum { Cost = 0, PacketAccess = false }; };

template<typename T0,typename T1>
struct functor_traits<std::unary_compose<T0,T1> >
{ enum { Cost = functor_traits<T0>::Cost + functor_traits<T1>::Cost, PacketAccess = false }; };

template<typename T0,typename T1,typename T2>
struct functor_traits<std::binary_compose<T0,T1,T2> >
{ enum { Cost = functor_traits<T0>::Cost + functor_traits<T1>::Cost + functor_traits<T2>::Cost, PacketAccess = false }; };

#endif // EIGEN_STDEXT_SUPPORT

// allow to add new functors and specializations of functor_traits from outside Eigen.
// this macro is really needed because functor_traits must be specialized after it is declared but before it is used...
#ifdef EIGEN_FUNCTORS_PLUGIN
#include EIGEN_FUNCTORS_PLUGIN
#endif

} // end namespace internal

} // end namespace Eigen

#endif // EIGEN_STL_FUNCTORS_H
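These traits are what allow standard library functors to appear inside Eigen expressions (they supply the Cost/PacketAccess information the evaluator needs). A small sketch:

#include <Eigen/Core>
#include <functional>
#include <iostream>

int main() {
  Eigen::ArrayXd a(3), b(3);
  a << 1, 2, 3;
  b << 4, 5, 6;
  // std::plus<double> is usable here because functor_traits<std::plus<T> > is specialized above.
  std::cout << a.binaryExpr(b, std::plus<double>()) << std::endl;
}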
external/eigen3/Eigen/src/Eigen2Support/Macros.h → external/eigen3/Eigen/src/Core/functors/TernaryFunctors.h
View file @
a394b22a
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2011 Benoit Jacob <jacob.benoit.1@gmail.com>
// Copyright (C) 2016 Eugene Brevdo <ebrevdo@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN2_MACROS_H
#define EIGEN2_MACROS_H
#ifndef EIGEN_TERNARY_FUNCTORS_H
#define EIGEN_TERNARY_FUNCTORS_H

#define ei_assert eigen_assert
#define ei_internal_assert eigen_internal_assert
namespace Eigen {

#define EIGEN_ALIGN_128 EIGEN_ALIGN16
namespace internal {

#define EIGEN_ARCH_WANTS_ALIGNMENT EIGEN_ALIGN_STATICALLY
//---------- associative ternary functors ----------

#endif // EIGEN2_MACROS_H
} // end namespace internal

} // end namespace Eigen

#endif // EIGEN_TERNARY_FUNCTORS_H
external/eigen3/Eigen/src/Core/functors/UnaryFunctors.h
0 → 100644
View file @
a394b22a
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008-2016 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_UNARY_FUNCTORS_H
#define EIGEN_UNARY_FUNCTORS_H
namespace Eigen {

namespace internal {

/** \internal
  * \brief Template functor to compute the opposite of a scalar
  *
  * \sa class CwiseUnaryOp, MatrixBase::operator-
  */
template<typename Scalar> struct scalar_opposite_op {
  EIGEN_EMPTY_STRUCT_CTOR(scalar_opposite_op)
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { return -a; }
  template<typename Packet>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const { return internal::pnegate(a); }
};
template<typename Scalar>
struct functor_traits<scalar_opposite_op<Scalar> >
{ enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = packet_traits<Scalar>::HasNegate }; };

/** \internal
  * \brief Template functor to compute the absolute value of a scalar
  *
  * \sa class CwiseUnaryOp, Cwise::abs
  */
template<typename Scalar> struct scalar_abs_op {
  EIGEN_EMPTY_STRUCT_CTOR(scalar_abs_op)
  typedef typename NumTraits<Scalar>::Real result_type;
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const Scalar& a) const { return numext::abs(a); }
  template<typename Packet>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const { return internal::pabs(a); }
};
template<typename Scalar>
struct functor_traits<scalar_abs_op<Scalar> >
{ enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = packet_traits<Scalar>::HasAbs }; };

/** \internal
  * \brief Template functor to compute the score of a scalar, to chose a pivot
  *
  * \sa class CwiseUnaryOp
  */
template<typename Scalar> struct scalar_score_coeff_op : scalar_abs_op<Scalar>
{
  typedef void Score_is_abs;
};
template<typename Scalar>
struct functor_traits<scalar_score_coeff_op<Scalar> > : functor_traits<scalar_abs_op<Scalar> > {};

/* Avoid recomputing abs when we know the score and they are the same. Not a true Eigen functor.  */
template<typename Scalar, typename=void> struct abs_knowing_score
{
  EIGEN_EMPTY_STRUCT_CTOR(abs_knowing_score)
  typedef typename NumTraits<Scalar>::Real result_type;
  template<typename Score>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const Scalar& a, const Score&) const { return numext::abs(a); }
};
template<typename Scalar> struct abs_knowing_score<Scalar, typename scalar_score_coeff_op<Scalar>::Score_is_abs>
{
  EIGEN_EMPTY_STRUCT_CTOR(abs_knowing_score)
  typedef typename NumTraits<Scalar>::Real result_type;
  template<typename Scal>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const Scal&, const result_type& a) const { return a; }
};

/** \internal
  * \brief Template functor to compute the squared absolute value of a scalar
  *
  * \sa class CwiseUnaryOp, Cwise::abs2
  */
template<typename Scalar> struct scalar_abs2_op {
  EIGEN_EMPTY_STRUCT_CTOR(scalar_abs2_op)
  typedef typename NumTraits<Scalar>::Real result_type;
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const Scalar& a) const { return numext::abs2(a); }
  template<typename Packet>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const { return internal::pmul(a,a); }
};
template<typename Scalar>
struct functor_traits<scalar_abs2_op<Scalar> >
{ enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasAbs2 }; };

/** \internal
  * \brief Template functor to compute the conjugate of a complex value
  *
  * \sa class CwiseUnaryOp, MatrixBase::conjugate()
  */
template<typename Scalar> struct scalar_conjugate_op {
  EIGEN_EMPTY_STRUCT_CTOR(scalar_conjugate_op)
  EIGEN_DEVICE_FUNC
  EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { using numext::conj; return conj(a); }
  template<typename Packet>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const { return internal::pconj(a); }
};
template<typename Scalar>
struct functor_traits<scalar_conjugate_op<Scalar> >
{
  enum {
    Cost = NumTraits<Scalar>::IsComplex ? NumTraits<Scalar>::AddCost : 0,
    PacketAccess = packet_traits<Scalar>::HasConj
  };
};

/** \internal
  * \brief Template functor to compute the phase angle of a complex
  *
  * \sa class CwiseUnaryOp, Cwise::arg
  */
template<typename Scalar> struct scalar_arg_op {
  EIGEN_EMPTY_STRUCT_CTOR(scalar_arg_op)
  typedef typename NumTraits<Scalar>::Real result_type;
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const Scalar& a) const { using numext::arg; return arg(a); }
  template<typename Packet>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const { return internal::parg(a); }
};
template<typename Scalar>
struct functor_traits<scalar_arg_op<Scalar> >
{
  enum {
    Cost = NumTraits<Scalar>::IsComplex ? 5 * NumTraits<Scalar>::MulCost : NumTraits<Scalar>::AddCost,
    PacketAccess = packet_traits<Scalar>::HasArg
  };
};

/** \internal
  * \brief Template functor to cast a scalar to another type
  *
  * \sa class CwiseUnaryOp, MatrixBase::cast()
  */
template<typename Scalar, typename NewType>
struct scalar_cast_op {
  EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
  typedef NewType result_type;
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const NewType operator() (const Scalar& a) const { return cast<Scalar, NewType>(a); }
};
template<typename Scalar, typename NewType>
struct functor_traits<scalar_cast_op<Scalar,NewType> >
{ enum { Cost = is_same<Scalar, NewType>::value ? 0 : NumTraits<NewType>::AddCost, PacketAccess = false }; };

/** \internal
  * \brief Template functor to extract the real part of a complex
  *
  * \sa class CwiseUnaryOp, MatrixBase::real()
  */
template<typename Scalar>
struct scalar_real_op {
  EIGEN_EMPTY_STRUCT_CTOR(scalar_real_op)
  typedef typename NumTraits<Scalar>::Real result_type;
  EIGEN_DEVICE_FUNC
  EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const { return numext::real(a); }
};
template<typename Scalar>
struct functor_traits<scalar_real_op<Scalar> >
{ enum { Cost = 0, PacketAccess = false }; };

/** \internal
  * \brief Template functor to extract the imaginary part of a complex
  *
  * \sa class CwiseUnaryOp, MatrixBase::imag()
  */
template<typename Scalar>
struct scalar_imag_op {
  EIGEN_EMPTY_STRUCT_CTOR(scalar_imag_op)
  typedef typename NumTraits<Scalar>::Real result_type;
  EIGEN_DEVICE_FUNC
  EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const { return numext::imag(a); }
};
template<typename Scalar>
struct functor_traits<scalar_imag_op<Scalar> >
{ enum { Cost = 0, PacketAccess = false }; };

/** \internal
  * \brief Template functor to extract the real part of a complex as a reference
  *
  * \sa class CwiseUnaryOp, MatrixBase::real()
  */
template<typename Scalar>
struct scalar_real_ref_op {
  EIGEN_EMPTY_STRUCT_CTOR(scalar_real_ref_op)
  typedef typename NumTraits<Scalar>::Real result_type;
  EIGEN_DEVICE_FUNC
  EIGEN_STRONG_INLINE result_type& operator() (const Scalar& a) const { return numext::real_ref(*const_cast<Scalar*>(&a)); }
};
template<typename Scalar>
struct functor_traits<scalar_real_ref_op<Scalar> >
{ enum { Cost = 0, PacketAccess = false }; };

/** \internal
  * \brief Template functor to extract the imaginary part of a complex as a reference
  *
  * \sa class CwiseUnaryOp, MatrixBase::imag()
  */
template<typename Scalar>
struct scalar_imag_ref_op {
  EIGEN_EMPTY_STRUCT_CTOR(scalar_imag_ref_op)
  typedef typename NumTraits<Scalar>::Real result_type;
  EIGEN_DEVICE_FUNC
  EIGEN_STRONG_INLINE result_type& operator() (const Scalar& a) const { return numext::imag_ref(*const_cast<Scalar*>(&a)); }
};
template<typename Scalar>
struct functor_traits<scalar_imag_ref_op<Scalar> >
{ enum { Cost = 0, PacketAccess = false }; };

/** \internal
  *
  * \brief Template functor to compute the exponential of a scalar
  *
  * \sa class CwiseUnaryOp, Cwise::exp()
  */
template<typename Scalar> struct scalar_exp_op {
  EIGEN_EMPTY_STRUCT_CTOR(scalar_exp_op)
  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::exp(a); }
  template <typename Packet>
  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pexp(a); }
};
template <typename Scalar>
struct functor_traits<scalar_exp_op<Scalar> > {
  enum {
    PacketAccess = packet_traits<Scalar>::HasExp,
    // The following numbers are based on the AVX implementation.
#ifdef EIGEN_VECTORIZE_FMA
    // Haswell can issue 2 add/mul/madd per cycle.
    Cost =
    (sizeof(Scalar) == 4
     // float: 8 pmadd, 4 pmul, 2 padd/psub, 6 other
     ? (8 * NumTraits<Scalar>::AddCost + 6 * NumTraits<Scalar>::MulCost)
     // double: 7 pmadd, 5 pmul, 3 padd/psub, 1 div, 13 other
     : (14 * NumTraits<Scalar>::AddCost +
        6 * NumTraits<Scalar>::MulCost +
        scalar_div_cost<Scalar,packet_traits<Scalar>::HasDiv>::value))
#else
    Cost =
    (sizeof(Scalar) == 4
     // float: 7 pmadd, 6 pmul, 4 padd/psub, 10 other
     ? (21 * NumTraits<Scalar>::AddCost + 13 * NumTraits<Scalar>::MulCost)
     // double: 7 pmadd, 5 pmul, 3 padd/psub, 1 div, 13 other
     : (23 * NumTraits<Scalar>::AddCost +
        12 * NumTraits<Scalar>::MulCost +
        scalar_div_cost<Scalar,packet_traits<Scalar>::HasDiv>::value))
#endif
  };
};
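A quick sketch of the public entry point this functor serves:

#include <Eigen/Core>
#include <iostream>

int main() {
  Eigen::ArrayXf x(3); x << 0.f, 1.f, 2.f;
  // Coefficient-wise exponential; vectorized via pexp when HasExp is set for the scalar type.
  std::cout << x.exp() << std::endl;
}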
/** \internal
  *
  * \brief Template functor to compute the logarithm of a scalar
  *
  * \sa class CwiseUnaryOp, ArrayBase::log()
  */
template<typename Scalar> struct scalar_log_op {
  EIGEN_EMPTY_STRUCT_CTOR(scalar_log_op)
  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::log(a); }
  template <typename Packet>
  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::plog(a); }
};
template <typename Scalar>
struct functor_traits<scalar_log_op<Scalar> > {
  enum {
    PacketAccess = packet_traits<Scalar>::HasLog,
    Cost =
    (PacketAccess
     // The following numbers are based on the AVX implementation.
#ifdef EIGEN_VECTORIZE_FMA
     // 8 pmadd, 6 pmul, 8 padd/psub, 16 other, can issue 2 add/mul/madd per cycle.
     ? (20 * NumTraits<Scalar>::AddCost + 7 * NumTraits<Scalar>::MulCost)
#else
     // 8 pmadd, 6 pmul, 8 padd/psub, 20 other
     ? (36 * NumTraits<Scalar>::AddCost + 14 * NumTraits<Scalar>::MulCost)
#endif
     // Measured cost of std::log.
     : sizeof(Scalar)==4 ? 40 : 85)
  };
};

/** \internal
  *
  * \brief Template functor to compute the logarithm of 1 plus a scalar value
  *
  * \sa class CwiseUnaryOp, ArrayBase::log1p()
  */
template<typename Scalar> struct scalar_log1p_op {
  EIGEN_EMPTY_STRUCT_CTOR(scalar_log1p_op)
  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::log1p(a); }
  template <typename Packet>
  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::plog1p(a); }
};
template <typename Scalar>
struct functor_traits<scalar_log1p_op<Scalar> > {
  enum {
    PacketAccess = packet_traits<Scalar>::HasLog1p,
    Cost = functor_traits<scalar_log_op<Scalar> >::Cost // TODO measure cost of log1p
  };
};

/** \internal
  *
  * \brief Template functor to compute the base-10 logarithm of a scalar
  *
  * \sa class CwiseUnaryOp, Cwise::log10()
  */
template<typename Scalar> struct scalar_log10_op {
  EIGEN_EMPTY_STRUCT_CTOR(scalar_log10_op)
  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { EIGEN_USING_STD_MATH(log10) return log10(a); }
  template <typename Packet>
  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::plog10(a); }
};
template<typename Scalar>
struct functor_traits<scalar_log10_op<Scalar> >
{ enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasLog10 }; };

/** \internal
  * \brief Template functor to compute the square root of a scalar
  * \sa class CwiseUnaryOp, Cwise::sqrt()
  */
template<typename Scalar> struct scalar_sqrt_op {
  EIGEN_EMPTY_STRUCT_CTOR(scalar_sqrt_op)
  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::sqrt(a); }
  template <typename Packet>
  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::psqrt(a); }
};
template <typename Scalar>
struct functor_traits<scalar_sqrt_op<Scalar> > {
  enum {
#if EIGEN_FAST_MATH
    // The following numbers are based on the AVX implementation.
    Cost = (sizeof(Scalar) == 8 ? 28
                                // 4 pmul, 1 pmadd, 3 other
                                : (3 * NumTraits<Scalar>::AddCost +
                                   5 * NumTraits<Scalar>::MulCost)),
#else
    // The following numbers are based on min VSQRT throughput on Haswell.
    Cost = (sizeof(Scalar) == 8 ? 28 : 14),
#endif
    PacketAccess = packet_traits<Scalar>::HasSqrt
  };
};

/** \internal
  * \brief Template functor to compute the reciprocal square root of a scalar
  * \sa class CwiseUnaryOp, Cwise::rsqrt()
  */
template<typename Scalar> struct scalar_rsqrt_op {
  EIGEN_EMPTY_STRUCT_CTOR(scalar_rsqrt_op)
  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return Scalar(1)/numext::sqrt(a); }
  template <typename Packet>
  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::prsqrt(a); }
};
template<typename Scalar>
struct functor_traits<scalar_rsqrt_op<Scalar> >
{ enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasRsqrt }; };

/** \internal
  * \brief Template functor to compute the cosine of a scalar
  * \sa class CwiseUnaryOp, ArrayBase::cos()
  */
template<typename Scalar> struct scalar_cos_op {
  EIGEN_EMPTY_STRUCT_CTOR(scalar_cos_op)
  EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return numext::cos(a); }
  template <typename Packet>
  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pcos(a); }
};
template<typename Scalar>
struct functor_traits<scalar_cos_op<Scalar> >
{ enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasCos }; };

/** \internal
  * \brief Template functor to compute the sine of a scalar
  * \sa class CwiseUnaryOp, ArrayBase::sin()
  */
template<typename Scalar> struct scalar_sin_op {
  EIGEN_EMPTY_STRUCT_CTOR(scalar_sin_op)
  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::sin(a); }
  template <typename Packet>
  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::psin(a); }
};
template<typename Scalar>
struct functor_traits<scalar_sin_op<Scalar> >
{ enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasSin }; };

/** \internal
  * \brief Template functor to compute the tan of a scalar
  * \sa class CwiseUnaryOp, ArrayBase::tan()
  */
template<typename Scalar> struct scalar_tan_op {
  EIGEN_EMPTY_STRUCT_CTOR(scalar_tan_op)
  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::tan(a); }
  template <typename Packet>
  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::ptan(a); }
};
template<typename Scalar>
struct functor_traits<scalar_tan_op<Scalar> >
{ enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasTan }; };

/** \internal
  * \brief Template functor to compute the arc cosine of a scalar
  * \sa class CwiseUnaryOp, ArrayBase::acos()
  */
template<typename Scalar> struct scalar_acos_op {
  EIGEN_EMPTY_STRUCT_CTOR(scalar_acos_op)
  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::acos(a); }
  template <typename Packet>
  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pacos(a); }
};
template<typename Scalar>
struct functor_traits<scalar_acos_op<Scalar> >
{ enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasACos }; };

/** \internal
  * \brief Template functor to compute the arc sine of a scalar
  * \sa class CwiseUnaryOp, ArrayBase::asin()
  */
template<typename Scalar> struct scalar_asin_op {
  EIGEN_EMPTY_STRUCT_CTOR(scalar_asin_op)
  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::asin(a); }
  template <typename Packet>
  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pasin(a); }
};
template<typename Scalar>
struct functor_traits<scalar_asin_op<Scalar> >
{ enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasASin }; };

/** \internal
  * \brief Template functor to compute the atan of a scalar
  * \sa class CwiseUnaryOp, ArrayBase::atan()
  */
template<typename Scalar> struct scalar_atan_op {
  EIGEN_EMPTY_STRUCT_CTOR(scalar_atan_op)
  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::atan(a); }
  template <typename Packet>
  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::patan(a); }
};
template<typename Scalar>
struct functor_traits<scalar_atan_op<Scalar> >
{ enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasATan }; };

/** \internal
  * \brief Template functor to compute the tanh of a scalar
  * \sa class CwiseUnaryOp, ArrayBase::tanh()
  */
template <typename Scalar>
struct scalar_tanh_op {
  EIGEN_EMPTY_STRUCT_CTOR(scalar_tanh_op)
  EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::tanh(a); }
  template <typename Packet>
  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& x) const { return ptanh(x); }
};
template <typename Scalar>
struct functor_traits<scalar_tanh_op<Scalar> > {
  enum {
    PacketAccess = packet_traits<Scalar>::HasTanh,
    Cost = ( (EIGEN_FAST_MATH && is_same<Scalar,float>::value)
// The following numbers are based on the AVX implementation,
#ifdef EIGEN_VECTORIZE_FMA
                // Haswell can issue 2 add/mul/madd per cycle.
                // 9 pmadd, 2 pmul, 1 div, 2 other
                ? (2 * NumTraits<Scalar>::AddCost +
                   6 * NumTraits<Scalar>::MulCost +
                   scalar_div_cost<Scalar,packet_traits<Scalar>::HasDiv>::value)
#else
                ? (11 * NumTraits<Scalar>::AddCost +
                   11 * NumTraits<Scalar>::MulCost +
                   scalar_div_cost<Scalar,packet_traits<Scalar>::HasDiv>::value)
#endif
                // This number assumes a naive implementation of tanh
                : (6 * NumTraits<Scalar>::AddCost +
                   3 * NumTraits<Scalar>::MulCost +
                   2 * scalar_div_cost<Scalar,packet_traits<Scalar>::HasDiv>::value +
                   functor_traits<scalar_exp_op<Scalar> >::Cost))
  };
};

/** \internal
  * \brief Template functor to compute the sinh of a scalar
  * \sa class CwiseUnaryOp, ArrayBase::sinh()
  */
template<typename Scalar> struct scalar_sinh_op {
  EIGEN_EMPTY_STRUCT_CTOR(scalar_sinh_op)
  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::sinh(a); }
  template <typename Packet>
  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::psinh(a); }
};
template<typename Scalar>
struct functor_traits<scalar_sinh_op<Scalar> >
{ enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasSinh }; };

/** \internal
  * \brief Template functor to compute the cosh of a scalar
  * \sa class CwiseUnaryOp, ArrayBase::cosh()
  */
template<typename Scalar> struct scalar_cosh_op {
  EIGEN_EMPTY_STRUCT_CTOR(scalar_cosh_op)
  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::cosh(a); }
  template <typename Packet>
  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pcosh(a); }
};
template<typename Scalar>
struct functor_traits<scalar_cosh_op<Scalar> >
{ enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasCosh }; };

/** \internal
  * \brief Template functor to compute the inverse of a scalar
  * \sa class CwiseUnaryOp, Cwise::inverse()
  */
template<typename Scalar>
struct scalar_inverse_op {
  EIGEN_EMPTY_STRUCT_CTOR(scalar_inverse_op)
  EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return Scalar(1)/a; }
  template<typename Packet>
  EIGEN_DEVICE_FUNC inline const Packet packetOp(const Packet& a) const
  { return internal::pdiv(pset1<Packet>(Scalar(1)),a); }
};
template<typename Scalar>
struct functor_traits<scalar_inverse_op<Scalar> >
{ enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasDiv }; };

/** \internal
  * \brief Template functor to compute the square of a scalar
  * \sa class CwiseUnaryOp, Cwise::square()
  */
template<typename Scalar>
struct scalar_square_op {
  EIGEN_EMPTY_STRUCT_CTOR(scalar_square_op)
  EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return a*a; }
  template<typename Packet>
  EIGEN_DEVICE_FUNC inline const Packet packetOp(const Packet& a) const
  { return internal::pmul(a,a); }
};
template<typename Scalar>
struct functor_traits<scalar_square_op<Scalar> >
{ enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasMul }; };

/** \internal
  * \brief Template functor to compute the cube of a scalar
  * \sa class CwiseUnaryOp, Cwise::cube()
  */
template<typename Scalar>
struct scalar_cube_op {
  EIGEN_EMPTY_STRUCT_CTOR(scalar_cube_op)
  EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return a*a*a; }
  template<typename Packet>
  EIGEN_DEVICE_FUNC inline const Packet packetOp(const Packet& a) const
  { return internal::pmul(a,pmul(a,a)); }
};
template<typename Scalar>
struct functor_traits<scalar_cube_op<Scalar> >
{ enum { Cost = 2*NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasMul }; };

/** \internal
  * \brief Template functor to compute the rounded value of a scalar
  * \sa class CwiseUnaryOp, ArrayBase::round()
  */
template<typename Scalar> struct scalar_round_op {
  EIGEN_EMPTY_STRUCT_CTOR(scalar_round_op)
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { return numext::round(a); }
  template <typename Packet>
  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pround(a); }
};
template<typename Scalar>
struct functor_traits<scalar_round_op<Scalar> >
{ enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasRound }; };

/** \internal
  * \brief Template functor to compute the floor of a scalar
  * \sa class CwiseUnaryOp, ArrayBase::floor()
  */
template<typename Scalar> struct scalar_floor_op {
  EIGEN_EMPTY_STRUCT_CTOR(scalar_floor_op)
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { return numext::floor(a); }
  template <typename Packet>
  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pfloor(a); }
};
template<typename Scalar>
struct functor_traits<scalar_floor_op<Scalar> >
{ enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasFloor }; };

/** \internal
  * \brief Template functor to compute the ceil of a scalar
  * \sa class CwiseUnaryOp, ArrayBase::ceil()
  */
template<typename Scalar> struct scalar_ceil_op {
  EIGEN_EMPTY_STRUCT_CTOR(scalar_ceil_op)
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { return numext::ceil(a); }
  template <typename Packet>
  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pceil(a); }
};
template<typename Scalar>
struct functor_traits<scalar_ceil_op<Scalar> >
{ enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasCeil }; };

/** \internal
  * \brief Template functor to compute whether a scalar is NaN
  * \sa class CwiseUnaryOp, ArrayBase::isnan()
  */
template<typename Scalar> struct scalar_isnan_op {
  EIGEN_EMPTY_STRUCT_CTOR(scalar_isnan_op)
  typedef bool result_type;
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const { return (numext::isnan)(a); }
};
template<typename Scalar>
struct functor_traits<scalar_isnan_op<Scalar> >
{ enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = false }; };

/** \internal
  * \brief Template functor to check whether a scalar is +/-inf
  * \sa class CwiseUnaryOp, ArrayBase::isinf()
  */
template<typename Scalar> struct scalar_isinf_op {
  EIGEN_EMPTY_STRUCT_CTOR(scalar_isinf_op)
  typedef bool result_type;
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const { return (numext::isinf)(a); }
};
template<typename Scalar>
struct functor_traits<scalar_isinf_op<Scalar> >
{ enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = false }; };

/** \internal
  * \brief Template functor to check whether a scalar has a finite value
  * \sa class CwiseUnaryOp, ArrayBase::isfinite()
  */
template<typename Scalar> struct scalar_isfinite_op {
  EIGEN_EMPTY_STRUCT_CTOR(scalar_isfinite_op)
  typedef bool result_type;
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const { return (numext::isfinite)(a); }
};
template<typename Scalar>
struct functor_traits<scalar_isfinite_op<Scalar> >
{ enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = false }; };

/** \internal
  * \brief Template functor to compute the logical not of a boolean
  *
  * \sa class CwiseUnaryOp, ArrayBase::operator!
  */
template<typename Scalar> struct scalar_boolean_not_op {
  EIGEN_EMPTY_STRUCT_CTOR(scalar_boolean_not_op)
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator() (const bool& a) const { return !a; }
};
template<typename Scalar>
struct functor_traits<scalar_boolean_not_op<Scalar> >
{ enum { Cost = NumTraits<bool>::AddCost, PacketAccess = false }; };

/** \internal
  * \brief Template functor to compute the signum of a scalar
  * \sa class CwiseUnaryOp, Cwise::sign()
  */
template<typename Scalar,bool iscpx=(NumTraits<Scalar>::IsComplex!=0) > struct scalar_sign_op;
template<typename Scalar>
struct scalar_sign_op<Scalar,false> {
  EIGEN_EMPTY_STRUCT_CTOR(scalar_sign_op)
  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const
  {
      return Scalar( (a>Scalar(0)) - (a<Scalar(0)) );
  }
  //TODO
  //template <typename Packet>
  //EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::psign(a); }
};
template<typename Scalar>
struct scalar_sign_op<Scalar,true> {
  EIGEN_EMPTY_STRUCT_CTOR(scalar_sign_op)
  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const
  {
    typedef typename NumTraits<Scalar>::Real real_type;
    real_type aa = numext::abs(a);
    if (aa==real_type(0))
      return Scalar(0);
    aa = real_type(1)/aa;
    return Scalar(real(a)*aa, imag(a)*aa );
  }
  //TODO
  //template <typename Packet>
  //EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::psign(a); }
};
template<typename Scalar>
struct functor_traits<scalar_sign_op<Scalar> >
{ enum {
    Cost =
        NumTraits<Scalar>::IsComplex
        ? ( 8*NumTraits<Scalar>::MulCost ) // roughly
        : ( 3*NumTraits<Scalar>::AddCost),
    PacketAccess = packet_traits<Scalar>::HasSign
  };
};

} // end namespace internal

} // end namespace Eigen

#endif // EIGEN_FUNCTORS_H
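The same pattern used throughout this file (an operator() plus an optional packetOp, described by functor_traits) is also how user code plugs into CwiseUnaryOp via unaryExpr. A minimal sketch with a hypothetical custom scalar-only functor (no packet path, so it will not be vectorized):

#include <Eigen/Core>
#include <algorithm>
#include <iostream>

// Hypothetical user-defined functor: clamps a value into [0, 1].
struct clamp01_functor {
  double operator()(double x) const { return std::min(1.0, std::max(0.0, x)); }
};

int main() {
  Eigen::ArrayXd a(4); a << -0.5, 0.2, 0.9, 1.7;
  std::cout << a.unaryExpr(clamp01_functor()) << std::endl; // 0 0.2 0.9 1
}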
external/eigen3/Eigen/src/Core/products/CMakeLists.txt
deleted
100644 → 0
View file @
aa523d16
FILE(GLOB Eigen_Core_Product_SRCS "*.h")

INSTALL(FILES
  ${Eigen_Core_Product_SRCS}
  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Core/products COMPONENT Devel
  )
external/eigen3/Eigen/src/Core/products/CoeffBasedProduct.h
deleted
100644 → 0
View file @
aa523d16
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_COEFFBASED_PRODUCT_H
#define EIGEN_COEFFBASED_PRODUCT_H
namespace Eigen {

namespace internal {

/*********************************************************************************
*  Coefficient based product implementation.
*  It is designed for the following use cases:
*  - small fixed sizes
*  - lazy products
*********************************************************************************/

/* Since the all the dimensions of the product are small, here we can rely
 * on the generic Assign mechanism to evaluate the product per coeff (or packet).
 *
 * Note that here the inner-loops should always be unrolled.
 */

template<int Traversal, int UnrollingIndex, typename Lhs, typename Rhs, typename RetScalar>
struct product_coeff_impl;

template<int StorageOrder, int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode>
struct product_packet_impl;

template<typename LhsNested, typename RhsNested, int NestingFlags>
struct traits<CoeffBasedProduct<LhsNested,RhsNested,NestingFlags> >
{
  typedef MatrixXpr XprKind;
  typedef typename remove_all<LhsNested>::type _LhsNested;
  typedef typename remove_all<RhsNested>::type _RhsNested;
  typedef typename scalar_product_traits<typename _LhsNested::Scalar, typename _RhsNested::Scalar>::ReturnType Scalar;
  typedef typename promote_storage_type<typename traits<_LhsNested>::StorageKind,
                                        typename traits<_RhsNested>::StorageKind>::ret StorageKind;
  typedef typename promote_index_type<typename traits<_LhsNested>::Index,
                                      typename traits<_RhsNested>::Index>::type Index;

  enum {
      LhsCoeffReadCost = _LhsNested::CoeffReadCost,
      RhsCoeffReadCost = _RhsNested::CoeffReadCost,
      LhsFlags = _LhsNested::Flags,
      RhsFlags = _RhsNested::Flags,

      RowsAtCompileTime = _LhsNested::RowsAtCompileTime,
      ColsAtCompileTime = _RhsNested::ColsAtCompileTime,
      InnerSize = EIGEN_SIZE_MIN_PREFER_FIXED(_LhsNested::ColsAtCompileTime, _RhsNested::RowsAtCompileTime),

      MaxRowsAtCompileTime = _LhsNested::MaxRowsAtCompileTime,
      MaxColsAtCompileTime = _RhsNested::MaxColsAtCompileTime,

      LhsRowMajor = LhsFlags & RowMajorBit,
      RhsRowMajor = RhsFlags & RowMajorBit,

      SameType = is_same<typename _LhsNested::Scalar,typename _RhsNested::Scalar>::value,

      CanVectorizeRhs = RhsRowMajor && (RhsFlags & PacketAccessBit)
                      && (ColsAtCompileTime == Dynamic
                          || ( (ColsAtCompileTime % packet_traits<Scalar>::size) == 0
                              && (RhsFlags&AlignedBit)
                             )
                         ),

      CanVectorizeLhs = (!LhsRowMajor) && (LhsFlags & PacketAccessBit)
                      && (RowsAtCompileTime == Dynamic
                          || ( (RowsAtCompileTime % packet_traits<Scalar>::size) == 0
                              && (LhsFlags&AlignedBit)
                             )
                         ),

      EvalToRowMajor = (MaxRowsAtCompileTime==1&&MaxColsAtCompileTime!=1) ? 1
                     : (MaxColsAtCompileTime==1&&MaxRowsAtCompileTime!=1) ? 0
                     : (RhsRowMajor && !CanVectorizeLhs),

      Flags = ((unsigned int)(LhsFlags | RhsFlags) & HereditaryBits & ~RowMajorBit)
            | (EvalToRowMajor ? RowMajorBit : 0)
            | NestingFlags
            | (LhsFlags & RhsFlags & AlignedBit)
            // TODO enable vectorization for mixed types
            | (SameType && (CanVectorizeLhs || CanVectorizeRhs) ? PacketAccessBit : 0),

      CoeffReadCost = InnerSize == Dynamic ? Dynamic
                    : InnerSize == 0 ? 0
                    : InnerSize * (NumTraits<Scalar>::MulCost + LhsCoeffReadCost + RhsCoeffReadCost)
                      + (InnerSize - 1) * NumTraits<Scalar>::AddCost,

      /* CanVectorizeInner deserves special explanation. It does not affect the product flags. It is not used outside
      * of Product. If the Product itself is not a packet-access expression, there is still a chance that the inner
      * loop of the product might be vectorized. This is the meaning of CanVectorizeInner. Since it doesn't affect
      * the Flags, it is safe to make this value depend on ActualPacketAccessBit, that doesn't affect the ABI.
      */
      CanVectorizeInner =    SameType
                          && LhsRowMajor
                          && (!RhsRowMajor)
                          && (LhsFlags & RhsFlags & ActualPacketAccessBit)
                          && (LhsFlags & RhsFlags & AlignedBit)
                          && (InnerSize % packet_traits<Scalar>::size == 0)
    };
};

} // end namespace internal

template<typename LhsNested, typename RhsNested, int NestingFlags>
class CoeffBasedProduct
  : internal::no_assignment_operator,
    public MatrixBase<CoeffBasedProduct<LhsNested, RhsNested, NestingFlags> >
{
  public:

    typedef MatrixBase<CoeffBasedProduct> Base;
    EIGEN_DENSE_PUBLIC_INTERFACE(CoeffBasedProduct)
    typedef typename Base::PlainObject PlainObject;

  private:

    typedef typename internal::traits<CoeffBasedProduct>::_LhsNested _LhsNested;
    typedef typename internal::traits<CoeffBasedProduct>::_RhsNested _RhsNested;

    enum {
      PacketSize = internal::packet_traits<Scalar>::size,
      InnerSize  = internal::traits<CoeffBasedProduct>::InnerSize,
      Unroll = CoeffReadCost != Dynamic && CoeffReadCost <= EIGEN_UNROLLING_LIMIT,
      CanVectorizeInner = internal::traits<CoeffBasedProduct>::CanVectorizeInner
    };

    typedef internal::product_coeff_impl<CanVectorizeInner ? InnerVectorizedTraversal : DefaultTraversal,
                                         Unroll ? InnerSize : Dynamic,
                                         _LhsNested, _RhsNested, Scalar> ScalarCoeffImpl;

    typedef CoeffBasedProduct<LhsNested,RhsNested,NestByRefBit> LazyCoeffBasedProductType;

  public:

    inline CoeffBasedProduct(const CoeffBasedProduct& other)
      : Base(), m_lhs(other.m_lhs), m_rhs(other.m_rhs)
    {}

    template<typename Lhs, typename Rhs>
    inline CoeffBasedProduct(const Lhs& lhs, const Rhs& rhs)
      : m_lhs(lhs), m_rhs(rhs)
    {
      // we don't allow taking products of matrices of different real types, as that wouldn't be vectorizable.
      // We still allow to mix T and complex<T>.
      EIGEN_STATIC_ASSERT((internal::scalar_product_traits<typename Lhs::RealScalar, typename Rhs::RealScalar>::Defined),
        YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
      eigen_assert(lhs.cols() == rhs.rows()
        && "invalid matrix product"
        && "if you wanted a coeff-wise or a dot product use the respective explicit functions");
    }

    EIGEN_STRONG_INLINE Index rows() const { return m_lhs.rows(); }
    EIGEN_STRONG_INLINE Index cols() const { return m_rhs.cols(); }

    EIGEN_STRONG_INLINE const Scalar coeff(Index row, Index col) const
    {
      Scalar res;
      ScalarCoeffImpl::run(row, col, m_lhs, m_rhs, res);
      return res;
    }

    /* Allow index-based non-packet access. It is impossible though to allow index-based packed access,
     * which is why we don't set the LinearAccessBit.
     */
    EIGEN_STRONG_INLINE const Scalar coeff(Index index) const
    {
      Scalar res;
      const Index row = RowsAtCompileTime == 1 ? 0 : index;
      const Index col = RowsAtCompileTime == 1 ? index : 0;
      ScalarCoeffImpl::run(row, col, m_lhs, m_rhs, res);
      return res;
    }

    template<int LoadMode>
    EIGEN_STRONG_INLINE const PacketScalar packet(Index row, Index col) const
    {
      PacketScalar res;
      internal::product_packet_impl<Flags&RowMajorBit ? RowMajor : ColMajor,
                                    Unroll ? InnerSize : Dynamic,
                                    _LhsNested, _RhsNested, PacketScalar, LoadMode>
        ::run(row, col, m_lhs, m_rhs, res);
      return res;
    }

    // Implicit conversion to the nested type (trigger the evaluation of the product)
    EIGEN_STRONG_INLINE operator const PlainObject& () const
    {
      m_result.lazyAssign(*this);
      return m_result;
    }

    const _LhsNested& lhs() const { return m_lhs; }
    const _RhsNested& rhs() const { return m_rhs; }

    const Diagonal<const LazyCoeffBasedProductType,0> diagonal() const
    { return reinterpret_cast<const LazyCoeffBasedProductType&>(*this); }

    template<int DiagonalIndex>
    const Diagonal<const LazyCoeffBasedProductType,DiagonalIndex> diagonal() const
    { return reinterpret_cast<const LazyCoeffBasedProductType&>(*this); }

    const Diagonal<const LazyCoeffBasedProductType,Dynamic> diagonal(Index index) const
    { return reinterpret_cast<const LazyCoeffBasedProductType&>(*this).diagonal(index); }

  protected:
    typename internal::add_const_on_value_type<LhsNested>::type m_lhs;
    typename internal::add_const_on_value_type<RhsNested>::type m_rhs;

    mutable PlainObject m_result;
};
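CoeffBasedProduct is the expression type behind small and lazy products in this (pre-3.3) code path; the implicit conversion above is what materializes it into a plain matrix. A sketch of the user-visible behaviour, written against the public API only:

#include <Eigen/Core>
#include <iostream>

int main() {
  Eigen::Matrix2d A, B;
  A << 1, 2, 3, 4;
  B << 5, 6, 7, 8;
  // For small fixed sizes the product is evaluated coefficient by coefficient with
  // the inner loop unrolled; lazyProduct() requests the non-cached path explicitly.
  Eigen::Matrix2d C = A.lazyProduct(B);
  std::cout << C << "\n\n" << (A * B) << std::endl; // same values either way
}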
namespace internal {

// here we need to overload the nested rule for products
// such that the nested type is a const reference to a plain matrix
template<typename Lhs, typename Rhs, int N, typename PlainObject>
struct nested<CoeffBasedProduct<Lhs,Rhs,EvalBeforeNestingBit|EvalBeforeAssigningBit>, N, PlainObject>
{
  typedef PlainObject const& type;
};

/***************************************************************************
* Normal product .coeff() implementation (with meta-unrolling)
***************************************************************************/

/**************************************
*** Scalar path  - no vectorization ***
**************************************/

template<int UnrollingIndex, typename Lhs, typename Rhs, typename RetScalar>
struct product_coeff_impl<DefaultTraversal, UnrollingIndex, Lhs, Rhs, RetScalar>
{
  typedef typename Lhs::Index Index;
  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, RetScalar &res)
  {
    product_coeff_impl<DefaultTraversal, UnrollingIndex-1, Lhs, Rhs, RetScalar>::run(row, col, lhs, rhs, res);
    res += lhs.coeff(row, UnrollingIndex-1) * rhs.coeff(UnrollingIndex-1, col);
  }
};

template<typename Lhs, typename Rhs, typename RetScalar>
struct product_coeff_impl<DefaultTraversal, 1, Lhs, Rhs, RetScalar>
{
  typedef typename Lhs::Index Index;
  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, RetScalar &res)
  {
    res = lhs.coeff(row, 0) * rhs.coeff(0, col);
  }
};

template<typename Lhs, typename Rhs, typename RetScalar>
struct product_coeff_impl<DefaultTraversal, 0, Lhs, Rhs, RetScalar>
{
  typedef typename Lhs::Index Index;
  static EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, const Rhs& /*rhs*/, RetScalar &res)
  {
    res = RetScalar(0);
  }
};

template<typename Lhs, typename Rhs, typename RetScalar>
struct product_coeff_impl<DefaultTraversal, Dynamic, Lhs, Rhs, RetScalar>
{
  typedef typename Lhs::Index Index;
  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, RetScalar& res)
  {
    res = (lhs.row(row).transpose().cwiseProduct( rhs.col(col) )).sum();
  }
};

/*******************************************
*** Scalar path with inner vectorization ***
*******************************************/

template<int UnrollingIndex, typename Lhs, typename Rhs, typename Packet>
struct product_coeff_vectorized_unroller
{
  typedef typename Lhs::Index Index;
  enum { PacketSize = packet_traits<typename Lhs::Scalar>::size };
  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, typename Lhs::PacketScalar &pres)
  {
    product_coeff_vectorized_unroller<UnrollingIndex-PacketSize, Lhs, Rhs, Packet>::run(row, col, lhs, rhs, pres);
    pres = padd(pres, pmul( lhs.template packet<Aligned>(row, UnrollingIndex) , rhs.template packet<Aligned>(UnrollingIndex, col) ));
  }
};

template<typename Lhs, typename Rhs, typename Packet>
struct product_coeff_vectorized_unroller<0, Lhs, Rhs, Packet>
{
  typedef typename Lhs::Index Index;
  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, typename Lhs::PacketScalar &pres)
  {
    pres = pmul(lhs.template packet<Aligned>(row, 0) , rhs.template packet<Aligned>(0, col));
  }
};

template<typename Lhs, typename Rhs, typename RetScalar>
struct product_coeff_impl<InnerVectorizedTraversal, 0, Lhs, Rhs, RetScalar>
{
  typedef typename Lhs::Index Index;
  static EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, const Rhs& /*rhs*/, RetScalar &res)
  {
    res = 0;
  }
};

template<int UnrollingIndex, typename Lhs, typename Rhs, typename RetScalar>
struct product_coeff_impl<InnerVectorizedTraversal, UnrollingIndex, Lhs, Rhs, RetScalar>
{
  typedef typename Lhs::PacketScalar Packet;
  typedef typename Lhs::Index Index;
  enum { PacketSize = packet_traits<typename Lhs::Scalar>::size };
  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, RetScalar &res)
  {
    Packet pres;
    product_coeff_vectorized_unroller<UnrollingIndex-PacketSize, Lhs, Rhs, Packet>::run(row, col, lhs, rhs, pres);
    res = predux(pres);
  }
};

template<typename Lhs, typename Rhs, int LhsRows = Lhs::RowsAtCompileTime, int RhsCols = Rhs::ColsAtCompileTime>
struct product_coeff_vectorized_dyn_selector
{
  typedef typename Lhs::Index Index;
  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
  {
    res = lhs.row(row).transpose().cwiseProduct(rhs.col(col)).sum();
  }
};
// NOTE the 3 following specializations are because taking .col(0) on a vector is a bit slower
// NOTE maybe they are now useless since we have a specialization for Block<Matrix>
template
<
typename
Lhs
,
typename
Rhs
,
int
RhsCols
>
struct
product_coeff_vectorized_dyn_selector
<
Lhs
,
Rhs
,
1
,
RhsCols
>
{
typedef
typename
Lhs
::
Index
Index
;
static
EIGEN_STRONG_INLINE
void
run
(
Index
/*row*/
,
Index
col
,
const
Lhs
&
lhs
,
const
Rhs
&
rhs
,
typename
Lhs
::
Scalar
&
res
)
{
res
=
lhs
.
transpose
().
cwiseProduct
(
rhs
.
col
(
col
)).
sum
();
}
};
template
<
typename
Lhs
,
typename
Rhs
,
int
LhsRows
>
struct
product_coeff_vectorized_dyn_selector
<
Lhs
,
Rhs
,
LhsRows
,
1
>
{
typedef
typename
Lhs
::
Index
Index
;
static
EIGEN_STRONG_INLINE
void
run
(
Index
row
,
Index
/*col*/
,
const
Lhs
&
lhs
,
const
Rhs
&
rhs
,
typename
Lhs
::
Scalar
&
res
)
{
res
=
lhs
.
row
(
row
).
transpose
().
cwiseProduct
(
rhs
).
sum
();
}
};
template
<
typename
Lhs
,
typename
Rhs
>
struct
product_coeff_vectorized_dyn_selector
<
Lhs
,
Rhs
,
1
,
1
>
{
typedef
typename
Lhs
::
Index
Index
;
static
EIGEN_STRONG_INLINE
void
run
(
Index
/*row*/
,
Index
/*col*/
,
const
Lhs
&
lhs
,
const
Rhs
&
rhs
,
typename
Lhs
::
Scalar
&
res
)
{
res
=
lhs
.
transpose
().
cwiseProduct
(
rhs
).
sum
();
}
};
template
<
typename
Lhs
,
typename
Rhs
,
typename
RetScalar
>
struct
product_coeff_impl
<
InnerVectorizedTraversal
,
Dynamic
,
Lhs
,
Rhs
,
RetScalar
>
{
typedef
typename
Lhs
::
Index
Index
;
static
EIGEN_STRONG_INLINE
void
run
(
Index
row
,
Index
col
,
const
Lhs
&
lhs
,
const
Rhs
&
rhs
,
typename
Lhs
::
Scalar
&
res
)
{
product_coeff_vectorized_dyn_selector
<
Lhs
,
Rhs
>::
run
(
row
,
col
,
lhs
,
rhs
,
res
);
}
};
/*******************
*** Packet path ***
*******************/
template
<
int
UnrollingIndex
,
typename
Lhs
,
typename
Rhs
,
typename
Packet
,
int
LoadMode
>
struct
product_packet_impl
<
RowMajor
,
UnrollingIndex
,
Lhs
,
Rhs
,
Packet
,
LoadMode
>
{
typedef
typename
Lhs
::
Index
Index
;
static
EIGEN_STRONG_INLINE
void
run
(
Index
row
,
Index
col
,
const
Lhs
&
lhs
,
const
Rhs
&
rhs
,
Packet
&
res
)
{
product_packet_impl
<
RowMajor
,
UnrollingIndex
-
1
,
Lhs
,
Rhs
,
Packet
,
LoadMode
>::
run
(
row
,
col
,
lhs
,
rhs
,
res
);
res
=
pmadd
(
pset1
<
Packet
>
(
lhs
.
coeff
(
row
,
UnrollingIndex
-
1
)),
rhs
.
template
packet
<
LoadMode
>(
UnrollingIndex
-
1
,
col
),
res
);
}
};
template
<
int
UnrollingIndex
,
typename
Lhs
,
typename
Rhs
,
typename
Packet
,
int
LoadMode
>
struct
product_packet_impl
<
ColMajor
,
UnrollingIndex
,
Lhs
,
Rhs
,
Packet
,
LoadMode
>
{
typedef
typename
Lhs
::
Index
Index
;
static
EIGEN_STRONG_INLINE
void
run
(
Index
row
,
Index
col
,
const
Lhs
&
lhs
,
const
Rhs
&
rhs
,
Packet
&
res
)
{
product_packet_impl
<
ColMajor
,
UnrollingIndex
-
1
,
Lhs
,
Rhs
,
Packet
,
LoadMode
>::
run
(
row
,
col
,
lhs
,
rhs
,
res
);
res
=
pmadd
(
lhs
.
template
packet
<
LoadMode
>(
row
,
UnrollingIndex
-
1
),
pset1
<
Packet
>
(
rhs
.
coeff
(
UnrollingIndex
-
1
,
col
)),
res
);
}
};
template
<
typename
Lhs
,
typename
Rhs
,
typename
Packet
,
int
LoadMode
>
struct
product_packet_impl
<
RowMajor
,
1
,
Lhs
,
Rhs
,
Packet
,
LoadMode
>
{
typedef
typename
Lhs
::
Index
Index
;
static
EIGEN_STRONG_INLINE
void
run
(
Index
row
,
Index
col
,
const
Lhs
&
lhs
,
const
Rhs
&
rhs
,
Packet
&
res
)
{
res
=
pmul
(
pset1
<
Packet
>
(
lhs
.
coeff
(
row
,
0
)),
rhs
.
template
packet
<
LoadMode
>(
0
,
col
));
}
};
template
<
typename
Lhs
,
typename
Rhs
,
typename
Packet
,
int
LoadMode
>
struct
product_packet_impl
<
ColMajor
,
1
,
Lhs
,
Rhs
,
Packet
,
LoadMode
>
{
typedef
typename
Lhs
::
Index
Index
;
static
EIGEN_STRONG_INLINE
void
run
(
Index
row
,
Index
col
,
const
Lhs
&
lhs
,
const
Rhs
&
rhs
,
Packet
&
res
)
{
res
=
pmul
(
lhs
.
template
packet
<
LoadMode
>(
row
,
0
),
pset1
<
Packet
>
(
rhs
.
coeff
(
0
,
col
)));
}
};
template
<
typename
Lhs
,
typename
Rhs
,
typename
Packet
,
int
LoadMode
>
struct
product_packet_impl
<
RowMajor
,
0
,
Lhs
,
Rhs
,
Packet
,
LoadMode
>
{
typedef
typename
Lhs
::
Index
Index
;
static
EIGEN_STRONG_INLINE
void
run
(
Index
/*row*/
,
Index
/*col*/
,
const
Lhs
&
/*lhs*/
,
const
Rhs
&
/*rhs*/
,
Packet
&
res
)
{
res
=
pset1
<
Packet
>
(
0
);
}
};
template
<
typename
Lhs
,
typename
Rhs
,
typename
Packet
,
int
LoadMode
>
struct
product_packet_impl
<
ColMajor
,
0
,
Lhs
,
Rhs
,
Packet
,
LoadMode
>
{
typedef
typename
Lhs
::
Index
Index
;
static
EIGEN_STRONG_INLINE
void
run
(
Index
/*row*/
,
Index
/*col*/
,
const
Lhs
&
/*lhs*/
,
const
Rhs
&
/*rhs*/
,
Packet
&
res
)
{
res
=
pset1
<
Packet
>
(
0
);
}
};
template
<
typename
Lhs
,
typename
Rhs
,
typename
Packet
,
int
LoadMode
>
struct
product_packet_impl
<
RowMajor
,
Dynamic
,
Lhs
,
Rhs
,
Packet
,
LoadMode
>
{
typedef
typename
Lhs
::
Index
Index
;
static
EIGEN_STRONG_INLINE
void
run
(
Index
row
,
Index
col
,
const
Lhs
&
lhs
,
const
Rhs
&
rhs
,
Packet
&
res
)
{
res
=
pset1
<
Packet
>
(
0
);
for
(
Index
i
=
0
;
i
<
lhs
.
cols
();
++
i
)
res
=
pmadd
(
pset1
<
Packet
>
(
lhs
.
coeff
(
row
,
i
)),
rhs
.
template
packet
<
LoadMode
>(
i
,
col
),
res
);
}
};
template
<
typename
Lhs
,
typename
Rhs
,
typename
Packet
,
int
LoadMode
>
struct
product_packet_impl
<
ColMajor
,
Dynamic
,
Lhs
,
Rhs
,
Packet
,
LoadMode
>
{
typedef
typename
Lhs
::
Index
Index
;
static
EIGEN_STRONG_INLINE
void
run
(
Index
row
,
Index
col
,
const
Lhs
&
lhs
,
const
Rhs
&
rhs
,
Packet
&
res
)
{
res
=
pset1
<
Packet
>
(
0
);
for
(
Index
i
=
0
;
i
<
lhs
.
cols
();
++
i
)
res
=
pmadd
(
lhs
.
template
packet
<
LoadMode
>(
row
,
i
),
pset1
<
Packet
>
(
rhs
.
coeff
(
i
,
col
)),
res
);
}
};
}
// end namespace internal
}
// end namespace Eigen
#endif // EIGEN_COEFFBASED_PRODUCT_H
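// --- Editor's note: standalone sketch, not part of Eigen or this patch. ---
// The coefficient-based path above relies on template recursion to expand the
// inner product lhs.row(row) * rhs.col(col) at compile time (product_coeff_impl
// with a fixed UnrollingIndex). The self-contained example below shows the same
// meta-unrolling idea with plain doubles and a minimal 3x3 matrix type; names
// such as dot_unroller and Mat3 are illustrative only.
#include <cstdio>

template<int K>
struct dot_unroller {
  template<typename Mat>
  static double run(int row, int col, const Mat& lhs, const Mat& rhs) {
    // accumulate term K-1 on top of the first K-1 terms
    return dot_unroller<K-1>::run(row, col, lhs, rhs) + lhs(row, K-1) * rhs(K-1, col);
  }
};

template<>
struct dot_unroller<1> {
  template<typename Mat>
  static double run(int row, int col, const Mat& lhs, const Mat& rhs) {
    return lhs(row, 0) * rhs(0, col);
  }
};

struct Mat3 {
  double d[3][3];
  double operator()(int r, int c) const { return d[r][c]; }
};

int main() {
  Mat3 a = {{{1,2,3},{4,5,6},{7,8,9}}};
  Mat3 b = {{{1,0,0},{0,1,0},{0,0,1}}};
  // coefficient (1,2) of a*b, with the inner dimension (3) fully unrolled
  std::printf("%g\n", dot_unroller<3>::run(1, 2, a, b)); // prints 6
  return 0;
}
// --- End of editor's sketch. ---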
external/eigen3/Eigen/src/Core/products/GeneralBlockPanelKernel.h
View file @ a394b22a
...
@@ -10,8 +9,9 @@
#ifndef EIGEN_GENERAL_BLOCK_PANEL_H
#define EIGEN_GENERAL_BLOCK_PANEL_H

namespace Eigen {
namespace Eigen {

namespace internal {

template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs=false, bool _ConjRhs=false>
...
@@ -24,29 +25,51 @@ inline std::ptrdiff_t manage_caching_sizes_helper(std::ptrdiff_t a, std::ptrdiff
  return a<=0 ? b : a;
}

#if EIGEN_ARCH_i386_OR_x86_64
const std::ptrdiff_t defaultL1CacheSize = 32*1024;
const std::ptrdiff_t defaultL2CacheSize = 256*1024;
const std::ptrdiff_t defaultL3CacheSize = 2*1024*1024;
#else
const std::ptrdiff_t defaultL1CacheSize = 16*1024;
const std::ptrdiff_t defaultL2CacheSize = 512*1024;
const std::ptrdiff_t defaultL3CacheSize = 512*1024;
#endif

/** \internal */
inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1=0, std::ptrdiff_t* l2=0)
{
  static std::ptrdiff_t m_l1CacheSize = 0;
  static std::ptrdiff_t m_l2CacheSize = 0;
  if(m_l2CacheSize==0)
  {
    m_l1CacheSize = manage_caching_sizes_helper(queryL1CacheSize(),8*1024);
    m_l2CacheSize = manage_caching_sizes_helper(queryTopLevelCacheSize(),1*1024*1024);
struct CacheSizes {
  CacheSizes(): m_l1(-1),m_l2(-1),m_l3(-1) {
    int l1CacheSize, l2CacheSize, l3CacheSize;
    queryCacheSizes(l1CacheSize, l2CacheSize, l3CacheSize);
    m_l1 = manage_caching_sizes_helper(l1CacheSize, defaultL1CacheSize);
    m_l2 = manage_caching_sizes_helper(l2CacheSize, defaultL2CacheSize);
    m_l3 = manage_caching_sizes_helper(l3CacheSize, defaultL3CacheSize);
  }

  std::ptrdiff_t m_l1;
  std::ptrdiff_t m_l2;
  std::ptrdiff_t m_l3;
};

/** \internal */
inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff_t* l2, std::ptrdiff_t* l3)
{
  static CacheSizes m_cacheSizes;

  if(action==SetAction)
  {
    // set the cpu cache size and cache all block sizes from a global cache size in byte
    eigen_internal_assert(l1!=0 && l2!=0);
    m_l1CacheSize = *l1;
    m_l2CacheSize = *l2;
    m_cacheSizes.m_l1 = *l1;
    m_cacheSizes.m_l2 = *l2;
    m_cacheSizes.m_l3 = *l3;
  }
  else if(action==GetAction)
  {
    eigen_internal_assert(l1!=0 && l2!=0);
    *l1 = m_l1CacheSize;
    *l2 = m_l2CacheSize;
    *l1 = m_cacheSizes.m_l1;
    *l2 = m_cacheSizes.m_l2;
    *l3 = m_cacheSizes.m_l3;
  }
  else
  {
...
@@ -54,6 +77,206 @@ inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1=0, std::ptrdi
  }
}

/* Helper for computeProductBlockingSizes.
 *
 * Given a m x k times k x n matrix product of scalar types \c LhsScalar and \c RhsScalar,
 * this function computes the blocking size parameters along the respective dimensions
 * for matrix products and related algorithms. The blocking sizes depends on various
 * parameters:
 * - the L1 and L2 cache sizes,
 * - the register level blocking sizes defined by gebp_traits,
 * - the number of scalars that fit into a packet (when vectorization is enabled).
 *
 * \sa setCpuCacheSizes */

template<typename LhsScalar, typename RhsScalar, int KcFactor, typename Index>
void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index num_threads = 1)
{
  typedef gebp_traits<LhsScalar,RhsScalar> Traits;

  // Explanations:
  // Let's recall that the product algorithms form mc x kc vertical panels A' on the lhs and
  // kc x nc blocks B' on the rhs. B' has to fit into L2/L3 cache. Moreover, A' is processed
  // per mr x kc horizontal small panels where mr is the blocking size along the m dimension
  // at the register level. This small horizontal panel has to stay within L1 cache.
  std::ptrdiff_t l1, l2, l3;
  manage_caching_sizes(GetAction, &l1, &l2, &l3);

  if (num_threads > 1) {
    typedef typename Traits::ResScalar ResScalar;
    enum {
      kdiv = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
      ksub = Traits::mr * Traits::nr * sizeof(ResScalar),
      kr = 8,
      mr = Traits::mr,
      nr = Traits::nr
    };
    // Increasing k gives us more time to prefetch the content of the "C"
    // registers. However once the latency is hidden there is no point in
    // increasing the value of k, so we'll cap it at 320 (value determined
    // experimentally).
    const Index k_cache = (numext::mini<Index>)((l1-ksub)/kdiv, 320);
    if (k_cache < k) {
      k = k_cache - (k_cache % kr);
      eigen_internal_assert(k > 0);
    }

    const Index n_cache = (l2-l1) / (nr * sizeof(RhsScalar) * k);
    const Index n_per_thread = numext::div_ceil(n, num_threads);
    if (n_cache <= n_per_thread) {
      // Don't exceed the capacity of the l2 cache.
      eigen_internal_assert(n_cache >= static_cast<Index>(nr));
      n = n_cache - (n_cache % nr);
      eigen_internal_assert(n > 0);
    } else {
      n = (numext::mini<Index>)(n, (n_per_thread + nr - 1) - ((n_per_thread + nr - 1) % nr));
    }

    if (l3 > l2) {
      // l3 is shared between all cores, so we'll give each thread its own chunk of l3.
      const Index m_cache = (l3-l2) / (sizeof(LhsScalar) * k * num_threads);
      const Index m_per_thread = numext::div_ceil(m, num_threads);
      if(m_cache < m_per_thread && m_cache >= static_cast<Index>(mr)) {
        m = m_cache - (m_cache % mr);
        eigen_internal_assert(m > 0);
      } else {
        m = (numext::mini<Index>)(m, (m_per_thread + mr - 1) - ((m_per_thread + mr - 1) % mr));
      }
    }
  }
  else {
    // In unit tests we do not want to use extra large matrices,
    // so we reduce the cache size to check the blocking strategy is not flawed
#ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
    l1 = 9*1024;
    l2 = 32*1024;
    l3 = 512*1024;
#endif

    // Early return for small problems because the computation below are time consuming for small problems.
    // Perhaps it would make more sense to consider k*n*m??
    // Note that for very tiny problem, this function should be bypassed anyway
    // because we use the coefficient-based implementation for them.
    if((numext::maxi)(k,(numext::maxi)(m,n))<48)
      return;

    typedef typename Traits::ResScalar ResScalar;
    enum {
      k_peeling = 8,
      k_div = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
      k_sub = Traits::mr * Traits::nr * sizeof(ResScalar)
    };

    // ---- 1st level of blocking on L1, yields kc ----

    // Blocking on the third dimension (i.e., k) is chosen so that an horizontal panel
    // of size mr x kc of the lhs plus a vertical panel of kc x nr of the rhs both fits within L1 cache.
    // We also include a register-level block of the result (mx x nr).
    // (In an ideal world only the lhs panel would stay in L1)
    // Moreover, kc has to be a multiple of 8 to be compatible with loop peeling, leading to a maximum blocking size of:
    const Index max_kc = numext::maxi<Index>(((l1-k_sub)/k_div) & (~(k_peeling-1)),1);
    const Index old_k = k;
    if(k>max_kc)
    {
      // We are really blocking on the third dimension:
      // -> reduce blocking size to make sure the last block is as large as possible
      //    while keeping the same number of sweeps over the result.
      k = (k%max_kc)==0 ? max_kc
                        : max_kc - k_peeling * ((max_kc-1-(k%max_kc))/(k_peeling*(k/max_kc+1)));

      eigen_internal_assert(((old_k/k) == (old_k/max_kc)) && "the number of sweeps has to remain the same");
    }

    // ---- 2nd level of blocking on max(L2,L3), yields nc ----

    // TODO find a reliable way to get the actual amount of cache per core to use for 2nd level blocking, that is:
    //      actual_l2 = max(l2, l3/nb_core_sharing_l3)
    // The number below is quite conservative: it is better to underestimate the cache size rather than overestimating it)
    // For instance, it corresponds to 6MB of L3 shared among 4 cores.
    #ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
    const Index actual_l2 = l3;
    #else
    const Index actual_l2 = 1572864; // == 1.5 MB
    #endif

    // Here, nc is chosen such that a block of kc x nc of the rhs fit within half of L2.
    // The second half is implicitly reserved to access the result and lhs coefficients.
    // When k<max_kc, then nc can arbitrarily growth. In practice, it seems to be fruitful
    // to limit this growth: we bound nc to growth by a factor x1.5.
    // However, if the entire lhs block fit within L1, then we are not going to block on the rows at all,
    // and it becomes fruitful to keep the packed rhs blocks in L1 if there is enough remaining space.
    Index max_nc;
    const Index lhs_bytes = m * k * sizeof(LhsScalar);
    const Index remaining_l1 = l1 - k_sub - lhs_bytes;
    if(remaining_l1 >= Index(Traits::nr*sizeof(RhsScalar))*k)
    {
      // L1 blocking
      max_nc = remaining_l1 / (k*sizeof(RhsScalar));
    }
    else
    {
      // L2 blocking
      max_nc = (3*actual_l2)/(2*2*max_kc*sizeof(RhsScalar));
    }
    // WARNING Below, we assume that Traits::nr is a power of two.
    Index nc = numext::mini<Index>(actual_l2/(2*k*sizeof(RhsScalar)), max_nc) & (~(Traits::nr-1));
    if(n>nc)
    {
      // We are really blocking over the columns:
      // -> reduce blocking size to make sure the last block is as large as possible
      //    while keeping the same number of sweeps over the packed lhs.
      //    Here we allow one more sweep if this gives us a perfect match, thus the commented "-1"
      n = (n%nc)==0 ? nc
                    : (nc - Traits::nr * ((nc/*-1*/-(n%nc))/(Traits::nr*(n/nc+1))));
    }
    else if(old_k==k)
    {
      // So far, no blocking at all, i.e., kc==k, and nc==n.
      // In this case, let's perform a blocking over the rows such that the packed lhs data is kept in cache L1/L2
      // TODO: part of this blocking strategy is now implemented within the kernel itself, so the L1-based heuristic here should be obsolete.
      Index problem_size = k*n*sizeof(LhsScalar);
      Index actual_lm = actual_l2;
      Index max_mc = m;
      if(problem_size<=1024)
      {
        // problem is small enough to keep in L1
        // Let's choose m such that lhs's block fit in 1/3 of L1
        actual_lm = l1;
      }
      else if(l3!=0 && problem_size<=32768)
      {
        // we have both L2 and L3, and problem is small enough to be kept in L2
        // Let's choose m such that lhs's block fit in 1/3 of L2
        actual_lm = l2;
        max_mc = (numext::mini<Index>)(576,max_mc);
      }
      Index mc = (numext::mini<Index>)(actual_lm/(3*k*sizeof(LhsScalar)), max_mc);
      if (mc > Traits::mr) mc -= mc % Traits::mr;
      else if (mc==0) return;
      m = (m%mc)==0 ? mc
                    : (mc - Traits::mr * ((mc/*-1*/-(m%mc))/(Traits::mr*(m/mc+1))));
    }
  }
}
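// --- Editor's note: simplified, self-contained sketch, not the Eigen implementation. ---
// It condenses the single-threaded branch above: pick kc so that an mr x kc lhs
// panel plus a kc x nr rhs panel (plus the mr x nr result block) fit in L1, then
// bound nc by L2 and mc by what remains. The register-block sizes mr=12, nr=4 and
// the 8-byte scalar size are illustrative assumptions, not values from gebp_traits.
#include <algorithm>
#include <cstdio>

struct Blocking { long kc, mc, nc; };

Blocking blocking_heuristic(long k, long m, long n,
                            long l1, long l2,
                            long mr, long nr, long scalar_bytes) {
  // kc: rounded down to a multiple of 8 for loop peeling, capped by k itself.
  long k_sub = mr * nr * scalar_bytes;
  long k_div = mr * scalar_bytes + nr * scalar_bytes;
  long kc = std::max(((l1 - k_sub) / k_div) & ~7L, 8L);
  kc = std::min(kc, k);

  // nc: a kc x nc rhs block should occupy about half of L2, multiple of nr.
  long nc = std::min((l2 / (2 * kc * scalar_bytes)) & ~(nr - 1), n);

  // mc: the rest of L2 expressed in lhs rows, multiple of mr.
  long mc = std::min(std::max((l2 / (3 * kc * scalar_bytes)) / mr * mr, mr), m);
  return Blocking{kc, mc, nc};
}

int main() {
  Blocking b = blocking_heuristic(/*k=*/4096, /*m=*/4096, /*n=*/4096,
                                  /*l1=*/32 * 1024, /*l2=*/256 * 1024,
                                  /*mr=*/12, /*nr=*/4, /*scalar_bytes=*/8);
  std::printf("kc=%ld mc=%ld nc=%ld\n", b.kc, b.mc, b.nc);
  return 0;
}
// --- End of editor's sketch. ---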
template <typename Index>
inline bool useSpecificBlockingSizes(Index& k, Index& m, Index& n)
{
#ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES
  if (EIGEN_TEST_SPECIFIC_BLOCKING_SIZES) {
    k = numext::mini<Index>(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K);
    m = numext::mini<Index>(m, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M);
    n = numext::mini<Index>(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N);
    return true;
  }
#else
  EIGEN_UNUSED_VARIABLE(k)
  EIGEN_UNUSED_VARIABLE(m)
  EIGEN_UNUSED_VARIABLE(n)
#endif
  return false;
}
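// --- Editor's note: configuration sketch, not part of this patch. ---
// One possible way to exercise the override above (as the test suite does) is to
// define these macros before any Eigen header is included; the heuristic is then
// bypassed and k, m, n are simply clamped to the prescribed values. The concrete
// numbers below are arbitrary examples.
// #define EIGEN_TEST_SPECIFIC_BLOCKING_SIZES 1
// #define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K 64
// #define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M 48
// #define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N 32
// --- End of editor's note. ---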
/** \brief Computes the blocking parameters for a m x k times k x n matrix product
  *
  * \param[in,out] k Input: the third dimension of the product. Output: the blocking size along the same dimension.
...
@@ -62,48 +285,30 @@ inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1=0, std::ptrdi
  *
  * Given a m x k times k x n matrix product of scalar types \c LhsScalar and \c RhsScalar,
  * this function computes the blocking size parameters along the respective dimensions
  * for matrix products and related algorithms. The blocking sizes depends on various
  * parameters:
  * - the L1 and L2 cache sizes,
  * - the register level blocking sizes defined by gebp_traits,
  * - the number of scalars that fit into a packet (when vectorization is enabled).
  * for matrix products and related algorithms.
  *
  * The blocking size parameters may be evaluated:
  *   - either by a heuristic based on cache sizes;
  *   - or using fixed prescribed values (for testing purposes).
  *
  * \sa setCpuCacheSizes */

template<typename LhsScalar, typename RhsScalar, int KcFactor, typename SizeType>
void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n)
{
  EIGEN_UNUSED_VARIABLE(n);
  // Explanations:
  // Let's recall the product algorithms form kc x nc horizontal panels B' on the rhs and
  // mc x kc blocks A' on the lhs. A' has to fit into L2 cache. Moreover, B' is processed
  // per kc x nr vertical small panels where nr is the blocking size along the n dimension
  // at the register level. For vectorization purpose, these small vertical panels are unpacked,
  // e.g., each coefficient is replicated to fit a packet. This small vertical panel has to
  // stay in L1 cache.
  std::ptrdiff_t l1, l2;

  typedef gebp_traits<LhsScalar,RhsScalar> Traits;
  enum {
    kdiv = KcFactor * 2 * Traits::nr * Traits::RhsProgress * sizeof(RhsScalar),
    mr = gebp_traits<LhsScalar,RhsScalar>::mr,
    mr_mask = (0xffffffff/mr)*mr
  };

  manage_caching_sizes(GetAction, &l1, &l2);
  k = std::min<SizeType>(k, l1/kdiv);
  SizeType _m = k>0 ? l2/(4 * sizeof(LhsScalar) * k) : 0;
  if(_m<m) m = _m & mr_mask;
template<typename LhsScalar, typename RhsScalar, int KcFactor, typename Index>
void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1)
{
  if (!useSpecificBlockingSizes(k, m, n)) {
    evaluateProductBlockingSizesHeuristic<LhsScalar, RhsScalar, KcFactor, Index>(k, m, n, num_threads);
  }
}

template<typename LhsScalar, typename RhsScalar, typename SizeType>
inline void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n)
template<typename LhsScalar, typename RhsScalar, typename Index>
inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1)
{
  computeProductBlockingSizes<LhsScalar,RhsScalar,1>(k, m, n);
  computeProductBlockingSizes<LhsScalar,RhsScalar,1,Index>(k, m, n, num_threads);
}
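// --- Editor's note: usage sketch, not part of this diff. ---
// Query the blocking sizes the new overload above would pick for a 1024^3 float
// product split over 4 threads. Assumes an Eigen checkout that already contains
// this patched header; KcFactor=1 matches the default of the convenience overload.
#include <Eigen/Core>
#include <cstdio>

int main() {
  std::ptrdiff_t k = 1024, m = 1024, n = 1024;
  // On return, k, m, n hold kc, mc, nc for the packed GEMM loops.
  Eigen::internal::computeProductBlockingSizes<float, float, 1, std::ptrdiff_t>(k, m, n, 4);
  std::printf("kc=%td mc=%td nc=%td\n", k, m, n);
  return 0;
}
// --- End of editor's sketch. ---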
#ifdef EIGEN_HAS_FUSE_CJMADD
  #define MADD(CJ,A,B,C,T)  C = CJ.pmadd(A,B,C);
#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
  #define CJMADD(CJ,A,B,C,T)  C = CJ.pmadd(A,B,C);
#else

  // FIXME (a bit overkill maybe ?)
...
@@ -128,8 +333,8 @@ inline void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n)
    gebp_madd_selector<CJ,A,B,C,T>::run(cj,a,b,c,t);
  }

  #define MADD(CJ,A,B,C,T)  gebp_madd(CJ,A,B,C,T);
//   #define MADD(CJ,A,B,C,T)  T = B; T = CJ.pmul(A,T); C = padd(C,T);
  #define CJMADD(CJ,A,B,C,T)  gebp_madd(CJ,A,B,C,T);
//   #define CJMADD(CJ,A,B,C,T)  T = B; T = CJ.pmul(A,T); C = padd(C,T);
#endif
/* Vectorization logic
...
@@ -148,7 +353,7 @@ class gebp_traits
public:
  typedef _LhsScalar LhsScalar;
  typedef _RhsScalar RhsScalar;
  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;

  enum {
    ConjLhs = _ConjLhs,
...
@@ -160,16 +365,22 @@ public:
    NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,

    // register block size along the N direction (must be either 2 or 4)
    nr = NumberOfRegisters/4,
    // register block size along the N direction must be 1 or 4
    nr = 4,

    // register block size along the M direction (currently, this one cannot be modified)
    mr = 2 * LhsPacketSize,
    default_mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize,
#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX)
    // we assume 16 registers
    // See bug 992, if the scalar type is not vectorizable but that EIGEN_HAS_SINGLE_INSTRUCTION_MADD is defined,
    // then using 3*LhsPacketSize triggers non-implemented paths in syrk.
    mr = Vectorizable ? 3*LhsPacketSize : default_mr,
#else
    mr = default_mr,
#endif

    WorkSpaceFactor = nr * RhsPacketSize,

    LhsProgress = LhsPacketSize,
    RhsProgress = RhsPacketSize
    RhsProgress = 1
  };

  typedef typename packet_traits<LhsScalar>::type  _LhsPacket;
...
@@ -186,36 +397,67 @@ public:
  {
    p = pset1<ResPacket>(ResScalar(0));
  }

  EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b)
  EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
  {
    pbroadcast4(b, b0, b1, b2, b3);
  }

//   EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1)
//   {
//     pbroadcast2(b, b0, b1);
//   }

  template<typename RhsPacketType>
  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const
  {
    dest = pset1<RhsPacketType>(*b);
  }

  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
  {
    for(DenseIndex k=0; k<n; k++)
      pstore1<RhsPacket>(&b[k*RhsPacketSize], rhs[k]);
    dest = ploadquad<RhsPacket>(b);
  }

  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
  template<typename LhsPacketType>
  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacketType& dest) const
  {
    dest = pload<RhsPacket>(b);
    dest = pload<LhsPacketType>(a);
  }

  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
  template<typename LhsPacketType>
  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
  {
    dest = pload<LhsPacket>(a);
    dest = ploadu<LhsPacketType>(a);
  }

  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, AccPacket& tmp) const
  template<typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, AccPacketType& tmp) const
  {
    tmp = b; tmp = pmul(a,tmp); c = padd(c,tmp);
    conj_helper<LhsPacketType,RhsPacketType,ConjLhs,ConjRhs> cj;
    // It would be a lot cleaner to call pmadd all the time. Unfortunately if we
    // let gcc allocate the register in which to store the result of the pmul
    // (in the case where there is no FMA) gcc fails to figure out how to avoid
    // spilling register.
#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
    EIGEN_UNUSED_VARIABLE(tmp);
    c = cj.pmadd(a,b,c);
#else
    tmp = b; tmp = cj.pmul(a,tmp); c = padd(c,tmp);
#endif
  }

  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
  {
    r = pmadd(c,alpha,r);
  }

  template<typename ResPacketHalf>
  EIGEN_STRONG_INLINE void acc(const ResPacketHalf& c, const ResPacketHalf& alpha, ResPacketHalf& r) const
  {
    r = pmadd(c,alpha,r);
  }

protected:
//   conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj;
//   conj_helper<LhsPacket,RhsPacket,ConjLhs,ConjRhs> pcj;
};

template<typename RealScalar, bool _ConjLhs>
...
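// --- Editor's note: standalone illustration, not Eigen code. ---
// The madd() above either issues a fused multiply-add (when
// EIGEN_HAS_SINGLE_INSTRUCTION_MADD is defined) or routes the product through an
// explicit temporary so the compiler keeps the accumulator in a register. The
// sketch below reproduces the pattern with plain scalars; HAS_SINGLE_INSTRUCTION_MADD
// is a stand-in macro, not the real Eigen one.
#include <cstdio>

template<typename T>
inline void madd(const T& a, const T& b, T& c, T& tmp) {
#ifdef HAS_SINGLE_INSTRUCTION_MADD
  (void)tmp;
  c = a * b + c;      // maps to one fma on hardware that has it
#else
  tmp = b;
  tmp = a * tmp;      // separate multiply...
  c = c + tmp;        // ...then add, keeping c live in a register
#endif
}

int main() {
  double c = 1.0, tmp = 0.0;
  madd(2.0, 3.0, c, tmp);
  std::printf("%g\n", c);  // prints 7
  return 0;
}
// --- End of editor's sketch. ---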
@@ -224,7 +466,7 @@ class gebp_traits<std::complex<RealScalar>, RealScalar, _ConjLhs, false>
public:
  typedef std::complex<RealScalar> LhsScalar;
  typedef RealScalar RhsScalar;
  typedef typename scalar_product_traits<LhsScalar,RhsScalar>::ReturnType ResScalar;
  typedef typename ScalarBinaryOpTraits<LhsScalar,RhsScalar>::ReturnType ResScalar;

  enum {
    ConjLhs = _ConjLhs,
...
@@ -235,12 +477,16 @@ public:
    ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,

    NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
    nr = NumberOfRegisters/4,
    mr = 2*LhsPacketSize,
    WorkSpaceFactor = nr*RhsPacketSize,
    nr = 4,
#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX)
    // we assume 16 registers
    mr = 3*LhsPacketSize,
#else
    mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize,
#endif

    LhsProgress = LhsPacketSize,
    RhsProgress = RhsPacketSize
    RhsProgress = 1
  };

  typedef typename packet_traits<LhsScalar>::type  _LhsPacket;
...
@@ -258,15 +504,14 @@ public:
    p = pset1<ResPacket>(ResScalar(0));
  }

  EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b)
  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
  {
    for(DenseIndex k=0; k<n; k++)
      pstore1<RhsPacket>(&b[k*RhsPacketSize], rhs[k]);
    dest = pset1<RhsPacket>(*b);
  }

  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
  {
    dest = pload<RhsPacket>(b);
    dest = pset1<RhsPacket>(*b);
  }

  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
...
@@ -274,6 +519,21 @@ public:
    dest = pload<LhsPacket>(a);
  }

  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const
  {
    dest = ploadu<LhsPacket>(a);
  }

  EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
  {
    pbroadcast4(b, b0, b1, b2, b3);
  }

//   EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1)
//   {
//     pbroadcast2(b, b0, b1);
//   }

  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const
  {
    madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
...
@@ -281,7 +541,12 @@ public:
  EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
  {
#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
    EIGEN_UNUSED_VARIABLE(tmp);
    c.v = pmadd(a.v,b,c.v);
#else
    tmp = b; tmp = pmul(a.v,tmp); c.v = padd(c.v,tmp);
#endif
  }

  EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const
...
@@ -298,6 +563,38 @@ protected:
  conj_helper<ResPacket,ResPacket,ConjLhs,false> cj;
};
template<typename Packet>
struct DoublePacket
{
  Packet first;
  Packet second;
};

template<typename Packet>
DoublePacket<Packet> padd(const DoublePacket<Packet> &a, const DoublePacket<Packet> &b)
{
  DoublePacket<Packet> res;
  res.first  = padd(a.first, b.first);
  res.second = padd(a.second,b.second);
  return res;
}

template<typename Packet>
const DoublePacket<Packet>& predux_downto4(const DoublePacket<Packet> &a)
{
  return a;
}

template<typename Packet> struct unpacket_traits<DoublePacket<Packet> > { typedef DoublePacket<Packet> half; };

// template<typename Packet>
// DoublePacket<Packet> pmadd(const DoublePacket<Packet> &a, const DoublePacket<Packet> &b)
// {
//   DoublePacket<Packet> res;
//   res.first  = padd(a.first, b.first);
//   res.second = padd(a.second,b.second);
//   return res;
// }

template<typename RealScalar, bool _ConjLhs, bool _ConjRhs>
class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, _ConjLhs, _ConjRhs >
{
...
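// --- Editor's note: standalone sketch, not Eigen code. ---
// It illustrates why the complex path carries a DoublePacket: each complex rhs
// coefficient is broadcast into one packet of real parts and one packet of
// imaginary parts, so the complex multiply-accumulate can be expressed with two
// real fused operations. RealQuad and DoubleQuad are illustrative stand-ins for
// the real SIMD packet and DoublePacket types.
#include <complex>
#include <cstdio>

struct RealQuad { double v[4]; };               // stand-in for a real SIMD packet
struct DoubleQuad { RealQuad first, second; };  // real parts / imaginary parts

DoubleQuad load_rhs(const std::complex<double>& b) {
  DoubleQuad d;
  for (int i = 0; i < 4; ++i) {
    d.first.v[i]  = b.real();   // broadcast real part
    d.second.v[i] = b.imag();   // broadcast imaginary part
  }
  return d;
}

int main() {
  DoubleQuad d = load_rhs(std::complex<double>(1.0, 2.0));
  std::printf("%g %g\n", d.first.v[0], d.second.v[0]);  // prints 1 2
  return 0;
}
// --- End of editor's sketch. ---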
@@ -314,60 +611,80 @@ public:
          && packet_traits<Scalar>::Vectorizable,
    RealPacketSize  = Vectorizable ? packet_traits<RealScalar>::size : 1,
    ResPacketSize   = Vectorizable ? packet_traits<ResScalar>::size : 1,
    nr = 2,
    mr = 2 * ResPacketSize,
    WorkSpaceFactor = Vectorizable ? 2*nr*RealPacketSize : nr,
    LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
    RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,

    // FIXME: should depend on NumberOfRegisters
    nr = 4,
    mr = ResPacketSize,

    LhsProgress = ResPacketSize,
    RhsProgress = Vectorizable ? 2*ResPacketSize : 1
    RhsProgress = 1
  };

  typedef typename packet_traits<RealScalar>::type RealPacket;
  typedef typename packet_traits<Scalar>::type     ScalarPacket;
  struct DoublePacket
  {
    RealPacket first;
    RealPacket second;
  };
  typedef DoublePacket<RealPacket> DoublePacketType;

  typedef typename conditional<Vectorizable,RealPacket,  Scalar>::type LhsPacket;
  typedef typename conditional<Vectorizable,DoublePacket,Scalar>::type RhsPacket;
  typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type RhsPacket;
  typedef typename conditional<Vectorizable,ScalarPacket,Scalar>::type ResPacket;
  typedef typename conditional<Vectorizable,DoublePacket,Scalar>::type AccPacket;
  typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type AccPacket;

  EIGEN_STRONG_INLINE void initAcc(Scalar& p) { p = Scalar(0); }

  EIGEN_STRONG_INLINE void initAcc(DoublePacket& p)
  EIGEN_STRONG_INLINE void initAcc(DoublePacketType& p)
  {
    p.first  = pset1<RealPacket>(RealScalar(0));
    p.second = pset1<RealPacket>(RealScalar(0));
  }

  /* Unpack the rhs coeff such that each complex coefficient is spread into
   * two packects containing respectively the real and imaginary coefficient
   * duplicated as many time as needed: (x+iy) => [x, ..., x] [y, ..., y]
   */
  EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const Scalar* rhs, Scalar* b)
  // Scalar path
  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ResPacket& dest) const
  {
    for(DenseIndex k=0; k<n; k++)
    {
      if(Vectorizable)
      {
        pstore1<RealPacket>((RealScalar*)&b[k*ResPacketSize*2+0],             real(rhs[k]));
        pstore1<RealPacket>((RealScalar*)&b[k*ResPacketSize*2+ResPacketSize], imag(rhs[k]));
      }
      else
        b[k] = rhs[k];
    }
    dest = pset1<ResPacket>(*b);
  }

  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ResPacket& dest) const
  {
    dest = *b;
  }

  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacket& dest) const
  // Vectorized path
  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacketType& dest) const
  {
    dest.first  = pset1<RealPacket>(real(*b));
    dest.second = pset1<RealPacket>(imag(*b));
  }

  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const
  {
    loadRhs(b,dest);
  }
  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, DoublePacketType& dest) const
  {
    eigen_internal_assert(unpacket_traits<ScalarPacket>::size<=4);
    loadRhs(b,dest);
  }

  EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
  {
    dest.first  = pload<RealPacket>((const RealScalar*)b);
    dest.second = pload<RealPacket>((const RealScalar*)(b+ResPacketSize));
    // FIXME not sure that's the best way to implement it!
    loadRhs(b+0, b0);
    loadRhs(b+1, b1);
    loadRhs(b+2, b2);
    loadRhs(b+3, b3);
  }

  // Vectorized path
  EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, DoublePacketType& b0, DoublePacketType& b1)
  {
    // FIXME not sure that's the best way to implement it!
    loadRhs(b+0, b0);
    loadRhs(b+1, b1);
  }

  // Scalar path
  EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsScalar& b0, RhsScalar& b1)
  {
    // FIXME not sure that's the best way to implement it!
    loadRhs(b+0, b0);
    loadRhs(b+1, b1);
  }

  // nothing special here
...
@@ -376,7 +693,12 @@ public:
    dest = pload<LhsPacket>((const typename unpacket_traits<LhsPacket>::type*)(a));
  }

  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, DoublePacket& c, RhsPacket& /*tmp*/) const
  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const
  {
    dest = ploadu<LhsPacket>((const typename unpacket_traits<LhsPacket>::type*)(a));
  }

  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, DoublePacketType& c, RhsPacket& /*tmp*/) const
  {
    c.first   = padd(pmul(a,b.first), c.first);
    c.second  = padd(pmul(a,b.second),c.second);
...
@@ -389,7 +711,7 @@ public:
  EIGEN_STRONG_INLINE void acc(const Scalar& c, const Scalar& alpha, Scalar& r) const { r += alpha * c; }

  EIGEN_STRONG_INLINE void acc(const DoublePacket& c, const ResPacket& alpha, ResPacket& r) const
  EIGEN_STRONG_INLINE void acc(const DoublePacketType& c, const ResPacket& alpha, ResPacket& r) const
  {
    // assemble c
    ResPacket tmp;
...
@@ -440,12 +762,12 @@ public:
    ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,

    NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
    // FIXME: should depend on NumberOfRegisters
    nr = 4,
    mr = 2*ResPacketSize,
    WorkSpaceFactor = nr*RhsPacketSize,
    mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*ResPacketSize,

    LhsProgress = ResPacketSize,
    RhsProgress = ResPacketSize
    RhsProgress = 1
  };

  typedef typename packet_traits<LhsScalar>::type  _LhsPacket;
...
@@ -463,21 +785,38 @@ public:
    p = pset1<ResPacket>(ResScalar(0));
  }

  EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b)
  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
  {
    for(DenseIndex k=0; k<n; k++)
      pstore1<RhsPacket>(&b[k*RhsPacketSize], rhs[k]);
    dest = pset1<RhsPacket>(*b);
  }

  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
  void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
  {
    dest = pload<RhsPacket>(b);
    pbroadcast4(b, b0, b1, b2, b3);
  }

//   EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1)
//   {
//     // FIXME not sure that's the best way to implement it!
//     b0 = pload1<RhsPacket>(b+0);
//     b1 = pload1<RhsPacket>(b+1);
//   }

  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
  {
    dest = ploaddup<LhsPacket>(a);
  }

  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
  {
    eigen_internal_assert(unpacket_traits<RhsPacket>::size<=4);
    loadRhs(b,dest);
  }

  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const
  {
    dest = ploaddup<LhsPacket>(a);
  }

  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const
  {
...
@@ -486,7 +825,13 @@ public:
  EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
  {
#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
    EIGEN_UNUSED_VARIABLE(tmp);
    c.v = pmadd(a,b.v,c.v);
#else
    tmp = b; tmp.v = pmul(a,tmp.v); c = padd(c,tmp);
#endif
  }

  EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const
...
@@ -510,7 +855,7 @@ protected:
 * |real |cplx | no vectorization yet, would require to pack A with duplication
 * |cplx |real | easy vectorization
 */
template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
struct gebp_kernel
{
  typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> Traits;
...
@@ -520,6 +865,15 @@ struct gebp_kernel
  typedef typename Traits::ResPacket ResPacket;
  typedef typename Traits::AccPacket AccPacket;

  typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs> SwappedTraits;
  typedef typename SwappedTraits::ResScalar SResScalar;
  typedef typename SwappedTraits::LhsPacket SLhsPacket;
  typedef typename SwappedTraits::RhsPacket SRhsPacket;
  typedef typename SwappedTraits::ResPacket SResPacket;
  typedef typename SwappedTraits::AccPacket SAccPacket;

  typedef typename DataMapper::LinearMapper LinearMapper;

  enum {
    Vectorizable = Traits::Vectorizable,
    LhsProgress  = Traits::LhsProgress,
...
@@ -528,571 +882,788 @@ struct gebp_kernel
  };

  EIGEN_DONT_INLINE
  void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB,
                  Index rows, Index depth, Index cols, ResScalar alpha,
                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0, RhsScalar* unpackedB=0);
  void operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
                  Index rows, Index depth, Index cols, ResScalar alpha,
                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
};

template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
EIGEN_DONT_INLINE
void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
  ::operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB,
               Index rows, Index depth, Index cols, ResScalar alpha,
               Index strideA, Index strideB, Index offsetA, Index offsetB, RhsScalar* unpackedB)
void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,ConjugateRhs>
  ::operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
               Index rows, Index depth, Index cols, ResScalar alpha,
               Index strideA, Index strideB, Index offsetA, Index offsetB)
  {
    Traits traits;
    SwappedTraits straits;

    if(strideA==-1) strideA = depth;
    if(strideB==-1) strideB = depth;
    conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
//     conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
    Index packet_cols = (cols/nr) * nr;
    const Index peeled_mc = (rows/mr)*mr;
    // FIXME:
    const Index peeled_mc2 = peeled_mc + (rows-peeled_mc >= LhsProgress ? LhsProgress : 0);
    const Index peeled_kc = (depth/4)*4;

    if(unpackedB==0)
      unpackedB = const_cast<RhsScalar*>(blockB - strideB * nr * RhsProgress);

    // loops on each micro vertical panel of rhs (depth x nr)
    for(Index j2=0; j2<packet_cols; j2+=nr)
    Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
    const Index peeled_mc3 = mr>=3*Traits::LhsProgress ? (rows/(3*LhsProgress))*(3*LhsProgress) : 0;
    const Index peeled_mc2 = mr>=2*Traits::LhsProgress ? peeled_mc3+((rows-peeled_mc3)/(2*LhsProgress))*(2*LhsProgress) : 0;
    const Index peeled_mc1 = mr>=1*Traits::LhsProgress ? (rows/(1*LhsProgress))*(1*LhsProgress) : 0;
    enum { pk = 8 }; // NOTE Such a large peeling factor is important for large matrices (~ +5% when >1000 on Haswell)
    const Index peeled_kc  = depth & ~(pk-1);
    const Index prefetch_res_offset = 32/sizeof(ResScalar);
//     const Index depth2   = depth & ~1;

    //---------- Process 3 * LhsProgress rows at once ----------
    // This corresponds to 3*LhsProgress x nr register blocks.
    // Usually, make sense only with FMA
    if(mr>=3*Traits::LhsProgress)
    {
      traits.unpackRhs(depth*nr,&blockB[j2*strideB+offsetB*nr],unpackedB);

      // loops on each largest micro horizontal panel of lhs (mr x depth)
      // => we select a mr x nr micro block of res which is entirely
      //    stored into mr/packet_size x nr registers.
      for(Index i=0; i<peeled_mc; i+=mr)
      // Here, the general idea is to loop on each largest micro horizontal panel of the lhs (3*Traits::LhsProgress x depth)
      // and on each largest micro vertical panel of the rhs (depth * nr).
      // Blocking sizes, i.e., 'depth' has been computed so that the micro horizontal panel of the lhs fit in L1.
      // However, if depth is too small, we can extend the number of rows of these horizontal panels.
      // This actual number of rows is computed as follow:
      const Index l1 = defaultL1CacheSize; // in Bytes, TODO, l1 should be passed to this function.
      // The max(1, ...) here is needed because we may be using blocking params larger than what our known l1 cache size
      // suggests we should be using: either because our known l1 cache size is inaccurate (e.g. on Android, we can only guess),
      // or because we are testing specific blocking sizes.
      const Index actual_panel_rows = (3*LhsProgress) * std::max<Index>(1,( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 3*LhsProgress) ));
      for(Index i1=0; i1<peeled_mc3; i1+=actual_panel_rows)
      {
        const LhsScalar* blA = &blockA[i*strideA+offsetA*mr];
        prefetch(&blA[0]);

        // gets res block as register
        AccPacket C0, C1, C2, C3, C4, C5, C6, C7;
                  traits.initAcc(C0);
                  traits.initAcc(C1);
        if(nr==4) traits.initAcc(C2);
        if(nr==4) traits.initAcc(C3);
                  traits.initAcc(C4);
                  traits.initAcc(C5);
        if(nr==4) traits.initAcc(C6);
        if(nr==4) traits.initAcc(C7);

        ResScalar* r0 = &res[(j2+0)*resStride + i];
        ResScalar* r1 = r0 + resStride;
        ResScalar* r2 = r1 + resStride;
        ResScalar* r3 = r2 + resStride;

        prefetch(r0+16);
        prefetch(r1+16);
        prefetch(r2+16);
        prefetch(r3+16);

        // performs "inner" product
        // TODO let's check wether the folowing peeled loop could not be
        //      optimized via optimal prefetching from one loop to the other
        const RhsScalar* blB = unpackedB;
        for(Index k=0; k<peeled_kc; k+=4)
        const Index actual_panel_end = (std::min)(i1+actual_panel_rows, peeled_mc3);
        for(Index j2=0; j2<packet_cols4; j2+=nr)
        {
          if(nr==2)
          {
            LhsPacket A0, A1;
            RhsPacket B_0;
            RhsPacket T0;

            EIGEN_ASM_COMMENT("mybegin2");
            traits.loadLhs(&blA[0*LhsProgress], A0);
            traits.loadLhs(&blA[1*LhsProgress], A1);
            traits.loadRhs(&blB[0*RhsProgress], B_0);
            traits.madd(A0,B_0,C0,T0);
            traits.madd(A1,B_0,C4,B_0);
            traits.loadRhs(&blB[1*RhsProgress], B_0);
            traits.madd(A0,B_0,C1,T0);
            traits.madd(A1,B_0,C5,B_0);

            traits.loadLhs(&blA[2*LhsProgress], A0);
            traits.loadLhs(&blA[3*LhsProgress], A1);
            traits.loadRhs(&blB[2*RhsProgress], B_0);
            traits.madd(A0,B_0,C0,T0);
            traits.madd(A1,B_0,C4,B_0);
            traits.loadRhs(&blB[3*RhsProgress], B_0);
            traits.madd(A0,B_0,C1,T0);
            traits.madd(A1,B_0,C5,B_0);

            traits.loadLhs(&blA[4*LhsProgress], A0);
            traits.loadLhs(&blA[5*LhsProgress], A1);
            traits.loadRhs(&blB[4*RhsProgress], B_0);
            traits.madd(A0,B_0,C0,T0);
            traits.madd(A1,B_0,C4,B_0);
            traits.loadRhs(&blB[5*RhsProgress], B_0);
            traits.madd(A0,B_0,C1,T0);
            traits.madd(A1,B_0,C5,B_0);

            traits.loadLhs(&blA[6*LhsProgress], A0);
            traits.loadLhs(&blA[7*LhsProgress], A1);
            traits.loadRhs(&blB[6*RhsProgress], B_0);
            traits.madd(A0,B_0,C0,T0);
            traits.madd(A1,B_0,C4,B_0);
            traits.loadRhs(&blB[7*RhsProgress], B_0);
            traits.madd(A0,B_0,C1,T0);
            traits.madd(A1,B_0,C5,B_0);
            EIGEN_ASM_COMMENT("myend");
          }
          else
          for(Index i=i1; i<actual_panel_end; i+=3*LhsProgress)
          {
            EIGEN_ASM_COMMENT("mybegin4");
            LhsPacket A0, A1;
            RhsPacket B_0, B1, B2, B3;
            RhsPacket T0;

            traits.loadLhs(&blA[0*LhsProgress], A0);
            traits.loadLhs(&blA[1*LhsProgress], A1);
            traits.loadRhs(&blB[0*RhsProgress], B_0);
            traits.loadRhs(&blB[1*RhsProgress], B1);
            traits.madd(A0,B_0,C0,T0);
            traits.loadRhs(&blB[2*RhsProgress], B2);
            traits.madd(A1,B_0,C4,B_0);
            traits.loadRhs(&blB[3*RhsProgress], B3);
            traits.loadRhs(&blB[4*RhsProgress], B_0);
            traits.madd(A0,B1,C1,T0);
            traits.madd(A1,B1,C5,B1);
            traits.loadRhs(&blB[5*RhsProgress], B1);
            traits.madd(A0,B2,C2,T0);
            traits.madd(A1,B2,C6,B2);
            traits.loadRhs(&blB[6*RhsProgress], B2);
            traits.madd(A0,B3,C3,T0);
            traits.loadLhs(&blA[2*LhsProgress], A0);
            traits.madd(A1,B3,C7,B3);
            traits.loadLhs(&blA[3*LhsProgress], A1);
            traits.loadRhs(&blB[7*RhsProgress], B3);
            traits.madd(A0,B_0,C0,T0);
            traits.madd(A1,B_0,C4,B_0);
            traits.loadRhs(&blB[8*RhsProgress], B_0);
            traits.madd(A0,B1,C1,T0);
            traits.madd(A1,B1,C5,B1);
            traits.loadRhs(&blB[9*RhsProgress], B1);
            traits.madd(A0,B2,C2,T0);
            traits.madd(A1,B2,C6,B2);
            traits.loadRhs(&blB[10*RhsProgress], B2);
            traits.madd(A0,B3,C3,T0);
            traits.loadLhs(&blA[4*LhsProgress], A0);
            traits.madd(A1,B3,C7,B3);
            traits.loadLhs(&blA[5*LhsProgress], A1);
            traits.loadRhs(&blB[11*RhsProgress], B3);

            traits.madd(A0,B_0,C0,T0);
            traits.madd(A1,B_0,C4,B_0);
            traits.loadRhs(&blB[12*RhsProgress], B_0);
            traits.madd(A0,B1,C1,T0);
            traits.madd(A1,B1,C5,B1);
            traits.loadRhs(&blB[13*RhsProgress], B1);
            traits.madd(A0,B2,C2,T0);
            traits.madd(A1,B2,C6,B2);
            traits.loadRhs(&blB[14*RhsProgress], B2);
            traits.madd(A0,B3,C3,T0);
            traits.loadLhs(&blA[6*LhsProgress], A0);
            traits.madd(A1,B3,C7,B3);
            traits.loadLhs(&blA[7*LhsProgress], A1);
            traits.loadRhs(&blB[15*RhsProgress], B3);
            traits.madd(A0,B_0,C0,T0);
            traits.madd(A1,B_0,C4,B_0);
            traits.madd(A0,B1,C1,T0);
            traits.madd(A1,B1,C5,B1);
            traits.madd(A0,B2,C2,T0);
            traits.madd(A1,B2,C6,B2);
            traits.madd(A0,B3,C3,T0);
            traits.madd(A1,B3,C7,B3);
          }
          // We selected a 3*Traits::LhsProgress x nr micro block of res which is entirely
          // stored into 3 x nr registers.
          const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*LhsProgress)];
          prefetch(&blA[0]);

          // gets res block as register
          AccPacket C0, C1, C2,  C3,
                    C4, C5, C6,  C7,
                    C8, C9, C10, C11;
          traits.initAcc(C0);  traits.initAcc(C1);  traits.initAcc(C2);  traits.initAcc(C3);
          traits.initAcc(C4);  traits.initAcc(C5);  traits.initAcc(C6);  traits.initAcc(C7);
          traits.initAcc(C8);  traits.initAcc(C9);  traits.initAcc(C10); traits.initAcc(C11);

          LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
          LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
          LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
          LinearMapper r3 = res.getLinearMapper(i, j2 + 3);

          r0.prefetch(0);
          r1.prefetch(0);
          r2.prefetch(0);
          r3.prefetch(0);

          // performs "inner" products
          const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
          prefetch(&blB[0]);
          LhsPacket A0, A1;

            blB += 4*nr*RhsProgress;
            blA += 4*mr;
          }
          // process remaining peeled loop
          for(Index k=peeled_kc; k<depth; k++)
          {
            if(nr==2)
          for(Index k=0; k<peeled_kc; k+=pk)
          {
            LhsPacket A0, A1;
            RhsPacket B_0;
            RhsPacket T0;

            traits.loadLhs(&blA[0*LhsProgress], A0);
            traits.loadLhs(&blA[1*LhsProgress], A1);
            traits.loadRhs(&blB[0*RhsProgress], B_0);
            traits.madd(A0,B_0,C0,T0);
            traits.madd(A1,B_0,C4,B_0);
            traits.loadRhs(&blB[1*RhsProgress], B_0);
            traits.madd(A0,B_0,C1,T0);
            traits.madd(A1,B_0,C5,B_0);
            EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX4");
            RhsPacket B_0, T0;
            LhsPacket A2;

#define EIGEN_GEBP_ONESTEP(K) \
            do { \
              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \
              EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
              internal::prefetch(blA+(3*K+16)*LhsProgress); \
              if (EIGEN_ARCH_ARM) { internal::prefetch(blB+(4*K+16)*RhsProgress); } /* Bug 953 */ \
              traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0);  \
              traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1);  \
              traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2);  \
              traits.loadRhs(blB + (0+4*K)*Traits::RhsProgress, B_0); \
              traits.madd(A0, B_0, C0, T0); \
              traits.madd(A1, B_0, C4, T0); \
              traits.madd(A2, B_0, C8, B_0); \
              traits.loadRhs(blB + (1+4*K)*Traits::RhsProgress, B_0); \
              traits.madd(A0, B_0, C1, T0); \
              traits.madd(A1, B_0, C5, T0); \
              traits.madd(A2, B_0, C9, B_0); \
              traits.loadRhs(blB + (2+4*K)*Traits::RhsProgress, B_0); \
              traits.madd(A0, B_0, C2,  T0); \
              traits.madd(A1, B_0, C6,  T0); \
              traits.madd(A2, B_0, C10, B_0); \
              traits.loadRhs(blB + (3+4*K)*Traits::RhsProgress, B_0); \
              traits.madd(A0, B_0, C3 , T0); \
              traits.madd(A1, B_0, C7,  T0); \
              traits.madd(A2, B_0, C11, B_0); \
              EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4"); \
            } while(false)

            internal::prefetch(blB);
            EIGEN_GEBP_ONESTEP(0);
            EIGEN_GEBP_ONESTEP(1);
            EIGEN_GEBP_ONESTEP(2);
            EIGEN_GEBP_ONESTEP(3);
            EIGEN_GEBP_ONESTEP(4);
            EIGEN_GEBP_ONESTEP(5);
            EIGEN_GEBP_ONESTEP(6);
            EIGEN_GEBP_ONESTEP(7);

            blB += pk*4*RhsProgress;
            blA += pk*3*Traits::LhsProgress;

            EIGEN_ASM_COMMENT("end gebp micro kernel 3pX4");
          }
            else
          // process remaining peeled loop
          for(Index k=peeled_kc; k<depth; k++)
          {
            LhsPacket A0, A1;
            RhsPacket B_0, B1, B2, B3;
            RhsPacket T0;

            traits.loadLhs(&blA[0*LhsProgress], A0);
            traits.loadLhs(&blA[1*LhsProgress], A1);
            traits.loadRhs(&blB[0*RhsProgress], B_0);
            traits.loadRhs(&blB[1*RhsProgress], B1);
            traits.madd(A0,B_0,C0,T0);
            traits.loadRhs(&blB[2*RhsProgress], B2);
            traits.madd(A1,B_0,C4,B_0);
            traits.loadRhs(&blB[3*RhsProgress], B3);
            traits.madd(A0,B1,C1,T0);
            traits.madd(A1,B1,C5,B1);
            traits.madd(A0,B2,C2,T0);
            traits.madd(A1,B2,C6,B2);
            traits.madd(A0,B3,C3,T0);
            traits.madd(A1,B3,C7,B3);
            RhsPacket B_0, T0;
            LhsPacket A2;
            EIGEN_GEBP_ONESTEP(0);
            blB += 4*RhsProgress;
            blA += 3*Traits::LhsProgress;
          }

          blB += nr*RhsProgress;
          blA += mr;
        }
#undef EIGEN_GEBP_ONESTEP

        if(nr==4)
        {
          ResPacket R0, R1, R2, R3, R4, R5, R6;
          ResPacket R0, R1, R2;
          ResPacket alphav = pset1<ResPacket>(alpha);

          R0 = ploadu<ResPacket>(r0);
          R1 = ploadu<ResPacket>(r1);
          R2 = ploadu<ResPacket>(r2);
          R3 = ploadu<ResPacket>(r3);
          R4 = ploadu<ResPacket>(r0 + ResPacketSize);
          R5 = ploadu<ResPacket>(r1 + ResPacketSize);
          R6 = ploadu<ResPacket>(r2 + ResPacketSize);
          R0 = r0.loadPacket(0 * Traits::ResPacketSize);
          R1 = r0.loadPacket(1 * Traits::ResPacketSize);
          R2 = r0.loadPacket(2 * Traits::ResPacketSize);
          traits.acc(C0, alphav, R0);
          pstoreu(r0, R0);
          R0 = ploadu<ResPacket>(r3 + ResPacketSize);

          traits.acc(C1, alphav, R1);
          traits.acc(C2, alphav, R2);
          traits.acc(C3, alphav, R3);
          traits.acc(C4, alphav, R4);
          traits.acc(C5, alphav, R5);
          traits.acc(C6, alphav, R6);
          traits.acc(C7, alphav, R0);

          pstoreu(r1, R1);
          pstoreu(r2, R2);
          pstoreu(r3, R3);
          pstoreu(r0 + ResPacketSize, R4);
          pstoreu(r1 + ResPacketSize, R5);
          pstoreu(r2 + ResPacketSize, R6);
          pstoreu(r3 + ResPacketSize, R0);
          traits.acc(C4, alphav, R1);
          traits.acc(C8, alphav, R2);
          r0.storePacket(0 * Traits::ResPacketSize, R0);
          r0.storePacket(1 * Traits::ResPacketSize, R1);
          r0.storePacket(2 * Traits::ResPacketSize, R2);

          R0 = r1.loadPacket(0 * Traits::ResPacketSize);
          R1 = r1.loadPacket(1 * Traits::ResPacketSize);
          R2 = r1.loadPacket(2 * Traits::ResPacketSize);
          traits.acc(C1, alphav, R0);
          traits.acc(C5, alphav, R1);
          traits.acc(C9, alphav, R2);
          r1.storePacket(0 * Traits::ResPacketSize, R0);
          r1.storePacket(1 * Traits::ResPacketSize, R1);
          r1.storePacket(2 * Traits::ResPacketSize, R2);

          R0 = r2.loadPacket(0 * Traits::ResPacketSize);
          R1 = r2.loadPacket(1 * Traits::ResPacketSize);
          R2 = r2.loadPacket(2 * Traits::ResPacketSize);
          traits.acc(C2, alphav, R0);
          traits.acc(C6, alphav, R1);
          traits.acc(C10, alphav, R2);
          r2.storePacket(0 * Traits::ResPacketSize, R0);
          r2.storePacket(1 * Traits::ResPacketSize, R1);
          r2.storePacket(2 * Traits::ResPacketSize, R2);

          R0 = r3.loadPacket(0 * Traits::ResPacketSize);
          R1 = r3.loadPacket(1 * Traits::ResPacketSize);
          R2 = r3.loadPacket(2 * Traits::ResPacketSize);
          traits.acc(C3, alphav, R0);
          traits.acc(C7, alphav, R1);
          traits.acc(C11, alphav, R2);
          r3.storePacket(0 * Traits::ResPacketSize, R0);
          r3.storePacket(1 * Traits::ResPacketSize, R1);
          r3.storePacket(2 * Traits::ResPacketSize, R2);
        }
      }
  else
  // Deal with remaining columns of the rhs
  for(Index j2=packet_cols4; j2<cols; j2++)
  {
    ResPacket R0, R1, R4;
    for(Index i=i1; i<actual_panel_end; i+=3*LhsProgress)
    {
      // One column at a time
      const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*Traits::LhsProgress)];
      prefetch(&blA[0]);

      // gets res block as register
      AccPacket C0, C4, C8;
      traits.initAcc(C0);
      traits.initAcc(C4);
      traits.initAcc(C8);

      LinearMapper r0 = res.getLinearMapper(i, j2);
      r0.prefetch(0);

      // performs "inner" products
      const RhsScalar* blB = &blockB[j2*strideB+offsetB];
      LhsPacket A0, A1, A2;

      for(Index k=0; k<peeled_kc; k+=pk)
      {
        EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX1");
        RhsPacket B_0;
#define EIGEN_GEBGP_ONESTEP(K) \
do { \
EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX1"); \
EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \
traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \
traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \
traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
traits.madd(A0, B_0, C0, B_0); \
traits.madd(A1, B_0, C4, B_0); \
traits.madd(A2, B_0, C8, B_0); \
EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1"); \
} while(false)
        EIGEN_GEBGP_ONESTEP(0);
        EIGEN_GEBGP_ONESTEP(1);
        EIGEN_GEBGP_ONESTEP(2);
        EIGEN_GEBGP_ONESTEP(3);
        EIGEN_GEBGP_ONESTEP(4);
        EIGEN_GEBGP_ONESTEP(5);
        EIGEN_GEBGP_ONESTEP(6);
        EIGEN_GEBGP_ONESTEP(7);

        blB += pk*RhsProgress;
        blA += pk*3*Traits::LhsProgress;

        EIGEN_ASM_COMMENT("end gebp micro kernel 3pX1");
      }

      // process remaining peeled loop
      for(Index k=peeled_kc; k<depth; k++)
      {
        RhsPacket B_0;
        EIGEN_GEBGP_ONESTEP(0);
        blB += RhsProgress;
        blA += 3*Traits::LhsProgress;
      }
#undef EIGEN_GEBGP_ONESTEP
      ResPacket R0, R1, R2;
      ResPacket alphav = pset1<ResPacket>(alpha);

      R0 = ploadu<ResPacket>(r0);
      R1 = ploadu<ResPacket>(r1);
      R4 = ploadu<ResPacket>(r0 + ResPacketSize);
      R0 = r0.loadPacket(0 * Traits::ResPacketSize);
      R1 = r0.loadPacket(1 * Traits::ResPacketSize);
      R2 = r0.loadPacket(2 * Traits::ResPacketSize);
      traits.acc(C0, alphav, R0);
      pstoreu(r0, R0);
      R0 = ploadu<ResPacket>(r1 + ResPacketSize);
      traits.acc(C1, alphav, R1);
      traits.acc(C4, alphav, R4);
      traits.acc(C5, alphav, R0);
      pstoreu(r1, R1);
      pstoreu(r0 + ResPacketSize, R4);
      pstoreu(r1 + ResPacketSize, R0);
      traits.acc(C4, alphav, R1);
      traits.acc(C8, alphav, R2);
      r0.storePacket(0 * Traits::ResPacketSize, R0);
      r0.storePacket(1 * Traits::ResPacketSize, R1);
      r0.storePacket(2 * Traits::ResPacketSize, R2);
    }
  }
}
if (rows-peeled_mc >= LhsProgress)
}
//---------- Process 2 * LhsProgress rows at once ----------
if(mr>=2*Traits::LhsProgress)
{
  const Index l1 = defaultL1CacheSize; // in Bytes, TODO, l1 should be passed to this function.
  // The max(1, ...) here is needed because we may be using blocking params larger than what our known l1 cache size
  // suggests we should be using: either because our known l1 cache size is inaccurate (e.g. on Android, we can only guess),
  // or because we are testing specific blocking sizes.
  Index actual_panel_rows = (2*LhsProgress) * std::max<Index>(1,( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 2*LhsProgress) ));
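// Quick illustrative evaluation of the panel-height formula above, with assumed
// numbers only (32 KiB L1, double precision, LhsProgress = 4, mr = 12, nr = 4,
// depth = 256); these are not necessarily the values Eigen picks at runtime.
#include <algorithm>
#include <cstdio>

int main()
{
  const long l1 = 32*1024;                           // assumed L1 size in bytes
  const long LhsProgress = 4, mr = 12, nr = 4, depth = 256;
  const long sRes = sizeof(double), sLhs = sizeof(double), sRhs = sizeof(double);
  long panel_rows = (2*LhsProgress) *
                    std::max<long>(1, (l1 - sRes*mr*nr - depth*nr*sRhs)
                                       / (depth * sLhs * 2*LhsProgress));
  std::printf("actual_panel_rows = %ld\n", panel_rows);  // prints 8 with these numbers
}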
for
(
Index
i1
=
peeled_mc3
;
i1
<
peeled_mc2
;
i1
+=
actual_panel_rows
)
{
Index
i
=
peeled_mc
;
const
LhsScalar
*
blA
=
&
blockA
[
i
*
strideA
+
offsetA
*
LhsProgress
];
prefetch
(
&
blA
[
0
]);
// gets res block as register
AccPacket
C0
,
C1
,
C2
,
C3
;
traits
.
initAcc
(
C0
);
traits
.
initAcc
(
C1
);
if
(
nr
==
4
)
traits
.
initAcc
(
C2
);
if
(
nr
==
4
)
traits
.
initAcc
(
C3
);
// performs "inner" product
const
RhsScalar
*
blB
=
unpackedB
;
for
(
Index
k
=
0
;
k
<
peeled_kc
;
k
+=
4
)
Index
actual_panel_end
=
(
std
::
min
)(
i1
+
actual_panel_rows
,
peeled_mc2
);
for
(
Index
j2
=
0
;
j2
<
packet_cols4
;
j2
+=
nr
)
{
if
(
nr
==
2
)
for
(
Index
i
=
i1
;
i
<
actual_panel_end
;
i
+=
2
*
LhsProgress
)
{
LhsPacket
A0
;
RhsPacket
B_0
,
B1
;
// We selected a 2*Traits::LhsProgress x nr micro block of res which is entirely
// stored into 2 x nr registers.
const
LhsScalar
*
blA
=
&
blockA
[
i
*
strideA
+
offsetA
*
(
2
*
Traits
::
LhsProgress
)];
prefetch
(
&
blA
[
0
]);
// gets res block as register
AccPacket
C0
,
C1
,
C2
,
C3
,
C4
,
C5
,
C6
,
C7
;
traits
.
initAcc
(
C0
);
traits
.
initAcc
(
C1
);
traits
.
initAcc
(
C2
);
traits
.
initAcc
(
C3
);
traits
.
initAcc
(
C4
);
traits
.
initAcc
(
C5
);
traits
.
initAcc
(
C6
);
traits
.
initAcc
(
C7
);
LinearMapper
r0
=
res
.
getLinearMapper
(
i
,
j2
+
0
);
LinearMapper
r1
=
res
.
getLinearMapper
(
i
,
j2
+
1
);
LinearMapper
r2
=
res
.
getLinearMapper
(
i
,
j2
+
2
);
LinearMapper
r3
=
res
.
getLinearMapper
(
i
,
j2
+
3
);
r0
.
prefetch
(
prefetch_res_offset
);
r1
.
prefetch
(
prefetch_res_offset
);
r2
.
prefetch
(
prefetch_res_offset
);
r3
.
prefetch
(
prefetch_res_offset
);
// performs "inner" products
const
RhsScalar
*
blB
=
&
blockB
[
j2
*
strideB
+
offsetB
*
nr
];
prefetch
(
&
blB
[
0
]);
LhsPacket
A0
,
A1
;
traits
.
loadLhs
(
&
blA
[
0
*
LhsProgress
],
A0
);
traits
.
loadRhs
(
&
blB
[
0
*
RhsProgress
],
B_0
);
traits
.
loadRhs
(
&
blB
[
1
*
RhsProgress
],
B1
);
traits
.
madd
(
A0
,
B_0
,
C0
,
B_0
);
traits
.
loadRhs
(
&
blB
[
2
*
RhsProgress
],
B_0
);
traits
.
madd
(
A0
,
B1
,
C1
,
B1
);
traits
.
loadLhs
(
&
blA
[
1
*
LhsProgress
],
A0
);
traits
.
loadRhs
(
&
blB
[
3
*
RhsProgress
],
B1
);
traits
.
madd
(
A0
,
B_0
,
C0
,
B_0
);
traits
.
loadRhs
(
&
blB
[
4
*
RhsProgress
],
B_0
);
traits
.
madd
(
A0
,
B1
,
C1
,
B1
);
traits
.
loadLhs
(
&
blA
[
2
*
LhsProgress
],
A0
);
traits
.
loadRhs
(
&
blB
[
5
*
RhsProgress
],
B1
);
traits
.
madd
(
A0
,
B_0
,
C0
,
B_0
);
traits
.
loadRhs
(
&
blB
[
6
*
RhsProgress
],
B_0
);
traits
.
madd
(
A0
,
B1
,
C1
,
B1
);
traits
.
loadLhs
(
&
blA
[
3
*
LhsProgress
],
A0
);
traits
.
loadRhs
(
&
blB
[
7
*
RhsProgress
],
B1
);
traits
.
madd
(
A0
,
B_0
,
C0
,
B_0
);
traits
.
madd
(
A0
,
B1
,
C1
,
B1
);
}
else
for
(
Index
k
=
0
;
k
<
peeled_kc
;
k
+=
pk
)
{
LhsPacket
A0
;
RhsPacket
B_0
,
B1
,
B2
,
B3
;
traits
.
loadLhs
(
&
blA
[
0
*
LhsProgress
],
A0
);
traits
.
loadRhs
(
&
blB
[
0
*
RhsProgress
],
B_0
);
traits
.
loadRhs
(
&
blB
[
1
*
RhsProgress
],
B1
);
traits
.
madd
(
A0
,
B_0
,
C0
,
B_0
);
traits
.
loadRhs
(
&
blB
[
2
*
RhsProgress
],
B2
);
traits
.
loadRhs
(
&
blB
[
3
*
RhsProgress
],
B3
);
traits
.
loadRhs
(
&
blB
[
4
*
RhsProgress
],
B_0
);
traits
.
madd
(
A0
,
B1
,
C1
,
B1
);
traits
.
loadRhs
(
&
blB
[
5
*
RhsProgress
],
B1
);
traits
.
madd
(
A0
,
B2
,
C2
,
B2
);
traits
.
loadRhs
(
&
blB
[
6
*
RhsProgress
],
B2
);
traits
.
madd
(
A0
,
B3
,
C3
,
B3
);
traits
.
loadLhs
(
&
blA
[
1
*
LhsProgress
],
A0
);
traits
.
loadRhs
(
&
blB
[
7
*
RhsProgress
],
B3
);
traits
.
madd
(
A0
,
B_0
,
C0
,
B_0
);
traits
.
loadRhs
(
&
blB
[
8
*
RhsProgress
],
B_0
);
traits
.
madd
(
A0
,
B1
,
C1
,
B1
);
traits
.
loadRhs
(
&
blB
[
9
*
RhsProgress
],
B1
);
traits
.
madd
(
A0
,
B2
,
C2
,
B2
);
traits
.
loadRhs
(
&
blB
[
10
*
RhsProgress
],
B2
);
traits
.
madd
(
A0
,
B3
,
C3
,
B3
);
traits
.
loadLhs
(
&
blA
[
2
*
LhsProgress
],
A0
);
traits
.
loadRhs
(
&
blB
[
11
*
RhsProgress
],
B3
);
traits
.
madd
(
A0
,
B_0
,
C0
,
B_0
);
traits
.
loadRhs
(
&
blB
[
12
*
RhsProgress
],
B_0
);
traits
.
madd
(
A0
,
B1
,
C1
,
B1
);
traits
.
loadRhs
(
&
blB
[
13
*
RhsProgress
],
B1
);
traits
.
madd
(
A0
,
B2
,
C2
,
B2
);
traits
.
loadRhs
(
&
blB
[
14
*
RhsProgress
],
B2
);
traits
.
madd
(
A0
,
B3
,
C3
,
B3
);
traits
.
loadLhs
(
&
blA
[
3
*
LhsProgress
],
A0
);
traits
.
loadRhs
(
&
blB
[
15
*
RhsProgress
],
B3
);
traits
.
madd
(
A0
,
B_0
,
C0
,
B_0
);
traits
.
madd
(
A0
,
B1
,
C1
,
B1
);
traits
.
madd
(
A0
,
B2
,
C2
,
B2
);
traits
.
madd
(
A0
,
B3
,
C3
,
B3
);
EIGEN_ASM_COMMENT
(
"begin gebp micro kernel 2pX4"
);
RhsPacket
B_0
,
B1
,
B2
,
B3
,
T0
;
#define EIGEN_GEBGP_ONESTEP(K) \
do { \
EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4"); \
EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \
traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \
traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \
traits.madd(A0, B_0, C0, T0); \
traits.madd(A1, B_0, C4, B_0); \
traits.madd(A0, B1, C1, T0); \
traits.madd(A1, B1, C5, B1); \
traits.madd(A0, B2, C2, T0); \
traits.madd(A1, B2, C6, B2); \
traits.madd(A0, B3, C3, T0); \
traits.madd(A1, B3, C7, B3); \
EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4"); \
} while(false)
        internal::prefetch(blB+(48+0));
        EIGEN_GEBGP_ONESTEP(0);
        EIGEN_GEBGP_ONESTEP(1);
        EIGEN_GEBGP_ONESTEP(2);
        EIGEN_GEBGP_ONESTEP(3);
        internal::prefetch(blB+(48+16));
        EIGEN_GEBGP_ONESTEP(4);
        EIGEN_GEBGP_ONESTEP(5);
        EIGEN_GEBGP_ONESTEP(6);
        EIGEN_GEBGP_ONESTEP(7);

        blB += pk*4*RhsProgress;
        blA += pk*(2*Traits::LhsProgress);

        EIGEN_ASM_COMMENT("end gebp micro kernel 2pX4");
      }
      blB += nr*4*RhsProgress;
      blA += 4*LhsProgress;
    }
// process remaining peeled loop
for
(
Index
k
=
peeled_kc
;
k
<
depth
;
k
++
)
{
if
(
nr
==
2
)
// process remaining peeled loop
for
(
Index
k
=
peeled_kc
;
k
<
depth
;
k
++
)
{
LhsPacket
A0
;
RhsPacket
B_0
,
B1
;
traits
.
loadLhs
(
&
blA
[
0
*
LhsProgress
],
A0
);
traits
.
loadRhs
(
&
blB
[
0
*
RhsProgress
],
B_0
);
traits
.
loadRhs
(
&
blB
[
1
*
RhsProgress
],
B1
);
traits
.
madd
(
A0
,
B_0
,
C0
,
B_0
);
traits
.
madd
(
A0
,
B1
,
C1
,
B1
);
RhsPacket
B_0
,
B1
,
B2
,
B3
,
T0
;
EIGEN_GEBGP_ONESTEP
(
0
);
blB
+=
4
*
RhsProgress
;
blA
+=
2
*
Traits
::
LhsProgress
;
}
else
{
LhsPacket
A0
;
RhsPacket
B_0
,
B1
,
B2
,
B3
;
#undef EIGEN_GEBGP_ONESTEP
traits
.
loadLhs
(
&
blA
[
0
*
LhsProgress
],
A0
);
traits
.
loadRhs
(
&
blB
[
0
*
RhsProgress
],
B_0
);
traits
.
loadRhs
(
&
blB
[
1
*
RhsProgress
],
B1
);
traits
.
loadRhs
(
&
blB
[
2
*
RhsProgress
],
B2
);
traits
.
loadRhs
(
&
blB
[
3
*
RhsProgress
],
B3
);
ResPacket
R0
,
R1
,
R2
,
R3
;
ResPacket
alphav
=
pset1
<
ResPacket
>
(
alpha
);
traits
.
madd
(
A0
,
B_0
,
C0
,
B_0
);
traits
.
madd
(
A0
,
B1
,
C1
,
B1
);
traits
.
madd
(
A0
,
B2
,
C2
,
B2
);
traits
.
madd
(
A0
,
B3
,
C3
,
B3
);
R0
=
r0
.
loadPacket
(
0
*
Traits
::
ResPacketSize
);
R1
=
r0
.
loadPacket
(
1
*
Traits
::
ResPacketSize
);
R2
=
r1
.
loadPacket
(
0
*
Traits
::
ResPacketSize
);
R3
=
r1
.
loadPacket
(
1
*
Traits
::
ResPacketSize
);
traits
.
acc
(
C0
,
alphav
,
R0
);
traits
.
acc
(
C4
,
alphav
,
R1
);
traits
.
acc
(
C1
,
alphav
,
R2
);
traits
.
acc
(
C5
,
alphav
,
R3
);
r0
.
storePacket
(
0
*
Traits
::
ResPacketSize
,
R0
);
r0
.
storePacket
(
1
*
Traits
::
ResPacketSize
,
R1
);
r1
.
storePacket
(
0
*
Traits
::
ResPacketSize
,
R2
);
r1
.
storePacket
(
1
*
Traits
::
ResPacketSize
,
R3
);
R0
=
r2
.
loadPacket
(
0
*
Traits
::
ResPacketSize
);
R1
=
r2
.
loadPacket
(
1
*
Traits
::
ResPacketSize
);
R2
=
r3
.
loadPacket
(
0
*
Traits
::
ResPacketSize
);
R3
=
r3
.
loadPacket
(
1
*
Traits
::
ResPacketSize
);
traits
.
acc
(
C2
,
alphav
,
R0
);
traits
.
acc
(
C6
,
alphav
,
R1
);
traits
.
acc
(
C3
,
alphav
,
R2
);
traits
.
acc
(
C7
,
alphav
,
R3
);
r2
.
storePacket
(
0
*
Traits
::
ResPacketSize
,
R0
);
r2
.
storePacket
(
1
*
Traits
::
ResPacketSize
,
R1
);
r3
.
storePacket
(
0
*
Traits
::
ResPacketSize
,
R2
);
r3
.
storePacket
(
1
*
Traits
::
ResPacketSize
,
R3
);
}
blB
+=
nr
*
RhsProgress
;
blA
+=
LhsProgress
;
}
// Deal with remaining columns of the rhs
for
(
Index
j2
=
packet_cols4
;
j2
<
cols
;
j2
++
)
{
for
(
Index
i
=
i1
;
i
<
actual_panel_end
;
i
+=
2
*
LhsProgress
)
{
// One column at a time
const
LhsScalar
*
blA
=
&
blockA
[
i
*
strideA
+
offsetA
*
(
2
*
Traits
::
LhsProgress
)];
prefetch
(
&
blA
[
0
]);
ResPacket
R0
,
R1
,
R2
,
R3
;
ResPacket
alphav
=
pset1
<
ResPacket
>
(
alpha
);
ResScalar
*
r0
=
&
res
[(
j2
+
0
)
*
resStride
+
i
];
ResScalar
*
r1
=
r0
+
resStride
;
ResScalar
*
r2
=
r1
+
resStride
;
ResScalar
*
r3
=
r2
+
resStride
;
// gets res block as register
AccPacket
C0
,
C4
;
traits
.
initAcc
(
C0
);
traits
.
initAcc
(
C4
);
R0
=
ploadu
<
ResPacket
>
(
r0
);
R1
=
ploadu
<
ResPacket
>
(
r1
);
if
(
nr
==
4
)
R2
=
ploadu
<
ResPacket
>
(
r2
);
if
(
nr
==
4
)
R3
=
ploadu
<
ResPacket
>
(
r3
);
LinearMapper
r0
=
res
.
getLinearMapper
(
i
,
j2
);
r0
.
prefetch
(
prefetch_res_offset
);
traits
.
acc
(
C0
,
alphav
,
R0
);
traits
.
acc
(
C1
,
alphav
,
R1
);
if
(
nr
==
4
)
traits
.
acc
(
C2
,
alphav
,
R2
);
if
(
nr
==
4
)
traits
.
acc
(
C3
,
alphav
,
R3
);
// performs "inner" products
const
RhsScalar
*
blB
=
&
blockB
[
j2
*
strideB
+
offsetB
];
LhsPacket
A0
,
A1
;
pstoreu
(
r0
,
R0
);
pstoreu
(
r1
,
R1
);
if
(
nr
==
4
)
pstoreu
(
r2
,
R2
);
if
(
nr
==
4
)
pstoreu
(
r3
,
R3
);
}
for
(
Index
i
=
peeled_mc2
;
i
<
rows
;
i
++
)
{
const
LhsScalar
*
blA
=
&
blockA
[
i
*
strideA
+
offsetA
];
prefetch
(
&
blA
[
0
]);
// gets a 1 x nr res block as registers
ResScalar
C0
(
0
),
C1
(
0
),
C2
(
0
),
C3
(
0
);
// TODO directly use blockB ???
const
RhsScalar
*
blB
=
&
blockB
[
j2
*
strideB
+
offsetB
*
nr
];
for
(
Index
k
=
0
;
k
<
depth
;
k
++
)
{
if
(
nr
==
2
)
for
(
Index
k
=
0
;
k
<
peeled_kc
;
k
+=
pk
)
{
LhsScalar
A0
;
RhsScalar
B_0
,
B1
;
A0
=
blA
[
k
];
B_0
=
blB
[
0
];
B1
=
blB
[
1
];
MADD
(
cj
,
A0
,
B_0
,
C0
,
B_0
);
MADD
(
cj
,
A0
,
B1
,
C1
,
B1
);
EIGEN_ASM_COMMENT
(
"begin gebp micro kernel 2pX1"
);
RhsPacket
B_0
,
B1
;
#define EIGEN_GEBGP_ONESTEP(K) \
do { \
EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX1"); \
EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \
traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \
traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
traits.madd(A0, B_0, C0, B1); \
traits.madd(A1, B_0, C4, B_0); \
EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX1"); \
} while(false)
EIGEN_GEBGP_ONESTEP
(
0
);
EIGEN_GEBGP_ONESTEP
(
1
);
EIGEN_GEBGP_ONESTEP
(
2
);
EIGEN_GEBGP_ONESTEP
(
3
);
EIGEN_GEBGP_ONESTEP
(
4
);
EIGEN_GEBGP_ONESTEP
(
5
);
EIGEN_GEBGP_ONESTEP
(
6
);
EIGEN_GEBGP_ONESTEP
(
7
);
blB
+=
pk
*
RhsProgress
;
blA
+=
pk
*
2
*
Traits
::
LhsProgress
;
EIGEN_ASM_COMMENT
(
"end gebp micro kernel 2pX1"
);
}
else
// process remaining peeled loop
for
(
Index
k
=
peeled_kc
;
k
<
depth
;
k
++
)
{
LhsScalar
A0
;
RhsScalar
B_0
,
B1
,
B2
,
B3
;
A0
=
blA
[
k
];
B_0
=
blB
[
0
];
B1
=
blB
[
1
];
B2
=
blB
[
2
];
B3
=
blB
[
3
];
MADD
(
cj
,
A0
,
B_0
,
C0
,
B_0
);
MADD
(
cj
,
A0
,
B1
,
C1
,
B1
);
MADD
(
cj
,
A0
,
B2
,
C2
,
B2
);
MADD
(
cj
,
A0
,
B3
,
C3
,
B3
);
RhsPacket
B_0
,
B1
;
EIGEN_GEBGP_ONESTEP
(
0
);
blB
+=
RhsProgress
;
blA
+=
2
*
Traits
::
LhsProgress
;
}
#undef EIGEN_GEBGP_ONESTEP
ResPacket
R0
,
R1
;
ResPacket
alphav
=
pset1
<
ResPacket
>
(
alpha
);
blB
+=
nr
;
R0
=
r0
.
loadPacket
(
0
*
Traits
::
ResPacketSize
);
R1
=
r0
.
loadPacket
(
1
*
Traits
::
ResPacketSize
);
traits
.
acc
(
C0
,
alphav
,
R0
);
traits
.
acc
(
C4
,
alphav
,
R1
);
r0
.
storePacket
(
0
*
Traits
::
ResPacketSize
,
R0
);
r0
.
storePacket
(
1
*
Traits
::
ResPacketSize
,
R1
);
}
}
res
[(
j2
+
0
)
*
resStride
+
i
]
+=
alpha
*
C0
;
res
[(
j2
+
1
)
*
resStride
+
i
]
+=
alpha
*
C1
;
if
(
nr
==
4
)
res
[(
j2
+
2
)
*
resStride
+
i
]
+=
alpha
*
C2
;
if
(
nr
==
4
)
res
[(
j2
+
3
)
*
resStride
+
i
]
+=
alpha
*
C3
;
}
}
// process remaining rhs/res columns one at a time
// => do the same but with nr==1
for
(
Index
j2
=
packet_cols
;
j2
<
cols
;
j2
++
)
//---------- Process 1 * LhsProgress rows at once ----------
if
(
mr
>=
1
*
Traits
::
LhsProgress
)
{
// unpack B
traits
.
unpackRhs
(
depth
,
&
blockB
[
j2
*
strideB
+
offsetB
],
unpackedB
);
for
(
Index
i
=
0
;
i
<
peeled_mc
;
i
+=
mr
)
// loops on each largest micro horizontal panel of lhs (1*LhsProgress x depth)
for
(
Index
i
=
peeled_mc2
;
i
<
peeled_mc1
;
i
+=
1
*
LhsProgress
)
{
const
LhsScalar
*
blA
=
&
blockA
[
i
*
strideA
+
offsetA
*
mr
];
prefetch
(
&
blA
[
0
]);
// loops on each largest micro vertical panel of rhs (depth * nr)
for
(
Index
j2
=
0
;
j2
<
packet_cols4
;
j2
+=
nr
)
{
// We select a 1*Traits::LhsProgress x nr micro block of res which is entirely
// stored into 1 x nr registers.
const
LhsScalar
*
blA
=
&
blockA
[
i
*
strideA
+
offsetA
*
(
1
*
Traits
::
LhsProgress
)];
prefetch
(
&
blA
[
0
]);
// gets res block as register
AccPacket
C0
,
C1
,
C2
,
C3
;
traits
.
initAcc
(
C0
);
traits
.
initAcc
(
C1
);
traits
.
initAcc
(
C2
);
traits
.
initAcc
(
C3
);
LinearMapper
r0
=
res
.
getLinearMapper
(
i
,
j2
+
0
);
LinearMapper
r1
=
res
.
getLinearMapper
(
i
,
j2
+
1
);
LinearMapper
r2
=
res
.
getLinearMapper
(
i
,
j2
+
2
);
LinearMapper
r3
=
res
.
getLinearMapper
(
i
,
j2
+
3
);
r0
.
prefetch
(
prefetch_res_offset
);
r1
.
prefetch
(
prefetch_res_offset
);
r2
.
prefetch
(
prefetch_res_offset
);
r3
.
prefetch
(
prefetch_res_offset
);
// performs "inner" products
const
RhsScalar
*
blB
=
&
blockB
[
j2
*
strideB
+
offsetB
*
nr
];
prefetch
(
&
blB
[
0
]);
LhsPacket
A0
;
for
(
Index
k
=
0
;
k
<
peeled_kc
;
k
+=
pk
)
{
EIGEN_ASM_COMMENT
(
"begin gebp micro kernel 1pX4"
);
RhsPacket
B_0
,
B1
,
B2
,
B3
;
#define EIGEN_GEBGP_ONESTEP(K) \
do { \
EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX4"); \
EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0); \
traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \
traits.madd(A0, B_0, C0, B_0); \
traits.madd(A0, B1, C1, B1); \
traits.madd(A0, B2, C2, B2); \
traits.madd(A0, B3, C3, B3); \
EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX4"); \
} while(false)
        internal::prefetch(blB+(48+0));
        EIGEN_GEBGP_ONESTEP(0);
        EIGEN_GEBGP_ONESTEP(1);
        EIGEN_GEBGP_ONESTEP(2);
        EIGEN_GEBGP_ONESTEP(3);
        internal::prefetch(blB+(48+16));
        EIGEN_GEBGP_ONESTEP(4);
        EIGEN_GEBGP_ONESTEP(5);
        EIGEN_GEBGP_ONESTEP(6);
        EIGEN_GEBGP_ONESTEP(7);

        blB += pk*4*RhsProgress;
        blA += pk*1*LhsProgress;

        EIGEN_ASM_COMMENT("end gebp micro kernel 1pX4");
      }
// process remaining peeled loop
for
(
Index
k
=
peeled_kc
;
k
<
depth
;
k
++
)
{
RhsPacket
B_0
,
B1
,
B2
,
B3
;
EIGEN_GEBGP_ONESTEP
(
0
);
blB
+=
4
*
RhsProgress
;
blA
+=
1
*
LhsProgress
;
}
#undef EIGEN_GEBGP_ONESTEP
// TODO move the res loads to the stores
ResPacket
R0
,
R1
;
ResPacket
alphav
=
pset1
<
ResPacket
>
(
alpha
);
// get res block as registers
AccPacket
C0
,
C4
;
traits
.
initAcc
(
C0
);
traits
.
initAcc
(
C4
);
R0
=
r0
.
loadPacket
(
0
*
Traits
::
ResPacketSize
);
R1
=
r1
.
loadPacket
(
0
*
Traits
::
ResPacketSize
);
traits
.
acc
(
C0
,
alphav
,
R0
);
traits
.
acc
(
C1
,
alphav
,
R1
);
r0
.
storePacket
(
0
*
Traits
::
ResPacketSize
,
R0
);
r1
.
storePacket
(
0
*
Traits
::
ResPacketSize
,
R1
);
R0
=
r2
.
loadPacket
(
0
*
Traits
::
ResPacketSize
);
R1
=
r3
.
loadPacket
(
0
*
Traits
::
ResPacketSize
);
traits
.
acc
(
C2
,
alphav
,
R0
);
traits
.
acc
(
C3
,
alphav
,
R1
);
r2
.
storePacket
(
0
*
Traits
::
ResPacketSize
,
R0
);
r3
.
storePacket
(
0
*
Traits
::
ResPacketSize
,
R1
);
}
const
RhsScalar
*
blB
=
unpackedB
;
for
(
Index
k
=
0
;
k
<
depth
;
k
++
)
// Deal with remaining columns of the rhs
for
(
Index
j2
=
packet_cols4
;
j2
<
cols
;
j2
++
)
{
LhsPacket
A0
,
A1
;
RhsPacket
B_0
;
RhsPacket
T0
;
traits
.
loadLhs
(
&
blA
[
0
*
LhsProgress
],
A0
);
traits
.
loadLhs
(
&
blA
[
1
*
LhsProgress
],
A1
);
traits
.
loadRhs
(
&
blB
[
0
*
RhsProgress
],
B_0
);
traits
.
madd
(
A0
,
B_0
,
C0
,
T0
);
traits
.
madd
(
A1
,
B_0
,
C4
,
B_0
);
// One column at a time
const
LhsScalar
*
blA
=
&
blockA
[
i
*
strideA
+
offsetA
*
(
1
*
Traits
::
LhsProgress
)];
prefetch
(
&
blA
[
0
]);
blB
+=
RhsProgress
;
blA
+=
2
*
LhsProgress
;
}
ResPacket
R0
,
R4
;
ResPacket
alphav
=
pset1
<
ResPacket
>
(
alpha
);
// gets res block as register
AccPacket
C0
;
traits
.
initAcc
(
C0
);
ResScalar
*
r0
=
&
res
[(
j2
+
0
)
*
resStride
+
i
]
;
LinearMapper
r0
=
res
.
getLinearMapper
(
i
,
j2
)
;
R0
=
ploadu
<
ResPacket
>
(
r0
);
R4
=
ploadu
<
ResPacket
>
(
r0
+
ResPacketSize
);
// performs "inner" products
const
RhsScalar
*
blB
=
&
blockB
[
j2
*
strideB
+
offsetB
];
LhsPacket
A0
;
traits
.
acc
(
C0
,
alphav
,
R0
);
traits
.
acc
(
C4
,
alphav
,
R4
);
for
(
Index
k
=
0
;
k
<
peeled_kc
;
k
+=
pk
)
{
EIGEN_ASM_COMMENT
(
"begin gebp micro kernel 1pX1"
);
RhsPacket
B_0
;
#define EIGEN_GEBGP_ONESTEP(K) \
do { \
EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX1"); \
EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0); \
traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
traits.madd(A0, B_0, C0, B_0); \
EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX1"); \
} while(false);
        EIGEN_GEBGP_ONESTEP(0);
        EIGEN_GEBGP_ONESTEP(1);
        EIGEN_GEBGP_ONESTEP(2);
        EIGEN_GEBGP_ONESTEP(3);
        EIGEN_GEBGP_ONESTEP(4);
        EIGEN_GEBGP_ONESTEP(5);
        EIGEN_GEBGP_ONESTEP(6);
        EIGEN_GEBGP_ONESTEP(7);

        blB += pk*RhsProgress;
        blA += pk*1*Traits::LhsProgress;

        EIGEN_ASM_COMMENT("end gebp micro kernel 1pX1");
      }
      pstoreu(r0, R0);
      pstoreu(r0 + ResPacketSize, R4);
      // process remaining peeled loop
      for(Index k=peeled_kc; k<depth; k++)
      {
        RhsPacket B_0;
        EIGEN_GEBGP_ONESTEP(0);
        blB += RhsProgress;
        blA += 1*Traits::LhsProgress;
      }
#undef EIGEN_GEBGP_ONESTEP
      ResPacket R0;
      ResPacket alphav = pset1<ResPacket>(alpha);

      R0 = r0.loadPacket(0 * Traits::ResPacketSize);
      traits.acc(C0, alphav, R0);
      r0.storePacket(0 * Traits::ResPacketSize, R0);
    }
}
if
(
rows
-
peeled_mc
>=
LhsProgress
)
}
//---------- Process remaining rows, 1 at once ----------
if
(
peeled_mc1
<
rows
)
{
// loop on each panel of the rhs
for
(
Index
j2
=
0
;
j2
<
packet_cols4
;
j2
+=
nr
)
{
Index
i
=
peeled_mc
;
const
LhsScalar
*
blA
=
&
blockA
[
i
*
strideA
+
offsetA
*
LhsProgress
];
prefetch
(
&
blA
[
0
]);
AccPacket
C0
;
traits
.
initAcc
(
C0
);
const
RhsScalar
*
blB
=
unpackedB
;
for
(
Index
k
=
0
;
k
<
depth
;
k
++
)
// loop on each row of the lhs (1*LhsProgress x depth)
for
(
Index
i
=
peeled_mc1
;
i
<
rows
;
i
+=
1
)
{
LhsPacket
A0
;
RhsPacket
B_0
;
traits
.
loadLhs
(
blA
,
A0
);
traits
.
loadRhs
(
blB
,
B_0
);
traits
.
madd
(
A0
,
B_0
,
C0
,
B_0
);
blB
+=
RhsProgress
;
blA
+=
LhsProgress
;
const
LhsScalar
*
blA
=
&
blockA
[
i
*
strideA
+
offsetA
];
prefetch
(
&
blA
[
0
]);
const
RhsScalar
*
blB
=
&
blockB
[
j2
*
strideB
+
offsetB
*
nr
];
// The following piece of code won't work for 512 bit registers
// Moreover, if LhsProgress==8 it assumes that there is a half packet of the same size
// as nr (which is currently 4) for the return type.
typedef
typename
unpacket_traits
<
SResPacket
>::
half
SResPacketHalf
;
if
((
SwappedTraits
::
LhsProgress
%
4
)
==
0
&&
(
SwappedTraits
::
LhsProgress
<=
8
)
&&
(
SwappedTraits
::
LhsProgress
!=
8
||
unpacket_traits
<
SResPacketHalf
>::
size
==
nr
))
{
SAccPacket
C0
,
C1
,
C2
,
C3
;
straits
.
initAcc
(
C0
);
straits
.
initAcc
(
C1
);
straits
.
initAcc
(
C2
);
straits
.
initAcc
(
C3
);
const
Index
spk
=
(
std
::
max
)(
1
,
SwappedTraits
::
LhsProgress
/
4
);
const
Index
endk
=
(
depth
/
spk
)
*
spk
;
const
Index
endk4
=
(
depth
/
(
spk
*
4
))
*
(
spk
*
4
);
Index
k
=
0
;
for
(;
k
<
endk4
;
k
+=
4
*
spk
)
{
SLhsPacket
A0
,
A1
;
SRhsPacket
B_0
,
B_1
;
straits
.
loadLhsUnaligned
(
blB
+
0
*
SwappedTraits
::
LhsProgress
,
A0
);
straits
.
loadLhsUnaligned
(
blB
+
1
*
SwappedTraits
::
LhsProgress
,
A1
);
straits
.
loadRhsQuad
(
blA
+
0
*
spk
,
B_0
);
straits
.
loadRhsQuad
(
blA
+
1
*
spk
,
B_1
);
straits
.
madd
(
A0
,
B_0
,
C0
,
B_0
);
straits
.
madd
(
A1
,
B_1
,
C1
,
B_1
);
straits
.
loadLhsUnaligned
(
blB
+
2
*
SwappedTraits
::
LhsProgress
,
A0
);
straits
.
loadLhsUnaligned
(
blB
+
3
*
SwappedTraits
::
LhsProgress
,
A1
);
straits
.
loadRhsQuad
(
blA
+
2
*
spk
,
B_0
);
straits
.
loadRhsQuad
(
blA
+
3
*
spk
,
B_1
);
straits
.
madd
(
A0
,
B_0
,
C2
,
B_0
);
straits
.
madd
(
A1
,
B_1
,
C3
,
B_1
);
blB
+=
4
*
SwappedTraits
::
LhsProgress
;
blA
+=
4
*
spk
;
}
C0
=
padd
(
padd
(
C0
,
C1
),
padd
(
C2
,
C3
));
for
(;
k
<
endk
;
k
+=
spk
)
{
SLhsPacket
A0
;
SRhsPacket
B_0
;
straits
.
loadLhsUnaligned
(
blB
,
A0
);
straits
.
loadRhsQuad
(
blA
,
B_0
);
straits
.
madd
(
A0
,
B_0
,
C0
,
B_0
);
blB
+=
SwappedTraits
::
LhsProgress
;
blA
+=
spk
;
}
if
(
SwappedTraits
::
LhsProgress
==
8
)
{
// Special case where we have to first reduce the accumulation register C0
typedef
typename
conditional
<
SwappedTraits
::
LhsProgress
>=
8
,
typename
unpacket_traits
<
SResPacket
>::
half
,
SResPacket
>::
type
SResPacketHalf
;
typedef
typename
conditional
<
SwappedTraits
::
LhsProgress
>=
8
,
typename
unpacket_traits
<
SLhsPacket
>::
half
,
SLhsPacket
>::
type
SLhsPacketHalf
;
typedef
typename
conditional
<
SwappedTraits
::
LhsProgress
>=
8
,
typename
unpacket_traits
<
SLhsPacket
>::
half
,
SRhsPacket
>::
type
SRhsPacketHalf
;
typedef
typename
conditional
<
SwappedTraits
::
LhsProgress
>=
8
,
typename
unpacket_traits
<
SAccPacket
>::
half
,
SAccPacket
>::
type
SAccPacketHalf
;
SResPacketHalf
R
=
res
.
template
gatherPacket
<
SResPacketHalf
>(
i
,
j2
);
SResPacketHalf
alphav
=
pset1
<
SResPacketHalf
>
(
alpha
);
if
(
depth
-
endk
>
0
)
{
// We have to handle the last row of the rhs which corresponds to a half-packet
SLhsPacketHalf
a0
;
SRhsPacketHalf
b0
;
straits
.
loadLhsUnaligned
(
blB
,
a0
);
straits
.
loadRhs
(
blA
,
b0
);
SAccPacketHalf
c0
=
predux_downto4
(
C0
);
straits
.
madd
(
a0
,
b0
,
c0
,
b0
);
straits
.
acc
(
c0
,
alphav
,
R
);
}
else
{
straits
.
acc
(
predux_downto4
(
C0
),
alphav
,
R
);
}
res
.
scatterPacket
(
i
,
j2
,
R
);
}
else
{
SResPacket
R
=
res
.
template
gatherPacket
<
SResPacket
>(
i
,
j2
);
SResPacket
alphav
=
pset1
<
SResPacket
>
(
alpha
);
straits
.
acc
(
C0
,
alphav
,
R
);
res
.
scatterPacket
(
i
,
j2
,
R
);
}
}
else
// scalar path
{
// get a 1 x 4 res block as registers
ResScalar
C0
(
0
),
C1
(
0
),
C2
(
0
),
C3
(
0
);
for
(
Index
k
=
0
;
k
<
depth
;
k
++
)
{
LhsScalar
A0
;
RhsScalar
B_0
,
B_1
;
A0
=
blA
[
k
];
B_0
=
blB
[
0
];
B_1
=
blB
[
1
];
CJMADD
(
cj
,
A0
,
B_0
,
C0
,
B_0
);
CJMADD
(
cj
,
A0
,
B_1
,
C1
,
B_1
);
B_0
=
blB
[
2
];
B_1
=
blB
[
3
];
CJMADD
(
cj
,
A0
,
B_0
,
C2
,
B_0
);
CJMADD
(
cj
,
A0
,
B_1
,
C3
,
B_1
);
blB
+=
4
;
}
res
(
i
,
j2
+
0
)
+=
alpha
*
C0
;
res
(
i
,
j2
+
1
)
+=
alpha
*
C1
;
res
(
i
,
j2
+
2
)
+=
alpha
*
C2
;
res
(
i
,
j2
+
3
)
+=
alpha
*
C3
;
}
}
ResPacket
alphav
=
pset1
<
ResPacket
>
(
alpha
);
ResPacket
R0
=
ploadu
<
ResPacket
>
(
&
res
[(
j2
+
0
)
*
resStride
+
i
]);
traits
.
acc
(
C0
,
alphav
,
R0
);
pstoreu
(
&
res
[(
j2
+
0
)
*
resStride
+
i
],
R0
);
}
for
(
Index
i
=
peeled_mc2
;
i
<
rows
;
i
++
)
// remaining columns
for
(
Index
j2
=
packet_cols4
;
j2
<
cols
;
j2
++
)
{
const
LhsScalar
*
blA
=
&
blockA
[
i
*
strideA
+
offsetA
];
prefetch
(
&
blA
[
0
]);
// gets a 1 x 1 res block as registers
ResScalar
C0
(
0
);
// FIXME directly use blockB ??
const
RhsScalar
*
blB
=
&
blockB
[
j2
*
strideB
+
offsetB
];
for
(
Index
k
=
0
;
k
<
depth
;
k
++
)
// loop on each row of the lhs (1*LhsProgress x depth)
for
(
Index
i
=
peeled_mc1
;
i
<
rows
;
i
+=
1
)
{
LhsScalar
A0
=
blA
[
k
];
RhsScalar
B_0
=
blB
[
k
];
MADD
(
cj
,
A0
,
B_0
,
C0
,
B_0
);
const
LhsScalar
*
blA
=
&
blockA
[
i
*
strideA
+
offsetA
];
prefetch
(
&
blA
[
0
]);
// gets a 1 x 1 res block as registers
ResScalar
C0
(
0
);
const
RhsScalar
*
blB
=
&
blockB
[
j2
*
strideB
+
offsetB
];
for
(
Index
k
=
0
;
k
<
depth
;
k
++
)
{
LhsScalar
A0
=
blA
[
k
];
RhsScalar
B_0
=
blB
[
k
];
CJMADD
(
cj
,
A0
,
B_0
,
C0
,
B_0
);
}
res
(
i
,
j2
)
+=
alpha
*
C0
;
}
res
[(
j2
+
0
)
*
resStride
+
i
]
+=
alpha
*
C0
;
}
}
}
...
...
@@ -1114,81 +1685,193 @@ EIGEN_ASM_COMMENT("mybegin4");
//
// 32 33 34 35 ...
// 36 36 38 39 ...
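// The packing order illustrated above groups the lhs into contiguous panels of
// Pack1 rows, laid out k-major inside each panel. A simplified scalar sketch of
// that ColMajor packing (illustrative only; no SIMD, no conjugation, no
// PanelMode padding; names are hypothetical, not Eigen API):
#include <vector>
#include <cstddef>

std::vector<double> pack_lhs_colmajor(const double* lhs, std::size_t lhsStride,
                                      std::size_t rows, std::size_t depth,
                                      std::size_t pack)
{
  std::vector<double> blockA;
  blockA.reserve(rows*depth);
  std::size_t i = 0;
  for (; i + pack <= rows; i += pack)              // full panels of `pack` rows
    for (std::size_t k = 0; k < depth; ++k)
      for (std::size_t w = 0; w < pack; ++w)
        blockA.push_back(lhs[(i+w) + k*lhsStride]);  // column-major source
  for (; i < rows; ++i)                            // leftover rows, one at a time
    for (std::size_t k = 0; k < depth; ++k)
      blockA.push_back(lhs[i + k*lhsStride]);
  return blockA;
}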
template<typename Scalar, typename Index, int Pack1, int Pack2, int StorageOrder, bool Conjugate, bool PanelMode>
struct gemm_pack_lhs
template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode>
{
  EIGEN_DONT_INLINE void operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, Index stride=0, Index offset=0);
  typedef typename DataMapper::LinearMapper LinearMapper;
  EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
};
template<typename Scalar, typename Index, int Pack1, int Pack2, int StorageOrder, bool Conjugate, bool PanelMode>
EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, StorageOrder, Conjugate, PanelMode>
  ::operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, Index stride, Index offset)
template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode>
  ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
{
  typedef typename packet_traits<Scalar>::type Packet;
  enum { PacketSize = packet_traits<Scalar>::size };

  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
  EIGEN_UNUSED_VARIABLE(stride)
  EIGEN_UNUSED_VARIABLE(offset)
  EIGEN_UNUSED_VARIABLE(stride);
  EIGEN_UNUSED_VARIABLE(offset);
  eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
  eigen_assert( (StorageOrder==RowMajor) || ((Pack1%PacketSize)==0 && Pack1<=4*PacketSize) );
  eigen_assert( ((Pack1%PacketSize)==0 && Pack1<=4*PacketSize) || (Pack1<=4) );
conj_if
<
NumTraits
<
Scalar
>::
IsComplex
&&
Conjugate
>
cj
;
const_blas_data_mapper
<
Scalar
,
Index
,
StorageOrder
>
lhs
(
_lhs
,
lhsStride
);
Index
count
=
0
;
Index
peeled_mc
=
(
rows
/
Pack1
)
*
Pack1
;
for
(
Index
i
=
0
;
i
<
peeled_mc
;
i
+=
Pack1
)
const
Index
peeled_mc3
=
Pack1
>=
3
*
PacketSize
?
(
rows
/
(
3
*
PacketSize
))
*
(
3
*
PacketSize
)
:
0
;
const
Index
peeled_mc2
=
Pack1
>=
2
*
PacketSize
?
peeled_mc3
+
((
rows
-
peeled_mc3
)
/
(
2
*
PacketSize
))
*
(
2
*
PacketSize
)
:
0
;
const
Index
peeled_mc1
=
Pack1
>=
1
*
PacketSize
?
(
rows
/
(
1
*
PacketSize
))
*
(
1
*
PacketSize
)
:
0
;
const
Index
peeled_mc0
=
Pack2
>=
1
*
PacketSize
?
peeled_mc1
:
Pack2
>
1
?
(
rows
/
Pack2
)
*
Pack2
:
0
;
Index
i
=
0
;
// Pack 3 packets
if
(
Pack1
>=
3
*
PacketSize
)
{
if
(
PanelMode
)
count
+=
Pack1
*
offset
;
for
(;
i
<
peeled_mc3
;
i
+=
3
*
PacketSize
)
{
if
(
PanelMode
)
count
+=
(
3
*
PacketSize
)
*
offset
;
if
(
StorageOrder
==
ColMajor
)
for
(
Index
k
=
0
;
k
<
depth
;
k
++
)
{
Packet
A
,
B
,
C
;
A
=
lhs
.
loadPacket
(
i
+
0
*
PacketSize
,
k
);
B
=
lhs
.
loadPacket
(
i
+
1
*
PacketSize
,
k
);
C
=
lhs
.
loadPacket
(
i
+
2
*
PacketSize
,
k
);
pstore
(
blockA
+
count
,
cj
.
pconj
(
A
));
count
+=
PacketSize
;
pstore
(
blockA
+
count
,
cj
.
pconj
(
B
));
count
+=
PacketSize
;
pstore
(
blockA
+
count
,
cj
.
pconj
(
C
));
count
+=
PacketSize
;
}
if
(
PanelMode
)
count
+=
(
3
*
PacketSize
)
*
(
stride
-
offset
-
depth
);
}
}
// Pack 2 packets
if
(
Pack1
>=
2
*
PacketSize
)
{
for
(;
i
<
peeled_mc2
;
i
+=
2
*
PacketSize
)
{
if
(
PanelMode
)
count
+=
(
2
*
PacketSize
)
*
offset
;
for
(
Index
k
=
0
;
k
<
depth
;
k
++
)
{
Packet
A
,
B
,
C
,
D
;
if
(
Pack1
>=
1
*
PacketSize
)
A
=
ploadu
<
Packet
>
(
&
lhs
(
i
+
0
*
PacketSize
,
k
));
if
(
Pack1
>=
2
*
PacketSize
)
B
=
ploadu
<
Packet
>
(
&
lhs
(
i
+
1
*
PacketSize
,
k
));
if
(
Pack1
>=
3
*
PacketSize
)
C
=
ploadu
<
Packet
>
(
&
lhs
(
i
+
2
*
PacketSize
,
k
));
if
(
Pack1
>=
4
*
PacketSize
)
D
=
ploadu
<
Packet
>
(
&
lhs
(
i
+
3
*
PacketSize
,
k
));
if
(
Pack1
>=
1
*
PacketSize
)
{
pstore
(
blockA
+
count
,
cj
.
pconj
(
A
));
count
+=
PacketSize
;
}
if
(
Pack1
>=
2
*
PacketSize
)
{
pstore
(
blockA
+
count
,
cj
.
pconj
(
B
));
count
+=
PacketSize
;
}
if
(
Pack1
>=
3
*
PacketSize
)
{
pstore
(
blockA
+
count
,
cj
.
pconj
(
C
));
count
+=
PacketSize
;
}
if
(
Pack1
>=
4
*
PacketSize
)
{
pstore
(
blockA
+
count
,
cj
.
pconj
(
D
));
count
+=
PacketSize
;
}
Packet
A
,
B
;
A
=
lhs
.
loadPacket
(
i
+
0
*
PacketSize
,
k
);
B
=
lhs
.
loadPacket
(
i
+
1
*
PacketSize
,
k
);
pstore
(
blockA
+
count
,
cj
.
pconj
(
A
));
count
+=
PacketSize
;
pstore
(
blockA
+
count
,
cj
.
pconj
(
B
));
count
+=
PacketSize
;
}
if
(
PanelMode
)
count
+=
(
2
*
PacketSize
)
*
(
stride
-
offset
-
depth
);
}
else
}
// Pack 1 packets
if
(
Pack1
>=
1
*
PacketSize
)
{
for
(;
i
<
peeled_mc1
;
i
+=
1
*
PacketSize
)
{
if
(
PanelMode
)
count
+=
(
1
*
PacketSize
)
*
offset
;
for
(
Index
k
=
0
;
k
<
depth
;
k
++
)
{
Packet
A
;
A
=
lhs
.
loadPacket
(
i
+
0
*
PacketSize
,
k
);
pstore
(
blockA
+
count
,
cj
.
pconj
(
A
));
count
+=
PacketSize
;
}
if
(
PanelMode
)
count
+=
(
1
*
PacketSize
)
*
(
stride
-
offset
-
depth
);
}
}
// Pack scalars
if
(
Pack2
<
PacketSize
&&
Pack2
>
1
)
{
for
(;
i
<
peeled_mc0
;
i
+=
Pack2
)
{
if
(
PanelMode
)
count
+=
Pack2
*
offset
;
for
(
Index
k
=
0
;
k
<
depth
;
k
++
)
for
(
Index
w
=
0
;
w
<
Pack2
;
w
++
)
blockA
[
count
++
]
=
cj
(
lhs
(
i
+
w
,
k
));
if
(
PanelMode
)
count
+=
Pack2
*
(
stride
-
offset
-
depth
);
}
}
for
(;
i
<
rows
;
i
++
)
{
if
(
PanelMode
)
count
+=
offset
;
for
(
Index
k
=
0
;
k
<
depth
;
k
++
)
blockA
[
count
++
]
=
cj
(
lhs
(
i
,
k
));
if
(
PanelMode
)
count
+=
(
stride
-
offset
-
depth
);
}
}
template
<
typename
Scalar
,
typename
Index
,
typename
DataMapper
,
int
Pack1
,
int
Pack2
,
bool
Conjugate
,
bool
PanelMode
>
struct
gemm_pack_lhs
<
Scalar
,
Index
,
DataMapper
,
Pack1
,
Pack2
,
RowMajor
,
Conjugate
,
PanelMode
>
{
typedef
typename
DataMapper
::
LinearMapper
LinearMapper
;
EIGEN_DONT_INLINE
void
operator
()(
Scalar
*
blockA
,
const
DataMapper
&
lhs
,
Index
depth
,
Index
rows
,
Index
stride
=
0
,
Index
offset
=
0
);
};
template
<
typename
Scalar
,
typename
Index
,
typename
DataMapper
,
int
Pack1
,
int
Pack2
,
bool
Conjugate
,
bool
PanelMode
>
EIGEN_DONT_INLINE
void
gemm_pack_lhs
<
Scalar
,
Index
,
DataMapper
,
Pack1
,
Pack2
,
RowMajor
,
Conjugate
,
PanelMode
>
::
operator
()(
Scalar
*
blockA
,
const
DataMapper
&
lhs
,
Index
depth
,
Index
rows
,
Index
stride
,
Index
offset
)
{
typedef
typename
packet_traits
<
Scalar
>::
type
Packet
;
enum
{
PacketSize
=
packet_traits
<
Scalar
>::
size
};
EIGEN_ASM_COMMENT
(
"EIGEN PRODUCT PACK LHS"
);
EIGEN_UNUSED_VARIABLE
(
stride
);
EIGEN_UNUSED_VARIABLE
(
offset
);
eigen_assert
(((
!
PanelMode
)
&&
stride
==
0
&&
offset
==
0
)
||
(
PanelMode
&&
stride
>=
depth
&&
offset
<=
stride
));
conj_if
<
NumTraits
<
Scalar
>::
IsComplex
&&
Conjugate
>
cj
;
Index
count
=
0
;
// const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
// const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
// const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0;
int
pack
=
Pack1
;
Index
i
=
0
;
while
(
pack
>
0
)
{
Index
remaining_rows
=
rows
-
i
;
Index
peeled_mc
=
i
+
(
remaining_rows
/
pack
)
*
pack
;
for
(;
i
<
peeled_mc
;
i
+=
pack
)
{
if
(
PanelMode
)
count
+=
pack
*
offset
;
const
Index
peeled_k
=
(
depth
/
PacketSize
)
*
PacketSize
;
Index
k
=
0
;
if
(
pack
>=
PacketSize
)
{
for
(;
k
<
peeled_k
;
k
+=
PacketSize
)
{
for
(
Index
m
=
0
;
m
<
pack
;
m
+=
PacketSize
)
{
PacketBlock
<
Packet
>
kernel
;
for
(
int
p
=
0
;
p
<
PacketSize
;
++
p
)
kernel
.
packet
[
p
]
=
lhs
.
loadPacket
(
i
+
p
+
m
,
k
);
ptranspose
(
kernel
);
for
(
int
p
=
0
;
p
<
PacketSize
;
++
p
)
pstore
(
blockA
+
count
+
m
+
(
pack
)
*
p
,
cj
.
pconj
(
kernel
.
packet
[
p
]));
}
count
+=
PacketSize
*
pack
;
}
}
for
(;
k
<
depth
;
k
++
)
{
// TODO add a vectorized transpose here
Index
w
=
0
;
for
(;
w
<
P
ack
1
-
3
;
w
+=
4
)
for
(;
w
<
p
ack
-
3
;
w
+=
4
)
{
Scalar
a
(
cj
(
lhs
(
i
+
w
+
0
,
k
))),
b
(
cj
(
lhs
(
i
+
w
+
1
,
k
))),
c
(
cj
(
lhs
(
i
+
w
+
2
,
k
))),
d
(
cj
(
lhs
(
i
+
w
+
3
,
k
)));
b
(
cj
(
lhs
(
i
+
w
+
1
,
k
))),
c
(
cj
(
lhs
(
i
+
w
+
2
,
k
))),
d
(
cj
(
lhs
(
i
+
w
+
3
,
k
)));
blockA
[
count
++
]
=
a
;
blockA
[
count
++
]
=
b
;
blockA
[
count
++
]
=
c
;
blockA
[
count
++
]
=
d
;
}
if
(
P
ack
1
%
4
)
for
(;
w
<
P
ack
1
;
++
w
)
if
(
p
ack
%
4
)
for
(;
w
<
p
ack
;
++
w
)
blockA
[
count
++
]
=
cj
(
lhs
(
i
+
w
,
k
));
}
if
(
PanelMode
)
count
+=
pack
*
(
stride
-
offset
-
depth
);
}
if
(
PanelMode
)
count
+=
Pack1
*
(
stride
-
offset
-
depth
);
}
if
(
rows
-
peeled_mc
>=
Pack2
)
{
if
(
PanelMode
)
count
+=
Pack2
*
offset
;
for
(
Index
k
=
0
;
k
<
depth
;
k
++
)
for
(
Index
w
=
0
;
w
<
Pack2
;
w
++
)
blockA
[
count
++
]
=
cj
(
lhs
(
peeled_mc
+
w
,
k
));
if
(
PanelMode
)
count
+=
Pack2
*
(
stride
-
offset
-
depth
);
peeled_mc
+=
Pack2
;
pack
-=
PacketSize
;
if
(
pack
<
Pack2
&&
(
pack
+
PacketSize
)
!=
Pack2
)
pack
=
Pack2
;
}
for
(
Index
i
=
peeled_mc
;
i
<
rows
;
i
++
)
for
(;
i
<
rows
;
i
++
)
{
if
(
PanelMode
)
count
+=
offset
;
for
(
Index
k
=
0
;
k
<
depth
;
k
++
)
...
...
@@ -1204,53 +1887,123 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, StorageOrder,
// 4 5 6 7 16 17 18 19 25 28
// 8 9 10 11 20 21 22 23 26 29
// . . . . . . . . . .
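// Analogous scalar sketch for the rhs packing described above (illustrative
// only; hypothetical names, no conjugation or PanelMode padding): nr
// consecutive columns are interleaved along k so the kernel reads B_0..B3 for
// one k from contiguous memory.
#include <vector>
#include <cstddef>

std::vector<double> pack_rhs_colmajor(const double* rhs, std::size_t rhsStride,
                                      std::size_t depth, std::size_t cols,
                                      std::size_t nr)
{
  std::vector<double> blockB;
  blockB.reserve(depth*cols);
  std::size_t j = 0;
  for (; j + nr <= cols; j += nr)                  // panels of nr columns
    for (std::size_t k = 0; k < depth; ++k)
      for (std::size_t c = 0; c < nr; ++c)
        blockB.push_back(rhs[k + (j+c)*rhsStride]);
  for (; j < cols; ++j)                            // remaining columns one at a time
    for (std::size_t k = 0; k < depth; ++k)
      blockB.push_back(rhs[k + j*rhsStride]);
  return blockB;
}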
template
<
typename
Scalar
,
typename
Index
,
int
nr
,
bool
Conjugate
,
bool
PanelMode
>
struct
gemm_pack_rhs
<
Scalar
,
Index
,
nr
,
ColMajor
,
Conjugate
,
PanelMode
>
template
<
typename
Scalar
,
typename
Index
,
typename
DataMapper
,
int
nr
,
bool
Conjugate
,
bool
PanelMode
>
struct
gemm_pack_rhs
<
Scalar
,
Index
,
DataMapper
,
nr
,
ColMajor
,
Conjugate
,
PanelMode
>
{
typedef
typename
packet_traits
<
Scalar
>::
type
Packet
;
typedef
typename
DataMapper
::
LinearMapper
LinearMapper
;
enum
{
PacketSize
=
packet_traits
<
Scalar
>::
size
};
EIGEN_DONT_INLINE
void
operator
()(
Scalar
*
blockB
,
const
Scalar
*
rhs
,
Index
rhsStride
,
Index
depth
,
Index
cols
,
Index
stride
=
0
,
Index
offset
=
0
);
EIGEN_DONT_INLINE
void
operator
()(
Scalar
*
blockB
,
const
DataMapper
&
rhs
,
Index
depth
,
Index
cols
,
Index
stride
=
0
,
Index
offset
=
0
);
};
template
<
typename
Scalar
,
typename
Index
,
int
nr
,
bool
Conjugate
,
bool
PanelMode
>
EIGEN_DONT_INLINE
void
gemm_pack_rhs
<
Scalar
,
Index
,
nr
,
ColMajor
,
Conjugate
,
PanelMode
>
::
operator
()(
Scalar
*
blockB
,
const
Scalar
*
rhs
,
Index
rhsStride
,
Index
depth
,
Index
cols
,
Index
stride
,
Index
offset
)
template
<
typename
Scalar
,
typename
Index
,
typename
DataMapper
,
int
nr
,
bool
Conjugate
,
bool
PanelMode
>
EIGEN_DONT_INLINE
void
gemm_pack_rhs
<
Scalar
,
Index
,
DataMapper
,
nr
,
ColMajor
,
Conjugate
,
PanelMode
>
::
operator
()(
Scalar
*
blockB
,
const
DataMapper
&
rhs
,
Index
depth
,
Index
cols
,
Index
stride
,
Index
offset
)
{
EIGEN_ASM_COMMENT
(
"EIGEN PRODUCT PACK RHS COLMAJOR"
);
EIGEN_UNUSED_VARIABLE
(
stride
)
EIGEN_UNUSED_VARIABLE
(
offset
)
EIGEN_UNUSED_VARIABLE
(
stride
)
;
EIGEN_UNUSED_VARIABLE
(
offset
)
;
eigen_assert
(((
!
PanelMode
)
&&
stride
==
0
&&
offset
==
0
)
||
(
PanelMode
&&
stride
>=
depth
&&
offset
<=
stride
));
conj_if
<
NumTraits
<
Scalar
>::
IsComplex
&&
Conjugate
>
cj
;
Index
packet_cols
=
(
cols
/
nr
)
*
nr
;
Index
packet_cols8
=
nr
>=
8
?
(
cols
/
8
)
*
8
:
0
;
Index
packet_cols4
=
nr
>=
4
?
(
cols
/
4
)
*
4
:
0
;
Index
count
=
0
;
for
(
Index
j2
=
0
;
j2
<
packet_cols
;
j2
+=
nr
)
const
Index
peeled_k
=
(
depth
/
PacketSize
)
*
PacketSize
;
// if(nr>=8)
// {
// for(Index j2=0; j2<packet_cols8; j2+=8)
// {
// // skip what we have before
// if(PanelMode) count += 8 * offset;
// const Scalar* b0 = &rhs[(j2+0)*rhsStride];
// const Scalar* b1 = &rhs[(j2+1)*rhsStride];
// const Scalar* b2 = &rhs[(j2+2)*rhsStride];
// const Scalar* b3 = &rhs[(j2+3)*rhsStride];
// const Scalar* b4 = &rhs[(j2+4)*rhsStride];
// const Scalar* b5 = &rhs[(j2+5)*rhsStride];
// const Scalar* b6 = &rhs[(j2+6)*rhsStride];
// const Scalar* b7 = &rhs[(j2+7)*rhsStride];
// Index k=0;
// if(PacketSize==8) // TODO enable vectorized transposition for PacketSize==4
// {
// for(; k<peeled_k; k+=PacketSize) {
// PacketBlock<Packet> kernel;
// for (int p = 0; p < PacketSize; ++p) {
// kernel.packet[p] = ploadu<Packet>(&rhs[(j2+p)*rhsStride+k]);
// }
// ptranspose(kernel);
// for (int p = 0; p < PacketSize; ++p) {
// pstoreu(blockB+count, cj.pconj(kernel.packet[p]));
// count+=PacketSize;
// }
// }
// }
// for(; k<depth; k++)
// {
// blockB[count+0] = cj(b0[k]);
// blockB[count+1] = cj(b1[k]);
// blockB[count+2] = cj(b2[k]);
// blockB[count+3] = cj(b3[k]);
// blockB[count+4] = cj(b4[k]);
// blockB[count+5] = cj(b5[k]);
// blockB[count+6] = cj(b6[k]);
// blockB[count+7] = cj(b7[k]);
// count += 8;
// }
// // skip what we have after
// if(PanelMode) count += 8 * (stride-offset-depth);
// }
// }
if
(
nr
>=
4
)
{
// skip what we have before
if
(
PanelMode
)
count
+=
nr
*
offset
;
const
Scalar
*
b0
=
&
rhs
[(
j2
+
0
)
*
rhsStride
];
const
Scalar
*
b1
=
&
rhs
[(
j2
+
1
)
*
rhsStride
];
const
Scalar
*
b2
=
&
rhs
[(
j2
+
2
)
*
rhsStride
];
const
Scalar
*
b3
=
&
rhs
[(
j2
+
3
)
*
rhsStride
];
for
(
Index
k
=
0
;
k
<
depth
;
k
++
)
for
(
Index
j2
=
packet_cols8
;
j2
<
packet_cols4
;
j2
+=
4
)
{
blockB
[
count
+
0
]
=
cj
(
b0
[
k
]);
blockB
[
count
+
1
]
=
cj
(
b1
[
k
]);
if
(
nr
==
4
)
blockB
[
count
+
2
]
=
cj
(
b2
[
k
]);
if
(
nr
==
4
)
blockB
[
count
+
3
]
=
cj
(
b3
[
k
]);
count
+=
nr
;
// skip what we have before
if
(
PanelMode
)
count
+=
4
*
offset
;
const
LinearMapper
dm0
=
rhs
.
getLinearMapper
(
0
,
j2
+
0
);
const
LinearMapper
dm1
=
rhs
.
getLinearMapper
(
0
,
j2
+
1
);
const
LinearMapper
dm2
=
rhs
.
getLinearMapper
(
0
,
j2
+
2
);
const
LinearMapper
dm3
=
rhs
.
getLinearMapper
(
0
,
j2
+
3
);
Index
k
=
0
;
if
((
PacketSize
%
4
)
==
0
)
// TODO enable vectorized transposition for PacketSize==2 ??
{
for
(;
k
<
peeled_k
;
k
+=
PacketSize
)
{
PacketBlock
<
Packet
,(
PacketSize
%
4
)
==
0
?
4
:
PacketSize
>
kernel
;
kernel
.
packet
[
0
]
=
dm0
.
loadPacket
(
k
);
kernel
.
packet
[
1
%
PacketSize
]
=
dm1
.
loadPacket
(
k
);
kernel
.
packet
[
2
%
PacketSize
]
=
dm2
.
loadPacket
(
k
);
kernel
.
packet
[
3
%
PacketSize
]
=
dm3
.
loadPacket
(
k
);
ptranspose
(
kernel
);
pstoreu
(
blockB
+
count
+
0
*
PacketSize
,
cj
.
pconj
(
kernel
.
packet
[
0
]));
pstoreu
(
blockB
+
count
+
1
*
PacketSize
,
cj
.
pconj
(
kernel
.
packet
[
1
%
PacketSize
]));
pstoreu
(
blockB
+
count
+
2
*
PacketSize
,
cj
.
pconj
(
kernel
.
packet
[
2
%
PacketSize
]));
pstoreu
(
blockB
+
count
+
3
*
PacketSize
,
cj
.
pconj
(
kernel
.
packet
[
3
%
PacketSize
]));
count
+=
4
*
PacketSize
;
}
}
for
(;
k
<
depth
;
k
++
)
{
blockB
[
count
+
0
]
=
cj
(
dm0
(
k
));
blockB
[
count
+
1
]
=
cj
(
dm1
(
k
));
blockB
[
count
+
2
]
=
cj
(
dm2
(
k
));
blockB
[
count
+
3
]
=
cj
(
dm3
(
k
));
count
+=
4
;
}
// skip what we have after
if
(
PanelMode
)
count
+=
4
*
(
stride
-
offset
-
depth
);
}
// skip what we have after
if
(
PanelMode
)
count
+=
nr
*
(
stride
-
offset
-
depth
);
}
// copy the remaining columns one at a time (nr==1)
for
(
Index
j2
=
packet_cols
;
j2
<
cols
;
++
j2
)
for
(
Index
j2
=
packet_cols
4
;
j2
<
cols
;
++
j2
)
{
if
(
PanelMode
)
count
+=
offset
;
const
Scalar
*
b
0
=
&
rhs
[(
j2
+
0
)
*
rhsStride
]
;
const
LinearMapper
dm
0
=
rhs
.
getLinearMapper
(
0
,
j2
)
;
for
(
Index
k
=
0
;
k
<
depth
;
k
++
)
{
blockB
[
count
]
=
cj
(
b0
[
k
]
);
blockB
[
count
]
=
cj
(
dm0
(
k
)
);
count
+=
1
;
}
if
(
PanelMode
)
count
+=
(
stride
-
offset
-
depth
);
...
...
@@ -1258,48 +2011,93 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, Pan
}
// this version is optimized for row major matrices
template
<
typename
Scalar
,
typename
Index
,
int
nr
,
bool
Conjugate
,
bool
PanelMode
>
struct
gemm_pack_rhs
<
Scalar
,
Index
,
nr
,
RowMajor
,
Conjugate
,
PanelMode
>
template
<
typename
Scalar
,
typename
Index
,
typename
DataMapper
,
int
nr
,
bool
Conjugate
,
bool
PanelMode
>
struct
gemm_pack_rhs
<
Scalar
,
Index
,
DataMapper
,
nr
,
RowMajor
,
Conjugate
,
PanelMode
>
{
typedef
typename
packet_traits
<
Scalar
>::
type
Packet
;
typedef
typename
DataMapper
::
LinearMapper
LinearMapper
;
enum
{
PacketSize
=
packet_traits
<
Scalar
>::
size
};
EIGEN_DONT_INLINE
void
operator
()(
Scalar
*
blockB
,
const
Scalar
*
rhs
,
Index
rhsStride
,
Index
depth
,
Index
cols
,
Index
stride
=
0
,
Index
offset
=
0
);
EIGEN_DONT_INLINE
void
operator
()(
Scalar
*
blockB
,
const
DataMapper
&
rhs
,
Index
depth
,
Index
cols
,
Index
stride
=
0
,
Index
offset
=
0
);
};
template
<
typename
Scalar
,
typename
Index
,
int
nr
,
bool
Conjugate
,
bool
PanelMode
>
EIGEN_DONT_INLINE
void
gemm_pack_rhs
<
Scalar
,
Index
,
nr
,
RowMajor
,
Conjugate
,
PanelMode
>
::
operator
()(
Scalar
*
blockB
,
const
Scalar
*
rhs
,
Index
rhsStride
,
Index
depth
,
Index
cols
,
Index
stride
,
Index
offset
)
template
<
typename
Scalar
,
typename
Index
,
typename
DataMapper
,
int
nr
,
bool
Conjugate
,
bool
PanelMode
>
EIGEN_DONT_INLINE
void
gemm_pack_rhs
<
Scalar
,
Index
,
DataMapper
,
nr
,
RowMajor
,
Conjugate
,
PanelMode
>
::
operator
()(
Scalar
*
blockB
,
const
DataMapper
&
rhs
,
Index
depth
,
Index
cols
,
Index
stride
,
Index
offset
)
{
EIGEN_ASM_COMMENT
(
"EIGEN PRODUCT PACK RHS ROWMAJOR"
);
EIGEN_UNUSED_VARIABLE
(
stride
)
EIGEN_UNUSED_VARIABLE
(
offset
)
EIGEN_UNUSED_VARIABLE
(
stride
)
;
EIGEN_UNUSED_VARIABLE
(
offset
)
;
eigen_assert
(((
!
PanelMode
)
&&
stride
==
0
&&
offset
==
0
)
||
(
PanelMode
&&
stride
>=
depth
&&
offset
<=
stride
));
conj_if
<
NumTraits
<
Scalar
>::
IsComplex
&&
Conjugate
>
cj
;
Index
packet_cols
=
(
cols
/
nr
)
*
nr
;
Index
packet_cols8
=
nr
>=
8
?
(
cols
/
8
)
*
8
:
0
;
Index
packet_cols4
=
nr
>=
4
?
(
cols
/
4
)
*
4
:
0
;
Index
count
=
0
;
for
(
Index
j2
=
0
;
j2
<
packet_cols
;
j2
+=
nr
)
// if(nr>=8)
// {
// for(Index j2=0; j2<packet_cols8; j2+=8)
// {
// // skip what we have before
// if(PanelMode) count += 8 * offset;
// for(Index k=0; k<depth; k++)
// {
// if (PacketSize==8) {
// Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]);
// pstoreu(blockB+count, cj.pconj(A));
// } else if (PacketSize==4) {
// Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]);
// Packet B = ploadu<Packet>(&rhs[k*rhsStride + j2 + PacketSize]);
// pstoreu(blockB+count, cj.pconj(A));
// pstoreu(blockB+count+PacketSize, cj.pconj(B));
// } else {
// const Scalar* b0 = &rhs[k*rhsStride + j2];
// blockB[count+0] = cj(b0[0]);
// blockB[count+1] = cj(b0[1]);
// blockB[count+2] = cj(b0[2]);
// blockB[count+3] = cj(b0[3]);
// blockB[count+4] = cj(b0[4]);
// blockB[count+5] = cj(b0[5]);
// blockB[count+6] = cj(b0[6]);
// blockB[count+7] = cj(b0[7]);
// }
// count += 8;
// }
// // skip what we have after
// if(PanelMode) count += 8 * (stride-offset-depth);
// }
// }
if
(
nr
>=
4
)
{
// skip what we have before
if
(
PanelMode
)
count
+=
nr
*
offset
;
for
(
Index
k
=
0
;
k
<
depth
;
k
++
)
for
(
Index
j2
=
packet_cols8
;
j2
<
packet_cols4
;
j2
+=
4
)
{
const
Scalar
*
b0
=
&
rhs
[
k
*
rhsStride
+
j2
];
blockB
[
count
+
0
]
=
cj
(
b0
[
0
]);
blockB
[
count
+
1
]
=
cj
(
b0
[
1
]);
if
(
nr
==
4
)
blockB
[
count
+
2
]
=
cj
(
b0
[
2
]);
if
(
nr
==
4
)
blockB
[
count
+
3
]
=
cj
(
b0
[
3
]);
count
+=
nr
;
// skip what we have before
if
(
PanelMode
)
count
+=
4
*
offset
;
for
(
Index
k
=
0
;
k
<
depth
;
k
++
)
{
if
(
PacketSize
==
4
)
{
Packet
A
=
rhs
.
loadPacket
(
k
,
j2
);
pstoreu
(
blockB
+
count
,
cj
.
pconj
(
A
));
count
+=
PacketSize
;
}
else
{
const
LinearMapper
dm0
=
rhs
.
getLinearMapper
(
k
,
j2
);
blockB
[
count
+
0
]
=
cj
(
dm0
(
0
));
blockB
[
count
+
1
]
=
cj
(
dm0
(
1
));
blockB
[
count
+
2
]
=
cj
(
dm0
(
2
));
blockB
[
count
+
3
]
=
cj
(
dm0
(
3
));
count
+=
4
;
}
}
// skip what we have after
if
(
PanelMode
)
count
+=
4
*
(
stride
-
offset
-
depth
);
}
// skip what we have after
if
(
PanelMode
)
count
+=
nr
*
(
stride
-
offset
-
depth
);
}
// copy the remaining columns one at a time (nr==1)
for
(
Index
j2
=
packet_cols
;
j2
<
cols
;
++
j2
)
for
(
Index
j2
=
packet_cols
4
;
j2
<
cols
;
++
j2
)
{
if
(
PanelMode
)
count
+=
offset
;
const
Scalar
*
b0
=
&
rhs
[
j2
];
for
(
Index
k
=
0
;
k
<
depth
;
k
++
)
{
blockB
[
count
]
=
cj
(
b0
[
k
*
rhsStride
]
);
blockB
[
count
]
=
cj
(
rhs
(
k
,
j2
)
);
count
+=
1
;
}
if
(
PanelMode
)
count
+=
stride
-
offset
-
depth
;
...
...
@@ -1312,8 +2110,8 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, Pan
* \sa setCpuCacheSize */
inline std::ptrdiff_t l1CacheSize()
{
  std::ptrdiff_t l1, l2;
  internal::manage_caching_sizes(GetAction, &l1, &l2);
  std::ptrdiff_t l1, l2, l3;
  internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
  return l1;
}
...
...
@@ -1321,19 +2119,29 @@ inline std::ptrdiff_t l1CacheSize()
* \sa setCpuCacheSize */
inline std::ptrdiff_t l2CacheSize()
{
  std::ptrdiff_t l1, l2;
  internal::manage_caching_sizes(GetAction, &l1, &l2);
  std::ptrdiff_t l1, l2, l3;
  internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
  return l2;
}
/** \returns the currently set level 3 cpu cache size (in bytes) used to estimate the ideal blocking size parameters.
  * \sa setCpuCacheSize */
inline std::ptrdiff_t l3CacheSize()
{
  std::ptrdiff_t l1, l2, l3;
  internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
  return l3;
}
/** Set the cpu L1 and L2 cache sizes (in bytes).
  * These values are used to adjust the size of the blocks
  * for the algorithms working per blocks.
  *
  * \sa computeProductBlockingSizes */
inline void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2)
inline void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2, std::ptrdiff_t l3)
{
  internal::manage_caching_sizes(SetAction, &l1, &l2);
  internal::manage_caching_sizes(SetAction, &l1, &l2, &l3);
}
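// Example use of the cache-size query/override API above (assuming the
// three-argument setCpuCacheSizes of this revision; earlier releases take only
// l1 and l2):
#include <Eigen/Core>
#include <iostream>

int main()
{
  std::cout << "L1: " << Eigen::l1CacheSize()
            << "  L2: " << Eigen::l2CacheSize() << " bytes\n";
  // Override the detected sizes, e.g. when auto-detection is unreliable:
  Eigen::setCpuCacheSizes(32*1024, 512*1024, 4*1024*1024);
}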
}
// end namespace Eigen
...
...
external/eigen3/Eigen/src/Core/products/GeneralMatrixMatrix.h
View file @
a394b22a
...
...
@@ -10,7 +10,7 @@
#ifndef EIGEN_GENERAL_MATRIX_MATRIX_H
#define EIGEN_GENERAL_MATRIX_MATRIX_H
namespace Eigen {
namespace Eigen {
namespace internal {
...
...
@@ -23,7 +23,9 @@ template<
typename
RhsScalar
,
int
RhsStorageOrder
,
bool
ConjugateRhs
>
struct
general_matrix_matrix_product
<
Index
,
LhsScalar
,
LhsStorageOrder
,
ConjugateLhs
,
RhsScalar
,
RhsStorageOrder
,
ConjugateRhs
,
RowMajor
>
{
typedef
typename
scalar_product_traits
<
LhsScalar
,
RhsScalar
>::
ReturnType
ResScalar
;
typedef
gebp_traits
<
RhsScalar
,
LhsScalar
>
Traits
;
typedef
typename
ScalarBinaryOpTraits
<
LhsScalar
,
RhsScalar
>::
ReturnType
ResScalar
;
static
EIGEN_STRONG_INLINE
void
run
(
Index
rows
,
Index
cols
,
Index
depth
,
const
LhsScalar
*
lhs
,
Index
lhsStride
,
...
...
@@ -51,42 +53,44 @@ template<
struct
general_matrix_matrix_product
<
Index
,
LhsScalar
,
LhsStorageOrder
,
ConjugateLhs
,
RhsScalar
,
RhsStorageOrder
,
ConjugateRhs
,
ColMajor
>
{
typedef
typename
scalar_product_traits
<
LhsScalar
,
RhsScalar
>::
ReturnType
ResScalar
;
typedef
gebp_traits
<
LhsScalar
,
RhsScalar
>
Traits
;
typedef
typename
ScalarBinaryOpTraits
<
LhsScalar
,
RhsScalar
>::
ReturnType
ResScalar
;
static
void
run
(
Index
rows
,
Index
cols
,
Index
depth
,
const
LhsScalar
*
_lhs
,
Index
lhsStride
,
const
RhsScalar
*
_rhs
,
Index
rhsStride
,
ResScalar
*
res
,
Index
resStride
,
ResScalar
*
_
res
,
Index
resStride
,
ResScalar
alpha
,
level3_blocking
<
LhsScalar
,
RhsScalar
>&
blocking
,
GemmParallelInfo
<
Index
>*
info
=
0
)
{
const_blas_data_mapper
<
LhsScalar
,
Index
,
LhsStorageOrder
>
lhs
(
_lhs
,
lhsStride
);
const_blas_data_mapper
<
RhsScalar
,
Index
,
RhsStorageOrder
>
rhs
(
_rhs
,
rhsStride
);
typedef
gebp_traits
<
LhsScalar
,
RhsScalar
>
Traits
;
typedef
const_blas_data_mapper
<
LhsScalar
,
Index
,
LhsStorageOrder
>
LhsMapper
;
typedef
const_blas_data_mapper
<
RhsScalar
,
Index
,
RhsStorageOrder
>
RhsMapper
;
typedef
blas_data_mapper
<
typename
Traits
::
ResScalar
,
Index
,
ColMajor
>
ResMapper
;
LhsMapper
lhs
(
_lhs
,
lhsStride
);
RhsMapper
rhs
(
_rhs
,
rhsStride
);
ResMapper
res
(
_res
,
resStride
);
Index
kc
=
blocking
.
kc
();
// cache block size along the K direction
Index
mc
=
(
std
::
min
)(
rows
,
blocking
.
mc
());
// cache block size along the M direction
//
Index nc = blocking.nc()
;
// cache block size along the N direction
Index
nc
=
(
std
::
min
)(
cols
,
blocking
.
nc
()
);
// cache block size along the N direction
gemm_pack_lhs
<
LhsScalar
,
Index
,
Traits
::
mr
,
Traits
::
LhsProgress
,
LhsStorageOrder
>
pack_lhs
;
gemm_pack_rhs
<
RhsScalar
,
Index
,
Traits
::
nr
,
RhsStorageOrder
>
pack_rhs
;
gebp_kernel
<
LhsScalar
,
RhsScalar
,
Index
,
Traits
::
mr
,
Traits
::
nr
,
ConjugateLhs
,
ConjugateRhs
>
gebp
;
gemm_pack_lhs
<
LhsScalar
,
Index
,
LhsMapper
,
Traits
::
mr
,
Traits
::
LhsProgress
,
LhsStorageOrder
>
pack_lhs
;
gemm_pack_rhs
<
RhsScalar
,
Index
,
RhsMapper
,
Traits
::
nr
,
RhsStorageOrder
>
pack_rhs
;
gebp_kernel
<
LhsScalar
,
RhsScalar
,
Index
,
ResMapper
,
Traits
::
mr
,
Traits
::
nr
,
ConjugateLhs
,
ConjugateRhs
>
gebp
;
#ifdef EIGEN_HAS_OPENMP
if
(
info
)
{
// this is the parallel version!
Index
tid
=
omp_get_thread_num
();
Index
threads
=
omp_get_num_threads
();
std
::
size_t
sizeA
=
kc
*
mc
;
std
::
size_t
sizeW
=
kc
*
Traits
::
WorkSpaceFactor
;
ei_declare_aligned_stack_constructed_variable
(
LhsScalar
,
blockA
,
sizeA
,
0
);
ei_declare_aligned_stack_constructed_variable
(
RhsScalar
,
w
,
sizeW
,
0
);
RhsScalar
*
blockB
=
blocking
.
blockB
();
eigen_internal_assert
(
blockB
!=
0
);
int
tid
=
omp_get_thread_num
();
int
threads
=
omp_get_num_threads
();
LhsScalar
*
blockA
=
blocking
.
blockA
();
eigen_internal_assert
(
blockA
!=
0
);
std
::
size_t
sizeB
=
kc
*
nc
;
ei_declare_aligned_stack_constructed_variable
(
RhsScalar
,
blockB
,
sizeB
,
0
);
// For each horizontal panel of the rhs, and corresponding vertical panel of the lhs...
for
(
Index
k
=
0
;
k
<
depth
;
k
+=
kc
)
...
...
@@ -94,56 +98,56 @@ static void run(Index rows, Index cols, Index depth,
const
Index
actual_kc
=
(
std
::
min
)(
k
+
kc
,
depth
)
-
k
;
// => rows of B', and cols of the A'
// In order to reduce the chance that a thread has to wait for the other,
// let's start by packing
A
'.
pack_
l
hs
(
block
A
,
&
lhs
(
0
,
k
),
lhsStride
,
actual_kc
,
m
c
);
// let's start by packing
B
'.
pack_
r
hs
(
block
B
,
rhs
.
getSubMapper
(
k
,
0
)
,
actual_kc
,
n
c
);
// Pack
B
_k to
B
' in a parallel fashion:
// each thread packs the sub block
B
_k,
j
to
B
'_
j
where
j
is the thread id.
// Pack
A
_k to
A
' in a parallel fashion:
// each thread packs the sub block
A
_k,
i
to
A
'_
i
where
i
is the thread id.
// However, before copying to
B
'_
j
, we have to make sure that no other thread is still using it,
// However, before copying to
A
'_
i
, we have to make sure that no other thread is still using it,
// i.e., we test that info[tid].users equals 0.
// Then, we set info[tid].users to the number of threads to mark that all other threads are going to use it.
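// Illustrative sketch of the users/sync hand-shake described in the comments
// above, using std::atomic instead of Eigen's GemmParallelInfo (names are
// hypothetical): a thread only overwrites its pack buffer once every reader of
// the previous iteration has released it, and it publishes the new pack by
// bumping `sync` to the current k.
#include <atomic>

struct PackInfo {
  std::atomic<int>  users{0};   // how many threads still read this pack
  std::atomic<long> sync{-1};   // last k for which the pack is ready
};

void publish_pack(PackInfo& info, int threads, long k)
{
  while (info.users.load(std::memory_order_acquire) != 0) {}  // wait for readers
  info.users.store(threads, std::memory_order_relaxed);       // reserve for all
  // ... pack the block for step k here ...
  info.sync.store(k, std::memory_order_release);              // notify consumers
}

void release_pack(PackInfo& info)
{
  info.users.fetch_sub(1, std::memory_order_acq_rel);         // done reading
}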
while
(
info
[
tid
].
users
!=
0
)
{}
info
[
tid
].
users
+=
threads
;
pack_
r
hs
(
block
B
+
info
[
tid
].
r
hs_start
*
actual_kc
,
&
rhs
(
k
,
info
[
tid
].
r
hs_start
),
rhsStride
,
actual_kc
,
info
[
tid
].
r
hs_length
);
pack_
l
hs
(
block
A
+
info
[
tid
].
l
hs_start
*
actual_kc
,
lhs
.
getSubMapper
(
info
[
tid
].
l
hs_start
,
k
)
,
actual_kc
,
info
[
tid
].
l
hs_length
);
// Notify the other threads that the part
B
'_
j
is ready to go.
// Notify the other threads that the part
A
'_
i
is ready to go.
info
[
tid
].
sync
=
k
;
// Computes C_i += A' * B' per
B
'_
j
for
(
Index
shift
=
0
;
shift
<
threads
;
++
shift
)
// Computes C_i += A' * B' per
A
'_
i
for
(
int
shift
=
0
;
shift
<
threads
;
++
shift
)
{
Index
j
=
(
tid
+
shift
)
%
threads
;
int
i
=
(
tid
+
shift
)
%
threads
;
// At this point we have to make sure that
B
'_
j
has been updated by the thread
j
,
// At this point we have to make sure that
A
'_
i
has been updated by the thread
i
,
// we use testAndSetOrdered to mimic a volatile access.
// However, no need to wait for the B' part which has been updated by the current thread!
if
(
shift
>
0
)
while
(
info
[
j
].
sync
!=
k
)
{}
if
(
shift
>
0
)
{
while
(
info
[
i
].
sync
!=
k
)
{
}
}
gebp
(
res
+
info
[
j
].
r
hs_start
*
resStride
,
resStride
,
blockA
,
blockB
+
info
[
j
].
r
hs_start
*
actual_kc
,
mc
,
actual_kc
,
info
[
j
].
r
hs_length
,
a
lpha
,
-
1
,
-
1
,
0
,
0
,
w
);
gebp
(
res
.
getSubMapper
(
info
[
i
].
l
hs_start
,
0
),
blockA
+
info
[
i
].
l
hs_start
*
actual_kc
,
blockB
,
info
[
i
].
l
hs_length
,
a
ctual_kc
,
nc
,
alpha
);
}
// Then keep going as usual with the remaining
A
'
for
(
Index
i
=
mc
;
i
<
row
s
;
i
+=
m
c
)
// Then keep going as usual with the remaining
B
'
for
(
Index
j
=
nc
;
j
<
col
s
;
j
+=
n
c
)
{
const
Index
actual_
m
c
=
(
std
::
min
)(
i
+
mc
,
row
s
)
-
i
;
const
Index
actual_
n
c
=
(
std
::
min
)(
j
+
nc
,
col
s
)
-
j
;
// pack
A_i,k
to
A
'
pack_
l
hs
(
block
A
,
&
lhs
(
i
,
k
),
lhsStride
,
actual_kc
,
actual_
m
c
);
// pack
B_k,j
to
B
'
pack_
r
hs
(
block
B
,
rhs
.
getSubMapper
(
k
,
j
)
,
actual_kc
,
actual_
n
c
);
// C_
i
+= A' * B'
gebp
(
res
+
i
,
resStride
,
blockA
,
blockB
,
actual_mc
,
actual_kc
,
cols
,
alpha
,
-
1
,
-
1
,
0
,
0
,
w
);
// C_
j
+= A' * B'
gebp
(
res
.
getSubMapper
(
0
,
j
)
,
blockA
,
blockB
,
rows
,
actual_kc
,
actual_nc
,
alpha
);
}
// Release all the sub blocks
B
'_
j
of
B
' for the current thread,
// Release all the sub blocks
A
'_
i
of
A
' for the current thread,
// i.e., we simply decrement the number of users by 1
for
(
Index
j
=
0
;
j
<
threads
;
++
j
)
{
for
(
Index
i
=
0
;
i
<
threads
;
++
i
)
#pragma omp atomic
info
[
j
].
users
-=
1
;
}
info
[
i
].
users
-=
1
;
}
}
else
...
...
@@ -153,38 +157,42 @@ static void run(Index rows, Index cols, Index depth,
    // this is the sequential version!
    std::size_t sizeA = kc*mc;
-    std::size_t sizeB = kc*cols;
-    std::size_t sizeW = kc*Traits::WorkSpaceFactor;
+    std::size_t sizeB = kc*nc;

    ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, sizeA, blocking.blockA());
    ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, blocking.blockB());
-    ei_declare_aligned_stack_constructed_variable(RhsScalar, blockW, sizeW, blocking.blockW());
+    const bool pack_rhs_once = mc!=rows && kc==depth && nc==cols;

    // For each horizontal panel of the rhs, and corresponding panel of the lhs...
-    // (==GEMM_VAR1)
-    for(Index k2=0; k2<depth; k2+=kc)
+    for(Index i2=0; i2<rows; i2+=mc)
    {
-      const Index actual_kc = (std::min)(k2+kc,depth)-k2;
-
-      // OK, here we have selected one horizontal panel of rhs and one vertical panel of lhs.
-      // => Pack rhs's panel into a sequential chunk of memory (L2 caching)
-      // Note that this panel will be read as many times as the number of blocks in the lhs's
-      // vertical panel which is, in practice, a very low number.
-      pack_rhs(blockB, &rhs(k2,0), rhsStride, actual_kc, cols);
+      const Index actual_mc = (std::min)(i2+mc,rows)-i2;

-      // For each mc x kc block of the lhs's vertical panel...
-      // (==GEPP_VAR1)
-      for(Index i2=0; i2<rows; i2+=mc)
+      for(Index k2=0; k2<depth; k2+=kc)
      {
-        const Index actual_mc = (std::min)(i2+mc,rows)-i2;
-
-        // We pack the lhs's block into a sequential chunk of memory (L1 caching)
-        // Note that this block will be read a very high number of times, which is equal to the number of
-        // micro vertical panel of the large rhs's panel (e.g., cols/4 times).
-        pack_lhs(blockA, &lhs(i2,k2), lhsStride, actual_kc, actual_mc);
-
-        // Everything is packed, we can now call the block * panel kernel:
-        gebp(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1, -1, 0, 0, blockW);
+        const Index actual_kc = (std::min)(k2+kc,depth)-k2;
+
+        // OK, here we have selected one horizontal panel of rhs and one vertical panel of lhs.
+        // => Pack lhs's panel into a sequential chunk of memory (L2/L3 caching)
+        // Note that this panel will be read as many times as the number of blocks in the rhs's
+        // horizontal panel which is, in practice, a very low number.
+        pack_lhs(blockA, lhs.getSubMapper(i2,k2), actual_kc, actual_mc);
+
+        // For each kc x nc block of the rhs's horizontal panel...
+        for(Index j2=0; j2<cols; j2+=nc)
+        {
+          const Index actual_nc = (std::min)(j2+nc,cols)-j2;
+
+          // We pack the rhs's block into a sequential chunk of memory (L2 caching)
+          // Note that this block will be read a very high number of times, which is equal to the number of
+          // micro horizontal panel of the large rhs's panel (e.g., rows/12 times).
+          if((!pack_rhs_once) || i2==0)
+            pack_rhs(blockB, rhs.getSubMapper(k2,j2), actual_kc, actual_nc);
+
+          // Everything is packed, we can now call the panel * block kernel:
+          gebp(res.getSubMapper(i2, j2), blockA, blockB, actual_mc, actual_kc, actual_nc, alpha);
+        }
      }
    }
  }
...
...
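To make the blocking scheme of the hunks above easier to follow, here is a rough, hedged sketch of the kc/mc/nc loop nest that the new sequential path walks. It is not Eigen code: a plain scalar triple loop stands in for the packed gebp micro-kernel, the packing of blockA/blockB is omitted, and all names are illustrative.

// Illustrative sketch only: C (rows x cols, column-major) += alpha * A (rows x depth) * B (depth x cols),
// traversed with the same mc/kc/nc blocking order as the sequential path above.
#include <algorithm>

void blocked_gemm(int rows, int cols, int depth,
                  const double* A, const double* B, double* C,
                  double alpha, int mc, int nc, int kc)
{
  for (int i2 = 0; i2 < rows; i2 += mc) {                 // panel of A rows (mc at a time)
    const int actual_mc = std::min(i2 + mc, rows) - i2;
    for (int k2 = 0; k2 < depth; k2 += kc) {              // depth slice (what would be packed into blockA)
      const int actual_kc = std::min(k2 + kc, depth) - k2;
      for (int j2 = 0; j2 < cols; j2 += nc) {             // panel of B columns (what would be packed into blockB)
        const int actual_nc = std::min(j2 + nc, cols) - j2;
        // stand-in for the gebp block * panel kernel: plain loops over the current block
        for (int j = 0; j < actual_nc; ++j)
          for (int i = 0; i < actual_mc; ++i) {
            double acc = 0;
            for (int k = 0; k < actual_kc; ++k)
              acc += A[(i2 + i) + (k2 + k) * rows] * B[(k2 + k) + (j2 + j) * depth];
            C[(i2 + i) + (j2 + j) * rows] += alpha * acc;
          }
      }
    }
  }
}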
@@ -193,26 +201,21 @@ static void run(Index rows, Index cols, Index depth,
};

/*********************************************************************************
-*  Specialization of GeneralProduct<> for "large" GEMM, i.e.,
+*  Specialization of generic_product_impl for "large" GEMM, i.e.,
*  implementation of the high level wrapper to general_matrix_matrix_product
**********************************************************************************/

-template<typename Lhs, typename Rhs>
-struct traits<GeneralProduct<Lhs,Rhs,GemmProduct> >
-  : traits<ProductBase<GeneralProduct<Lhs,Rhs,GemmProduct>, Lhs, Rhs> >
-{};

template<typename Scalar, typename Index, typename Gemm, typename Lhs, typename Rhs, typename Dest, typename BlockingType>
struct gemm_functor
{
  gemm_functor(const Lhs& lhs, const Rhs& rhs, Dest& dest, const Scalar& actualAlpha, BlockingType& blocking)
    : m_lhs(lhs), m_rhs(rhs), m_dest(dest), m_actualAlpha(actualAlpha), m_blocking(blocking)
  {}

-  void initParallelSession() const
+  void initParallelSession(Index num_threads) const
  {
-    m_blocking.allocateB();
+    m_blocking.initParallel(m_lhs.rows(), m_rhs.cols(), m_lhs.cols(), num_threads);
+    m_blocking.allocateA();
  }

  void operator() (Index row, Index rows, Index col=0, Index cols=-1, GemmParallelInfo<Index>* info=0) const
...
...
@@ -221,12 +224,14 @@ struct gemm_functor
      cols = m_rhs.cols();

    Gemm::run(rows, cols, m_lhs.cols(),
-              /*(const Scalar*)*/&m_lhs.coeffRef(row,0), m_lhs.outerStride(),
-              /*(const Scalar*)*/&m_rhs.coeffRef(0,col), m_rhs.outerStride(),
+              &m_lhs.coeffRef(row,0), m_lhs.outerStride(),
+              &m_rhs.coeffRef(0,col), m_rhs.outerStride(),
              (Scalar*)&(m_dest.coeffRef(row,col)), m_dest.outerStride(),
              m_actualAlpha, m_blocking, info);
  }

  typedef typename Gemm::Traits Traits;

  protected:
    const Lhs& m_lhs;
    const Rhs& m_rhs;
...
...
@@ -247,29 +252,27 @@ class level3_blocking
  protected:
    LhsScalar* m_blockA;
    RhsScalar* m_blockB;
-    RhsScalar* m_blockW;

-    DenseIndex m_mc;
-    DenseIndex m_nc;
-    DenseIndex m_kc;
+    Index m_mc;
+    Index m_nc;
+    Index m_kc;

  public:
    level3_blocking()
-      : m_blockA(0), m_blockB(0), m_blockW(0), m_mc(0), m_nc(0), m_kc(0)
+      : m_blockA(0), m_blockB(0), m_mc(0), m_nc(0), m_kc(0)
    {}

-    inline DenseIndex mc() const { return m_mc; }
-    inline DenseIndex nc() const { return m_nc; }
-    inline DenseIndex kc() const { return m_kc; }
+    inline Index mc() const { return m_mc; }
+    inline Index nc() const { return m_nc; }
+    inline Index kc() const { return m_kc; }

    inline LhsScalar* blockA() { return m_blockA; }
    inline RhsScalar* blockB() { return m_blockB; }
-    inline RhsScalar* blockW() { return m_blockW; }
};

template<int StorageOrder, typename _LhsScalar, typename _RhsScalar, int MaxRows, int MaxCols, int MaxDepth, int KcFactor>
-class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, MaxDepth, KcFactor, true>
+class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, MaxDepth, KcFactor, true /* == FiniteAtCompileTime */>
  : public level3_blocking<
      typename conditional<StorageOrder==RowMajor,_RhsScalar,_LhsScalar>::type,
      typename conditional<StorageOrder==RowMajor,_LhsScalar,_RhsScalar>::type>
...
...
@@ -284,29 +287,38 @@ class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, M
    typedef gebp_traits<LhsScalar,RhsScalar> Traits;
    enum {
      SizeA = ActualRows * MaxDepth,
-      SizeB = ActualCols * MaxDepth,
-      SizeW = MaxDepth * Traits::WorkSpaceFactor
+      SizeB = ActualCols * MaxDepth
    };

-    EIGEN_ALIGN16 LhsScalar m_staticA[SizeA];
-    EIGEN_ALIGN16 RhsScalar m_staticB[SizeB];
-    EIGEN_ALIGN16 RhsScalar m_staticW[SizeW];
+#if EIGEN_MAX_STATIC_ALIGN_BYTES >= EIGEN_DEFAULT_ALIGN_BYTES
+    EIGEN_ALIGN_MAX LhsScalar m_staticA[SizeA];
+    EIGEN_ALIGN_MAX RhsScalar m_staticB[SizeB];
+#else
+    EIGEN_ALIGN_MAX char m_staticA[SizeA * sizeof(LhsScalar) + EIGEN_DEFAULT_ALIGN_BYTES-1];
+    EIGEN_ALIGN_MAX char m_staticB[SizeB * sizeof(RhsScalar) + EIGEN_DEFAULT_ALIGN_BYTES-1];
+#endif

  public:

-    gemm_blocking_space(DenseIndex /*rows*/, DenseIndex /*cols*/, DenseIndex /*depth*/)
+    gemm_blocking_space(Index /*rows*/, Index /*cols*/, Index /*depth*/, Index /*num_threads*/, bool /*full_rows = false*/)
    {
      this->m_mc = ActualRows;
      this->m_nc = ActualCols;
      this->m_kc = MaxDepth;
+#if EIGEN_MAX_STATIC_ALIGN_BYTES >= EIGEN_DEFAULT_ALIGN_BYTES
      this->m_blockA = m_staticA;
      this->m_blockB = m_staticB;
-      this->m_blockW = m_staticW;
+#else
+      this->m_blockA = reinterpret_cast<LhsScalar*>((internal::UIntPtr(m_staticA) + (EIGEN_DEFAULT_ALIGN_BYTES-1)) & ~std::size_t(EIGEN_DEFAULT_ALIGN_BYTES-1));
+      this->m_blockB = reinterpret_cast<RhsScalar*>((internal::UIntPtr(m_staticB) + (EIGEN_DEFAULT_ALIGN_BYTES-1)) & ~std::size_t(EIGEN_DEFAULT_ALIGN_BYTES-1));
+#endif
    }

+    void initParallel(Index, Index, Index, Index) {}

    inline void allocateA() {}
    inline void allocateB() {}
-    inline void allocateW() {}
    inline void allocateAll() {}
};
...
...
@@ -323,22 +335,42 @@ class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, M
    typedef typename conditional<Transpose,_LhsScalar,_RhsScalar>::type RhsScalar;
    typedef gebp_traits<LhsScalar,RhsScalar> Traits;

-    DenseIndex m_sizeA;
-    DenseIndex m_sizeB;
-    DenseIndex m_sizeW;
+    Index m_sizeA;
+    Index m_sizeB;

  public:

-    gemm_blocking_space(DenseIndex rows, DenseIndex cols, DenseIndex depth)
+    gemm_blocking_space(Index rows, Index cols, Index depth, Index num_threads, bool l3_blocking)
    {
      this->m_mc = Transpose ? cols : rows;
      this->m_nc = Transpose ? rows : cols;
      this->m_kc = depth;

+      if(l3_blocking)
+      {
+        computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, this->m_mc, this->m_nc, num_threads);
+      }
+      else  // no l3 blocking
+      {
+        Index n = this->m_nc;
+        computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, this->m_mc, n, num_threads);
+      }

      m_sizeA = this->m_mc * this->m_kc;
      m_sizeB = this->m_kc * this->m_nc;
    }

+    void initParallel(Index rows, Index cols, Index depth, Index num_threads)
+    {
+      this->m_mc = Transpose ? cols : rows;
+      this->m_nc = Transpose ? rows : cols;
+      this->m_kc = depth;
-      computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, this->m_mc, this->m_nc);
+      eigen_internal_assert(this->m_blockA==0 && this->m_blockB==0);
+      Index m = this->m_mc;
+      computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, m, this->m_nc, num_threads);
+      m_sizeA = this->m_mc * this->m_kc;
+      m_sizeB = this->m_kc * this->m_nc;
-      m_sizeW = this->m_kc*Traits::WorkSpaceFactor;
+    }

    void allocateA()
...
...
@@ -353,81 +385,108 @@ class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, M
        this->m_blockB = aligned_new<RhsScalar>(m_sizeB);
    }

-    void allocateW()
-    {
-      if(this->m_blockW==0)
-        this->m_blockW = aligned_new<RhsScalar>(m_sizeW);
-    }

    void allocateAll()
    {
      allocateA();
      allocateB();
-      allocateW();
    }

    ~gemm_blocking_space()
    {
      aligned_delete(this->m_blockA, m_sizeA);
      aligned_delete(this->m_blockB, m_sizeB);
-      aligned_delete(this->m_blockW, m_sizeW);
    }
};

} // end namespace internal

+namespace internal {

template<typename Lhs, typename Rhs>
-class GeneralProduct<Lhs, Rhs, GemmProduct>
-  : public ProductBase<GeneralProduct<Lhs,Rhs,GemmProduct>, Lhs, Rhs>
+struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
+  : generic_product_impl_base<Lhs,Rhs,generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct> >
{
-    enum {
-      MaxDepthAtCompileTime = EIGEN_SIZE_MIN_PREFER_FIXED(Lhs::MaxColsAtCompileTime,Rhs::MaxRowsAtCompileTime)
-    };
-  public:
-    EIGEN_PRODUCT_PUBLIC_INTERFACE(GeneralProduct)
-
-    typedef typename Lhs::Scalar LhsScalar;
-    typedef typename Rhs::Scalar RhsScalar;
-    typedef Scalar ResScalar;
+  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
+  typedef typename Lhs::Scalar LhsScalar;
+  typedef typename Rhs::Scalar RhsScalar;

-    GeneralProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs)
-    {
-#if !(defined(EIGEN_NO_STATIC_ASSERT) && defined(EIGEN_NO_DEBUG))
-      typedef internal::scalar_product_op<LhsScalar,RhsScalar> BinOp;
-      EIGEN_CHECK_BINARY_COMPATIBILIY(BinOp,LhsScalar,RhsScalar);
-#endif
-    }
+  typedef internal::blas_traits<Lhs> LhsBlasTraits;
+  typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
+  typedef typename internal::remove_all<ActualLhsType>::type ActualLhsTypeCleaned;

-    template<typename Dest> void scaleAndAddTo(Dest& dst, const Scalar& alpha) const
-    {
-      eigen_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols());
-      if(m_lhs.cols()==0 || m_lhs.rows()==0 || m_rhs.cols()==0)
-        return;
+  typedef internal::blas_traits<Rhs> RhsBlasTraits;
+  typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
+  typedef typename internal::remove_all<ActualRhsType>::type ActualRhsTypeCleaned;

-      typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(m_lhs);
-      typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(m_rhs);
+  enum {
+    MaxDepthAtCompileTime = EIGEN_SIZE_MIN_PREFER_FIXED(Lhs::MaxColsAtCompileTime,Rhs::MaxRowsAtCompileTime)
+  };

-      Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs)
-                                 * RhsBlasTraits::extractScalarFactor(m_rhs);
+  typedef generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,CoeffBasedProductMode> lazyproduct;

-      typedef internal::gemm_blocking_space<(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor,LhsScalar,RhsScalar,
-              Dest::MaxRowsAtCompileTime,Dest::MaxColsAtCompileTime,MaxDepthAtCompileTime> BlockingType;
+  template<typename Dst>
+  static void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  {
+    if((rhs.rows()+dst.rows()+dst.cols())<20 && rhs.rows()>0)
+      lazyproduct::evalTo(dst, lhs, rhs);
+    else
+    {
+      dst.setZero();
+      scaleAndAddTo(dst, lhs, rhs, Scalar(1));
+    }
+  }

-      typedef internal::gemm_functor<
-        Scalar, Index,
-        internal::general_matrix_matrix_product<
-          Index,
-          LhsScalar, (_ActualLhsType::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(LhsBlasTraits::NeedToConjugate),
-          RhsScalar, (_ActualRhsType::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(RhsBlasTraits::NeedToConjugate),
-          (Dest::Flags&RowMajorBit) ? RowMajor : ColMajor>,
-        _ActualLhsType, _ActualRhsType, Dest, BlockingType> GemmFunctor;
+  template<typename Dst>
+  static void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  {
+    if((rhs.rows()+dst.rows()+dst.cols())<20 && rhs.rows()>0)
+      lazyproduct::addTo(dst, lhs, rhs);
+    else
+      scaleAndAddTo(dst, lhs, rhs, Scalar(1));
+  }

-      BlockingType blocking(dst.rows(), dst.cols(), lhs.cols());
+  template<typename Dst>
+  static void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  {
+    if((rhs.rows()+dst.rows()+dst.cols())<20 && rhs.rows()>0)
+      lazyproduct::subTo(dst, lhs, rhs);
+    else
+      scaleAndAddTo(dst, lhs, rhs, Scalar(-1));
+  }

-      internal::parallelize_gemm<(Dest::MaxRowsAtCompileTime>32 || Dest::MaxRowsAtCompileTime==Dynamic)>
-          (GemmFunctor(lhs, rhs, dst, actualAlpha, blocking), this->rows(), this->cols(), Dest::Flags&RowMajorBit);
-    }
+  template<typename Dest>
+  static void scaleAndAddTo(Dest& dst, const Lhs& a_lhs, const Rhs& a_rhs, const Scalar& alpha)
+  {
+    eigen_assert(dst.rows()==a_lhs.rows() && dst.cols()==a_rhs.cols());
+    if(a_lhs.cols()==0 || a_lhs.rows()==0 || a_rhs.cols()==0)
+      return;
+
+    typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(a_lhs);
+    typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(a_rhs);
+
+    Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(a_lhs)
+                               * RhsBlasTraits::extractScalarFactor(a_rhs);
+
+    typedef internal::gemm_blocking_space<(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor,LhsScalar,RhsScalar,
+            Dest::MaxRowsAtCompileTime,Dest::MaxColsAtCompileTime,MaxDepthAtCompileTime> BlockingType;
+
+    typedef internal::gemm_functor<
+      Scalar, Index,
+      internal::general_matrix_matrix_product<
+        Index,
+        LhsScalar, (ActualLhsTypeCleaned::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(LhsBlasTraits::NeedToConjugate),
+        RhsScalar, (ActualRhsTypeCleaned::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(RhsBlasTraits::NeedToConjugate),
+        (Dest::Flags&RowMajorBit) ? RowMajor : ColMajor>,
+      ActualLhsTypeCleaned, ActualRhsTypeCleaned, Dest, BlockingType> GemmFunctor;
+
+    BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), 1, true);
+
+    internal::parallelize_gemm<(Dest::MaxRowsAtCompileTime>32 || Dest::MaxRowsAtCompileTime==Dynamic)>
+        (GemmFunctor(lhs, rhs, dst, actualAlpha, blocking), a_lhs.rows(), a_rhs.cols(), a_lhs.cols(), Dest::Flags&RowMajorBit);
+  }
};

} // end namespace internal

} // end namespace Eigen

#endif // EIGEN_GENERAL_MATRIX_MATRIX_H
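The OpenMP branch earlier in this file coordinates threads through per-thread "users" and "sync" counters: a producer waits until no one is still reading its packed panel, marks it as in use by all threads, packs, then publishes the current k step; consumers spin on sync before reading and decrement users when done. The sketch below is a hedged, simplified stand-in for that handshake; PanelInfo and the function names are hypothetical and do not reproduce Eigen's GemmParallelInfo.

// Illustrative sketch of the per-thread handshake used in the parallel branch above.
#include <atomic>
#include <vector>

struct PanelInfo {
  std::atomic<int>  users{0};  // how many threads still have to read this thread's packed panel
  std::atomic<long> sync{-1};  // last k step for which this panel has been published
  int lhs_start = 0, lhs_length = 0;
};

// Producer side: called by thread `tid` once per k step, around the packing of its share of A.
void publish_panel(std::vector<PanelInfo>& info, int tid, int threads, long k)
{
  while (info[tid].users.load() != 0) { /* spin: consumers of the previous step not done yet */ }
  info[tid].users.store(threads);       // every thread (including this one) will read it once
  // ... pack this thread's sub-block of A here ...
  info[tid].sync.store(k);              // announce that panel `tid` is ready for step k
}

// Consumer side: wait for panel `i` to be ready for step k, then release it after use.
void wait_for_panel(const std::vector<PanelInfo>& info, int i, long k)
{
  while (info[i].sync.load() != k) { /* spin until the producer published step k */ }
}

void release_panel(std::vector<PanelInfo>& info, int i)
{
  info[i].users.fetch_sub(1);           // plays the role of the "#pragma omp atomic" decrement
}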
external/eigen3/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h
View file @ a394b22a
...
...
@@ -20,7 +20,7 @@ namespace internal {
/**********************************************************************
* This file implements a general A * B product while
* evaluating only one triangular part of the product.
-* This is more general version of self adjoint product (C += A A^T)
+* This is a more general version of self adjoint product (C += A A^T)
* as the level 3 SYRK Blas routine.
**********************************************************************/
...
...
@@ -40,15 +40,16 @@ template <typename Index, typename LhsScalar, int LhsStorageOrder, bool Conjugat
                                          typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs, int UpLo, int Version>
struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,RowMajor,UpLo,Version>
{
-  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
  static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* lhs, Index lhsStride,
-                                      const RhsScalar* rhs, Index rhsStride, ResScalar* res, Index resStride, const ResScalar& alpha)
+                                      const RhsScalar* rhs, Index rhsStride, ResScalar* res, Index resStride, const ResScalar& alpha, level3_blocking<RhsScalar,LhsScalar>& blocking)
  {
    general_matrix_matrix_triangular_product<Index,
        RhsScalar, RhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateRhs,
        LhsScalar, LhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateLhs,
        ColMajor, UpLo==Lower?Upper:Lower>
-      ::run(size,depth,rhs,rhsStride,lhs,lhsStride,res,resStride,alpha);
+      ::run(size,depth,rhs,rhsStride,lhs,lhsStride,res,resStride,alpha,blocking);
  }
};
...
...
@@ -56,32 +57,36 @@ template <typename Index, typename LhsScalar, int LhsStorageOrder, bool Conjugat
                                          typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs, int UpLo, int Version>
struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,ColMajor,UpLo,Version>
{
-  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
  static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* _lhs, Index lhsStride,
-                                      const RhsScalar* _rhs, Index rhsStride, ResScalar* res, Index resStride, const ResScalar& alpha)
+                                      const RhsScalar* _rhs, Index rhsStride, ResScalar* _res, Index resStride, const ResScalar& alpha, level3_blocking<LhsScalar,RhsScalar>& blocking)
  {
-    const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
-    const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);

    typedef gebp_traits<LhsScalar,RhsScalar> Traits;

-    Index kc = depth; // cache block size along the K direction
-    Index mc = size;  // cache block size along the M direction
-    Index nc = size;  // cache block size along the N direction
-    computeProductBlockingSizes<LhsScalar,RhsScalar>(kc, mc, nc);
+    typedef const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> LhsMapper;
+    typedef const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> RhsMapper;
+    typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
+    LhsMapper lhs(_lhs,lhsStride);
+    RhsMapper rhs(_rhs,rhsStride);
+    ResMapper res(_res, resStride);

+    Index kc = blocking.kc();
+    Index mc = (std::min)(size,blocking.mc());

    // !!! mc must be a multiple of nr:
    if(mc > Traits::nr)
      mc = (mc/Traits::nr)*Traits::nr;

-    std::size_t sizeW = kc*Traits::WorkSpaceFactor;
-    std::size_t sizeB = sizeW + kc*size;
-    ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, kc*mc, 0);
-    ei_declare_aligned_stack_constructed_variable(RhsScalar, allocatedBlockB, sizeB, 0);
-    RhsScalar* blockB = allocatedBlockB + sizeW;
-
-    gemm_pack_lhs<LhsScalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
-    gemm_pack_rhs<RhsScalar, Index, Traits::nr, RhsStorageOrder> pack_rhs;
-    gebp_kernel<LhsScalar, RhsScalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp;
+    std::size_t sizeA = kc*mc;
+    std::size_t sizeB = kc*size;
+
+    ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, sizeA, blocking.blockA());
+    ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, blocking.blockB());
+
+    gemm_pack_lhs<LhsScalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
+    gemm_pack_rhs<RhsScalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs;
+    gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp;
    tribb_kernel<LhsScalar, RhsScalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs, UpLo> sybb;

    for(Index k2=0; k2<depth; k2+=kc)
...
...
@@ -89,29 +94,30 @@ struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,
      const Index actual_kc = (std::min)(k2+kc,depth)-k2;

      // note that the actual rhs is the transpose/adjoint of mat
-      pack_rhs(blockB, &rhs(k2,0), rhsStride, actual_kc, size);
+      pack_rhs(blockB, rhs.getSubMapper(k2,0), actual_kc, size);

      for(Index i2=0; i2<size; i2+=mc)
      {
        const Index actual_mc = (std::min)(i2+mc,size)-i2;

-        pack_lhs(blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc);
+        pack_lhs(blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc);

        // the selected actual_mc * size panel of res is split into three different part:
        //  1 - before the diagonal => processed with gebp or skipped
        //  2 - the actual_mc x actual_mc symmetric block => processed with a special kernel
        //  3 - after the diagonal => processed with gebp or skipped
        if (UpLo==Lower)
-          gebp(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, (std::min)(size,i2), alpha,
-               -1, -1, 0, 0, allocatedBlockB);
+          gebp(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, (std::min)(size,i2), alpha,
+               -1, -1, 0, 0);

-        sybb(res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha, allocatedBlockB);
+        sybb(_res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha);

        if (UpLo==Upper)
        {
          Index j2 = i2+actual_mc;
-          gebp(res+resStride*j2+i2, resStride, blockA, blockB+actual_kc*j2, actual_mc, actual_kc, (std::max)(Index(0), size-j2), alpha,
-               -1, -1, 0, 0, allocatedBlockB);
+          gebp(res.getSubMapper(i2, j2), blockA, blockB+actual_kc*j2, actual_mc, actual_kc, (std::max)(Index(0), size-j2), alpha,
+               -1, -1, 0, 0);
        }
      }
    }
...
...
@@ -132,14 +138,17 @@ struct tribb_kernel
{
  typedef gebp_traits<LhsScalar,RhsScalar,ConjLhs,ConjRhs> Traits;
  typedef typename Traits::ResScalar ResScalar;

  enum {
-    BlockSize = EIGEN_PLAIN_ENUM_MAX(mr,nr)
+    BlockSize = meta_least_common_multiple<EIGEN_PLAIN_ENUM_MAX(mr,nr),EIGEN_PLAIN_ENUM_MIN(mr,nr)>::ret
  };

-  void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha, RhsScalar* workspace)
+  void operator()(ResScalar* _res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha)
  {
-    gebp_kernel<LhsScalar, RhsScalar, Index, mr, nr, ConjLhs, ConjRhs> gebp_kernel;
-    Matrix<ResScalar,BlockSize,BlockSize,ColMajor> buffer;
+    typedef blas_data_mapper<ResScalar, Index, ColMajor> ResMapper;
+    ResMapper res(_res, resStride);
+    gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, mr, nr, ConjLhs, ConjRhs> gebp_kernel;
+    Matrix<ResScalar,BlockSize,BlockSize,ColMajor> buffer((internal::constructor_without_unaligned_array_assert()));

    // let's process the block per panel of actual_mc x BlockSize,
    // again, each is split into three parts, etc.
...
...
@@ -149,20 +158,20 @@ struct tribb_kernel
      const RhsScalar* actual_b = blockB+j*depth;

      if(UpLo==Upper)
-        gebp_kernel(res+j*resStride, resStride, blockA, actual_b, j, depth, actualBlockSize, alpha,
-                    -1, -1, 0, 0, workspace);
+        gebp_kernel(res.getSubMapper(0, j), blockA, actual_b, j, depth, actualBlockSize, alpha,
+                    -1, -1, 0, 0);

      // selfadjoint micro block
      {
        Index i = j;
        buffer.setZero();
        // 1 - apply the kernel on the temporary buffer
-        gebp_kernel(buffer.data(), BlockSize, blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, alpha,
-                    -1, -1, 0, 0, workspace);
+        gebp_kernel(ResMapper(buffer.data(), BlockSize), blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, alpha,
+                    -1, -1, 0, 0);
        // 2 - triangular accumulation
        for(Index j1=0; j1<actualBlockSize; ++j1)
        {
-          ResScalar* r = res + (j+j1)*resStride + i;
+          ResScalar* r = &res(i, j+j1);
          for(Index i1=UpLo==Lower ? j1 : 0;
              UpLo==Lower ? i1<actualBlockSize : i1<=j1; ++i1)
            r[i1] += buffer(i1,j1);
...
...
@@ -172,8 +181,8 @@ struct tribb_kernel
      if(UpLo==Lower)
      {
        Index i = j+actualBlockSize;
-        gebp_kernel(res+j*resStride+i, resStride, blockA+depth*i, actual_b, size-i, depth, actualBlockSize, alpha,
-                    -1, -1, 0, 0, workspace);
+        gebp_kernel(res.getSubMapper(i, j), blockA+depth*i, actual_b, size-i, depth, actualBlockSize, alpha,
+                    -1, -1, 0, 0);
      }
    }
  }
...
...
@@ -190,10 +199,9 @@ struct general_product_to_triangular_selector;
template<typename MatrixType, typename ProductType, int UpLo>
struct general_product_to_triangular_selector<MatrixType,ProductType,UpLo,true>
{
-  static void run(MatrixType& mat, const ProductType& prod, const typename MatrixType::Scalar& alpha)
+  static void run(MatrixType& mat, const ProductType& prod, const typename MatrixType::Scalar& alpha, bool beta)
  {
    typedef typename MatrixType::Scalar Scalar;
-    typedef typename MatrixType::Index Index;

    typedef typename internal::remove_all<typename ProductType::LhsNested>::type Lhs;
    typedef internal::blas_traits<Lhs> LhsBlasTraits;
...
...
@@ -209,6 +217,9 @@ struct general_product_to_triangular_selector<MatrixType,ProductType,UpLo,true>
    Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs().derived())
                               * RhsBlasTraits::extractScalarFactor(prod.rhs().derived());

+    if(!beta)
+      mat.template triangularView<UpLo>().setZero();

    enum {
      StorageOrder = (internal::traits<MatrixType>::Flags&RowMajorBit) ? RowMajor : ColMajor,
      UseLhsDirectly = _ActualLhs::InnerStrideAtCompileTime==1,
...
...
@@ -236,10 +247,8 @@ struct general_product_to_triangular_selector<MatrixType,ProductType,UpLo,true>
template<typename MatrixType, typename ProductType, int UpLo>
struct general_product_to_triangular_selector<MatrixType,ProductType,UpLo,false>
{
-  static void run(MatrixType& mat, const ProductType& prod, const typename MatrixType::Scalar& alpha)
+  static void run(MatrixType& mat, const ProductType& prod, const typename MatrixType::Scalar& alpha, bool beta)
  {
-    typedef typename MatrixType::Index Index;

    typedef typename internal::remove_all<typename ProductType::LhsNested>::type Lhs;
    typedef internal::blas_traits<Lhs> LhsBlasTraits;
    typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhs;
...
...
@@ -254,23 +263,47 @@ struct general_product_to_triangular_selector<MatrixType,ProductType,UpLo,false>
    typename ProductType::Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs().derived())
                                                     * RhsBlasTraits::extractScalarFactor(prod.rhs().derived());

+    if(!beta)
+      mat.template triangularView<UpLo>().setZero();

+    enum {
+      IsRowMajor = (internal::traits<MatrixType>::Flags&RowMajorBit) ? 1 : 0,
+      LhsIsRowMajor = _ActualLhs::Flags&RowMajorBit ? 1 : 0,
+      RhsIsRowMajor = _ActualRhs::Flags&RowMajorBit ? 1 : 0,
+      SkipDiag = (UpLo&(UnitDiag|ZeroDiag))!=0
+    };
+
+    Index size = mat.cols();
+    if(SkipDiag)
+      size--;
+    Index depth = actualLhs.cols();
+
+    typedef internal::gemm_blocking_space<IsRowMajor ? RowMajor : ColMajor,typename Lhs::Scalar,typename Rhs::Scalar,
+              MatrixType::MaxColsAtCompileTime, MatrixType::MaxColsAtCompileTime, _ActualRhs::MaxColsAtCompileTime> BlockingType;
+
+    BlockingType blocking(size, size, depth, 1, false);

    internal::general_matrix_matrix_triangular_product<Index,
-      typename Lhs::Scalar, _ActualLhs::Flags&RowMajorBit ? RowMajor : ColMajor, LhsBlasTraits::NeedToConjugate,
-      typename Rhs::Scalar, _ActualRhs::Flags&RowMajorBit ? RowMajor : ColMajor, RhsBlasTraits::NeedToConjugate,
-                            MatrixType::Flags&RowMajorBit ? RowMajor : ColMajor, UpLo>
-      ::run(mat.cols(), actualLhs.cols(),
-            &actualLhs.coeffRef(0,0), actualLhs.outerStride(),
-            &actualRhs.coeffRef(0,0), actualRhs.outerStride(),
-            mat.data(), mat.outerStride(), actualAlpha);
+      typename Lhs::Scalar, LhsIsRowMajor ? RowMajor : ColMajor, LhsBlasTraits::NeedToConjugate,
+      typename Rhs::Scalar, RhsIsRowMajor ? RowMajor : ColMajor, RhsBlasTraits::NeedToConjugate,
+                            IsRowMajor ? RowMajor : ColMajor, UpLo&(Lower|Upper)>
+      ::run(size, depth,
+            &actualLhs.coeffRef(SkipDiag&&(UpLo&Lower)==Lower ? 1 : 0,0), actualLhs.outerStride(),
+            &actualRhs.coeffRef(0,SkipDiag&&(UpLo&Upper)==Upper ? 1 : 0), actualRhs.outerStride(),
+            mat.data() + (SkipDiag ? (bool(IsRowMajor) != ((UpLo&Lower)==Lower) ? 1 : mat.outerStride()) : 0), mat.outerStride(), actualAlpha, blocking);
  }
};

template<typename MatrixType, unsigned int UpLo>
-template<typename ProductDerived, typename _Lhs, typename _Rhs>
-TriangularView<MatrixType,UpLo>& TriangularView<MatrixType,UpLo>::assignProduct(const ProductBase<ProductDerived, _Lhs,_Rhs>& prod, const Scalar& alpha)
+template<typename ProductType>
+TriangularView<MatrixType,UpLo>& TriangularViewImpl<MatrixType,UpLo,Dense>::_assignProduct(const ProductType& prod, const Scalar& alpha, bool beta)
{
-  general_product_to_triangular_selector<MatrixType, ProductDerived, UpLo, (_Lhs::ColsAtCompileTime==1) || (_Rhs::RowsAtCompileTime==1)>
-    ::run(m_matrix.const_cast_derived(), prod.derived(), alpha);
+  EIGEN_STATIC_ASSERT((UpLo&UnitDiag)==0, WRITING_TO_TRIANGULAR_PART_WITH_UNIT_DIAGONAL_IS_NOT_SUPPORTED);
+  eigen_assert(derived().nestedExpression().rows() == prod.rows() && derived().cols() == prod.cols());
+
+  general_product_to_triangular_selector<MatrixType, ProductType, UpLo, internal::traits<ProductType>::InnerSize==1>
+    ::run(derived().nestedExpression().const_cast_derived(), prod, alpha, beta);

-  return *this;
+  return derived();
}

} // end namespace Eigen
...
...
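For orientation, the kernels in this file all implement variants of a rank update that touches only one triangle of the result. A hedged, unblocked sketch of that operation (no packing, no gebp/tribb split, column-major storage; names are illustrative only) is:

// Illustrative sketch: C (n x n, column-major) gets alpha * A * A^T accumulated into its
// lower triangle only, which is the SYRK-style update the blocked code above computes.
void rank_update_lower(int n, int depth, const double* A, double* C, double alpha)
{
  for (int j = 0; j < n; ++j)           // column of C
    for (int i = j; i < n; ++i) {       // only rows on or below the diagonal
      double acc = 0;
      for (int k = 0; k < depth; ++k)
        acc += A[i + k * n] * A[j + k * n];
      C[i + j * n] += alpha * acc;
    }
}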
external/eigen3/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_MKL.h → external/eigen3/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h
View file @ a394b22a
...
...
@@ -25,15 +25,15 @@
  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ********************************************************************************
-*   Content : Eigen bindings to Intel(R) MKL
+*   Content : Eigen bindings to BLAS F77
 *   Level 3 BLAS SYRK/HERK implementation.
 ********************************************************************************
*/

-#ifndef EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_MKL_H
-#define EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_MKL_H
+#ifndef EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_BLAS_H
+#define EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_BLAS_H

namespace Eigen {

namespace internal {
...
...
@@ -44,34 +44,35 @@ struct general_matrix_matrix_rankupdate :

// try to go to BLAS specialization
-#define EIGEN_MKL_RANKUPDATE_SPECIALIZE(Scalar) \
+#define EIGEN_BLAS_RANKUPDATE_SPECIALIZE(Scalar) \
template <typename Index, int LhsStorageOrder, bool ConjugateLhs, \
          int RhsStorageOrder, bool ConjugateRhs, int UpLo> \
struct general_matrix_matrix_triangular_product<Index,Scalar,LhsStorageOrder,ConjugateLhs, \
               Scalar,RhsStorageOrder,ConjugateRhs,ColMajor,UpLo,Specialized> { \
  static EIGEN_STRONG_INLINE void run(Index size, Index depth,const Scalar* lhs, Index lhsStride, \
-                          const Scalar* rhs, Index rhsStride, Scalar* res, Index resStride, Scalar alpha) \
+                          const Scalar* rhs, Index rhsStride, Scalar* res, Index resStride, Scalar alpha, level3_blocking<Scalar, Scalar>& blocking) \
  { \
-    if (lhs==rhs) { \
+    if ( lhs==rhs && ((UpLo&(Lower|Upper)==UpLo)) ) { \
      general_matrix_matrix_rankupdate<Index,Scalar,LhsStorageOrder,ConjugateLhs,ColMajor,UpLo> \
-      ::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resStride,alpha); \
+      ::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resStride,alpha,blocking); \
    } else { \
      general_matrix_matrix_triangular_product<Index, \
        Scalar, LhsStorageOrder, ConjugateLhs, \
        Scalar, RhsStorageOrder, ConjugateRhs, \
        ColMajor, UpLo, BuiltIn> \
-      ::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resStride,alpha); \
+      ::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resStride,alpha,blocking); \
    } \
  } \
};

-EIGEN_MKL_RANKUPDATE_SPECIALIZE(double)
-//EIGEN_MKL_RANKUPDATE_SPECIALIZE(dcomplex)
-EIGEN_MKL_RANKUPDATE_SPECIALIZE(float)
-//EIGEN_MKL_RANKUPDATE_SPECIALIZE(scomplex)
+EIGEN_BLAS_RANKUPDATE_SPECIALIZE(double)
+EIGEN_BLAS_RANKUPDATE_SPECIALIZE(float)
+// TODO handle complex cases
+// EIGEN_BLAS_RANKUPDATE_SPECIALIZE(dcomplex)
+// EIGEN_BLAS_RANKUPDATE_SPECIALIZE(scomplex)

// SYRK for float/double
-#define EIGEN_MKL_RANKUPDATE_R(EIGTYPE, MKLTYPE, MKLFUNC) \
+#define EIGEN_BLAS_RANKUPDATE_R(EIGTYPE, BLASTYPE, BLASFUNC) \
template <typename Index, int AStorageOrder, bool ConjugateA, int UpLo> \
struct general_matrix_matrix_rankupdate<Index,EIGTYPE,AStorageOrder,ConjugateA,ColMajor,UpLo> { \
  enum { \
...
...
@@ -80,23 +81,19 @@ struct general_matrix_matrix_rankupdate<Index,EIGTYPE,AStorageOrder,ConjugateA,C
    conjA = ((AStorageOrder==ColMajor) && ConjugateA) ? 1 : 0 \
  }; \
  static EIGEN_STRONG_INLINE void run(Index size, Index depth,const EIGTYPE* lhs, Index lhsStride, \
-                          const EIGTYPE* rhs, Index rhsStride, EIGTYPE* res, Index resStride, EIGTYPE alpha) \
+                          const EIGTYPE* /*rhs*/, Index /*rhsStride*/, EIGTYPE* res, Index resStride, EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) \
  { \
  /* typedef Matrix<EIGTYPE, Dynamic, Dynamic, RhsStorageOrder> MatrixRhs;*/ \
-\
-   MKL_INT lda=lhsStride, ldc=resStride, n=size, k=depth; \
-   char uplo=(IsLower) ? 'L' : 'U', trans=(AStorageOrder==RowMajor) ? 'T':'N'; \
-   MKLTYPE alpha_, beta_; \
-\
-/* Set alpha_ & beta_ */ \
-   assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(alpha_, alpha); \
-   assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(beta_, EIGTYPE(1)); \
-   MKLFUNC(&uplo, &trans, &n, &k, &alpha_, lhs, &lda, &beta_, res, &ldc); \
+   BlasIndex lda=convert_index<BlasIndex>(lhsStride), ldc=convert_index<BlasIndex>(resStride), n=convert_index<BlasIndex>(size), k=convert_index<BlasIndex>(depth); \
+   char uplo=((IsLower) ? 'L' : 'U'), trans=((AStorageOrder==RowMajor) ? 'T':'N'); \
+   EIGTYPE beta(1); \
+   BLASFUNC(&uplo, &trans, &n, &k, &numext::real_ref(alpha), lhs, &lda, &numext::real_ref(beta), res, &ldc); \
  } \
};

// HERK for complex data
-#define EIGEN_MKL_RANKUPDATE_C(EIGTYPE, MKLTYPE, RTYPE, MKLFUNC) \
+#define EIGEN_BLAS_RANKUPDATE_C(EIGTYPE, BLASTYPE, RTYPE, BLASFUNC) \
template <typename Index, int AStorageOrder, bool ConjugateA, int UpLo> \
struct general_matrix_matrix_rankupdate<Index,EIGTYPE,AStorageOrder,ConjugateA,ColMajor,UpLo> { \
  enum { \
...
...
@@ -105,18 +102,15 @@ struct general_matrix_matrix_rankupdate<Index,EIGTYPE,AStorageOrder,ConjugateA,C
    conjA = (((AStorageOrder==ColMajor) && ConjugateA) || ((AStorageOrder==RowMajor) && !ConjugateA)) ? 1 : 0 \
  }; \
  static EIGEN_STRONG_INLINE void run(Index size, Index depth,const EIGTYPE* lhs, Index lhsStride, \
-                          const EIGTYPE* rhs, Index rhsStride, EIGTYPE* res, Index resStride, EIGTYPE alpha) \
+                          const EIGTYPE* /*rhs*/, Index /*rhsStride*/, EIGTYPE* res, Index resStride, EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) \
  { \
   typedef Matrix<EIGTYPE, Dynamic, Dynamic, AStorageOrder> MatrixType; \
\
-   MKL_INT lda=lhsStride, ldc=resStride, n=size, k=depth; \
-   char uplo=(IsLower) ? 'L' : 'U', trans=(AStorageOrder==RowMajor) ? 'C':'N'; \
+   BlasIndex lda=convert_index<BlasIndex>(lhsStride), ldc=convert_index<BlasIndex>(resStride), n=convert_index<BlasIndex>(size), k=convert_index<BlasIndex>(depth); \
+   char uplo=((IsLower) ? 'L' : 'U'), trans=((AStorageOrder==RowMajor) ? 'C':'N'); \
   RTYPE alpha_, beta_; \
   const EIGTYPE* a_ptr; \
\
-/* Set alpha_ & beta_ */ \
-/* assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(alpha_, alpha); */ \
-/* assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(beta_, EIGTYPE(1));*/ \
   alpha_ = alpha.real(); \
   beta_ = 1.0; \
/* Copy with conjugation in some cases*/ \
...
...
@@ -127,20 +121,21 @@ struct general_matrix_matrix_rankupdate<Index,EIGTYPE,AStorageOrder,ConjugateA,C
     lda = a.outerStride(); \
     a_ptr = a.data(); \
   } else a_ptr=lhs; \
-   MKLFUNC(&uplo, &trans, &n, &k, &alpha_, (MKLTYPE*)a_ptr, &lda, &beta_, (MKLTYPE*)res, &ldc); \
+   BLASFUNC(&uplo, &trans, &n, &k, &alpha_, (BLASTYPE*)a_ptr, &lda, &beta_, (BLASTYPE*)res, &ldc); \
  } \
};

-EIGEN_MKL_RANKUPDATE_R(double, double, dsyrk)
-EIGEN_MKL_RANKUPDATE_R(float,  float,  ssyrk)
+EIGEN_BLAS_RANKUPDATE_R(double, double, dsyrk_)
+EIGEN_BLAS_RANKUPDATE_R(float,  float,  ssyrk_)

-//EIGEN_MKL_RANKUPDATE_C(dcomplex, MKL_Complex16, double, zherk)
-//EIGEN_MKL_RANKUPDATE_C(scomplex, MKL_Complex8, double, cherk)
+// TODO hanlde complex cases
+// EIGEN_BLAS_RANKUPDATE_C(dcomplex, double, double, zherk_)
+// EIGEN_BLAS_RANKUPDATE_C(scomplex, float,  float,  cherk_)

} // end namespace internal

} // end namespace Eigen

-#endif // EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_MKL_H
+#endif // EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_BLAS_H
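To show what the EIGEN_BLAS_RANKUPDATE_R(double, double, dsyrk_) expansion above boils down to at the call site, here is a hedged sketch of a direct dsyrk_ call. The extern "C" prototype follows the usual Fortran-77 BLAS convention (all arguments passed by pointer, trailing underscore); exact linkage and index types depend on the BLAS actually linked and are assumptions here, not taken from the file.

// Sketch: C (n x n, column-major, lower triangle) += alpha * A * A^T via the reference BLAS SYRK routine.
extern "C" void dsyrk_(const char* uplo, const char* trans, const int* n, const int* k,
                       const double* alpha, const double* a, const int* lda,
                       const double* beta, double* c, const int* ldc);

void syrk_lower(int n, int k, const double* a, int lda, double* c, int ldc, double alpha)
{
  const char uplo = 'L';    // update only the lower triangle of C
  const char trans = 'N';   // C := alpha*A*A^T + beta*C
  const double beta = 1.0;  // accumulate, matching the EIGTYPE beta(1) above
  dsyrk_(&uplo, &trans, &n, &k, &alpha, a, &lda, &beta, c, &ldc);
}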
external/eigen3/Eigen/src/Core/products/GeneralMatrixMatrix_MKL.h → external/eigen3/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h
View file @ a394b22a
...
...
@@ -25,13 +25,13 @@
  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ********************************************************************************
-*   Content : Eigen bindings to Intel(R) MKL
+*   Content : Eigen bindings to BLAS F77
 *   General matrix-matrix product functionality based on ?GEMM.
 ********************************************************************************
*/

-#ifndef EIGEN_GENERAL_MATRIX_MATRIX_MKL_H
-#define EIGEN_GENERAL_MATRIX_MATRIX_MKL_H
+#ifndef EIGEN_GENERAL_MATRIX_MATRIX_BLAS_H
+#define EIGEN_GENERAL_MATRIX_MATRIX_BLAS_H

namespace Eigen {
...
...
@@ -46,13 +46,15 @@ namespace internal {

// gemm specialization
-#define GEMM_SPECIALIZATION(EIGTYPE, EIGPREFIX, MKLTYPE, MKLPREFIX) \
+#define GEMM_SPECIALIZATION(EIGTYPE, EIGPREFIX, BLASTYPE, BLASPREFIX) \
template< \
  typename Index, \
  int LhsStorageOrder, bool ConjugateLhs, \
  int RhsStorageOrder, bool ConjugateRhs> \
struct general_matrix_matrix_product<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,RhsStorageOrder,ConjugateRhs,ColMajor> \
{ \
+typedef gebp_traits<EIGTYPE,EIGTYPE> Traits; \
+\
static void run(Index rows, Index cols, Index depth, \
  const EIGTYPE* _lhs, Index lhsStride, \
  const EIGTYPE* _rhs, Index rhsStride, \
...
...
@@ -64,55 +66,50 @@ static void run(Index rows, Index cols, Index depth, \
  using std::conj; \
\
  char transa, transb; \
-  MKL_INT m, n, k, lda, ldb, ldc; \
+  BlasIndex m, n, k, lda, ldb, ldc; \
  const EIGTYPE *a, *b; \
-  MKLTYPE alpha_, beta_; \
+  EIGTYPE beta(1); \
  MatrixX##EIGPREFIX a_tmp, b_tmp; \
-  EIGTYPE myone(1);\
\
/* Set transpose options */ \
  transa = (LhsStorageOrder==RowMajor) ? ((ConjugateLhs) ? 'C' : 'T') : 'N'; \
  transb = (RhsStorageOrder==RowMajor) ? ((ConjugateRhs) ? 'C' : 'T') : 'N'; \
\
/* Set m, n, k */ \
-  m = (MKL_INT)rows; \
-  n = (MKL_INT)cols; \
-  k = (MKL_INT)depth; \
-\
-/* Set alpha_ & beta_ */ \
-  assign_scalar_eig2mkl(alpha_, alpha); \
-  assign_scalar_eig2mkl(beta_, myone); \
+  m = convert_index<BlasIndex>(rows); \
+  n = convert_index<BlasIndex>(cols); \
+  k = convert_index<BlasIndex>(depth); \
\
/* Set lda, ldb, ldc */ \
-  lda = (MKL_INT)lhsStride; \
-  ldb = (MKL_INT)rhsStride; \
-  ldc = (MKL_INT)resStride; \
+  lda = convert_index<BlasIndex>(lhsStride); \
+  ldb = convert_index<BlasIndex>(rhsStride); \
+  ldc = convert_index<BlasIndex>(resStride); \
\
/* Set a, b, c */ \
  if ((LhsStorageOrder==ColMajor) && (ConjugateLhs)) { \
    Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > lhs(_lhs,m,k,OuterStride<>(lhsStride)); \
    a_tmp = lhs.conjugate(); \
    a = a_tmp.data(); \
-    lda = a_tmp.outerStride(); \
+    lda = convert_index<BlasIndex>(a_tmp.outerStride()); \
  } else a = _lhs; \
\
  if ((RhsStorageOrder==ColMajor) && (ConjugateRhs)) { \
    Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > rhs(_rhs,k,n,OuterStride<>(rhsStride)); \
    b_tmp = rhs.conjugate(); \
    b = b_tmp.data(); \
-    ldb = b_tmp.outerStride(); \
+    ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
  } else b = _rhs; \
\
-  MKLPREFIX##gemm(&transa, &transb, &m, &n, &k, &alpha_, (const MKLTYPE*)a, &lda, (const MKLTYPE*)b, &ldb, &beta_, (MKLTYPE*)res, &ldc); \
+  BLASPREFIX##gemm_(&transa, &transb, &m, &n, &k, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
}};

-GEMM_SPECIALIZATION(double,   d,  double, d)
-GEMM_SPECIALIZATION(float,    f,  float,  s)
-GEMM_SPECIALIZATION(dcomplex, cd, MKL_Complex16, z)
-GEMM_SPECIALIZATION(scomplex, cf, MKL_Complex8,  c)
+GEMM_SPECIALIZATION(double,   d,  double, d)
+GEMM_SPECIALIZATION(float,    f,  float,  s)
+GEMM_SPECIALIZATION(dcomplex, cd, double, z)
+GEMM_SPECIALIZATION(scomplex, cf, float,  c)

} // end namespase internal

} // end namespace Eigen

-#endif // EIGEN_GENERAL_MATRIX_MATRIX_MKL_H
+#endif // EIGEN_GENERAL_MATRIX_MATRIX_BLAS_H
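The GEMM_SPECIALIZATION macro above maps Eigen's storage orders onto the column-major BLAS interface by passing row-major operands as their transpose. The sketch below illustrates that mapping with a direct dgemm_ call; the extern "C" prototype follows the common Fortran-77 convention and is an assumption about the linked BLAS, not something defined in the file above.

// Sketch: C (m x n, column-major) += alpha * op(A) * op(B), where op(X) = X^T when X is stored row-major.
extern "C" void dgemm_(const char* transa, const char* transb, const int* m, const int* n, const int* k,
                       const double* alpha, const double* a, const int* lda,
                       const double* b, const int* ldb,
                       const double* beta, double* c, const int* ldc);

void gemm_colmajor_result(int m, int n, int k,
                          const double* a, int lda, bool a_row_major,
                          const double* b, int ldb, bool b_row_major,
                          double* c, int ldc, double alpha)
{
  const char transa = a_row_major ? 'T' : 'N';  // row-major A is handed to BLAS as A^T
  const char transb = b_row_major ? 'T' : 'N';
  const double beta = 1.0;                      // accumulate into C, as in the specialization above
  dgemm_(&transa, &transb, &m, &n, &k, &alpha, a, &lda, b, &ldb, &beta, c, &ldc);
}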
external/eigen3/Eigen/src/Core/products/GeneralMatrixVector.h
View file @ a394b22a
...
...
@@ -10,7 +10,7 @@
#ifndef EIGEN_GENERAL_MATRIX_VECTOR_H
#define EIGEN_GENERAL_MATRIX_VECTOR_H

namespace Eigen {

namespace internal {
...
...
@@ -26,11 +26,39 @@ namespace internal {
 * |real |cplx |real | alpha is converted to a cplx when calling the run function, no vectorization
 * |cplx |real |cplx | invalid, the caller has to do tmp: = A * B; C += alpha*tmp
 * |cplx |real |real | optimal case, vectorization possible via real-cplx mul
+ *
+ * Accesses to the matrix coefficients follow the following logic:
+ *
+ * - if all columns have the same alignment then
+ *   - if the columns have the same alignment as the result vector, then easy! (-> AllAligned case)
+ *   - otherwise perform unaligned loads only (-> NoneAligned case)
+ * - otherwise
+ *   - if even columns have the same alignment then
+ *     // odd columns are guaranteed to have the same alignment too
+ *     - if even or odd columns have the same alignment as the result, then
+ *       // for a register size of 2 scalars, this is guarantee to be the case (e.g., SSE with double)
+ *       - perform half aligned and half unaligned loads (-> EvenAligned case)
+ *     - otherwise perform unaligned loads only (-> NoneAligned case)
+ *   - otherwise, if the register size is 4 scalars (e.g., SSE with float) then
+ *     - one over 4 consecutive columns is guaranteed to be aligned with the result vector,
+ *       perform simple aligned loads for this column and aligned loads plus re-alignment for the other. (-> FirstAligned case)
+ *       // this re-alignment is done by the palign function implemented for SSE in Eigen/src/Core/arch/SSE/PacketMath.h
+ *   - otherwise,
+ *     // if we get here, this means the register size is greater than 4 (e.g., AVX with floats),
+ *     // we currently fall back to the NoneAligned case
+ *
+ * The same reasoning apply for the transposed case.
+ *
+ * The last case (PacketSize>4) could probably be improved by generalizing the FirstAligned case, but since we do not support AVX yet...
+ * One might also wonder why in the EvenAligned case we perform unaligned loads instead of using the aligned-loads plus re-alignment
+ * strategy as in the FirstAligned case. The reason is that we observed that unaligned loads on a 8 byte boundary are not too slow
+ * compared to unaligned loads on a 4 byte boundary.
+ *
 */
-template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
-struct general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>
+template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
+struct general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>
{
-  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;

  enum {
    Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
...
...
@@ -50,31 +78,35 @@ typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
  EIGEN_DONT_INLINE static void run(
    Index rows, Index cols,
-    const LhsScalar* lhs, Index lhsStride,
-    const RhsScalar* rhs, Index rhsIncr,
-    ResScalar* res, Index resIncr,
-    RhsScalar alpha);
+    const LhsMapper& lhs,
+    const RhsMapper& rhs,
+    ResScalar* res, Index resIncr,
+    RhsScalar alpha);
};

-template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
-EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>::run(
+template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
+EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
  Index rows, Index cols,
-  const LhsScalar* lhs, Index lhsStride,
-  const RhsScalar* rhs, Index rhsIncr,
-  ResScalar* res, Index resIncr, RhsScalar alpha)
+  const LhsMapper& lhs,
+  const RhsMapper& rhs,
+  ResScalar* res, Index resIncr, RhsScalar alpha)
{
-  EIGEN_UNUSED_VARIABLE(resIncr)
+  EIGEN_UNUSED_VARIABLE(resIncr);
  eigen_internal_assert(resIncr==1);
  #ifdef _EIGEN_ACCUMULATE_PACKETS
  #error _EIGEN_ACCUMULATE_PACKETS has already been defined
  #endif
-  #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) \
+  #define _EIGEN_ACCUMULATE_PACKETS(Alignment0,Alignment13,Alignment2) \
    pstore(&res[j], \
      padd(pload<ResPacket>(&res[j]), \
        padd( \
-          padd(pcj.pmul(EIGEN_CAT(ploa , A0)<LhsPacket>(&lhs0[j]),    ptmp0), \
-                  pcj.pmul(EIGEN_CAT(ploa , A13)<LhsPacket>(&lhs1[j]),   ptmp1)), \
-          padd(pcj.pmul(EIGEN_CAT(ploa , A2)<LhsPacket>(&lhs2[j]),    ptmp2), \
-                  pcj.pmul(EIGEN_CAT(ploa , A13)<LhsPacket>(&lhs3[j]),   ptmp3)) )))
+          padd(pcj.pmul(lhs0.template load<LhsPacket, Alignment0>(j),    ptmp0), \
+                  pcj.pmul(lhs1.template load<LhsPacket, Alignment13>(j),   ptmp1)), \
+          padd(pcj.pmul(lhs2.template load<LhsPacket, Alignment2>(j),    ptmp2), \
+                  pcj.pmul(lhs3.template load<LhsPacket, Alignment13>(j),   ptmp3)) )))

+  typedef typename LhsMapper::VectorMapper LhsScalars;

  conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
  conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
...
...
@@ -88,10 +120,12 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
  const Index ResPacketAlignedMask = ResPacketSize-1;
//  const Index PeelAlignedMask = ResPacketSize*peels-1;
  const Index size = rows;

+  const Index lhsStride = lhs.stride();

  // How many coeffs of the result do we have to skip to be aligned.
  // Here we assume data are at least aligned on the base scalar type.
-  Index alignedStart = internal::first_aligned(res,size);
+  Index alignedStart = internal::first_default_aligned(res,size);
  Index alignedSize = ResPacketSize>1 ? alignedStart + ((size-alignedStart) & ~ResPacketAlignedMask) : 0;
  const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;
...
...
@@ -101,19 +135,26 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
                               : FirstAligned;

  // we cannot assume the first element is aligned because of sub-matrices
-  const Index lhsAlignmentOffset = internal::first_aligned(lhs,size);
+  const Index lhsAlignmentOffset = lhs.firstAligned(size);

  // find how many columns do we have to skip to be aligned with the result (if possible)
  Index skipColumns = 0;
  // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
-  if( (size_t(lhs)%sizeof(LhsScalar)) || (size_t(res)%sizeof(ResScalar)) )
+  if( (lhsAlignmentOffset < 0) || (lhsAlignmentOffset == size) || (UIntPtr(res)%sizeof(ResScalar)) )
  {
    alignedSize = 0;
    alignedStart = 0;
    alignmentPattern = NoneAligned;
  }
+  else if(LhsPacketSize > 4)
+  {
+    // TODO: extend the code to support aligned loads whenever possible when LhsPacketSize > 4.
+    // Currently, it seems to be better to perform unaligned loads anyway
+    alignmentPattern = NoneAligned;
+  }
  else if(LhsPacketSize > 1)
  {
-    eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size<LhsPacketSize);
+    // eigen_internal_assert(size_t(firstLhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size<LhsPacketSize);

    while(skipColumns<LhsPacketSize &&
          alignedStart != ((lhsAlignmentOffset + alignmentStep*skipColumns)%LhsPacketSize))
...
...
@@ -130,10 +171,10 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
      // note that the skiped columns are processed later.
    }

-    eigen_internal_assert(  (alignmentPattern==NoneAligned)
+    /*    eigen_internal_assert(  (alignmentPattern==NoneAligned)
                      || (skipColumns + columnsAtOnce >= cols)
                      || LhsPacketSize > size
-                      || (size_t(lhs+alignedStart+lhsStride*skipColumns)%sizeof(LhsPacket))==0);
+                      || (size_t(firstLhs+alignedStart+lhsStride*skipColumns)%sizeof(LhsPacket))==0);
+    */
  }
  else if(Vectorizable)
  {
...
...
@@ -142,20 +183,20 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
    alignmentPattern = AllAligned;
  }

-  Index offset1 = (FirstAligned && alignmentStep==1?3:1);
-  Index offset3 = (FirstAligned && alignmentStep==1?1:3);
+  const Index offset1 = (FirstAligned && alignmentStep==1)?3:1;
+  const Index offset3 = (FirstAligned && alignmentStep==1)?1:3;

  Index columnBound = ((cols-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns;
  for (Index i=skipColumns; i<columnBound; i+=columnsAtOnce)
  {
-    RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs[i*rhsIncr]),
-              ptmp1 = pset1<RhsPacket>(alpha*rhs[(i+offset1)*rhsIncr]),
-              ptmp2 = pset1<RhsPacket>(alpha*rhs[(i+2)*rhsIncr]),
-              ptmp3 = pset1<RhsPacket>(alpha*rhs[(i+offset3)*rhsIncr]);
+    RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs(i, 0)),
+              ptmp1 = pset1<RhsPacket>(alpha*rhs(i+offset1, 0)),
+              ptmp2 = pset1<RhsPacket>(alpha*rhs(i+2, 0)),
+              ptmp3 = pset1<RhsPacket>(alpha*rhs(i+offset3, 0));

    // this helps a lot generating better binary code
-    const LhsScalar *lhs0 = lhs + i*lhsStride,     *lhs1 = lhs + (i+offset1)*lhsStride,
-                    *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride;
+    const LhsScalars lhs0 = lhs.getVectorMapper(0, i+0),   lhs1 = lhs.getVectorMapper(0, i+offset1),
+                     lhs2 = lhs.getVectorMapper(0, i+2),   lhs3 = lhs.getVectorMapper(0, i+offset3);

    if (Vectorizable)
    {
...
...
@@ -163,10 +204,10 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
      // process initial unaligned coeffs
      for (Index j=0; j<alignedStart; ++j)
      {
-        res[j] = cj.pmadd(lhs0[j], pfirst(ptmp0), res[j]);
-        res[j] = cj.pmadd(lhs1[j], pfirst(ptmp1), res[j]);
-        res[j] = cj.pmadd(lhs2[j], pfirst(ptmp2), res[j]);
-        res[j] = cj.pmadd(lhs3[j], pfirst(ptmp3), res[j]);
+        res[j] = cj.pmadd(lhs0(j), pfirst(ptmp0), res[j]);
+        res[j] = cj.pmadd(lhs1(j), pfirst(ptmp1), res[j]);
+        res[j] = cj.pmadd(lhs2(j), pfirst(ptmp2), res[j]);
+        res[j] = cj.pmadd(lhs3(j), pfirst(ptmp3), res[j]);
      }

      if (alignedSize>alignedStart)
...
...
@@ -175,11 +216,11 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
      {
        switch(alignmentPattern)
        {
          case AllAligned:
            for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
-              _EIGEN_ACCUMULATE_PACKETS(d,d,d);
+              _EIGEN_ACCUMULATE_PACKETS(Aligned,Aligned,Aligned);
            break;
          case EvenAligned:
            for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
-              _EIGEN_ACCUMULATE_PACKETS(d,du,d);
+              _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Aligned);
            break;
          case FirstAligned:
          {
...
...
@@ -189,28 +230,28 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
              LhsPacket A00, A01, A02, A03, A10, A11, A12, A13;
              ResPacket T0, T1;

-              A01 = pload<LhsPacket>(&lhs1[alignedStart-1]);
-              A02 = pload<LhsPacket>(&lhs2[alignedStart-2]);
-              A03 = pload<LhsPacket>(&lhs3[alignedStart-3]);
+              A01 = lhs1.template load<LhsPacket, Aligned>(alignedStart-1);
+              A02 = lhs2.template load<LhsPacket, Aligned>(alignedStart-2);
+              A03 = lhs3.template load<LhsPacket, Aligned>(alignedStart-3);

              for (; j<peeledSize; j+=peels*ResPacketSize)
              {
-                A11 = pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]);  palign<1>(A01,A11);
-                A12 = pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]);  palign<2>(A02,A12);
-                A13 = pload<LhsPacket>(&lhs3[j-3+LhsPacketSize]);  palign<3>(A03,A13);
+                A11 = lhs1.template load<LhsPacket, Aligned>(j-1+LhsPacketSize);  palign<1>(A01,A11);
+                A12 = lhs2.template load<LhsPacket, Aligned>(j-2+LhsPacketSize);  palign<2>(A02,A12);
+                A13 = lhs3.template load<LhsPacket, Aligned>(j-3+LhsPacketSize);  palign<3>(A03,A13);

-                A00 = pload<LhsPacket>(&lhs0[j]);
-                A10 = pload<LhsPacket>(&lhs0[j+LhsPacketSize]);
+                A00 = lhs0.template load<LhsPacket, Aligned>(j);
+                A10 = lhs0.template load<LhsPacket, Aligned>(j+LhsPacketSize);
                T0  = pcj.pmadd(A00, ptmp0, pload<ResPacket>(&res[j]));
                T1  = pcj.pmadd(A10, ptmp0, pload<ResPacket>(&res[j+ResPacketSize]));

                T0  = pcj.pmadd(A01, ptmp1, T0);
-                A01 = pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]);  palign<1>(A11,A01);
+                A01 = lhs1.template load<LhsPacket, Aligned>(j-1+2*LhsPacketSize);  palign<1>(A11,A01);
                T0  = pcj.pmadd(A02, ptmp2, T0);
-                A02 = pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]);  palign<2>(A12,A02);
+                A02 = lhs2.template load<LhsPacket, Aligned>(j-2+2*LhsPacketSize);  palign<2>(A12,A02);
                T0  = pcj.pmadd(A03, ptmp3, T0);
                pstore(&res[j],T0);
-                A03 = pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]);  palign<3>(A13,A03);
+                A03 = lhs3.template load<LhsPacket, Aligned>(j-3+2*LhsPacketSize);  palign<3>(A13,A03);
                T1  = pcj.pmadd(A11, ptmp1, T1);
                T1  = pcj.pmadd(A12, ptmp2, T1);
                T1  = pcj.pmadd(A13, ptmp3, T1);
...
...
@@ -218,12 +259,12 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
            }
          }
          for (; j<alignedSize; j+=ResPacketSize)
            _EIGEN_ACCUMULATE_PACKETS(d,du,du);
            _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Unaligned);
          break;
        }
        default:
          for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
            _EIGEN_ACCUMULATE_PACKETS(du,du,du);
            _EIGEN_ACCUMULATE_PACKETS(Unaligned,Unaligned,Unaligned);
          break;
      }
    }
...
...
@@ -232,10 +273,10 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
    /* process remaining coeffs (or all if there is no explicit vectorization) */
    for (Index j=alignedSize; j<size; ++j)
    {
      res[j] = cj.pmadd(lhs0[j], pfirst(ptmp0), res[j]);
      res[j] = cj.pmadd(lhs1[j], pfirst(ptmp1), res[j]);
      res[j] = cj.pmadd(lhs2[j], pfirst(ptmp2), res[j]);
      res[j] = cj.pmadd(lhs3[j], pfirst(ptmp3), res[j]);
      res[j] = cj.pmadd(lhs0(j), pfirst(ptmp0), res[j]);
      res[j] = cj.pmadd(lhs1(j), pfirst(ptmp1), res[j]);
      res[j] = cj.pmadd(lhs2(j), pfirst(ptmp2), res[j]);
      res[j] = cj.pmadd(lhs3(j), pfirst(ptmp3), res[j]);
    }
  }
...
...
@@ -246,27 +287,27 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
  {
    for (Index k=start; k<end; ++k)
    {
      RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs[k*rhsIncr]);
      const LhsScalar* lhs0 = lhs + k*lhsStride;
      RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs(k, 0));
      const LhsScalars lhs0 = lhs.getVectorMapper(0, k);

      if(Vectorizable)
      {
        /* explicit vectorization */
        // process first unaligned result's coeffs
        for (Index j=0; j<alignedStart; ++j)
          res[j] += cj.pmul(lhs0[j], pfirst(ptmp0));
          res[j] += cj.pmul(lhs0(j), pfirst(ptmp0));
        // process aligned result's coeffs
        if ((size_t(lhs0+alignedStart)%sizeof(LhsPacket))==0)
        if (lhs0.template aligned<LhsPacket>(alignedStart))
          for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
            pstore(&res[i], pcj.pmadd(pload<LhsPacket>(&lhs0[i]), ptmp0, pload<ResPacket>(&res[i])));
            pstore(&res[i], pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(i), ptmp0, pload<ResPacket>(&res[i])));
        else
          for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
            pstore(&res[i], pcj.pmadd(ploadu<LhsPacket>(&lhs0[i]), ptmp0, pload<ResPacket>(&res[i])));
            pstore(&res[i], pcj.pmadd(lhs0.template load<LhsPacket, Unaligned>(i), ptmp0, pload<ResPacket>(&res[i])));
      }

      // process remaining scalars (or all if no explicit vectorization)
      for (Index i=alignedSize; i<size; ++i)
        res[i] += cj.pmul(lhs0[i], pfirst(ptmp0));
        res[i] += cj.pmul(lhs0(i), pfirst(ptmp0));
    }
    if (skipColumns)
    {
...
...
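For reference only, the quantity the column-major kernel in the hunks above computes is simply res += alpha * lhs * rhs, accumulated one column at a time; the packet code does the same thing four columns at once with aligned loads where possible. A plain scalar sketch (names are illustrative, not Eigen's) under that assumption:

#include <cstddef>

// Scalar reference of the column-major GEMV accumulation: scale each
// column of lhs by alpha*rhs[i] and add it into res.
void gemv_colmajor_ref(std::size_t rows, std::size_t cols,
                       const double* lhs, std::size_t lhsStride,
                       const double* rhs, double* res, double alpha)
{
  for (std::size_t i = 0; i < cols; ++i) {
    const double c = alpha * rhs[i];
    const double* col = lhs + i * lhsStride;   // start of column i
    for (std::size_t j = 0; j < rows; ++j)
      res[j] += col[j] * c;
  }
}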
@@ -290,10 +331,10 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
 *  - alpha is always a complex (or converted to a complex)
 *  - no vectorization
 */
template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
struct general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>
template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
struct general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>
{
typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;

enum {
  Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
...
...
@@ -310,73 +351,84 @@ typedef typename packet_traits<ResScalar>::type _ResPacket;
typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;

EIGEN_DONT_INLINE static void run(
  Index rows, Index cols,
  const LhsScalar* lhs, Index lhsStride,
  const RhsScalar* rhs, Index rhsIncr,
        ResScalar* res, Index resIncr,
  const LhsMapper& lhs,
  const RhsMapper& rhs,
        ResScalar* res, Index resIncr,
  ResScalar alpha);
};

template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>::run(
template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
  Index rows, Index cols,
  const LhsScalar* lhs, Index lhsStride,
  const RhsScalar* rhs, Index rhsIncr,
  const LhsMapper& lhs,
  const RhsMapper& rhs,
  ResScalar* res, Index resIncr,
  ResScalar alpha)
{
  EIGEN_UNUSED_VARIABLE(rhsIncr);
  eigen_internal_assert(rhsIncr==1);
  eigen_internal_assert(rhs.stride()==1);

  #ifdef _EIGEN_ACCUMULATE_PACKETS
  #error _EIGEN_ACCUMULATE_PACKETS has already been defined
  #endif

  #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) {\
    RhsPacket b = pload<RhsPacket>(&rhs[j]); \
    ptmp0 = pcj.pmadd(EIGEN_CAT(ploa,A0) <LhsPacket>(&lhs0[j]), b, ptmp0); \
    ptmp1 = pcj.pmadd(EIGEN_CAT(ploa,A13)<LhsPacket>(&lhs1[j]), b, ptmp1); \
    ptmp2 = pcj.pmadd(EIGEN_CAT(ploa,A2) <LhsPacket>(&lhs2[j]), b, ptmp2); \
    ptmp3 = pcj.pmadd(EIGEN_CAT(ploa,A13)<LhsPacket>(&lhs3[j]), b, ptmp3); }
  #define _EIGEN_ACCUMULATE_PACKETS(Alignment0,Alignment13,Alignment2) {\
    RhsPacket b = rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0); \
    ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Alignment0>(j), b, ptmp0); \
    ptmp1 = pcj.pmadd(lhs1.template load<LhsPacket, Alignment13>(j), b, ptmp1); \
    ptmp2 = pcj.pmadd(lhs2.template load<LhsPacket, Alignment2>(j), b, ptmp2); \
    ptmp3 = pcj.pmadd(lhs3.template load<LhsPacket, Alignment13>(j), b, ptmp3); }

  conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
  conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;

  typedef typename LhsMapper::VectorMapper LhsScalars;

  enum { AllAligned=0, EvenAligned=1, FirstAligned=2, NoneAligned=3 };
  const Index rowsAtOnce = 4;
  const Index peels = 2;
  const Index RhsPacketAlignedMask = RhsPacketSize-1;
  const Index LhsPacketAlignedMask = LhsPacketSize-1;
  // const Index PeelAlignedMask = RhsPacketSize*peels-1;
  const Index depth = cols;
  const Index lhsStride = lhs.stride();

  // How many coeffs of the result do we have to skip to be aligned.
  // Here we assume data are at least aligned on the base scalar type
  // if that's not the case then vectorization is discarded, see below.
  Index alignedStart = internal::first_aligned(rhs, depth);
  Index alignedStart = rhs.firstAligned(depth);
  Index alignedSize = RhsPacketSize>1 ? alignedStart + ((depth-alignedStart) & ~RhsPacketAlignedMask) : 0;
  const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;

  const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
  Index alignmentPattern = alignmentStep==0 ? AllAligned
                         : alignmentStep==(LhsPacketSize/2) ? EvenAligned
                         : FirstAligned;
                         : alignmentStep==(LhsPacketSize/2) ? EvenAligned
                         : FirstAligned;

  // we cannot assume the first element is aligned because of sub-matrices
  const Index lhsAlignmentOffset = internal::first_aligned(lhs,depth);
  const Index lhsAlignmentOffset = lhs.firstAligned(depth);
  const Index rhsAlignmentOffset = rhs.firstAligned(rows);

  // find how many rows do we have to skip to be aligned with rhs (if possible)
  Index skipRows = 0;
  // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
  if( (sizeof(LhsScalar)!=sizeof(RhsScalar)) || (size_t(lhs)%sizeof(LhsScalar)) || (size_t(rhs)%sizeof(RhsScalar)) )
  if( (sizeof(LhsScalar)!=sizeof(RhsScalar)) || (lhsAlignmentOffset < 0) || (lhsAlignmentOffset == depth) || (rhsAlignmentOffset < 0) || (rhsAlignmentOffset == rows) )
  {
    alignedSize = 0;
    alignedStart = 0;
    alignmentPattern = NoneAligned;
  }
  else if(LhsPacketSize > 4)
  {
    // TODO: extend the code to support aligned loads whenever possible when LhsPacketSize > 4.
    alignmentPattern = NoneAligned;
  }
  else if (LhsPacketSize>1)
  {
    eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || depth<LhsPacketSize);
    // eigen_internal_assert(size_t(firstLhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || depth<LhsPacketSize);

    while (skipRows<LhsPacketSize &&
           alignedStart != ((lhsAlignmentOffset + alignmentStep*skipRows)%LhsPacketSize))
...
...
@@ -392,11 +444,11 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
    skipRows = (std::min)(skipRows,Index(rows));
    // note that the skiped columns are processed later.
  }

  eigen_internal_assert(  alignmentPattern==NoneAligned
  /*  eigen_internal_assert( alignmentPattern==NoneAligned
        || LhsPacketSize==1
        || (skipRows + rowsAtOnce >= rows)
        || LhsPacketSize > depth
        || (size_t(lhs+alignedStart+lhsStride*skipRows)%sizeof(LhsPacket))==0);
        || (size_t(firstLhs+alignedStart+lhsStride*skipRows)%sizeof(LhsPacket))==0);
  */
  }
  else if(Vectorizable)
  {
...
...
@@ -405,18 +457,19 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
    alignmentPattern = AllAligned;
  }

  Index offset1 = (FirstAligned && alignmentStep==1?3:1);
  Index offset3 = (FirstAligned && alignmentStep==1?1:3);
  const Index offset1 = (FirstAligned && alignmentStep==1)?3:1;
  const Index offset3 = (FirstAligned && alignmentStep==1)?1:3;

  Index rowBound = ((rows-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows;
  for (Index i=skipRows; i<rowBound; i+=rowsAtOnce)
  {
    EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0);
    // FIXME: what is the purpose of this EIGEN_ALIGN_DEFAULT ??
    EIGEN_ALIGN_MAX ResScalar tmp0 = ResScalar(0);
    ResScalar tmp1 = ResScalar(0), tmp2 = ResScalar(0), tmp3 = ResScalar(0);

    // this helps the compiler generating good binary code
    const LhsScalar *lhs0 = lhs + i*lhsStride,     *lhs1 = lhs + (i+offset1)*lhsStride,
                    *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride;
    const LhsScalars lhs0 = lhs.getVectorMapper(i+0, 0),   lhs1 = lhs.getVectorMapper(i+offset1, 0),
                     lhs2 = lhs.getVectorMapper(i+2, 0),   lhs3 = lhs.getVectorMapper(i+offset3, 0);

    if(Vectorizable)
    {
...
...
@@ -428,9 +481,9 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
        // FIXME this loop get vectorized by the compiler !
        for (Index j=0; j<alignedStart; ++j)
        {
          RhsScalar b = rhs[j];
          tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b); tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b);
          RhsScalar b = rhs(j, 0);
          tmp0 += cj.pmul(lhs0(j),b); tmp1 += cj.pmul(lhs1(j),b); tmp2 += cj.pmul(lhs2(j),b); tmp3 += cj.pmul(lhs3(j),b);
        }

        if (alignedSize>alignedStart)
...
...
@@ -439,11 +492,11 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
        {
          case AllAligned:
            for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(d,d,d);
              _EIGEN_ACCUMULATE_PACKETS(Aligned,Aligned,Aligned);
            break;
          case EvenAligned:
            for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(d,du,d);
              _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Aligned);
            break;
          case FirstAligned:
          {
...
...
@@ -457,39 +510,39 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
             * than basic unaligned loads.
             */
            LhsPacket A01, A02, A03, A11, A12, A13;

            A01 = pload<LhsPacket>(&lhs1[alignedStart-1]);
            A02 = pload<LhsPacket>(&lhs2[alignedStart-2]);
            A03 = pload<LhsPacket>(&lhs3[alignedStart-3]);
            A01 = lhs1.template load<LhsPacket, Aligned>(alignedStart-1);
            A02 = lhs2.template load<LhsPacket, Aligned>(alignedStart-2);
            A03 = lhs3.template load<LhsPacket, Aligned>(alignedStart-3);

            for (; j<peeledSize; j+=peels*RhsPacketSize)
            {
              RhsPacket b = pload<RhsPacket>(&rhs[j]);
              A11 = pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]);  palign<1>(A01,A11);
              A12 = pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]);  palign<2>(A02,A12);
              A13 = pload<LhsPacket>(&lhs3[j-3+LhsPacketSize]);  palign<3>(A03,A13);
              RhsPacket b = rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0);
              A11 = lhs1.template load<LhsPacket, Aligned>(j-1+LhsPacketSize);  palign<1>(A01,A11);
              A12 = lhs2.template load<LhsPacket, Aligned>(j-2+LhsPacketSize);  palign<2>(A02,A12);
              A13 = lhs3.template load<LhsPacket, Aligned>(j-3+LhsPacketSize);  palign<3>(A03,A13);

              ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j]), b, ptmp0);
              ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(j), b, ptmp0);
              ptmp1 = pcj.pmadd(A01, b, ptmp1);
              A01 = pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]);  palign<1>(A11,A01);
              A01 = lhs1.template load<LhsPacket, Aligned>(j-1+2*LhsPacketSize);  palign<1>(A11,A01);
              ptmp2 = pcj.pmadd(A02, b, ptmp2);
              A02 = pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]);  palign<2>(A12,A02);
              A02 = lhs2.template load<LhsPacket, Aligned>(j-2+2*LhsPacketSize);  palign<2>(A12,A02);
              ptmp3 = pcj.pmadd(A03, b, ptmp3);
              A03 = pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]);  palign<3>(A13,A03);
              A03 = lhs3.template load<LhsPacket, Aligned>(j-3+2*LhsPacketSize);  palign<3>(A13,A03);

              b = pload<RhsPacket>(&rhs[j+RhsPacketSize]);
              ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j+LhsPacketSize]), b, ptmp0);
              b = rhs.getVectorMapper(j+RhsPacketSize, 0).template load<RhsPacket, Aligned>(0);
              ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(j+LhsPacketSize), b, ptmp0);
              ptmp1 = pcj.pmadd(A11, b, ptmp1);
              ptmp2 = pcj.pmadd(A12, b, ptmp2);
              ptmp3 = pcj.pmadd(A13, b, ptmp3);
            }
          }
          for (; j<alignedSize; j+=RhsPacketSize)
            _EIGEN_ACCUMULATE_PACKETS(d,du,du);
            _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Unaligned);
          break;
        }
        default:
          for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
            _EIGEN_ACCUMULATE_PACKETS(du,du,du);
            _EIGEN_ACCUMULATE_PACKETS(Unaligned,Unaligned,Unaligned);
          break;
      }
      tmp0 += predux(ptmp0);
...
...
@@ -503,9 +556,9 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
      // FIXME this loop get vectorized by the compiler !
      for (Index j=alignedSize; j<depth; ++j)
      {
        RhsScalar b = rhs[j];
        tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b); tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b);
        RhsScalar b = rhs(j, 0);
        tmp0 += cj.pmul(lhs0(j),b); tmp1 += cj.pmul(lhs1(j),b); tmp2 += cj.pmul(lhs2(j),b); tmp3 += cj.pmul(lhs3(j),b);
      }
      res[i*resIncr]            += alpha*tmp0;
      res[(i+offset1)*resIncr]  += alpha*tmp1;
...
...
@@ -520,30 +573,30 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
  {
    for (Index i=start; i<end; ++i)
    {
      EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0);
      EIGEN_ALIGN_MAX ResScalar tmp0 = ResScalar(0);
      ResPacket ptmp0 = pset1<ResPacket>(tmp0);
      const LhsScalar* lhs0 = lhs + i*lhsStride;
      const LhsScalars lhs0 = lhs.getVectorMapper(i, 0);
      // process first unaligned result's coeffs
      // FIXME this loop get vectorized by the compiler !
      for (Index j=0; j<alignedStart; ++j)
        tmp0 += cj.pmul(lhs0[j], rhs[j]);
        tmp0 += cj.pmul(lhs0(j), rhs(j, 0));

      if (alignedSize>alignedStart)
      {
        // process aligned rhs coeffs
        if ((size_t(lhs0+alignedStart)%sizeof(LhsPacket))==0)
        if (lhs0.template aligned<LhsPacket>(alignedStart))
          for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
            ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j]), pload<RhsPacket>(&rhs[j]), ptmp0);
            ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(j), rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0), ptmp0);
        else
          for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
            ptmp0 = pcj.pmadd(ploadu<LhsPacket>(&lhs0[j]), pload<RhsPacket>(&rhs[j]), ptmp0);
            ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Unaligned>(j), rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0), ptmp0);
        tmp0 += predux(ptmp0);
      }

      // process remaining scalars
      // FIXME this loop get vectorized by the compiler !
      for (Index j=alignedSize; j<depth; ++j)
        tmp0 += cj.pmul(lhs0[j], rhs[j]);
        tmp0 += cj.pmul(lhs0(j), rhs(j, 0));
      res[i*resIncr] += alpha*tmp0;
    }
    if (skipRows)
...
...
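The changes in this file replace raw LhsScalar*/stride arguments with mapper objects (getVectorMapper, firstAligned, template load<Packet, Alignment>). A heavily simplified sketch of what such a mapper abstracts, with hypothetical names and no SIMD, might look like this; it is only meant to show why the kernel can stop doing its own pointer arithmetic:

#include <cstddef>

// Hypothetical, minimal analogue of a column-major data mapper.
template <typename Scalar>
struct SimpleVectorMapper {
  const Scalar* data;                          // start of one column
  Scalar operator()(std::ptrdiff_t j) const { return data[j]; }
};

template <typename Scalar>
struct SimpleMatrixMapper {
  const Scalar* data;
  std::ptrdiff_t stride;                       // distance between columns
  SimpleVectorMapper<Scalar> getVectorMapper(std::ptrdiff_t row,
                                             std::ptrdiff_t col) const {
    return SimpleVectorMapper<Scalar>{ data + row + col * stride };
  }
};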
external/eigen3/Eigen/src/Core/products/GeneralMatrixVector_MKL.h → external/eigen3/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h
View file @ a394b22a
...
...
@@ -25,13 +25,13 @@
 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ********************************************************************************
 *   Content : Eigen bindings to Intel(R) MKL
 *   Content : Eigen bindings to BLAS F77
 *   General matrix-vector product functionality based on ?GEMV.
 ********************************************************************************
*/

#ifndef EIGEN_GENERAL_MATRIX_VECTOR_MKL_H
#define EIGEN_GENERAL_MATRIX_VECTOR_MKL_H
#ifndef EIGEN_GENERAL_MATRIX_VECTOR_BLAS_H
#define EIGEN_GENERAL_MATRIX_VECTOR_BLAS_H

namespace Eigen {
...
...
@@ -46,47 +46,46 @@ namespace internal {
// gemv specialization

template<typename Index, typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs>
struct general_matrix_vector_product_gemv :
  general_matrix_vector_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,ConjugateRhs,BuiltIn> {};
template<typename Index, typename LhsScalar, int StorageOrder, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs>
struct general_matrix_vector_product_gemv;

#define EIGEN_MKL_GEMV_SPECIALIZE(Scalar) \
#define EIGEN_BLAS_GEMV_SPECIALIZE(Scalar) \
template<typename Index, bool ConjugateLhs, bool ConjugateRhs> \
struct general_matrix_vector_product<Index,Scalar,ColMajor,ConjugateLhs,Scalar,ConjugateRhs,Specialized> { \
struct general_matrix_vector_product<Index,Scalar,const_blas_data_mapper<Scalar,Index,ColMajor>,ColMajor,ConjugateLhs,Scalar,const_blas_data_mapper<Scalar,Index,RowMajor>,ConjugateRhs,Specialized> { \
static void run( \
  Index rows, Index cols, \
  const Scalar* lhs, Index lhsStride, \
  const Scalar* rhs, Index rhsIncr, \
  const const_blas_data_mapper<Scalar,Index,ColMajor> &lhs, \
  const const_blas_data_mapper<Scalar,Index,RowMajor> &rhs, \
  Scalar* res, Index resIncr, Scalar alpha) \
{ \
  if (ConjugateLhs) { \
    general_matrix_vector_product<Index,Scalar,ColMajor,ConjugateLhs,Scalar,ConjugateRhs,BuiltIn>::run( \
      rows, cols, lhs, lhsStride, rhs, rhsIncr, res, resIncr, alpha); \
    general_matrix_vector_product<Index,Scalar,const_blas_data_mapper<Scalar,Index,ColMajor>,ColMajor,ConjugateLhs,Scalar,const_blas_data_mapper<Scalar,Index,RowMajor>,ConjugateRhs,BuiltIn>::run( \
      rows, cols, lhs, rhs, res, resIncr, alpha); \
  } else { \
    general_matrix_vector_product_gemv<Index,Scalar,ColMajor,ConjugateLhs,Scalar,ConjugateRhs>::run( \
      rows, cols, lhs, lhsStride, rhs, rhsIncr, res, resIncr, alpha); \
      rows, cols, lhs.data(), lhs.stride(), rhs.data(), rhs.stride(), res, resIncr, alpha); \
  } \
} \
}; \
template<typename Index, bool ConjugateLhs, bool ConjugateRhs> \
struct general_matrix_vector_product<Index,Scalar,RowMajor,ConjugateLhs,Scalar,ConjugateRhs,Specialized> { \
struct general_matrix_vector_product<Index,Scalar,const_blas_data_mapper<Scalar,Index,RowMajor>,RowMajor,ConjugateLhs,Scalar,const_blas_data_mapper<Scalar,Index,ColMajor>,ConjugateRhs,Specialized> { \
static void run( \
  Index rows, Index cols, \
  const Scalar* lhs, Index lhsStride, \
  const Scalar* rhs, Index rhsIncr, \
  const const_blas_data_mapper<Scalar,Index,RowMajor> &lhs, \
  const const_blas_data_mapper<Scalar,Index,ColMajor> &rhs, \
  Scalar* res, Index resIncr, Scalar alpha) \
{ \
    general_matrix_vector_product_gemv<Index,Scalar,RowMajor,ConjugateLhs,Scalar,ConjugateRhs>::run( \
      rows, cols, lhs, lhsStride, rhs, rhsIncr, res, resIncr, alpha); \
      rows, cols, lhs.data(), lhs.stride(), rhs.data(), rhs.stride(), res, resIncr, alpha); \
} \
}; \

EIGEN_MKL_GEMV_SPECIALIZE(double)
EIGEN_MKL_GEMV_SPECIALIZE(float)
EIGEN_MKL_GEMV_SPECIALIZE(dcomplex)
EIGEN_MKL_GEMV_SPECIALIZE(scomplex)
EIGEN_BLAS_GEMV_SPECIALIZE(double)
EIGEN_BLAS_GEMV_SPECIALIZE(float)
EIGEN_BLAS_GEMV_SPECIALIZE(dcomplex)
EIGEN_BLAS_GEMV_SPECIALIZE(scomplex)

#define EIGEN_MKL_GEMV_SPECIALIZATION(EIGTYPE,MKLTYPE,MKLPREFIX) \
#define EIGEN_BLAS_GEMV_SPECIALIZATION(EIGTYPE,BLASTYPE,BLASPREFIX) \
template<typename Index, int LhsStorageOrder, bool ConjugateLhs, bool ConjugateRhs> \
struct general_matrix_vector_product_gemv<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,ConjugateRhs> \
{ \
...
...
@@ -98,16 +97,15 @@ static void run( \
  const EIGTYPE* rhs, Index rhsIncr, \
  EIGTYPE* res, Index resIncr, EIGTYPE alpha) \
{ \
  MKL_INT m=rows, n=cols, lda=lhsStride, incx=rhsIncr, incy=resIncr; \
  MKLTYPE alpha_, beta_; \
  const EIGTYPE *x_ptr, myone(1); \
  BlasIndex m=convert_index<BlasIndex>(rows), n=convert_index<BlasIndex>(cols), \
            lda=convert_index<BlasIndex>(lhsStride), incx=convert_index<BlasIndex>(rhsIncr), incy=convert_index<BlasIndex>(resIncr); \
  const EIGTYPE beta(1); \
  const EIGTYPE *x_ptr; \
  char trans=(LhsStorageOrder==ColMajor) ? 'N' : (ConjugateLhs) ? 'C' : 'T'; \
  if (LhsStorageOrder==RowMajor) { \
    m = cols; \
    n = rows; \
    m = convert_index<BlasIndex>(cols); \
    n = convert_index<BlasIndex>(rows); \
  }\
  assign_scalar_eig2mkl(alpha_, alpha); \
  assign_scalar_eig2mkl(beta_, myone); \
  GEMVVector x_tmp; \
  if (ConjugateRhs) { \
    Map<const GEMVVector, 0, InnerStride<> > map_x(rhs,cols,1,InnerStride<>(incx)); \
...
...
@@ -115,17 +113,17 @@ static void run( \
    x_ptr=x_tmp.data(); \
    incx=1; \
  } else x_ptr=rhs; \
  MKLPREFIX##gemv(&trans, &m, &n, &alpha_, (const MKLTYPE*)lhs, &lda, (const MKLTYPE*)x_ptr, &incx, &beta_, (MKLTYPE*)res, &incy); \
  BLASPREFIX##gemv_(&trans, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)lhs, &lda, (const BLASTYPE*)x_ptr, &incx, &numext::real_ref(beta), (BLASTYPE*)res, &incy); \
}\
};

EIGEN_MKL_GEMV_SPECIALIZATION(double,   double,        d)
EIGEN_MKL_GEMV_SPECIALIZATION(float,    float,         s)
EIGEN_MKL_GEMV_SPECIALIZATION(dcomplex, MKL_Complex16, z)
EIGEN_MKL_GEMV_SPECIALIZATION(scomplex, MKL_Complex8,  c)
EIGEN_BLAS_GEMV_SPECIALIZATION(double,   double, d)
EIGEN_BLAS_GEMV_SPECIALIZATION(float,    float,  s)
EIGEN_BLAS_GEMV_SPECIALIZATION(dcomplex, double, z)
EIGEN_BLAS_GEMV_SPECIALIZATION(scomplex, float,  c)

} // end namespase internal

} // end namespace Eigen

#endif // EIGEN_GENERAL_MATRIX_VECTOR_MKL_H
#endif // EIGEN_GENERAL_MATRIX_VECTOR_BLAS_H
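The renamed binding forwards to plain Fortran BLAS (trailing underscore, every argument passed by pointer). As a standalone sketch of the same call pattern, assuming a Fortran BLAS such as OpenBLAS is linked, a direct dgemv call for a column-major matrix looks like this; a row-major matrix is handled, as in the macro above, by swapping m and n and setting the transpose flag:

// Minimal sketch of calling Fortran BLAS dgemv directly (link with -lblas).
extern "C" void dgemv_(const char* trans, const int* m, const int* n,
                       const double* alpha, const double* a, const int* lda,
                       const double* x, const int* incx,
                       const double* beta, double* y, const int* incy);

void gemv_colmajor(int rows, int cols, double alpha,
                   const double* a, int lda, const double* x, double* y)
{
  const char trans = 'N';
  const int incx = 1, incy = 1;
  const double beta = 1.0;   // accumulate into y
  dgemv_(&trans, &rows, &cols, &alpha, a, &lda, x, &incx, &beta, y, &incy);
}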
external/eigen3/Eigen/src/Core/products/Parallelizer.h
View file @ a394b22a
...
...
@@ -10,7 +10,7 @@
#ifndef EIGEN_PARALLELIZER_H
#define EIGEN_PARALLELIZER_H

namespace Eigen {
namespace Eigen {

namespace internal {
...
...
@@ -49,8 +49,8 @@ inline void initParallel()
{
  int nbt;
  internal::manage_multi_threading(GetAction, &nbt);
  std::ptrdiff_t l1, l2;
  internal::manage_caching_sizes(GetAction, &l1, &l2);
  std::ptrdiff_t l1, l2, l3;
  internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
}

/** \returns the max number of threads reserved for Eigen
...
...
@@ -73,17 +73,17 @@ namespace internal {
template<typename Index> struct GemmParallelInfo
{
  GemmParallelInfo() : sync(-1), users(0), rhs_start(0), rhs_length(0) {}
  GemmParallelInfo() : sync(-1), users(0), lhs_start(0), lhs_length(0) {}

  int volatile sync;
  Index volatile sync;
  int volatile users;

  Index rhs_start;
  Index rhs_length;
  Index lhs_start;
  Index lhs_length;
};

template<bool Condition, typename Functor, typename Index>
void parallelize_gemm(const Functor& func, Index rows, Index cols, bool transpose)
void parallelize_gemm(const Functor& func, Index rows, Index cols, Index depth, bool transpose)
{
  // TODO when EIGEN_USE_BLAS is defined,
  // we should still enable OMP for other scalar types
...
...
@@ -92,6 +92,7 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, bool transpos
  // the matrix product when multithreading is enabled. This is a temporary
  // fix to support row-major destination matrices. This whole
  // parallelizer mechanism has to be redisigned anyway.
  EIGEN_UNUSED_VARIABLE(depth);
  EIGEN_UNUSED_VARIABLE(transpose);
  func(0,rows, 0,cols);
#else
...
...
@@ -102,56 +103,56 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, bool transpos
  // - we are not already in a parallel code
  // - the sizes are large enough

  // 1- are we already in a parallel session?
  // FIXME omp_get_num_threads()>1 only works for openmp, what if the user does not use openmp?
  if((!Condition) || (omp_get_num_threads()>1))
    return func(0,rows, 0,cols);

  // compute the maximal number of threads from the size of the product:
  // This first heuristic takes into account that the product kernel is fully optimized when working with nr columns at once.
  Index size = transpose ? rows : cols;
  Index pb_max_threads = std::max<Index>(1,size / Functor::Traits::nr);
  Index size = transpose ? cols : rows;

  // compute the maximal number of threads from the total amount of work:
  double work = static_cast<double>(rows) * static_cast<double>(cols) * static_cast<double>(depth);
  double kMinTaskSize = 50000;
  // FIXME improve this heuristic.
  pb_max_threads = std::max<Index>(1, std::min<Index>(pb_max_threads, work / kMinTaskSize));

  // 2- compute the maximal number of threads from the size of the product:
  // FIXME this has to be fine tuned
  Index max_threads = std::max<Index>(1,size / 32);

  // compute the number of threads we are going to use
  Index threads = std::min<Index>(nbThreads(), pb_max_threads);

  // 3 - compute the number of threads we are going to use
  Index threads = std::min<Index>(nbThreads(), max_threads);

  if(threads==1)
  // if multi-threading is explicitely disabled, not useful, or if we already are in a parallel session,
  // then abort multi-threading
  // FIXME omp_get_num_threads()>1 only works for openmp, what if the user does not use openmp?
  if((!Condition) || (threads==1) || (omp_get_num_threads()>1))
    return func(0,rows, 0,cols);

  Eigen::initParallel();
  func.initParallelSession();
  func.initParallelSession(threads);

  if(transpose)
    std::swap(rows,cols);

  GemmParallelInfo<Index>* info = new GemmParallelInfo<Index>[threads];
  ei_declare_aligned_stack_constructed_variable(GemmParallelInfo<Index>,info,threads,0);

  #pragma omp parallel num_threads(threads)
  {
    Index i = omp_get_thread_num();
    // Note that the actual number of threads might be lower than the number of request ones.
    Index actual_threads = omp_get_num_threads();

    Index blockCols = (cols / actual_threads) & ~Index(0x3);
    Index blockRows = (rows / actual_threads) & ~Index(0x7);
    Index blockRows = (rows / actual_threads);
    blockRows = (blockRows/Functor::Traits::mr)*Functor::Traits::mr;

    Index r0 = i*blockRows;
    Index actualBlockRows = (i+1==actual_threads) ? rows-r0 : blockRows;

    Index c0 = i*blockCols;
    Index actualBlockCols = (i+1==actual_threads) ? cols-c0 : blockCols;

    info[i].rhs_start = c0;
    info[i].rhs_length = actualBlockCols;
    info[i].lhs_start = r0;
    info[i].lhs_length = actualBlockRows;

    if(transpose) func(0, cols, r0, actualBlockRows, info);
    else          func(r0, actualBlockRows, 0,cols, info);
    if(transpose) func(c0, actualBlockCols, 0, rows, info);
    else          func(0, rows, c0, actualBlockCols, info);
  }

  delete[] info;
#endif
}
...
...
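The new heuristic above caps the thread count both by the number of nr-wide column panels and by the total work rows*cols*depth divided by a minimum task size. A small self-contained sketch of just that computation (constants taken from the diff, function name and the omission of the transpose case are mine):

#include <algorithm>
#include <cstdint>

// Sketch of the thread-count heuristic: never use more threads than
// there are nr-wide column panels, and never give a thread less than
// roughly kMinTaskSize multiply-adds of work.
std::int64_t gemm_thread_count(std::int64_t rows, std::int64_t cols,
                               std::int64_t depth, std::int64_t nr,
                               std::int64_t available_threads)
{
  const double kMinTaskSize = 50000.0;
  std::int64_t pb_max = std::max<std::int64_t>(1, cols / nr);
  const double work = double(rows) * double(cols) * double(depth);
  pb_max = std::max<std::int64_t>(1,
             std::min<std::int64_t>(pb_max, std::int64_t(work / kMinTaskSize)));
  return std::min(available_threads, pb_max);
}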
external/eigen3/Eigen/src/Core/products/SelfadjointMatrixMatrix.h
View file @ a394b22a
...
...
@@ -15,7 +15,7 @@ namespace Eigen {
namespace internal {

// pack a selfadjoint block diagonal for use with the gebp_kernel
template<typename Scalar, typename Index, int Pack1, int Pack2, int StorageOrder>
template<typename Scalar, typename Index, int Pack1, int Pack2_dummy, int StorageOrder>
struct symm_pack_lhs
{
  template<int BlockRows> inline
...
...
@@ -45,25 +45,32 @@ struct symm_pack_lhs
  }

  void operator()(Scalar* blockA, const Scalar* _lhs, Index lhsStride, Index cols, Index rows)
  {
    enum { PacketSize = packet_traits<Scalar>::size };
    const_blas_data_mapper<Scalar,Index,StorageOrder> lhs(_lhs,lhsStride);
    Index count = 0;
    Index peeled_mc = (rows/Pack1)*Pack1;
    for(Index i=0; i<peeled_mc; i+=Pack1)
    {
      pack<Pack1>(blockA, lhs, cols, i, count);
    }

    if(rows-peeled_mc>=Pack2)
    {
      pack<Pack2>(blockA, lhs, cols, peeled_mc, count);
      peeled_mc += Pack2;
    }
    //Index peeled_mc3 = (rows/Pack1)*Pack1;
    const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
    const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
    const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0;

    if(Pack1>=3*PacketSize)
      for(Index i=0; i<peeled_mc3; i+=3*PacketSize)
        pack<3*PacketSize>(blockA, lhs, cols, i, count);

    if(Pack1>=2*PacketSize)
      for(Index i=peeled_mc3; i<peeled_mc2; i+=2*PacketSize)
        pack<2*PacketSize>(blockA, lhs, cols, i, count);

    if(Pack1>=1*PacketSize)
      for(Index i=peeled_mc2; i<peeled_mc1; i+=1*PacketSize)
        pack<1*PacketSize>(blockA, lhs, cols, i, count);

    // do the same with mr==1
    for(Index i=peeled_mc; i<rows; i++)
    for(Index i=peeled_mc1; i<rows; i++)
    {
      for(Index k=0; k<i; k++)
        blockA[count++] = lhs(i, k);                   // normal
        blockA[count++] = lhs(i, k);                   // normal

      blockA[count++] = numext::real(lhs(i, i));       // real (diagonal)
...
...
@@ -82,7 +89,8 @@ struct symm_pack_rhs
    Index end_k = k2 + rows;
    Index count = 0;
    const_blas_data_mapper<Scalar,Index,StorageOrder> rhs(_rhs,rhsStride);
    Index packet_cols = (cols/nr)*nr;
    Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
    Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;

    // first part: normal case
    for(Index j2=0; j2<k2; j2+=nr)
...
...
@@ -91,79 +99,151 @@ struct symm_pack_rhs
      {
        blockB[count+0] = rhs(k,j2+0);
        blockB[count+1] = rhs(k,j2+1);
        if (nr==4)
        if (nr>=4)
        {
          blockB[count+2] = rhs(k,j2+2);
          blockB[count+3] = rhs(k,j2+3);
        }
        if (nr>=8)
        {
          blockB[count+4] = rhs(k,j2+4);
          blockB[count+5] = rhs(k,j2+5);
          blockB[count+6] = rhs(k,j2+6);
          blockB[count+7] = rhs(k,j2+7);
        }
        count += nr;
      }
    }

    // second part: diagonal block
    for(Index j2=k2; j2<(std::min)(k2+rows,packet_cols); j2+=nr)
    Index end8 = nr>=8 ? (std::min)(k2+rows,packet_cols8) : k2;
    if(nr>=8)
    {
      // again we can split vertically in three different parts (transpose, symmetric, normal)
      // transpose
      for(Index k=k2; k<j2; k++)
      for(Index j2=k2; j2<end8; j2+=8)
      {
        blockB[count+0] = numext::conj(rhs(j2+0,k));
        blockB[count+1] = numext::conj(rhs(j2+1,k));
        if (nr==4)
        // again we can split vertically in three different parts (transpose, symmetric, normal)
        // transpose
        for(Index k=k2; k<j2; k++)
        {
          blockB[count+0] = numext::conj(rhs(j2+0,k));
          blockB[count+1] = numext::conj(rhs(j2+1,k));
          blockB[count+2] = numext::conj(rhs(j2+2,k));
          blockB[count+3] = numext::conj(rhs(j2+3,k));
          blockB[count+4] = numext::conj(rhs(j2+4,k));
          blockB[count+5] = numext::conj(rhs(j2+5,k));
          blockB[count+6] = numext::conj(rhs(j2+6,k));
          blockB[count+7] = numext::conj(rhs(j2+7,k));
          count += 8;
        }
        count += nr;
      }
      // symmetric
      Index h = 0;
      for(Index k=j2; k<j2+nr; k++)
      {
        // normal
        for (Index w=0 ; w<h; ++w)
          blockB[count+w] = rhs(k,j2+w);
        // symmetric
        Index h = 0;
        for(Index k=j2; k<j2+8; k++)
        {
          // normal
          for (Index w=0 ; w<h; ++w)
            blockB[count+w] = rhs(k,j2+w);

          blockB[count+h] = numext::real(rhs(k,k));
          blockB[count+h] = numext::real(rhs(k,k));

          // transpose
          for (Index w=h+1 ; w<nr; ++w)
            blockB[count+w] = numext::conj(rhs(j2+w,k));
          count += nr;
          ++h;
          // transpose
          for (Index w=h+1 ; w<8; ++w)
            blockB[count+w] = numext::conj(rhs(j2+w,k));
          count += 8;
          ++h;
        }
        // normal
        for(Index k=j2+8; k<end_k; k++)
        {
          blockB[count+0] = rhs(k,j2+0);
          blockB[count+1] = rhs(k,j2+1);
          blockB[count+2] = rhs(k,j2+2);
          blockB[count+3] = rhs(k,j2+3);
          blockB[count+4] = rhs(k,j2+4);
          blockB[count+5] = rhs(k,j2+5);
          blockB[count+6] = rhs(k,j2+6);
          blockB[count+7] = rhs(k,j2+7);
          count += 8;
        }
      }
      // normal
      for(Index k=j2+nr; k<end_k; k++)
    }
    if(nr>=4)
    {
      for(Index j2=end8; j2<(std::min)(k2+rows,packet_cols4); j2+=4)
      {
        blockB[count+0] = rhs(k,j2+0);
        blockB[count+1] = rhs(k,j2+1);
        if (nr==4)
        // again we can split vertically in three different parts (transpose, symmetric, normal)
        // transpose
        for(Index k=k2; k<j2; k++)
        {
          blockB[count+0] = numext::conj(rhs(j2+0,k));
          blockB[count+1] = numext::conj(rhs(j2+1,k));
          blockB[count+2] = numext::conj(rhs(j2+2,k));
          blockB[count+3] = numext::conj(rhs(j2+3,k));
          count += 4;
        }
        // symmetric
        Index h = 0;
        for(Index k=j2; k<j2+4; k++)
        {
          // normal
          for (Index w=0 ; w<h; ++w)
            blockB[count+w] = rhs(k,j2+w);

          blockB[count+h] = numext::real(rhs(k,k));

          // transpose
          for (Index w=h+1 ; w<4; ++w)
            blockB[count+w] = numext::conj(rhs(j2+w,k));
          count += 4;
          ++h;
        }
        // normal
        for(Index k=j2+4; k<end_k; k++)
        {
          blockB[count+0] = rhs(k,j2+0);
          blockB[count+1] = rhs(k,j2+1);
          blockB[count+2] = rhs(k,j2+2);
          blockB[count+3] = rhs(k,j2+3);
          count += 4;
        }
        count += nr;
      }
    }

    // third part: transposed
    for(Index j2=k2+rows; j2<packet_cols; j2+=nr)
    if(nr>=8)
    {
      for(Index k=k2; k<end_k; k++)
      for(Index j2=k2+rows; j2<packet_cols8; j2+=8)
      {
        blockB[count+0] = numext::conj(rhs(j2+0,k));
        blockB[count+1] = numext::conj(rhs(j2+1,k));
        if (nr==4)
        for(Index k=k2; k<end_k; k++)
        {
          blockB[count+0] = numext::conj(rhs(j2+0,k));
          blockB[count+1] = numext::conj(rhs(j2+1,k));
          blockB[count+2] = numext::conj(rhs(j2+2,k));
          blockB[count+3] = numext::conj(rhs(j2+3,k));
          blockB[count+4] = numext::conj(rhs(j2+4,k));
          blockB[count+5] = numext::conj(rhs(j2+5,k));
          blockB[count+6] = numext::conj(rhs(j2+6,k));
          blockB[count+7] = numext::conj(rhs(j2+7,k));
          count += 8;
        }
      }
    }
    if(nr>=4)
    {
      for(Index j2=(std::max)(packet_cols8,k2+rows); j2<packet_cols4; j2+=4)
      {
        for(Index k=k2; k<end_k; k++)
        {
          blockB[count+0] = numext::conj(rhs(j2+0,k));
          blockB[count+1] = numext::conj(rhs(j2+1,k));
          blockB[count+2] = numext::conj(rhs(j2+2,k));
          blockB[count+3] = numext::conj(rhs(j2+3,k));
          count += 4;
        }
        count += nr;
      }
    }

    // copy the remaining columns one at a time (=> the same with nr==1)
    for(Index j2=packet_cols; j2<cols; ++j2)
    for(Index j2=packet_cols4; j2<cols; ++j2)
    {
      // transpose
      Index half = (std::min)(end_k,j2);
...
...
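Independent of the nr==8/4 register-blocking above, what symm_pack_rhs materializes are full columns of a selfadjoint matrix of which only one triangle is actually stored: stored entries are copied, mirrored entries are conjugated, and the diagonal is taken as real. A scalar illustration of that access rule, with a hypothetical helper and a lower-triangle, column-major storage assumption:

#include <complex>

// Illustrative only: coefficient (i,j) of a selfadjoint matrix whose
// lower triangle is stored column-major with leading dimension ld.
std::complex<double> selfadjoint_coeff(const std::complex<double>* data,
                                       long ld, long i, long j)
{
  if (i == j) return std::complex<double>(data[i + j * ld].real(), 0.0);
  if (i > j)  return data[i + j * ld];    // stored triangle
  return std::conj(data[j + i * ld]);     // mirrored and conjugated
}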
@@ -211,7 +291,7 @@ struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,LhsSelfAdjoint,Co
    const Scalar* lhs, Index lhsStride,
    const Scalar* rhs, Index rhsStride,
    Scalar* res,       Index resStride,
    const Scalar& alpha)
    const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
  {
    product_selfadjoint_matrix<Scalar, Index,
      EIGEN_LOGICAL_XOR(RhsSelfAdjoint,RhsStorageOrder==RowMajor) ? ColMajor : RowMajor,
...
...
@@ -219,7 +299,7 @@ struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,LhsSelfAdjoint,Co
      EIGEN_LOGICAL_XOR(LhsSelfAdjoint,LhsStorageOrder==RowMajor) ? ColMajor : RowMajor,
      LhsSelfAdjoint, NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(LhsSelfAdjoint,ConjugateLhs),
      ColMajor>
      ::run(cols, rows,  rhs, rhsStride,  lhs, lhsStride,  res, resStride,  alpha);
      ::run(cols, rows,  rhs, rhsStride,  lhs, lhsStride,  res, resStride,  alpha, blocking);
  }
};
...
...
@@ -234,7 +314,7 @@ struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,true,ConjugateLhs
    const Scalar* _lhs, Index lhsStride,
    const Scalar* _rhs, Index rhsStride,
    Scalar* res,        Index resStride,
    const Scalar& alpha);
    const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking);
};

template <typename Scalar, typename Index,
...
...
@@ -244,33 +324,35 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t
  Index rows, Index cols,
  const Scalar* _lhs, Index lhsStride,
  const Scalar* _rhs, Index rhsStride,
  Scalar* res,        Index resStride,
  const Scalar& alpha)
  Scalar* _res,       Index resStride,
  const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
{
  Index size = rows;

  const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
  const_blas_data_mapper<Scalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);

  typedef gebp_traits<Scalar,Scalar> Traits;

  Index kc = size;  // cache block size along the K direction
  Index mc = rows;  // cache block size along the M direction
  Index nc = cols;  // cache block size along the N direction
  computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc);
  // kc must smaller than mc
  typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper;
  typedef const_blas_data_mapper<Scalar, Index, (LhsStorageOrder == RowMajor) ? ColMajor : RowMajor> LhsTransposeMapper;
  typedef const_blas_data_mapper<Scalar, Index, RhsStorageOrder> RhsMapper;
  typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
  LhsMapper lhs(_lhs,lhsStride);
  LhsTransposeMapper lhs_transpose(_lhs,lhsStride);
  RhsMapper rhs(_rhs,rhsStride);
  ResMapper res(_res, resStride);

  Index kc = blocking.kc();                   // cache block size along the K direction
  Index mc = (std::min)(rows,blocking.mc());  // cache block size along the M direction
  // kc must be smaller than mc
  kc = (std::min)(kc,mc);
  std::size_t sizeA = kc*mc;
  std::size_t sizeB = kc*cols;
  ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
  ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());

  std::size_t sizeW = kc*Traits::WorkSpaceFactor;
  std::size_t sizeB = sizeW + kc*cols;
  ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0);
  ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0);
  Scalar* blockB = allocatedBlockB + sizeW;

  gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
  gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
  symm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
  gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
  gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder==RowMajor?ColMajor:RowMajor, true> pack_lhs_transposed;
  gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder> pack_rhs;
  gemm_pack_lhs<Scalar, Index, LhsTransposeMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder==RowMajor?ColMajor:RowMajor, true> pack_lhs_transposed;

  for(Index k2=0; k2<size; k2+=kc)
  {
...
...
@@ -279,7 +361,7 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t
    // we have selected one row panel of rhs and one column panel of lhs
    // pack rhs's panel into a sequential chunk of memory
    // and expand each coeff to a constant packet for further reuse
    pack_rhs(blockB, &rhs(k2,0), rhsStride, actual_kc, cols);
    pack_rhs(blockB, rhs.getSubMapper(k2,0), actual_kc, cols);

    // the select lhs's panel has to be split in three different parts:
    //  1 - the transposed panel above the diagonal block => transposed packed copy
...
...
@@ -289,9 +371,9 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t
    {
      const Index actual_mc = (std::min)(i2+mc,k2)-i2;
      // transposed packed copy
      pack_lhs_transposed(blockA, &lhs(k2, i2), lhsStride, actual_kc, actual_mc);
      pack_lhs_transposed(blockA, lhs_transpose.getSubMapper(i2, k2), actual_kc, actual_mc);

      gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha);
      gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha);
    }
    // the block diagonal
    {
...
...
@@ -299,16 +381,16 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t
      // symmetric packed copy
      pack_lhs(blockA, &lhs(k2,k2), lhsStride, actual_kc, actual_mc);
      gebp_kernel(res+k2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha);
      gebp_kernel(res.getSubMapper(k2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha);
    }

    for(Index i2=k2+kc; i2<size; i2+=mc)
    {
      const Index actual_mc = (std::min)(i2+mc,size)-i2;
      gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder,false>()
        (blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc);
      gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder,false>()
        (blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc);

      gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha);
      gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha);
    }
  }
}
...
...
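Stripped of the mappers and the symmetric-packing special cases, the loop structure in the hunks above is the usual cache-blocked panel product: for each kc-deep slice, pack one rhs panel, then sweep mc-tall lhs blocks through it. A schematic with a trivial scalar "kernel" standing in for gebp (all names are illustrative, not Eigen's API):

#include <algorithm>

// Schematic of the kc/mc blocking used above, column-major operands.
void blocked_gemm_sketch(long rows, long cols, long depth,
                         const double* A, long lda,
                         const double* B, long ldb,
                         double* C, long ldc,
                         double alpha, long kc, long mc)
{
  for (long k2 = 0; k2 < depth; k2 += kc) {
    const long actual_kc = std::min(depth - k2, kc);
    // The real code packs the rhs panel B(k2:k2+kc, :) here, once per k2.
    for (long i2 = 0; i2 < rows; i2 += mc) {
      const long actual_mc = std::min(rows - i2, mc);
      // The real code packs the lhs block A(i2:i2+mc, k2:k2+kc) and then
      // lets gebp_kernel accumulate the product into C(i2:, :).
      for (long j = 0; j < cols; ++j)
        for (long k = 0; k < actual_kc; ++k)
          for (long i = 0; i < actual_mc; ++i)
            C[(i2 + i) + j * ldc] +=
                alpha * A[(i2 + i) + (k2 + k) * lda] * B[(k2 + k) + j * ldb];
    }
  }
}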
@@ -325,7 +407,7 @@ struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,false,ConjugateLh
    const Scalar* _lhs, Index lhsStride,
    const Scalar* _rhs, Index rhsStride,
    Scalar* res,        Index resStride,
    const Scalar& alpha);
    const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking);
};

template <typename Scalar, typename Index,
...
...
@@ -335,27 +417,27 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,f
  Index rows, Index cols,
  const Scalar* _lhs, Index lhsStride,
  const Scalar* _rhs, Index rhsStride,
  Scalar* res,        Index resStride,
  const Scalar& alpha)
  Scalar* _res,       Index resStride,
  const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
{
  Index size = cols;

  const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);

  typedef gebp_traits<Scalar,Scalar> Traits;

  Index kc = size;  // cache block size along the K direction
  Index mc = rows;  // cache block size along the M direction
  Index nc = cols;  // cache block size along the N direction
  computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc);
  std::size_t sizeW = kc*Traits::WorkSpaceFactor;
  std::size_t sizeB = sizeW + kc*cols;
  ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0);
  ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0);
  Scalar* blockB = allocatedBlockB + sizeW;

  gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
  gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
  typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper;
  typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
  LhsMapper lhs(_lhs,lhsStride);
  ResMapper res(_res,resStride);

  Index kc = blocking.kc();                   // cache block size along the K direction
  Index mc = (std::min)(rows,blocking.mc());  // cache block size along the M direction
  std::size_t sizeA = kc*mc;
  std::size_t sizeB = kc*cols;
  ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
  ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());

  gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
  gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
  symm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;

  for(Index k2=0; k2<size; k2+=kc)
...
...
@@ -368,9 +450,9 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,f
    for(Index i2=0; i2<rows; i2+=mc)
    {
      const Index actual_mc = (std::min)(i2+mc,rows)-i2;
      pack_lhs(blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc);
      pack_lhs(blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc);

      gebp_kernel(res + i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha);
      gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha);
    }
  }
}
...
...
@@ -382,55 +464,58 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,f
***************************************************************************/

namespace internal {

template<typename Lhs, int LhsMode, typename Rhs, int RhsMode>
struct traits<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false> >
  : traits<ProductBase<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false>, Lhs, Rhs> >
{};
}

template<typename Lhs, int LhsMode, typename Rhs, int RhsMode>
struct SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false>
  : public ProductBase<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false>, Lhs, Rhs >
struct selfadjoint_product_impl<Lhs,LhsMode,false,Rhs,RhsMode,false>
{
  EIGEN_PRODUCT_PUBLIC_INTERFACE(SelfadjointProductMatrix)

  SelfadjointProductMatrix(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {}
  typedef typename Product<Lhs,Rhs>::Scalar Scalar;

  typedef internal::blas_traits<Lhs> LhsBlasTraits;
  typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
  typedef internal::blas_traits<Rhs> RhsBlasTraits;
  typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;

  enum {
    LhsIsUpper = (LhsMode&(Upper|Lower))==Upper,
    LhsIsSelfAdjoint = (LhsMode&SelfAdjoint)==SelfAdjoint,
    RhsIsUpper = (RhsMode&(Upper|Lower))==Upper,
    RhsIsSelfAdjoint = (RhsMode&SelfAdjoint)==SelfAdjoint
  };

  template<typename Dest>
  void scaleAndAddTo(Dest& dst, const Scalar& alpha) const
  template<typename Dest>
  static void run(Dest &dst, const Lhs &a_lhs, const Rhs &a_rhs, const Scalar& alpha)
  {
    eigen_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols());
    eigen_assert(dst.rows()==a_lhs.rows() && dst.cols()==a_rhs.cols());

    typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(m_lhs);
    typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(m_rhs);
    typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(a_lhs);
    typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(a_rhs);

    Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs)
                               * RhsBlasTraits::extractScalarFactor(m_rhs);
    Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(a_lhs)
                               * RhsBlasTraits::extractScalarFactor(a_rhs);

    typedef internal::gemm_blocking_space<(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor,Scalar,Scalar,
              Lhs::MaxRowsAtCompileTime, Rhs::MaxColsAtCompileTime, Lhs::MaxColsAtCompileTime,1> BlockingType;

    BlockingType blocking(lhs.rows(), rhs.cols(), lhs.cols(), 1, false);

    internal::product_selfadjoint_matrix<Scalar, Index,
      EIGEN_LOGICAL_XOR(LhsIsUpper,internal::traits<Lhs>::Flags &RowMajorBit) ? RowMajor : ColMajor, LhsIsSelfAdjoint,
      EIGEN_LOGICAL_XOR(LhsIsUpper,internal::traits<Lhs>::Flags &RowMajorBit) ? RowMajor : ColMajor, LhsIsSelfAdjoint,
      NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(LhsIsUpper,bool(LhsBlasTraits::NeedToConjugate)),
      EIGEN_LOGICAL_XOR(RhsIsUpper,internal::traits<Rhs>::Flags &RowMajorBit) ? RowMajor : ColMajor, RhsIsSelfAdjoint,
      EIGEN_LOGICAL_XOR(RhsIsUpper,internal::traits<Rhs>::Flags &RowMajorBit) ? RowMajor : ColMajor, RhsIsSelfAdjoint,
      NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(RhsIsUpper,bool(RhsBlasTraits::NeedToConjugate)),
      internal::traits<Dest>::Flags&RowMajorBit ? RowMajor : ColMajor>
      ::run(
        lhs.rows(), rhs.cols(),                 // sizes
        &lhs.coeffRef(0,0), lhs.outerStride(),  // lhs info
        &rhs.coeffRef(0,0), rhs.outerStride(),  // rhs info
        &lhs.coeffRef(0,0), lhs.outerStride(),  // lhs info
        &rhs.coeffRef(0,0), rhs.outerStride(),  // rhs info
        &dst.coeffRef(0,0), dst.outerStride(),  // result info
        actualAlpha                             // alpha
        actualAlpha, blocking                   // alpha
      );
  }
};

} // end namespace internal

} // end namespace Eigen

#endif // EIGEN_SELFADJOINT_MATRIX_MATRIX_H
external/eigen3/Eigen/src/Core/products/SelfadjointMatrixMatrix_MKL.h → external/eigen3/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h
View file @ a394b22a
...
...
@@ -25,13 +25,13 @@
 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
 ********************************************************************************
 *   Content : Eigen bindings to Intel(R) MKL
 *   Content : Eigen bindings to BLAS F77
 *   Self adjoint matrix * matrix product functionality based on ?SYMM/?HEMM.
 ********************************************************************************
*/

#ifndef EIGEN_SELFADJOINT_MATRIX_MATRIX_MKL_H
#define EIGEN_SELFADJOINT_MATRIX_MATRIX_MKL_H
#ifndef EIGEN_SELFADJOINT_MATRIX_MATRIX_BLAS_H
#define EIGEN_SELFADJOINT_MATRIX_MATRIX_BLAS_H

namespace Eigen {
...
...
@@ -40,7 +40,7 @@ namespace internal {
/* Optimized selfadjoint matrix * matrix (?SYMM/?HEMM) product */

#define EIGEN_MKL_SYMM_L(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \
#define EIGEN_BLAS_SYMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \
template <typename Index, \
          int LhsStorageOrder, bool ConjugateLhs, \
          int RhsStorageOrder, bool ConjugateRhs> \
...
...
@@ -52,28 +52,23 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLh
  const EIGTYPE* _lhs, Index lhsStride, \
  const EIGTYPE* _rhs, Index rhsStride, \
  EIGTYPE* res,        Index resStride, \
  EIGTYPE alpha) \
  EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) \
{ \
  char side='L', uplo='L'; \
  MKL_INT m, n, lda, ldb, ldc; \
  BlasIndex m, n, lda, ldb, ldc; \
  const EIGTYPE *a, *b; \
  MKLTYPE alpha_, beta_; \
  EIGTYPE beta(1); \
  MatrixX##EIGPREFIX b_tmp; \
  EIGTYPE myone(1);\
\
/* Set transpose options */ \
/* Set m, n, k */ \
  m = (MKL_INT)rows; \
  n = (MKL_INT)cols; \
\
/* Set alpha_ & beta_ */ \
  assign_scalar_eig2mkl(alpha_, alpha); \
  assign_scalar_eig2mkl(beta_, myone); \
  m = convert_index<BlasIndex>(rows); \
  n = convert_index<BlasIndex>(cols); \
\
/* Set lda, ldb, ldc */ \
  lda = (MKL_INT)lhsStride; \
  ldb = (MKL_INT)rhsStride; \
  ldc = (MKL_INT)resStride; \
  lda = convert_index<BlasIndex>(lhsStride); \
  ldb = convert_index<BlasIndex>(rhsStride); \
  ldc = convert_index<BlasIndex>(resStride); \
\
/* Set a, b, c */ \
  if (LhsStorageOrder==RowMajor) uplo='U'; \
...
...
@@ -83,16 +78,16 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLh
    Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > rhs(_rhs,n,m,OuterStride<>(rhsStride)); \
    b_tmp = rhs.adjoint(); \
    b = b_tmp.data(); \
    ldb = b_tmp.outerStride(); \
    ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
  } else b = _rhs; \
\
  MKLPREFIX##symm(&side, &uplo, &m, &n, &alpha_, (const MKLTYPE*)a, &lda, (const MKLTYPE*)b, &ldb, &beta_, (MKLTYPE*)res, &ldc); \
  BLASPREFIX##symm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
\
} \
};

#define EIGEN_MKL_HEMM_L(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \
#define EIGEN_BLAS_HEMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \
template <typename Index, \
          int LhsStorageOrder, bool ConjugateLhs, \
          int RhsStorageOrder, bool ConjugateRhs> \
...
...
@@ -103,36 +98,31 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLh
     const EIGTYPE* _lhs, Index lhsStride, \
     const EIGTYPE* _rhs, Index rhsStride, \
     EIGTYPE* res, Index resStride, \
-    EIGTYPE alpha) \
+    EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) \
   { \
     char side='L', uplo='L'; \
-    MKL_INT m, n, lda, ldb, ldc; \
+    BlasIndex m, n, lda, ldb, ldc; \
     const EIGTYPE *a, *b; \
-    MKLTYPE alpha_, beta_; \
+    EIGTYPE beta(1); \
     MatrixX##EIGPREFIX b_tmp; \
     Matrix<EIGTYPE, Dynamic, Dynamic, LhsStorageOrder> a_tmp; \
-    EIGTYPE myone(1); \
 \
 /* Set transpose options */ \
 /* Set m, n, k */ \
-    m = (MKL_INT)rows; \
-    n = (MKL_INT)cols; \
-\
-/* Set alpha_ & beta_ */ \
-    assign_scalar_eig2mkl(alpha_, alpha); \
-    assign_scalar_eig2mkl(beta_, myone); \
+    m = convert_index<BlasIndex>(rows); \
+    n = convert_index<BlasIndex>(cols); \
 \
 /* Set lda, ldb, ldc */ \
-    lda = (MKL_INT)lhsStride; \
-    ldb = (MKL_INT)rhsStride; \
-    ldc = (MKL_INT)resStride; \
+    lda = convert_index<BlasIndex>(lhsStride); \
+    ldb = convert_index<BlasIndex>(rhsStride); \
+    ldc = convert_index<BlasIndex>(resStride); \
 \
 /* Set a, b, c */ \
     if (((LhsStorageOrder==ColMajor) && ConjugateLhs) || ((LhsStorageOrder==RowMajor) && (!ConjugateLhs))) { \
       Map<const Matrix<EIGTYPE, Dynamic, Dynamic, LhsStorageOrder>, 0, OuterStride<> > lhs(_lhs,m,m,OuterStride<>(lhsStride)); \
       a_tmp = lhs.conjugate(); \
       a = a_tmp.data(); \
-      lda = a_tmp.outerStride(); \
+      lda = convert_index<BlasIndex>(a_tmp.outerStride()); \
     } else a = _lhs; \
     if (LhsStorageOrder==RowMajor) uplo='U'; \
 \
...
@@ -151,23 +141,23 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLh
         b_tmp = rhs.transpose(); \
       } \
       b = b_tmp.data(); \
-      ldb = b_tmp.outerStride(); \
+      ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
     } \
 \
-    MKLPREFIX##hemm(&side, &uplo, &m, &n, &alpha_, (const MKLTYPE*)a, &lda, (const MKLTYPE*)b, &ldb, &beta_, (MKLTYPE*)res, &ldc); \
+    BLASPREFIX##hemm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
 \
   } \
 };
-EIGEN_MKL_SYMM_L(double, double, d, d)
-EIGEN_MKL_SYMM_L(float, float, f, s)
-EIGEN_MKL_HEMM_L(dcomplex, MKL_Complex16, cd, z)
-EIGEN_MKL_HEMM_L(scomplex, MKL_Complex8, cf, c)
+EIGEN_BLAS_SYMM_L(double, double, d, d)
+EIGEN_BLAS_SYMM_L(float, float, f, s)
+EIGEN_BLAS_HEMM_L(dcomplex, double, cd, z)
+EIGEN_BLAS_HEMM_L(scomplex, float, cf, c)
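As a hedged reference point, the EIGEN_BLAS_SYMM_L(double, double, d, d) instantiation above ends up calling the Fortran-style dsymm_ symbol. The prototype below is a sketch of that reference-BLAS entry point as seen from C++; the integer width is an assumption here (it matches BlasIndex in the real bindings):

// Reference BLAS dsymm, called with all arguments by pointer (Fortran convention).
extern "C" void dsymm_(const char* side, const char* uplo,
                       const int* m, const int* n,
                       const double* alpha, const double* a, const int* lda,
                       const double* b, const int* ldb,
                       const double* beta, double* c, const int* ldc);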
/* Optimized matrix * selfadjoint matrix (?SYMM/?HEMM) product */
-#define EIGEN_MKL_SYMM_R(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \
+#define EIGEN_BLAS_SYMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \
 template <typename Index, \
           int LhsStorageOrder, bool ConjugateLhs, \
           int RhsStorageOrder, bool ConjugateRhs> \
...
@@ -179,27 +169,22 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateL
     const EIGTYPE* _lhs, Index lhsStride, \
     const EIGTYPE* _rhs, Index rhsStride, \
     EIGTYPE* res, Index resStride, \
-    EIGTYPE alpha) \
+    EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) \
   { \
     char side='R', uplo='L'; \
-    MKL_INT m, n, lda, ldb, ldc; \
+    BlasIndex m, n, lda, ldb, ldc; \
     const EIGTYPE *a, *b; \
-    MKLTYPE alpha_, beta_; \
+    EIGTYPE beta(1); \
     MatrixX##EIGPREFIX b_tmp; \
-    EIGTYPE myone(1);\
 \
 /* Set m, n, k */ \
-    m = (MKL_INT)rows; \
-    n = (MKL_INT)cols; \
-\
-/* Set alpha_ & beta_ */ \
-    assign_scalar_eig2mkl(alpha_, alpha); \
-    assign_scalar_eig2mkl(beta_, myone); \
+    m = convert_index<BlasIndex>(rows); \
+    n = convert_index<BlasIndex>(cols); \
 \
 /* Set lda, ldb, ldc */ \
-    lda = (MKL_INT)rhsStride; \
-    ldb = (MKL_INT)lhsStride; \
-    ldc = (MKL_INT)resStride; \
+    lda = convert_index<BlasIndex>(rhsStride); \
+    ldb = convert_index<BlasIndex>(lhsStride); \
+    ldc = convert_index<BlasIndex>(resStride); \
 \
 /* Set a, b, c */ \
     if (RhsStorageOrder==RowMajor) uplo='U'; \
...
@@ -209,16 +194,16 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateL
       Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > lhs(_lhs,n,m,OuterStride<>(rhsStride)); \
       b_tmp = lhs.adjoint(); \
       b = b_tmp.data(); \
-      ldb = b_tmp.outerStride(); \
+      ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
     } else b = _lhs; \
 \
-    MKLPREFIX##symm(&side, &uplo, &m, &n, &alpha_, (const MKLTYPE*)a, &lda, (const MKLTYPE*)b, &ldb, &beta_, (MKLTYPE*)res, &ldc); \
+    BLASPREFIX##symm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
 \
   } \
 };

-#define EIGEN_MKL_HEMM_R(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \
+#define EIGEN_BLAS_HEMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \
 template <typename Index, \
           int LhsStorageOrder, bool ConjugateLhs, \
           int RhsStorageOrder, bool ConjugateRhs> \
...
@@ -229,35 +214,30 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateL
     const EIGTYPE* _lhs, Index lhsStride, \
     const EIGTYPE* _rhs, Index rhsStride, \
     EIGTYPE* res, Index resStride, \
-    EIGTYPE alpha) \
+    EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) \
   { \
     char side='R', uplo='L'; \
-    MKL_INT m, n, lda, ldb, ldc; \
+    BlasIndex m, n, lda, ldb, ldc; \
     const EIGTYPE *a, *b; \
-    MKLTYPE alpha_, beta_; \
+    EIGTYPE beta(1); \
     MatrixX##EIGPREFIX b_tmp; \
     Matrix<EIGTYPE, Dynamic, Dynamic, RhsStorageOrder> a_tmp; \
-    EIGTYPE myone(1); \
 \
 /* Set m, n, k */ \
-    m = (MKL_INT)rows; \
-    n = (MKL_INT)cols; \
-\
-/* Set alpha_ & beta_ */ \
-    assign_scalar_eig2mkl(alpha_, alpha); \
-    assign_scalar_eig2mkl(beta_, myone); \
+    m = convert_index<BlasIndex>(rows); \
+    n = convert_index<BlasIndex>(cols); \
 \
 /* Set lda, ldb, ldc */ \
-    lda = (MKL_INT)rhsStride; \
-    ldb = (MKL_INT)lhsStride; \
-    ldc = (MKL_INT)resStride; \
+    lda = convert_index<BlasIndex>(rhsStride); \
+    ldb = convert_index<BlasIndex>(lhsStride); \
+    ldc = convert_index<BlasIndex>(resStride); \
 \
 /* Set a, b, c */ \
     if (((RhsStorageOrder==ColMajor) && ConjugateRhs) || ((RhsStorageOrder==RowMajor) && (!ConjugateRhs))) { \
       Map<const Matrix<EIGTYPE, Dynamic, Dynamic, RhsStorageOrder>, 0, OuterStride<> > rhs(_rhs,n,n,OuterStride<>(rhsStride)); \
       a_tmp = rhs.conjugate(); \
       a = a_tmp.data(); \
-      lda = a_tmp.outerStride(); \
+      lda = convert_index<BlasIndex>(a_tmp.outerStride()); \
     } else a = _rhs; \
     if (RhsStorageOrder==RowMajor) uplo='U'; \
 \
...
@@ -276,20 +256,20 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateL
         b_tmp = lhs.transpose(); \
       } \
       b = b_tmp.data(); \
-      ldb = b_tmp.outerStride(); \
+      ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
     } \
 \
-    MKLPREFIX##hemm(&side, &uplo, &m, &n, &alpha_, (const MKLTYPE*)a, &lda, (const MKLTYPE*)b, &ldb, &beta_, (MKLTYPE*)res, &ldc); \
+    BLASPREFIX##hemm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
   } \
 };
-EIGEN_MKL_SYMM_R(double, double, d, d)
-EIGEN_MKL_SYMM_R(float, float, f, s)
-EIGEN_MKL_HEMM_R(dcomplex, MKL_Complex16, cd, z)
-EIGEN_MKL_HEMM_R(scomplex, MKL_Complex8, cf, c)
+EIGEN_BLAS_SYMM_R(double, double, d, d)
+EIGEN_BLAS_SYMM_R(float, float, f, s)
+EIGEN_BLAS_HEMM_R(dcomplex, double, cd, z)
+EIGEN_BLAS_HEMM_R(scomplex, float, cf, c)
 } // end namespace internal

 } // end namespace Eigen

-#endif // EIGEN_SELFADJOINT_MATRIX_MATRIX_MKL_H
+#endif // EIGEN_SELFADJOINT_MATRIX_MATRIX_BLAS_H
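The bindings above repeatedly call convert_index<BlasIndex>(...) when filling m, n, lda, ldb and ldc. Below is a rough stand-alone analogue (not Eigen's actual helper) of what such a narrowing has to do: squeeze Eigen's default 64-bit index into the integer type a Fortran BLAS expects, checking that the value fits. The names to_blas_index and BlasIndexLike are placeholders, and the 32-bit width is an assumption:

#include <cassert>
#include <cstdint>

using BlasIndexLike = int;            // assumption: 32-bit BLAS integers

inline BlasIndexLike to_blas_index(std::int64_t i)
{
  assert(i >= 0 && i <= 0x7fffffff);  // the value must be representable
  return static_cast<BlasIndexLike>(i);
}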
external/eigen3/Eigen/src/Core/products/SelfadjointMatrixVector.h
View file @ a394b22a
...
@@ -30,7 +30,7 @@ struct selfadjoint_matrix_vector_product
   static EIGEN_DONT_INLINE void run(
     Index size,
     const Scalar*  lhs, Index lhsStride,
-    const Scalar* _rhs, Index rhsIncr,
+    const Scalar*  rhs,
     Scalar* res,
     Scalar alpha);
 };
...
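For readers unfamiliar with this kernel, the sketch below is a naive reference of what selfadjoint_matrix_vector_product computes in the real, column-major, lower-triangle-stored case: res += alpha * A * rhs, where only the stored lower triangle of A is read and the upper part is obtained by symmetry. This is an illustration only, not Eigen's blocked and vectorized implementation:

// Naive reference: symmetric matrix times vector, lower triangle stored, column-major.
void selfadjoint_mv_reference(int size, const double* lhs, int lhsStride,
                              const double* rhs, double* res, double alpha)
{
  for (int j = 0; j < size; ++j)
  {
    res[j] += alpha * lhs[j + j*lhsStride] * rhs[j];   // diagonal term
    for (int i = j + 1; i < size; ++i)
    {
      const double aij = lhs[i + j*lhsStride];         // stored lower entry A(i,j)
      res[i] += alpha * aij * rhs[j];                  // contribution of A(i,j)
      res[j] += alpha * aij * rhs[i];                  // mirrored A(j,i) == A(i,j)
    }
  }
}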
@@ -39,11 +39,12 @@ template<typename Scalar, typename Index, int StorageOrder, int UpLo, bool Conju
 EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs,Version>::run(
   Index size,
   const Scalar*  lhs, Index lhsStride,
-  const Scalar* _rhs, Index rhsIncr,
+  const Scalar*  rhs,
   Scalar* res,
   Scalar alpha)
 {
   typedef typename packet_traits<Scalar>::type Packet;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
   const Index PacketSize = sizeof(Packet)/sizeof(Scalar);

   enum {
...
@@ -54,23 +55,13 @@ EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrd
   conj_helper<Scalar,Scalar,NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs,  IsRowMajor), ConjugateRhs> cj0;
   conj_helper<Scalar,Scalar,NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, !IsRowMajor), ConjugateRhs> cj1;
-  conj_helper<Scalar,Scalar,NumTraits<Scalar>::IsComplex, ConjugateRhs> cjd;
+  conj_helper<RealScalar,Scalar,false, ConjugateRhs> cjd;

   conj_helper<Packet,Packet,NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs,  IsRowMajor), ConjugateRhs> pcj0;
   conj_helper<Packet,Packet,NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, !IsRowMajor), ConjugateRhs> pcj1;

   Scalar cjAlpha = ConjugateRhs ? numext::conj(alpha) : alpha;

-  // FIXME this copy is now handled outside product_selfadjoint_vector, so it could probably be removed.
-  // if the rhs is not sequentially stored in memory we copy it to a temporary buffer,
-  // this is because we need to extract packets
-  ei_declare_aligned_stack_constructed_variable(Scalar,rhs,size,rhsIncr==1 ? const_cast<Scalar*>(_rhs) : 0);
-  if (rhsIncr!=1)
-  {
-    const Scalar* it = _rhs;
-    for (Index i=0; i<size; ++i, it+=rhsIncr)
-      rhs[i] = *it;
-  }
-
   Index bound = (std::max)(Index(0),size-8) & 0xfffffffe;
   if (FirstTriangular)
...
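The cj0/cj1/cjd helpers above select, at compile time, whether each operand of a scalar multiply gets conjugated. A rough stand-alone analogue (not Eigen's conj_helper) of what a pmul of that kind computes:

#include <complex>

// Sketch only: multiply two complex numbers, optionally conjugating either side,
// with the choice fixed at compile time like conj_helper's template flags.
template<bool ConjX, bool ConjY>
std::complex<double> conj_mul(std::complex<double> x, std::complex<double> y)
{
  if (ConjX) x = std::conj(x);
  if (ConjY) y = std::conj(y);
  return x * y;
}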
@@ -92,12 +83,11 @@ EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrd
     Scalar t3(0);
     Packet ptmp3 = pset1<Packet>(t3);

-    size_t starti = FirstTriangular ? 0 : j+2;
-    size_t endi   = FirstTriangular ? j : size;
-    size_t alignedStart = (starti) + internal::first_aligned(&res[starti], endi-starti);
-    size_t alignedEnd = alignedStart + ((endi-alignedStart)/(PacketSize))*(PacketSize);
+    Index starti = FirstTriangular ? 0 : j+2;
+    Index endi   = FirstTriangular ? j : size;
+    Index alignedStart = (starti) + internal::first_default_aligned(&res[starti], endi-starti);
+    Index alignedEnd = alignedStart + ((endi-alignedStart)/(PacketSize))*(PacketSize);

     // TODO make sure this product is a real * complex and that the rhs is properly conjugated if needed
     res[j]   += cjd.pmul(numext::real(A0[j]), t0);
     res[j+1] += cjd.pmul(numext::real(A1[j+1]), t1);
     if (FirstTriangular)
...
@@ -111,11 +101,11 @@ EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrd
       t2 += cj1.pmul(A0[j+1], rhs[j+1]);
     }

-    for (size_t i=starti; i<alignedStart; ++i)
+    for (Index i=starti; i<alignedStart; ++i)
     {
-      res[i] += t0 * A0[i] + t1 * A1[i];
-      t2 += numext::conj(A0[i]) * rhs[i];
-      t3 += numext::conj(A1[i]) * rhs[i];
+      res[i] += cj0.pmul(A0[i], t0) + cj0.pmul(A1[i], t1);
+      t2 += cj1.pmul(A0[i], rhs[i]);
+      t3 += cj1.pmul(A1[i], rhs[i]);
     }
     // Yes this an optimization for gcc 4.3 and 4.4 (=> huge speed up)
     // gcc 4.2 does this optimization automatically.
...
@@ -123,7 +113,7 @@ EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrd
     const Scalar* EIGEN_RESTRICT a1It  = A1  + alignedStart;
     const Scalar* EIGEN_RESTRICT rhsIt = rhs + alignedStart;
           Scalar* EIGEN_RESTRICT resIt = res + alignedStart;
-    for (size_t i=alignedStart; i<alignedEnd; i+=PacketSize)
+    for (Index i=alignedStart; i<alignedEnd; i+=PacketSize)
     {
       Packet A0i = ploadu<Packet>(a0It);  a0It  += PacketSize;
       Packet A1i = ploadu<Packet>(a1It);  a1It  += PacketSize;
...
@@ -135,7 +125,7 @@ EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrd
       ptmp3 = pcj1.pmadd(A1i, Bi, ptmp3);
       pstore(resIt,Xi); resIt += PacketSize;
     }
-    for (size_t i=alignedEnd; i<endi; i++)
+    for (Index i=alignedEnd; i<endi; i++)
     {
       res[i] += cj0.pmul(A0[i], t0) + cj0.pmul(A1[i], t1);
       t2 += cj1.pmul(A0[i], rhs[i]);
...
@@ -151,7 +141,6 @@ EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrd
     Scalar t1 = cjAlpha * rhs[j];
     Scalar t2(0);
     // TODO make sure this product is a real * complex and that the rhs is properly conjugated if needed
     res[j] += cjd.pmul(numext::real(A0[j]), t1);
     for (Index i=FirstTriangular ? 0 : j+1; i<(FirstTriangular ? j : size); i++)
     {
...
@@ -169,45 +158,44 @@ EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrd
 ***************************************************************************/

 namespace internal {
-template<typename Lhs, int LhsMode, typename Rhs>
-struct traits<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true> >
-  : traits<ProductBase<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true>, Lhs, Rhs> >
-{};
-}

 template<typename Lhs, int LhsMode, typename Rhs>
-struct SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true>
-  : public ProductBase<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true>, Lhs, Rhs>
+struct selfadjoint_product_impl<Lhs,LhsMode,false,Rhs,0,true>
 {
-  EIGEN_PRODUCT_PUBLIC_INTERFACE(SelfadjointProductMatrix)
-
-  enum { LhsUpLo = LhsMode&(Upper|Lower) };
-
-  SelfadjointProductMatrix(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {}
-
-  template<typename Dest> void scaleAndAddTo(Dest& dest, const Scalar& alpha) const
+  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
+
+  typedef internal::blas_traits<Lhs> LhsBlasTraits;
+  typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
+  typedef typename internal::remove_all<ActualLhsType>::type ActualLhsTypeCleaned;
+
+  typedef internal::blas_traits<Rhs> RhsBlasTraits;
+  typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
+  typedef typename internal::remove_all<ActualRhsType>::type ActualRhsTypeCleaned;
+
+  enum { LhsUpLo = LhsMode&(Upper|Lower) };
+
+  template<typename Dest>
+  static void run(Dest& dest, const Lhs &a_lhs, const Rhs &a_rhs, const Scalar& alpha)
   {
     typedef typename Dest::Scalar ResScalar;
-    typedef typename Base::RhsScalar RhsScalar;
-    typedef Map<Matrix<ResScalar,Dynamic,1>, Aligned> MappedDest;
+    typedef typename Rhs::Scalar RhsScalar;
+    typedef Map<Matrix<ResScalar,Dynamic,1>, EIGEN_PLAIN_ENUM_MIN(AlignedMax,internal::packet_traits<ResScalar>::size)> MappedDest;

-    eigen_assert(dest.rows()==m_lhs.rows() && dest.cols()==m_rhs.cols());
+    eigen_assert(dest.rows()==a_lhs.rows() && dest.cols()==a_rhs.cols());

-    typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(m_lhs);
-    typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(m_rhs);
+    typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(a_lhs);
+    typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(a_rhs);

-    Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs)
-                               * RhsBlasTraits::extractScalarFactor(m_rhs);
+    Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(a_lhs)
+                               * RhsBlasTraits::extractScalarFactor(a_rhs);

     enum {
       EvalToDest = (Dest::InnerStrideAtCompileTime==1),
-      UseRhs = (_ActualRhsType::InnerStrideAtCompileTime==1)
+      UseRhs = (ActualRhsTypeCleaned::InnerStrideAtCompileTime==1)
     };

     internal::gemv_static_vector_if<ResScalar,Dest::SizeAtCompileTime,Dest::MaxSizeAtCompileTime,!EvalToDest> static_dest;
-    internal::gemv_static_vector_if<RhsScalar,_ActualRhsType::SizeAtCompileTime,_ActualRhsType::MaxSizeAtCompileTime,!UseRhs> static_rhs;
+    internal::gemv_static_vector_if<RhsScalar,ActualRhsTypeCleaned::SizeAtCompileTime,ActualRhsTypeCleaned::MaxSizeAtCompileTime,!UseRhs> static_rhs;

     ei_declare_aligned_stack_constructed_variable(ResScalar,actualDestPtr,dest.size(),
                                                   EvalToDest ? dest.data() : static_dest.data());
...
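The EvalToDest/UseRhs flags above decide whether the destination and the rhs can be used directly or must first be copied into a contiguous buffer so the vectorized kernel can use packet loads. A stand-alone sketch of that idea (not Eigen's gemv_static_vector_if or stack-allocation helpers; names are placeholders):

#include <vector>

// Return a pointer to packet-friendly (inner stride 1) data, copying into `tmp` if needed.
const double* make_contiguous(const double* data, int size, int incr,
                              std::vector<double>& tmp)
{
  if (incr == 1) return data;      // already contiguous, use in place
  tmp.resize(size);
  for (int i = 0; i < size; ++i)
    tmp[i] = data[i * incr];       // gather the strided entries
  return tmp.data();
}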
@@ -218,7 +206,7 @@ struct SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true>
     if(!EvalToDest)
     {
       #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
-      int size = dest.size();
+      Index size = dest.size();
       EIGEN_DENSE_STORAGE_CTOR_PLUGIN
       #endif
       MappedDest(actualDestPtr, dest.size()) = dest;
...
@@ -227,18 +215,19 @@ struct SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true>
     if(!UseRhs)
     {
       #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
-      int size = rhs.size();
+      Index size = rhs.size();
       EIGEN_DENSE_STORAGE_CTOR_PLUGIN
       #endif
-      Map<typename _ActualRhsType::PlainObject>(actualRhsPtr, rhs.size()) = rhs;
+      Map<typename ActualRhsTypeCleaned::PlainObject>(actualRhsPtr, rhs.size()) = rhs;
     }

-    internal::selfadjoint_matrix_vector_product<Scalar, Index, (internal::traits<_ActualLhsType>::Flags&RowMajorBit) ? RowMajor : ColMajor,
+    internal::selfadjoint_matrix_vector_product<Scalar, Index, (internal::traits<ActualLhsTypeCleaned>::Flags&RowMajorBit) ? RowMajor : ColMajor,
                                                 int(LhsUpLo), bool(LhsBlasTraits::NeedToConjugate), bool(RhsBlasTraits::NeedToConjugate)>::run
       (
         lhs.rows(),                              // size
         &lhs.coeffRef(0,0),  lhs.outerStride(),  // lhs info
-        actualRhsPtr, 1,                         // rhs info
+        actualRhsPtr,                            // rhs info
         actualDestPtr,                           // result info
         actualAlpha                              // scale factor
       );
...
@@ -248,34 +237,24 @@ struct SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true>
   }
 };

-namespace internal {
-template<typename Lhs, typename Rhs, int RhsMode>
-struct traits<SelfadjointProductMatrix<Lhs,0,true,Rhs,RhsMode,false> >
-  : traits<ProductBase<SelfadjointProductMatrix<Lhs,0,true,Rhs,RhsMode,false>, Lhs, Rhs> >
-{};
-}
-
 template<typename Lhs, typename Rhs, int RhsMode>
-struct SelfadjointProductMatrix<Lhs,0,true,Rhs,RhsMode,false>
-  : public ProductBase<SelfadjointProductMatrix<Lhs,0,true,Rhs,RhsMode,false>, Lhs, Rhs>
+struct selfadjoint_product_impl<Lhs,0,true,Rhs,RhsMode,false>
 {
-  EIGEN_PRODUCT_PUBLIC_INTERFACE(SelfadjointProductMatrix)
+  typedef typename Product<Lhs,Rhs>::Scalar Scalar;

   enum { RhsUpLo = RhsMode&(Upper|Lower) };

-  SelfadjointProductMatrix(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {}
-
-  template<typename Dest> void scaleAndAddTo(Dest& dest, const Scalar& alpha) const
+  template<typename Dest>
+  static void run(Dest& dest, const Lhs &a_lhs, const Rhs &a_rhs, const Scalar& alpha)
   {
     // let's simply transpose the product
     Transpose<Dest> destT(dest);
-    SelfadjointProductMatrix<Transpose<const Rhs>, int(RhsUpLo)==Upper ? Lower : Upper, false,
-                             Transpose<const Lhs>, 0, true>(m_rhs.transpose(), m_lhs.transpose()).scaleAndAddTo(destT, alpha);
+    selfadjoint_product_impl<Transpose<const Rhs>, int(RhsUpLo)==Upper ? Lower : Upper, false,
+                             Transpose<const Lhs>, 0, true>::run(destT, a_rhs.transpose(), a_lhs.transpose(), alpha);
   }
 };

 } // end namespace internal

 } // end namespace Eigen

 #endif // EIGEN_SELFADJOINT_MATRIX_VECTOR_H
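A small illustration of the "transpose the product" trick used in the last specialization above: a right-sided product x * A with A selfadjoint is evaluated as (A^T * x^T)^T, and the transpose of a selfadjoint matrix is again selfadjoint with the opposite stored triangle, so the left-sided kernel can be reused. The function below is a sketch with placeholder names, not Eigen internals; both assignments compute the same vector:

#include <Eigen/Dense>

void right_sided_example(const Eigen::RowVectorXd& x, const Eigen::MatrixXd& A,
                         Eigen::RowVectorXd& y)
{
  // What the user writes: row vector times a selfadjoint view (lower triangle stored).
  y.noalias() = x * A.selfadjointView<Eigen::Lower>();

  // Mathematically equivalent form the wrapper reduces it to:
  // transpose the operands and flip the stored triangle (Lower -> Upper).
  Eigen::VectorXd yT = A.transpose().selfadjointView<Eigen::Upper>() * x.transpose();
  y = yT.transpose();
}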