external/eigen3/Eigen/src/Core/arch/AVX512/PacketMath.h (new file, mode 0 → 100644) @ a394b22a
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016 Benoit Steiner (benoit.steiner.goog@gmail.com)
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_PACKET_MATH_AVX512_H
#define EIGEN_PACKET_MATH_AVX512_H
namespace Eigen {

namespace internal {

#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
#endif

#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*))
#endif

#ifdef __FMA__
#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#endif
#endif

typedef __m512 Packet16f;
typedef __m512i Packet16i;
typedef __m512d Packet8d;

template <>
struct is_arithmetic<__m512> {
  enum { value = true };
};
template <>
struct is_arithmetic<__m512i> {
  enum { value = true };
};
template <>
struct is_arithmetic<__m512d> {
  enum { value = true };
};

template <>
struct packet_traits<float> : default_packet_traits {
  typedef Packet16f type;
  typedef Packet8f half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 16,
    HasHalfPacket = 1,
#if EIGEN_GNUC_AT_LEAST(5, 3)
#ifdef EIGEN_VECTORIZE_AVX512DQ
    HasLog = 1,
#endif
    HasExp = 1,
    HasSqrt = 1,
    HasRsqrt = 1,
#endif
    HasDiv = 1
  };
};
template <>
struct packet_traits<double> : default_packet_traits {
  typedef Packet8d type;
  typedef Packet4d half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 8,
    HasHalfPacket = 1,
#if EIGEN_GNUC_AT_LEAST(5, 3)
    HasSqrt = 1,
    HasRsqrt = EIGEN_FAST_MATH,
#endif
    HasDiv = 1
  };
};

/* TODO Implement AVX512 for integers
template<> struct packet_traits<int> : default_packet_traits
{
  typedef Packet16i type;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size=8
  };
};
*/

template <>
struct unpacket_traits<Packet16f> {
  typedef float type;
  typedef Packet8f half;
  enum { size = 16, alignment = Aligned64 };
};
template <>
struct unpacket_traits<Packet8d> {
  typedef double type;
  typedef Packet4d half;
  enum { size = 8, alignment = Aligned64 };
};
template <>
struct unpacket_traits<Packet16i> {
  typedef int type;
  typedef Packet8i half;
  enum { size = 16, alignment = Aligned64 };
};
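
// Illustrative sketch (not part of the original file): generic Eigen kernels
// choose their register type and step size from these traits rather than
// hard-coding AVX-512 types. Assuming only the declarations above, a caller
// can dispatch like this (hypothetical helper, for illustration only):
template <typename Scalar>
struct avx512_packet_size_demo {
  typedef typename packet_traits<Scalar>::type packet_type;  // e.g. Packet16f
  enum { value = packet_traits<Scalar>::size };  // 16 for float, 8 for double
};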
template <>
EIGEN_STRONG_INLINE Packet16f pset1<Packet16f>(const float& from) {
  return _mm512_set1_ps(from);
}
template <>
EIGEN_STRONG_INLINE Packet8d pset1<Packet8d>(const double& from) {
  return _mm512_set1_pd(from);
}
template <>
EIGEN_STRONG_INLINE Packet16i pset1<Packet16i>(const int& from) {
  return _mm512_set1_epi32(from);
}

template <>
EIGEN_STRONG_INLINE Packet16f pload1<Packet16f>(const float* from) {
  return _mm512_broadcastss_ps(_mm_load_ps1(from));
}
template <>
EIGEN_STRONG_INLINE Packet8d pload1<Packet8d>(const double* from) {
  return _mm512_broadcastsd_pd(_mm_load_pd1(from));
}

template <>
EIGEN_STRONG_INLINE Packet16f plset<Packet16f>(const float& a) {
  return _mm512_add_ps(
      _mm512_set1_ps(a),
      _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f,
                    6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f));
}
template <>
EIGEN_STRONG_INLINE Packet8d plset<Packet8d>(const double& a) {
  return _mm512_add_pd(_mm512_set1_pd(a),
                       _mm512_set_pd(7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 0.0));
}
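
// Usage sketch (illustrative, not part of the original file): plset builds the
// affine ramp {a, a+1, ..., a+15}, which is what linspaced expressions and
// index arithmetic consume. Hypothetical helper:
static inline Packet16f avx512_iota_demo(float start) {
  return plset<Packet16f>(start);  // e.g. start = 10.f -> {10, 11, ..., 25}
}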
template <>
EIGEN_STRONG_INLINE Packet16f padd<Packet16f>(const Packet16f& a,
                                              const Packet16f& b) {
  return _mm512_add_ps(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet8d padd<Packet8d>(const Packet8d& a,
                                            const Packet8d& b) {
  return _mm512_add_pd(a, b);
}

template <>
EIGEN_STRONG_INLINE Packet16f psub<Packet16f>(const Packet16f& a,
                                              const Packet16f& b) {
  return _mm512_sub_ps(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet8d psub<Packet8d>(const Packet8d& a,
                                            const Packet8d& b) {
  return _mm512_sub_pd(a, b);
}

template <>
EIGEN_STRONG_INLINE Packet16f pnegate(const Packet16f& a) {
  return _mm512_sub_ps(_mm512_set1_ps(0.0), a);
}
template <>
EIGEN_STRONG_INLINE Packet8d pnegate(const Packet8d& a) {
  return _mm512_sub_pd(_mm512_set1_pd(0.0), a);
}

template <>
EIGEN_STRONG_INLINE Packet16f pconj(const Packet16f& a) {
  return a;
}
template <>
EIGEN_STRONG_INLINE Packet8d pconj(const Packet8d& a) {
  return a;
}
template <>
EIGEN_STRONG_INLINE Packet16i pconj(const Packet16i& a) {
  return a;
}

template <>
EIGEN_STRONG_INLINE Packet16f pmul<Packet16f>(const Packet16f& a,
                                              const Packet16f& b) {
  return _mm512_mul_ps(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet8d pmul<Packet8d>(const Packet8d& a,
                                            const Packet8d& b) {
  return _mm512_mul_pd(a, b);
}

template <>
EIGEN_STRONG_INLINE Packet16f pdiv<Packet16f>(const Packet16f& a,
                                              const Packet16f& b) {
  return _mm512_div_ps(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet8d pdiv<Packet8d>(const Packet8d& a,
                                            const Packet8d& b) {
  return _mm512_div_pd(a, b);
}
#ifdef __FMA__
template <>
EIGEN_STRONG_INLINE Packet16f pmadd(const Packet16f& a, const Packet16f& b,
                                    const Packet16f& c) {
  return _mm512_fmadd_ps(a, b, c);
}
template <>
EIGEN_STRONG_INLINE Packet8d pmadd(const Packet8d& a, const Packet8d& b,
                                   const Packet8d& c) {
  return _mm512_fmadd_pd(a, b, c);
}
#endif
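
#ifdef __FMA__
// Sketch (illustrative, not part of the original file): with FMA available an
// axpy-style update collapses to one fused instruction per packet, with a
// single rounding step. Hypothetical helper:
static inline Packet16f avx512_axpy_demo(const Packet16f& a, const Packet16f& x,
                                         const Packet16f& y) {
  return pmadd(a, x, y);  // one _mm512_fmadd_ps: a * x + y
}
#endif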
template <>
EIGEN_STRONG_INLINE Packet16f pmin<Packet16f>(const Packet16f& a,
                                              const Packet16f& b) {
  return _mm512_min_ps(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet8d pmin<Packet8d>(const Packet8d& a,
                                            const Packet8d& b) {
  return _mm512_min_pd(a, b);
}

template <>
EIGEN_STRONG_INLINE Packet16f pmax<Packet16f>(const Packet16f& a,
                                              const Packet16f& b) {
  return _mm512_max_ps(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet8d pmax<Packet8d>(const Packet8d& a,
                                            const Packet8d& b) {
  return _mm512_max_pd(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet16f pand<Packet16f>(const Packet16f& a,
                                              const Packet16f& b) {
#ifdef EIGEN_VECTORIZE_AVX512DQ
  return _mm512_and_ps(a, b);
#else
  Packet16f res = _mm512_undefined_ps();
  Packet4f lane0_a = _mm512_extractf32x4_ps(a, 0);
  Packet4f lane0_b = _mm512_extractf32x4_ps(b, 0);
  res = _mm512_insertf32x4(res, _mm_and_ps(lane0_a, lane0_b), 0);

  Packet4f lane1_a = _mm512_extractf32x4_ps(a, 1);
  Packet4f lane1_b = _mm512_extractf32x4_ps(b, 1);
  res = _mm512_insertf32x4(res, _mm_and_ps(lane1_a, lane1_b), 1);

  Packet4f lane2_a = _mm512_extractf32x4_ps(a, 2);
  Packet4f lane2_b = _mm512_extractf32x4_ps(b, 2);
  res = _mm512_insertf32x4(res, _mm_and_ps(lane2_a, lane2_b), 2);

  Packet4f lane3_a = _mm512_extractf32x4_ps(a, 3);
  Packet4f lane3_b = _mm512_extractf32x4_ps(b, 3);
  res = _mm512_insertf32x4(res, _mm_and_ps(lane3_a, lane3_b), 3);

  return res;
#endif
}
template <>
EIGEN_STRONG_INLINE Packet8d pand<Packet8d>(const Packet8d& a,
                                            const Packet8d& b) {
#ifdef EIGEN_VECTORIZE_AVX512DQ
  return _mm512_and_pd(a, b);
#else
  Packet8d res = _mm512_undefined_pd();
  Packet4d lane0_a = _mm512_extractf64x4_pd(a, 0);
  Packet4d lane0_b = _mm512_extractf64x4_pd(b, 0);
  res = _mm512_insertf64x4(res, _mm256_and_pd(lane0_a, lane0_b), 0);

  Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1);
  Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1);
  res = _mm512_insertf64x4(res, _mm256_and_pd(lane1_a, lane1_b), 1);

  return res;
#endif
}
template <>
EIGEN_STRONG_INLINE Packet16f por<Packet16f>(const Packet16f& a,
                                             const Packet16f& b) {
#ifdef EIGEN_VECTORIZE_AVX512DQ
  return _mm512_or_ps(a, b);
#else
  Packet16f res = _mm512_undefined_ps();
  Packet4f lane0_a = _mm512_extractf32x4_ps(a, 0);
  Packet4f lane0_b = _mm512_extractf32x4_ps(b, 0);
  res = _mm512_insertf32x4(res, _mm_or_ps(lane0_a, lane0_b), 0);

  Packet4f lane1_a = _mm512_extractf32x4_ps(a, 1);
  Packet4f lane1_b = _mm512_extractf32x4_ps(b, 1);
  res = _mm512_insertf32x4(res, _mm_or_ps(lane1_a, lane1_b), 1);

  Packet4f lane2_a = _mm512_extractf32x4_ps(a, 2);
  Packet4f lane2_b = _mm512_extractf32x4_ps(b, 2);
  res = _mm512_insertf32x4(res, _mm_or_ps(lane2_a, lane2_b), 2);

  Packet4f lane3_a = _mm512_extractf32x4_ps(a, 3);
  Packet4f lane3_b = _mm512_extractf32x4_ps(b, 3);
  res = _mm512_insertf32x4(res, _mm_or_ps(lane3_a, lane3_b), 3);

  return res;
#endif
}
template <>
EIGEN_STRONG_INLINE Packet8d por<Packet8d>(const Packet8d& a,
                                           const Packet8d& b) {
#ifdef EIGEN_VECTORIZE_AVX512DQ
  return _mm512_or_pd(a, b);
#else
  Packet8d res = _mm512_undefined_pd();
  Packet4d lane0_a = _mm512_extractf64x4_pd(a, 0);
  Packet4d lane0_b = _mm512_extractf64x4_pd(b, 0);
  res = _mm512_insertf64x4(res, _mm256_or_pd(lane0_a, lane0_b), 0);

  Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1);
  Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1);
  res = _mm512_insertf64x4(res, _mm256_or_pd(lane1_a, lane1_b), 1);

  return res;
#endif
}
template <>
EIGEN_STRONG_INLINE Packet16f pxor<Packet16f>(const Packet16f& a,
                                              const Packet16f& b) {
#ifdef EIGEN_VECTORIZE_AVX512DQ
  return _mm512_xor_ps(a, b);
#else
  Packet16f res = _mm512_undefined_ps();
  Packet4f lane0_a = _mm512_extractf32x4_ps(a, 0);
  Packet4f lane0_b = _mm512_extractf32x4_ps(b, 0);
  res = _mm512_insertf32x4(res, _mm_xor_ps(lane0_a, lane0_b), 0);

  Packet4f lane1_a = _mm512_extractf32x4_ps(a, 1);
  Packet4f lane1_b = _mm512_extractf32x4_ps(b, 1);
  res = _mm512_insertf32x4(res, _mm_xor_ps(lane1_a, lane1_b), 1);

  Packet4f lane2_a = _mm512_extractf32x4_ps(a, 2);
  Packet4f lane2_b = _mm512_extractf32x4_ps(b, 2);
  res = _mm512_insertf32x4(res, _mm_xor_ps(lane2_a, lane2_b), 2);

  Packet4f lane3_a = _mm512_extractf32x4_ps(a, 3);
  Packet4f lane3_b = _mm512_extractf32x4_ps(b, 3);
  res = _mm512_insertf32x4(res, _mm_xor_ps(lane3_a, lane3_b), 3);

  return res;
#endif
}
template <>
EIGEN_STRONG_INLINE Packet8d pxor<Packet8d>(const Packet8d& a,
                                            const Packet8d& b) {
#ifdef EIGEN_VECTORIZE_AVX512DQ
  return _mm512_xor_pd(a, b);
#else
  Packet8d res = _mm512_undefined_pd();
  Packet4d lane0_a = _mm512_extractf64x4_pd(a, 0);
  Packet4d lane0_b = _mm512_extractf64x4_pd(b, 0);
  res = _mm512_insertf64x4(res, _mm256_xor_pd(lane0_a, lane0_b), 0);

  Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1);
  Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1);
  res = _mm512_insertf64x4(res, _mm256_xor_pd(lane1_a, lane1_b), 1);

  return res;
#endif
}
template <>
EIGEN_STRONG_INLINE Packet16f pandnot<Packet16f>(const Packet16f& a,
                                                 const Packet16f& b) {
#ifdef EIGEN_VECTORIZE_AVX512DQ
  return _mm512_andnot_ps(a, b);
#else
  Packet16f res = _mm512_undefined_ps();
  Packet4f lane0_a = _mm512_extractf32x4_ps(a, 0);
  Packet4f lane0_b = _mm512_extractf32x4_ps(b, 0);
  res = _mm512_insertf32x4(res, _mm_andnot_ps(lane0_a, lane0_b), 0);

  Packet4f lane1_a = _mm512_extractf32x4_ps(a, 1);
  Packet4f lane1_b = _mm512_extractf32x4_ps(b, 1);
  res = _mm512_insertf32x4(res, _mm_andnot_ps(lane1_a, lane1_b), 1);

  Packet4f lane2_a = _mm512_extractf32x4_ps(a, 2);
  Packet4f lane2_b = _mm512_extractf32x4_ps(b, 2);
  res = _mm512_insertf32x4(res, _mm_andnot_ps(lane2_a, lane2_b), 2);

  Packet4f lane3_a = _mm512_extractf32x4_ps(a, 3);
  Packet4f lane3_b = _mm512_extractf32x4_ps(b, 3);
  res = _mm512_insertf32x4(res, _mm_andnot_ps(lane3_a, lane3_b), 3);

  return res;
#endif
}
template <>
EIGEN_STRONG_INLINE Packet8d pandnot<Packet8d>(const Packet8d& a,
                                               const Packet8d& b) {
#ifdef EIGEN_VECTORIZE_AVX512DQ
  return _mm512_andnot_pd(a, b);
#else
  Packet8d res = _mm512_undefined_pd();
  Packet4d lane0_a = _mm512_extractf64x4_pd(a, 0);
  Packet4d lane0_b = _mm512_extractf64x4_pd(b, 0);
  res = _mm512_insertf64x4(res, _mm256_andnot_pd(lane0_a, lane0_b), 0);

  Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1);
  Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1);
  res = _mm512_insertf64x4(res, _mm256_andnot_pd(lane1_a, lane1_b), 1);

  return res;
#endif
}
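
// Sketch (illustrative, not part of the original file): without AVX512DQ the
// 512-bit float bitwise intrinsics do not exist, hence the lane-by-lane
// fallbacks above. These ops enable branch-free sign tricks, e.g. toggling
// every sign bit with pxor (hypothetical helper):
static inline Packet16f avx512_negate_via_xor_demo(const Packet16f& x) {
  const Packet16f sign_mask =
      _mm512_castsi512_ps(_mm512_set1_epi32((int)0x80000000));  // sign bits only
  return pxor(x, sign_mask);  // flips the sign of all 16 lanes
}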
template <>
EIGEN_STRONG_INLINE Packet16f pload<Packet16f>(const float* from) {
  EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_ps(from);
}
template <>
EIGEN_STRONG_INLINE Packet8d pload<Packet8d>(const double* from) {
  EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_pd(from);
}
template <>
EIGEN_STRONG_INLINE Packet16i pload<Packet16i>(const int* from) {
  EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_si512(
      reinterpret_cast<const __m512i*>(from));
}

template <>
EIGEN_STRONG_INLINE Packet16f ploadu<Packet16f>(const float* from) {
  EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_ps(from);
}
template <>
EIGEN_STRONG_INLINE Packet8d ploadu<Packet8d>(const double* from) {
  EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_pd(from);
}
template <>
EIGEN_STRONG_INLINE Packet16i ploadu<Packet16i>(const int* from) {
  EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_si512(
      reinterpret_cast<const __m512i*>(from));
}
// Loads 8 floats from memory and returns the packet
// {a0, a0, a1, a1, a2, a2, a3, a3, a4, a4, a5, a5, a6, a6, a7, a7}
template <>
EIGEN_STRONG_INLINE Packet16f ploaddup<Packet16f>(const float* from) {
  Packet8f lane0 = _mm256_broadcast_ps((const __m128*)(const void*)from);
  // mimic an "inplace" permutation of the lower 128bits using a blend
  lane0 = _mm256_blend_ps(
      lane0, _mm256_castps128_ps256(_mm_permute_ps(
                 _mm256_castps256_ps128(lane0), _MM_SHUFFLE(1, 0, 1, 0))),
      15);
  // then we can perform a consistent permutation on the global register to get
  // everything in shape:
  lane0 = _mm256_permute_ps(lane0, _MM_SHUFFLE(3, 3, 2, 2));

  Packet8f lane1 = _mm256_broadcast_ps((const __m128*)(const void*)(from + 4));
  // mimic an "inplace" permutation of the lower 128bits using a blend
  lane1 = _mm256_blend_ps(
      lane1, _mm256_castps128_ps256(_mm_permute_ps(
                 _mm256_castps256_ps128(lane1), _MM_SHUFFLE(1, 0, 1, 0))),
      15);
  // then we can perform a consistent permutation on the global register to get
  // everything in shape:
  lane1 = _mm256_permute_ps(lane1, _MM_SHUFFLE(3, 3, 2, 2));

#ifdef EIGEN_VECTORIZE_AVX512DQ
  Packet16f res = _mm512_undefined_ps();
  res = _mm512_insertf32x8(res, lane0, 0);
  res = _mm512_insertf32x8(res, lane1, 1);
  return res;
#else
  Packet16f res = _mm512_undefined_ps();
  res = _mm512_insertf32x4(res, _mm256_extractf128_ps(lane0, 0), 0);
  res = _mm512_insertf32x4(res, _mm256_extractf128_ps(lane0, 1), 1);
  res = _mm512_insertf32x4(res, _mm256_extractf128_ps(lane1, 0), 2);
  res = _mm512_insertf32x4(res, _mm256_extractf128_ps(lane1, 1), 3);
  return res;
#endif
}
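
// Usage sketch (illustrative, not part of the original file): ploaddup feeds
// kernels that consume each scalar twice, e.g. interleaved real/imag work.
// Hypothetical helper:
static inline Packet16f avx512_dup_demo(const float* src) {
  // returns {src[0], src[0], src[1], src[1], ..., src[7], src[7]}
  return ploaddup<Packet16f>(src);
}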
// Loads 4 doubles from memory and returns the packet
// {a0, a0, a1, a1, a2, a2, a3, a3}
template <>
EIGEN_STRONG_INLINE Packet8d ploaddup<Packet8d>(const double* from) {
  Packet4d lane0 = _mm256_broadcast_pd((const __m128d*)(const void*)from);
  lane0 = _mm256_permute_pd(lane0, 3 << 2);

  Packet4d lane1 = _mm256_broadcast_pd((const __m128d*)(const void*)(from + 2));
  lane1 = _mm256_permute_pd(lane1, 3 << 2);

  Packet8d res = _mm512_undefined_pd();
  res = _mm512_insertf64x4(res, lane0, 0);
  return _mm512_insertf64x4(res, lane1, 1);
}

// Loads 4 floats from memory and returns the packet
// {a0, a0, a0, a0, a1, a1, a1, a1, a2, a2, a2, a2, a3, a3, a3, a3}
template <>
EIGEN_STRONG_INLINE Packet16f ploadquad<Packet16f>(const float* from) {
  Packet16f tmp = _mm512_undefined_ps();
  tmp = _mm512_insertf32x4(tmp, _mm_load_ps1(from), 0);
  tmp = _mm512_insertf32x4(tmp, _mm_load_ps1(from + 1), 1);
  tmp = _mm512_insertf32x4(tmp, _mm_load_ps1(from + 2), 2);
  tmp = _mm512_insertf32x4(tmp, _mm_load_ps1(from + 3), 3);
  return tmp;
}

// Loads 2 doubles from memory and returns the packet
// {a0, a0, a0, a0, a1, a1, a1, a1}
template <>
EIGEN_STRONG_INLINE Packet8d ploadquad<Packet8d>(const double* from) {
  Packet8d tmp = _mm512_undefined_pd();
  Packet2d tmp0 = _mm_load_pd1(from);
  Packet2d tmp1 = _mm_load_pd1(from + 1);
  Packet4d lane0 = _mm256_broadcastsd_pd(tmp0);
  Packet4d lane1 = _mm256_broadcastsd_pd(tmp1);
  tmp = _mm512_insertf64x4(tmp, lane0, 0);
  return _mm512_insertf64x4(tmp, lane1, 1);
}
template <>
EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet16f& from) {
  EIGEN_DEBUG_ALIGNED_STORE _mm512_store_ps(to, from);
}
template <>
EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet8d& from) {
  EIGEN_DEBUG_ALIGNED_STORE _mm512_store_pd(to, from);
}
template <>
EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet16i& from) {
  EIGEN_DEBUG_ALIGNED_STORE _mm512_storeu_si512(reinterpret_cast<__m512i*>(to),
                                                from);
}

template <>
EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet16f& from) {
  EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_ps(to, from);
}
template <>
EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet8d& from) {
  EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_pd(to, from);
}
template <>
EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet16i& from) {
  EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_si512(
      reinterpret_cast<__m512i*>(to), from);
}
template <>
EIGEN_DEVICE_FUNC inline Packet16f pgather<float, Packet16f>(const float* from,
                                                             Index stride) {
  Packet16i stride_vector = _mm512_set1_epi32(stride);
  Packet16i stride_multiplier =
      _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
  Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);

  return _mm512_i32gather_ps(indices, from, 4);
}
template <>
EIGEN_DEVICE_FUNC inline Packet8d pgather<double, Packet8d>(const double* from,
                                                            Index stride) {
  Packet8i stride_vector = _mm256_set1_epi32(stride);
  Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
  Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier);

  return _mm512_i32gather_pd(indices, from, 8);
}

template <>
EIGEN_DEVICE_FUNC inline void pscatter<float, Packet16f>(float* to,
                                                         const Packet16f& from,
                                                         Index stride) {
  Packet16i stride_vector = _mm512_set1_epi32(stride);
  Packet16i stride_multiplier =
      _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
  Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
  _mm512_i32scatter_ps(to, indices, from, 4);
}
template <>
EIGEN_DEVICE_FUNC inline void pscatter<double, Packet8d>(double* to,
                                                         const Packet8d& from,
                                                         Index stride) {
  Packet8i stride_vector = _mm256_set1_epi32(stride);
  Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
  Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier);
  _mm512_i32scatter_pd(to, indices, from, 8);
}
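
// Sketch (illustrative, not part of the original file): pgather/pscatter move
// a strided column of a row-major matrix through one register. Hypothetical
// helper scaling column j of a 16-row matrix with `cols` columns:
static inline void avx512_scale_column_demo(float* m, Index cols, Index j,
                                            float s) {
  Packet16f c = pgather<float, Packet16f>(m + j, cols);  // c[i] = m[i*cols + j]
  pscatter<float, Packet16f>(m + j, pmul(c, pset1<Packet16f>(s)), cols);
}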
template <>
EIGEN_STRONG_INLINE void pstore1<Packet16f>(float* to, const float& a) {
  Packet16f pa = pset1<Packet16f>(a);
  pstore(to, pa);
}
template <>
EIGEN_STRONG_INLINE void pstore1<Packet8d>(double* to, const double& a) {
  Packet8d pa = pset1<Packet8d>(a);
  pstore(to, pa);
}
template <>
EIGEN_STRONG_INLINE void pstore1<Packet16i>(int* to, const int& a) {
  Packet16i pa = pset1<Packet16i>(a);
  pstore(to, pa);
}

template <>
EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) {
  _mm_prefetch((const char*)(addr), _MM_HINT_T0);
}
template <>
EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) {
  _mm_prefetch((const char*)(addr), _MM_HINT_T0);
}
template <>
EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) {
  _mm_prefetch((const char*)(addr), _MM_HINT_T0);
}
template <>
EIGEN_STRONG_INLINE float pfirst<Packet16f>(const Packet16f& a) {
  return _mm_cvtss_f32(_mm512_extractf32x4_ps(a, 0));
}
template <>
EIGEN_STRONG_INLINE double pfirst<Packet8d>(const Packet8d& a) {
  return _mm_cvtsd_f64(_mm256_extractf128_pd(_mm512_extractf64x4_pd(a, 0), 0));
}
template <>
EIGEN_STRONG_INLINE int pfirst<Packet16i>(const Packet16i& a) {
  return _mm_extract_epi32(_mm512_extracti32x4_epi32(a, 0), 0);
}

template <>
EIGEN_STRONG_INLINE Packet16f preverse(const Packet16f& a) {
  return _mm512_permutexvar_ps(
      _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15),
      a);
}
template <>
EIGEN_STRONG_INLINE Packet8d preverse(const Packet8d& a) {
  return _mm512_permutexvar_pd(
      _mm512_set_epi32(0, 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7), a);
}
template <>
EIGEN_STRONG_INLINE Packet16f pabs(const Packet16f& a) {
  // _mm512_abs_ps intrinsic not found, so hack around it
  return (__m512)_mm512_and_si512((__m512i)a, _mm512_set1_epi32(0x7fffffff));
}
template <>
EIGEN_STRONG_INLINE Packet8d pabs(const Packet8d& a) {
  // _mm512_abs_pd intrinsic not found, so hack around it
  return (__m512d)_mm512_and_si512((__m512i)a,
                                   _mm512_set1_epi64(0x7fffffffffffffff));
}
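
// Note (illustrative sketch, not part of the original file): the masks above
// clear only the IEEE-754 sign bit (bit 31 per float, bit 63 per double); the
// remaining bits encode the magnitude, so AND-ing the sign away is exactly
// fabs. Scalar analogue (hypothetical helper; assumes <cstring> for memcpy):
static inline float avx512_fabs_scalar_demo(float x) {
  unsigned int u;
  memcpy(&u, &x, sizeof(u));  // type-pun through memcpy to avoid UB
  u &= 0x7fffffffu;           // drop the sign bit, keep exponent + mantissa
  memcpy(&x, &u, sizeof(x));
  return x;
}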
#ifdef EIGEN_VECTORIZE_AVX512DQ
// AVX512F does not define _mm512_extractf32x8_ps to extract _m256 from _m512
#define EIGEN_EXTRACT_8f_FROM_16f(INPUT, OUTPUT)        \
  __m256 OUTPUT##_0 = _mm512_extractf32x8_ps(INPUT, 0); \
  __m256 OUTPUT##_1 = _mm512_extractf32x8_ps(INPUT, 1);
#else
#define EIGEN_EXTRACT_8f_FROM_16f(INPUT, OUTPUT)                \
  __m256 OUTPUT##_0 = _mm256_insertf128_ps(                     \
      _mm256_castps128_ps256(_mm512_extractf32x4_ps(INPUT, 0)), \
      _mm512_extractf32x4_ps(INPUT, 1), 1);                     \
  __m256 OUTPUT##_1 = _mm256_insertf128_ps(                     \
      _mm256_castps128_ps256(_mm512_extractf32x4_ps(INPUT, 2)), \
      _mm512_extractf32x4_ps(INPUT, 3), 1);
#endif

#ifdef EIGEN_VECTORIZE_AVX512DQ
#define EIGEN_INSERT_8f_INTO_16f(OUTPUT, INPUTA, INPUTB) \
  OUTPUT = _mm512_insertf32x8(OUTPUT, INPUTA, 0);        \
  OUTPUT = _mm512_insertf32x8(OUTPUT, INPUTB, 1);
#else
#define EIGEN_INSERT_8f_INTO_16f(OUTPUT, INPUTA, INPUTB)                    \
  OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTA, 0), 0); \
  OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTA, 1), 1); \
  OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTB, 0), 2); \
  OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTB, 1), 3);
#endif
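
// Sketch (illustrative, not part of the original file): the macro declares two
// __m256 halves named OUTPUT##_0 (lanes 0-7) and OUTPUT##_1 (lanes 8-15), so a
// caller picks either half by name (hypothetical helper):
static inline __m256 avx512_low_half_demo(const Packet16f& x) {
  EIGEN_EXTRACT_8f_FROM_16f(x, h);  // declares h_0 and h_1
  (void)h_1;                        // unused in this demo
  return h_0;
}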
template <>
EIGEN_STRONG_INLINE Packet16f preduxp<Packet16f>(const Packet16f* vecs) {
  EIGEN_EXTRACT_8f_FROM_16f(vecs[0], vecs0);
  EIGEN_EXTRACT_8f_FROM_16f(vecs[1], vecs1);
  EIGEN_EXTRACT_8f_FROM_16f(vecs[2], vecs2);
  EIGEN_EXTRACT_8f_FROM_16f(vecs[3], vecs3);
  EIGEN_EXTRACT_8f_FROM_16f(vecs[4], vecs4);
  EIGEN_EXTRACT_8f_FROM_16f(vecs[5], vecs5);
  EIGEN_EXTRACT_8f_FROM_16f(vecs[6], vecs6);
  EIGEN_EXTRACT_8f_FROM_16f(vecs[7], vecs7);
  EIGEN_EXTRACT_8f_FROM_16f(vecs[8], vecs8);
  EIGEN_EXTRACT_8f_FROM_16f(vecs[9], vecs9);
  EIGEN_EXTRACT_8f_FROM_16f(vecs[10], vecs10);
  EIGEN_EXTRACT_8f_FROM_16f(vecs[11], vecs11);
  EIGEN_EXTRACT_8f_FROM_16f(vecs[12], vecs12);
  EIGEN_EXTRACT_8f_FROM_16f(vecs[13], vecs13);
  EIGEN_EXTRACT_8f_FROM_16f(vecs[14], vecs14);
  EIGEN_EXTRACT_8f_FROM_16f(vecs[15], vecs15);

  __m256 hsum1 = _mm256_hadd_ps(vecs0_0, vecs1_0);
  __m256 hsum2 = _mm256_hadd_ps(vecs2_0, vecs3_0);
  __m256 hsum3 = _mm256_hadd_ps(vecs4_0, vecs5_0);
  __m256 hsum4 = _mm256_hadd_ps(vecs6_0, vecs7_0);

  __m256 hsum5 = _mm256_hadd_ps(hsum1, hsum1);
  __m256 hsum6 = _mm256_hadd_ps(hsum2, hsum2);
  __m256 hsum7 = _mm256_hadd_ps(hsum3, hsum3);
  __m256 hsum8 = _mm256_hadd_ps(hsum4, hsum4);

  __m256 perm1 = _mm256_permute2f128_ps(hsum5, hsum5, 0x23);
  __m256 perm2 = _mm256_permute2f128_ps(hsum6, hsum6, 0x23);
  __m256 perm3 = _mm256_permute2f128_ps(hsum7, hsum7, 0x23);
  __m256 perm4 = _mm256_permute2f128_ps(hsum8, hsum8, 0x23);

  __m256 sum1 = _mm256_add_ps(perm1, hsum5);
  __m256 sum2 = _mm256_add_ps(perm2, hsum6);
  __m256 sum3 = _mm256_add_ps(perm3, hsum7);
  __m256 sum4 = _mm256_add_ps(perm4, hsum8);

  __m256 blend1 = _mm256_blend_ps(sum1, sum2, 0xcc);
  __m256 blend2 = _mm256_blend_ps(sum3, sum4, 0xcc);

  __m256 final = _mm256_blend_ps(blend1, blend2, 0xf0);

  hsum1 = _mm256_hadd_ps(vecs0_1, vecs1_1);
  hsum2 = _mm256_hadd_ps(vecs2_1, vecs3_1);
  hsum3 = _mm256_hadd_ps(vecs4_1, vecs5_1);
  hsum4 = _mm256_hadd_ps(vecs6_1, vecs7_1);

  hsum5 = _mm256_hadd_ps(hsum1, hsum1);
  hsum6 = _mm256_hadd_ps(hsum2, hsum2);
  hsum7 = _mm256_hadd_ps(hsum3, hsum3);
  hsum8 = _mm256_hadd_ps(hsum4, hsum4);

  perm1 = _mm256_permute2f128_ps(hsum5, hsum5, 0x23);
  perm2 = _mm256_permute2f128_ps(hsum6, hsum6, 0x23);
  perm3 = _mm256_permute2f128_ps(hsum7, hsum7, 0x23);
  perm4 = _mm256_permute2f128_ps(hsum8, hsum8, 0x23);

  sum1 = _mm256_add_ps(perm1, hsum5);
  sum2 = _mm256_add_ps(perm2, hsum6);
  sum3 = _mm256_add_ps(perm3, hsum7);
  sum4 = _mm256_add_ps(perm4, hsum8);

  blend1 = _mm256_blend_ps(sum1, sum2, 0xcc);
  blend2 = _mm256_blend_ps(sum3, sum4, 0xcc);

  final = padd(final, _mm256_blend_ps(blend1, blend2, 0xf0));

  hsum1 = _mm256_hadd_ps(vecs8_0, vecs9_0);
  hsum2 = _mm256_hadd_ps(vecs10_0, vecs11_0);
  hsum3 = _mm256_hadd_ps(vecs12_0, vecs13_0);
  hsum4 = _mm256_hadd_ps(vecs14_0, vecs15_0);
  hsum5 = _mm256_hadd_ps(hsum1, hsum1);
  hsum6 = _mm256_hadd_ps(hsum2, hsum2);
  hsum7 = _mm256_hadd_ps(hsum3, hsum3);
  hsum8 = _mm256_hadd_ps(hsum4, hsum4);

  perm1 = _mm256_permute2f128_ps(hsum5, hsum5, 0x23);
  perm2 = _mm256_permute2f128_ps(hsum6, hsum6, 0x23);
  perm3 = _mm256_permute2f128_ps(hsum7, hsum7, 0x23);
  perm4 = _mm256_permute2f128_ps(hsum8, hsum8, 0x23);

  sum1 = _mm256_add_ps(perm1, hsum5);
  sum2 = _mm256_add_ps(perm2, hsum6);
  sum3 = _mm256_add_ps(perm3, hsum7);
  sum4 = _mm256_add_ps(perm4, hsum8);

  blend1 = _mm256_blend_ps(sum1, sum2, 0xcc);
  blend2 = _mm256_blend_ps(sum3, sum4, 0xcc);

  __m256 final_1 = _mm256_blend_ps(blend1, blend2, 0xf0);

  hsum1 = _mm256_hadd_ps(vecs8_1, vecs9_1);
  hsum2 = _mm256_hadd_ps(vecs10_1, vecs11_1);
  hsum3 = _mm256_hadd_ps(vecs12_1, vecs13_1);
  hsum4 = _mm256_hadd_ps(vecs14_1, vecs15_1);

  hsum5 = _mm256_hadd_ps(hsum1, hsum1);
  hsum6 = _mm256_hadd_ps(hsum2, hsum2);
  hsum7 = _mm256_hadd_ps(hsum3, hsum3);
  hsum8 = _mm256_hadd_ps(hsum4, hsum4);

  perm1 = _mm256_permute2f128_ps(hsum5, hsum5, 0x23);
  perm2 = _mm256_permute2f128_ps(hsum6, hsum6, 0x23);
  perm3 = _mm256_permute2f128_ps(hsum7, hsum7, 0x23);
  perm4 = _mm256_permute2f128_ps(hsum8, hsum8, 0x23);

  sum1 = _mm256_add_ps(perm1, hsum5);
  sum2 = _mm256_add_ps(perm2, hsum6);
  sum3 = _mm256_add_ps(perm3, hsum7);
  sum4 = _mm256_add_ps(perm4, hsum8);

  blend1 = _mm256_blend_ps(sum1, sum2, 0xcc);
  blend2 = _mm256_blend_ps(sum3, sum4, 0xcc);

  final_1 = padd(final_1, _mm256_blend_ps(blend1, blend2, 0xf0));

  __m512 final_output;
  EIGEN_INSERT_8f_INTO_16f(final_output, final, final_1);

  return final_output;
}
template <>
EIGEN_STRONG_INLINE Packet8d preduxp<Packet8d>(const Packet8d* vecs) {
  Packet4d vecs0_0 = _mm512_extractf64x4_pd(vecs[0], 0);
  Packet4d vecs0_1 = _mm512_extractf64x4_pd(vecs[0], 1);
  Packet4d vecs1_0 = _mm512_extractf64x4_pd(vecs[1], 0);
  Packet4d vecs1_1 = _mm512_extractf64x4_pd(vecs[1], 1);
  Packet4d vecs2_0 = _mm512_extractf64x4_pd(vecs[2], 0);
  Packet4d vecs2_1 = _mm512_extractf64x4_pd(vecs[2], 1);
  Packet4d vecs3_0 = _mm512_extractf64x4_pd(vecs[3], 0);
  Packet4d vecs3_1 = _mm512_extractf64x4_pd(vecs[3], 1);
  Packet4d vecs4_0 = _mm512_extractf64x4_pd(vecs[4], 0);
  Packet4d vecs4_1 = _mm512_extractf64x4_pd(vecs[4], 1);
  Packet4d vecs5_0 = _mm512_extractf64x4_pd(vecs[5], 0);
  Packet4d vecs5_1 = _mm512_extractf64x4_pd(vecs[5], 1);
  Packet4d vecs6_0 = _mm512_extractf64x4_pd(vecs[6], 0);
  Packet4d vecs6_1 = _mm512_extractf64x4_pd(vecs[6], 1);
  Packet4d vecs7_0 = _mm512_extractf64x4_pd(vecs[7], 0);
  Packet4d vecs7_1 = _mm512_extractf64x4_pd(vecs[7], 1);

  Packet4d tmp0, tmp1;

  tmp0 = _mm256_hadd_pd(vecs0_0, vecs1_0);
  tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1));

  tmp1 = _mm256_hadd_pd(vecs2_0, vecs3_0);
  tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1));

  __m256d final_0 = _mm256_blend_pd(tmp0, tmp1, 0xC);

  tmp0 = _mm256_hadd_pd(vecs0_1, vecs1_1);
  tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1));

  tmp1 = _mm256_hadd_pd(vecs2_1, vecs3_1);
  tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1));

  final_0 = padd(final_0, _mm256_blend_pd(tmp0, tmp1, 0xC));

  tmp0 = _mm256_hadd_pd(vecs4_0, vecs5_0);
  tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1));

  tmp1 = _mm256_hadd_pd(vecs6_0, vecs7_0);
  tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1));

  __m256d final_1 = _mm256_blend_pd(tmp0, tmp1, 0xC);

  tmp0 = _mm256_hadd_pd(vecs4_1, vecs5_1);
  tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1));

  tmp1 = _mm256_hadd_pd(vecs6_1, vecs7_1);
  tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1));

  final_1 = padd(final_1, _mm256_blend_pd(tmp0, tmp1, 0xC));

  __m512d final_output = _mm512_undefined_pd();
  final_output = _mm512_insertf64x4(final_output, final_0, 0);

  return _mm512_insertf64x4(final_output, final_1, 1);
}
template <>
EIGEN_STRONG_INLINE float predux<Packet16f>(const Packet16f& a) {
  //#ifdef EIGEN_VECTORIZE_AVX512DQ
#if 0
  Packet8f lane0 = _mm512_extractf32x8_ps(a, 0);
  Packet8f lane1 = _mm512_extractf32x8_ps(a, 1);
  Packet8f sum = padd(lane0, lane1);
  Packet8f tmp0 = _mm256_hadd_ps(sum, _mm256_permute2f128_ps(a, a, 1));
  tmp0 = _mm256_hadd_ps(tmp0, tmp0);
  return pfirst(_mm256_hadd_ps(tmp0, tmp0));
#else
  Packet4f lane0 = _mm512_extractf32x4_ps(a, 0);
  Packet4f lane1 = _mm512_extractf32x4_ps(a, 1);
  Packet4f lane2 = _mm512_extractf32x4_ps(a, 2);
  Packet4f lane3 = _mm512_extractf32x4_ps(a, 3);
  Packet4f sum = padd(padd(lane0, lane1), padd(lane2, lane3));
  sum = _mm_hadd_ps(sum, sum);
  sum = _mm_hadd_ps(sum, _mm_permute_ps(sum, 1));
  return pfirst(sum);
#endif
}
template <>
EIGEN_STRONG_INLINE double predux<Packet8d>(const Packet8d& a) {
  Packet4d lane0 = _mm512_extractf64x4_pd(a, 0);
  Packet4d lane1 = _mm512_extractf64x4_pd(a, 1);
  Packet4d sum = padd(lane0, lane1);
  Packet4d tmp0 = _mm256_hadd_pd(sum, _mm256_permute2f128_pd(sum, sum, 1));
  return pfirst(_mm256_hadd_pd(tmp0, tmp0));
}
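
// Usage sketch (illustrative, not part of the original file): predux is the
// horizontal tail of a reduction; accumulate packet-wise, reduce once at the
// end. Hypothetical helper:
static inline float avx512_dot_demo(const float* x, const float* y, Index n) {
  Packet16f acc = pset1<Packet16f>(0.0f);
  Index i = 0;
  for (; i + 16 <= n; i += 16)
    acc = padd(acc, pmul(ploadu<Packet16f>(x + i), ploadu<Packet16f>(y + i)));
  float r = predux(acc);                // sum of all 16 lanes
  for (; i < n; ++i) r += x[i] * y[i];  // scalar tail
  return r;
}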
template <>
EIGEN_STRONG_INLINE Packet8f predux_downto4<Packet16f>(const Packet16f& a) {
#ifdef EIGEN_VECTORIZE_AVX512DQ
  Packet8f lane0 = _mm512_extractf32x8_ps(a, 0);
  Packet8f lane1 = _mm512_extractf32x8_ps(a, 1);
  return padd(lane0, lane1);
#else
  Packet4f lane0 = _mm512_extractf32x4_ps(a, 0);
  Packet4f lane1 = _mm512_extractf32x4_ps(a, 1);
  Packet4f lane2 = _mm512_extractf32x4_ps(a, 2);
  Packet4f lane3 = _mm512_extractf32x4_ps(a, 3);
  Packet4f sum0 = padd(lane0, lane2);
  Packet4f sum1 = padd(lane1, lane3);
  return _mm256_insertf128_ps(_mm256_castps128_ps256(sum0), sum1, 1);
#endif
}
template <>
EIGEN_STRONG_INLINE Packet4d predux_downto4<Packet8d>(const Packet8d& a) {
  Packet4d lane0 = _mm512_extractf64x4_pd(a, 0);
  Packet4d lane1 = _mm512_extractf64x4_pd(a, 1);
  Packet4d res = padd(lane0, lane1);
  return res;
}
template <>
EIGEN_STRONG_INLINE float predux_mul<Packet16f>(const Packet16f& a) {
  //#ifdef EIGEN_VECTORIZE_AVX512DQ
#if 0
  Packet8f lane0 = _mm512_extractf32x8_ps(a, 0);
  Packet8f lane1 = _mm512_extractf32x8_ps(a, 1);
  Packet8f res = pmul(lane0, lane1);
  res = pmul(res, _mm256_permute2f128_ps(res, res, 1));
  res = pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
  return pfirst(pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
#else
  Packet4f lane0 = _mm512_extractf32x4_ps(a, 0);
  Packet4f lane1 = _mm512_extractf32x4_ps(a, 1);
  Packet4f lane2 = _mm512_extractf32x4_ps(a, 2);
  Packet4f lane3 = _mm512_extractf32x4_ps(a, 3);
  Packet4f res = pmul(pmul(lane0, lane1), pmul(lane2, lane3));
  res = pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
  return pfirst(pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
#endif
}
template <>
EIGEN_STRONG_INLINE double predux_mul<Packet8d>(const Packet8d& a) {
  Packet4d lane0 = _mm512_extractf64x4_pd(a, 0);
  Packet4d lane1 = _mm512_extractf64x4_pd(a, 1);
  Packet4d res = pmul(lane0, lane1);
  res = pmul(res, _mm256_permute2f128_pd(res, res, 1));
  return pfirst(pmul(res, _mm256_shuffle_pd(res, res, 1)));
}

template <>
EIGEN_STRONG_INLINE float predux_min<Packet16f>(const Packet16f& a) {
  Packet4f lane0 = _mm512_extractf32x4_ps(a, 0);
  Packet4f lane1 = _mm512_extractf32x4_ps(a, 1);
  Packet4f lane2 = _mm512_extractf32x4_ps(a, 2);
  Packet4f lane3 = _mm512_extractf32x4_ps(a, 3);
  Packet4f res = _mm_min_ps(_mm_min_ps(lane0, lane1), _mm_min_ps(lane2, lane3));
  res = _mm_min_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
  return pfirst(_mm_min_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
}
template <>
EIGEN_STRONG_INLINE double predux_min<Packet8d>(const Packet8d& a) {
  Packet4d lane0 = _mm512_extractf64x4_pd(a, 0);
  Packet4d lane1 = _mm512_extractf64x4_pd(a, 1);
  Packet4d res = _mm256_min_pd(lane0, lane1);
  res = _mm256_min_pd(res, _mm256_permute2f128_pd(res, res, 1));
  return pfirst(_mm256_min_pd(res, _mm256_shuffle_pd(res, res, 1)));
}

template <>
EIGEN_STRONG_INLINE float predux_max<Packet16f>(const Packet16f& a) {
  Packet4f lane0 = _mm512_extractf32x4_ps(a, 0);
  Packet4f lane1 = _mm512_extractf32x4_ps(a, 1);
  Packet4f lane2 = _mm512_extractf32x4_ps(a, 2);
  Packet4f lane3 = _mm512_extractf32x4_ps(a, 3);
  Packet4f res = _mm_max_ps(_mm_max_ps(lane0, lane1), _mm_max_ps(lane2, lane3));
  res = _mm_max_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
  return pfirst(_mm_max_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
}
template <>
EIGEN_STRONG_INLINE double predux_max<Packet8d>(const Packet8d& a) {
  Packet4d lane0 = _mm512_extractf64x4_pd(a, 0);
  Packet4d lane1 = _mm512_extractf64x4_pd(a, 1);
  Packet4d res = _mm256_max_pd(lane0, lane1);
  res = _mm256_max_pd(res, _mm256_permute2f128_pd(res, res, 1));
  return pfirst(_mm256_max_pd(res, _mm256_shuffle_pd(res, res, 1)));
}
template <int Offset>
struct palign_impl<Offset, Packet16f> {
  static EIGEN_STRONG_INLINE void run(Packet16f& first,
                                      const Packet16f& second) {
    if (Offset != 0) {
      __m512i first_idx = _mm512_set_epi32(
          Offset + 15, Offset + 14, Offset + 13, Offset + 12, Offset + 11,
          Offset + 10, Offset + 9, Offset + 8, Offset + 7, Offset + 6,
          Offset + 5, Offset + 4, Offset + 3, Offset + 2, Offset + 1, Offset);

      __m512i second_idx = _mm512_set_epi32(
          Offset - 1, Offset - 2, Offset - 3, Offset - 4, Offset - 5,
          Offset - 6, Offset - 7, Offset - 8, Offset - 9, Offset - 10,
          Offset - 11, Offset - 12, Offset - 13, Offset - 14, Offset - 15,
          Offset - 16);

      unsigned short mask = 0xFFFF;
      mask <<= (16 - Offset);

      first = _mm512_permutexvar_ps(first_idx, first);
      Packet16f tmp = _mm512_permutexvar_ps(second_idx, second);
      first = _mm512_mask_blend_ps(mask, first, tmp);
    }
  }
};
template <int Offset>
struct palign_impl<Offset, Packet8d> {
  static EIGEN_STRONG_INLINE void run(Packet8d& first, const Packet8d& second) {
    if (Offset != 0) {
      __m512i first_idx = _mm512_set_epi32(
          0, Offset + 7, 0, Offset + 6, 0, Offset + 5, 0, Offset + 4, 0,
          Offset + 3, 0, Offset + 2, 0, Offset + 1, 0, Offset);

      __m512i second_idx = _mm512_set_epi32(
          0, Offset - 1, 0, Offset - 2, 0, Offset - 3, 0, Offset - 4, 0,
          Offset - 5, 0, Offset - 6, 0, Offset - 7, 0, Offset - 8);

      unsigned char mask = 0xFF;
      mask <<= (8 - Offset);

      first = _mm512_permutexvar_pd(first_idx, first);
      Packet8d tmp = _mm512_permutexvar_pd(second_idx, second);
      first = _mm512_mask_blend_pd(mask, first, tmp);
    }
  }
};
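
// Sketch (illustrative, not part of the original file): palign_impl<Offset>
// shifts the concatenation of two packets, so two aligned loads can emulate
// one unaligned load. Hypothetical helper:
static inline Packet16f avx512_shifted_load_demo(const float* p) {
  Packet16f a = pload<Packet16f>(p);       // aligned packet at p
  Packet16f b = pload<Packet16f>(p + 16);  // next aligned packet
  palign_impl<4, Packet16f>::run(a, b);    // a == ploadu<Packet16f>(p + 4)
  return a;
}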
#define PACK_OUTPUT(OUTPUT, INPUT, INDEX, STRIDE) \
  EIGEN_INSERT_8f_INTO_16f(OUTPUT[INDEX], INPUT[INDEX], INPUT[INDEX + STRIDE]);

EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16f, 16>& kernel) {
  __m512 T0 = _mm512_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
  __m512 T1 = _mm512_unpackhi_ps(kernel.packet[0], kernel.packet[1]);
  __m512 T2 = _mm512_unpacklo_ps(kernel.packet[2], kernel.packet[3]);
  __m512 T3 = _mm512_unpackhi_ps(kernel.packet[2], kernel.packet[3]);
  __m512 T4 = _mm512_unpacklo_ps(kernel.packet[4], kernel.packet[5]);
  __m512 T5 = _mm512_unpackhi_ps(kernel.packet[4], kernel.packet[5]);
  __m512 T6 = _mm512_unpacklo_ps(kernel.packet[6], kernel.packet[7]);
  __m512 T7 = _mm512_unpackhi_ps(kernel.packet[6], kernel.packet[7]);
  __m512 T8 = _mm512_unpacklo_ps(kernel.packet[8], kernel.packet[9]);
  __m512 T9 = _mm512_unpackhi_ps(kernel.packet[8], kernel.packet[9]);
  __m512 T10 = _mm512_unpacklo_ps(kernel.packet[10], kernel.packet[11]);
  __m512 T11 = _mm512_unpackhi_ps(kernel.packet[10], kernel.packet[11]);
  __m512 T12 = _mm512_unpacklo_ps(kernel.packet[12], kernel.packet[13]);
  __m512 T13 = _mm512_unpackhi_ps(kernel.packet[12], kernel.packet[13]);
  __m512 T14 = _mm512_unpacklo_ps(kernel.packet[14], kernel.packet[15]);
  __m512 T15 = _mm512_unpackhi_ps(kernel.packet[14], kernel.packet[15]);
  __m512 S0 = _mm512_shuffle_ps(T0, T2, _MM_SHUFFLE(1, 0, 1, 0));
  __m512 S1 = _mm512_shuffle_ps(T0, T2, _MM_SHUFFLE(3, 2, 3, 2));
  __m512 S2 = _mm512_shuffle_ps(T1, T3, _MM_SHUFFLE(1, 0, 1, 0));
  __m512 S3 = _mm512_shuffle_ps(T1, T3, _MM_SHUFFLE(3, 2, 3, 2));
  __m512 S4 = _mm512_shuffle_ps(T4, T6, _MM_SHUFFLE(1, 0, 1, 0));
  __m512 S5 = _mm512_shuffle_ps(T4, T6, _MM_SHUFFLE(3, 2, 3, 2));
  __m512 S6 = _mm512_shuffle_ps(T5, T7, _MM_SHUFFLE(1, 0, 1, 0));
  __m512 S7 = _mm512_shuffle_ps(T5, T7, _MM_SHUFFLE(3, 2, 3, 2));
  __m512 S8 = _mm512_shuffle_ps(T8, T10, _MM_SHUFFLE(1, 0, 1, 0));
  __m512 S9 = _mm512_shuffle_ps(T8, T10, _MM_SHUFFLE(3, 2, 3, 2));
  __m512 S10 = _mm512_shuffle_ps(T9, T11, _MM_SHUFFLE(1, 0, 1, 0));
  __m512 S11 = _mm512_shuffle_ps(T9, T11, _MM_SHUFFLE(3, 2, 3, 2));
  __m512 S12 = _mm512_shuffle_ps(T12, T14, _MM_SHUFFLE(1, 0, 1, 0));
  __m512 S13 = _mm512_shuffle_ps(T12, T14, _MM_SHUFFLE(3, 2, 3, 2));
  __m512 S14 = _mm512_shuffle_ps(T13, T15, _MM_SHUFFLE(1, 0, 1, 0));
  __m512 S15 = _mm512_shuffle_ps(T13, T15, _MM_SHUFFLE(3, 2, 3, 2));

  EIGEN_EXTRACT_8f_FROM_16f(S0, S0);
  EIGEN_EXTRACT_8f_FROM_16f(S1, S1);
  EIGEN_EXTRACT_8f_FROM_16f(S2, S2);
  EIGEN_EXTRACT_8f_FROM_16f(S3, S3);
  EIGEN_EXTRACT_8f_FROM_16f(S4, S4);
  EIGEN_EXTRACT_8f_FROM_16f(S5, S5);
  EIGEN_EXTRACT_8f_FROM_16f(S6, S6);
  EIGEN_EXTRACT_8f_FROM_16f(S7, S7);
  EIGEN_EXTRACT_8f_FROM_16f(S8, S8);
  EIGEN_EXTRACT_8f_FROM_16f(S9, S9);
  EIGEN_EXTRACT_8f_FROM_16f(S10, S10);
  EIGEN_EXTRACT_8f_FROM_16f(S11, S11);
  EIGEN_EXTRACT_8f_FROM_16f(S12, S12);
  EIGEN_EXTRACT_8f_FROM_16f(S13, S13);
  EIGEN_EXTRACT_8f_FROM_16f(S14, S14);
  EIGEN_EXTRACT_8f_FROM_16f(S15, S15);

  PacketBlock<Packet8f, 32> tmp;

  tmp.packet[0] = _mm256_permute2f128_ps(S0_0, S4_0, 0x20);
  tmp.packet[1] = _mm256_permute2f128_ps(S1_0, S5_0, 0x20);
  tmp.packet[2] = _mm256_permute2f128_ps(S2_0, S6_0, 0x20);
  tmp.packet[3] = _mm256_permute2f128_ps(S3_0, S7_0, 0x20);
  tmp.packet[4] = _mm256_permute2f128_ps(S0_0, S4_0, 0x31);
  tmp.packet[5] = _mm256_permute2f128_ps(S1_0, S5_0, 0x31);
  tmp.packet[6] = _mm256_permute2f128_ps(S2_0, S6_0, 0x31);
  tmp.packet[7] = _mm256_permute2f128_ps(S3_0, S7_0, 0x31);

  tmp.packet[8] = _mm256_permute2f128_ps(S0_1, S4_1, 0x20);
  tmp.packet[9] = _mm256_permute2f128_ps(S1_1, S5_1, 0x20);
  tmp.packet[10] = _mm256_permute2f128_ps(S2_1, S6_1, 0x20);
  tmp.packet[11] = _mm256_permute2f128_ps(S3_1, S7_1, 0x20);
  tmp.packet[12] = _mm256_permute2f128_ps(S0_1, S4_1, 0x31);
  tmp.packet[13] = _mm256_permute2f128_ps(S1_1, S5_1, 0x31);
  tmp.packet[14] = _mm256_permute2f128_ps(S2_1, S6_1, 0x31);
  tmp.packet[15] = _mm256_permute2f128_ps(S3_1, S7_1, 0x31);

  // Second set of _m256 outputs
  tmp.packet[16] = _mm256_permute2f128_ps(S8_0, S12_0, 0x20);
  tmp.packet[17] = _mm256_permute2f128_ps(S9_0, S13_0, 0x20);
  tmp.packet[18] = _mm256_permute2f128_ps(S10_0, S14_0, 0x20);
  tmp.packet[19] = _mm256_permute2f128_ps(S11_0, S15_0, 0x20);
  tmp.packet[20] = _mm256_permute2f128_ps(S8_0, S12_0, 0x31);
  tmp.packet[21] = _mm256_permute2f128_ps(S9_0, S13_0, 0x31);
  tmp.packet[22] = _mm256_permute2f128_ps(S10_0, S14_0, 0x31);
  tmp.packet[23] = _mm256_permute2f128_ps(S11_0, S15_0, 0x31);

  tmp.packet[24] = _mm256_permute2f128_ps(S8_1, S12_1, 0x20);
  tmp.packet[25] = _mm256_permute2f128_ps(S9_1, S13_1, 0x20);
  tmp.packet[26] = _mm256_permute2f128_ps(S10_1, S14_1, 0x20);
  tmp.packet[27] = _mm256_permute2f128_ps(S11_1, S15_1, 0x20);
  tmp.packet[28] = _mm256_permute2f128_ps(S8_1, S12_1, 0x31);
  tmp.packet[29] = _mm256_permute2f128_ps(S9_1, S13_1, 0x31);
  tmp.packet[30] = _mm256_permute2f128_ps(S10_1, S14_1, 0x31);
  tmp.packet[31] = _mm256_permute2f128_ps(S11_1, S15_1, 0x31);

  // Pack them into the output
  PACK_OUTPUT(kernel.packet, tmp.packet, 0, 16);
  PACK_OUTPUT(kernel.packet, tmp.packet, 1, 16);
  PACK_OUTPUT(kernel.packet, tmp.packet, 2, 16);
  PACK_OUTPUT(kernel.packet, tmp.packet, 3, 16);
  PACK_OUTPUT(kernel.packet, tmp.packet, 4, 16);
  PACK_OUTPUT(kernel.packet, tmp.packet, 5, 16);
  PACK_OUTPUT(kernel.packet, tmp.packet, 6, 16);
  PACK_OUTPUT(kernel.packet, tmp.packet, 7, 16);
  PACK_OUTPUT(kernel.packet, tmp.packet, 8, 16);
  PACK_OUTPUT(kernel.packet, tmp.packet, 9, 16);
  PACK_OUTPUT(kernel.packet, tmp.packet, 10, 16);
  PACK_OUTPUT(kernel.packet, tmp.packet, 11, 16);
  PACK_OUTPUT(kernel.packet, tmp.packet, 12, 16);
  PACK_OUTPUT(kernel.packet, tmp.packet, 13, 16);
  PACK_OUTPUT(kernel.packet, tmp.packet, 14, 16);
  PACK_OUTPUT(kernel.packet, tmp.packet, 15, 16);
}
#define PACK_OUTPUT_2(OUTPUT, INPUT, INDEX, STRIDE)         \
  EIGEN_INSERT_8f_INTO_16f(OUTPUT[INDEX], INPUT[2 * INDEX], \
                           INPUT[2 * INDEX + STRIDE]);

EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16f, 4>& kernel) {
  __m512 T0 = _mm512_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
  __m512 T1 = _mm512_unpackhi_ps(kernel.packet[0], kernel.packet[1]);
  __m512 T2 = _mm512_unpacklo_ps(kernel.packet[2], kernel.packet[3]);
  __m512 T3 = _mm512_unpackhi_ps(kernel.packet[2], kernel.packet[3]);

  __m512 S0 = _mm512_shuffle_ps(T0, T2, _MM_SHUFFLE(1, 0, 1, 0));
  __m512 S1 = _mm512_shuffle_ps(T0, T2, _MM_SHUFFLE(3, 2, 3, 2));
  __m512 S2 = _mm512_shuffle_ps(T1, T3, _MM_SHUFFLE(1, 0, 1, 0));
  __m512 S3 = _mm512_shuffle_ps(T1, T3, _MM_SHUFFLE(3, 2, 3, 2));

  EIGEN_EXTRACT_8f_FROM_16f(S0, S0);
  EIGEN_EXTRACT_8f_FROM_16f(S1, S1);
  EIGEN_EXTRACT_8f_FROM_16f(S2, S2);
  EIGEN_EXTRACT_8f_FROM_16f(S3, S3);

  PacketBlock<Packet8f, 8> tmp;

  tmp.packet[0] = _mm256_permute2f128_ps(S0_0, S1_0, 0x20);
  tmp.packet[1] = _mm256_permute2f128_ps(S2_0, S3_0, 0x20);
  tmp.packet[2] = _mm256_permute2f128_ps(S0_0, S1_0, 0x31);
  tmp.packet[3] = _mm256_permute2f128_ps(S2_0, S3_0, 0x31);

  tmp.packet[4] = _mm256_permute2f128_ps(S0_1, S1_1, 0x20);
  tmp.packet[5] = _mm256_permute2f128_ps(S2_1, S3_1, 0x20);
  tmp.packet[6] = _mm256_permute2f128_ps(S0_1, S1_1, 0x31);
  tmp.packet[7] = _mm256_permute2f128_ps(S2_1, S3_1, 0x31);

  PACK_OUTPUT_2(kernel.packet, tmp.packet, 0, 1);
  PACK_OUTPUT_2(kernel.packet, tmp.packet, 1, 1);
  PACK_OUTPUT_2(kernel.packet, tmp.packet, 2, 1);
  PACK_OUTPUT_2(kernel.packet, tmp.packet, 3, 1);
}
#define PACK_OUTPUT_SQ_D(OUTPUT, INPUT, INDEX, STRIDE)                \
  OUTPUT[INDEX] = _mm512_insertf64x4(OUTPUT[INDEX], INPUT[INDEX], 0); \
  OUTPUT[INDEX] = _mm512_insertf64x4(OUTPUT[INDEX], INPUT[INDEX + STRIDE], 1);

#define PACK_OUTPUT_D(OUTPUT, INPUT, INDEX, STRIDE)                         \
  OUTPUT[INDEX] = _mm512_insertf64x4(OUTPUT[INDEX], INPUT[(2 * INDEX)], 0); \
  OUTPUT[INDEX] =                                                           \
      _mm512_insertf64x4(OUTPUT[INDEX], INPUT[(2 * INDEX) + STRIDE], 1);

EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8d, 4>& kernel) {
  __m512d T0 = _mm512_shuffle_pd(kernel.packet[0], kernel.packet[1], 0);
  __m512d T1 = _mm512_shuffle_pd(kernel.packet[0], kernel.packet[1], 0xff);
  __m512d T2 = _mm512_shuffle_pd(kernel.packet[2], kernel.packet[3], 0);
  __m512d T3 = _mm512_shuffle_pd(kernel.packet[2], kernel.packet[3], 0xff);

  PacketBlock<Packet4d, 8> tmp;

  tmp.packet[0] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 0),
                                         _mm512_extractf64x4_pd(T2, 0), 0x20);
  tmp.packet[1] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 0),
                                         _mm512_extractf64x4_pd(T3, 0), 0x20);
  tmp.packet[2] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 0),
                                         _mm512_extractf64x4_pd(T2, 0), 0x31);
  tmp.packet[3] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 0),
                                         _mm512_extractf64x4_pd(T3, 0), 0x31);

  tmp.packet[4] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 1),
                                         _mm512_extractf64x4_pd(T2, 1), 0x20);
  tmp.packet[5] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 1),
                                         _mm512_extractf64x4_pd(T3, 1), 0x20);
  tmp.packet[6] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 1),
                                         _mm512_extractf64x4_pd(T2, 1), 0x31);
  tmp.packet[7] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 1),
                                         _mm512_extractf64x4_pd(T3, 1), 0x31);

  PACK_OUTPUT_D(kernel.packet, tmp.packet, 0, 1);
  PACK_OUTPUT_D(kernel.packet, tmp.packet, 1, 1);
  PACK_OUTPUT_D(kernel.packet, tmp.packet, 2, 1);
  PACK_OUTPUT_D(kernel.packet, tmp.packet, 3, 1);
}
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8d, 8>& kernel) {
  __m512d T0 = _mm512_unpacklo_pd(kernel.packet[0], kernel.packet[1]);
  __m512d T1 = _mm512_unpackhi_pd(kernel.packet[0], kernel.packet[1]);
  __m512d T2 = _mm512_unpacklo_pd(kernel.packet[2], kernel.packet[3]);
  __m512d T3 = _mm512_unpackhi_pd(kernel.packet[2], kernel.packet[3]);
  __m512d T4 = _mm512_unpacklo_pd(kernel.packet[4], kernel.packet[5]);
  __m512d T5 = _mm512_unpackhi_pd(kernel.packet[4], kernel.packet[5]);
  __m512d T6 = _mm512_unpacklo_pd(kernel.packet[6], kernel.packet[7]);
  __m512d T7 = _mm512_unpackhi_pd(kernel.packet[6], kernel.packet[7]);

  PacketBlock<Packet4d, 16> tmp;

  tmp.packet[0] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 0),
                                         _mm512_extractf64x4_pd(T2, 0), 0x20);
  tmp.packet[1] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 0),
                                         _mm512_extractf64x4_pd(T3, 0), 0x20);
  tmp.packet[2] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 0),
                                         _mm512_extractf64x4_pd(T2, 0), 0x31);
  tmp.packet[3] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 0),
                                         _mm512_extractf64x4_pd(T3, 0), 0x31);

  tmp.packet[4] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 1),
                                         _mm512_extractf64x4_pd(T2, 1), 0x20);
  tmp.packet[5] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 1),
                                         _mm512_extractf64x4_pd(T3, 1), 0x20);
  tmp.packet[6] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 1),
                                         _mm512_extractf64x4_pd(T2, 1), 0x31);
  tmp.packet[7] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 1),
                                         _mm512_extractf64x4_pd(T3, 1), 0x31);

  tmp.packet[8] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T4, 0),
                                         _mm512_extractf64x4_pd(T6, 0), 0x20);
  tmp.packet[9] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T5, 0),
                                         _mm512_extractf64x4_pd(T7, 0), 0x20);
  tmp.packet[10] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T4, 0),
                                          _mm512_extractf64x4_pd(T6, 0), 0x31);
  tmp.packet[11] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T5, 0),
                                          _mm512_extractf64x4_pd(T7, 0), 0x31);

  tmp.packet[12] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T4, 1),
                                          _mm512_extractf64x4_pd(T6, 1), 0x20);
  tmp.packet[13] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T5, 1),
                                          _mm512_extractf64x4_pd(T7, 1), 0x20);
  tmp.packet[14] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T4, 1),
                                          _mm512_extractf64x4_pd(T6, 1), 0x31);
  tmp.packet[15] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T5, 1),
                                          _mm512_extractf64x4_pd(T7, 1), 0x31);

  PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 0, 8);
  PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 1, 8);
  PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 2, 8);
  PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 3, 8);
  PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 4, 8);
  PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 5, 8);
  PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 6, 8);
  PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 7, 8);
}
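
// Usage sketch (illustrative, not part of the original file): ptranspose works
// in place on a PacketBlock; e.g. transposing an 8x8 tile of doubles
// (hypothetical helper):
static inline void avx512_transpose8x8_demo(const double* in, double* out) {
  PacketBlock<Packet8d, 8> block;
  for (int r = 0; r < 8; ++r) block.packet[r] = ploadu<Packet8d>(in + 8 * r);
  ptranspose(block);  // block.packet[c] now holds column c of the input tile
  for (int c = 0; c < 8; ++c) pstoreu(out + 8 * c, block.packet[c]);
}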
template <>
EIGEN_STRONG_INLINE Packet16f pblend(const Selector<16>& /*ifPacket*/,
                                     const Packet16f& /*thenPacket*/,
                                     const Packet16f& /*elsePacket*/) {
  assert(false && "To be implemented");
  return Packet16f();
}
template <>
EIGEN_STRONG_INLINE Packet8d pblend(const Selector<8>& /*ifPacket*/,
                                    const Packet8d& /*thenPacket*/,
                                    const Packet8d& /*elsePacket*/) {
  assert(false && "To be implemented");
  return Packet8d();
}

}  // end namespace internal

}  // end namespace Eigen

#endif  // EIGEN_PACKET_MATH_AVX512_H
external/eigen3/Eigen/src/Core/arch/AltiVec/CMakeLists.txt (deleted, mode 100644 → 0) @ 701c0225

FILE(GLOB Eigen_Core_arch_AltiVec_SRCS "*.h")

INSTALL(FILES
  ${Eigen_Core_arch_AltiVec_SRCS}
  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Core/arch/AltiVec COMPONENT Devel
  )
external/eigen3/Eigen/src/Core/arch/AltiVec/Complex.h
View file @
a394b22a
...
...
@@ -2,30 +2,34 @@
// for linear algebra.
//
// Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr>
// Copyright (C) 2010-2016 Konstantinos Margaritis <markos@freevec.org>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_COMPLEX_ALTIVEC_H
#define EIGEN_COMPLEX_ALTIVEC_H
#ifndef EIGEN_COMPLEX
32
_ALTIVEC_H
#define EIGEN_COMPLEX
32
_ALTIVEC_H
namespace
Eigen
{
namespace
internal
{
static
Packet4ui
p4ui_CONJ_XOR
=
vec_mergeh
((
Packet4ui
)
p4i_ZERO
,
(
Packet4ui
)
p4f_ZERO_
);
//{ 0x00000000, 0x80000000, 0x00000000, 0x80000000 };
static
Packet16uc
p16uc_COMPLEX_RE
=
vec_sld
((
Packet16uc
)
vec_splat
((
Packet4ui
)
p16uc_FORWARD
,
0
),
(
Packet16uc
)
vec_splat
((
Packet4ui
)
p16uc_FORWARD
,
2
),
8
);
//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
static
Packet16uc
p16uc_COMPLEX_IM
=
vec_sld
((
Packet16uc
)
vec_splat
((
Packet4ui
)
p16uc_FORWARD
,
1
),
(
Packet16uc
)
vec_splat
((
Packet4ui
)
p16uc_FORWARD
,
3
),
8
);
//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
static
Packet16uc
p16uc_COMPLEX_REV
=
vec_sld
(
p16uc_REVERSE
,
p16uc_REVERSE
,
8
);
//{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 };
static
Packet16uc
p16uc_COMPLEX_REV2
=
vec_sld
(
p16uc_FORWARD
,
p16uc_FORWARD
,
8
);
//{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
static
Packet16uc
p16uc_PSET_HI
=
(
Packet16uc
)
vec_mergeh
((
Packet4ui
)
vec_splat
((
Packet4ui
)
p16uc_FORWARD
,
0
),
(
Packet4ui
)
vec_splat
((
Packet4ui
)
p16uc_FORWARD
,
1
));
//{ 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 };
static
Packet16uc
p16uc_PSET_LO
=
(
Packet16uc
)
vec_mergeh
((
Packet4ui
)
vec_splat
((
Packet4ui
)
p16uc_FORWARD
,
2
),
(
Packet4ui
)
vec_splat
((
Packet4ui
)
p16uc_FORWARD
,
3
));
//{ 8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15 };
static
Packet4ui
p4ui_CONJ_XOR
=
vec_mergeh
((
Packet4ui
)
p4i_ZERO
,
(
Packet4ui
)
p4f_MZERO
);
//{ 0x00000000, 0x80000000, 0x00000000, 0x80000000 };
#ifdef __VSX__
#if defined(_BIG_ENDIAN)
static
Packet2ul
p2ul_CONJ_XOR1
=
(
Packet2ul
)
vec_sld
((
Packet4ui
)
p2d_MZERO
,
(
Packet4ui
)
p2l_ZERO
,
8
);
//{ 0x8000000000000000, 0x0000000000000000 };
static
Packet2ul
p2ul_CONJ_XOR2
=
(
Packet2ul
)
vec_sld
((
Packet4ui
)
p2l_ZERO
,
(
Packet4ui
)
p2d_MZERO
,
8
);
//{ 0x8000000000000000, 0x0000000000000000 };
#else
static
Packet2ul
p2ul_CONJ_XOR1
=
(
Packet2ul
)
vec_sld
((
Packet4ui
)
p2l_ZERO
,
(
Packet4ui
)
p2d_MZERO
,
8
);
//{ 0x8000000000000000, 0x0000000000000000 };
static
Packet2ul
p2ul_CONJ_XOR2
=
(
Packet2ul
)
vec_sld
((
Packet4ui
)
p2d_MZERO
,
(
Packet4ui
)
p2l_ZERO
,
8
);
//{ 0x8000000000000000, 0x0000000000000000 };
#endif
#endif
//---------- float ----------
struct
Packet2cf
{
EIGEN_STRONG_INLINE
Packet2cf
()
{}
EIGEN_STRONG_INLINE
explicit
Packet2cf
()
:
v
(
p4f_ZERO
)
{}
EIGEN_STRONG_INLINE
explicit
Packet2cf
(
const
Packet4f
&
a
)
:
v
(
a
)
{}
Packet4f
v
;
};
...
...
@@ -33,10 +37,12 @@ struct Packet2cf
template
<
>
struct
packet_traits
<
std
::
complex
<
float
>
>
:
default_packet_traits
{
typedef
Packet2cf
type
;
typedef
Packet2cf
half
;
enum
{
Vectorizable
=
1
,
AlignedOnScalar
=
1
,
size
=
2
,
HasHalfPacket
=
0
,
HasAdd
=
1
,
HasSub
=
1
,
...
...
@@ -47,65 +53,78 @@ template<> struct packet_traits<std::complex<float> > : default_packet_traits
HasAbs2
=
0
,
HasMin
=
0
,
HasMax
=
0
,
#ifdef __VSX__
HasBlend
=
1
,
#endif
HasSetLinear
=
0
};
};
template
<
>
struct
unpacket_traits
<
Packet2cf
>
{
typedef
std
::
complex
<
float
>
type
;
enum
{
size
=
2
}
;
};
template
<
>
struct
unpacket_traits
<
Packet2cf
>
{
typedef
std
::
complex
<
float
>
type
;
enum
{
size
=
2
,
alignment
=
Aligned16
};
typedef
Packet2cf
half
;
};
template
<
>
EIGEN_STRONG_INLINE
Packet2cf
pset1
<
Packet2cf
>
(
const
std
::
complex
<
float
>&
from
)
{
Packet2cf
res
;
/* On AltiVec we cannot load 64-bit registers, so wa have to take care of alignment */
if
((
ptrdiff_t
(
&
from
)
%
16
)
==
0
)
if
((
std
::
ptrdiff_t
(
&
from
)
%
16
)
==
0
)
res
.
v
=
pload
<
Packet4f
>
((
const
float
*
)
&
from
);
else
res
.
v
=
ploadu
<
Packet4f
>
((
const
float
*
)
&
from
);
res
.
v
=
vec_perm
(
res
.
v
,
res
.
v
,
p16uc_PSET_HI
);
res
.
v
=
vec_perm
(
res
.
v
,
res
.
v
,
p16uc_PSET
64
_HI
);
return
res
;
}
template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_add(a.v, b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_sub(a.v, b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cf pload<Packet2cf>(const std::complex<float>* from) { return Packet2cf(pload<Packet4f>((const float*)from)); }
template<> EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from) { return Packet2cf(ploadu<Packet4f>((const float*)from)); }
template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from) { return pset1<Packet2cf>(*from); }
template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { pstore((float*)to, from.v); }
template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { pstoreu((float*)to, from.v); }

template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, Index stride)
{
  std::complex<float> EIGEN_ALIGN16 af[2];
  af[0] = from[0*stride];
  af[1] = from[1*stride];
  return pload<Packet2cf>(af);
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, Index stride)
{
  std::complex<float> EIGEN_ALIGN16 af[2];
  pstore<std::complex<float> >((std::complex<float> *) af, from);
  to[0*stride] = af[0];
  to[1*stride] = af[1];
}
template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(a.v + b.v); }
template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(a.v - b.v); }
template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate(a.v)); }
template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) { return Packet2cf((Packet4f)vec_xor((Packet4ui)a.v, p4ui_CONJ_XOR)); }
template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) { return Packet2cf(pxor<Packet4f>(a.v, reinterpret_cast<Packet4f>(p4ui_CONJ_XOR))); }
template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
{
  Packet4f v1, v2;

  // Permute and multiply the real parts of a and b
  v1 = vec_perm(a.v, a.v, p16uc_COMPLEX_RE);
  v1 = vec_perm(a.v, a.v, p16uc_PSET32_WODD);
  // Get the imaginary parts of a
  v2 = vec_perm(a.v, a.v, p16uc_COMPLEX_IM);
  v2 = vec_perm(a.v, a.v, p16uc_PSET32_WEVEN);
  // multiply a_re * b
  v1 = vec_madd(v1, b.v, p4f_ZERO);
  // multiply a_im * b and get the conjugate result
  v2 = vec_madd(v2, b.v, p4f_ZERO);
  v2 = (Packet4f) vec_xor((Packet4ui) v2, p4ui_CONJ_XOR);
  v2 = reinterpret_cast<Packet4f>(pxor(v2, reinterpret_cast<Packet4f>(p4ui_CONJ_XOR)));
  // permute back to a proper order
  v2 = vec_perm(v2, v2, p16uc_COMPLEX_REV);
  v2 = vec_perm(v2, v2, p16uc_COMPLEX32_REV);

  return Packet2cf(vec_add(v1, v2));
  return Packet2cf(padd<Packet4f>(v1, v2));
}
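To see why the permute/madd/XOR/swap sequence above is a complex multiply, here is the same lane algebra written out as scalar C++ (mul_like_pmul is an illustrative name; only the decomposition, not the API, mirrors the kernel):

#include <complex>

std::complex<float> mul_like_pmul(std::complex<float> a, std::complex<float> b) {
  float v1_re = a.real() * b.real();  // v1 = splat(a_re) * b
  float v1_im = a.real() * b.imag();
  float v2_re = a.imag() * b.real();  // v2 = splat(a_im) * b
  float v2_im = a.imag() * b.imag();
  v2_im = -v2_im;                     // CONJ_XOR: negate the imaginary lane
  // COMPLEX32_REV swaps v2's lanes before the final add:
  return std::complex<float>(v1_re + v2_im, v1_im + v2_re);
}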
template<> EIGEN_STRONG_INLINE Packet2cf pand   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_and(a.v, b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cf por    <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_or(a.v, b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cf pxor   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_xor(a.v, b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_and(a.v, vec_nor(b.v, b.v))); }
template<> EIGEN_STRONG_INLINE Packet2cf pand   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pand<Packet4f>(a.v, b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cf por    <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(por<Packet4f>(a.v, b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cf pxor   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pxor<Packet4f>(a.v, b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pandnot<Packet4f>(a.v, b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cf pload <Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>((const float*)from)); }
template<> EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu<Packet4f>((const float*)from)); }
template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from) { return pset1<Packet2cf>(*from); }
template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); }
template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); }

template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> *   addr) { vec_dstt((float *)addr, DST_CTRL(2,2,32), DST_CHAN); }
template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> *   addr) { EIGEN_PPC_PREFETCH(addr); }

template<> EIGEN_STRONG_INLINE std::complex<float>  pfirst<Packet2cf>(const Packet2cf& a)
{
...
@@ -118,26 +137,30 @@ template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Pack
template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a)
{
  Packet4f rev_a;
  rev_a = vec_perm(a.v, a.v, p16uc_COMPLEX_REV2);
  rev_a = vec_perm(a.v, a.v, p16uc_COMPLEX32_REV2);
  return Packet2cf(rev_a);
}

template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a)
{
  Packet4f b;
  b = (Packet4f) vec_sld(a.v, a.v, 8);
  b = padd(a.v, b);
  return pfirst(Packet2cf(b));
  b = vec_sld(a.v, a.v, 8);
  b = padd<Packet4f>(a.v, b);
  return pfirst<Packet2cf>(Packet2cf(b));
}

template<> EIGEN_STRONG_INLINE Packet2cf preduxp<Packet2cf>(const Packet2cf* vecs)
{
  Packet4f b1, b2;
  b1 = (Packet4f) vec_sld(vecs[0].v, vecs[1].v, 8);
  b2 = (Packet4f) vec_sld(vecs[1].v, vecs[0].v, 8);
  b2 = (Packet4f) vec_sld(b2, b2, 8);
  b2 = padd(b1, b2);
#ifdef _BIG_ENDIAN
  b1 = vec_sld(vecs[0].v, vecs[1].v, 8);
  b2 = vec_sld(vecs[1].v, vecs[0].v, 8);
#else
  b1 = vec_sld(vecs[1].v, vecs[0].v, 8);
  b2 = vec_sld(vecs[0].v, vecs[1].v, 8);
#endif
  b2 = vec_sld(b2, b2, 8);
  b2 = padd<Packet4f>(b1, b2);

  return Packet2cf(b2);
}
...
@@ -146,10 +169,10 @@ template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const P
{
  Packet4f b;
  Packet2cf prod;
  b = (Packet4f) vec_sld(a.v, a.v, 8);
  prod = pmul(a, Packet2cf(b));
  b = vec_sld(a.v, a.v, 8);
  prod = pmul<Packet2cf>(a, Packet2cf(b));

  return pfirst(prod);
  return pfirst<Packet2cf>(prod);
}

template<int Offset>
...
@@ -159,7 +182,11 @@ struct palign_impl<Offset,Packet2cf>
{
    if (Offset==1)
    {
#ifdef _BIG_ENDIAN
      first.v = vec_sld(first.v, second.v, 8);
#else
      first.v = vec_sld(second.v, first.v, 8);
#endif
    }
  }
};
...
@@ -197,21 +224,238 @@ template<> struct conj_helper<Packet2cf, Packet2cf, true,true>
  }
};
template<> struct conj_helper<Packet4f, Packet2cf, false,false>
{
  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet4f& x, const Packet2cf& y, const Packet2cf& c) const
  { return padd(c, pmul(x,y)); }

  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet4f& x, const Packet2cf& y) const
  { return Packet2cf(internal::pmul<Packet4f>(x, y.v)); }
};

template<> struct conj_helper<Packet2cf, Packet4f, false,false>
{
  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet4f& y, const Packet2cf& c) const
  { return padd(c, pmul(x,y)); }

  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& x, const Packet4f& y) const
  { return Packet2cf(internal::pmul<Packet4f>(x.v, y)); }
};
template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
{
  // TODO optimize it for AltiVec
  Packet2cf res = conj_helper<Packet2cf,Packet2cf,false,true>().pmul(a, b);
  Packet4f s = vec_madd(b.v, b.v, p4f_ZERO);
  return Packet2cf(pdiv(res.v, vec_add(s, vec_perm(s, s, p16uc_COMPLEX_REV))));
  Packet2cf res = conj_helper<Packet2cf,Packet2cf,false,true>().pmul(a, b);
  Packet4f s = pmul<Packet4f>(b.v, b.v);
  return Packet2cf(pdiv(res.v, padd<Packet4f>(s, vec_perm(s, s, p16uc_COMPLEX32_REV))));
}
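Both variants of pdiv implement the usual identity a/b = a*conj(b) / |b|^2, with |b|^2 built by squaring b lane-wise and adding the lane-swapped copy so the denominator lands in both lanes. A scalar sketch in plain C++ (div_like_pdiv is an illustrative name):

#include <complex>

std::complex<float> div_like_pdiv(std::complex<float> a, std::complex<float> b) {
  std::complex<float> num = a * std::conj(b);  // conj_helper<...,false,true>::pmul
  float s_re = b.real() * b.real();            // s = b.v * b.v, lane-wise
  float s_im = b.imag() * b.imag();
  float denom = s_re + s_im;                   // s + lane-swapped s
  return std::complex<float>(num.real() / denom, num.imag() / denom);
}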
template<> EIGEN_STRONG_INLINE Packet2cf pcplxflip<Packet2cf>(const Packet2cf& x)
{
  return Packet2cf(vec_perm(x.v, x.v, p16uc_COMPLEX_REV));
  return Packet2cf(vec_perm(x.v, x.v, p16uc_COMPLEX32_REV));
}

EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2cf,2>& kernel)
{
  Packet4f tmp = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_HI);
  kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_LO);
  kernel.packet[0].v = tmp;
}
#ifdef __VSX__
template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) {
  Packet2cf result;
  result.v = reinterpret_cast<Packet4f>(pblend<Packet2d>(ifPacket, reinterpret_cast<Packet2d>(thenPacket.v), reinterpret_cast<Packet2d>(elsePacket.v)));
  return result;
}
#endif

//---------- double ----------
#ifdef __VSX__
struct Packet1cd
{
  EIGEN_STRONG_INLINE Packet1cd() {}
  EIGEN_STRONG_INLINE explicit Packet1cd(const Packet2d& a) : v(a) {}
  Packet2d v;
};
template<> struct packet_traits<std::complex<double> >  : default_packet_traits
{
  typedef Packet1cd type;
  typedef Packet1cd half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 0,
    size = 1,
    HasHalfPacket = 0,

    HasAdd    = 1,
    HasSub    = 1,
    HasMul    = 1,
    HasDiv    = 1,
    HasNegate = 1,
    HasAbs    = 0,
    HasAbs2   = 0,
    HasMin    = 0,
    HasMax    = 0,
    HasSetLinear = 0
  };
};

template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16}; typedef Packet1cd half; };
template<> EIGEN_STRONG_INLINE Packet1cd pload <Packet1cd>(const std::complex<double>* from) { return Packet1cd(pload<Packet2d>((const double*)from)); }
template<> EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) { return Packet1cd(ploadu<Packet2d>((const double*)from)); }
template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { pstore((double*)to, from.v); }
template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { pstoreu((double*)to, from.v); }

template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>&  from)
{ /* here we really have to use unaligned loads :( */ return ploadu<Packet1cd>(&from); }
template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from, Index stride)
{
  std::complex<double> EIGEN_ALIGN16 af[2];
  af[0] = from[0*stride];
  af[1] = from[1*stride];
  return pload<Packet1cd>(af);
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from, Index stride)
{
  std::complex<double> EIGEN_ALIGN16 af[2];
  pstore<std::complex<double> >(af, from);
  to[0*stride] = af[0];
  to[1*stride] = af[1];
}
template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v + b.v); }
template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v - b.v); }
template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(Packet2d(a.v))); }
template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) { return Packet1cd(pxor(a.v, reinterpret_cast<Packet2d>(p2ul_CONJ_XOR2))); }
template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
{
  Packet2d a_re, a_im, v1, v2;

  // Permute and multiply the real parts of a and b
  a_re = vec_perm(a.v, a.v, p16uc_PSET64_HI);
  // Get the imaginary parts of a
  a_im = vec_perm(a.v, a.v, p16uc_PSET64_LO);
  // multiply a_re * b
  v1 = vec_madd(a_re, b.v, p2d_ZERO);
  // multiply a_im * b and get the conjugate result
  v2 = vec_madd(a_im, b.v, p2d_ZERO);
  v2 = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(v2), reinterpret_cast<Packet4ui>(v2), 8));
  v2 = pxor(v2, reinterpret_cast<Packet2d>(p2ul_CONJ_XOR1));

  return Packet1cd(padd<Packet2d>(v1, v2));
}
template<> EIGEN_STRONG_INLINE Packet1cd pand   <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(pand(a.v, b.v)); }
template<> EIGEN_STRONG_INLINE Packet1cd por    <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(por(a.v, b.v)); }
template<> EIGEN_STRONG_INLINE Packet1cd pxor   <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(pxor(a.v, b.v)); }
template<> EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(pandnot(a.v, b.v)); }

template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from) { return pset1<Packet1cd>(*from); }
template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> *   addr) { EIGEN_PPC_PREFETCH(addr); }

template<> EIGEN_STRONG_INLINE std::complex<double>  pfirst<Packet1cd>(const Packet1cd& a)
{
  std::complex<double> EIGEN_ALIGN16 res[2];
  pstore<std::complex<double> >(res, a);

  return res[0];
}

template<> EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { return a; }

template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a) { return pfirst(a); }
template<> EIGEN_STRONG_INLINE Packet1cd preduxp<Packet1cd>(const Packet1cd* vecs)        { return vecs[0]; }
template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a) { return pfirst(a); }
template<int Offset>
struct palign_impl<Offset,Packet1cd>
{
  static EIGEN_STRONG_INLINE void run(Packet1cd& /*first*/, const Packet1cd& /*second*/)
  {
    // FIXME is it sure we never have to align a Packet1cd?
    // Even though a std::complex<double> has 16 bytes, it is not necessarily aligned on a 16 bytes boundary...
  }
};
template<> struct conj_helper<Packet1cd, Packet1cd, false,true>
{
  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
  { return padd(pmul(x,y),c); }

  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
  {
    return internal::pmul(a, pconj(b));
  }
};

template<> struct conj_helper<Packet1cd, Packet1cd, true,false>
{
  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
  { return padd(pmul(x,y),c); }

  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
  {
    return internal::pmul(pconj(a), b);
  }
};

template<> struct conj_helper<Packet1cd, Packet1cd, true,true>
{
  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
  { return padd(pmul(x,y),c); }

  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
  {
    return pconj(internal::pmul(a, b));
  }
};

template<> struct conj_helper<Packet2d, Packet1cd, false,false>
{
  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet2d& x, const Packet1cd& y, const Packet1cd& c) const
  { return padd(c, pmul(x,y)); }

  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet2d& x, const Packet1cd& y) const
  { return Packet1cd(internal::pmul<Packet2d>(x, y.v)); }
};

template<> struct conj_helper<Packet1cd, Packet2d, false,false>
{
  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet2d& y, const Packet1cd& c) const
  { return padd(c, pmul(x,y)); }

  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& x, const Packet2d& y) const
  { return Packet1cd(internal::pmul<Packet2d>(x.v, y)); }
};
template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
{
  // TODO optimize it for AltiVec
  Packet1cd res = conj_helper<Packet1cd,Packet1cd,false,true>().pmul(a,b);
  Packet2d s = pmul<Packet2d>(b.v, b.v);
  return Packet1cd(pdiv(res.v, padd<Packet2d>(s, vec_perm(s, s, p16uc_REVERSE64))));
}

EIGEN_STRONG_INLINE Packet1cd pcplxflip/*<Packet1cd>*/(const Packet1cd& x)
{
  return Packet1cd(preverse(Packet2d(x.v)));
}
EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet1cd,2>& kernel)
{
  Packet2d tmp = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_HI);
  kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_LO);
  kernel.packet[0].v = tmp;
}
#endif // __VSX__
} // end namespace internal
} // end namespace Eigen

#endif // EIGEN_COMPLEX_ALTIVEC_H
#endif // EIGEN_COMPLEX32_ALTIVEC_H
external/eigen3/Eigen/src/Core/arch/AltiVec/MathFunctions.h
0 → 100644
View file @ a394b22a
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2007 Julien Pommier
// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
// Copyright (C) 2016 Konstantinos Margaritis <markos@freevec.org>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
/* The sin, cos, exp, and log functions of this file come from
* Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
*/
#ifndef EIGEN_MATH_FUNCTIONS_ALTIVEC_H
#define EIGEN_MATH_FUNCTIONS_ALTIVEC_H
namespace Eigen {

namespace internal {
static _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
static _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
static _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
static _EIGEN_DECLARE_CONST_Packet4i(23, 23);

static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inv_mant_mask, ~0x7f800000);

/* the smallest non denormalized float number */
static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(min_norm_pos,  0x00800000);
static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_inf,     0xff800000); // -1.f/0.f
static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_nan,     0xffffffff);

/* natural logarithm computed for 4 simultaneous float
   return NaN for x <= 0
*/
static _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f);
static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292E-2f);
static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, -1.1514610310E-1f);
static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740E-1f);
static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, -1.2420140846E-1f);
static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, +1.4249322787E-1f);
static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, -1.6668057665E-1f);
static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, +2.0000714765E-1f);
static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, -2.4999993993E-1f);
static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, +3.3333331174E-1f);
static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f);
static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f);

static _EIGEN_DECLARE_CONST_Packet4f(exp_hi,  88.3762626647950f);
static _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f);

static _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f);
static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f);
static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f);

static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f);
static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f);
static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f);
static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f);
static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f);
static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f);

#ifdef __VSX__
static _EIGEN_DECLARE_CONST_Packet2d(1 , 1.0);
static _EIGEN_DECLARE_CONST_Packet2d(2 , 2.0);
static _EIGEN_DECLARE_CONST_Packet2d(half, 0.5);

static _EIGEN_DECLARE_CONST_Packet2d(exp_hi,  709.437);
static _EIGEN_DECLARE_CONST_Packet2d(exp_lo, -709.436139303);

static _EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599);

static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4);
static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2);
static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1);

static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6);
static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3);
static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1);
static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0);

static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125);
static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6);

#ifdef __POWER8_VECTOR__
static Packet2l  p2l_1023 = { 1023, 1023 };
static Packet2ul p2ul_52  = { 52, 52 };
#endif

#endif
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet4f plog<Packet4f>(const Packet4f& _x)
{
  Packet4f x = _x;

  Packet4i emm0;

  /* isvalid_mask is 0 if x < 0 or x is NaN. */
  Packet4ui isvalid_mask = reinterpret_cast<Packet4ui>(vec_cmpge(x, p4f_ZERO));
  Packet4ui iszero_mask  = reinterpret_cast<Packet4ui>(vec_cmpeq(x, p4f_ZERO));

  x = pmax(x, p4f_min_norm_pos);  /* cut off denormalized stuff */
  emm0 = vec_sr(reinterpret_cast<Packet4i>(x),
                reinterpret_cast<Packet4ui>(p4i_23));

  /* keep only the fractional part */
  x = pand(x, p4f_inv_mant_mask);
  x = por(x, p4f_half);

  emm0 = psub(emm0, p4i_0x7f);
  Packet4f e = padd(vec_ctf(emm0, 0), p4f_1);

  /* part2:
     if( x < SQRTHF ) {
       e -= 1;
       x = x + x - 1.0;
     } else { x = x - 1.0; }
  */
  Packet4f mask = reinterpret_cast<Packet4f>(vec_cmplt(x, p4f_cephes_SQRTHF));
  Packet4f tmp = pand(x, mask);
  x = psub(x, p4f_1);
  e = psub(e, pand(p4f_1, mask));
  x = padd(x, tmp);

  Packet4f x2 = pmul(x,x);
  Packet4f x3 = pmul(x2,x);

  Packet4f y, y1, y2;
  y  = pmadd(p4f_cephes_log_p0, x, p4f_cephes_log_p1);
  y1 = pmadd(p4f_cephes_log_p3, x, p4f_cephes_log_p4);
  y2 = pmadd(p4f_cephes_log_p6, x, p4f_cephes_log_p7);
  y  = pmadd(y , x, p4f_cephes_log_p2);
  y1 = pmadd(y1, x, p4f_cephes_log_p5);
  y2 = pmadd(y2, x, p4f_cephes_log_p8);
  y  = pmadd(y, x3, y1);
  y  = pmadd(y, x3, y2);
  y  = pmul(y, x3);

  y1  = pmul(e, p4f_cephes_log_q1);
  tmp = pmul(x2, p4f_half);
  y   = padd(y, y1);
  x   = psub(x, tmp);
  y2  = pmul(e, p4f_cephes_log_q2);
  x   = padd(x, y);
  x   = padd(x, y2);

  // negative arg will be NAN, 0 will be -INF
  x = vec_sel(x, p4f_minus_inf, iszero_mask);
  x = vec_sel(p4f_minus_nan, x, isvalid_mask);
  return x;
}
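The kernel above follows the classic cephes scheme: split x = 2^e * m, nudge m into [sqrt(1/2), sqrt(2)) so the polynomial argument stays near zero, evaluate the polynomial, then add e*ln(2) back. A scalar sketch of the same range reduction in plain C++ (log_like_plog is an illustrative name, and its short polynomial is only a low-order stand-in for the degree-9 cephes fit):

#include <cmath>

float log_like_plog(float v) {
  int e;
  float m = std::frexp(v, &e);                   // v = m * 2^e, m in [0.5, 1)
  if (m < 0.707106781f) { e -= 1; m += m; }      // keep m in [sqrt(1/2), sqrt(2))
  float x = m - 1.0f;                            // polynomial argument near 0
  float p = x - 0.5f*x*x + x*x*x/3.0f;           // low-order stand-in polynomial
  return p + float(e) * 0.693147182f;            // add e * ln(2)
}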
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet4f pexp<Packet4f>(const Packet4f& _x)
{
  Packet4f x = _x;

  Packet4f tmp, fx;
  Packet4i emm0;

  // clamp x
  x = pmax(pmin(x, p4f_exp_hi), p4f_exp_lo);

  // express exp(x) as exp(g + n*log(2))
  fx = pmadd(x, p4f_cephes_LOG2EF, p4f_half);

  fx = pfloor(fx);

  tmp = pmul(fx, p4f_cephes_exp_C1);
  Packet4f z = pmul(fx, p4f_cephes_exp_C2);
  x = psub(x, tmp);
  x = psub(x, z);

  z = pmul(x,x);

  Packet4f y = p4f_cephes_exp_p0;
  y = pmadd(y, x, p4f_cephes_exp_p1);
  y = pmadd(y, x, p4f_cephes_exp_p2);
  y = pmadd(y, x, p4f_cephes_exp_p3);
  y = pmadd(y, x, p4f_cephes_exp_p4);
  y = pmadd(y, x, p4f_cephes_exp_p5);
  y = pmadd(y, z, x);
  y = padd(y, p4f_1);

  // build 2^n
  emm0 = vec_cts(fx, 0);
  emm0 = vec_add(emm0, p4i_0x7f);
  emm0 = vec_sl(emm0, reinterpret_cast<Packet4ui>(p4i_23));

  // Altivec's max & min operators just drop silent NaNs. Check NaNs in
  // inputs and return them unmodified.
  Packet4ui isnumber_mask = reinterpret_cast<Packet4ui>(vec_cmpeq(_x, _x));
  return vec_sel(_x, pmax(pmul(y, reinterpret_cast<Packet4f>(emm0)), _x),
                 isnumber_mask);
}
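The structure of pexp is the mirror image: exp(x) = 2^n * exp(g) with n = round(x * log2(e)) and g = x - n*ln(2), where ln(2) is subtracted in two pieces (the C1 and C2 constants) to keep g accurate. A scalar sketch, plain C++ (exp_like_pexp is illustrative; the cubic is a stand-in for the degree-5 cephes polynomial):

#include <cmath>

float exp_like_pexp(float x) {
  float n = std::floor(x * 1.44269504f + 0.5f);  // n = round(x * log2(e))
  float g = x - n * 0.693359375f;                // subtract ln(2) in two pieces
  g -= n * -2.12194440e-4f;                      // (cephes C1 + C2 == ln 2)
  float eg = 1.0f + g + 0.5f*g*g + g*g*g/6.0f;   // low-order stand-in polynomial
  return std::ldexp(eg, static_cast<int>(n));    // times 2^n
}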
#ifndef EIGEN_COMP_CLANG
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet4f prsqrt<Packet4f>(const Packet4f& x)
{
  return  vec_rsqrt(x);
}
#endif

#ifdef __VSX__
#ifndef EIGEN_COMP_CLANG
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet2d prsqrt<Packet2d>(const Packet2d& x)
{
  return  vec_rsqrt(x);
}
#endif

template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet4f psqrt<Packet4f>(const Packet4f& x)
{
  return  vec_sqrt(x);
}

template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet2d psqrt<Packet2d>(const Packet2d& x)
{
  return  vec_sqrt(x);
}
// VSX support varies between different compilers and even different
// versions of the same compiler.  For gcc version >= 4.9.3, we can use
// vec_cts to efficiently convert Packet2d to Packet2l.  Otherwise, use
// a slow version that works with older compilers.
// Update: apparently vec_cts/vec_ctf intrinsics for 64-bit doubles
// are buggy, https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70963
static inline Packet2l ConvertToPacket2l(const Packet2d& x) {
#if EIGEN_GNUC_AT_LEAST(5, 4) || \
    (EIGEN_GNUC_AT(6, 1) && __GNUC_PATCHLEVEL__ >= 1)
  return vec_cts(x, 0);
  // TODO: check clang version.
#else
  double tmp[2];
  memcpy(tmp, &x, sizeof(tmp));
  Packet2l l = { static_cast<long long>(tmp[0]),
                 static_cast<long long>(tmp[1]) };
  return l;
#endif
}
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet2d pexp<Packet2d>(const Packet2d& _x)
{
  Packet2d x = _x;

  Packet2d tmp, fx;
  Packet2l emm0;

  // clamp x
  x = pmax(pmin(x, p2d_exp_hi), p2d_exp_lo);

  /* express exp(x) as exp(g + n*log(2)) */
  fx = pmadd(x, p2d_cephes_LOG2EF, p2d_half);

  fx = pfloor(fx);

  tmp = pmul(fx, p2d_cephes_exp_C1);
  Packet2d z = pmul(fx, p2d_cephes_exp_C2);
  x = psub(x, tmp);
  x = psub(x, z);

  Packet2d x2 = pmul(x,x);

  Packet2d px = p2d_cephes_exp_p0;
  px = pmadd(px, x2, p2d_cephes_exp_p1);
  px = pmadd(px, x2, p2d_cephes_exp_p2);
  px = pmul (px, x);

  Packet2d qx = p2d_cephes_exp_q0;
  qx = pmadd(qx, x2, p2d_cephes_exp_q1);
  qx = pmadd(qx, x2, p2d_cephes_exp_q2);
  qx = pmadd(qx, x2, p2d_cephes_exp_q3);

  x = pdiv(px, psub(qx, px));
  x = pmadd(p2d_2, x, p2d_1);

  // build 2^n
  emm0 = ConvertToPacket2l(fx);

#ifdef __POWER8_VECTOR__
  emm0 = vec_add(emm0, p2l_1023);
  emm0 = vec_sl(emm0, p2ul_52);
#else
  // Code is a bit complex for POWER7.  There is actually a
  // vec_xxsldi intrinsic but it is not supported by some gcc versions.
  // So we shift (52-32) bits and do a word swap with zeros.
  _EIGEN_DECLARE_CONST_Packet4i(1023, 1023);
  _EIGEN_DECLARE_CONST_Packet4i(20, 20);    // 52 - 32

  Packet4i emm04i = reinterpret_cast<Packet4i>(emm0);
  emm04i = vec_add(emm04i, p4i_1023);
  emm04i = vec_sl(emm04i, reinterpret_cast<Packet4ui>(p4i_20));
  static const Packet16uc perm = {
    0x14, 0x15, 0x16, 0x17, 0x00, 0x01, 0x02, 0x03,
    0x1c, 0x1d, 0x1e, 0x1f, 0x08, 0x09, 0x0a, 0x0b };
#ifdef _BIG_ENDIAN
  emm0 = reinterpret_cast<Packet2l>(vec_perm(p4i_ZERO, emm04i, perm));
#else
  emm0 = reinterpret_cast<Packet2l>(vec_perm(emm04i, p4i_ZERO, perm));
#endif

#endif

  // Altivec's max & min operators just drop silent NaNs. Check NaNs in
  // inputs and return them unmodified.
  Packet2ul isnumber_mask = reinterpret_cast<Packet2ul>(vec_cmpeq(_x, _x));
  return vec_sel(_x, pmax(pmul(x, reinterpret_cast<Packet2d>(emm0)), _x),
                 isnumber_mask);
}
#endif
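The "build 2^n" step in both pexp kernels relies on the IEEE-754 layout: a double equal to 2^n has a zero mantissa and a biased exponent of n + 1023, so adding the bias and shifting left by 52 bits writes the value straight into the exponent field. A plain-C++ sketch (two_to_n is an illustrative name and assumes n stays in the normal range):

#include <cstdint>
#include <cstring>

double two_to_n(long long n) {  // assumes -1022 <= n <= 1023
  std::uint64_t bits = static_cast<std::uint64_t>(n + 1023) << 52;
  double result;
  std::memcpy(&result, &bits, sizeof result);
  return result;
}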
} // end namespace internal
} // end namespace Eigen

#endif // EIGEN_MATH_FUNCTIONS_ALTIVEC_H
external/eigen3/Eigen/src/Core/arch/AltiVec/PacketMath.h
100644 → 100755
View file @ a394b22a
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008 Konstantinos Margaritis <markos@codex.gr>
// Copyright (C) 2008-2016 Konstantinos Margaritis <markos@freevec.org>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
...
@@ -18,13 +18,17 @@ namespace internal {
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 4
#endif

#ifndef EIGEN_HAS_FUSE_CJMADD
#define EIGEN_HAS_FUSE_CJMADD 1
#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#endif
#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
#endif

// NOTE Altivec has 32 registers, but Eigen only accepts a value of 8 or 16
#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
#endif

typedef __vector float          Packet4f;
...
@@ -38,7 +42,7 @@ typedef __vector unsigned char Packet16uc;
// and it doesn't really work to declare them global, so we define macros instead
#define _EIGEN_DECLARE_CONST_FAST_Packet4f(NAME,X) \
  Packet4f p4f_##NAME = (Packet4f) vec_splat_s32(X)
  Packet4f p4f_##NAME = reinterpret_cast<Packet4f>(vec_splat_s32(X))

#define _EIGEN_DECLARE_CONST_FAST_Packet4i(NAME,X) \
  Packet4i p4i_##NAME = vec_splat_s32(X)
...
@@ -46,60 +50,158 @@ typedef __vector unsigned char Packet16uc;
#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
  Packet4f p4f_##NAME = pset1<Packet4f>(X)

#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
  Packet4f p4f_##NAME = vreinterpretq_f32_u32(pset1<int>(X))

#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
  Packet4i p4i_##NAME = pset1<Packet4i>(X)

#define _EIGEN_DECLARE_CONST_Packet2d(NAME,X) \
  Packet2d p2d_##NAME = pset1<Packet2d>(X)

#define _EIGEN_DECLARE_CONST_Packet2l(NAME,X) \
  Packet2l p2l_##NAME = pset1<Packet2l>(X)

#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
  const Packet4f p4f_##NAME = reinterpret_cast<Packet4f>(pset1<Packet4i>(X))

#define DST_CHAN 1
#define DST_CTRL(size, count, stride) (((size) << 24) | ((count) << 16) | (stride))
// These constants are endian-agnostic
static _EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0);      //{ 0.0, 0.0, 0.0, 0.0}
static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0);      //{ 0, 0, 0, 0,}
static _EIGEN_DECLARE_CONST_FAST_Packet4i(ONE, 1);       //{ 1, 1, 1, 1}
static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS16, -16); //{ -16, -16, -16, -16}
static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1, -1);   //{ -1, -1, -1, -1}
static Packet4f p4f_MZERO = (Packet4f) vec_sl((Packet4ui) p4i_MINUS1, (Packet4ui) p4i_MINUS1); //{ 0x80000000, 0x80000000, 0x80000000, 0x80000000}
#ifndef __VSX__
static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0); //{ 1.0, 1.0, 1.0, 1.0}
#endif

static Packet4f  p4f_COUNTDOWN = { 0.0, 1.0, 2.0, 3.0 };
static Packet4i  p4i_COUNTDOWN = { 0, 1, 2, 3 };

static Packet16uc p16uc_REVERSE32 = { 12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3 };
static Packet16uc p16uc_DUPLICATE32_HI = { 0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7 };

// Mask alignment
#ifdef __PPC64__
#define _EIGEN_MASK_ALIGNMENT	0xfffffffffffffff0
#else
#define _EIGEN_MASK_ALIGNMENT	0xfffffff0
#endif

#define _EIGEN_ALIGNED_PTR(x)	((std::ptrdiff_t)(x) & _EIGEN_MASK_ALIGNMENT)

// Handle endianness properly while loading constants
// Define global static constants:
static Packet4f  p4f_COUNTDOWN = { 3.0, 2.0, 1.0, 0.0 };
static Packet4i  p4i_COUNTDOWN = { 3, 2, 1, 0 };
static Packet16uc p16uc_REVERSE = { 12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3 };
#ifdef _BIG_ENDIAN
static Packet16uc p16uc_FORWARD = vec_lvsl(0, (float*)0);
static Packet16uc p16uc_DUPLICATE = { 0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7 };
static _EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0);
static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0);
static _EIGEN_DECLARE_CONST_FAST_Packet4i(ONE, 1);
static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS16, -16);
static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1, -1);
static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0);
static Packet4f p4f_ZERO_ = (Packet4f) vec_sl((Packet4ui) p4i_MINUS1, (Packet4ui) p4i_MINUS1);
#ifdef __VSX__
static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
#endif
static Packet16uc p16uc_PSET32_WODD   = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8); //{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
static Packet16uc p16uc_PSET32_WEVEN  = vec_sld(p16uc_DUPLICATE32_HI, (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8); //{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
static Packet16uc p16uc_HALF64_0_16   = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 3), 8); //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};
#else
static Packet16uc p16uc_FORWARD = p16uc_REVERSE32;
static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
static Packet16uc p16uc_PSET32_WODD  = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 1), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8); //{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
static Packet16uc p16uc_PSET32_WEVEN = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8); //{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
static Packet16uc p16uc_HALF64_0_16  = vec_sld(vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 0), (Packet16uc)p4i_ZERO, 8); //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};
#endif // _BIG_ENDIAN
static Packet16uc p16uc_PSET64_HI = (Packet16uc) vec_mergeh((Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN); //{ 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 };
static Packet16uc p16uc_PSET64_LO = (Packet16uc) vec_mergel((Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN); //{ 8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15 };
static Packet16uc p16uc_TRANSPOSE64_HI = p16uc_PSET64_HI + p16uc_HALF64_0_16; //{ 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
static Packet16uc p16uc_TRANSPOSE64_LO = p16uc_PSET64_LO + p16uc_HALF64_0_16; //{ 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31};

static Packet16uc p16uc_COMPLEX32_REV = vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8); //{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 };

#ifdef _BIG_ENDIAN
static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8); //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
#else
static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_PSET64_HI, p16uc_PSET64_LO, 8); //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
#endif // _BIG_ENDIAN

#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
#define EIGEN_PPC_PREFETCH(ADDR) __builtin_prefetch(ADDR);
#else
#define EIGEN_PPC_PREFETCH(ADDR) asm( "   dcbt [%[addr]]\n" :: [addr] "r" (ADDR) : "cc" );
#endif
template<> struct packet_traits<float>  : default_packet_traits
{
  typedef Packet4f type;
  typedef Packet4f half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 4,
    // FIXME check the Has*
    HasHalfPacket = 1,

    HasAdd  = 1,
    HasSub  = 1,
    HasMul  = 1,
    HasDiv  = 1,
    HasMin  = 1,
    HasMax  = 1,
    HasAbs  = 1,
    HasSin  = 0,
    HasCos  = 0,
    HasLog  = 0,
    HasExp  = 0,
    HasSqrt = 0
    HasExp  = 1,
#ifdef __VSX__
    HasSqrt = 1,
#if !EIGEN_COMP_CLANG
    HasRsqrt = 1,
#else
    HasRsqrt = 0,
#endif
#else
    HasSqrt = 0,
    HasRsqrt = 0,
#endif
    HasRound = 1,
    HasFloor = 1,
    HasCeil = 1,
    HasNegate = 1,
    HasBlend = 1
  };
};
template<> struct packet_traits<int>    : default_packet_traits
{
  typedef Packet4i type;
  typedef Packet4i half;
  enum {
    // FIXME check the Has*
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 4
    size = 4,
    HasHalfPacket = 0,

    HasAdd  = 1,
    HasSub  = 1,
    HasMul  = 1,
    HasDiv  = 0,
    HasBlend = 1
  };
};
template<> struct unpacket_traits<Packet4f> { typedef float  type; enum {size=4}; };
template<> struct unpacket_traits<Packet4i> { typedef int    type; enum {size=4}; };
/*
template<> struct unpacket_traits<Packet4f> { typedef float  type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; };
template<> struct unpacket_traits<Packet4i> { typedef int    type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; };
inline std::ostream & operator <<(std::ostream & s, const Packet16uc & v)
{
  union {
    Packet16uc   v;
    unsigned char n[16];
  } vt;
  vt.v = v;
  for (int i=0; i< 16; i++)
    s << (int)vt.n[i] << ", ";
  return s;
}

inline std::ostream & operator <<(std::ostream & s, const Packet4f & v)
{
  union {
...
@@ -133,89 +235,136 @@ inline std::ostream & operator <<(std::ostream & s, const Packet4ui & v)
  return s;
}

inline std::ostream & operator <<(std::ostream & s, const Packetbi & v)
// Need to define them first or we get specialization after instantiation errors
template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from)
{
  union {
    Packet4bi v;
    unsigned int n[4];
  } vt;
  vt.v = v;
  s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
  return s;
}
*/
template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float&  from) {
  // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
  float EIGEN_ALIGN16 af[4];
  af[0] = from;
  Packet4f vc = vec_ld(0, af);
  vc = vec_splat(vc, 0);
  return vc;
  EIGEN_DEBUG_ALIGNED_LOAD
#ifdef __VSX__
  return vec_vsx_ld(0, from);
#else
  return vec_ld(0, from);
#endif
}

template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int&    from)   {
  int EIGEN_ALIGN16 ai[4];
  ai[0] = from;
  Packet4i vc = vec_ld(0, ai);
  vc = vec_splat(vc, 0);
  return vc;
template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from)
{
  EIGEN_DEBUG_ALIGNED_LOAD
#ifdef __VSX__
  return vec_vsx_ld(0, from);
#else
  return vec_ld(0, from);
#endif
}
template<> EIGEN_STRONG_INLINE Packet4f plset<float>(const float& a) { return vec_add(pset1<Packet4f>(a), p4f_COUNTDOWN); }
template<> EIGEN_STRONG_INLINE Packet4i plset<int>(const int& a)     { return vec_add(pset1<Packet4i>(a), p4i_COUNTDOWN); }

template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_add(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_add(a,b); }
template<> EIGEN_STRONG_INLINE void pstore<float>(float*   to, const Packet4f& from)
{
  EIGEN_DEBUG_ALIGNED_STORE
#ifdef __VSX__
  vec_vsx_st(from, 0, to);
#else
  vec_st(from, 0, to);
#endif
}

template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_sub(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_sub(a,b); }

template<> EIGEN_STRONG_INLINE void pstore<int>(int*       to, const Packet4i& from)
{
  EIGEN_DEBUG_ALIGNED_STORE
#ifdef __VSX__
  vec_vsx_st(from, 0, to);
#else
  vec_st(from, 0, to);
#endif
}
template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return psub<Packet4f>(p4f_ZERO, a); }
template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return psub<Packet4i>(p4i_ZERO, a); }

template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float&  from) {
  Packet4f v = {from, from, from, from};
  return v;
}

template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }

template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int&    from)   {
  Packet4i v = {from, from, from, from};
  return v;
}
template<> EIGEN_STRONG_INLINE void
pbroadcast4<Packet4f>(const float *a,
                      Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
{
  a3 = pload<Packet4f>(a);
  a0 = vec_splat(a3, 0);
  a1 = vec_splat(a3, 1);
  a2 = vec_splat(a3, 2);
  a3 = vec_splat(a3, 3);
}
template<> EIGEN_STRONG_INLINE void
pbroadcast4<Packet4i>(const int *a,
                      Packet4i& a0, Packet4i& a1, Packet4i& a2, Packet4i& a3)
{
  a3 = pload<Packet4i>(a);
  a0 = vec_splat(a3, 0);
  a1 = vec_splat(a3, 1);
  a2 = vec_splat(a3, 2);
  a3 = vec_splat(a3, 3);
}
template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_madd(a,b, p4f_ZERO); }

/* Commented out: it's actually slower than processing it scalar
*
template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b)
template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
{
  float EIGEN_ALIGN16 af[4];
  af[0] = from[0*stride];
  af[1] = from[1*stride];
  af[2] = from[2*stride];
  af[3] = from[3*stride];
  return pload<Packet4f>(af);
}
template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride)
{
  int EIGEN_ALIGN16 ai[4];
  ai[0] = from[0*stride];
  ai[1] = from[1*stride];
  ai[2] = from[2*stride];
  ai[3] = from[3*stride];
  return pload<Packet4i>(ai);
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
{
  // Detailed in: http://freevec.org/content/32bit_signed_integer_multiplication_altivec
  //Set up constants, variables
  Packet4i a1, b1, bswap, low_prod, high_prod, prod, prod_, v1sel;
  float EIGEN_ALIGN16 af[4];
  pstore<float>(af, from);
  to[0*stride] = af[0];
  to[1*stride] = af[1];
  to[2*stride] = af[2];
  to[3*stride] = af[3];
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride)
{
  int EIGEN_ALIGN16 ai[4];
  pstore<int>((int *)ai, from);
  to[0*stride] = ai[0];
  to[1*stride] = ai[1];
  to[2*stride] = ai[2];
  to[3*stride] = ai[3];
}
// Get the absolute values
a1  = vec_abs(a);
b1  = vec_abs(b);

template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) { return pset1<Packet4f>(a) + p4f_COUNTDOWN; }
template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a)   { return pset1<Packet4i>(a) + p4i_COUNTDOWN; }

// Get the signs using xor
Packet4bi sgn = (Packet4bi) vec_cmplt(vec_xor(a, b), p4i_ZERO);

template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return a + b; }
template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return a + b; }

// Do the multiplication for the absolute values.
bswap = (Packet4i) vec_rl((Packet4ui) b1, (Packet4ui) p4i_MINUS16 );
low_prod = vec_mulo((Packet8i) a1, (Packet8i)b1);
high_prod = vec_msum((Packet8i) a1, (Packet8i) bswap, p4i_ZERO);
high_prod = (Packet4i) vec_sl((Packet4ui) high_prod, (Packet4ui) p4i_MINUS16);
prod = vec_add( low_prod, high_prod );

template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return a - b; }
template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return a - b; }

// NOR the product and select only the negative elements according to the sign mask
prod_ = vec_nor(prod, prod);
prod_ = vec_sel(p4i_ZERO, prod_, sgn);

template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return p4f_ZERO - a; }
template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return p4i_ZERO - a; }

// Add 1 to the result to get the negative numbers
v1sel = vec_sel(p4i_ZERO, p4i_ONE, sgn);
prod_ = vec_add(prod_, v1sel);

template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }

// Merge the results back to the final vector.
prod = vec_sel(prod, prod_, sgn);

template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_madd(a,b, p4f_MZERO); }
template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) { return a * b; }

return prod;
}
*/
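For reference, the commented-out kernel above (whose lines the diff interleaves with the new plset/padd/psub/pnegate/pconj/pmul definitions) multiplies 32-bit integers out of 16-bit pieces: vec_mulo produces the low-half products, vec_msum with a 16-bit-rotated copy produces the cross terms, and a final shift-and-add combines them, with signs handled separately through the sign mask. The underlying arithmetic identity, as a plain-C++ sketch (mul32_via_16bit_halves is an illustrative name):

#include <cstdint>

std::uint32_t mul32_via_16bit_halves(std::uint32_t a, std::uint32_t b) {
  std::uint32_t bl = b & 0xFFFFu;            // low 16 bits (vec_mulo path)
  std::uint32_t bh = b >> 16;                // high 16 bits (vec_msum on rotated b)
  std::uint32_t low_prod  = a * bl;
  std::uint32_t high_prod = (a * bh) << 16;  // the final vec_sl by 16
  return low_prod + high_prod;               // vec_add(low_prod, high_prod)
}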
template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
{
  Packet4f t, y_0, y_1, res;
#ifndef __VSX__  // VSX actually provides a div instruction
  Packet4f t, y_0, y_1;

  // Altivec does not offer a divide instruction, we have to do a reciprocal approximation
  y_0 = vec_re(b);
...
@@ -224,8 +373,10 @@ template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const
  t   = vec_nmsub(y_0, b, p4f_ONE);
  y_1 = vec_madd(y_0, t, y_0);

  res = vec_madd(a, y_1, p4f_ZERO);
  return res;
  return vec_madd(a, y_1, p4f_MZERO);
#else
  return vec_div(a, b);
#endif
}
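The non-VSX branch refines the hardware's coarse reciprocal estimate with one Newton-Raphson step, y1 = y0 + y0*(1 - b*y0), which roughly doubles the number of correct bits; vec_nmsub computes the residual and vec_madd folds it back in. A scalar sketch (plain C++, illustrative names):

float refined_reciprocal(float b, float y0) {  // y0: coarse estimate of 1/b
  float t  = 1.0f - b * y0;  // vec_nmsub(y_0, b, p4f_ONE): the residual
  float y1 = y0 + y0 * t;    // vec_madd(y_0, t, y_0): fold it back in
  return y1;                 // error is now roughly squared
}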
template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/)
...
@@ -234,8 +385,8 @@ template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, co
}
// for some weird reasons, it has to be overloaded for packet of integers
template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_madd(a, b, c); }
template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a,b), c); }
template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_madd(a,b,c); }
template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return a*b + c; }

template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_min(a, b); }
template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); }
...
@@ -243,7 +394,6 @@ template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const
template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_max(a, b); }
template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); }

// Logical Operations are not supported for float, so we have to reinterpret casts using NEON intrinsics
template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, b); }
template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, b); }
...
@@ -256,13 +406,14 @@ template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const
template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, vec_nor(b, b)); }
template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, vec_nor(b, b)); }

template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vec_ld(0, from); }
template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int*   from) { EIGEN_DEBUG_ALIGNED_LOAD return vec_ld(0, from); }

template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) { return vec_round(a); }
template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a)  { return vec_ceil(a); }
template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) { return vec_floor(a); }
#ifdef _BIG_ENDIAN
template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
{
  EIGEN_DEBUG_ALIGNED_LOAD
  // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
  Packet16uc MSQ, LSQ;
  Packet16uc mask;
  MSQ = vec_ld(0, (unsigned char *)from);          // most significant quadword
...
@@ -282,25 +433,36 @@ template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
  mask = vec_lvsl(0, from);                        // create the permute mask
  return (Packet4i) vec_perm(MSQ, LSQ, mask);      // align the data
}
#else
// We also need to redefine little endian loading of Packet4i/Packet4f using VSX
template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
{
  EIGEN_DEBUG_UNALIGNED_LOAD
  return (Packet4i) vec_vsx_ld((long)from & 15, (const int*) _EIGEN_ALIGNED_PTR(from));
}
template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
{
  EIGEN_DEBUG_UNALIGNED_LOAD
  return (Packet4f) vec_vsx_ld((long)from & 15, (const float*) _EIGEN_ALIGNED_PTR(from));
}
#endif
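Classic AltiVec only has 16-byte-aligned loads, so the big-endian ploadu above reads the two aligned quadwords straddling the address and splices out the wanted 16 bytes with a vec_perm mask obtained from vec_lvsl. A plain-C++ sketch of the same idea (unaligned_load16 is an illustrative name; memcpy stands in for the aligned vector loads and the permute):

#include <cstdint>
#include <cstring>

void unaligned_load16(const unsigned char* from, unsigned char out[16]) {
  std::uintptr_t addr = reinterpret_cast<std::uintptr_t>(from);
  std::uintptr_t msq  = addr & ~std::uintptr_t(15);        // vec_ld(0, from)
  std::uintptr_t lsq  = (addr + 15) & ~std::uintptr_t(15); // vec_ld(15, from)
  unsigned offset     = static_cast<unsigned>(addr & 15);  // what vec_lvsl encodes
  unsigned char quads[32];
  std::memcpy(quads,      reinterpret_cast<const void*>(msq), 16);
  std::memcpy(quads + 16, reinterpret_cast<const void*>(lsq), 16);
  std::memcpy(out, quads + offset, 16);                    // the vec_perm splice
}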
template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float*   from)
{
  Packet4f p;
  if((ptrdiff_t(&from) % 16) == 0)  p = pload<Packet4f>(from);
  else                              p = ploadu<Packet4f>(from);
  return vec_perm(p, p, p16uc_DUPLICATE);
  if((std::ptrdiff_t(from) % 16) == 0)  p = pload<Packet4f>(from);
  else                                  p = ploadu<Packet4f>(from);
  return vec_perm(p, p, p16uc_DUPLICATE32_HI);
}
template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int*     from)
{
  Packet4i p;
  if((ptrdiff_t(&from) % 16) == 0)  p = pload<Packet4i>(from);
  else                              p = ploadu<Packet4i>(from);
  return vec_perm(p, p, p16uc_DUPLICATE);
  if((std::ptrdiff_t(from) % 16) == 0)  p = pload<Packet4i>(from);
  else                                  p = ploadu<Packet4i>(from);
  return vec_perm(p, p, p16uc_DUPLICATE32_HI);
}
template<> EIGEN_STRONG_INLINE void pstore<float>(float*   to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE vec_st(from, 0, to); }
template<> EIGEN_STRONG_INLINE void pstore<int>(int*       to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE vec_st(from, 0, to); }

#ifdef _BIG_ENDIAN
template<> EIGEN_STRONG_INLINE void pstoreu<float>(float*  to, const Packet4f& from)
{
  EIGEN_DEBUG_UNALIGNED_STORE
...
@@ -337,15 +499,33 @@ template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& f
  vec_st( LSQ, 15, (unsigned char *)to );   // Store the LSQ part first
  vec_st( MSQ, 0, (unsigned char *)to );    // Store the MSQ part
}
#else
// We also need to redefine little endian loading of Packet4i/Packet4f using VSX
template<> EIGEN_STRONG_INLINE void pstoreu<int>(int*       to, const Packet4i& from)
{
  EIGEN_DEBUG_ALIGNED_STORE
  vec_vsx_st(from, (long)to & 15, (int*) _EIGEN_ALIGNED_PTR(to));
}
template<> EIGEN_STRONG_INLINE void pstoreu<float>(float*   to, const Packet4f& from)
{
  EIGEN_DEBUG_ALIGNED_STORE
  vec_vsx_st(from, (long)to & 15, (float*) _EIGEN_ALIGNED_PTR(to));
}
#endif
template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { vec_dstt(addr, DST_CTRL(2,2,32), DST_CHAN); }
template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*     addr) { vec_dstt(addr, DST_CTRL(2,2,32), DST_CHAN); }
template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { EIGEN_PPC_PREFETCH(addr); }
template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*     addr) { EIGEN_PPC_PREFETCH(addr); }
template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { float  EIGEN_ALIGN16 x[4]; vec_st(a, 0, x); return x[0]; }
template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { int    EIGEN_ALIGN16 x[4]; vec_st(a, 0, x); return x[0]; }
template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { float  EIGEN_ALIGN16 x; vec_ste(a, 0, &x); return x; }
template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { int    EIGEN_ALIGN16 x; vec_ste(a, 0, &x); return x; }
template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) { return (Packet4f)vec_perm((Packet16uc)a,(Packet16uc)a, p16uc_REVERSE); }
template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) { return (Packet4i)vec_perm((Packet16uc)a,(Packet16uc)a, p16uc_REVERSE); }
template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
{
  return reinterpret_cast<Packet4f>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32));
}
template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a)
{
  return reinterpret_cast<Packet4i>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32));
}

template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vec_abs(a); }
template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vec_abs(a); }
...
@@ -353,10 +533,10 @@ template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vec_abs
template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
{
  Packet4f b, sum;
  b   = (Packet4f) vec_sld(a, a, 8);
  sum = vec_add(a, b);
  b   = (Packet4f) vec_sld(sum, sum, 4);
  sum = vec_add(sum, b);
  b   = vec_sld(a, a, 8);
  sum = a + b;
  b   = vec_sld(sum, sum, 4);
  sum += b;
  return pfirst(sum);
}
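predux folds the register onto itself in log2(n) steps: shift by 8 bytes and add, then by 4 bytes and add, leaving the total in every lane before pfirst extracts lane 0. The same dataflow as a scalar sketch (predux_like is an illustrative name):

float predux_like(const float a[4]) {
  // fold upper half onto lower half (vec_sld by 8 bytes, then add)
  float s0 = a[0] + a[2], s1 = a[1] + a[3];
  // fold again (vec_sld by 4 bytes, then add); lane 0 now holds the total
  return s0 + s1;
}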
...
@@ -379,11 +559,11 @@ template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
  // Now do the summation:
  // Lines 0+1
  sum[0] = vec_add(sum[0], sum[1]);
  sum[0] = sum[0] + sum[1];
  // Lines 2+3
  sum[1] = vec_add(sum[2], sum[3]);
  sum[1] = sum[2] + sum[3];
  // Add the results
  sum[0] = vec_add(sum[0], sum[1]);
  sum[0] = sum[0] + sum[1];

  return sum[0];
}
...
@@ -392,7 +572,11 @@ template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
{
  Packet4i sum;
  sum = vec_sums(a, p4i_ZERO);
#ifdef _BIG_ENDIAN
  sum = vec_sld(sum, p4i_ZERO, 12);
#else
  sum = vec_sld(p4i_ZERO, sum, 4);
#endif
  return pfirst(sum);
}
...
@@ -415,11 +599,11 @@ template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
  // Now do the summation:
  // Lines 0+1
  sum[0] = vec_add(sum[0], sum[1]);
  sum[0] = sum[0] + sum[1];
  // Lines 2+3
  sum[1] = vec_add(sum[2], sum[3]);
  sum[1] = sum[2] + sum[3];
  // Add the results
  sum[0] = vec_add(sum[0], sum[1]);
  sum[0] = sum[0] + sum[1];

  return sum[0];
}
...
...
@@ -429,8 +613,8 @@ template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
{
  Packet4f prod;
-  prod = pmul(a, (Packet4f)vec_sld(a, a, 8));
-  return pfirst(pmul(prod, (Packet4f)vec_sld(prod, prod, 4)));
+  prod = pmul(a, vec_sld(a, a, 8));
+  return pfirst(pmul(prod, vec_sld(prod, prod, 4)));
}
template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
...
...
@@ -479,8 +663,25 @@ struct palign_impl<Offset,Packet4f>
{
  static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second)
  {
-    if (Offset != 0)
-      first = vec_sld(first, second, Offset*4);
+#ifdef _BIG_ENDIAN
+    switch (Offset % 4) {
+    case 1:
+      first = vec_sld(first, second, 4); break;
+    case 2:
+      first = vec_sld(first, second, 8); break;
+    case 3:
+      first = vec_sld(first, second, 12); break;
+    }
+#else
+    switch (Offset % 4) {
+    case 1:
+      first = vec_sld(second, first, 12); break;
+    case 2:
+      first = vec_sld(second, first, 8); break;
+    case 3:
+      first = vec_sld(second, first, 4); break;
+    }
+#endif
  }
};
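palign selects a 4-lane window sliding across the concatenation of two packets; the endian-specific vec_sld calls above both implement that same window. A scalar sketch in plain C++ (palign4 is a hypothetical illustration helper, not part of Eigen):

#include <cassert>

// Scalar model of palign_impl<Offset, Packet4f>::run: the result is
// lanes [Offset, Offset+4) of the concatenation first|second.
template <int Offset>
void palign4(float first[4], const float second[4]) {
  float both[8];
  for (int i = 0; i < 4; ++i) { both[i] = first[i]; both[4 + i] = second[i]; }
  for (int i = 0; i < 4; ++i) first[i] = both[Offset + i];
}

int main() {
  float f[4] = {0.f, 1.f, 2.f, 3.f};
  const float s[4] = {4.f, 5.f, 6.f, 7.f};
  palign4<1>(f, s);                    // window slides one lane into `second`
  assert(f[0] == 1.f && f[3] == 4.f);
}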
...
...
@@ -489,11 +690,342 @@ struct palign_impl<Offset,Packet4i>
{
  static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second)
  {
-    if (Offset != 0)
-      first = vec_sld(first, second, Offset*4);
+#ifdef _BIG_ENDIAN
+    switch (Offset % 4) {
+    case 1:
+      first = vec_sld(first, second, 4); break;
+    case 2:
+      first = vec_sld(first, second, 8); break;
+    case 3:
+      first = vec_sld(first, second, 12); break;
+    }
+#else
+    switch (Offset % 4) {
+    case 1:
+      first = vec_sld(second, first, 12); break;
+    case 2:
+      first = vec_sld(second, first, 8); break;
+    case 3:
+      first = vec_sld(second, first, 4); break;
+    }
+#endif
  }
};
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4f,4>& kernel) {
  Packet4f t0, t1, t2, t3;
  t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
  t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
  t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
  t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
  kernel.packet[0] = vec_mergeh(t0, t2);
  kernel.packet[1] = vec_mergel(t0, t2);
  kernel.packet[2] = vec_mergeh(t1, t3);
  kernel.packet[3] = vec_mergel(t1, t3);
}
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4i,4>& kernel) {
  Packet4i t0, t1, t2, t3;
  t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
  t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
  t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
  t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
  kernel.packet[0] = vec_mergeh(t0, t2);
  kernel.packet[1] = vec_mergel(t0, t2);
  kernel.packet[2] = vec_mergeh(t1, t3);
  kernel.packet[3] = vec_mergel(t1, t3);
}
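Both overloads perform an in-register 4x4 transpose: packet[r] goes in holding row r and comes out holding column r; the vec_mergeh/vec_mergel pairs interleave half-registers to reach that result. A plain-C++ model of the same data movement:

#include <cassert>

int main() {
  // m[r][c] stands in for kernel.packet[r], lane c.
  float m[4][4];
  for (int r = 0; r < 4; ++r)
    for (int c = 0; c < 4; ++c) m[r][c] = float(4 * r + c);

  for (int r = 0; r < 4; ++r)          // plain transpose, same end state as
    for (int c = r + 1; c < 4; ++c) {  // the merge-based in-register version
      float t = m[r][c]; m[r][c] = m[c][r]; m[c][r] = t;
    }
  assert(m[0][1] == 4.f && m[1][0] == 1.f);  // rows now hold columns
}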
template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) {
  Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] };
  Packet4ui mask = reinterpret_cast<Packet4ui>(vec_cmpeq(reinterpret_cast<Packet4ui>(select), reinterpret_cast<Packet4ui>(p4i_ONE)));
  return vec_sel(elsePacket, thenPacket, mask);
}
template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) {
  Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] };
  Packet4ui mask = reinterpret_cast<Packet4ui>(vec_cmpeq(reinterpret_cast<Packet4ui>(select), reinterpret_cast<Packet4ui>(p4i_ONE)));
  return vec_sel(elsePacket, thenPacket, mask);
}
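In both overloads lane i of the result is thenPacket's lane where ifPacket.select[i] == 1 and elsePacket's lane otherwise; the vec_cmpeq/vec_sel pair builds exactly that per-lane mask. A scalar model in plain C++:

#include <cassert>

int main() {
  const unsigned select[4] = {1, 0, 0, 1};  // Selector<4> contents
  const float thenP[4] = {10.f, 20.f, 30.f, 40.f};
  const float elseP[4] = {1.f, 2.f, 3.f, 4.f};
  float out[4];
  for (int i = 0; i < 4; ++i) out[i] = (select[i] == 1) ? thenP[i] : elseP[i];
  assert(out[0] == 10.f && out[1] == 2.f && out[3] == 40.f);
}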
//---------- double ----------
#ifdef __VSX__
typedef __vector double               Packet2d;
typedef __vector unsigned long long   Packet2ul;
typedef __vector long long            Packet2l;
#if EIGEN_COMP_CLANG
typedef Packet2ul                     Packet2bl;
#else
typedef __vector __bool long          Packet2bl;
#endif
static Packet2l  p2l_ONE   = { 1, 1 };
static Packet2l  p2l_ZERO  = reinterpret_cast<Packet2l>(p4i_ZERO);
static Packet2d  p2d_ONE   = { 1.0, 1.0 };
static Packet2d  p2d_ZERO  = reinterpret_cast<Packet2d>(p4f_ZERO);
static Packet2d  p2d_MZERO = { -0.0, -0.0 };

#ifdef _BIG_ENDIAN
static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(p2d_ZERO), reinterpret_cast<Packet4f>(p2d_ONE), 8));
#else
static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(p2d_ONE), reinterpret_cast<Packet4f>(p2d_ZERO), 8));
#endif
template<int index> Packet2d vec_splat_dbl(Packet2d& a);

template<> EIGEN_STRONG_INLINE Packet2d vec_splat_dbl<0>(Packet2d& a) { return reinterpret_cast<Packet2d>(vec_perm(a, a, p16uc_PSET64_HI)); }
template<> EIGEN_STRONG_INLINE Packet2d vec_splat_dbl<1>(Packet2d& a) { return reinterpret_cast<Packet2d>(vec_perm(a, a, p16uc_PSET64_LO)); }
template<> struct packet_traits<double> : default_packet_traits
{
  typedef Packet2d type;
  typedef Packet2d half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 2,
    HasHalfPacket = 1,

    HasAdd    = 1,
    HasSub    = 1,
    HasMul    = 1,
    HasDiv    = 1,
    HasMin    = 1,
    HasMax    = 1,
    HasAbs    = 1,
    HasSin    = 0,
    HasCos    = 0,
    HasLog    = 0,
    HasExp    = 1,
    HasSqrt   = 1,
    HasRsqrt  = 1,
    HasRound  = 1,
    HasFloor  = 1,
    HasCeil   = 1,
    HasNegate = 1,
    HasBlend  = 1
  };
};
template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; };
inline std::ostream & operator <<(std::ostream & s, const Packet2l & v)
{
  union {
    Packet2l v;
    int64_t  n[2];
  } vt;
  vt.v = v;
  s << vt.n[0] << ", " << vt.n[1];
  return s;
}

inline std::ostream & operator <<(std::ostream & s, const Packet2d & v)
{
  union {
    Packet2d v;
    double   n[2];
  } vt;
  vt.v = v;
  s << vt.n[0] << ", " << vt.n[1];
  return s;
}
// Need to define them first or we get specialization after instantiation errors
template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from)
{
  EIGEN_DEBUG_ALIGNED_LOAD
#ifdef __VSX__
  return vec_vsx_ld(0, from);
#else
  return vec_ld(0, from);
#endif
}

template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from)
{
  EIGEN_DEBUG_ALIGNED_STORE
#ifdef __VSX__
  vec_vsx_st(from, 0, to);
#else
  vec_st(from, 0, to);
#endif
}
template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
  Packet2d v = {from, from};
  return v;
}
template<> EIGEN_STRONG_INLINE void
pbroadcast4<Packet2d>(const double *a,
                      Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3)
{
  a1 = pload<Packet2d>(a);
  a0 = vec_splat_dbl<0>(a1);
  a1 = vec_splat_dbl<1>(a1);
  a3 = pload<Packet2d>(a+2);
  a2 = vec_splat_dbl<0>(a3);
  a3 = vec_splat_dbl<1>(a3);
}
template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride)
{
  double EIGEN_ALIGN16 af[2];
  af[0] = from[0*stride];
  af[1] = from[1*stride];
  return pload<Packet2d>(af);
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
{
  double EIGEN_ALIGN16 af[2];
  pstore<double>(af, from);
  to[0*stride] = af[0];
  to[1*stride] = af[1];
}
template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) { return pset1<Packet2d>(a) + p2d_COUNTDOWN; }
template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return a + b; }

template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return a - b; }

template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return p2d_ZERO - a; }

template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }
template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_madd(a, b, p2d_MZERO); }
template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_div(a, b); }
// for some weird reasons, it has to be overloaded for packets of integers
template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_madd(a, b, c); }
template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_min(a, b); }

template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_max(a, b); }
template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, b); }

template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_or(a, b); }

template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_xor(a, b); }

template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, vec_nor(b, b)); }
template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) { return vec_round(a); }
template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a)  { return vec_ceil(a); }
template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { return vec_floor(a); }
template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from)
{
  EIGEN_DEBUG_ALIGNED_LOAD
  return (Packet2d) vec_vsx_ld((long)from & 15, (const double*) _EIGEN_ALIGNED_PTR(from));
}
template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from)
{
  Packet2d p;
  if ((std::ptrdiff_t(from) % 16) == 0)  p = pload<Packet2d>(from);
  else                                   p = ploadu<Packet2d>(from);
  return vec_splat_dbl<0>(p);
}
template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from)
{
  EIGEN_DEBUG_ALIGNED_STORE
  vec_vsx_st((Packet4f)from, (long)to & 15, (float*) _EIGEN_ALIGNED_PTR(to));
}
template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_PPC_PREFETCH(addr); }
template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { double EIGEN_ALIGN16 x[2]; pstore<double>(x, a); return x[0]; }
template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
{
  return reinterpret_cast<Packet2d>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE64));
}
template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vec_abs(a); }
template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
{
  Packet2d b, sum;
  b   = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(a), reinterpret_cast<Packet4f>(a), 8));
  sum = a + b;
  return pfirst<Packet2d>(sum);
}
template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
{
  Packet2d v[2], sum;
  v[0] = vecs[0] + reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(vecs[0]), reinterpret_cast<Packet4f>(vecs[0]), 8));
  v[1] = vecs[1] + reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(vecs[1]), reinterpret_cast<Packet4f>(vecs[1]), 8));

#ifdef _BIG_ENDIAN
  sum = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(v[0]), reinterpret_cast<Packet4f>(v[1]), 8));
#else
  sum = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(v[1]), reinterpret_cast<Packet4f>(v[0]), 8));
#endif

  return sum;
}
// Other reduction functions:
// mul
template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
{
  return pfirst(pmul(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8))));
}

// min
template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a)
{
  return pfirst(pmin(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8))));
}

// max
template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
{
  return pfirst(pmax(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8))));
}
template<int Offset>
struct palign_impl<Offset,Packet2d>
{
  static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second)
  {
    if (Offset == 1)
#ifdef _BIG_ENDIAN
      first = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(first), reinterpret_cast<Packet4ui>(second), 8));
#else
      first = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(second), reinterpret_cast<Packet4ui>(first), 8));
#endif
  }
};
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2d,2>& kernel) {
  Packet2d t0, t1;
  t0 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_HI);
  t1 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_LO);
  kernel.packet[0] = t0;
  kernel.packet[1] = t1;
}
template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) {
  Packet2l select = { ifPacket.select[0], ifPacket.select[1] };
  Packet2bl mask = vec_cmpeq(reinterpret_cast<Packet2d>(select), reinterpret_cast<Packet2d>(p2l_ONE));
  return vec_sel(elsePacket, thenPacket, mask);
}

#endif // __VSX__

} // end namespace internal

} // end namespace Eigen
...
...
external/eigen3/Eigen/src/Core/arch/CMakeLists.txt
deleted
100644 → 0
View file @
701c0225
ADD_SUBDIRECTORY(SSE)
ADD_SUBDIRECTORY(AltiVec)
ADD_SUBDIRECTORY(NEON)
ADD_SUBDIRECTORY(Default)
external/eigen3/Eigen/src/Core/arch/CUDA/Complex.h
0 → 100644
View file @
a394b22a
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_COMPLEX_CUDA_H
#define EIGEN_COMPLEX_CUDA_H
// clang-format off
namespace Eigen {

namespace internal {
#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
// Many std::complex methods such as operator+, operator-, operator* and
// operator/ are not constexpr. Due to this, clang does not treat them as device
// functions and thus Eigen functors making use of these operators fail to
// compile. Here, we manually specialize these functors for complex types when
// building for CUDA to avoid non-constexpr methods.
// Sum
template<typename T> struct scalar_sum_op<const std::complex<T>, const std::complex<T> > : binary_op_base<const std::complex<T>, const std::complex<T> > {
  typedef typename std::complex<T> result_type;

  EIGEN_EMPTY_STRUCT_CTOR(scalar_sum_op)
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
    return std::complex<T>(numext::real(a) + numext::real(b),
                           numext::imag(a) + numext::imag(b));
  }
};

template<typename T> struct scalar_sum_op<std::complex<T>, std::complex<T> > : scalar_sum_op<const std::complex<T>, const std::complex<T> > {};
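The point of the specialization is that the sum is computed from the real and imaginary parts directly, never calling the non-constexpr std::complex operator+. A standalone host-side sketch of the same decomposition (plus_via_parts is a hypothetical stand-in for scalar_sum_op, not part of Eigen):

#include <cassert>
#include <complex>

template <typename T>
std::complex<T> plus_via_parts(const std::complex<T>& a, const std::complex<T>& b) {
  // Mirrors the functor body above: add component-wise via accessors.
  return std::complex<T>(a.real() + b.real(), a.imag() + b.imag());
}

int main() {
  std::complex<float> r = plus_via_parts(std::complex<float>(1.f, 2.f),
                                         std::complex<float>(3.f, 4.f));
  assert(r == std::complex<float>(4.f, 6.f));
}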
// Difference
template<typename T> struct scalar_difference_op<const std::complex<T>, const std::complex<T> > : binary_op_base<const std::complex<T>, const std::complex<T> > {
  typedef typename std::complex<T> result_type;

  EIGEN_EMPTY_STRUCT_CTOR(scalar_difference_op)
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
    return std::complex<T>(numext::real(a) - numext::real(b),
                           numext::imag(a) - numext::imag(b));
  }
};

template<typename T> struct scalar_difference_op<std::complex<T>, std::complex<T> > : scalar_difference_op<const std::complex<T>, const std::complex<T> > {};
// Product
template<typename T> struct scalar_product_op<const std::complex<T>, const std::complex<T> > : binary_op_base<const std::complex<T>, const std::complex<T> > {
  enum {
    Vectorizable = packet_traits<std::complex<T>>::HasMul
  };
  typedef typename std::complex<T> result_type;

  EIGEN_EMPTY_STRUCT_CTOR(scalar_product_op)
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
    const T a_real = numext::real(a);
    const T a_imag = numext::imag(a);
    const T b_real = numext::real(b);
    const T b_imag = numext::imag(b);
    return std::complex<T>(a_real * b_real - a_imag * b_imag,
                           a_real * b_imag + a_imag * b_real);
  }
};

template<typename T> struct scalar_product_op<std::complex<T>, std::complex<T> > : scalar_product_op<const std::complex<T>, const std::complex<T> > {};
// Quotient
template<typename T> struct scalar_quotient_op<const std::complex<T>, const std::complex<T> > : binary_op_base<const std::complex<T>, const std::complex<T> > {
  enum {
    Vectorizable = packet_traits<std::complex<T>>::HasDiv
  };
  typedef typename std::complex<T> result_type;

  EIGEN_EMPTY_STRUCT_CTOR(scalar_quotient_op)
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
    const T a_real = numext::real(a);
    const T a_imag = numext::imag(a);
    const T b_real = numext::real(b);
    const T b_imag = numext::imag(b);
    const T norm = T(1) / (b_real * b_real + b_imag * b_imag);
    return std::complex<T>((a_real * b_real + a_imag * b_imag) * norm,
                           (a_imag * b_real - a_real * b_imag) * norm);
  }
};

template<typename T> struct scalar_quotient_op<std::complex<T>, std::complex<T> > : scalar_quotient_op<const std::complex<T>, const std::complex<T> > {};
#endif

} // end namespace internal

} // end namespace Eigen
#endif // EIGEN_COMPLEX_CUDA_H
external/eigen3/Eigen/src/Core/arch/CUDA/Half.h
0 → 100644
View file @
a394b22a
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
//
// The conversion routines are Copyright (c) Fabian Giesen, 2016.
// The original license follows:
//
// Copyright (c) Fabian Giesen, 2016
// All rights reserved.
// Redistribution and use in source and binary forms, with or without
// modification, are permitted.
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// Standard 16-bit float type, mostly useful for GPUs. Defines a new
// type Eigen::half (inheriting from CUDA's __half struct) with
// operator overloads such that it behaves basically as an arithmetic
// type. It will be quite slow on CPUs (so it is recommended to stay
// in fp32 for CPUs, except for simple parameter conversions, I/O
// to disk and the likes), but fast on GPUs.
#ifndef EIGEN_HALF_CUDA_H
#define EIGEN_HALF_CUDA_H
#if __cplusplus > 199711L
#define EIGEN_EXPLICIT_CAST(tgt_type) explicit operator tgt_type()
#else
#define EIGEN_EXPLICIT_CAST(tgt_type) operator tgt_type()
#endif
namespace Eigen {

struct half;

namespace half_impl {
#if !defined(EIGEN_HAS_CUDA_FP16)
// Make our own __half definition that is similar to CUDA's.
struct __half {
  EIGEN_DEVICE_FUNC __half() {}
  explicit EIGEN_DEVICE_FUNC __half(unsigned short raw) : x(raw) {}
  unsigned short x;
};
#endif
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half raw_uint16_to_half(unsigned short x);
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff);
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h);
struct half_base : public __half {
  EIGEN_DEVICE_FUNC half_base() {}
  EIGEN_DEVICE_FUNC half_base(const half_base& h) : __half(h) {}
  EIGEN_DEVICE_FUNC half_base(const __half& h) : __half(h) {}
};

} // namespace half_impl
// Class definition.
struct half : public half_impl::half_base {
  #if !defined(EIGEN_HAS_CUDA_FP16)
    typedef half_impl::__half __half;
  #endif

  EIGEN_DEVICE_FUNC half() {}

  EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {}
  EIGEN_DEVICE_FUNC half(const half& h) : half_impl::half_base(h) {}

  explicit EIGEN_DEVICE_FUNC half(bool b)
      : half_impl::half_base(half_impl::raw_uint16_to_half(b ? 0x3c00 : 0)) {}
  template<class T>
  explicit EIGEN_DEVICE_FUNC half(const T& val)
      : half_impl::half_base(half_impl::float_to_half_rtne(static_cast<float>(val))) {}
  explicit EIGEN_DEVICE_FUNC half(float f)
      : half_impl::half_base(half_impl::float_to_half_rtne(f)) {}

  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(bool) const {
    // +0.0 and -0.0 become false, everything else becomes true.
    return (x & 0x7fff) != 0;
  }
  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(signed char) const {
    return static_cast<signed char>(half_impl::half_to_float(*this));
  }
  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned char) const {
    return static_cast<unsigned char>(half_impl::half_to_float(*this));
  }
  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(short) const {
    return static_cast<short>(half_impl::half_to_float(*this));
  }
  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned short) const {
    return static_cast<unsigned short>(half_impl::half_to_float(*this));
  }
  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(int) const {
    return static_cast<int>(half_impl::half_to_float(*this));
  }
  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned int) const {
    return static_cast<unsigned int>(half_impl::half_to_float(*this));
  }
  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(long) const {
    return static_cast<long>(half_impl::half_to_float(*this));
  }
  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned long) const {
    return static_cast<unsigned long>(half_impl::half_to_float(*this));
  }
  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(long long) const {
    return static_cast<long long>(half_impl::half_to_float(*this));
  }
  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned long long) const {
    return static_cast<unsigned long long>(half_to_float(*this));
  }
  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(float) const {
    return half_impl::half_to_float(*this);
  }
  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(double) const {
    return static_cast<double>(half_impl::half_to_float(*this));
  }

  EIGEN_DEVICE_FUNC half& operator=(const half& other) {
    x = other.x;
    return *this;
  }
};
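A minimal host-side usage sketch (assuming Eigen's headers are on the include path; on the CPU every operation round-trips through float, so results carry fp16 precision):

#include <cassert>
#include <Eigen/Core>

int main() {
  Eigen::half a(1.5f), b(0.25f);
  Eigen::half c = a + b;                   // converted to float, added, rounded
  assert(static_cast<float>(c) == 1.75f);  // 1.75 is exactly representable
}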
namespace half_impl {
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
// Intrinsics for native fp16 support. Note that on current hardware,
// these are no faster than fp32 arithmetic (you need to use the half2
// versions to get the ALU speed increased), but you do save the
// conversion steps back and forth.
__device__ half operator + (const half& a, const half& b) {
  return __hadd(a, b);
}
__device__ half operator * (const half& a, const half& b) {
  return __hmul(a, b);
}
__device__ half operator - (const half& a, const half& b) {
  return __hsub(a, b);
}
__device__ half operator / (const half& a, const half& b) {
  float num = __half2float(a);
  float denom = __half2float(b);
  return __float2half(num / denom);
}
__device__ half operator - (const half& a) {
  return __hneg(a);
}
__device__ half& operator += (half& a, const half& b) {
  a = a + b;
  return a;
}
__device__ half& operator *= (half& a, const half& b) {
  a = a * b;
  return a;
}
__device__ half& operator -= (half& a, const half& b) {
  a = a - b;
  return a;
}
__device__ half& operator /= (half& a, const half& b) {
  a = a / b;
  return a;
}
__device__ bool operator == (const half& a, const half& b) {
  return __heq(a, b);
}
__device__ bool operator != (const half& a, const half& b) {
  return __hne(a, b);
}
__device__ bool operator < (const half& a, const half& b) {
  return __hlt(a, b);
}
__device__ bool operator <= (const half& a, const half& b) {
  return __hle(a, b);
}
__device__ bool operator > (const half& a, const half& b) {
  return __hgt(a, b);
}
__device__ bool operator >= (const half& a, const half& b) {
  return __hge(a, b);
}
#else // Emulate support for half floats
// Definitions for CPUs and older CUDA, mostly working through conversion
// to/from fp32.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator + (const half& a, const half& b) {
  return half(float(a) + float(b));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator * (const half& a, const half& b) {
  return half(float(a) * float(b));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a, const half& b) {
  return half(float(a) - float(b));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, const half& b) {
  return half(float(a) / float(b));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a) {
  half result;
  result.x = a.x ^ 0x8000;
  return result;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator += (half& a, const half& b) {
  a = half(float(a) + float(b));
  return a;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator *= (half& a, const half& b) {
  a = half(float(a) * float(b));
  return a;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator -= (half& a, const half& b) {
  a = half(float(a) - float(b));
  return a;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator /= (half& a, const half& b) {
  a = half(float(a) / float(b));
  return a;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator == (const half& a, const half& b) {
  return float(a) == float(b);
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator != (const half& a, const half& b) {
  return float(a) != float(b);
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator < (const half& a, const half& b) {
  return float(a) < float(b);
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator <= (const half& a, const half& b) {
  return float(a) <= float(b);
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator > (const half& a, const half& b) {
  return float(a) > float(b);
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator >= (const half& a, const half& b) {
  return float(a) >= float(b);
}
#endif // Emulate support for half floats
// Division by an index. Do it in full float precision to avoid accuracy
// issues in converting the denominator to half.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, Index b) {
  return half(static_cast<float>(a) / static_cast<float>(b));
}
// Conversion routines, including fallbacks for the host or older CUDA.
// Note that newer Intel CPUs (Haswell or newer) have vectorized versions of
// these in hardware. If we need more performance on older/other CPUs, they are
// also possible to vectorize directly.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half raw_uint16_to_half(unsigned short x) {
  __half h;
  h.x = x;
  return h;
}

union FP32 {
  unsigned int u;
  float f;
};
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) {
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
  return __float2half(ff);

#elif defined(EIGEN_HAS_FP16_C)
  __half h;
  h.x = _cvtss_sh(ff, 0);
  return h;

#else
  FP32 f; f.f = ff;

  const FP32 f32infty = { 255 << 23 };
  const FP32 f16max = { (127 + 16) << 23 };
  const FP32 denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 };
  unsigned int sign_mask = 0x80000000u;
  __half o;
  o.x = static_cast<unsigned short>(0x0u);

  unsigned int sign = f.u & sign_mask;
  f.u ^= sign;

  // NOTE all the integer compares in this function can be safely
  // compiled into signed compares since all operands are below
  // 0x80000000. Important if you want fast straight SSE2 code
  // (since there's no unsigned PCMPGTD).

  if (f.u >= f16max.u) {  // result is Inf or NaN (all exponent bits set)
    o.x = (f.u > f32infty.u) ? 0x7e00 : 0x7c00;  // NaN->qNaN and Inf->Inf
  } else {  // (De)normalized number or zero
    if (f.u < (113 << 23)) {  // resulting FP16 is subnormal or zero
      // use a magic value to align our 10 mantissa bits at the bottom of
      // the float. as long as FP addition is round-to-nearest-even this
      // just works.
      f.f += denorm_magic.f;

      // and one integer subtract of the bias later, we have our final float!
      o.x = static_cast<unsigned short>(f.u - denorm_magic.u);
    } else {
      unsigned int mant_odd = (f.u >> 13) & 1;  // resulting mantissa is odd

      // update exponent, rounding bias part 1
      f.u += ((unsigned int)(15 - 127) << 23) + 0xfff;
      // rounding bias part 2
      f.u += mant_odd;
      // take the bits!
      o.x = static_cast<unsigned short>(f.u >> 13);
    }
  }

  o.x |= static_cast<unsigned short>(sign >> 16);
  return o;
#endif
}
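A worked instance of the normal-number path above, for ff = 1.0f (bits 0x3F800000): the exponent rebias and 13-bit shift produce 0x3C00, the fp16 encoding of 1.0. Standalone check in plain C++ (no CUDA or F16C assumed):

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  float ff = 1.0f;
  uint32_t u;
  std::memcpy(&u, &ff, sizeof u);             // u == 0x3F800000, sign bit clear
  uint32_t mant_odd = (u >> 13) & 1;          // 0: no extra round-up needed
  u += ((uint32_t)(15 - 127) << 23) + 0xfff;  // exponent rebias + bias part 1
  u += mant_odd;                              // rounding bias part 2
  assert((unsigned short)(u >> 13) == 0x3c00);
}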
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h) {
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
  return __half2float(h);

#elif defined(EIGEN_HAS_FP16_C)
  return _cvtsh_ss(h.x);

#else
  const FP32 magic = { 113 << 23 };
  const unsigned int shifted_exp = 0x7c00 << 13;  // exponent mask after shift
  FP32 o;

  o.u = (h.x & 0x7fff) << 13;             // exponent/mantissa bits
  unsigned int exp = shifted_exp & o.u;   // just the exponent
  o.u += (127 - 15) << 23;                // exponent adjust

  // handle exponent special cases
  if (exp == shifted_exp) {      // Inf/NaN?
    o.u += (128 - 16) << 23;     // extra exp adjust
  } else if (exp == 0) {         // Zero/Denormal?
    o.u += 1 << 23;              // extra exp adjust
    o.f -= magic.f;              // renormalize
  }

  o.u |= (h.x & 0x8000) << 16;   // sign bit
  return o.f;
#endif
}
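The fallback decode runs the conversion the other way; for h.x = 0x3C00 (fp16 1.0) the 13-bit shift plus 127-15 rebias recover 0x3F800000. Standalone check:

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  unsigned short hx = 0x3C00;
  uint32_t u = (uint32_t)(hx & 0x7fff) << 13;  // exponent/mantissa bits
  u += (uint32_t)(127 - 15) << 23;             // exponent adjust (normal case)
  float f;
  std::memcpy(&f, &u, sizeof f);               // bits are now 0x3F800000
  assert(f == 1.0f);
}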
// --- standard functions ---

EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isinf)(const half& a) {
  return (a.x & 0x7fff) == 0x7c00;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isnan)(const half& a) {
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
  return __hisnan(a);
#else
  return (a.x & 0x7fff) > 0x7c00;
#endif
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isfinite)(const half& a) {
  return !(isinf EIGEN_NOT_A_MACRO (a)) && !(isnan EIGEN_NOT_A_MACRO (a));
}

EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half abs(const half& a) {
  half result;
  result.x = a.x & 0x7FFF;
  return result;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp(const half& a) {
  return half(::expf(float(a)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log(const half& a) {
#if defined(EIGEN_HAS_CUDA_FP16) && defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
  return Eigen::half(::hlog(a));
#else
  return half(::logf(float(a)));
#endif
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log1p(const half& a) {
  return half(numext::log1p(float(a)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log10(const half& a) {
  return half(::log10f(float(a)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sqrt(const half& a) {
  return half(::sqrtf(float(a)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half pow(const half& a, const half& b) {
  return half(::powf(float(a), float(b)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sin(const half& a) {
  return half(::sinf(float(a)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half cos(const half& a) {
  return half(::cosf(float(a)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tan(const half& a) {
  return half(::tanf(float(a)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tanh(const half& a) {
  return half(::tanhf(float(a)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) {
  return half(::floorf(float(a)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) {
  return half(::ceilf(float(a)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) {
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
  return __hlt(b, a) ? b : a;
#else
  const float f1 = static_cast<float>(a);
  const float f2 = static_cast<float>(b);
  return f2 < f1 ? b : a;
#endif
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (max)(const half& a, const half& b) {
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
  return __hlt(a, b) ? b : a;
#else
  const float f1 = static_cast<float>(a);
  const float f2 = static_cast<float>(b);
  return f1 < f2 ? b : a;
#endif
}
EIGEN_ALWAYS_INLINE std::ostream& operator << (std::ostream& os, const half& v) {
  os << static_cast<float>(v);
  return os;
}

} // end namespace half_impl

// import Eigen::half_impl::half into Eigen namespace
// using half_impl::half;
namespace internal {

template<>
struct random_default_impl<half, false, false>
{
  static inline half run(const half& x, const half& y)
  {
    return x + (y - x) * half(float(std::rand()) / float(RAND_MAX));
  }
  static inline half run()
  {
    return run(half(-1.f), half(1.f));
  }
};

template<> struct is_arithmetic<half> { enum { value = true }; };

} // end namespace internal

} // end namespace Eigen
namespace std {
template<>
struct numeric_limits<Eigen::half> {
  static const bool is_specialized = true;
  static const bool is_signed = true;
  static const bool is_integer = false;
  static const bool is_exact = false;
  static const bool has_infinity = true;
  static const bool has_quiet_NaN = true;
  static const bool has_signaling_NaN = true;
  static const float_denorm_style has_denorm = denorm_present;
  static const bool has_denorm_loss = false;
  static const std::float_round_style round_style = std::round_to_nearest;
  static const bool is_iec559 = false;
  static const bool is_bounded = false;
  static const bool is_modulo = false;
  static const int digits = 11;
  static const int digits10 = 2;
  //static const int max_digits10 = ;
  static const int radix = 2;
  static const int min_exponent = -13;
  static const int min_exponent10 = -4;
  static const int max_exponent = 16;
  static const int max_exponent10 = 4;
  static const bool traps = true;
  static const bool tinyness_before = false;

  static Eigen::half (min)() { return Eigen::half_impl::raw_uint16_to_half(0x400); }
  static Eigen::half lowest() { return Eigen::half_impl::raw_uint16_to_half(0xfbff); }
  static Eigen::half (max)() { return Eigen::half_impl::raw_uint16_to_half(0x7bff); }
  static Eigen::half epsilon() { return Eigen::half_impl::raw_uint16_to_half(0x0800); }
  static Eigen::half round_error() { return Eigen::half(0.5); }
  static Eigen::half infinity() { return Eigen::half_impl::raw_uint16_to_half(0x7c00); }
  static Eigen::half quiet_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7e00); }
  static Eigen::half signaling_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7e00); }
  static Eigen::half denorm_min() { return Eigen::half_impl::raw_uint16_to_half(0x1); }
};
}
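The raw constants above follow the usual fp16 bit layout; epsilon's 0x0800, for instance, carries exponent field 2 and a zero mantissa, i.e. 2^(2-15) = 2^-13. A quick check, assuming Eigen is on the include path:

#include <cassert>
#include <limits>
#include <Eigen/Core>

int main() {
  Eigen::half eps = std::numeric_limits<Eigen::half>::epsilon();
  assert(static_cast<float>(eps) == 0.0001220703125f);  // 0x0800 decodes to 2^-13
}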
namespace Eigen {

template<> struct NumTraits<Eigen::half>
    : GenericNumTraits<Eigen::half>
{
  enum {
    IsSigned = true,
    IsInteger = false,
    IsComplex = false,
    RequireInitialization = false
  };

  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half epsilon() {
    return half_impl::raw_uint16_to_half(0x0800);
  }
  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half dummy_precision() {
    return Eigen::half(1e-2f);
  }
  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half highest() {
    return half_impl::raw_uint16_to_half(0x7bff);
  }
  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half lowest() {
    return half_impl::raw_uint16_to_half(0xfbff);
  }
  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half infinity() {
    return half_impl::raw_uint16_to_half(0x7c00);
  }
  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half quiet_NaN() {
    return half_impl::raw_uint16_to_half(0x7c01);
  }
};

} // end namespace Eigen
// C-like standard mathematical functions and transcendentals.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half fabsh(const Eigen::half& a) {
  Eigen::half result;
  result.x = a.x & 0x7FFF;
  return result;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half exph(const Eigen::half& a) {
  return Eigen::half(::expf(float(a)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half logh(const Eigen::half& a) {
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
  return Eigen::half(::hlog(a));
#else
  return Eigen::half(::logf(float(a)));
#endif
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half sqrth(const Eigen::half& a) {
  return Eigen::half(::sqrtf(float(a)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half powh(const Eigen::half& a, const Eigen::half& b) {
  return Eigen::half(::powf(float(a), float(b)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half floorh(const Eigen::half& a) {
  return Eigen::half(::floorf(float(a)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half ceilh(const Eigen::half& a) {
  return Eigen::half(::ceilf(float(a)));
}
namespace std {

#if __cplusplus > 199711L
template <>
struct hash<Eigen::half> {
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t operator()(const Eigen::half& a) const {
    return static_cast<std::size_t>(a.x);
  }
};
#endif

} // end namespace std
// Add the missing shfl_xor intrinsic
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) {
  return static_cast<Eigen::half>(__shfl_xor(static_cast<float>(var), laneMask, width));
}
#endif
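Typical use of this overload is a warp-wide butterfly reduction; a device-only sketch (warp_sum is hypothetical, assuming compute capability >= 3.0 and the definitions above):

__device__ Eigen::half warp_sum(Eigen::half v) {
  // Lanes exchange values across decreasing XOR masks and accumulate;
  // after the loop every lane of the warp holds the full sum.
  for (int mask = warpSize / 2; mask > 0; mask /= 2)
    v = v + __shfl_xor(v, mask, warpSize);
  return v;
}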
// ldg() has an overload for __half, but we also need one for Eigen::half.
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr) {
  return Eigen::half_impl::raw_uint16_to_half(
      __ldg(reinterpret_cast<const unsigned short*>(ptr)));
}
#endif
#if defined(__CUDA_ARCH__)
namespace Eigen {
namespace numext {

template<>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
bool (isnan)(const Eigen::half& h) {
  return (half_impl::isnan)(h);
}

template<>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
bool (isinf)(const Eigen::half& h) {
  return (half_impl::isinf)(h);
}

template<>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
bool (isfinite)(const Eigen::half& h) {
  return (half_impl::isfinite)(h);
}

} // namespace numext
} // namespace Eigen
#endif
#endif // EIGEN_HALF_CUDA_H
external/eigen3/Eigen/src/Core/arch/CUDA/MathFunctions.h
0 → 100644
View file @
a394b22a
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_MATH_FUNCTIONS_CUDA_H
#define EIGEN_MATH_FUNCTIONS_CUDA_H
namespace Eigen {

namespace internal {
// Make sure this is only available when targeting a GPU: we don't want to
// introduce conflicts between these packet_traits definitions and the ones
// we'll use on the host side (SSE, AVX, ...)
#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
float4 plog<float4>(const float4& a)
{
  return make_float4(logf(a.x), logf(a.y), logf(a.z), logf(a.w));
}

template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
double2 plog<double2>(const double2& a)
{
  using ::log;
  return make_double2(log(a.x), log(a.y));
}

template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
float4 plog1p<float4>(const float4& a)
{
  return make_float4(log1pf(a.x), log1pf(a.y), log1pf(a.z), log1pf(a.w));
}

template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
double2 plog1p<double2>(const double2& a)
{
  return make_double2(log1p(a.x), log1p(a.y));
}

template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
float4 pexp<float4>(const float4& a)
{
  return make_float4(expf(a.x), expf(a.y), expf(a.z), expf(a.w));
}

template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
double2 pexp<double2>(const double2& a)
{
  using ::exp;
  return make_double2(exp(a.x), exp(a.y));
}

template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
float4 psqrt<float4>(const float4& a)
{
  return make_float4(sqrtf(a.x), sqrtf(a.y), sqrtf(a.z), sqrtf(a.w));
}

template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
double2 psqrt<double2>(const double2& a)
{
  using ::sqrt;
  return make_double2(sqrt(a.x), sqrt(a.y));
}

template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
float4 prsqrt<float4>(const float4& a)
{
  return make_float4(rsqrtf(a.x), rsqrtf(a.y), rsqrtf(a.z), rsqrtf(a.w));
}

template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
double2 prsqrt<double2>(const double2& a)
{
  return make_double2(rsqrt(a.x), rsqrt(a.y));
}
#endif

} // end namespace internal

} // end namespace Eigen
#endif // EIGEN_MATH_FUNCTIONS_CUDA_H
external/eigen3/Eigen/src/Core/arch/CUDA/PacketMath.h
0 → 100644
View file @
a394b22a
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_PACKET_MATH_CUDA_H
#define EIGEN_PACKET_MATH_CUDA_H
namespace Eigen {

namespace internal {
// Make sure this is only available when targeting a GPU: we don't want to
// introduce conflicts between these packet_traits definitions and the ones
// we'll use on the host side (SSE, AVX, ...)
#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
template<> struct is_arithmetic<float4>  { enum { value = true }; };
template<> struct is_arithmetic<double2> { enum { value = true }; };
template<> struct packet_traits<float> : default_packet_traits
{
  typedef float4 type;
  typedef float4 half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 4,
    HasHalfPacket = 0,

    HasDiv  = 1,
    HasSin  = 0,
    HasCos  = 0,
    HasLog  = 1,
    HasExp  = 1,
    HasSqrt = 1,
    HasRsqrt = 1,
    HasLGamma = 1,
    HasDiGamma = 1,
    HasZeta = 1,
    HasPolygamma = 1,
    HasErf = 1,
    HasErfc = 1,
    HasIGamma = 1,
    HasIGammac = 1,
    HasBetaInc = 1,

    HasBlend = 0,
  };
};

template<> struct packet_traits<double> : default_packet_traits
{
  typedef double2 type;
  typedef double2 half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 2,
    HasHalfPacket = 0,

    HasDiv  = 1,
    HasLog  = 1,
    HasExp  = 1,
    HasSqrt = 1,
    HasRsqrt = 1,
    HasLGamma = 1,
    HasDiGamma = 1,
    HasZeta = 1,
    HasPolygamma = 1,
    HasErf = 1,
    HasErfc = 1,
    HasIGamma = 1,
    HasIGammac = 1,
    HasBetaInc = 1,

    HasBlend = 0,
  };
};
template<> struct unpacket_traits<float4>  { typedef float  type; enum {size=4, alignment=Aligned16}; typedef float4  half; };
template<> struct unpacket_traits<double2> { typedef double type; enum {size=2, alignment=Aligned16}; typedef double2 half; };
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pset1<float4>(const float& from) {
  return make_float4(from, from, from, from);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pset1<double2>(const double& from) {
  return make_double2(from, from);
}

template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plset<float4>(const float& a) {
  return make_float4(a, a+1, a+2, a+3);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 plset<double2>(const double& a) {
  return make_double2(a, a+1);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 padd<float4>(const float4& a, const float4& b) {
  return make_float4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 padd<double2>(const double2& a, const double2& b) {
  return make_double2(a.x+b.x, a.y+b.y);
}

template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 psub<float4>(const float4& a, const float4& b) {
  return make_float4(a.x-b.x, a.y-b.y, a.z-b.z, a.w-b.w);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 psub<double2>(const double2& a, const double2& b) {
  return make_double2(a.x-b.x, a.y-b.y);
}

template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pnegate(const float4& a) {
  return make_float4(-a.x, -a.y, -a.z, -a.w);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pnegate(const double2& a) {
  return make_double2(-a.x, -a.y);
}

template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pconj(const float4& a) { return a; }
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pconj(const double2& a) { return a; }
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmul<float4>(const float4& a, const float4& b) {
  return make_float4(a.x*b.x, a.y*b.y, a.z*b.z, a.w*b.w);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmul<double2>(const double2& a, const double2& b) {
  return make_double2(a.x*b.x, a.y*b.y);
}

template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pdiv<float4>(const float4& a, const float4& b) {
  return make_float4(a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pdiv<double2>(const double2& a, const double2& b) {
  return make_double2(a.x/b.x, a.y/b.y);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmin<float4>(const float4& a, const float4& b) {
  return make_float4(fminf(a.x, b.x), fminf(a.y, b.y), fminf(a.z, b.z), fminf(a.w, b.w));
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmin<double2>(const double2& a, const double2& b) {
  return make_double2(fmin(a.x, b.x), fmin(a.y, b.y));
}

template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmax<float4>(const float4& a, const float4& b) {
  return make_float4(fmaxf(a.x, b.x), fmaxf(a.y, b.y), fmaxf(a.z, b.z), fmaxf(a.w, b.w));
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmax<double2>(const double2& a, const double2& b) {
  return make_double2(fmax(a.x, b.x), fmax(a.y, b.y));
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pload<float4>(const float* from) {
  return *reinterpret_cast<const float4*>(from);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pload<double2>(const double* from) {
  return *reinterpret_cast<const double2*>(from);
}

template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploadu<float4>(const float* from) {
  return make_float4(from[0], from[1], from[2], from[3]);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploadu<double2>(const double* from) {
  return make_double2(from[0], from[1]);
}

template<> EIGEN_STRONG_INLINE float4 ploaddup<float4>(const float* from) {
  return make_float4(from[0], from[0], from[1], from[1]);
}
template<> EIGEN_STRONG_INLINE double2 ploaddup<double2>(const double* from) {
  return make_double2(from[0], from[0]);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<float>(float* to, const float4& from) {
  *reinterpret_cast<float4*>(to) = from;
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<double>(double* to, const double2& from) {
  *reinterpret_cast<double2*>(to) = from;
}

template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const float4& from) {
  to[0] = from.x;
  to[1] = from.y;
  to[2] = from.z;
  to[3] = from.w;
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const double2& from) {
  to[0] = from.x;
  to[1] = from.y;
}
template<>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const float* from) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
  return __ldg((const float4*)from);
#else
  return make_float4(from[0], from[1], from[2], from[3]);
#endif
}
template<>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const double* from) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
  return __ldg((const double2*)from);
#else
  return make_double2(from[0], from[1]);
#endif
}

template<>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Unaligned>(const float* from) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
  return make_float4(__ldg(from+0), __ldg(from+1), __ldg(from+2), __ldg(from+3));
#else
  return make_float4(from[0], from[1], from[2], from[3]);
#endif
}
template<>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Unaligned>(const double* from) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
  return make_double2(__ldg(from+0), __ldg(from+1));
#else
  return make_double2(from[0], from[1]);
#endif
}
template<> EIGEN_DEVICE_FUNC inline float4 pgather<float, float4>(const float* from, Index stride) {
  return make_float4(from[0*stride], from[1*stride], from[2*stride], from[3*stride]);
}
template<> EIGEN_DEVICE_FUNC inline double2 pgather<double, double2>(const double* from, Index stride) {
  return make_double2(from[0*stride], from[1*stride]);
}

template<> EIGEN_DEVICE_FUNC inline void pscatter<float, float4>(float* to, const float4& from, Index stride) {
  to[stride*0] = from.x;
  to[stride*1] = from.y;
  to[stride*2] = from.z;
  to[stride*3] = from.w;
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<double, double2>(double* to, const double2& from, Index stride) {
  to[stride*0] = from.x;
  to[stride*1] = from.y;
}
template<> EIGEN_DEVICE_FUNC inline float  pfirst<float4>(const float4& a)   { return a.x; }
template<> EIGEN_DEVICE_FUNC inline double pfirst<double2>(const double2& a) { return a.x; }
template<> EIGEN_DEVICE_FUNC inline float  predux<float4>(const float4& a)   { return a.x + a.y + a.z + a.w; }
template<> EIGEN_DEVICE_FUNC inline double predux<double2>(const double2& a) { return a.x + a.y; }

template<> EIGEN_DEVICE_FUNC inline float  predux_max<float4>(const float4& a)   { return fmaxf(fmaxf(a.x, a.y), fmaxf(a.z, a.w)); }
template<> EIGEN_DEVICE_FUNC inline double predux_max<double2>(const double2& a) { return fmax(a.x, a.y); }

template<> EIGEN_DEVICE_FUNC inline float  predux_min<float4>(const float4& a)   { return fminf(fminf(a.x, a.y), fminf(a.z, a.w)); }
template<> EIGEN_DEVICE_FUNC inline double predux_min<double2>(const double2& a) { return fmin(a.x, a.y); }

template<> EIGEN_DEVICE_FUNC inline float  predux_mul<float4>(const float4& a)   { return a.x * a.y * a.z * a.w; }
template<> EIGEN_DEVICE_FUNC inline double predux_mul<double2>(const double2& a) { return a.x * a.y; }
template<> EIGEN_DEVICE_FUNC inline float4 pabs<float4>(const float4& a) {
  return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w));
}
template<> EIGEN_DEVICE_FUNC inline double2 pabs<double2>(const double2& a) {
  return make_double2(fabs(a.x), fabs(a.y));
}
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<float4,4>& kernel) {
  float tmp = kernel.packet[0].y;
  kernel.packet[0].y = kernel.packet[1].x;
  kernel.packet[1].x = tmp;

  tmp = kernel.packet[0].z;
  kernel.packet[0].z = kernel.packet[2].x;
  kernel.packet[2].x = tmp;

  tmp = kernel.packet[0].w;
  kernel.packet[0].w = kernel.packet[3].x;
  kernel.packet[3].x = tmp;

  tmp = kernel.packet[1].z;
  kernel.packet[1].z = kernel.packet[2].y;
  kernel.packet[2].y = tmp;

  tmp = kernel.packet[1].w;
  kernel.packet[1].w = kernel.packet[3].y;
  kernel.packet[3].y = tmp;

  tmp = kernel.packet[2].w;
  kernel.packet[2].w = kernel.packet[3].z;
  kernel.packet[3].z = tmp;
}

EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<double2,2>& kernel) {
  double tmp = kernel.packet[0].y;
  kernel.packet[0].y = kernel.packet[1].x;
  kernel.packet[1].x = tmp;
}
#endif

} // end namespace internal

} // end namespace Eigen
#endif // EIGEN_PACKET_MATH_CUDA_H
external/eigen3/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
0 → 100644
View file @
a394b22a
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_PACKET_MATH_HALF_CUDA_H
#define EIGEN_PACKET_MATH_HALF_CUDA_H
namespace Eigen {
namespace internal {
// Most of the following operations require arch >= 3.0
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDACC__) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
template<> struct is_arithmetic<half2> { enum { value = true }; };
template<> struct packet_traits<Eigen::half> : default_packet_traits {
  typedef half2 type;
  typedef half2 half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 2,
    HasHalfPacket = 0,
    HasAdd    = 1,
    HasMul    = 1,
    HasDiv    = 1,
    HasSqrt   = 1,
    HasRsqrt  = 1,
    HasExp    = 1,
    HasLog    = 1,
    HasLog1p  = 1
  };
};

template<> struct unpacket_traits<half2> { typedef Eigen::half type; enum {size=2, alignment=Aligned16}; typedef half2 half; };
template<> __device__ EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) {
  return __half2half2(from);
}

template<> __device__ EIGEN_STRONG_INLINE half2 pload<half2>(const Eigen::half* from) {
  return *reinterpret_cast<const half2*>(from);
}

template<> __device__ EIGEN_STRONG_INLINE half2 ploadu<half2>(const Eigen::half* from) {
  return __halves2half2(from[0], from[1]);
}

template<> EIGEN_STRONG_INLINE half2 ploaddup<half2>(const Eigen::half* from) {
  return __halves2half2(from[0], from[0]);
}
template
<
>
__device__
EIGEN_STRONG_INLINE
void
pstore
<
Eigen
::
half
>
(
Eigen
::
half
*
to
,
const
half2
&
from
)
{
*
reinterpret_cast
<
half2
*>
(
to
)
=
from
;
}
template
<
>
__device__
EIGEN_STRONG_INLINE
void
pstoreu
<
Eigen
::
half
>
(
Eigen
::
half
*
to
,
const
half2
&
from
)
{
to
[
0
]
=
__low2half
(
from
);
to
[
1
]
=
__high2half
(
from
);
}
template
<
>
__device__
EIGEN_ALWAYS_INLINE
half2
ploadt_ro
<
half2
,
Aligned
>
(
const
Eigen
::
half
*
from
)
{
#if __CUDA_ARCH__ >= 350
return
__ldg
((
const
half2
*
)
from
);
#else
return
__halves2half2
(
*
(
from
+
0
),
*
(
from
+
1
));
#endif
}
template
<
>
__device__
EIGEN_ALWAYS_INLINE
half2
ploadt_ro
<
half2
,
Unaligned
>
(
const
Eigen
::
half
*
from
)
{
#if __CUDA_ARCH__ >= 350
return
__halves2half2
(
__ldg
(
from
+
0
),
__ldg
(
from
+
1
));
#else
return
__halves2half2
(
*
(
from
+
0
),
*
(
from
+
1
));
#endif
}
template
<
>
__device__
EIGEN_STRONG_INLINE
half2
pgather
<
Eigen
::
half
,
half2
>
(
const
Eigen
::
half
*
from
,
Index
stride
)
{
return
__halves2half2
(
from
[
0
*
stride
],
from
[
1
*
stride
]);
}
template
<
>
__device__
EIGEN_STRONG_INLINE
void
pscatter
<
Eigen
::
half
,
half2
>
(
Eigen
::
half
*
to
,
const
half2
&
from
,
Index
stride
)
{
to
[
stride
*
0
]
=
__low2half
(
from
);
to
[
stride
*
1
]
=
__high2half
(
from
);
}
template
<
>
__device__
EIGEN_STRONG_INLINE
Eigen
::
half
pfirst
<
half2
>
(
const
half2
&
a
)
{
return
__low2half
(
a
);
}
template
<
>
__device__
EIGEN_STRONG_INLINE
half2
pabs
<
half2
>
(
const
half2
&
a
)
{
half2
result
;
result
.
x
=
a
.
x
&
0x7FFF7FFF
;
return
result
;
}
__device__
EIGEN_STRONG_INLINE
void
ptranspose
(
PacketBlock
<
half2
,
2
>&
kernel
)
{
__half
a1
=
__low2half
(
kernel
.
packet
[
0
]);
__half
a2
=
__high2half
(
kernel
.
packet
[
0
]);
__half
b1
=
__low2half
(
kernel
.
packet
[
1
]);
__half
b2
=
__high2half
(
kernel
.
packet
[
1
]);
kernel
.
packet
[
0
]
=
__halves2half2
(
a1
,
b1
);
kernel
.
packet
[
1
]
=
__halves2half2
(
a2
,
b2
);
}
template
<
>
__device__
EIGEN_STRONG_INLINE
half2
plset
<
half2
>
(
const
Eigen
::
half
&
a
)
{
#if __CUDA_ARCH__ >= 530
return
__halves2half2
(
a
,
__hadd
(
a
,
__float2half
(
1.0
f
)));
#else
float
f
=
__half2float
(
a
)
+
1.0
f
;
return
__halves2half2
(
a
,
__float2half
(
f
));
#endif
}
template
<
>
__device__
EIGEN_STRONG_INLINE
half2
padd
<
half2
>
(
const
half2
&
a
,
const
half2
&
b
)
{
#if __CUDA_ARCH__ >= 530
return
__hadd2
(
a
,
b
);
#else
float
a1
=
__low2float
(
a
);
float
a2
=
__high2float
(
a
);
float
b1
=
__low2float
(
b
);
float
b2
=
__high2float
(
b
);
float
r1
=
a1
+
b1
;
float
r2
=
a2
+
b2
;
return
__floats2half2_rn
(
r1
,
r2
);
#endif
}
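On devices older than sm_53 there is no native half arithmetic, so each operation above widens to float, computes there, and rounds back once via __floats2half2_rn. A minimal scalar sketch of the same promote-compute-narrow pattern, written against the portable Eigen::half scalar type (an assumption here: only <Eigen/Core> is needed):

#include <Eigen/Core>

inline Eigen::half add_via_float(Eigen::half a, Eigen::half b) {
  // Widen, add in float, narrow once at the end -- a single rounding step,
  // exactly like the __CUDA_ARCH__ < 530 branch above.
  return Eigen::half(static_cast<float>(a) + static_cast<float>(b));
}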
template<> __device__ EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, const half2& b) {
#if __CUDA_ARCH__ >= 530
  return __hsub2(a, b);
#else
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  float b1 = __low2float(b);
  float b2 = __high2float(b);
  float r1 = a1 - b1;
  float r2 = a2 - b2;
  return __floats2half2_rn(r1, r2);
#endif
}

template<> __device__ EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
#if __CUDA_ARCH__ >= 530
  return __hneg2(a);
#else
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  return __floats2half2_rn(-a1, -a2);
#endif
}

template<> __device__ EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }

template<> __device__ EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) {
#if __CUDA_ARCH__ >= 530
  return __hmul2(a, b);
#else
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  float b1 = __low2float(b);
  float b2 = __high2float(b);
  float r1 = a1 * b1;
  float r2 = a2 * b2;
  return __floats2half2_rn(r1, r2);
#endif
}

template<> __device__ EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, const half2& b, const half2& c) {
#if __CUDA_ARCH__ >= 530
  return __hfma2(a, b, c);
#else
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  float b1 = __low2float(b);
  float b2 = __high2float(b);
  float c1 = __low2float(c);
  float c2 = __high2float(c);
  float r1 = a1 * b1 + c1;
  float r2 = a2 * b2 + c2;
  return __floats2half2_rn(r1, r2);
#endif
}

template<> __device__ EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) {
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  float b1 = __low2float(b);
  float b2 = __high2float(b);
  float r1 = a1 / b1;
  float r2 = a2 / b2;
  return __floats2half2_rn(r1, r2);
}

template<> __device__ EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a, const half2& b) {
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  float b1 = __low2float(b);
  float b2 = __high2float(b);
  __half r1 = a1 < b1 ? __low2half(a) : __low2half(b);
  __half r2 = a2 < b2 ? __high2half(a) : __high2half(b);
  return __halves2half2(r1, r2);
}

template<> __device__ EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, const half2& b) {
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  float b1 = __low2float(b);
  float b2 = __high2float(b);
  __half r1 = a1 > b1 ? __low2half(a) : __low2half(b);
  __half r2 = a2 > b2 ? __high2half(a) : __high2half(b);
  return __halves2half2(r1, r2);
}

template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux<half2>(const half2& a) {
#if __CUDA_ARCH__ >= 530
  return __hadd(__low2half(a), __high2half(a));
#else
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  return Eigen::half(half_impl::raw_uint16_to_half(__float2half_rn(a1 + a2)));
#endif
}

template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_max<half2>(const half2& a) {
#if __CUDA_ARCH__ >= 530
  __half first  = __low2half(a);
  __half second = __high2half(a);
  return __hgt(first, second) ? first : second;
#else
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  return a1 > a2 ? __low2half(a) : __high2half(a);
#endif
}

template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_min<half2>(const half2& a) {
#if __CUDA_ARCH__ >= 530
  __half first  = __low2half(a);
  __half second = __high2half(a);
  return __hlt(first, second) ? first : second;
#else
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  return a1 < a2 ? __low2half(a) : __high2half(a);
#endif
}

template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_mul<half2>(const half2& a) {
#if __CUDA_ARCH__ >= 530
  return __hmul(__low2half(a), __high2half(a));
#else
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  return Eigen::half(half_impl::raw_uint16_to_half(__float2half_rn(a1 * a2)));
#endif
}

template<> __device__ EIGEN_STRONG_INLINE half2 plog1p<half2>(const half2& a) {
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  float r1 = log1pf(a1);
  float r2 = log1pf(a2);
  return __floats2half2_rn(r1, r2);
}
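plog1p is a separate primitive because log(1 + x) computed naively loses all significant digits for tiny x once 1 + x rounds to 1. A standalone scalar illustration:

#include <cmath>
#include <cstdio>

int main() {
  float x = 1e-8f;
  // 1.0f + x rounds to exactly 1.0f, so the naive form returns 0;
  // log1p keeps the ~1e-8 answer.
  std::printf("naive: %g  log1p: %g\n", std::log(1.0f + x), std::log1p(x));
}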
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 530

template<> __device__ EIGEN_STRONG_INLINE half2 plog<half2>(const half2& a) {
  return h2log(a);
}

template<> __device__ EIGEN_STRONG_INLINE half2 pexp<half2>(const half2& a) {
  return h2exp(a);
}

template<> __device__ EIGEN_STRONG_INLINE half2 psqrt<half2>(const half2& a) {
  return h2sqrt(a);
}

template<> __device__ EIGEN_STRONG_INLINE half2 prsqrt<half2>(const half2& a) {
  return h2rsqrt(a);
}

#else

template<> __device__ EIGEN_STRONG_INLINE half2 plog<half2>(const half2& a) {
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  float r1 = logf(a1);
  float r2 = logf(a2);
  return __floats2half2_rn(r1, r2);
}

template<> __device__ EIGEN_STRONG_INLINE half2 pexp<half2>(const half2& a) {
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  float r1 = expf(a1);
  float r2 = expf(a2);
  return __floats2half2_rn(r1, r2);
}

template<> __device__ EIGEN_STRONG_INLINE half2 psqrt<half2>(const half2& a) {
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  float r1 = sqrtf(a1);
  float r2 = sqrtf(a2);
  return __floats2half2_rn(r1, r2);
}

template<> __device__ EIGEN_STRONG_INLINE half2 prsqrt<half2>(const half2& a) {
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  float r1 = rsqrtf(a1);
  float r2 = rsqrtf(a2);
  return __floats2half2_rn(r1, r2);
}

#endif

#elif defined EIGEN_VECTORIZE_AVX512

typedef struct {
  __m256i x;
} Packet16h;

template<> struct is_arithmetic<Packet16h> { enum { value = true }; };

template<> struct packet_traits<half> : default_packet_traits {
  typedef Packet16h type;
  // There is no half-size packet for Packet16h.
  typedef Packet16h half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 16,
    HasHalfPacket = 0,
    HasAdd       = 0,
    HasSub       = 0,
    HasMul       = 0,
    HasNegate    = 0,
    HasAbs       = 0,
    HasAbs2      = 0,
    HasMin       = 0,
    HasMax       = 0,
    HasConj      = 0,
    HasSetLinear = 0,
    HasDiv       = 0,
    HasSqrt      = 0,
    HasRsqrt     = 0,
    HasExp       = 0,
    HasLog       = 0,
    HasBlend     = 0
  };
};

template<> struct unpacket_traits<Packet16h> {
  typedef Eigen::half type;
  enum { size = 16, alignment = Aligned32 };
  typedef Packet16h half;
};

template<> EIGEN_STRONG_INLINE Packet16h pset1<Packet16h>(const Eigen::half& from) {
  Packet16h result;
  result.x = _mm256_set1_epi16(from.x);
  return result;
}

template<> EIGEN_STRONG_INLINE Eigen::half pfirst<Packet16h>(const Packet16h& from) {
  return half_impl::raw_uint16_to_half(static_cast<unsigned short>(_mm256_extract_epi16(from.x, 0)));
}

template<> EIGEN_STRONG_INLINE Packet16h pload<Packet16h>(const Eigen::half* from) {
  Packet16h result;
  result.x = _mm256_load_si256(reinterpret_cast<const __m256i*>(from));
  return result;
}

template<> EIGEN_STRONG_INLINE Packet16h ploadu<Packet16h>(const Eigen::half* from) {
  Packet16h result;
  result.x = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from));
  return result;
}

template<> EIGEN_STRONG_INLINE void pstore<half>(Eigen::half* to, const Packet16h& from) {
  _mm256_store_si256((__m256i*)to, from.x);
}

template<> EIGEN_STRONG_INLINE void pstoreu<half>(Eigen::half* to, const Packet16h& from) {
  _mm256_storeu_si256((__m256i*)to, from.x);
}

template<> EIGEN_STRONG_INLINE Packet16h ploadquad(const Eigen::half* from) {
  Packet16h result;
  unsigned short a = from[0].x;
  unsigned short b = from[1].x;
  unsigned short c = from[2].x;
  unsigned short d = from[3].x;
  result.x = _mm256_set_epi16(d, d, d, d, c, c, c, c, b, b, b, b, a, a, a, a);
  return result;
}

EIGEN_STRONG_INLINE Packet16f half2float(const Packet16h& a) {
#ifdef EIGEN_HAS_FP16_C
  return _mm512_cvtph_ps(a.x);
#else
  EIGEN_ALIGN64 half aux[16];
  pstore(aux, a);
  float f0(aux[0]);
  float f1(aux[1]);
  float f2(aux[2]);
  float f3(aux[3]);
  float f4(aux[4]);
  float f5(aux[5]);
  float f6(aux[6]);
  float f7(aux[7]);
  float f8(aux[8]);
  float f9(aux[9]);
  float fa(aux[10]);
  float fb(aux[11]);
  float fc(aux[12]);
  float fd(aux[13]);
  float fe(aux[14]);
  float ff(aux[15]);
  return _mm512_set_ps(ff, fe, fd, fc, fb, fa, f9, f8, f7, f6, f5, f4, f3, f2, f1, f0);
#endif
}

EIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) {
#ifdef EIGEN_HAS_FP16_C
  Packet16h result;
  result.x = _mm512_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
  return result;
#else
  EIGEN_ALIGN64 float aux[16];
  pstore(aux, a);
  half h0(aux[0]);
  half h1(aux[1]);
  half h2(aux[2]);
  half h3(aux[3]);
  half h4(aux[4]);
  half h5(aux[5]);
  half h6(aux[6]);
  half h7(aux[7]);
  half h8(aux[8]);
  half h9(aux[9]);
  half ha(aux[10]);
  half hb(aux[11]);
  half hc(aux[12]);
  half hd(aux[13]);
  half he(aux[14]);
  half hf(aux[15]);
  Packet16h result;
  result.x = _mm256_set_epi16(
      hf.x, he.x, hd.x, hc.x, hb.x, ha.x, h9.x, h8.x,
      h7.x, h6.x, h5.x, h4.x, h3.x, h2.x, h1.x, h0.x);
  return result;
#endif
}
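When EIGEN_HAS_FP16_C is defined, both conversions above collapse to single instructions. A hedged round-trip sketch using the same intrinsics (assumes an AVX512F target and <immintrin.h>):

#include <immintrin.h>

// 16 floats -> 16 fp16 values -> 16 floats, round-to-nearest-even,
// mirroring the half2float/float2half pair above.
inline __m512 fp16_roundtrip(__m512 v) {
  __m256i h = _mm512_cvtps_ph(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
  return _mm512_cvtph_ps(h);
}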
template<> EIGEN_STRONG_INLINE Packet16h padd<Packet16h>(const Packet16h& a, const Packet16h& b) {
  Packet16f af = half2float(a);
  Packet16f bf = half2float(b);
  Packet16f rf = padd(af, bf);
  return float2half(rf);
}

template<> EIGEN_STRONG_INLINE Packet16h pmul<Packet16h>(const Packet16h& a, const Packet16h& b) {
  Packet16f af = half2float(a);
  Packet16f bf = half2float(b);
  Packet16f rf = pmul(af, bf);
  return float2half(rf);
}

template<> EIGEN_STRONG_INLINE half predux<Packet16h>(const Packet16h& from) {
  Packet16f from_float = half2float(from);
  return half(predux(from_float));
}

template<> EIGEN_STRONG_INLINE Packet16h pgather<Eigen::half, Packet16h>(const Eigen::half* from, Index stride) {
  Packet16h result;
  result.x = _mm256_set_epi16(
      from[15*stride].x, from[14*stride].x, from[13*stride].x, from[12*stride].x,
      from[11*stride].x, from[10*stride].x, from[9*stride].x,  from[8*stride].x,
      from[7*stride].x,  from[6*stride].x,  from[5*stride].x,  from[4*stride].x,
      from[3*stride].x,  from[2*stride].x,  from[1*stride].x,  from[0*stride].x);
  return result;
}

template<> EIGEN_STRONG_INLINE void pscatter<half, Packet16h>(half* to, const Packet16h& from, Index stride) {
  EIGEN_ALIGN64 half aux[16];
  pstore(aux, from);
  to[stride*0].x  = aux[0].x;
  to[stride*1].x  = aux[1].x;
  to[stride*2].x  = aux[2].x;
  to[stride*3].x  = aux[3].x;
  to[stride*4].x  = aux[4].x;
  to[stride*5].x  = aux[5].x;
  to[stride*6].x  = aux[6].x;
  to[stride*7].x  = aux[7].x;
  to[stride*8].x  = aux[8].x;
  to[stride*9].x  = aux[9].x;
  to[stride*10].x = aux[10].x;
  to[stride*11].x = aux[11].x;
  to[stride*12].x = aux[12].x;
  to[stride*13].x = aux[13].x;
  to[stride*14].x = aux[14].x;
  to[stride*15].x = aux[15].x;
}

EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16h,16>& kernel) {
  __m256i a = kernel.packet[0].x;
  __m256i b = kernel.packet[1].x;
  __m256i c = kernel.packet[2].x;
  __m256i d = kernel.packet[3].x;
  __m256i e = kernel.packet[4].x;
  __m256i f = kernel.packet[5].x;
  __m256i g = kernel.packet[6].x;
  __m256i h = kernel.packet[7].x;
  __m256i i = kernel.packet[8].x;
  __m256i j = kernel.packet[9].x;
  __m256i k = kernel.packet[10].x;
  __m256i l = kernel.packet[11].x;
  __m256i m = kernel.packet[12].x;
  __m256i n = kernel.packet[13].x;
  __m256i o = kernel.packet[14].x;
  __m256i p = kernel.packet[15].x;

  __m256i ab_07 = _mm256_unpacklo_epi16(a, b);
  __m256i cd_07 = _mm256_unpacklo_epi16(c, d);
  __m256i ef_07 = _mm256_unpacklo_epi16(e, f);
  __m256i gh_07 = _mm256_unpacklo_epi16(g, h);
  __m256i ij_07 = _mm256_unpacklo_epi16(i, j);
  __m256i kl_07 = _mm256_unpacklo_epi16(k, l);
  __m256i mn_07 = _mm256_unpacklo_epi16(m, n);
  __m256i op_07 = _mm256_unpacklo_epi16(o, p);

  __m256i ab_8f = _mm256_unpackhi_epi16(a, b);
  __m256i cd_8f = _mm256_unpackhi_epi16(c, d);
  __m256i ef_8f = _mm256_unpackhi_epi16(e, f);
  __m256i gh_8f = _mm256_unpackhi_epi16(g, h);
  __m256i ij_8f = _mm256_unpackhi_epi16(i, j);
  __m256i kl_8f = _mm256_unpackhi_epi16(k, l);
  __m256i mn_8f = _mm256_unpackhi_epi16(m, n);
  __m256i op_8f = _mm256_unpackhi_epi16(o, p);

  __m256i abcd_03 = _mm256_unpacklo_epi32(ab_07, cd_07);
  __m256i abcd_47 = _mm256_unpackhi_epi32(ab_07, cd_07);
  __m256i efgh_03 = _mm256_unpacklo_epi32(ef_07, gh_07);
  __m256i efgh_47 = _mm256_unpackhi_epi32(ef_07, gh_07);
  __m256i ijkl_03 = _mm256_unpacklo_epi32(ij_07, kl_07);
  __m256i ijkl_47 = _mm256_unpackhi_epi32(ij_07, kl_07);
  __m256i mnop_03 = _mm256_unpacklo_epi32(mn_07, op_07);
  __m256i mnop_47 = _mm256_unpackhi_epi32(mn_07, op_07);

  __m256i abcd_8b = _mm256_unpacklo_epi32(ab_8f, cd_8f);
  __m256i abcd_cf = _mm256_unpackhi_epi32(ab_8f, cd_8f);
  __m256i efgh_8b = _mm256_unpacklo_epi32(ef_8f, gh_8f);
  __m256i efgh_cf = _mm256_unpackhi_epi32(ef_8f, gh_8f);
  __m256i ijkl_8b = _mm256_unpacklo_epi32(ij_8f, kl_8f);
  __m256i ijkl_cf = _mm256_unpackhi_epi32(ij_8f, kl_8f);
  __m256i mnop_8b = _mm256_unpacklo_epi32(mn_8f, op_8f);
  __m256i mnop_cf = _mm256_unpackhi_epi32(mn_8f, op_8f);

  __m256i abcdefgh_01 = _mm256_unpacklo_epi64(abcd_03, efgh_03);
  __m256i abcdefgh_23 = _mm256_unpackhi_epi64(abcd_03, efgh_03);
  __m256i ijklmnop_01 = _mm256_unpacklo_epi64(ijkl_03, mnop_03);
  __m256i ijklmnop_23 = _mm256_unpackhi_epi64(ijkl_03, mnop_03);
  __m256i abcdefgh_45 = _mm256_unpacklo_epi64(abcd_47, efgh_47);
  __m256i abcdefgh_67 = _mm256_unpackhi_epi64(abcd_47, efgh_47);
  __m256i ijklmnop_45 = _mm256_unpacklo_epi64(ijkl_47, mnop_47);
  __m256i ijklmnop_67 = _mm256_unpackhi_epi64(ijkl_47, mnop_47);
  __m256i abcdefgh_89 = _mm256_unpacklo_epi64(abcd_8b, efgh_8b);
  __m256i abcdefgh_ab = _mm256_unpackhi_epi64(abcd_8b, efgh_8b);
  __m256i ijklmnop_89 = _mm256_unpacklo_epi64(ijkl_8b, mnop_8b);
  __m256i ijklmnop_ab = _mm256_unpackhi_epi64(ijkl_8b, mnop_8b);
  __m256i abcdefgh_cd = _mm256_unpacklo_epi64(abcd_cf, efgh_cf);
  __m256i abcdefgh_ef = _mm256_unpackhi_epi64(abcd_cf, efgh_cf);
  __m256i ijklmnop_cd = _mm256_unpacklo_epi64(ijkl_cf, mnop_cf);
  __m256i ijklmnop_ef = _mm256_unpackhi_epi64(ijkl_cf, mnop_cf);

  // NOTE: no unpacklo/hi instr in this case, so using permute instr.
  __m256i a_p_0 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x20);
  __m256i a_p_1 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x31);
  __m256i a_p_2 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x20);
  __m256i a_p_3 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x31);
  __m256i a_p_4 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x20);
  __m256i a_p_5 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x31);
  __m256i a_p_6 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x20);
  __m256i a_p_7 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x31);
  __m256i a_p_8 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x20);
  __m256i a_p_9 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x31);
  __m256i a_p_a = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x20);
  __m256i a_p_b = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x31);
  __m256i a_p_c = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x20);
  __m256i a_p_d = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x31);
  __m256i a_p_e = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x20);
  __m256i a_p_f = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x31);

  kernel.packet[0].x  = a_p_0;
  kernel.packet[1].x  = a_p_1;
  kernel.packet[2].x  = a_p_2;
  kernel.packet[3].x  = a_p_3;
  kernel.packet[4].x  = a_p_4;
  kernel.packet[5].x  = a_p_5;
  kernel.packet[6].x  = a_p_6;
  kernel.packet[7].x  = a_p_7;
  kernel.packet[8].x  = a_p_8;
  kernel.packet[9].x  = a_p_9;
  kernel.packet[10].x = a_p_a;
  kernel.packet[11].x = a_p_b;
  kernel.packet[12].x = a_p_c;
  kernel.packet[13].x = a_p_d;
  kernel.packet[14].x = a_p_e;
  kernel.packet[15].x = a_p_f;
}
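The 16x16 transpose above is the classic interleave network: 16-bit, then 32-bit, then 64-bit unpacks, finished with a 128-bit lane permute because AVX2 unpacks operate per lane. A scalar model of one _mm256_unpacklo_epi16 lane, which may help when reading names like ab_07 (elements of rows a and b interleaved):

#include <cstdint>

// Interleave the low four 16-bit elements of a and b, as unpacklo does
// within each 128-bit lane.
inline void unpacklo16_lane(const uint16_t a[8], const uint16_t b[8], uint16_t out[8]) {
  for (int i = 0; i < 4; ++i) {
    out[2 * i]     = a[i];
    out[2 * i + 1] = b[i];
  }
}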
EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16h,8>& kernel) {
  EIGEN_ALIGN64 half in[8][16];
  pstore<half>(in[0], kernel.packet[0]);
  pstore<half>(in[1], kernel.packet[1]);
  pstore<half>(in[2], kernel.packet[2]);
  pstore<half>(in[3], kernel.packet[3]);
  pstore<half>(in[4], kernel.packet[4]);
  pstore<half>(in[5], kernel.packet[5]);
  pstore<half>(in[6], kernel.packet[6]);
  pstore<half>(in[7], kernel.packet[7]);

  EIGEN_ALIGN64 half out[8][16];

  for (int i = 0; i < 8; ++i) {
    for (int j = 0; j < 8; ++j) {
      out[i][j] = in[j][2*i];
    }
    for (int j = 0; j < 8; ++j) {
      out[i][j+8] = in[j][2*i+1];
    }
  }

  kernel.packet[0] = pload<Packet16h>(out[0]);
  kernel.packet[1] = pload<Packet16h>(out[1]);
  kernel.packet[2] = pload<Packet16h>(out[2]);
  kernel.packet[3] = pload<Packet16h>(out[3]);
  kernel.packet[4] = pload<Packet16h>(out[4]);
  kernel.packet[5] = pload<Packet16h>(out[5]);
  kernel.packet[6] = pload<Packet16h>(out[6]);
  kernel.packet[7] = pload<Packet16h>(out[7]);
}

EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16h,4>& kernel) {
  EIGEN_ALIGN64 half in[4][16];
  pstore<half>(in[0], kernel.packet[0]);
  pstore<half>(in[1], kernel.packet[1]);
  pstore<half>(in[2], kernel.packet[2]);
  pstore<half>(in[3], kernel.packet[3]);

  EIGEN_ALIGN64 half out[4][16];

  for (int i = 0; i < 4; ++i) {
    for (int j = 0; j < 4; ++j) {
      out[i][j] = in[j][4*i];
    }
    for (int j = 0; j < 4; ++j) {
      out[i][j+4] = in[j][4*i+1];
    }
    for (int j = 0; j < 4; ++j) {
      out[i][j+8] = in[j][4*i+2];
    }
    for (int j = 0; j < 4; ++j) {
      out[i][j+12] = in[j][4*i+3];
    }
  }

  kernel.packet[0] = pload<Packet16h>(out[0]);
  kernel.packet[1] = pload<Packet16h>(out[1]);
  kernel.packet[2] = pload<Packet16h>(out[2]);
  kernel.packet[3] = pload<Packet16h>(out[3]);
}

#elif defined EIGEN_VECTORIZE_AVX

typedef struct {
  __m128i x;
} Packet8h;

template<> struct is_arithmetic<Packet8h> { enum { value = true }; };

template<> struct packet_traits<Eigen::half> : default_packet_traits {
  typedef Packet8h type;
  // There is no half-size packet for Packet8h.
  typedef Packet8h half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 8,
    HasHalfPacket = 0,
    HasAdd       = 0,
    HasSub       = 0,
    HasMul       = 0,
    HasNegate    = 0,
    HasAbs       = 0,
    HasAbs2      = 0,
    HasMin       = 0,
    HasMax       = 0,
    HasConj      = 0,
    HasSetLinear = 0,
    HasDiv       = 0,
    HasSqrt      = 0,
    HasRsqrt     = 0,
    HasExp       = 0,
    HasLog       = 0,
    HasBlend     = 0
  };
};

template<> struct unpacket_traits<Packet8h> {
  typedef Eigen::half type;
  enum { size = 8, alignment = Aligned16 };
  typedef Packet8h half;
};

template<> EIGEN_STRONG_INLINE Packet8h pset1<Packet8h>(const Eigen::half& from) {
  Packet8h result;
  result.x = _mm_set1_epi16(from.x);
  return result;
}

template<> EIGEN_STRONG_INLINE Eigen::half pfirst<Packet8h>(const Packet8h& from) {
  return half_impl::raw_uint16_to_half(static_cast<unsigned short>(_mm_extract_epi16(from.x, 0)));
}

template<> EIGEN_STRONG_INLINE Packet8h pload<Packet8h>(const Eigen::half* from) {
  Packet8h result;
  result.x = _mm_load_si128(reinterpret_cast<const __m128i*>(from));
  return result;
}

template<> EIGEN_STRONG_INLINE Packet8h ploadu<Packet8h>(const Eigen::half* from) {
  Packet8h result;
  result.x = _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
  return result;
}

template<> EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet8h& from) {
  _mm_store_si128(reinterpret_cast<__m128i*>(to), from.x);
}

template<> EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet8h& from) {
  _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from.x);
}

template<> EIGEN_STRONG_INLINE Packet8h ploadquad<Packet8h>(const Eigen::half* from) {
  Packet8h result;
  unsigned short a = from[0].x;
  unsigned short b = from[1].x;
  result.x = _mm_set_epi16(b, b, b, b, a, a, a, a);
  return result;
}

EIGEN_STRONG_INLINE Packet8f half2float(const Packet8h& a) {
#ifdef EIGEN_HAS_FP16_C
  return _mm256_cvtph_ps(a.x);
#else
  EIGEN_ALIGN32 Eigen::half aux[8];
  pstore(aux, a);
  float f0(aux[0]);
  float f1(aux[1]);
  float f2(aux[2]);
  float f3(aux[3]);
  float f4(aux[4]);
  float f5(aux[5]);
  float f6(aux[6]);
  float f7(aux[7]);
  return _mm256_set_ps(f7, f6, f5, f4, f3, f2, f1, f0);
#endif
}

EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) {
#ifdef EIGEN_HAS_FP16_C
  Packet8h result;
  result.x = _mm256_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
  return result;
#else
  EIGEN_ALIGN32 float aux[8];
  pstore(aux, a);
  Eigen::half h0(aux[0]);
  Eigen::half h1(aux[1]);
  Eigen::half h2(aux[2]);
  Eigen::half h3(aux[3]);
  Eigen::half h4(aux[4]);
  Eigen::half h5(aux[5]);
  Eigen::half h6(aux[6]);
  Eigen::half h7(aux[7]);
  Packet8h result;
  result.x = _mm_set_epi16(h7.x, h6.x, h5.x, h4.x, h3.x, h2.x, h1.x, h0.x);
  return result;
#endif
}

template<> EIGEN_STRONG_INLINE Packet8h pconj(const Packet8h& a) { return a; }

template<> EIGEN_STRONG_INLINE Packet8h padd<Packet8h>(const Packet8h& a, const Packet8h& b) {
  Packet8f af = half2float(a);
  Packet8f bf = half2float(b);
  Packet8f rf = padd(af, bf);
  return float2half(rf);
}

template<> EIGEN_STRONG_INLINE Packet8h pmul<Packet8h>(const Packet8h& a, const Packet8h& b) {
  Packet8f af = half2float(a);
  Packet8f bf = half2float(b);
  Packet8f rf = pmul(af, bf);
  return float2half(rf);
}

template<> EIGEN_STRONG_INLINE Packet8h pgather<Eigen::half, Packet8h>(const Eigen::half* from, Index stride) {
  Packet8h result;
  result.x = _mm_set_epi16(from[7*stride].x, from[6*stride].x, from[5*stride].x, from[4*stride].x,
                           from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x);
  return result;
}

template<> EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet8h>(Eigen::half* to, const Packet8h& from, Index stride) {
  EIGEN_ALIGN32 Eigen::half aux[8];
  pstore(aux, from);
  to[stride*0].x = aux[0].x;
  to[stride*1].x = aux[1].x;
  to[stride*2].x = aux[2].x;
  to[stride*3].x = aux[3].x;
  to[stride*4].x = aux[4].x;
  to[stride*5].x = aux[5].x;
  to[stride*6].x = aux[6].x;
  to[stride*7].x = aux[7].x;
}

template<> EIGEN_STRONG_INLINE Eigen::half predux<Packet8h>(const Packet8h& a) {
  Packet8f af = half2float(a);
  float reduced = predux<Packet8f>(af);
  return Eigen::half(reduced);
}

template<> EIGEN_STRONG_INLINE Eigen::half predux_max<Packet8h>(const Packet8h& a) {
  Packet8f af = half2float(a);
  float reduced = predux_max<Packet8f>(af);
  return Eigen::half(reduced);
}

template<> EIGEN_STRONG_INLINE Eigen::half predux_min<Packet8h>(const Packet8h& a) {
  Packet8f af = half2float(a);
  float reduced = predux_min<Packet8f>(af);
  return Eigen::half(reduced);
}

template<> EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet8h>(const Packet8h& a) {
  Packet8f af = half2float(a);
  float reduced = predux_mul<Packet8f>(af);
  return Eigen::half(reduced);
}
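All four reductions above follow one recipe: widen the packet to Packet8f, reduce in float, and narrow the scalar result once at the end. A scalar analogue of the same idea (a sketch, assuming only the Eigen::half scalar type from <Eigen/Core>):

#include <Eigen/Core>

inline Eigen::half sum_halves(const Eigen::half* p, int n) {
  float acc = 0.f;                                   // accumulate in float
  for (int i = 0; i < n; ++i) acc += static_cast<float>(p[i]);
  return Eigen::half(acc);                           // one narrowing at the end
}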
EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8h,8>& kernel) {
  __m128i a = kernel.packet[0].x;
  __m128i b = kernel.packet[1].x;
  __m128i c = kernel.packet[2].x;
  __m128i d = kernel.packet[3].x;
  __m128i e = kernel.packet[4].x;
  __m128i f = kernel.packet[5].x;
  __m128i g = kernel.packet[6].x;
  __m128i h = kernel.packet[7].x;

  __m128i a03b03 = _mm_unpacklo_epi16(a, b);
  __m128i c03d03 = _mm_unpacklo_epi16(c, d);
  __m128i e03f03 = _mm_unpacklo_epi16(e, f);
  __m128i g03h03 = _mm_unpacklo_epi16(g, h);
  __m128i a47b47 = _mm_unpackhi_epi16(a, b);
  __m128i c47d47 = _mm_unpackhi_epi16(c, d);
  __m128i e47f47 = _mm_unpackhi_epi16(e, f);
  __m128i g47h47 = _mm_unpackhi_epi16(g, h);

  __m128i a01b01c01d01 = _mm_unpacklo_epi32(a03b03, c03d03);
  __m128i a23b23c23d23 = _mm_unpackhi_epi32(a03b03, c03d03);
  __m128i e01f01g01h01 = _mm_unpacklo_epi32(e03f03, g03h03);
  __m128i e23f23g23h23 = _mm_unpackhi_epi32(e03f03, g03h03);
  __m128i a45b45c45d45 = _mm_unpacklo_epi32(a47b47, c47d47);
  __m128i a67b67c67d67 = _mm_unpackhi_epi32(a47b47, c47d47);
  __m128i e45f45g45h45 = _mm_unpacklo_epi32(e47f47, g47h47);
  __m128i e67f67g67h67 = _mm_unpackhi_epi32(e47f47, g47h47);

  __m128i a0b0c0d0e0f0g0h0 = _mm_unpacklo_epi64(a01b01c01d01, e01f01g01h01);
  __m128i a1b1c1d1e1f1g1h1 = _mm_unpackhi_epi64(a01b01c01d01, e01f01g01h01);
  __m128i a2b2c2d2e2f2g2h2 = _mm_unpacklo_epi64(a23b23c23d23, e23f23g23h23);
  __m128i a3b3c3d3e3f3g3h3 = _mm_unpackhi_epi64(a23b23c23d23, e23f23g23h23);
  __m128i a4b4c4d4e4f4g4h4 = _mm_unpacklo_epi64(a45b45c45d45, e45f45g45h45);
  __m128i a5b5c5d5e5f5g5h5 = _mm_unpackhi_epi64(a45b45c45d45, e45f45g45h45);
  __m128i a6b6c6d6e6f6g6h6 = _mm_unpacklo_epi64(a67b67c67d67, e67f67g67h67);
  __m128i a7b7c7d7e7f7g7h7 = _mm_unpackhi_epi64(a67b67c67d67, e67f67g67h67);

  kernel.packet[0].x = a0b0c0d0e0f0g0h0;
  kernel.packet[1].x = a1b1c1d1e1f1g1h1;
  kernel.packet[2].x = a2b2c2d2e2f2g2h2;
  kernel.packet[3].x = a3b3c3d3e3f3g3h3;
  kernel.packet[4].x = a4b4c4d4e4f4g4h4;
  kernel.packet[5].x = a5b5c5d5e5f5g5h5;
  kernel.packet[6].x = a6b6c6d6e6f6g6h6;
  kernel.packet[7].x = a7b7c7d7e7f7g7h7;
}

EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8h,4>& kernel) {
  EIGEN_ALIGN32 Eigen::half in[4][8];
  pstore<Eigen::half>(in[0], kernel.packet[0]);
  pstore<Eigen::half>(in[1], kernel.packet[1]);
  pstore<Eigen::half>(in[2], kernel.packet[2]);
  pstore<Eigen::half>(in[3], kernel.packet[3]);

  EIGEN_ALIGN32 Eigen::half out[4][8];

  for (int i = 0; i < 4; ++i) {
    for (int j = 0; j < 4; ++j) {
      out[i][j] = in[j][2*i];
    }
    for (int j = 0; j < 4; ++j) {
      out[i][j+4] = in[j][2*i+1];
    }
  }

  kernel.packet[0] = pload<Packet8h>(out[0]);
  kernel.packet[1] = pload<Packet8h>(out[1]);
  kernel.packet[2] = pload<Packet8h>(out[2]);
  kernel.packet[3] = pload<Packet8h>(out[3]);
}

// Disable the following code since it's broken on too many platforms / compilers.
//#elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC)
#elif 0

typedef struct {
  __m64 x;
} Packet4h;

template<> struct is_arithmetic<Packet4h> { enum { value = true }; };

template<> struct packet_traits<Eigen::half> : default_packet_traits {
  typedef Packet4h type;
  // There is no half-size packet for Packet4h.
  typedef Packet4h half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 4,
    HasHalfPacket = 0,
    HasAdd       = 0,
    HasSub       = 0,
    HasMul       = 0,
    HasNegate    = 0,
    HasAbs       = 0,
    HasAbs2      = 0,
    HasMin       = 0,
    HasMax       = 0,
    HasConj      = 0,
    HasSetLinear = 0,
    HasDiv       = 0,
    HasSqrt      = 0,
    HasRsqrt     = 0,
    HasExp       = 0,
    HasLog       = 0,
    HasBlend     = 0
  };
};

template<> struct unpacket_traits<Packet4h> {
  typedef Eigen::half type;
  enum { size = 4, alignment = Aligned16 };
  typedef Packet4h half;
};

template<> EIGEN_STRONG_INLINE Packet4h pset1<Packet4h>(const Eigen::half& from) {
  Packet4h result;
  result.x = _mm_set1_pi16(from.x);
  return result;
}

template<> EIGEN_STRONG_INLINE Eigen::half pfirst<Packet4h>(const Packet4h& from) {
  return half_impl::raw_uint16_to_half(static_cast<unsigned short>(_mm_cvtsi64_si32(from.x)));
}

template<> EIGEN_STRONG_INLINE Packet4h pconj(const Packet4h& a) { return a; }

template<> EIGEN_STRONG_INLINE Packet4h padd<Packet4h>(const Packet4h& a, const Packet4h& b) {
  __int64_t a64 = _mm_cvtm64_si64(a.x);
  __int64_t b64 = _mm_cvtm64_si64(b.x);

  Eigen::half h[4];

  Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64));
  Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64));
  h[0] = ha + hb;
  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16));
  h[1] = ha + hb;
  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32));
  h[2] = ha + hb;
  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48));
  h[3] = ha + hb;
  Packet4h result;
  result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);
  return result;
}

template<> EIGEN_STRONG_INLINE Packet4h pmul<Packet4h>(const Packet4h& a, const Packet4h& b) {
  __int64_t a64 = _mm_cvtm64_si64(a.x);
  __int64_t b64 = _mm_cvtm64_si64(b.x);

  Eigen::half h[4];

  Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64));
  Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64));
  h[0] = ha * hb;
  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16));
  h[1] = ha * hb;
  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32));
  h[2] = ha * hb;
  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48));
  h[3] = ha * hb;
  Packet4h result;
  result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);
  return result;
}

template<> EIGEN_STRONG_INLINE Packet4h pload<Packet4h>(const Eigen::half* from) {
  Packet4h result;
  result.x = _mm_cvtsi64_m64(*reinterpret_cast<const __int64_t*>(from));
  return result;
}

template<> EIGEN_STRONG_INLINE Packet4h ploadu<Packet4h>(const Eigen::half* from) {
  Packet4h result;
  result.x = _mm_cvtsi64_m64(*reinterpret_cast<const __int64_t*>(from));
  return result;
}

template<> EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet4h& from) {
  __int64_t r = _mm_cvtm64_si64(from.x);
  *(reinterpret_cast<__int64_t*>(to)) = r;
}

template<> EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet4h& from) {
  __int64_t r = _mm_cvtm64_si64(from.x);
  *(reinterpret_cast<__int64_t*>(to)) = r;
}

template<> EIGEN_STRONG_INLINE Packet4h ploadquad<Packet4h>(const Eigen::half* from) {
  return pset1<Packet4h>(*from);
}

template<> EIGEN_STRONG_INLINE Packet4h pgather<Eigen::half, Packet4h>(const Eigen::half* from, Index stride) {
  Packet4h result;
  result.x = _mm_set_pi16(from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x);
  return result;
}

template<> EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet4h>(Eigen::half* to, const Packet4h& from, Index stride) {
  __int64_t a = _mm_cvtm64_si64(from.x);
  to[stride*0].x = static_cast<unsigned short>(a);
  to[stride*1].x = static_cast<unsigned short>(a >> 16);
  to[stride*2].x = static_cast<unsigned short>(a >> 32);
  to[stride*3].x = static_cast<unsigned short>(a >> 48);
}

EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4h,4>& kernel) {
  __m64 T0 = _mm_unpacklo_pi16(kernel.packet[0].x, kernel.packet[1].x);
  __m64 T1 = _mm_unpacklo_pi16(kernel.packet[2].x, kernel.packet[3].x);
  __m64 T2 = _mm_unpackhi_pi16(kernel.packet[0].x, kernel.packet[1].x);
  __m64 T3 = _mm_unpackhi_pi16(kernel.packet[2].x, kernel.packet[3].x);

  kernel.packet[0].x = _mm_unpacklo_pi32(T0, T1);
  kernel.packet[1].x = _mm_unpackhi_pi32(T0, T1);
  kernel.packet[2].x = _mm_unpacklo_pi32(T2, T3);
  kernel.packet[3].x = _mm_unpackhi_pi32(T2, T3);
}

#endif

}
}

#endif // EIGEN_PACKET_MATH_HALF_CUDA_H
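A hedged sketch of how Eigen's evaluators would drive the primitives defined in the file above; ploadu/padd/pstoreu are the internal packet API, and packet_traits selects the half packet type for whichever ISA branch is enabled (this is internal API, so treat it as illustrative only):

#include <Eigen/Core>

void add_inplace(Eigen::half* x, const Eigen::half* y, int n) {
  using namespace Eigen::internal;
  typedef packet_traits<Eigen::half>::type P;    // e.g. Packet8h on AVX
  const int k = unpacket_traits<P>::size;
  int i = 0;
  for (; i + k <= n; i += k)                     // vectorized body
    pstoreu(x + i, padd(ploadu<P>(x + i), ploadu<P>(y + i)));
  for (; i < n; ++i)                             // scalar tail
    x[i] = Eigen::half(float(x[i]) + float(y[i]));
}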
external/eigen3/Eigen/src/Core/arch/CUDA/TypeCasting.h
0 → 100644
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_TYPE_CASTING_CUDA_H
#define EIGEN_TYPE_CASTING_CUDA_H

namespace Eigen {
namespace internal {

template<>
struct scalar_cast_op<float, Eigen::half> {
  EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
  typedef Eigen::half result_type;
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const float& a) const {
    #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
      return __float2half(a);
    #else
      return Eigen::half(a);
    #endif
  }
};

template<>
struct functor_traits<scalar_cast_op<float, Eigen::half> >
{ enum { Cost = NumTraits<float>::AddCost, PacketAccess = false }; };

template<>
struct scalar_cast_op<int, Eigen::half> {
  EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
  typedef Eigen::half result_type;
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const int& a) const {
    #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
      return __float2half(static_cast<float>(a));
    #else
      return Eigen::half(static_cast<float>(a));
    #endif
  }
};

template<>
struct functor_traits<scalar_cast_op<int, Eigen::half> >
{ enum { Cost = NumTraits<float>::AddCost, PacketAccess = false }; };

template<>
struct scalar_cast_op<Eigen::half, float> {
  EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
  typedef float result_type;
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator() (const Eigen::half& a) const {
    #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
      return __half2float(a);
    #else
      return static_cast<float>(a);
    #endif
  }
};

template<>
struct functor_traits<scalar_cast_op<Eigen::half, float> >
{ enum { Cost = NumTraits<float>::AddCost, PacketAccess = false }; };
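These functors are what Eigen's cast expressions invoke per coefficient: on device they route to the CUDA fp16 intrinsics, on host to Eigen::half's own conversions. A host-side usage sketch (hypothetical values; 1.5 is exactly representable in fp16, so the round trip is lossless):

#include <Eigen/Core>

void cast_demo() {
  Eigen::internal::scalar_cast_op<float, Eigen::half> to_half;
  Eigen::internal::scalar_cast_op<Eigen::half, float> to_float;
  Eigen::half h = to_half(1.5f);   // float -> half
  float f = to_float(h);           // half  -> float, exactly 1.5f here
}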
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300

template<>
struct type_casting_traits<Eigen::half, float> {
  enum {
    VectorizedCast = 1,
    SrcCoeffRatio = 2,
    TgtCoeffRatio = 1
  };
};

template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcast<half2, float4>(const half2& a, const half2& b) {
  float2 r1 = __half22float2(a);
  float2 r2 = __half22float2(b);
  return make_float4(r1.x, r1.y, r2.x, r2.y);
}

template<>
struct type_casting_traits<float, Eigen::half> {
  enum {
    VectorizedCast = 1,
    SrcCoeffRatio = 1,
    TgtCoeffRatio = 2
  };
};

template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcast<float4, half2>(const float4& a) {
  // Simply discard the second half of the input
  return __floats2half2_rn(a.x, a.y);
}

#elif defined EIGEN_VECTORIZE_AVX512

template<>
struct type_casting_traits<half, float> {
  enum {
    VectorizedCast = 1,
    SrcCoeffRatio = 1,
    TgtCoeffRatio = 1
  };
};

template<> EIGEN_STRONG_INLINE Packet16f pcast<Packet16h, Packet16f>(const Packet16h& a) {
  return half2float(a);
}

template<>
struct type_casting_traits<float, half> {
  enum {
    VectorizedCast = 1,
    SrcCoeffRatio = 1,
    TgtCoeffRatio = 1
  };
};

template<> EIGEN_STRONG_INLINE Packet16h pcast<Packet16f, Packet16h>(const Packet16f& a) {
  return float2half(a);
}

#elif defined EIGEN_VECTORIZE_AVX

template<>
struct type_casting_traits<Eigen::half, float> {
  enum {
    VectorizedCast = 1,
    SrcCoeffRatio = 1,
    TgtCoeffRatio = 1
  };
};

template<> EIGEN_STRONG_INLINE Packet8f pcast<Packet8h, Packet8f>(const Packet8h& a) {
  return half2float(a);
}

template<>
struct type_casting_traits<float, Eigen::half> {
  enum {
    VectorizedCast = 1,
    SrcCoeffRatio = 1,
    TgtCoeffRatio = 1
  };
};

template<> EIGEN_STRONG_INLINE Packet8h pcast<Packet8f, Packet8h>(const Packet8f& a) {
  return float2half(a);
}

// Disable the following code since it's broken on too many platforms / compilers.
//#elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC)
#elif 0

template<>
struct type_casting_traits<Eigen::half, float> {
  enum {
    VectorizedCast = 1,
    SrcCoeffRatio = 1,
    TgtCoeffRatio = 1
  };
};

template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4h, Packet4f>(const Packet4h& a) {
  __int64_t a64 = _mm_cvtm64_si64(a.x);
  Eigen::half h = raw_uint16_to_half(static_cast<unsigned short>(a64));
  float f1 = static_cast<float>(h);
  h = raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
  float f2 = static_cast<float>(h);
  h = raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
  float f3 = static_cast<float>(h);
  h = raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
  float f4 = static_cast<float>(h);
  return _mm_set_ps(f4, f3, f2, f1);
}

template<>
struct type_casting_traits<float, Eigen::half> {
  enum {
    VectorizedCast = 1,
    SrcCoeffRatio = 1,
    TgtCoeffRatio = 1
  };
};

template<> EIGEN_STRONG_INLINE Packet4h pcast<Packet4f, Packet4h>(const Packet4f& a) {
  EIGEN_ALIGN16 float aux[4];
  pstore(aux, a);
  Eigen::half h0(aux[0]);
  Eigen::half h1(aux[1]);
  Eigen::half h2(aux[2]);
  Eigen::half h3(aux[3]);

  Packet4h result;
  result.x = _mm_set_pi16(h3.x, h2.x, h1.x, h0.x);
  return result;
}

#endif

} // end namespace internal

} // end namespace Eigen

#endif // EIGEN_TYPE_CASTING_CUDA_H
external/eigen3/Eigen/src/Core/arch/Default/CMakeLists.txt
deleted 100644 → 0

FILE(GLOB Eigen_Core_arch_Default_SRCS "*.h")

INSTALL(FILES
  ${Eigen_Core_arch_Default_SRCS}
  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Core/arch/Default COMPONENT Devel
  )
external/eigen3/Eigen/src/Core/arch/NEON/CMakeLists.txt
deleted 100644 → 0

FILE(GLOB Eigen_Core_arch_NEON_SRCS "*.h")

INSTALL(FILES
  ${Eigen_Core_arch_NEON_SRCS}
  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Core/arch/NEON COMPONENT Devel
  )
external/eigen3/Eigen/src/Core/arch/NEON/Complex.h
View file @
a394b22a
...
...
@@ -2,6 +2,7 @@
// for linear algebra.
//
// Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr>
// Copyright (C) 2010 Konstantinos Margaritis <markos@freevec.org>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
...
...
@@ -14,8 +15,21 @@ namespace Eigen {
namespace
internal
{
static
uint32x4_t
p4ui_CONJ_XOR
=
EIGEN_INIT_NEON_PACKET4
(
0x00000000
,
0x80000000
,
0x00000000
,
0x80000000
);
static
uint32x2_t
p2ui_CONJ_XOR
=
EIGEN_INIT_NEON_PACKET2
(
0x00000000
,
0x80000000
);
inline
uint32x4_t
p4ui_CONJ_XOR
()
{
// See bug 1325, clang fails to call vld1q_u64.
#if EIGEN_COMP_CLANG
uint32x4_t
ret
=
{
0x00000000
,
0x80000000
,
0x00000000
,
0x80000000
};
return
ret
;
#else
static
const
uint32_t
conj_XOR_DATA
[]
=
{
0x00000000
,
0x80000000
,
0x00000000
,
0x80000000
};
return
vld1q_u32
(
conj_XOR_DATA
);
#endif
}
inline
uint32x2_t
p2ui_CONJ_XOR
()
{
static
const
uint32_t
conj_XOR_DATA
[]
=
{
0x00000000
,
0x80000000
};
return
vld1_u32
(
conj_XOR_DATA
);
}
//---------- float ----------
struct
Packet2cf
...
...
@@ -28,10 +42,12 @@ struct Packet2cf
template
<
>
struct
packet_traits
<
std
::
complex
<
float
>
>
:
default_packet_traits
{
typedef
Packet2cf
type
;
typedef
Packet2cf
half
;
enum
{
Vectorizable
=
1
,
AlignedOnScalar
=
1
,
size
=
2
,
HasHalfPacket
=
0
,
HasAdd
=
1
,
HasSub
=
1
,
...
...
@@ -46,7 +62,7 @@ template<> struct packet_traits<std::complex<float> > : default_packet_traits
};
};
template
<
>
struct
unpacket_traits
<
Packet2cf
>
{
typedef
std
::
complex
<
float
>
type
;
enum
{
size
=
2
}
;
};
template
<
>
struct
unpacket_traits
<
Packet2cf
>
{
typedef
std
::
complex
<
float
>
type
;
enum
{
size
=
2
,
alignment
=
Aligned16
};
typedef
Packet2cf
half
;
};
template
<
>
EIGEN_STRONG_INLINE
Packet2cf
pset1
<
Packet2cf
>
(
const
std
::
complex
<
float
>&
from
)
{
...
...
@@ -62,7 +78,7 @@ template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Pa
template
<
>
EIGEN_STRONG_INLINE
Packet2cf
pconj
(
const
Packet2cf
&
a
)
{
Packet4ui
b
=
vreinterpretq_u32_f32
(
a
.
v
);
return
Packet2cf
(
vreinterpretq_f32_u32
(
veorq_u32
(
b
,
p4ui_CONJ_XOR
)));
return
Packet2cf
(
vreinterpretq_f32_u32
(
veorq_u32
(
b
,
p4ui_CONJ_XOR
()
)));
}
template
<
>
EIGEN_STRONG_INLINE
Packet2cf
pmul
<
Packet2cf
>
(
const
Packet2cf
&
a
,
const
Packet2cf
&
b
)
...
...
@@ -71,14 +87,14 @@ template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, con
// Get the real values of a | a1_re | a1_re | a2_re | a2_re |
v1
=
vcombine_f32
(
vdup_lane_f32
(
vget_low_f32
(
a
.
v
),
0
),
vdup_lane_f32
(
vget_high_f32
(
a
.
v
),
0
));
// Get the
real
values of a | a1_im | a1_im | a2_im | a2_im |
// Get the
imag
values of a | a1_im | a1_im | a2_im | a2_im |
v2
=
vcombine_f32
(
vdup_lane_f32
(
vget_low_f32
(
a
.
v
),
1
),
vdup_lane_f32
(
vget_high_f32
(
a
.
v
),
1
));
// Multiply the real a with b
v1
=
vmulq_f32
(
v1
,
b
.
v
);
// Multiply the imag a with b
v2
=
vmulq_f32
(
v2
,
b
.
v
);
// Conjugate v2
v2
=
vreinterpretq_f32_u32
(
veorq_u32
(
vreinterpretq_u32_f32
(
v2
),
p4ui_CONJ_XOR
));
v2
=
vreinterpretq_f32_u32
(
veorq_u32
(
vreinterpretq_u32_f32
(
v2
),
p4ui_CONJ_XOR
()
));
// Swap real/imag elements in v2.
v2
=
vrev64q_f32
(
v2
);
// Add and return the result
...
...
@@ -87,7 +103,7 @@ template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, con
template
<
>
EIGEN_STRONG_INLINE
Packet2cf
pand
<
Packet2cf
>
(
const
Packet2cf
&
a
,
const
Packet2cf
&
b
)
{
return
Packet2cf
(
vreinterpretq_f32_u32
(
v
orr
q_u32
(
vreinterpretq_u32_f32
(
a
.
v
),
vreinterpretq_u32_f32
(
b
.
v
))));
return
Packet2cf
(
vreinterpretq_f32_u32
(
v
and
q_u32
(
vreinterpretq_u32_f32
(
a
.
v
),
vreinterpretq_u32_f32
(
b
.
v
))));
}
template
<
>
EIGEN_STRONG_INLINE
Packet2cf
por
<
Packet2cf
>
(
const
Packet2cf
&
a
,
const
Packet2cf
&
b
)
{
...
...
@@ -110,6 +126,22 @@ template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<
template
<
>
EIGEN_STRONG_INLINE
void
pstore
<
std
::
complex
<
float
>
>
(
std
::
complex
<
float
>
*
to
,
const
Packet2cf
&
from
)
{
EIGEN_DEBUG_ALIGNED_STORE
pstore
((
float
*
)
to
,
from
.
v
);
}
template
<
>
EIGEN_STRONG_INLINE
void
pstoreu
<
std
::
complex
<
float
>
>
(
std
::
complex
<
float
>
*
to
,
const
Packet2cf
&
from
)
{
EIGEN_DEBUG_UNALIGNED_STORE
pstoreu
((
float
*
)
to
,
from
.
v
);
}
template
<
>
EIGEN_DEVICE_FUNC
inline
Packet2cf
pgather
<
std
::
complex
<
float
>
,
Packet2cf
>
(
const
std
::
complex
<
float
>*
from
,
Index
stride
)
{
Packet4f
res
=
pset1
<
Packet4f
>
(
0.
f
);
res
=
vsetq_lane_f32
(
std
::
real
(
from
[
0
*
stride
]),
res
,
0
);
res
=
vsetq_lane_f32
(
std
::
imag
(
from
[
0
*
stride
]),
res
,
1
);
res
=
vsetq_lane_f32
(
std
::
real
(
from
[
1
*
stride
]),
res
,
2
);
res
=
vsetq_lane_f32
(
std
::
imag
(
from
[
1
*
stride
]),
res
,
3
);
return
Packet2cf
(
res
);
}
template
<
>
EIGEN_DEVICE_FUNC
inline
void
pscatter
<
std
::
complex
<
float
>
,
Packet2cf
>
(
std
::
complex
<
float
>*
to
,
const
Packet2cf
&
from
,
Index
stride
)
{
to
[
stride
*
0
]
=
std
::
complex
<
float
>
(
vgetq_lane_f32
(
from
.
v
,
0
),
vgetq_lane_f32
(
from
.
v
,
1
));
to
[
stride
*
1
]
=
std
::
complex
<
float
>
(
vgetq_lane_f32
(
from
.
v
,
2
),
vgetq_lane_f32
(
from
.
v
,
3
));
}
template
<
>
EIGEN_STRONG_INLINE
void
prefetch
<
std
::
complex
<
float
>
>
(
const
std
::
complex
<
float
>
*
addr
)
{
EIGEN_ARM_PREFETCH
((
float
*
)
addr
);
}
template
<
>
EIGEN_STRONG_INLINE
std
::
complex
<
float
>
pfirst
<
Packet2cf
>
(
const
Packet2cf
&
a
)
...
...
@@ -177,7 +209,7 @@ template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const P
// Multiply the imag a with b
v2
=
vmul_f32
(
v2
,
a2
);
// Conjugate v2
v2
=
vreinterpret_f32_u32
(
veor_u32
(
vreinterpret_u32_f32
(
v2
),
p2ui_CONJ_XOR
));
v2
=
vreinterpret_f32_u32
(
veor_u32
(
vreinterpret_u32_f32
(
v2
),
p2ui_CONJ_XOR
()
));
// Swap real/imag elements in v2.
v2
=
vrev64_f32
(
v2
);
// Add v1, v2
...
...
@@ -235,7 +267,7 @@ template<> struct conj_helper<Packet2cf, Packet2cf, true,true>
template
<
>
EIGEN_STRONG_INLINE
Packet2cf
pdiv
<
Packet2cf
>
(
const
Packet2cf
&
a
,
const
Packet2cf
&
b
)
{
// TODO optimize it for
AltiVec
// TODO optimize it for
NEON
Packet2cf
res
=
conj_helper
<
Packet2cf
,
Packet2cf
,
false
,
true
>
().
pmul
(
a
,
b
);
Packet4f
s
,
rev_s
;
...
...
@@ -246,6 +278,207 @@ template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, con
return
Packet2cf
(
pdiv
(
res
.
v
,
vaddq_f32
(
s
,
rev_s
)));
}
EIGEN_DEVICE_FUNC
inline
void
ptranspose
(
PacketBlock
<
Packet2cf
,
2
>&
kernel
)
{
Packet4f
tmp
=
vcombine_f32
(
vget_high_f32
(
kernel
.
packet
[
0
].
v
),
vget_high_f32
(
kernel
.
packet
[
1
].
v
));
kernel
.
packet
[
0
].
v
=
vcombine_f32
(
vget_low_f32
(
kernel
.
packet
[
0
].
v
),
vget_low_f32
(
kernel
.
packet
[
1
].
v
));
kernel
.
packet
[
1
].
v
=
tmp
;
}
//---------- double ----------
#if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG
// See bug 1325, clang fails to call vld1q_u64.
#if EIGEN_COMP_CLANG
static
uint64x2_t
p2ul_CONJ_XOR
=
{
0x0
,
0x8000000000000000
};
#else
const
uint64_t
p2ul_conj_XOR_DATA
[]
=
{
0x0
,
0x8000000000000000
};
static
uint64x2_t
p2ul_CONJ_XOR
=
vld1q_u64
(
p2ul_conj_XOR_DATA
);
#endif
struct
Packet1cd
{
EIGEN_STRONG_INLINE
Packet1cd
()
{}
EIGEN_STRONG_INLINE
explicit
Packet1cd
(
const
Packet2d
&
a
)
:
v
(
a
)
{}
Packet2d
v
;
};
template
<
>
struct
packet_traits
<
std
::
complex
<
double
>
>
:
default_packet_traits
{
typedef
Packet1cd
type
;
typedef
Packet1cd
half
;
enum
{
Vectorizable
=
1
,
AlignedOnScalar
=
0
,
size
=
1
,
HasHalfPacket
=
0
,
HasAdd
=
1
,
HasSub
=
1
,
HasMul
=
1
,
HasDiv
=
1
,
HasNegate
=
1
,
HasAbs
=
0
,
HasAbs2
=
0
,
HasMin
=
0
,
HasMax
=
0
,
HasSetLinear
=
0
};
};
template
<
>
struct
unpacket_traits
<
Packet1cd
>
{
typedef
std
::
complex
<
double
>
type
;
enum
{
size
=
1
,
alignment
=
Aligned16
};
typedef
Packet1cd
half
;
};
template
<
>
EIGEN_STRONG_INLINE
Packet1cd
pload
<
Packet1cd
>
(
const
std
::
complex
<
double
>*
from
)
{
EIGEN_DEBUG_ALIGNED_LOAD
return
Packet1cd
(
pload
<
Packet2d
>
((
const
double
*
)
from
));
}
template
<
>
EIGEN_STRONG_INLINE
Packet1cd
ploadu
<
Packet1cd
>
(
const
std
::
complex
<
double
>*
from
)
{
EIGEN_DEBUG_UNALIGNED_LOAD
return
Packet1cd
(
ploadu
<
Packet2d
>
((
const
double
*
)
from
));
}
template
<
>
EIGEN_STRONG_INLINE
Packet1cd
pset1
<
Packet1cd
>
(
const
std
::
complex
<
double
>&
from
)
{
/* here we really have to use unaligned loads :( */
return
ploadu
<
Packet1cd
>
(
&
from
);
}
template
<
>
EIGEN_STRONG_INLINE
Packet1cd
padd
<
Packet1cd
>
(
const
Packet1cd
&
a
,
const
Packet1cd
&
b
)
{
return
Packet1cd
(
padd
<
Packet2d
>
(
a
.
v
,
b
.
v
));
}
template
<
>
EIGEN_STRONG_INLINE
Packet1cd
psub
<
Packet1cd
>
(
const
Packet1cd
&
a
,
const
Packet1cd
&
b
)
{
return
Packet1cd
(
psub
<
Packet2d
>
(
a
.
v
,
b
.
v
));
}
template
<
>
EIGEN_STRONG_INLINE
Packet1cd
pnegate
(
const
Packet1cd
&
a
)
{
return
Packet1cd
(
pnegate
<
Packet2d
>
(
a
.
v
));
}
template
<
>
EIGEN_STRONG_INLINE
Packet1cd
pconj
(
const
Packet1cd
&
a
)
{
return
Packet1cd
(
vreinterpretq_f64_u64
(
veorq_u64
(
vreinterpretq_u64_f64
(
a
.
v
),
p2ul_CONJ_XOR
)));
}
template
<
>
EIGEN_STRONG_INLINE
Packet1cd
pmul
<
Packet1cd
>
(
const
Packet1cd
&
a
,
const
Packet1cd
&
b
)
{
Packet2d
v1
,
v2
;
// Get the real values of a
v1
=
vdupq_lane_f64
(
vget_low_f64
(
a
.
v
),
0
);
// Get the imag values of a
v2
=
vdupq_lane_f64
(
vget_high_f64
(
a
.
v
),
0
);
// Multiply the real a with b
v1
=
vmulq_f64
(
v1
,
b
.
v
);
// Multiply the imag a with b
v2
=
vmulq_f64
(
v2
,
b
.
v
);
// Conjugate v2
v2
=
vreinterpretq_f64_u64
(
veorq_u64
(
vreinterpretq_u64_f64
(
v2
),
p2ul_CONJ_XOR
));
// Swap real/imag elements in v2.
v2
=
preverse
<
Packet2d
>
(
v2
);
// Add and return the result
return
Packet1cd
(
vaddq_f64
(
v1
,
v2
));
}
template
<
>
EIGEN_STRONG_INLINE
Packet1cd
pand
<
Packet1cd
>
(
const
Packet1cd
&
a
,
const
Packet1cd
&
b
)
{
return
Packet1cd
(
vreinterpretq_f64_u64
(
vandq_u64
(
vreinterpretq_u64_f64
(
a
.
v
),
vreinterpretq_u64_f64
(
b
.
v
))));
}
template
<
>
EIGEN_STRONG_INLINE Packet1cd por<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
{
  return Packet1cd(vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(a.v),
                                                   vreinterpretq_u64_f64(b.v))));
}
template<> EIGEN_STRONG_INLINE Packet1cd pxor<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
{
  return Packet1cd(vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a.v),
                                                   vreinterpretq_u64_f64(b.v))));
}
template<> EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
{
  return Packet1cd(vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a.v),
                                                   vreinterpretq_u64_f64(b.v))));
}

template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from) { return pset1<Packet1cd>(*from); }

template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double>* to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); }
template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double>* to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); }

template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double>* addr) { EIGEN_ARM_PREFETCH((double*)addr); }

template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from, Index stride)
{
  Packet2d res = pset1<Packet2d>(0.0);
  res = vsetq_lane_f64(std::real(from[0*stride]), res, 0);
  res = vsetq_lane_f64(std::imag(from[0*stride]), res, 1);
  return Packet1cd(res);
}

template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from, Index stride)
{
  to[stride*0] = std::complex<double>(vgetq_lane_f64(from.v, 0), vgetq_lane_f64(from.v, 1));
}

template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a)
{
  std::complex<double> EIGEN_ALIGN16 res;
  pstore<std::complex<double> >(&res, a);
  return res;
}

template<> EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { return a; }

template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a) { return pfirst(a); }

template<> EIGEN_STRONG_INLINE Packet1cd preduxp<Packet1cd>(const Packet1cd* vecs) { return vecs[0]; }

template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a) { return pfirst(a); }

template<int Offset>
struct palign_impl<Offset, Packet1cd>
{
  static EIGEN_STRONG_INLINE void run(Packet1cd& /*first*/, const Packet1cd& /*second*/)
  {
    // FIXME is it sure we never have to align a Packet1cd?
    // Even though a std::complex<double> has 16 bytes, it is not necessarily aligned on a 16 bytes boundary...
  }
};

template<> struct conj_helper<Packet1cd, Packet1cd, false, true>
{
  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
  { return padd(pmul(x, y), c); }

  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
  { return internal::pmul(a, pconj(b)); }
};

template<> struct conj_helper<Packet1cd, Packet1cd, true, false>
{
  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
  { return padd(pmul(x, y), c); }

  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
  { return internal::pmul(pconj(a), b); }
};

template<> struct conj_helper<Packet1cd, Packet1cd, true, true>
{
  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
  { return padd(pmul(x, y), c); }

  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
  { return pconj(internal::pmul(a, b)); }
};

template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
{
  // TODO optimize it for NEON
  Packet1cd res = conj_helper<Packet1cd, Packet1cd, false, true>().pmul(a, b);
  Packet2d s = pmul<Packet2d>(b.v, b.v);
  Packet2d rev_s = preverse<Packet2d>(s);

  return Packet1cd(pdiv(res.v, padd<Packet2d>(s, rev_s)));
}

EIGEN_STRONG_INLINE Packet1cd pcplxflip/*<Packet1cd>*/(const Packet1cd& x)
{
  return Packet1cd(preverse(Packet2d(x.v)));
}

EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet1cd, 2>& kernel)
{
  Packet2d tmp = vcombine_f64(vget_high_f64(kernel.packet[0].v), vget_high_f64(kernel.packet[1].v));
  kernel.packet[0].v = vcombine_f64(vget_low_f64(kernel.packet[0].v), vget_low_f64(kernel.packet[1].v));
  kernel.packet[1].v = tmp;
}

#endif // EIGEN_ARCH_ARM64

} // end namespace internal

} // end namespace Eigen
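Aside on the pdiv<Packet1cd> kernel above: it vectorizes the textbook identity a/b = (a*conj(b)) / |b|^2, with the |b|^2 denominator built from b.v*b.v plus its lane reverse. A minimal scalar sketch of the same computation (my illustration, not part of the diff; the helper name div_via_conj is hypothetical):

#include <complex>
#include <cstdio>

// Scalar sketch of the conj-multiply division used by pdiv<Packet1cd>:
// a/b = (a * conj(b)) / |b|^2, where |b|^2 = re(b)^2 + im(b)^2.
static std::complex<double> div_via_conj(std::complex<double> a, std::complex<double> b)
{
  std::complex<double> num = a * std::conj(b);         // conj_helper<...,false,true>::pmul
  double norm = b.real()*b.real() + b.imag()*b.imag(); // pmul(b.v,b.v) plus its reverse, summed per lane
  return std::complex<double>(num.real()/norm, num.imag()/norm);
}

int main()
{
  std::complex<double> q = div_via_conj({1.0, 2.0}, {3.0, -4.0});
  std::printf("%g%+gi\n", q.real(), q.imag()); // -0.2+0.4i, matching (1+2i)/(3-4i)
  return 0;
}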
...
...
external/eigen3/Eigen/src/Core/arch/NEON/MathFunctions.h
0 → 100644
View file @
a394b22a
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

/* The sin, cos, exp, and log functions of this file come from
 * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
 */

#ifndef EIGEN_MATH_FUNCTIONS_NEON_H
#define EIGEN_MATH_FUNCTIONS_NEON_H

namespace Eigen {

namespace internal {

template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet4f pexp<Packet4f>(const Packet4f& _x)
{
  Packet4f x = _x;
  Packet4f tmp, fx;

  _EIGEN_DECLARE_CONST_Packet4f(1, 1.0f);
  _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
  _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
  _EIGEN_DECLARE_CONST_Packet4f(exp_hi, 88.3762626647950f);
  _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f);
  _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f);
  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f);
  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f);
  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f);
  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f);
  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f);
  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f);
  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f);
  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f);

  x = vminq_f32(x, p4f_exp_hi);
  x = vmaxq_f32(x, p4f_exp_lo);

  /* express exp(x) as exp(g + n*log(2)) */
  fx = vmlaq_f32(p4f_half, x, p4f_cephes_LOG2EF);

  /* perform a floorf */
  tmp = vcvtq_f32_s32(vcvtq_s32_f32(fx));

  /* if greater, subtract 1 */
  Packet4ui mask = vcgtq_f32(tmp, fx);
  mask = vandq_u32(mask, vreinterpretq_u32_f32(p4f_1));

  fx = vsubq_f32(tmp, vreinterpretq_f32_u32(mask));

  tmp = vmulq_f32(fx, p4f_cephes_exp_C1);
  Packet4f z = vmulq_f32(fx, p4f_cephes_exp_C2);
  x = vsubq_f32(x, tmp);
  x = vsubq_f32(x, z);

  Packet4f y = vmulq_f32(p4f_cephes_exp_p0, x);
  z = vmulq_f32(x, x);
  y = vaddq_f32(y, p4f_cephes_exp_p1);
  y = vmulq_f32(y, x);
  y = vaddq_f32(y, p4f_cephes_exp_p2);
  y = vmulq_f32(y, x);
  y = vaddq_f32(y, p4f_cephes_exp_p3);
  y = vmulq_f32(y, x);
  y = vaddq_f32(y, p4f_cephes_exp_p4);
  y = vmulq_f32(y, x);
  y = vaddq_f32(y, p4f_cephes_exp_p5);
  y = vmulq_f32(y, z);
  y = vaddq_f32(y, x);
  y = vaddq_f32(y, p4f_1);

  /* build 2^n */
  int32x4_t mm;
  mm = vcvtq_s32_f32(fx);
  mm = vaddq_s32(mm, p4i_0x7f);
  mm = vshlq_n_s32(mm, 23);
  Packet4f pow2n = vreinterpretq_f32_s32(mm);

  y = vmulq_f32(y, pow2n);
  return y;
}

} // end namespace internal

} // end namespace Eigen

#endif // EIGEN_MATH_FUNCTIONS_NEON_H
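The pexp kernel above follows the classic Cephes recipe: clamp x, split it as exp(x) = 2^n * exp(g) with n ~= round(x*log2(e)), evaluate a degree-6 polynomial for exp(g), and synthesize 2^n by writing the biased exponent n+127 straight into the float's exponent bits. A scalar sketch of those steps with the same constants (illustration only; it omits the exp_hi/exp_lo clamping and the floor correction that the vector code performs):

#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstring>

// Scalar sketch of the range reduction used by pexp<Packet4f> above.
static float exp_sketch(float x)
{
  // n = floor(x * log2(e) + 0.5): how many powers of two to split off.
  float fx = std::floor(x * 1.44269504088896341f + 0.5f);
  // g = x - n*ln(2), with ln(2) split into C1 + C2 for extra precision.
  float g = x - fx * 0.693359375f - fx * -2.12194440e-4f;
  // Degree-6 polynomial approximation of exp(g) on the reduced range.
  float y = 1.9875691500e-4f;
  y = y * g + 1.3981999507e-3f;
  y = y * g + 8.3334519073e-3f;
  y = y * g + 4.1665795894e-2f;
  y = y * g + 1.6666665459e-1f;
  y = y * g + 5.0000001201e-1f;
  y = y * g * g + g + 1.0f;
  // Build 2^n by placing the biased exponent (n + 127) into the exponent bits.
  int32_t n = (int32_t)fx;
  int32_t bits = (n + 0x7f) << 23;
  float pow2n;
  std::memcpy(&pow2n, &bits, sizeof(pow2n));
  return y * pow2n;
}

int main()
{
  std::printf("%f vs %f\n", exp_sketch(1.5f), std::exp(1.5f));
  return 0;
}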
external/eigen3/Eigen/src/Core/arch/NEON/PacketMath.h
View file @
a394b22a
...
...
@@ -2,7 +2,7 @@
 // for linear algebra.
 //
 // Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
-// Copyright (C) 2010 Konstantinos Margaritis <markos@codex.gr>
+// Copyright (C) 2010 Konstantinos Margaritis <markos@freevec.org>
 // Heavily based on Gael's SSE version.
 //
 // This Source Code Form is subject to the terms of the Mozilla
...
...
@@ -20,43 +20,48 @@ namespace internal {
 #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
 #endif

-// FIXME NEON has 16 quad registers, but since the current register allocator
-// is so bad, it is much better to reduce it to 8
 #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
 #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
 #endif

 #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
 #define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
 #endif

 #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
-#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 8
+#if EIGEN_ARCH_ARM64
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
+#else
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16
+#endif
 #endif

+typedef float32x2_t Packet2f;
 typedef float32x4_t Packet4f;
 typedef int32x4_t   Packet4i;
+typedef int32x2_t   Packet2i;
 typedef uint32x4_t  Packet4ui;

 #define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
   const Packet4f p4f_##NAME = pset1<Packet4f>(X)

 #define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
-  const Packet4f p4f_##NAME = vreinterpretq_f32_u32(pset1<int>(X))
+  const Packet4f p4f_##NAME = vreinterpretq_f32_u32(pset1<int32_t>(X))

 #define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
   const Packet4i p4i_##NAME = pset1<Packet4i>(X)

-#if defined(__llvm__) && !defined(__clang__)
-  //Special treatment for Apple's llvm-gcc, its NEON packet types are unions
-  #define EIGEN_INIT_NEON_PACKET2(X, Y)       {{X, Y}}
-  #define EIGEN_INIT_NEON_PACKET4(X, Y, Z, W) {{X, Y, Z, W}}
-#else
-  //Default initializer for packets
-  #define EIGEN_INIT_NEON_PACKET2(X, Y)       {X, Y}
-  #define EIGEN_INIT_NEON_PACKET4(X, Y, Z, W) {X, Y, Z, W}
-#endif

 // arm64 does have the pld instruction. If available, let's trust the __builtin_prefetch built-in function
 // which available on LLVM and GCC (at least)
-#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || defined(__GNUC__)
+#if EIGEN_ARCH_ARM64
+// __builtin_prefetch tends to do nothing on ARM64 compilers because the
+// prefetch instructions there are too detailed for __builtin_prefetch to map
+// meaningfully to them.
+#define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__("prfm pldl1keep, [%[addr]]\n" ::[addr] "r"(ADDR) : );
+#elif EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
   #define EIGEN_ARM_PREFETCH(ADDR) __builtin_prefetch(ADDR);
 #elif defined __pld
   #define EIGEN_ARM_PREFETCH(ADDR) __pld(ADDR)
-#elif !defined(__aarch64__)
-  #define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__ ( "   pld [%[addr]]\n" :: [addr] "r" (ADDR) : "cc" );
+#elif EIGEN_ARCH_ARM32
  #define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__ ( "   pld [%[addr]]\n" :: [addr] "r" (ADDR) : );
 #else
   // by default no explicit prefetching
   #define EIGEN_ARM_PREFETCH(ADDR)
...
...
@@ -65,53 +70,60 @@ typedef uint32x4_t Packet4ui;
 template<> struct packet_traits<float>  : default_packet_traits
 {
   typedef Packet4f type;
   typedef Packet4f half; // Packet2f intrinsics not implemented yet
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 4,
     HasHalfPacket = 0, // Packet2f intrinsics not implemented yet

     HasDiv  = 1,
     // FIXME check the Has*
     HasSin  = 0,
     HasCos  = 0,
     HasLog  = 0,
-    HasExp  = 0,
+    HasExp  = 1,
     HasSqrt = 0
   };
 };

-template<> struct packet_traits<int>     : default_packet_traits
+template<> struct packet_traits<int32_t> : default_packet_traits
 {
   typedef Packet4i type;
   typedef Packet4i half; // Packet2i intrinsics not implemented yet
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
-    size = 4
+    size = 4,
+    HasHalfPacket = 0 // Packet2i intrinsics not implemented yet
     // FIXME check the Has*
   };
 };

-#if EIGEN_GNUC_AT_MOST(4,4) && !defined(__llvm__)
+#if EIGEN_GNUC_AT_MOST(4,4) && !EIGEN_COMP_LLVM
 // workaround gcc 4.2, 4.3 and 4.4 compilation issue
 EIGEN_STRONG_INLINE float32x4_t vld1q_f32(const float* x) { return ::vld1q_f32((const float32_t*)x); }
 EIGEN_STRONG_INLINE float32x2_t vld1_f32(const float* x) { return ::vld1_f32((const float32_t*)x); }
 EIGEN_STRONG_INLINE float32x2_t vld1_dup_f32(const float* x) { return ::vld1_dup_f32((const float32_t*)x); }
 EIGEN_STRONG_INLINE void        vst1q_f32(float* to, float32x4_t from) { ::vst1q_f32((float32_t*)to, from); }
 EIGEN_STRONG_INLINE void        vst1_f32(float* to, float32x2_t from) { ::vst1_f32((float32_t*)to, from); }
 #endif

-template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4}; };
-template<> struct unpacket_traits<Packet4i> { typedef int   type; enum {size=4}; };
+template<> struct unpacket_traits<Packet4f> { typedef float   type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; };
+template<> struct unpacket_traits<Packet4i> { typedef int32_t type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; };

 template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { return vdupq_n_f32(from); }
-template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) { return vdupq_n_s32(from); }
+template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int32_t& from) { return vdupq_n_s32(from); }

-template<> EIGEN_STRONG_INLINE Packet4f plset<float>(const float& a)
+template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a)
 {
-  Packet4f countdown = EIGEN_INIT_NEON_PACKET4(0, 1, 2, 3);
+  const float f[] = {0, 1, 2, 3};
+  Packet4f countdown = vld1q_f32(f);
   return vaddq_f32(pset1<Packet4f>(a), countdown);
 }
-template<> EIGEN_STRONG_INLINE Packet4i plset<int>(const int& a)
+template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int32_t& a)
 {
-  Packet4i countdown = EIGEN_INIT_NEON_PACKET4(0, 1, 2, 3);
+  const int32_t i[] = {0, 1, 2, 3};
+  Packet4i countdown = vld1q_s32(i);
   return vaddq_s32(pset1<Packet4i>(a), countdown);
 }
...
...
@@ -132,6 +144,9 @@ template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const
 template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
 {
+#if EIGEN_ARCH_ARM64
+  return vdivq_f32(a,b);
+#else
   Packet4f inv, restep, div;

   // NEON does not offer a divide instruction, we have to do a reciprocal approximation
...
...
@@ -150,14 +165,51 @@ template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const
   div = vmulq_f32(a, inv);

   return div;
+#endif
 }

 template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/)
 { eigen_assert(false && "packet integer division are not supported by NEON");
   return pset1<Packet4i>(0);
 }

-// for some weird raisons, it has to be overloaded for packet of integers
-template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vmlaq_f32(c,a,b); }
+// Clang/ARM wrongly advertises __ARM_FEATURE_FMA even when it's not available,
+// then implements a slow software scalar fallback calling fmaf()!
+// Filed LLVM bug:
+//  https://llvm.org/bugs/show_bug.cgi?id=27216
+#if (defined __ARM_FEATURE_FMA) && !(EIGEN_COMP_CLANG && EIGEN_ARCH_ARM)
+// See bug 936.
+// FMA is available on VFPv4 i.e. when compiling with -mfpu=neon-vfpv4.
+// FMA is a true fused multiply-add i.e. only 1 rounding at the end, no intermediate rounding.
+// MLA is not fused i.e. does 2 roundings.
+// In addition to giving better accuracy, FMA also gives better performance here on a Krait (Nexus 4):
+// MLA: 10 GFlop/s ; FMA: 12 GFlops/s.
+template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vfmaq_f32(c,a,b); }
+#else
+template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c)
+{
+#if EIGEN_COMP_CLANG && EIGEN_ARCH_ARM
+  // Clang/ARM will replace VMLA by VMUL+VADD at least for some values of -mcpu,
+  // at least -mcpu=cortex-a8 and -mcpu=cortex-a7. Since the former is the default on
+  // -march=armv7-a, that is a very common case.
+  // See e.g. this thread:
+  //  http://lists.llvm.org/pipermail/llvm-dev/2013-December/068806.html
+  // Filed LLVM bug:
+  //  https://llvm.org/bugs/show_bug.cgi?id=27219
+  Packet4f r = c;
+  asm volatile(
+    "vmla.f32 %q[r], %q[a], %q[b]"
+    : [r] "+w" (r)
+    : [a] "w" (a),
+      [b] "w" (b)
+    : );
+  return r;
+#else
+  return vmlaq_f32(c,a,b);
+#endif
+}
+#endif
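The comments above are about rounding, and the effect is easy to observe in plain C++: a fused multiply-add rounds once, an MLA-style multiply-then-add rounds twice. A small self-contained check (illustrative, not from the diff; compile with -ffp-contract=off so the compiler does not fuse a*a+c on its own; hex float literals need C++17):

#include <cmath>
#include <cstdio>

int main()
{
  // a*a = 1 + 2^-11 + 2^-24 is not exactly representable in float, so the
  // unfused multiply rounds it to 1 + 2^-11 before the add can see it.
  float a = 1.0f + 0x1.0p-12f;    // 1 + 2^-12
  float c = -(1.0f + 0x1.0p-11f); // -(1 + 2^-11)

  float unfused = a * a + c;         // two roundings, like NEON MLA: gives 0
  float fused   = std::fma(a, a, c); // one rounding, like NEON FMA: gives 2^-24

  std::printf("unfused = %a\nfused   = %a\n", unfused, fused);
  return 0;
}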
+// No FMA instruction for int, so use MLA unconditionally.
 template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return vmlaq_s32(c,a,b); }

 template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return vminq_f32(a,b); }
...
...
@@ -191,20 +243,20 @@ template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, con
 }
 template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vbicq_s32(a,b); }

-template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(from); }
-template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from); }
+template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(from); }
+template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int32_t* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from); }

-template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f32(from); }
-template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s32(from); }
+template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f32(from); }
+template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int32_t* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s32(from); }

-template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
+template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
 {
   float32x2_t lo, hi;
   lo = vld1_dup_f32(from);
   hi = vld1_dup_f32(from+1);
   return vcombine_f32(lo, hi);
 }
-template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from)
+template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int32_t* from)
 {
   int32x2_t lo, hi;
   lo = vld1_dup_s32(from);
...
...
@@ -212,18 +264,52 @@ template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from)
   return vcombine_s32(lo, hi);
 }

-template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(to, from); }
-template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(to, from); }
+template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(to, from); }
+template<> EIGEN_STRONG_INLINE void pstore<int32_t>(int32_t* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(to, from); }

-template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to, from); }
-template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to, from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to, from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<int32_t>(int32_t* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to, from); }

+template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
+{
+  Packet4f res = pset1<Packet4f>(0.f);
+  res = vsetq_lane_f32(from[0*stride], res, 0);
+  res = vsetq_lane_f32(from[1*stride], res, 1);
+  res = vsetq_lane_f32(from[2*stride], res, 2);
+  res = vsetq_lane_f32(from[3*stride], res, 3);
+  return res;
+}
+template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int32_t, Packet4i>(const int32_t* from, Index stride)
+{
+  Packet4i res = pset1<Packet4i>(0);
+  res = vsetq_lane_s32(from[0*stride], res, 0);
+  res = vsetq_lane_s32(from[1*stride], res, 1);
+  res = vsetq_lane_s32(from[2*stride], res, 2);
+  res = vsetq_lane_s32(from[3*stride], res, 3);
+  return res;
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
+{
+  to[stride*0] = vgetq_lane_f32(from, 0);
+  to[stride*1] = vgetq_lane_f32(from, 1);
+  to[stride*2] = vgetq_lane_f32(from, 2);
+  to[stride*3] = vgetq_lane_f32(from, 3);
+}
+template<> EIGEN_DEVICE_FUNC inline void pscatter<int32_t, Packet4i>(int32_t* to, const Packet4i& from, Index stride)
+{
+  to[stride*0] = vgetq_lane_s32(from, 0);
+  to[stride*1] = vgetq_lane_s32(from, 1);
+  to[stride*2] = vgetq_lane_s32(from, 2);
+  to[stride*3] = vgetq_lane_s32(from, 3);
+}
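For reference, the pgather/pscatter specializations above implement a simple contract: lane i of the packet maps to memory element i*stride. A scalar model (illustration only; gather4/scatter4 are hypothetical names):

#include <cstdio>

// Scalar model of pgather<float,Packet4f> / pscatter<float,Packet4f>:
// lane i of the packet corresponds to memory element from[i*stride] / to[i*stride].
static void gather4(const float* from, long stride, float out[4])
{
  for (int i = 0; i < 4; ++i) out[i] = from[i * stride];
}

static void scatter4(float* to, long stride, const float in[4])
{
  for (int i = 0; i < 4; ++i) to[i * stride] = in[i];
}

int main()
{
  // Gather one column (stride = row length) of a 4x3 row-major matrix.
  float m[12] = { 0,1,2, 10,11,12, 20,21,22, 30,31,32 };
  float col[4];
  gather4(m + 1, 3, col);   // second column: 1, 11, 21, 31
  std::printf("%g %g %g %g\n", col[0], col[1], col[2], col[3]);
  scatter4(m + 1, 3, col);  // writes the same values back, unchanged
  return 0;
}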
-template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { EIGEN_ARM_PREFETCH(addr); }
-template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { EIGEN_ARM_PREFETCH(addr); }
+template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { EIGEN_ARM_PREFETCH(addr); }
+template<> EIGEN_STRONG_INLINE void prefetch<int32_t>(const int32_t* addr) { EIGEN_ARM_PREFETCH(addr); }

 // FIXME only store the 2 first elements ?
-template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; vst1q_f32(x, a); return x[0]; }
-template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { int EIGEN_ALIGN16 x[4]; vst1q_s32(x, a); return x[0]; }
+template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; vst1q_f32(x, a); return x[0]; }
+template<> EIGEN_STRONG_INLINE int32_t pfirst<Packet4i>(const Packet4i& a) { int32_t EIGEN_ALIGN16 x[4]; vst1q_s32(x, a); return x[0]; }

 template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) {
   float32x2_t a_lo, a_hi;
...
...
@@ -243,6 +329,7 @@ template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) {
   a_hi = vget_high_s32(a_r64);
   return vcombine_s32(a_hi, a_lo);
 }

 template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vabsq_f32(a); }
 template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vabsq_s32(a); }
...
...
@@ -277,7 +364,7 @@ template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
   return sum;
 }

-template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
+template<> EIGEN_STRONG_INLINE int32_t predux<Packet4i>(const Packet4i& a)
 {
   int32x2_t a_lo, a_hi, sum;
...
...
@@ -324,7 +411,7 @@ template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
   return vget_lane_f32(prod, 0);
 }

-template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
+template<> EIGEN_STRONG_INLINE int32_t predux_mul<Packet4i>(const Packet4i& a)
 {
   int32x2_t a_lo, a_hi, prod;
...
...
@@ -352,7 +439,7 @@ template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
   return vget_lane_f32(min, 0);
 }

-template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
+template<> EIGEN_STRONG_INLINE int32_t predux_min<Packet4i>(const Packet4i& a)
 {
   int32x2_t a_lo, a_hi, min;
...
...
@@ -377,7 +464,7 @@ template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
   return vget_lane_f32(max, 0);
 }

-template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
+template<> EIGEN_STRONG_INLINE int32_t predux_max<Packet4i>(const Packet4i& a)
 {
   int32x2_t a_lo, a_hi, max;
...
...
@@ -410,9 +497,231 @@ PALIGN_NEON(0,Packet4i,vextq_s32)
 PALIGN_NEON(1,Packet4i,vextq_s32)
 PALIGN_NEON(2,Packet4i,vextq_s32)
 PALIGN_NEON(3,Packet4i,vextq_s32)
 #undef PALIGN_NEON

 EIGEN_DEVICE_FUNC inline void
 ptranspose(PacketBlock<Packet4f,4>& kernel) {
   float32x4x2_t tmp1 = vzipq_f32(kernel.packet[0], kernel.packet[1]);
   float32x4x2_t tmp2 = vzipq_f32(kernel.packet[2], kernel.packet[3]);

   kernel.packet[0] = vcombine_f32(vget_low_f32(tmp1.val[0]), vget_low_f32(tmp2.val[0]));
   kernel.packet[1] = vcombine_f32(vget_high_f32(tmp1.val[0]), vget_high_f32(tmp2.val[0]));
   kernel.packet[2] = vcombine_f32(vget_low_f32(tmp1.val[1]), vget_low_f32(tmp2.val[1]));
   kernel.packet[3] = vcombine_f32(vget_high_f32(tmp1.val[1]), vget_high_f32(tmp2.val[1]));
 }

 EIGEN_DEVICE_FUNC inline void
 ptranspose(PacketBlock<Packet4i,4>& kernel) {
   int32x4x2_t tmp1 = vzipq_s32(kernel.packet[0], kernel.packet[1]);
   int32x4x2_t tmp2 = vzipq_s32(kernel.packet[2], kernel.packet[3]);
   kernel.packet[0] = vcombine_s32(vget_low_s32(tmp1.val[0]), vget_low_s32(tmp2.val[0]));
   kernel.packet[1] = vcombine_s32(vget_high_s32(tmp1.val[0]), vget_high_s32(tmp2.val[0]));
   kernel.packet[2] = vcombine_s32(vget_low_s32(tmp1.val[1]), vget_low_s32(tmp2.val[1]));
   kernel.packet[3] = vcombine_s32(vget_high_s32(tmp1.val[1]), vget_high_s32(tmp2.val[1]));
 }
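The two ptranspose kernels above build a 4x4 transpose from two rounds of zips: vzipq interleaves row pairs, then vcombine of low/high halves assembles the columns. A scalar trace of the same dataflow (plain arrays standing in for NEON registers; illustration only):

#include <cstdio>

// Scalar model of the two-stage NEON transpose. zip4(a,b) mimics vzipq_f32:
// lo = {a0,b0,a1,b1} and hi = {a2,b2,a3,b3}.
static void zip4(const float a[4], const float b[4], float lo[4], float hi[4])
{
  lo[0]=a[0]; lo[1]=b[0]; lo[2]=a[1]; lo[3]=b[1];
  hi[0]=a[2]; hi[1]=b[2]; hi[2]=a[3]; hi[3]=b[3];
}

int main()
{
  float r[4][4] = {{0,1,2,3},{4,5,6,7},{8,9,10,11},{12,13,14,15}};
  float t1lo[4], t1hi[4], t2lo[4], t2hi[4];
  zip4(r[0], r[1], t1lo, t1hi);  // tmp1 = vzipq_f32(packet[0], packet[1])
  zip4(r[2], r[3], t2lo, t2hi);  // tmp2 = vzipq_f32(packet[2], packet[3])

  // vcombine of matching low/high halves picks out the final columns.
  float out[4][4] = {
    { t1lo[0], t1lo[1], t2lo[0], t2lo[1] },  // column 0: 0,4,8,12
    { t1lo[2], t1lo[3], t2lo[2], t2lo[3] },  // column 1: 1,5,9,13
    { t1hi[0], t1hi[1], t2hi[0], t2hi[1] },  // column 2: 2,6,10,14
    { t1hi[2], t1hi[3], t2hi[2], t2hi[3] },  // column 3: 3,7,11,15
  };
  for (int i = 0; i < 4; ++i)
    std::printf("%g %g %g %g\n", out[i][0], out[i][1], out[i][2], out[i][3]);
  return 0;
}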
 //---------- double ----------

 // Clang 3.5 in the iOS toolchain has an ICE triggered by NEON intrinsics for double.
 // Confirmed at least with __apple_build_version__ = 6000054.
 #ifdef __apple_build_version__
 // Let's hope that by the time __apple_build_version__ hits the 601* range, the bug will be fixed.
 // https://gist.github.com/yamaya/2924292 suggests that the 3 first digits are only updated with
 // major toolchain updates.
 #define EIGEN_APPLE_DOUBLE_NEON_BUG (__apple_build_version__ < 6010000)
 #else
 #define EIGEN_APPLE_DOUBLE_NEON_BUG 0
 #endif

 #if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG

 // Bug 907: workaround missing declarations of the following two functions in the ADK
 // Defining these functions as templates ensures that if these intrinsics are
 // already defined in arm_neon.h, then our workaround doesn't cause a conflict
 // and has lower priority in overload resolution.
 template <typename T>
 uint64x2_t vreinterpretq_u64_f64(T a)
 {
   return (uint64x2_t) a;
 }

 template <typename T>
 float64x2_t vreinterpretq_f64_u64(T a)
 {
   return (float64x2_t) a;
 }

 typedef float64x2_t Packet2d;
 typedef float64x1_t Packet1d;

 template<> struct packet_traits<double>  : default_packet_traits
 {
   typedef Packet2d type;
   typedef Packet2d half;
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 2,
     HasHalfPacket = 0,

     HasDiv  = 1,
     // FIXME check the Has*
     HasSin  = 0,
     HasCos  = 0,
     HasLog  = 0,
     HasExp  = 0,
     HasSqrt = 0
   };
 };

 template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; };

 template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return vdupq_n_f64(from); }

 template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a)
 {
   const double countdown_raw[] = {0.0, 1.0};
   const Packet2d countdown = vld1q_f64(countdown_raw);
   return vaddq_f64(pset1<Packet2d>(a), countdown);
 }

 template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return vaddq_f64(a,b); }

 template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return vsubq_f64(a,b); }

 template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return vnegq_f64(a); }

 template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }

 template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return vmulq_f64(a,b); }

 template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return vdivq_f64(a,b); }

 #ifdef __ARM_FEATURE_FMA
 // See bug 936. See above comment about FMA for float.
 template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vfmaq_f64(c,a,b); }
 #else
 template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vmlaq_f64(c,a,b); }
 #endif

 template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return vminq_f64(a,b); }

 template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return vmaxq_f64(a,b); }

 // Logical Operations are not supported for float, so we have to reinterpret casts using NEON intrinsics
 template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b)
 {
   return vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(a), vreinterpretq_u64_f64(b)));
 }

 template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b)
 {
   return vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(a), vreinterpretq_u64_f64(b)));
 }

 template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b)
 {
   return vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a), vreinterpretq_u64_f64(b)));
 }

 template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b)
 {
   return vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a), vreinterpretq_u64_f64(b)));
 }

 template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f64(from); }

 template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f64(from); }

 template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) { return vld1q_dup_f64(from); }

 template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_f64(to, from); }

 template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f64(to, from); }

 template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride)
 {
   Packet2d res = pset1<Packet2d>(0.0);
   res = vsetq_lane_f64(from[0*stride], res, 0);
   res = vsetq_lane_f64(from[1*stride], res, 1);
   return res;
 }
 template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
 {
   to[stride*0] = vgetq_lane_f64(from, 0);
   to[stride*1] = vgetq_lane_f64(from, 1);
 }

 template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_ARM_PREFETCH(addr); }

 // FIXME only store the 2 first elements ?
 template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { return vgetq_lane_f64(a, 0); }

 template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) { return vcombine_f64(vget_high_f64(a), vget_low_f64(a)); }

 template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vabsq_f64(a); }

 #if EIGEN_COMP_CLANG && defined(__apple_build_version__)
 // workaround ICE, see bug 907
 template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { return (vget_low_f64(a) + vget_high_f64(a))[0]; }
 #else
 template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { return vget_lane_f64(vget_low_f64(a) + vget_high_f64(a), 0); }
 #endif

 template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
 {
   float64x2_t trn1, trn2;

   // NEON zip performs interleaving of the supplied vectors.
   // We perform two interleaves in a row to acquire the transposed vector
   trn1 = vzip1q_f64(vecs[0], vecs[1]);
   trn2 = vzip2q_f64(vecs[0], vecs[1]);

   // Do the addition of the resulting vectors
   return vaddq_f64(trn1, trn2);
 }
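preduxp<Packet2d> above reduces two packets at once: vzip1q/vzip2q transpose the 2x2 block formed by vecs[0] and vecs[1], after which a single vector add produces both horizontal sums. A scalar trace (illustration only):

#include <cstdio>

int main()
{
  // vecs[0] = {a0, a1}, vecs[1] = {b0, b1}
  double a0 = 1, a1 = 2, b0 = 10, b1 = 20;

  // vzip1q_f64 / vzip2q_f64: transpose the 2x2 block.
  double trn1[2] = { a0, b0 };
  double trn2[2] = { a1, b1 };

  // One vaddq_f64 then yields both horizontal sums at once.
  double sum[2] = { trn1[0] + trn2[0], trn1[1] + trn2[1] };
  std::printf("{%g, %g}\n", sum[0], sum[1]); // {a0+a1, b0+b1} = {3, 30}
  return 0;
}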
 // Other reduction functions:
 // mul
 #if EIGEN_COMP_CLANG && defined(__apple_build_version__)
 template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) { return (vget_low_f64(a) * vget_high_f64(a))[0]; }
 #else
 template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) { return vget_lane_f64(vget_low_f64(a) * vget_high_f64(a), 0); }
 #endif

 // min
 template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a) { return vgetq_lane_f64(vpminq_f64(a, a), 0); }

 // max
 template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a) { return vgetq_lane_f64(vpmaxq_f64(a, a), 0); }

 // this PALIGN_NEON business is to work around a bug in LLVM Clang 3.0 causing incorrect compilation errors,
 // see bug 347 and this LLVM bug: http://llvm.org/bugs/show_bug.cgi?id=11074
 #define PALIGN_NEON(Offset,Type,Command) \
 template<>\
 struct palign_impl<Offset,Type>\
 {\
     EIGEN_STRONG_INLINE static void run(Type& first, const Type& second)\
     {\
         if (Offset!=0)\
             first = Command(first, second, Offset);\
     }\
 };\

 PALIGN_NEON(0, Packet2d, vextq_f64)
 PALIGN_NEON(1, Packet2d, vextq_f64)
 #undef PALIGN_NEON
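palign_impl shifts the concatenation of two packets left by Offset lanes, which on NEON is exactly vext; the macro exists only to stamp out one specialization per offset while dodging the LLVM 3.0 bug cited in the comment above. A scalar model of the Offset=1 case for a 2-lane packet (illustration only; palign1 is a hypothetical name):

#include <cstdio>

// Scalar model of palign_impl<1, Packet2d>::run: first = vextq_f64(first, second, 1),
// i.e. take lanes {first[1], second[0]} from the concatenation first|second.
static void palign1(double first[2], const double second[2])
{
  first[0] = first[1];
  first[1] = second[0];
}

int main()
{
  double f[2] = {1, 2}, s[2] = {3, 4};
  palign1(f, s);
  std::printf("{%g, %g}\n", f[0], f[1]); // {2, 3}
  return 0;
}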
 EIGEN_DEVICE_FUNC inline void
 ptranspose(PacketBlock<Packet2d,2>& kernel) {
   float64x2_t trn1 = vzip1q_f64(kernel.packet[0], kernel.packet[1]);
   float64x2_t trn2 = vzip2q_f64(kernel.packet[0], kernel.packet[1]);

   kernel.packet[0] = trn1;
   kernel.packet[1] = trn2;
 }
 #endif // EIGEN_ARCH_ARM64

 } // end namespace internal

 } // end namespace Eigen
...
...
external/eigen3/Eigen/src/Core/arch/SSE/CMakeLists.txt
deleted
100644 → 0
View file @ 701c0225
FILE(GLOB Eigen_Core_arch_SSE_SRCS "*.h")

INSTALL(FILES
  ${Eigen_Core_arch_SSE_SRCS}
  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Core/arch/SSE COMPONENT Devel
  )
external/eigen3/Eigen/src/Core/arch/SSE/Complex.h
View file @ a394b22a
...
...
@@ -22,13 +22,18 @@ struct Packet2cf
   __m128  v;
 };

+// Use the packet_traits defined in AVX/PacketMath.h instead if we're going
+// to leverage AVX instructions.
+#ifndef EIGEN_VECTORIZE_AVX
 template<> struct packet_traits<std::complex<float> >  : default_packet_traits
 {
   typedef Packet2cf type;
   typedef Packet2cf half;
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 2,
     HasHalfPacket = 0,

     HasAdd    = 1,
     HasSub    = 1,
...
...
@@ -39,11 +44,13 @@ template<> struct packet_traits<std::complex<float> > : default_packet_traits
     HasAbs2   = 0,
     HasMin    = 0,
     HasMax    = 0,
-    HasSetLinear = 0
+    HasSetLinear = 0,
+    HasBlend     = 1
   };
 };
+#endif

-template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; };
+template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2, alignment=Aligned16}; typedef Packet2cf half; };

 template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_add_ps(a.v, b.v)); }
 template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_sub_ps(a.v, b.v)); }
...
...
@@ -60,7 +67,6 @@ template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a)
 template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
 {
-  // TODO optimize it for SSE3 and 4
   #ifdef EIGEN_VECTORIZE_SSE3
   return Packet2cf(_mm_addsub_ps(_mm_mul_ps(_mm_moveldup_ps(a.v), b.v),
                                  _mm_mul_ps(_mm_movehdup_ps(a.v),
...
...
@@ -104,8 +110,23 @@ template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<flo
 template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from) { return pset1<Packet2cf>(*from); }

-template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float>* to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), from.v); }
-template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float>* to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), from.v); }
+template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float>* to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), Packet4f(from.v)); }
+template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float>* to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), Packet4f(from.v)); }

+template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, Index stride)
+{
+  return Packet2cf(_mm_set_ps(std::imag(from[1*stride]), std::real(from[1*stride]),
+                              std::imag(from[0*stride]), std::real(from[0*stride])));
+}
+template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, Index stride)
+{
+  to[stride*0] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 0)),
+                                     _mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 1)));
+  to[stride*1] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 2)),
+                                     _mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 3)));
+}

 template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float>* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
...
...
@@ -124,7 +145,7 @@ template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Pack
 #endif
 }

-template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) { return Packet2cf(_mm_castpd_ps(preverse(_mm_castps_pd(a.v)))); }
+template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) { return Packet2cf(_mm_castpd_ps(preverse(Packet2d(_mm_castps_pd(a.v))))); }

 template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a)
 {
...
@@ -214,7 +235,7 @@ template<> struct conj_helper<Packet4f, Packet2cf, false,false>
{
return
padd
(
c
,
pmul
(
x
,
y
));
}
EIGEN_STRONG_INLINE
Packet2cf
pmul
(
const
Packet4f
&
x
,
const
Packet2cf
&
y
)
const
{
return
Packet2cf
(
Eigen
::
internal
::
pmul
(
x
,
y
.
v
));
}
{
return
Packet2cf
(
Eigen
::
internal
::
pmul
<
Packet4f
>
(
x
,
y
.
v
));
}
};
template
<
>
struct
conj_helper
<
Packet2cf
,
Packet4f
,
false
,
false
>
...
...
@@ -223,7 +244,7 @@ template<> struct conj_helper<Packet2cf, Packet4f, false,false>
   { return padd(c, pmul(x,y)); }

   EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& x, const Packet4f& y) const
-  { return Packet2cf(Eigen::internal::pmul(x.v, y)); }
+  { return Packet2cf(Eigen::internal::pmul<Packet4f>(x.v, y)); }
 };

 template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
...
...
@@ -234,7 +255,7 @@ template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, con
   return Packet2cf(_mm_div_ps(res.v, _mm_add_ps(s, _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(s), 0xb1)))));
 }

 EIGEN_STRONG_INLINE Packet2cf pcplxflip/*<Packet2cf>*/(const Packet2cf& x)
 {
   return Packet2cf(vec4f_swizzle1(x.v, 1, 0, 3, 2));
 }
...
...
@@ -248,13 +269,18 @@ struct Packet1cd
   __m128d v;
 };

+// Use the packet_traits defined in AVX/PacketMath.h instead if we're going
+// to leverage AVX instructions.
+#ifndef EIGEN_VECTORIZE_AVX
 template<> struct packet_traits<std::complex<double> >  : default_packet_traits
 {
   typedef Packet1cd type;
   typedef Packet1cd half;
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 0,
     size = 1,
     HasHalfPacket = 0,

     HasAdd    = 1,
     HasSub    = 1,
...
...
@@ -268,12 +294,13 @@ template<> struct packet_traits<std::complex<double> > : default_packet_traits
     HasSetLinear = 0
   };
 };
+#endif

-template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1}; };
+template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16}; typedef Packet1cd half; };

 template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_add_pd(a.v, b.v)); }
 template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_sub_pd(a.v, b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(a.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(Packet2d(a.v))); }
 template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a)
 {
   const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0));
...
...
@@ -282,9 +309,8 @@ template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a)
 template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
 {
-  // TODO optimize it for SSE3 and 4
   #ifdef EIGEN_VECTORIZE_SSE3
-  return Packet1cd(_mm_addsub_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v),
+  return Packet1cd(_mm_addsub_pd(_mm_mul_pd(_mm_movedup_pd(a.v), b.v),
                                  _mm_mul_pd(vec2d_swizzle1(a.v, 1, 1),
                                             vec2d_swizzle1(b.v, 1, 0))));
   #else
...
...
@@ -311,8 +337,8 @@ template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<dou
 template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from) { return pset1<Packet1cd>(*from); }

 // FIXME force unaligned store, this is a temporary fix
-template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double>* to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); }
-template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double>* to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); }
+template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double>* to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, Packet2d(from.v)); }
+template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double>* to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, Packet2d(from.v)); }

 template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double>* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
...
...
@@ -410,7 +436,7 @@ template<> struct conj_helper<Packet2d, Packet1cd, false,false>
   { return padd(c, pmul(x,y)); }

   EIGEN_STRONG_INLINE Packet1cd pmul(const Packet2d& x, const Packet1cd& y) const
-  { return Packet1cd(Eigen::internal::pmul(x, y.v)); }
+  { return Packet1cd(Eigen::internal::pmul<Packet2d>(x, y.v)); }
 };

 template<> struct conj_helper<Packet1cd, Packet2d, false,false>
...
...
@@ -419,7 +445,7 @@ template<> struct conj_helper<Packet1cd, Packet2d, false,false>
   { return padd(c, pmul(x,y)); }

   EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& x, const Packet2d& y) const
-  { return Packet1cd(Eigen::internal::pmul(x.v, y)); }
+  { return Packet1cd(Eigen::internal::pmul<Packet2d>(x.v, y)); }
 };

 template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
...
...
@@ -430,9 +456,44 @@ template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, con
   return Packet1cd(_mm_div_pd(res.v, _mm_add_pd(s, _mm_shuffle_pd(s, s, 0x1))));
 }

 EIGEN_STRONG_INLINE Packet1cd pcplxflip/*<Packet1cd>*/(const Packet1cd& x)
 {
   return Packet1cd(preverse(Packet2d(x.v)));
 }

 EIGEN_DEVICE_FUNC inline void
 ptranspose(PacketBlock<Packet2cf,2>& kernel) {
   __m128d w1 = _mm_castps_pd(kernel.packet[0].v);
   __m128d w2 = _mm_castps_pd(kernel.packet[1].v);

   __m128 tmp = _mm_castpd_ps(_mm_unpackhi_pd(w1, w2));
   kernel.packet[0].v = _mm_castpd_ps(_mm_unpacklo_pd(w1, w2));
   kernel.packet[1].v = tmp;
 }

 template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) {
   __m128d result = pblend<Packet2d>(ifPacket, _mm_castps_pd(thenPacket.v), _mm_castps_pd(elsePacket.v));
   return Packet2cf(_mm_castpd_ps(result));
 }

 template<> EIGEN_STRONG_INLINE Packet2cf pinsertfirst(const Packet2cf& a, std::complex<float> b)
 {
   return Packet2cf(_mm_loadl_pi(a.v, reinterpret_cast<const __m64*>(&b)));
 }

 template<> EIGEN_STRONG_INLINE Packet1cd pinsertfirst(const Packet1cd&, std::complex<double> b)
 {
   return pset1<Packet1cd>(b);
 }

 template<> EIGEN_STRONG_INLINE Packet2cf pinsertlast(const Packet2cf& a, std::complex<float> b)
 {
   return Packet2cf(_mm_loadh_pi(a.v, reinterpret_cast<const __m64*>(&b)));
 }

 template<> EIGEN_STRONG_INLINE Packet1cd pinsertlast(const Packet1cd&, std::complex<double> b)
 {
   return pset1<Packet1cd>(b);
 }

 } // end namespace internal
...
...
external/eigen3/Eigen/src/Core/arch/SSE/MathFunctions.h
View file @ a394b22a
...
...
@@ -32,7 +32,7 @@ Packet4f plog<Packet4f>(const Packet4f& _x)
   /* the smallest non denormalized float number */
   _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(min_norm_pos, 0x00800000);
   _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_inf,    0xff800000);//-1.f/0.f);

   /* natural logarithm computed for 4 simultaneous float
     return NaN for x <= 0
   */
...
...
@@ -63,7 +63,7 @@ Packet4f plog<Packet4f>(const Packet4f& _x)
   x = _mm_or_ps(x, p4f_half);

   emm0 = _mm_sub_epi32(emm0, p4i_0x7f);
-  Packet4f e = padd(_mm_cvtepi32_ps(emm0), p4f_1);
+  Packet4f e = padd(Packet4f(_mm_cvtepi32_ps(emm0)), p4f_1);

   /* part2:
      if( x < SQRTHF ) {
...
...
@@ -72,9 +72,9 @@ Packet4f plog<Packet4f>(const Packet4f& _x)
      } else { x = x - 1.0; }
   */
   Packet4f mask = _mm_cmplt_ps(x, p4f_cephes_SQRTHF);
-  Packet4f tmp = _mm_and_ps(x, mask);
+  Packet4f tmp = pand(x, mask);
   x = psub(x, p4f_1);
-  e = psub(e, _mm_and_ps(p4f_1, mask));
+  e = psub(e, pand(p4f_1, mask));
   x = padd(x, tmp);

   Packet4f x2 = pmul(x,x);
...
...
@@ -444,32 +444,119 @@ Packet4f pcos<Packet4f>(const Packet4f& _x)
 #if EIGEN_FAST_MATH

-// This is based on Quake3's fast inverse square root.
-// For detail see here: http://www.beyond3d.com/content/articles/8/
-// It lacks 1 (or 2 bits in some rare cases) of precision, and does not handle negative, +inf, or denormalized numbers correctly.
+// Functions for sqrt.
+// The EIGEN_FAST_MATH version uses the _mm_rsqrt_ps approximation and one step
+// of Newton's method, at a cost of 1-2 bits of precision as opposed to the
+// exact solution. It does not handle +inf, or denormalized numbers correctly.
+// The main advantage of this approach is not just speed, but also the fact that
+// it can be inlined and pipelined with other computations, further reducing its
+// effective latency. This is similar to Quake3's fast inverse square root.
+// For detail see here: http://www.beyond3d.com/content/articles/8/
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet4f psqrt<Packet4f>(const Packet4f& _x)
 {
   Packet4f half = pmul(_x, pset1<Packet4f>(.5f));
+  Packet4f denormal_mask = _mm_and_ps(
+      _mm_cmpge_ps(_x, _mm_setzero_ps()),
+      _mm_cmplt_ps(_x, pset1<Packet4f>((std::numeric_limits<float>::min)())));

-  /* select only the inverse sqrt of non-zero inputs */
-  Packet4f non_zero_mask = _mm_cmpge_ps(_x, pset1<Packet4f>((std::numeric_limits<float>::min)()));
-  Packet4f x = _mm_and_ps(non_zero_mask, _mm_rsqrt_ps(_x));
+  // Compute approximate reciprocal sqrt.
+  Packet4f x = _mm_rsqrt_ps(_x);
+  // Do a single step of Newton's iteration.
   x = pmul(x, psub(pset1<Packet4f>(1.5f), pmul(half, pmul(x,x))));
-  return pmul(_x,x);
+  // Flush results for denormals to zero.
+  return _mm_andnot_ps(denormal_mask, pmul(_x,x));
 }

 #else

-template<> EIGEN_STRONG_INLINE Packet4f psqrt<Packet4f>(const Packet4f& x) { return _mm_sqrt_ps(x); }
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f psqrt<Packet4f>(const Packet4f& x) { return _mm_sqrt_ps(x); }

 #endif
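The fast-math psqrt above computes sqrt(x) as x*rsqrt(x), refining the roughly 12-bit _mm_rsqrt_ps estimate with one Newton step y' = y*(1.5 - 0.5*x*y*y). A scalar sketch of that refinement (illustration only; the seed 0.7 stands in for the hardware estimate):

#include <cmath>
#include <cstdio>

// One Newton-Raphson step for f(y) = 1/y^2 - x, whose root is y = 1/sqrt(x):
// y' = y * (1.5 - 0.5 * x * y * y). Roughly doubles the number of correct bits.
static float refine_rsqrt(float x, float y)
{
  return y * (1.5f - 0.5f * x * y * y);
}

int main()
{
  float x = 2.0f;
  float y = 0.7f;             // crude stand-in for _mm_rsqrt_ps(x)
  y = refine_rsqrt(x, y);     // the single step psqrt performs
  float s = x * y;            // sqrt(x) = x * rsqrt(x)
  std::printf("%f vs %f\n", s, std::sqrt(x));
  return 0;
}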
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet2d psqrt<Packet2d>(const Packet2d& x) { return _mm_sqrt_pd(x); }

 #if EIGEN_FAST_MATH

 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet4f prsqrt<Packet4f>(const Packet4f& _x) {
   _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inf, 0x7f800000);
   _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(nan, 0x7fc00000);
   _EIGEN_DECLARE_CONST_Packet4f(one_point_five, 1.5f);
   _EIGEN_DECLARE_CONST_Packet4f(minus_half, -0.5f);
   _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(flt_min, 0x00800000);

   Packet4f neg_half = pmul(_x, p4f_minus_half);

   // select only the inverse sqrt of positive normal inputs (denormals are
   // flushed to zero and cause infs as well).
   Packet4f le_zero_mask = _mm_cmple_ps(_x, p4f_flt_min);
   Packet4f x = _mm_andnot_ps(le_zero_mask, _mm_rsqrt_ps(_x));

   // Fill in NaNs and Infs for the negative/zero entries.
   Packet4f neg_mask = _mm_cmplt_ps(_x, _mm_setzero_ps());
   Packet4f zero_mask = _mm_andnot_ps(neg_mask, le_zero_mask);
   Packet4f infs_and_nans = _mm_or_ps(_mm_and_ps(neg_mask, p4f_nan),
                                      _mm_and_ps(zero_mask, p4f_inf));

   // Do a single step of Newton's iteration.
   x = pmul(x, pmadd(neg_half, pmul(x, x), p4f_one_point_five));

   // Insert NaNs and Infs in all the right places.
   return _mm_or_ps(x, infs_and_nans);
 }

 #else

 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet4f prsqrt<Packet4f>(const Packet4f& x) {
   // Unfortunately we can't use the much faster mm_rsqrt_ps since it only provides an approximation.
   return _mm_div_ps(pset1<Packet4f>(1.0f), _mm_sqrt_ps(x));
 }

 #endif

-template<> EIGEN_STRONG_INLINE Packet2d psqrt<Packet2d>(const Packet2d& x) { return _mm_sqrt_pd(x); }

 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet2d prsqrt<Packet2d>(const Packet2d& x) {
   // Unfortunately we can't use the much faster mm_rsqrt_pd since it only provides an approximation.
   return _mm_div_pd(pset1<Packet2d>(1.0), _mm_sqrt_pd(x));
 }

 // Hyperbolic Tangent function.
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet4f ptanh<Packet4f>(const Packet4f& x)
 {
   return internal::generic_fast_tanh_float(x);
 }

 } // end namespace internal

 namespace numext {

 template<>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float sqrt(const float &x)
 {
   return internal::pfirst(internal::Packet4f(_mm_sqrt_ss(_mm_set_ss(x))));
 }

 template<>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 double sqrt(const double &x)
 {
 #if EIGEN_COMP_GNUC_STRICT
   // This works around a GCC bug generating poor code for _mm_sqrt_pd
   // See https://bitbucket.org/eigen/eigen/commits/14f468dba4d350d7c19c9b93072e19f7b3df563b
   return internal::pfirst(internal::Packet2d(__builtin_ia32_sqrtsd(_mm_set_sd(x))));
 #else
   return internal::pfirst(internal::Packet2d(_mm_sqrt_pd(_mm_set_sd(x))));
 #endif
 }

 } // end namespace numext

 } // end namespace Eigen

 #endif // EIGEN_MATH_FUNCTIONS_SSE_H