/*
* Copyright (c) 2018 by Intinor AB. All rights reserved.
*
* This file is part of "libRaptorQ".
*
* libRaptorQ is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation, either version 3
* of the License, or (at your option) any later version.
*
* libRaptorQ is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* and a copy of the GNU Lesser General Public License
* along with libRaptorQ. If not, see <http://www.gnu.org/licenses/>.
*/
#pragma once
#include "../external/delegates/include/delegate.h"
#include "RaptorQ/v1/util/CPU_Info.hpp"
#include <cassert>
#include <cstdint>
namespace RaptorQ__v1 {
namespace Impl {
// Delegate types through which Matrix_Row dispatches each row primitive.
// Every type wraps a plain function pointer whose signature matches the
// scalar and SIMD implementations that are bound to it.
// NOTE(review): the template arguments were reconstructed from the
// signatures of the functions passed to dlgt::make_delegate -- confirm
// against delegate.h.

// void add(uint8_t *dest, uint8_t *src, int bytes)
typedef dlgt::delegate<void (*)(uint8_t *, uint8_t *, int)> add_delegate;
// void div(uint8_t *data, uint8_t num, int bytes)
typedef dlgt::delegate<void (*)(uint8_t *, uint8_t, int)> div_delegate;
// void multiply_and_add(uint8_t *dest, uint8_t *src, uint8_t num, int bytes)
typedef dlgt::delegate<void (*)(uint8_t *, uint8_t *, uint8_t, int)>
                                                            add_mul_delegate;
// uint32_t non_zero(uint8_t *data, uint32_t start, uint32_t stop)
typedef dlgt::delegate<uint32_t (*)(uint8_t *, uint32_t, uint32_t)>
                                                            non_zero_delegate;
// uint32_t first_non_zero(uint8_t *data, uint32_t start, uint32_t stop)
typedef dlgt::delegate<uint32_t (*)(uint8_t *, uint32_t, uint32_t)>
                                                    first_non_zero_delegate;
namespace Matrix_Row_SIMD {
// Row primitives implemented with SIMD instructions; the definitions live
// in another translation unit.
//
// The SIMD versions compute
//     dest[i] ^= (high[src[i] >> 4] ^ low[src[i] & 0xf]);
// where high and low are 16 byte lookup tables, one for the high four bits
// and one for the low four bits of the source byte.
//
// Note: these are declared outside the class because the gcc 4.9.4 linker
// got into real trouble with LTO otherwise.

// AVX2 variants.
void add_avx2(uint8_t *dest, uint8_t *src, int bytes);
void div_avx2(uint8_t *data, uint8_t num, int bytes);
void multiply_and_add_avx2(uint8_t *data, uint8_t *src, uint8_t num,
                                                                    int bytes);
uint32_t non_zero_avx2(uint8_t *data, uint32_t start, uint32_t stop);
uint32_t first_non_zero_avx2(uint8_t *data, uint32_t start, uint32_t stop);

// SSSE3 variants.
void add_ssse3(uint8_t *dest, uint8_t *src, int bytes);
void div_ssse3(uint8_t *data, uint8_t num, int bytes);
void multiply_and_add_ssse3(uint8_t *data, uint8_t *src, uint8_t num,
                                                                    int bytes);
uint32_t non_zero_ssse3(uint8_t *data, uint32_t start, uint32_t stop);
uint32_t first_non_zero_ssse3(uint8_t *data, uint32_t start, uint32_t stop);
} // namespace Matrix_Row_SIMD
class RAPTORQ_LOCAL Matrix_Row
{
public:
enum class SIMD : uint8_t {
NONE = 0x00,
AUTO = 0x01,
SSSE3 = 0x02,
AVX2 = 0x03,
};
static Matrix_Row & get_instance() {
static Matrix_Row matrix;
return matrix;
}
static void init_simd(SIMD type) {
Matrix_Row::get_instance()._init_simd(type);
}
static void row_multiply_add(uint8_t *dest, uint8_t *src, uint8_t scalar,
int bytes)
{
if (scalar == 1) {
Matrix_Row::get_instance().add_del(dest, src, bytes);
} else {
Matrix_Row::get_instance().add_mul_del(dest, src, scalar, bytes);
}
}
static void row_div(uint8_t *data, uint8_t scalar, int bytes)
{
if (scalar == 0) {
return;
}
Matrix_Row::get_instance().div_del(data, scalar, bytes);
}
static void row_add(uint8_t *dest, uint8_t *src, int bytes)
{
Matrix_Row::get_instance().add_del(dest, src, bytes);
}
static uint32_t row_non_zero(uint8_t *data, uint32_t start, uint32_t stop)
{
return Matrix_Row::get_instance().non_zero_del(data, start, stop);
}
static uint32_t row_first_non_zero(uint8_t *data, uint32_t start,
uint32_t stop)
{
return Matrix_Row::get_instance().first_non_zero_del(data, start, stop);
}
private:
add_delegate add_del = dlgt::make_delegate(&add);
div_delegate div_del = dlgt::make_delegate(&div);
add_mul_delegate add_mul_del = dlgt::make_delegate(&multiply_and_add);
non_zero_delegate non_zero_del = dlgt::make_delegate(&non_zero);
first_non_zero_delegate first_non_zero_del =
dlgt::make_delegate(&first_non_zero);
Matrix_Row() {
_init_simd(SIMD::AUTO);
}
// Ordered from most wanted to least wanted
SIMD get_simd_type()
{
if (CPU_Info::has_avx2()) {
return SIMD::AVX2;
}
if (CPU_Info::has_ssse3()) {
return SIMD::SSSE3;
}
return SIMD::NONE;
}
bool has_simd_type(SIMD type)
{
bool return_value = false; //Better safe than sorry
switch( type )
{
case SIMD::SSSE3:
return_value = CPU_Info::has_ssse3();
break;
case SIMD::AVX2:
return_value = CPU_Info::has_avx2();
break;
default:
return_value = true;
break;
}
return return_value;
}
void _init_simd(SIMD type) {
if (type == SIMD::AUTO || !has_simd_type(type)) {
type = get_simd_type();
}
switch( type )
{
case SIMD::SSSE3:
add_del = dlgt::make_delegate(&Matrix_Row_SIMD::add_ssse3);
div_del = dlgt::make_delegate(&Matrix_Row_SIMD::div_ssse3);
add_mul_del = dlgt::make_delegate(
&Matrix_Row_SIMD::multiply_and_add_ssse3);
non_zero_del = dlgt::make_delegate(
&Matrix_Row_SIMD::non_zero_ssse3);
first_non_zero_del = dlgt::make_delegate(
&Matrix_Row_SIMD::first_non_zero_ssse3);
break;
case SIMD::AVX2:
add_del = dlgt::make_delegate(&Matrix_Row_SIMD::add_avx2);
div_del = dlgt::make_delegate(&Matrix_Row_SIMD::div_avx2);
add_mul_del = dlgt::make_delegate(
&Matrix_Row_SIMD::multiply_and_add_avx2);
non_zero_del = dlgt::make_delegate(&Matrix_Row_SIMD::non_zero_avx2);
first_non_zero_del =dlgt::make_delegate(
&Matrix_Row_SIMD::first_non_zero_avx2);
break;
default:
add_del = dlgt::make_delegate(&add);
div_del = dlgt::make_delegate(&div);
add_mul_del = dlgt::make_delegate(&multiply_and_add);
non_zero_del = dlgt::make_delegate(&non_zero);
first_non_zero_del = dlgt::make_delegate(&first_non_zero);
break;
}
}
static void add(uint8_t *dest, uint8_t *src, int bytes)
{
for (int i = 0; i < bytes;i++) {
dest[i] = dest[i] ^ src[i];
}
}
static void div(uint8_t *data, uint8_t num, int bytes)
{
num = oct_log[num - 1];
for (int i = 0; i < bytes;i++) {
if (data[i] != 0) {
data[i] = oct_exp[oct_log[data[i] - 1] - num + 255];
}
}
}
static void multiply_and_add(uint8_t *dest, uint8_t *src, uint8_t num,
int bytes)
{
if (num == 0) {
return;
}
// TODO: Probably faster with a single lookup based on num, a single
// lookup require an additional lookup table of size 256 * 256.
uint16_t log_num = oct_log_no_if[num];
for (int i = 0; i < bytes;i++) {
dest[i] = dest[i] ^ oct_exp_no_if[oct_log_no_if[src[i]] + log_num];
}
}
static uint32_t non_zero(uint8_t *data, uint32_t start, uint32_t stop)
{
uint32_t non_zero_count = 0;
for (uint32_t j = start; j < stop; j++) {
non_zero_count += (data[j] != 0);
}
return non_zero_count;
}
static uint32_t first_non_zero(uint8_t *data, uint32_t start, uint32_t stop)
{
for (uint32_t j = start; j < stop; j++) {
if (data[j] != 0)
return j;
}
assert(0);
return 0;
}
};
} // namespace Impl
} // namespace RaptorQ__v1