ビットカウント：プリプロセッサマジックとモダンC++

Question

64ビット整数のビットを16ビットのチャンク単位で数えるためのルックアップテーブルを、コンパイル時に構築したいとします。私が知る限り、これを行う唯一の方法は次のコードです。

// Compile-time population-count table built with the preprocessor.
//
// B4(n) lists the bit counts of the four 2-bit values 00, 01, 10, 11 on top
// of a base count n. Every following macro prepends two more bits, so each
// level quadruples the number of generated entries. COUNT_BITS finally
// expands to 65536 values: one entry for every possible 16-bit chunk.
//
// Each directive must sit on its own line: a #define extends to the end of
// the physical line, so chaining them on one line would define only B4.
#define B4(n) n, n + 1, n + 1, n + 2
#define B6(n) B4(n), B4(n + 1), B4(n + 1), B4(n + 2)
#define B8(n) B6(n), B6(n + 1), B6(n + 1), B6(n + 2)
#define B10(n) B8(n), B8(n + 1), B8(n + 1), B8(n + 2)
#define B12(n) B10(n), B10(n + 1), B10(n + 1), B10(n + 2)
#define B14(n) B12(n), B12(n + 1), B12(n + 1), B12(n + 2)
#define B16(n) B14(n), B14(n + 1), B14(n + 1), B14(n + 2)
#define COUNT_BITS B16(0), B16(1), B16(1), B16(2)

// lookup[v] is the number of set bits in the 16-bit value v.
unsigned int lookup[65536] = { COUNT_BITS };

同じ結果を得るための最新の（C++ 11/14）方法はありますか？

Richard Hodges · Accepted Answer

標準ライブラリを使用しないのはなぜですか？

#include <bitset>
#include <cstdint>

// Returns the number of set bits in u.
// Delegating to std::bitset::count() leaves the choice of implementation to
// the standard library and the optimizer.
int bits_in(std::uint64_t u)
{
    const auto bs = std::bitset<64>(u);
    // count() returns std::size_t; a 64-bit set holds at most 64 ones, so the
    // conversion to int cannot lose information.
    return static_cast<int>(bs.count());
}

結果のアセンブラー（_-O2 -march=native_でコンパイル）：

bits_in(unsigned long):
        xor     eax, eax
        popcnt  rax, rdi
        ret

この時点ですべてのx86プロセッサにこの命令があるわけではないため、（少なくともgccでは）コンパイルするアーキテクチャを知らせる必要があります。

@tambreは、実際には、可能であれば、オプティマイザーはさらに先に進むと述べています。

// Results are written through a volatile array so the stores have to be
// emitted and the computed values remain visible in the generated code.
volatile int results[3];

int main()
{
    // Every argument is a compile-time constant.
    results[0] = bits_in(255);
    results[1] = bits_in(1023);
    results[2] = bits_in(0x8000800080008000);
}

結果のアセンブラー：

main:
        mov     DWORD PTR results[rip], 8
        xor     eax, eax
        mov     DWORD PTR results[rip+4], 10
        mov     DWORD PTR results[rip+8], 4
        ret

私のような昔ながらのビット操作好き（bit-twiddler）は、解決すべき新しい問題を見つける必要がありそうです :)

更新

このソリューションがビットカウントの計算をCPUの支援に頼っていることに、誰もが満足したわけではありませんでした。では、テーブルを自動生成しつつ、開発者がテーブルのサイズを設定できるようにしたらどうでしょうか？（警告：16ビットテーブル版はコンパイルに時間がかかります）

#include <array>
#include <bitset>
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <limits>
#include <numeric>
#include <type_traits>
#include <utility>

// Builds a popcount table: element k of the returned array is the number of
// set bits in the k-th value of the index sequence.
// The integral_constant argument only tags the call with the table word
// width; its value is not used in the body.
template<std::size_t Word_size, std::size_t... Is>
constexpr auto generate(std::integral_constant<std::size_t, Word_size>,
                        std::index_sequence<Is...>)
{
    struct popcount_type
    {
        // Each iteration clears the lowest set bit, so the loop runs once per
        // set bit.
        constexpr auto operator()(int i) const
        {
            int bits = 0;
            while (i) {
                i &= i - 1;
                ++bits;
            }
            return bits;
        }
    };
    constexpr auto popcnt = popcount_type();

    return std::array<int, sizeof...(Is)>{ {popcnt(Is)...} };
}

// Returns 2 raised to the power x.
template<class T>
constexpr auto power2(T x)
{
    T result = 1;
    for (T i = 0; i < x; ++i)
        result *= 2;
    return result;
}

// Popcount lookup table with one entry for every value of TableWord.
template<class TableWord>
struct table
{
    static constexpr auto Word_size = std::numeric_limits<TableWord>::digits;
    static constexpr auto table_length = power2(Word_size);
    using array_type = std::array<int, table_length>;

    // The table is a function-local static, created on first use.
    static const array_type& get_data()
    {
        static const array_type data =
            generate(std::integral_constant<std::size_t, Word_size>(),
                     std::make_index_sequence<table_length>());
        return data;
    }
};

// Tag type selecting the word type whose width determines the table size.
template<class Word>
struct use_table_Word { };

// Counts the set bits of val by summing table lookups, one per
// TableWord-sized chunk of the value.
//
// The count is performed on the unsigned counterpart of Word:
// numeric_limits<signed T>::digits excludes the sign bit, so using it
// directly would drop the topmost chunk of signed values (an int would be
// treated as 31 / 8 == 3 chunks, i.e. only 24 bits).
template<class Word, class TableWord = std::uint8_t>
int bits_in(Word val, use_table_Word<TableWord> = use_table_Word<TableWord>())
{
    using unsigned_word = std::make_unsigned_t<Word>;

    constexpr auto table_Word_size = std::numeric_limits<TableWord>::digits;
    constexpr auto Word_size = std::numeric_limits<unsigned_word>::digits;
    constexpr auto times = Word_size / table_Word_size;
    static_assert(times > 0, "incompatible");

    // Mask selecting the low table_Word_size bits of a shifted value.
    constexpr auto mask = power2(table_Word_size) - 1;

    const auto uval = static_cast<unsigned_word>(val);
    auto const& data = table<TableWord>::get_data();

    int result = 0;
    for (int chunk = 0; chunk < times; ++chunk) {
        result += data[(uval >> (table_Word_size * chunk)) & mask];
    }
    return result;
}

volatile int results[3];

int main()
{
    auto input = std::uint64_t(1023);
    results[0] = bits_in(input);
    // NOTE(review): this overwrites results[0]; presumably both calls are kept
    // only to exercise the 8-bit and the 16-bit table.
    results[0] = bits_in(input, use_table_Word<std::uint16_t>());
    results[1] = bits_in(0x8000800080008000);
    results[2] = bits_in(34567890);

    for (int i = 0; i < 3; ++i) {
        std::cout << results[i] << '\n';
    }
    return 0;
}

最終更新

このバージョンでは、ルックアップテーブルの任意のビット数を使用でき、ルックアップテーブルのビット数よりも小さい場合でも、任意の入力タイプをサポートします。

また、上位ビットがゼロの場合、短絡します。

#include <algorithm>
#include <array>
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <limits>
#include <numeric>
#include <type_traits>
#include <utility>

namespace detail {

// Selects the narrowest fixed-width unsigned type with at least `bits` bits.
template<std::size_t bits, typename = void>
struct smallest_Word;

template<std::size_t bits>
struct smallest_Word<bits, std::enable_if_t<(bits <= 8)>>
{
    using type = std::uint8_t;
};

template<std::size_t bits>
struct smallest_Word<bits, std::enable_if_t<(bits > 8 && bits <= 16)>>
{
    using type = std::uint16_t;
};

template<std::size_t bits>
struct smallest_Word<bits, std::enable_if_t<(bits > 16 && bits <= 32)>>
{
    using type = std::uint32_t;
};

template<std::size_t bits>
struct smallest_Word<bits, std::enable_if_t<(bits > 32 && bits <= 64)>>
{
    using type = std::uint64_t;
};

} // namespace detail

template<std::size_t bits>
using smallest_Word = typename detail::smallest_Word<bits>::type;

// Builds a popcount table: element k of the returned array is the number of
// set bits in the k-th value of the index sequence, stored as WordType.
template<class WordType, std::size_t bits, std::size_t... Is>
constexpr auto generate(std::index_sequence<Is...>)
{
    using Word_type = WordType;

    struct popcount_type
    {
        // Each iteration clears the lowest set bit.
        constexpr auto operator()(Word_type i) const
        {
            int result = 0;
            while (i) {
                i &= i - 1;
                ++result;
            }
            return result;
        }
    };
    constexpr auto popcnt = popcount_type();

    return std::array<Word_type, sizeof...(Is)>{ {popcnt(Is)...} };
}

// Returns 2 raised to the power x.
template<class T>
constexpr auto power2(T x)
{
    return T(1) << x;
}

// Popcount lookup table indexed by Word_size-bit values.
template<std::size_t Word_size>
struct table
{
    static constexpr auto table_length = power2(Word_size);
    using Word_type = smallest_Word<Word_size>;
    using array_type = std::array<Word_type, table_length>;

    // The table is a function-local static, created on first use.
    static const array_type& get_data()
    {
        static const array_type data =
            generate<Word_type, Word_size>(std::make_index_sequence<table_length>());
        return data;
    }

    // Returns a value of Type with its low `bits` bits set. When Type is
    // narrower than `bits`, the excess bits are shifted out and every bit of
    // Type ends up set.
    template<class Type, std::size_t bits>
    static constexpr auto n_bits()
    {
        auto result = Type();
        auto b = bits;
        while (b--) {
            result = (result << 1) | Type(1);
        }
        return result;
    }

    // Looks up the popcount of the low Word_size bits of i.
    template<class Uint>
    int operator()(Uint i) const
    {
        constexpr auto mask = n_bits<Uint, Word_size>();
        return get_data()[i & mask];
    }
};

// Tag type selecting how many bits index the lookup table.
template<int bits>
struct use_bits
{
    static constexpr auto digits = bits;
};

template<class T>
constexpr auto minimum(T x, T y)
{
    return x < y ? x : y;
}

// Counts the set bits of val using a lookup table indexed by UseBits::digits
// bits. The value is converted to its unsigned counterpart first, and the
// loop stops as soon as the remaining high bits are all zero.
template<class Word, class UseBits = use_bits<8>>
int bits_in(Word val, UseBits = UseBits())
{
    using Word_type = std::make_unsigned_t<Word>;
    auto uval = static_cast<Word_type>(val);

    constexpr auto table_Word_size = UseBits::digits;
    constexpr auto Word_size = std::numeric_limits<Word_type>::digits;

    auto const& mytable = table<table_Word_size>();
    int result = 0;
    while (uval) {
        result += mytable(uval);
        // The shift is capped at the width of the input word.
        // NOTE(review): the diagnostic is presumably silenced for
        // instantiations where the table width reaches the word width.
        // The pragma keywords are case-sensitive: `push`, not `Push`.
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wshift-count-overflow"
        uval >>= minimum(table_Word_size, Word_size);
#pragma clang diagnostic pop
    }
    return result;
}

volatile int results[4];

int main()
{
    auto input = std::uint8_t(127);
    results[0] = bits_in(input);
    results[1] = bits_in(input, use_bits<4>());
    results[2] = bits_in(input, use_bits<11>());
    results[3] = bits_in(input, use_bits<15>());
    for (auto&& i : results) {
        std::cout << i << '\n';
    }

    auto input2 = 0xabcdef;
    results[0] = bits_in(input2);
    results[1] = bits_in(input2, use_bits<4>());
    results[2] = bits_in(input2, use_bits<11>());
    results[3] = bits_in(input2, use_bits<15>());
    for (auto&& i : results) {
        std::cout << i << '\n';
    }

    auto input3 = -1;
    results[0] = bits_in(input3);
    results[1] = bits_in(input3, use_bits<4>());
    results[2] = bits_in(input3, use_bits<11>());
    results[3] = bits_in(input3, use_bits<15>());
    for (auto&& i : results) {
        std::cout << i << '\n';
    }

    return 0;
}

出力例：

7
7
7
7
17
17
17
17
32
32
32
32

たとえば、bits_in(int, use_bits<11>())の呼び出しに対するアセンブリ出力は次のようになります。

.L16:
        mov     edx, edi
        and     edx, 2047
        movzx   edx, WORD PTR table<11ul>::get_data()::data[rdx+rdx]
        add     eax, edx
        shr     edi, 11
        jne     .L16

それは私にとって理にかなっているようです。

DAle · Answer

これは、基本的にconstexprの使用を中心に構築されたC++ 14ソリューションです。

#include <cstddef>
#include <iostream>

// Primitive replacement for std::array, whose non-const operator[] is not
// constexpr in C++14.
template<int N>
struct lookup_table
{
    int table[N];

    constexpr int& operator[](std::size_t i) { return table[i]; }
    constexpr const int& operator[](std::size_t i) const { return table[i]; }
};

// Counts the set bits of i by clearing the lowest set bit until none remain.
constexpr int bit_count(int i)
{
    int bits = 0;
    while (i) {
        i &= i - 1;
        ++bits;
    }
    return bits;
}

// Returns a table whose entry i holds the number of set bits in i,
// for 0 <= i < N.
template<int N>
constexpr lookup_table<N> generate()
{
    lookup_table<N> table = {};
    // popcount(i) == popcount(i >> 1) + (i & 1). Entry i >> 1 is already
    // filled when entry i is computed (entry 0 is zero-initialised), so each
    // element costs one step of constant evaluation instead of a per-bit loop.
    for (int i = 1; i < N; ++i)
        table[i] = table[i >> 1] + (i & 1);
    return table;
}

// A template argument must be a constant expression, so instantiating
// Check<table[k]> only compiles when the entry is known at compile time.
template<int I>
struct Check
{
    Check() { std::cout << I << "\n"; }
};

constexpr auto table = generate<65536>();

// Cross-check the table against the straightforward bit counter.
static_assert(table[5] == bit_count(5), "table entry mismatch");
static_assert(table[65535] == bit_count(65535), "table entry mismatch");

int main()
{
    // checks that they are evaluated at compile-time
    Check<table[5]>();
    Check<table[65535]>();
    return 0;
}

実行可能バージョン： http://ideone.com/zQB86O

Akira · Answer

C++17 では、constexpr を使用してコンパイル時にルックアップテーブルを構築できます。ポピュレーションカウント（population count）の計算を使うと、ルックアップテーブルは次のように構築できます。

#include <array>
#include <cstddef>
#include <cstdint>

// Returns an N-element table whose entry i is the number of set bits in the
// low 16 bits of i.
template<std::size_t N>
constexpr std::array<std::uint16_t, N> make_lookup()
{
    std::array<std::uint16_t, N> table {};
    for (std::size_t i = 0; i < N; ++i) {
        // Only the low 16 bits of the index take part in the count.
        std::uint16_t popcnt = static_cast<std::uint16_t>(i);
        // Sum adjacent bits into 2-bit fields.
        popcnt = popcnt - ((popcnt >> 1) & 0x5555);
        // Sum adjacent 2-bit fields into 4-bit fields.
        popcnt = (popcnt & 0x3333) + ((popcnt >> 2) & 0x3333);
        // Sum the nibbles of each byte, then the multiplication adds both
        // byte sums into the high byte.
        popcnt = ((popcnt + (popcnt >> 4)) & 0x0F0F) * 0x0101;
        table[i] = popcnt >> 8;
    }
    return table;
}

サンプル使用法：

// Builds the 65536-entry table, one entry per 16-bit value. As written the
// variable is not constexpr, so compile-time evaluation is not guaranteed.
auto lookup = make_lookup<65536>();

std::array::operator[] は C++17 以降で constexpr です。C++14 でも上記の例はコンパイルできますが、真の constexpr にはなりません。

コンパイラに負荷をかけたい場合は、結果の std::array を可変長テンプレートで初期化することもできます。このバージョンは C++14 でも動作し、indices trick を使えば C++11 でも動作します。

#include <array>
#include <cstddef>
#include <cstdint>
#include <utility>

namespace detail {

// Returns the number of set bits in the 8-bit value i.
constexpr std::uint8_t popcnt_8(std::uint8_t i)
{
    // Sum adjacent bits into 2-bit fields.
    i = i - ((i >> 1) & 0x55);
    // Sum adjacent 2-bit fields into 4-bit fields.
    i = (i & 0x33) + ((i >> 2) & 0x33);
    // Add the two nibbles.
    return ((i + (i >> 4)) & 0x0F);
}

// Expands the index pack into one popcnt_8 call per table entry.
template<std::size_t... I>
constexpr std::array<std::uint8_t, sizeof...(I)>
make_lookup_impl(std::index_sequence<I...>)
{
    return { popcnt_8(I)... };
}

} /* detail */

// Returns an N-element table whose entry i is the number of set bits in the
// low 8 bits of i.
template<std::size_t N>
constexpr decltype(auto) make_lookup()
{
    return detail::make_lookup_impl(std::make_index_sequence<N>{});
}

注：上記の例では、16ビット整数から8ビット整数に切り替えました。

Assembly Output

8ビットバージョンは、65536の代わりにdetail::make_lookup_impl関数に対して256個のテンプレート引数のみを作成します。後者は多すぎて、テンプレートのインスタンス化の深さの最大値を超えます。十分な仮想メモリがある場合は、GCCの-ftemplate-depth=65536コンパイラフラグを使用してこの最大値を増やし、16ビット整数に戻すことができます。

とにかく、次のデモを見て、8ビットバージョンが64ビット整数のセットビットをカウントする方法を試してください。

Live Demo

Matt A · Answer

後世のためにもう1つ。再帰（深さ log(N)）を使ってルックアップテーブルを作成する方法です。constexpr if と constexpr な std::array の operator[] を使用するため、完全に C++17 向けです。

#include <array>
#include <cstddef>

// Recursively doubles a popcount table until it has at least Target entries.
//
// `in` holds the counts for the values 0 .. I-1. The doubled table keeps them
// in its lower half and stores the same counts plus one in its upper half.
// The default argument seeds the recursion with the one-entry table {0}.
template<std::size_t Target, std::size_t I = 1>
constexpr auto make_table(std::array<int, I> in = {{ 0 }})
{
    if constexpr (I >= Target) {
        return in;
    } else {
        std::array<int, I * 2> out {{}};
        for (std::size_t i = 0; i != I; ++i) {
            out[i] = in[i];
            out[I + i] = in[i] + 1;
        }
        return make_table<Target>(out);
    }
}

// population[v] is the number of set bits in the 16-bit value v.
constexpr auto population = make_table<65536>();

ここでコンパイルしてください： https://godbolt.org/g/RJG1JA