>>107127085
Here is how I would implement these two cases.
[spoiler][/spoiler]
} else if (num_bits < 128) {
for (u32 bk = 0; bk < num_buckets - 1; ++bk) {
const auto left = ~buckets[bk];
const auto right = ~buckets[bk + 1];
const auto ones_left = _lzcnt_u64(left);
int sum;
if(right == 0 && bk < num_buckets - 2) {
const auto right2 = ~buckets[bk + 2];
const auto ones_right2 = _tzcnt_u64(right2);
sum = ones_left + 64 + ones_right2;
} else {
const auto ones_right = _tzcnt_u64(right);
sum = ones_left + ones_right;
}
if (sum >= num_bits) return idx(bk, 64 - ones_left);
}
} else {
const auto min_full = (i32)(num_bits) / 64 - 1;
u32 num_full = 0;
for (i32 bk = 0; bk < num_buckets; ++bk) {
constexpr auto FULL = U64_MAX;
if (buckets[bk] == FULL) ++num_full;
else num_full = 0;
if (num_full == min_full) {
u32 ones_left = 0;
const auto bk_l = bk - min_full;
if (bk_l >= 0) {
const auto left = ~buckets[bk_l];
ones_left = _lzcnt_u64(left);
}
u32 ones_right = 0;
const auto bk_r = bk + 1;
if(bk_r < num_buckets) {
const auto right = ~buckets[bk_r];
if(right == 0 && bk_r < num_buckets - 1) {
const auto right2 = ~buckets[bk_r + 1];
ones_right = 64 + _tzcnt_u64(right2);
bk += 1;
} else {
ones_right = _tzcnt_u64(right);
}
}
if (min_full * 64 + ones_left + ones_right >= num_bits) return idx(bk_l, 64 - ones_left);
}
}
Here are plots with my fixes. Nice.