-
Notifications
You must be signed in to change notification settings - Fork 12
/
fastmod.h
214 lines (175 loc) · 5.56 KB
/
fastmod.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
/* From https://github.com/lemire/fastmod - Apache 2 */
#ifndef FASTMOD_H
#define FASTMOD_H
#ifndef __cplusplus
#include <stdbool.h>
#include <stdint.h>
#else
// In C++ <cstdbool>/<stdbool.h> are irelevant as bool is already a type
#include <cstdint>
#endif
#ifndef __cplusplus
#define FASTMOD_API static inline
#else
// In C++ we achieve the effects of the above through putting everything in an
// unnamed namespace
// Instead, we mark it constexpr so it can be used at compile-time if C++14
// relaxed constexpr is supported.
#if __cpp_constexpr >= 201304
#define FASTMOD_API constexpr
#else
#define FASTMOD_API
#endif
#endif
#ifdef _MSC_VER
#include <intrin.h>
#endif
#ifdef __cplusplus
namespace {
namespace fastmod {
#endif
#ifdef _MSC_VER
// __umulh is only available in x64 mode under Visual Studio: don't compile to
// 32-bit!
FASTMOD_API uint64_t mul128_u32(uint64_t lowbits, uint32_t d) {
return __umulh(lowbits, d);
}
#else // _MSC_VER NOT defined
FASTMOD_API uint64_t mul128_u32(uint64_t lowbits, uint32_t d) {
return ((__uint128_t)lowbits * d) >> 64;
}
FASTMOD_API uint64_t mul128_s32(uint64_t lowbits, int32_t d) {
return ((__int128_t)lowbits * d) >> 64;
}
// This is for the 64-bit functions.
FASTMOD_API uint64_t mul128_u64(__uint128_t lowbits, uint64_t d) {
__uint128_t bottom_half =
(lowbits & UINT64_C(0xFFFFFFFFFFFFFFFF)) * d; // Won't overflow
bottom_half >>=
64; // Only need the top 64 bits, as we'll shift the lower half away;
__uint128_t top_half = (lowbits >> 64) * d;
__uint128_t both_halves =
bottom_half + top_half; // Both halves are already shifted down by 64
both_halves >>= 64; // Get top half of both_halves
return (uint64_t)both_halves;
}
#endif // _MSC_VER
/**
* Unsigned integers.
* Usage:
* uint32_t d = ... ; // divisor, should be non-zero
* uint64_t M = computeM_u32(d); // do once
* fastmod_u32(a,M,d) is a % d for all 32-bit a.
*
**/
// M = ceil( (1<<64) / d ), d > 0
FASTMOD_API uint64_t computeM_u32(uint32_t d) {
return UINT64_C(0xFFFFFFFFFFFFFFFF) / d + 1;
}
// fastmod computes (a % d) given precomputed M
FASTMOD_API uint32_t fastmod_u32(uint32_t a, uint64_t M, uint32_t d) {
uint64_t lowbits = M * a;
return (uint32_t)(mul128_u32(lowbits, d));
}
// fastmod computes (a / d) given precomputed M for d>1
FASTMOD_API uint32_t fastdiv_u32(uint32_t a, uint64_t M) {
return (uint32_t)(mul128_u32(M, a));
}
// given precomputed M, checks whether n % d == 0
FASTMOD_API bool is_divisible(uint32_t n, uint64_t M) {
return n * M <= M - 1;
}
/**
* signed integers
* Usage:
* int32_t d = ... ; // should be non-zero and between [-2147483647,2147483647]
* int32_t positive_d = d < 0 ? -d : d; // absolute value
* uint64_t M = computeM_s32(d); // do once
* fastmod_s32(a,M,positive_d) is a % d for all 32-bit a.
**/
// M = floor( (1<<64) / d ) + 1
// you must have that d is different from 0 and -2147483648
// if d = -1 and a = -2147483648, the result is undefined
FASTMOD_API uint64_t computeM_s32(int32_t d) {
if (d < 0) {
d = -d;
}
return UINT64_C(0xFFFFFFFFFFFFFFFF) / d + 1 + ((d & (d - 1)) == 0 ? 1 : 0);
}
// fastmod computes (a % d) given precomputed M,
// you should pass the absolute value of d
FASTMOD_API int32_t fastmod_s32(int32_t a, uint64_t M, int32_t positive_d) {
uint64_t lowbits = M * a;
int32_t highbits = mul128_u32(lowbits, positive_d);
return highbits - ((positive_d - 1) & (a >> 31));
}
#ifndef _MSC_VER
// fastmod computes (a / d) given precomputed M, assumes that d must not
// be one of -1, 1, or -2147483648
FASTMOD_API int32_t fastdiv_s32(int32_t a, uint64_t M, int32_t d) {
uint64_t highbits = mul128_s32(M, a);
highbits += (a < 0 ? 1 : 0);
if (d < 0) {
return -(int32_t)(highbits);
}
return (int32_t)(highbits);
}
// What follows is the 64-bit functions.
// They are currently not supported on Visual Studio
// due to the lack of a mul128_u64 function.
// They may not be faster than what the compiler
// can produce.
FASTMOD_API __uint128_t computeM_u64(uint64_t d) {
__uint128_t M = UINT64_C(0xFFFFFFFFFFFFFFFF);
M <<= 64;
M |= UINT64_C(0xFFFFFFFFFFFFFFFF);
M /= d;
M += 1;
return M;
}
FASTMOD_API __uint128_t computeM_s64(int64_t d) {
if (d < 0) {
d = -d;
}
__uint128_t M = UINT64_C(0xFFFFFFFFFFFFFFFF);
M <<= 64;
M |= UINT64_C(0xFFFFFFFFFFFFFFFF);
M /= d;
M += 1;
M += ((d & (d - 1)) == 0 ? 1 : 0);
return M;
}
FASTMOD_API uint64_t fastmod_u64(uint64_t a, __uint128_t M, uint64_t d) {
__uint128_t lowbits = M * a;
return mul128_u64(lowbits, d);
}
FASTMOD_API uint64_t fastdiv_u64(uint64_t a, __uint128_t M) {
return mul128_u64(M, a);
}
// End of the 64-bit functions
#endif // #ifndef _MSC_VER
#ifdef __cplusplus
template <uint32_t d> FASTMOD_API uint32_t fastmod(uint32_t x) {
constexpr uint64_t v = computeM_u32(d);
return fastmod_u32(x, v, d);
}
template <uint32_t d> FASTMOD_API uint32_t fastdiv(uint32_t x) {
constexpr uint64_t v = computeM_u32(d);
return fastdiv_u32(x, v);
}
template <int32_t d> FASTMOD_API int32_t fastmod(int32_t x) {
constexpr uint64_t v = computeM_s32(d);
return fastmod_s32(x, v, d);
}
template <int32_t d> FASTMOD_API int32_t fastdiv(int32_t x) {
constexpr uint64_t v = computeM_s32(d);
return fastdiv_s32(x, v, d);
}
} // fastmod
}
#endif
// There's no reason to polute the global scope with this macro once its use
// ends This won't create any problems as the preprocessor will have done its
// thing once it reaches this point
#undef FASTMOD_API
#endif // FASTMOD_H