fftw3/simd-support/simd-avx-128-fma.h at master · lihp1603/fftw3

History

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

286

287

288

289

290

291

292

293

294

295

296

297

298

299

300

301

302

303

304

305

306

307

308

309

310

311

312

313

314

315

316

317

318

319

320

321

322

323

324

325

326

327

328

329

330

331

332

/*

*

* 128-bit AVX support by Erik Lindahl, 2015.

* Erik Lindahl hereby places his modifications in the public domain.

*

* This program is free software; you can redistribute it and/or modify

* it under the terms of the GNU General Public License as published by

* the Free Software Foundation; either version 2 of the License, or

* (at your option) any later version.

*

* This program is distributed in the hope that it will be useful,

* but WITHOUT ANY WARRANTY; without even the implied warranty of

* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

* GNU General Public License for more details.

*

* You should have received a copy of the GNU General Public License

* along with this program; if not, write to the Free Software

* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

*

*/

/* This backend only provides single- and double-precision kernels;
   reject long-double and quad-precision builds at compile time. */
#if defined(FFTW_LDOUBLE) || defined(FFTW_QUAD)
#error "AVX only works in single or double precision"
#endif

/* DS(d,s) expands to its first argument in double precision and to its
   second argument in single precision.  SUFF(name) pastes the matching
   intrinsic suffix onto name: `s` (e.g. _mm_add_ps) when FFTW_SINGLE is
   defined, `d` (e.g. _mm_add_pd) otherwise. */
#ifdef FFTW_SINGLE
# define DS(d,s) s /* single-precision option */
# define SUFF(name) name ## s
#else
# define DS(d,s) d /* double-precision option */
# define SUFF(name) name ## d
#endif

#define SIMD_SUFFIX _avx_128_fma /* for renaming */
#define VL DS(1,2) /* SIMD vector length, in terms of complex numbers */
/* Vector-stride predicate: always 1 in double precision; in single
   precision it is true only when the stride x equals 2. */
#define SIMD_VSTRIDE_OKA(x) DS(1,((x) == 2))
/* The pair-stride predicate is an alias for SIMD_STRIDE_OK, which is not
   defined in this header (presumably provided by simd-common.h, which is
   included at the end of the file -- confirm). */
#define SIMD_STRIDE_OKPAIR SIMD_STRIDE_OK

/* When building with MSVC, map `inline` to the vendor keyword `__inline`
   unless the build has already defined `inline` itself.  Both the outer
   #ifdef and the inner #ifndef must be closed here; leaving the outer one
   open makes the rest of the header part of an unterminated conditional. */
#ifdef _MSC_VER
#ifndef inline
#define inline __inline
#endif
#endif

#include <immintrin.h>

#ifdef _MSC_VER

# include <intrin.h>

#elif defined (__GNUC__)

# include <x86intrin.h>

#endif

/* Refuse to compile unless the compiler advertises both AVX and FMA4. */
#if !(defined(__AVX__) && defined(__FMA4__)) /* sanity check */
#error "compiling simd-avx-128-fma.h without -mavx or -mfma4"
#endif

/* V is the SIMD vector type used by every kernel helper below:
   __m128d in double precision, __m128 in single precision. */
typedef DS(__m128d,__m128) V;

/* Precision-generic names for the packed intrinsics.  SUFF appends `s`
   or `d`, so for example VADD expands to _mm_add_ps or _mm_add_pd. */
#define VADD SUFF(_mm_add_p)
#define VSUB SUFF(_mm_sub_p)
#define VMUL SUFF(_mm_mul_p)
#define VXOR SUFF(_mm_xor_p)
#define SHUF SUFF(_mm_shuffle_p)
#define VPERM1 SUFF(_mm_permute_p)
#define UNPCKL SUFF(_mm_unpacklo_p)
#define UNPCKH SUFF(_mm_unpackhi_p)

/* Build the 8-bit immediate for a four-lane shuffle/permute: fp0 selects
   the source lane for result lane 0 (bits 1:0), fp1 for lane 1 (bits 3:2),
   fp2 for lane 2 (bits 5:4) and fp3 for lane 3 (bits 7:6).
   The continuation backslash must be followed directly by the body line;
   a blank line in between would end the macro with an empty body. */
#define SHUFVALS(fp0,fp1,fp2,fp3) \
   (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))

/* VDUPL copies the lower element of each (re, im) pair into both slots of
   that pair; VDUPH does the same with the upper element. */
#define VDUPL(x) DS(_mm_permute_pd(x,0), _mm_moveldup_ps(x))
#define VDUPH(x) DS(_mm_permute_pd(x,3), _mm_movehdup_ps(x))

/* Load 64 bits from addr into the high (LOADH) or low (LOADL) half of
   val, leaving the other half of val unchanged.  These expand to the
   __m128-only _mm_loadh_pi/_mm_loadl_pi intrinsics; in this file they are
   used only by the single-precision LD. */
#define LOADH(addr, val) _mm_loadh_pi(val, (const __m64 *)(addr))
#define LOADL(addr, val) _mm_loadl_pi(val, (const __m64 *)(addr))

/* Store the high (STOREH) or low (STOREL) 64 bits of v to address a. */
#define STOREH(a, v) DS(_mm_storeh_pd(a, v), _mm_storeh_pi((__m64 *)(a), v))
#define STOREL(a, v) DS(_mm_storel_pd(a, v), _mm_storel_pi((__m64 *)(a), v))

/* Build a vector literal.  The _mm_set_* intrinsics take their arguments
   from the highest lane down, so x1 ends up in the lowest lane and x0 in
   the lane above it (repeated for the second pair in single precision). */
#define VLIT(x0, x1) DS(_mm_set_pd(x0, x1), _mm_set_ps(x0, x1, x0, x1))
/* Declare a vector variable var holding val in every lane. */
#define DVK(var, val) V var = VLIT(val, val)
/* Constants declared with DVK are already vectors, so "loading" one is
   the identity. */
#define LDK(x) x

/*
 * Load one full vector from x by dereferencing it directly as a V, so x
 * must be suitably aligned for V.  The stride ivs and the aligned_like
 * hint are accepted for interface compatibility and ignored.
 */
static inline V LDA(const R *x, INT ivs, const R *aligned_like)
{
    const V *src = (const V *)x;

    (void)ivs;          /* UNUSED */
    (void)aligned_like; /* UNUSED */
    return *src;
}

/*
 * Store one full vector v to x by writing through x as a V, so x must be
 * suitably aligned for V.  The stride ovs and the aligned_like hint are
 * accepted for interface compatibility and ignored.
 */
static inline void STA(R *x, V v, INT ovs, const R *aligned_like)
{
    V *dst = (V *)x;

    (void)ovs;          /* UNUSED */
    (void)aligned_like; /* UNUSED */
    *dst = v;
}

#ifdef FFTW_SINGLE

/*
 * Strided load (single precision): fetch 64 bits from x into the low half
 * of the result and 64 bits from x + ivs into the high half.
 *
 * x            address of the first pair of floats
 * ivs          offset, in units of R, from x to the second pair
 * aligned_like unused
 *
 * Returns the assembled vector; every lane is written by one of the two
 * partial loads.
 */
static inline V LD(const R *x, INT ivs, const R *aligned_like)
{
    V var;
    (void)aligned_like; /* UNUSED */
#if defined(__ICC) || (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ > 8)
    /* Both halves are overwritten below, so an explicitly "undefined"
       register is enough as the starting value. */
    var = SUFF(_mm_undefined_p)();
#else
    /* Other compilers (presumably lacking _mm_undefined_ps): start from
       zero instead of passing an uninitialized local to LOADL, which
       would read an indeterminate value. */
    var = SUFF(_mm_setzero_p)();
#endif
    var = LOADL(x, var);
    var = LOADH(x + ivs, var);
    return var;
}

# ifdef _MSC_VER
/* After LD, put MSVC warning 4700 back to its default behavior and
   restore the "u" runtime check.
   NOTE(review): the matching `warning(disable : 4700)` and
   `runtime_checks("u", off)` pragmas are not visible before LD in this
   copy of the file -- confirm whether they were lost. */
# pragma warning(default : 4700)
# pragma runtime_checks("u", restore)
# endif

/*
 * Strided store (single precision): write the high 64 bits of v to
 * x + ovs and the low 64 bits of v to x.  aligned_like is unused.
 */
static inline void ST(R *x, V v, INT ovs, const R *aligned_like)
{
    R *const hi_dst = x + ovs;
    R *const lo_dst = x;

    (void)aligned_like; /* UNUSED */
    /* WARNING: the extra_iter hack depends upon the low-half store
       (STOREL) occurring after the high-half store (STOREH); do not
       reorder these two statements. */
    STOREH(hi_dst, v);
    STOREL(lo_dst, v);
}

#else /* ! FFTW_SINGLE */

/* In double precision the strided load/store are simply aliases for the
   whole-vector LDA/STA. */
# define LD LDA
# define ST STA

#endif

/* STM2 stores a vector immediately: via STA in double precision, via ST
   in single precision.  Consequently STN2 has nothing left to do and
   expands to nothing. */
#define STM2 DS(STA,ST)
#define STN2(x, v0, v1, ovs) /* nop */

#ifdef FFTW_SINGLE

/* In single precision STM4 does nothing; the four vectors are written
   together by STN4 below. */
# define STM4(x, v, ovs, aligned_like) /* no-op */

/* STN4 is a macro, not a function, thanks to Visual C++ developers
   deciding "it would be infrequent that people would want to pass more
   than 3 [__m128 parameters] by value."  3 parameters ought to be enough
   for anybody.

   It transposes the 4x4 block of floats held in v0..v3 and stores the
   four resulting rows with STA at x, x + ovs, x + 2*ovs and x + 3*ovs.
   Every line of the body must end in a backslash with no blank line in
   between, otherwise the macro is cut short.  x and ovs are parenthesized
   so that expression arguments keep their meaning inside the pointer
   arithmetic. */
# define STN4(x, v0, v1, v2, v3, ovs)                       \
{                                                           \
    V xxx0, xxx1, xxx2, xxx3;                               \
    xxx0 = UNPCKL(v0, v2);                                  \
    xxx1 = UNPCKH(v0, v2);                                  \
    xxx2 = UNPCKL(v1, v3);                                  \
    xxx3 = UNPCKH(v1, v3);                                  \
    STA((x), UNPCKL(xxx0, xxx2), 0, 0);                     \
    STA((x) + (ovs), UNPCKH(xxx0, xxx2), 0, 0);             \
    STA((x) + 2 * (ovs), UNPCKL(xxx1, xxx3), 0, 0);         \
    STA((x) + 3 * (ovs), UNPCKH(xxx1, xxx3), 0, 0);         \
}

#else /* !FFTW_SINGLE */

/*
 * Double-precision STM4: write the low 64 bits of v to x and the high
 * 64 bits of v to x + ovs.  aligned_like is unused.
 */
static inline void STM4(R *x, V v, INT ovs, const R *aligned_like)
{
    R *const lo_dst = x;
    R *const hi_dst = x + ovs;

    (void)aligned_like; /* UNUSED */
    STOREL(lo_dst, v);
    STOREH(hi_dst, v);
}

/* STM4 has already stored everything, so STN4 expands to nothing. */
# define STN4(x, v0, v1, v2, v3, ovs) /* nothing */

#endif

/*
 * Swap the two elements of every (re, im) pair in x: a single-lane swap
 * in double precision (immediate 1), and the lane order 1,0,3,2 in single
 * precision.
 */
static inline V FLIP_RI(V x)
{
    V swapped = VPERM1(x, DS(1, SHUFVALS(1, 0, 3, 2)));
    return swapped;
}

/*
 * Complex-conjugate every element of x by flipping the sign bit of each
 * imaginary part.
 */
static inline V VCONJ(V x)
{
    /* We need a SIMD vector[VL] of (0 + -0i) to XOR with x.

       The natural spelling would be

          V imag_sign = VLIT(-0.0, 0.0);

       but historically some compilers have ignored the distinction
       between +0 and -0, and 'gcc-8 -ffast-math' appears to treat -0 as
       0 too.  The mask is therefore spelled out as raw 32-bit words and
       reinterpreted through a union.
    */
    union sign_mask {
        unsigned u[4];
        V v;
    };
    static const union sign_mask imag_sign = {
#ifdef FFTW_SINGLE
        /* sign bit set in lanes 1 and 3 (the imaginary floats) */
        { 0x00000000, 0x80000000, 0x00000000, 0x80000000 }
#else
        /* sign bit set in the top word of the second double */
        { 0x00000000, 0x00000000, 0x00000000, 0x80000000 }
#endif
    };

    return VXOR(imag_sign.v, x);
}

/*
 * Multiply every complex element of x by i: conjugate first, then swap
 * real and imaginary parts, so (re, im) becomes (-im, re).
 */
static inline V VBYI(V x)
{
    return FLIP_RI(VCONJ(x));
}

/* FMA support */

/* Fused operations built on the FMA4 intrinsics:
     VFMA(a,b,c)  =  a*b + c
     VFNMS(a,b,c) = -(a*b) + c
     VFMS(a,b,c)  =  a*b - c  */
#define VFMA(a, b, c) SUFF(_mm_macc_p)(a,b,c)
#define VFNMS(a, b, c) SUFF(_mm_nmacc_p)(a,b,c)
#define VFMS(a, b, c) SUFF(_mm_msub_p)(a,b,c)

/* Complex helpers, treating each vector as interleaved (re, im) pairs:
     VFMAI(b,c)     = c + i*b
     VFNMSI(b,c)    = c - i*b
     VFMACONJ(b,c)  = conj(b) + c
     VFMSCONJ(b,c)  = conj(b) - c
     VFNMSCONJ(b,c) = c - conj(b)
   The addsub-based forms rely on _mm_addsub_p{s,d} subtracting in the
   real (even) lanes and adding in the imaginary (odd) lanes. */
#define VFMAI(b, c) SUFF(_mm_addsub_p)(c,FLIP_RI(b))
#define VFNMSI(b, c) VSUB(c, VBYI(b))
#define VFMACONJ(b,c) VADD(VCONJ(b),c)
#define VFMSCONJ(b,c) VSUB(VCONJ(b),c)
#define VFNMSCONJ(b,c) SUFF(_mm_addsub_p)(c,b)

/*
 * Element-wise complex product tx * sr.
 *
 * With tx = tr + i*ti and sr = a + i*b, the result is
 * (tr*a - ti*b) + i*(tr*b + ti*a); addsub supplies the minus sign in the
 * real lanes and the plus sign in the imaginary lanes.
 */
static inline V VZMUL(V tx, V sr)
{
    const V re_scaled = VMUL(VDUPL(tx), sr);          /* tr * (a, b) */
    const V im_scaled = VMUL(VDUPH(tx), FLIP_RI(sr)); /* ti * (b, a) */

    return SUFF(_mm_addsub_p)(re_scaled, im_scaled);
}

/*
 * Element-wise complex product conj(tx) * sr.
 *
 * Computed as tr*sr - ti*(i*sr), where tr and ti are the real and
 * imaginary parts of tx broadcast across each pair.
 */
static inline V VZMULJ(V tx, V sr)
{
    const V re_dup = VDUPL(tx);
    const V im_dup = VDUPH(tx);
    const V re_scaled = VMUL(re_dup, sr); /* tr * sr */
    const V i_sr = VBYI(sr);              /* i * sr */

    return VFNMS(im_dup, i_sr, re_scaled);
}

/*
 * Element-wise complex product i * tx * sr.
 *
 * Computed as tr*(i*sr) - ti*sr, where tr and ti are the real and
 * imaginary parts of tx broadcast across each pair.
 */
static inline V VZMULI(V tx, V sr)
{
    const V re_dup = VDUPL(tx);
    const V im_dup = VDUPH(tx);
    const V im_scaled = VMUL(im_dup, sr); /* ti * sr */
    const V i_sr = VBYI(sr);              /* i * sr */

    return VFMS(re_dup, i_sr, im_scaled);
}

/*
 * Element-wise complex product i * conj(tx) * sr.
 *
 * With tx = tr + i*ti and sr = a + i*b, the result is
 * (ti*a - tr*b) + i*(ti*b + tr*a); addsub supplies the minus sign in the
 * real lanes and the plus sign in the imaginary lanes.
 */
static inline V VZMULIJ(V tx, V sr)
{
    const V im_scaled = VMUL(VDUPH(tx), sr);          /* ti * (a, b) */
    const V re_scaled = VMUL(VDUPL(tx), FLIP_RI(sr)); /* tr * (b, a) */

    return SUFF(_mm_addsub_p)(im_scaled, re_scaled);
}

/* twiddle storage #1: compact, slower */

#ifdef FFTW_SINGLE

/* Single-precision compact twiddle layout: the two cosines for indices v
   and v+1 come first, followed by the two sines, i.e. one vector of four
   floats per pair of twiddles.
   The continuation backslash must be followed directly by the body line;
   a blank line in between would leave the macro body empty. */
# define VTW1(v,x) \
  {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_SIN, v, x}, {TW_SIN, v+1, x}

/*
 * Multiply sr by the twiddle factors stored at t in the single-precision
 * VTW1 layout (cos0, cos1, sin0, sin1).  t is read directly as a V.
 */
static inline V BYTW1(const R *t, V sr)
{
    const V tx = *(const V *)t;
    const V cos_dup = UNPCKL(tx, tx); /* cos0, cos0, cos1, cos1 */
    const V sin_dup = UNPCKH(tx, tx); /* sin0, sin0, sin1, sin1 */
    const V re_scaled = VMUL(cos_dup, sr);
    const V im_scaled = VMUL(sin_dup, FLIP_RI(sr));

    return SUFF(_mm_addsub_p)(re_scaled, im_scaled);
}

/*
 * Multiply sr by the complex conjugate of the twiddle factors stored at t
 * in the single-precision VTW1 layout (cos0, cos1, sin0, sin1).  t is
 * read directly as a V.
 */
static inline V BYTWJ1(const R *t, V sr)
{
    const V tx = *(const V *)t;
    const V cos_dup = UNPCKL(tx, tx); /* cos0, cos0, cos1, cos1 */
    const V sin_dup = UNPCKH(tx, tx); /* sin0, sin0, sin1, sin1 */
    const V re_scaled = VMUL(cos_dup, sr);
    const V i_sr = VBYI(sr); /* i * sr */

    /* cos*sr - sin*(i*sr) */
    return VFNMS(sin_dup, i_sr, re_scaled);
}

#else /* !FFTW_SINGLE */

/* Double-precision compact twiddle layout: a single TW_CEXP entry per
   vector. */
# define VTW1(v,x) {TW_CEXP, v, x}

/*
 * Multiply sr by the twiddle factor loaded from t (double precision):
 * load it with LD and hand it to VZMUL.
 */
static inline V BYTW1(const R *t, V sr)
{
    return VZMUL(LD(t, 1, t), sr);
}

/*
 * Multiply sr by the conjugate of the twiddle factor loaded from t
 * (double precision): load it with LD and hand it to VZMULJ.
 */
static inline V BYTWJ1(const R *t, V sr)
{
    return VZMULJ(LD(t, 1, t), sr);
}

#endif

/* Twiddle storage #1 is described with a v-step of VL. */
#define TWVL1 (VL)

/* twiddle storage #2: twice the space, faster (when in cache) */
/* Each twiddle is stored pre-expanded as two vectors: one holding the
   cosine duplicated across the (re, im) pair, and one holding the sine
   for angle -x followed by the sine for angle x.  BYTW2/BYTWJ2 can then
   combine them with sr and FLIP_RI(sr) without any further shuffling of
   the twiddle itself.
   Continuation backslashes must be followed directly by the next body
   line; a blank line in between would cut the macro short. */
#ifdef FFTW_SINGLE
# define VTW2(v,x)                                                      \
  {TW_COS, v, x}, {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x},   \
  {TW_SIN, v, -x}, {TW_SIN, v, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}
#else /* !FFTW_SINGLE */
# define VTW2(v,x)                                                      \
  {TW_COS, v, x}, {TW_COS, v, x}, {TW_SIN, v, -x}, {TW_SIN, v, x}
#endif

#define TWVL2 (2 * VL)

/*
 * Multiply sr by the twiddle factors stored at t in the VTW2 layout.
 * t is read as two consecutive vectors: the duplicated cosines and the
 * (-sin, sin) pairs.  The result is cos*sr + sin_pair*FLIP_RI(sr).
 */
static inline V BYTW2(const R *t, V sr)
{
    const V *tw = (const V *)t;
    const V cos_vec = tw[0];
    const V sin_vec = tw[1];
    const V sr_swapped = FLIP_RI(sr);

    return VFMA(cos_vec, sr, VMUL(sin_vec, sr_swapped));
}

/*
 * Multiply sr by the conjugate of the twiddle factors stored at t in the
 * VTW2 layout.  t is read as two consecutive vectors: the duplicated
 * cosines and the (-sin, sin) pairs.  The result is
 * cos*sr - sin_pair*FLIP_RI(sr).
 */
static inline V BYTWJ2(const R *t, V sr)
{
    const V *tw = (const V *)t;
    const V cos_vec = tw[0];
    const V sin_vec = tw[1];
    const V sr_swapped = FLIP_RI(sr);

    return VFNMS(sin_vec, sr_swapped, VMUL(cos_vec, sr));
}

/* twiddle storage #3 */
/* Single precision: two TW_CEXP entries (indices v and v+1) per vector,
   with a v-step of VL.  Double precision: identical to storage #1, so
   both the layout and the step are aliased to VTW1/TWVL1. */
#ifdef FFTW_SINGLE
# define VTW3(v,x) {TW_CEXP, v, x}, {TW_CEXP, v+1, x}
# define TWVL3 (VL)
#else
# define VTW3(v,x) VTW1(v,x)
# define TWVL3 TWVL1
#endif

/* twiddle storage for split arrays */
/* All cosines for one step come first, followed by all sines: four of
   each (indices v..v+3) in single precision, two of each (v, v+1) in
   double precision.
   Continuation backslashes must be followed directly by the next body
   line; a blank line in between would cut the macro short. */
#ifdef FFTW_SINGLE
# define VTWS(v,x)                                                        \
  {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x},   \
  {TW_SIN, v, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}
#else
# define VTWS(v,x)                                                        \
  {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_SIN, v, x}, {TW_SIN, v+1, x}
#endif

#define TWVLS (2 * VL)

/* No cleanup is emitted when leaving SIMD code in this backend. */
#define VLEAVE() /* nothing */

#include "simd-common.h"

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

simd-avx-128-fma.h

simd-avx-128-fma.h

Files

simd-avx-128-fma.h

Latest commit

History

simd-avx-128-fma.h

File metadata and controls