<< Chapter < Page | Chapter >> Page > |
typedef struct _reg_t {
__m128 re, im;} reg_t;
static inline reg_t MUL(reg_t a, reg_t b) {
reg_t r;r.re = _mm_sub_ps(_mm_mul_ps(a.re,b.re),_mm_mul_ps(a.im,b.im));
r.im = _mm_add_ps(_mm_mul_ps(a.re,b.im),_mm_mul_ps(a.im,b.re));return r;
}static inline reg_t MULJ(reg_t a, reg_t b) {
reg_t r;r.re = _mm_add_ps(_mm_mul_ps(a.re,b.re),_mm_mul_ps(a.im,b.im));
r.im = _mm_sub_ps(_mm_mul_ps(a.im,b.re),_mm_mul_ps(a.re,b.im));return r;
}
static inline reg_t ADD(reg_t a, reg_t b) {reg_t r;
r.re = _mm_add_ps(a.re,b.re);r.im = _mm_add_ps(a.im,b.im);
return r;}
static inline reg_t SUB(reg_t a, reg_t b) {reg_t r;
r.re = _mm_sub_ps(a.re,b.re);r.im = _mm_sub_ps(a.im,b.im);
return r;}
static inline reg_t ADD_I(reg_t a, reg_t b) {reg_t r;
r.re = _mm_sub_ps(a.re,b.im);r.im = _mm_add_ps(a.im,b.re);
return r;}
static inline reg_t SUB_I(reg_t a, reg_t b) {reg_t r;
r.re = _mm_add_ps(a.re,b.im);r.im = _mm_sub_ps(a.im,b.re);
return r;}
static inline reg_t LOAD(float *a) {
reg_t r;r.re = _mm_load_ps(a);
r.im = _mm_load_ps(a+4);return r;
}static inline void STORE(float *a, reg_t r) {
_mm_store_ps(a, r.re);_mm_store_ps(a+4, r.im);
}static inline void STOREIL(float *a, reg_t r) {
_mm_store_ps(a, _mm_unpacklo_ps(r.re, r.im));_mm_store_ps(a+4, _mm_unpackhi_ps(r.re, r.im));
}
Vectorized math functions for split-radix implementations
#include <math.h>#include <complex.h>#include <stdio.h>#include <stdlib.h>#include <xmmintrin.h>
typedef complex float data_t;
#define W(N,k) (cexp(-2.0f * M_PI * I * (float)(k) / (float)(N)))data_t **LUT1;
data_t **LUT3;
#include "vecmath.h"
void splitfft(data_t *in, data_t *out,int log2stride, int stride, int N) {if(N == 1) {
out[0] = in[0];}else if(N == 2) {
out[0] = in[0] + in[stride];
out[1] = in[0] - in[stride];
}else if(N == 4) {splitfft(in, out, log2stride+1, stride << 1, N >> 1);
splitfft(in+stride, out+N/2, log2stride+2, stride << 2, N >> 2);
splitfft(in+3*stride, out+3*N/4, log2stride+2, stride << 2, N >> 2);data_t temp0 = out[0] + (out[2] + out[3]);data_t temp1 = out[0] - (out[2] + out[3]);data_t temp2 = out[1] - I*(out[2] - out[3]);data_t temp3 = out[1] + I*(out[2] - out[3]); if(log2stride) {
out[0] = creal(temp0) + creal(temp2)*I;
out[1] = creal(temp1) + creal(temp3)*I;
out[2] = cimag(temp0) + cimag(temp2)*I;
out[3] = cimag(temp1) + cimag(temp3)*I;
}else{out[0] = temp0;out[2] = temp1;out[1] = temp2;out[3] = temp3; }}else if(N == 8) {
splitfft(in, out, log2stride+1, stride << 1, N >> 1);
splitfft(in+stride, out+N/2, log2stride+2, stride << 2, N >> 2);
splitfft(in+3*stride, out+3*N/4, log2stride+2, stride << 2, N >> 2);data_t o[8];{
data_t Uk = creal(out[0]) + creal(out[2])*I;data_t Zk = out[4];data_t Uk2 = creal(out[1]) + creal(out[3])*I;
data_t Zdk = out[6];o[0] = Uk + (Zk + Zdk);o[4] = Uk - (Zk + Zdk);o[2] = Uk2 - I*(Zk - Zdk);o[6] = Uk2 + I*(Zk - Zdk);}
{data_t Uk = cimag(out[0]) + cimag(out[2])*I;
data_t Zk = out[5];
data_t Uk2 = cimag(out[1]) + cimag(out[3])*I;data_t Zdk = out[7];data_t w1 = LUT1[log2stride][1];
data_t w3 = LUT3[log2stride][1];o[1] = Uk + (w1*Zk + w3*Zdk);
o[5] = Uk - (w1*Zk + w3*Zdk);
o[3] = Uk2 - I*(w1*Zk - w3*Zdk);
o[7] = Uk2 + I*(w1*Zk - w3*Zdk);
}if(log2stride) {
out[0] = creal(o[0]) + creal(o[1])*I;
out[1] = creal(o[2]) + creal(o[3])*I;
out[2] = cimag(o[0]) + cimag(o[1])*I;
out[3] = cimag(o[2]) + cimag(o[3])*I;
out[4] = creal(o[4]) + creal(o[5])*I;
out[5] = creal(o[6]) + creal(o[7])*I;
out[6] = cimag(o[4]) + cimag(o[5])*I;
out[7] = cimag(o[6]) + cimag(o[7])*I;
}else{int i;
for(i=0;i<8;i++) out[i] = o[i];
}}else if(!log2stride){
splitfft(in, out, log2stride+1, stride << 1, N >> 1);
splitfft(in+stride, out+N/2, log2stride+2, stride << 2, N >> 2);
splitfft(in+3*stride, out+3*N/4, log2stride+2, stride << 2, N >> 2);
int k;for(k=0;k<N/4;k+=4) {
reg_t Uk = LOAD((float *)&out[k]);reg_t Zk = LOAD((float *)&out[k+N/2]);reg_t Uk2 = LOAD((float *)&out[k+N/4]);reg_t Zdk = LOAD((float *)&out[k+3*N/4]);reg_t w1 = LOAD((float *)&LUT1[log2stride][k]);
reg_t w3 = LOAD((float *)&LUT3[log2stride][k]);reg_t w3Zdk = MUL(w3, Zdk);
reg_t w1Zk = MUL(w1, Zk);reg_t sum = ADD(w1Zk, w3Zdk);
reg_t dif = SUB(w1Zk, w3Zdk);STOREIL((float *)&out[k], ADD(Uk, sum));STOREIL((float *)&out[k+N/2], SUB(Uk, sum));STOREIL((float *)&out[k+N/4], SUB_I(Uk2, dif));STOREIL((float *)&out[k+3*N/4], ADD_I(Uk2, dif));}
}else{
splitfft(in, out, log2stride+1, stride << 1, N >> 1);
splitfft(in+stride, out+N/2, log2stride+2, stride << 2, N >> 2);
splitfft(in+3*stride, out+3*N/4, log2stride+2, stride << 2, N >> 2);int k;
for(k=0;k<N/4;k+=4) {
reg_t Uk = LOAD((float *)&out[k]);reg_t Zk = LOAD((float *)&out[k+N/2]);reg_t Uk2 = LOAD((float *)&out[k+N/4]);reg_t Zdk = LOAD((float *)&out[k+3*N/4]);reg_t w1 = LOAD((float *)&LUT1[log2stride][k]);
reg_t w3 = LOAD((float *)&LUT3[log2stride][k]);reg_t w3Zdk = MUL(w3, Zdk);
reg_t w1Zk = MUL(w1, Zk);reg_t sum = ADD(w1Zk, w3Zdk);
reg_t dif = SUB(w1Zk, w3Zdk);STORE((float *)&out[k], ADD(Uk, sum));STORE((float *)&out[k+N/2], SUB(Uk, sum));STORE((float *)&out[k+N/4], SUB_I(Uk2, dif));STORE((float *)&out[k+3*N/4], ADD_I(Uk2, dif));}
}}
void fft_init(int N) {
#define log2(x) ((int)(log(x)/log(2)))int n_luts = log2(N)-1;LUT1 = malloc(n_luts * sizeof(data_t *));
LUT3 = malloc(n_luts * sizeof(data_t *));int i;
for(i=0;i<n_luts;i++) {
int n = N / pow(2,i);LUT1[i] = _mm_malloc(n/4 * sizeof(data_t),16);LUT3[i] = _mm_malloc(n/4 * sizeof(data_t),16);if(n == 8) {int j;
for(j=0;j<n/4;j++) {
LUT1[i][j] = W(n,j);LUT3[i][j] = W(n,3*j);
}}else{
int j;for(j=0;j<n/4;j+=4) {
data_t w1[4], w3[4];int k;
for(k=0;k<4;k++) w1[k] = W(n,j+k);for(k=0;k<4;k++) w3[k] = W(n,3*(j+k));
LUT1[i][j] = creal(w1[0]) + creal(w1[1])*I;LUT1[i][j+1] = creal(w1[2]) + creal(w1[3])*I;
LUT1[i][j+2] = cimag(w1[0]) + cimag(w1[1])*I;LUT1[i][j+3] = cimag(w1[2]) + cimag(w1[3])*I;
LUT3[i][j] = creal(w3[0]) + creal(w3[1])*I;LUT3[i][j+1] = creal(w3[2]) + creal(w3[3])*I;
LUT3[i][j+2] = cimag(w3[0]) + cimag(w3[1])*I;LUT3[i][j+3] = cimag(w3[2]) + cimag(w3[3])*I;
}}
}}
Split-radix FFT with vectorized loops
#include <math.h>#include <complex.h>#include <stdio.h>#include <stdlib.h>#include <xmmintrin.h>
typedef complex float data_t;
#define W(N,k) (cexp(-2.0f * M_PI * I * (float)(k) / (float)(N)))data_t **LUT1;
#include "vecmath.h"
data_t *base;
int TN;
void conjfft(data_t *in, data_t *out,int log2stride, int stride, int N) {if(N == 1) {
if(in < base) in += TN;
out[0] = in[0];}else if(N == 2) {
data_t *i0 = in, *i1 = in + stride;if(i0 < base) i0 += TN;
if(i1 < base) i1 += TN;
out[0] = *i0 + *i1;
out[N/2] = *i0 - *i1;
}else if(N == 4) {conjfft(in, out, log2stride+1, stride << 1, N >> 1);
conjfft(in+stride, out+N/2, log2stride+2, stride << 2, N >> 2);
conjfft(in-stride, out+3*N/4, log2stride+2, stride << 2, N >> 2);data_t temp0 = out[0] + (out[2] + out[3]);data_t temp1 = out[0] - (out[2] + out[3]);data_t temp2 = out[1] - I*(out[2] - out[3]);data_t temp3 = out[1] + I*(out[2] - out[3]); if(log2stride) {
out[0] = creal(temp0) + creal(temp2)*I;
out[1] = creal(temp1) + creal(temp3)*I;
out[2] = cimag(temp0) + cimag(temp2)*I;
out[3] = cimag(temp1) + cimag(temp3)*I;
}else{out[0] = temp0;out[2] = temp1;out[1] = temp2;out[3] = temp3; }}else if(N == 8) {
conjfft(in, out, log2stride+1, stride << 1, N >> 1);
conjfft(in+stride, out+N/2, log2stride+2, stride << 2, N >> 2);
conjfft(in-stride, out+3*N/4, log2stride+2, stride << 2, N >> 2);data_t o[8];{
data_t Uk = creal(out[0]) + creal(out[2])*I;data_t Zk = out[4];data_t Uk2 = creal(out[1]) + creal(out[3])*I;
data_t Zdk = out[6];o[0] = Uk + (Zk + Zdk);o[4] = Uk - (Zk + Zdk);o[2] = Uk2 - I*(Zk - Zdk);
o[6] = Uk2 + I*(Zk - Zdk);
}{
data_t Uk = cimag(out[0]) + cimag(out[2])*I;data_t Zk = out[5];data_t Uk2 = cimag(out[1]) + cimag(out[3])*I;
data_t Zdk = out[7];
data_t w1 = LUT1[log2stride][1];o[1] = Uk + (w1*Zk + conj(w1)*Zdk);
o[5] = Uk - (w1*Zk + conj(w1)*Zdk);
o[3] = Uk2 - I*(w1*Zk - conj(w1)*Zdk);
o[7] = Uk2 + I*(w1*Zk - conj(w1)*Zdk);
}if(log2stride) {
out[0] = creal(o[0]) + creal(o[1])*I;
out[1] = creal(o[2]) + creal(o[3])*I;
out[2] = cimag(o[0]) + cimag(o[1])*I;
out[3] = cimag(o[2]) + cimag(o[3])*I;
out[4] = creal(o[4]) + creal(o[5])*I;
out[5] = creal(o[6]) + creal(o[7])*I;
out[6] = cimag(o[4]) + cimag(o[5])*I;
out[7] = cimag(o[6]) + cimag(o[7])*I;
}else{int i;
for(i=0;i<8;i++) out[i] = o[i];
}}else if(!log2stride){
conjfft(in, out, log2stride+1, stride << 1, N >> 1);
conjfft(in+stride, out+N/2, log2stride+2, stride << 2, N >> 2);
conjfft(in-stride, out+3*N/4, log2stride+2, stride << 2, N >> 2);
int k;for(k=0;k<N/4;k+=4) {
reg_t Uk = LOAD((float *)&out[k]);reg_t Zk = LOAD((float *)&out[k+N/2]);reg_t Uk2 = LOAD((float *)&out[k+N/4]);reg_t Zdk = LOAD((float *)&out[k+3*N/4]);reg_t w1 = LOAD((float *)&LUT1[log2stride][k]);reg_t w3Zdk = MULJ(Zdk, w1);
reg_t w1Zk = MUL(w1, Zk);reg_t sum = ADD(w1Zk, w3Zdk);
reg_t dif = SUB(w1Zk, w3Zdk);STOREIL((float *)&out[k], ADD(Uk, sum));STOREIL((float *)&out[k+N/2], SUB(Uk, sum));STOREIL((float *)&out[k+N/4], SUB_I(Uk2, dif));STOREIL((float *)&out[k+3*N/4], ADD_I(Uk2, dif));}
}else{
conjfft(in, out, log2stride+1, stride << 1, N >> 1);
conjfft(in+stride, out+N/2, log2stride+2, stride << 2, N >> 2);
conjfft(in-stride, out+3*N/4, log2stride+2, stride << 2, N >> 2);int k;
for(k=0;k<N/4;k+=4) {
reg_t Uk = LOAD((float *)&out[k]);reg_t Zk = LOAD((float *)&out[k+N/2]);reg_t Uk2 = LOAD((float *)&out[k+N/4]);reg_t Zdk = LOAD((float *)&out[k+3*N/4]);reg_t w1 = LOAD((float *)&LUT1[log2stride][k]);reg_t w3Zdk = MULJ(Zdk, w1);
reg_t w1Zk = MUL(w1, Zk);reg_t sum = ADD(w1Zk, w3Zdk);
reg_t dif = SUB(w1Zk, w3Zdk);STORE((float *)&out[k], ADD(Uk, sum));STORE((float *)&out[k+N/2], SUB(Uk, sum));STORE((float *)&out[k+N/4], SUB_I(Uk2, dif));STORE((float *)&out[k+3*N/4], ADD_I(Uk2, dif));}
}}
void fft_init(int N) {
#define log2(x) ((int)(log(x)/log(2)))int n_luts = log2(N)-1;
LUT1 = malloc(n_luts * sizeof(data_t *));int i;
for(i=0;i<n_luts;i++) {
int n = N / pow(2,i);LUT1[i] = _mm_malloc(n/4 * sizeof(data_t),16);if(n == 8) {int j;
for(j=0;j<n/4;j++) {
LUT1[i][j] = W(n,j);}
}else{int j;
for(j=0;j<n/4;j+=4) {
data_t w1[4];
int k;for(k=0;k<4;k++) w1[k] = W(n,j+k);
LUT1[i][j] = creal(w1[0]) + creal(w1[1])*I;LUT1[i][j+1] = creal(w1[2]) + creal(w1[3])*I;
LUT1[i][j+2] = cimag(w1[0]) + cimag(w1[1])*I;LUT1[i][j+3] = cimag(w1[2]) + cimag(w1[3])*I;
}}
}TN = N;
}
Conjugate-pair FFT with vectorized loops
Notification Switch
Would you like to follow the 'Computing the fast fourier transform on simd microprocessors' conversation and receive update notifications?