/*
 * NOTE(review): the original file began with an #include directive whose
 * header name was missing (a bare "#include" is a preprocessing error).
 * Nothing in this file uses a library symbol, so no header is included.
 * Restore the intended header here if the rest of the project needs it.
 */

/*
 * USE_MMX selects the hand-written MMX kernel.  That kernel is written in
 * MSVC inline-assembler syntax and keeps pointers in 32-bit registers, so it
 * is enabled by default only for MSVC targeting 32-bit x86; every other
 * toolchain gets the portable C implementation.  Define USE_MMX on the
 * command line to override the default.
 */
#ifndef USE_MMX
#if defined(_MSC_VER) && defined(_M_IX86)
#define USE_MMX 1
#else
#define USE_MMX 0
#endif
#endif

/* UNROLL enables the 4-way unrolled inner loop of the MMX kernel. */
#define UNROLL 1

/*
 * RadKernel - accumulate form-factor-weighted RGB values and scale the
 * totals by a material triple.
 *
 * Input layout, as consumed by the code below:
 *   ffPtr   stream of runs.  Each run starts with two 16-bit words:
 *             [0] lcount - number of quads covered by the run
 *             [1] flag   - non-zero: lcount 16-bit weights follow;
 *                          zero: no weights follow and the run is skipped
 *   radPtr  3 unsigned shorts per quad (named R, G, B after sumR/sumG/sumB),
 *           advanced by 3 * lcount for every run, skipped or not.
 *   material  per-channel 16-bit multipliers applied to the final sums.
 *   results   receives the three scaled sums.
 *
 * Runs are consumed until nQuads quads have been accounted for.  At least
 * one run header is always read, even when nQuads is 0.
 *
 * Returns ffPtr advanced past everything that was consumed.
 *
 * Differences between the two implementations (visible in the code):
 *   - The C path sums full 32-bit products and takes the high 16 bits once
 *     at the end.  The MMX path takes the high 16 bits of every product
 *     (pmulhuw) and accumulates them in wrapping 16-bit lanes (paddw), so
 *     low-order results can differ.
 *   - The MMX path works on 4 lanes: each pmulhuw reads 8 bytes from radPtr
 *     (and once from material), and the final movq stores 8 bytes, i.e.
 *     4 shorts, into results.  Callers using that path must provide buffers
 *     that tolerate the extra lane.
 *   - The C path loops while the remaining count is > 0, the MMX path while
 *     it is != 0.
 */
unsigned short *RadKernel(int nQuads,
                          unsigned short *ffPtr,
                          unsigned short *radPtr,
                          unsigned short *material,
                          unsigned short *results)
{
#if USE_MMX
    __asm {
        mov         eax, ffPtr              // eax = form-factor stream cursor
        mov         ebx, radPtr             // ebx = RGB triple cursor
        mov         ecx, nQuads             // ecx = quads still to account for
        mov         edx, results

        pxor        mm0, mm0                // mm0..mm3 = lane accumulators
        pxor        mm1, mm1
        pxor        mm2, mm2
        pxor        mm3, mm3

    main_loop:
        prefetchnta [ebx+128]
        prefetchnta [eax+128]

        movzx       esi, word ptr [eax]     // esi = lcount of this run
        movzx       edi, word ptr [eax+2]   // edi = flag of this run
        add         eax, 4                  // step over the run header
        sub         ecx, esi

        test        edi, edi
        jz          skip                    // flag == 0: run carries no weights

        mov         edi, esi
#if UNROLL
        shr         esi, 2                  // esi = number of 4-quad groups
        and         edi, 3                  // edi = leftover quads (lcount % 4)
        test        edi, edi
        jz          skip_pre_loop
#endif
        // One quad per iteration.
        // NOTE(review): with UNROLL disabled this loop is entered with
        // edi == lcount and has no guard for lcount == 0.
    pre_inner_loop:
        movd        mm4, [eax]
        pshufw      mm4, mm4, 0             // broadcast the weight to all lanes
        pmulhuw     mm4, [ebx]              // high 16 bits of weight * channel
        paddw       mm0, mm4
        add         eax, 2                  // next weight
        add         ebx, 6                  // next RGB triple (3 shorts)
        dec         edi
        jnz         pre_inner_loop
#if UNROLL
    skip_pre_loop:
        test        esi, esi
        jz          end_main_loop

        // Four quads per iteration, one accumulator per quad.
    inner_loop:
        prefetchnta [ebx+128]
        prefetchnta [eax+128]
        movq        mm4, [eax]              // load four weights
        pshufw      mm7, mm4, 0xFF          // broadcast weight 3
        pshufw      mm6, mm4, 0xAA          // broadcast weight 2
        pshufw      mm5, mm4, 0x55          // broadcast weight 1
        pshufw      mm4, mm4, 0x00          // broadcast weight 0
        pmulhuw     mm4, [ebx]
        pmulhuw     mm5, [ebx+6]
        pmulhuw     mm6, [ebx+12]
        pmulhuw     mm7, [ebx+18]
        paddw       mm0, mm4
        paddw       mm1, mm5
        paddw       mm2, mm6
        paddw       mm3, mm7
        add         eax, 8                  // four weights consumed
        add         ebx, 24                 // four RGB triples consumed
        dec         esi
        jnz         inner_loop
#endif
        jmp         end_main_loop

    skip:
        // Advance the triple cursor by 6 * lcount bytes (4*esi + 2*esi).
        lea         ebx, [ebx+4*esi]
        lea         ebx, [ebx+2*esi]

    end_main_loop:
        test        ecx, ecx
        jnz         main_loop

        // Fold the four accumulators into mm0, then scale by the material.
        paddw       mm0, mm1
        paddw       mm2, mm3
        paddw       mm0, mm2
        mov         ecx, material
        pmulhuw     mm0, [ecx]
        movq        [edx], mm0              // stores 4 shorts into results

        // Publish the advanced stream cursor so it can be returned
        // explicitly instead of relying on the value left in eax.
        mov         ffPtr, eax

        // FEMMS
        _emit       0x0f
        _emit       0x0e
    }
    return ffPtr;
#else
    unsigned int sumR, sumG, sumB, ff;
    int count, lcount;

    sumR = sumG = sumB = 0;
    count = nQuads;

    do {
        /* Run header: quad count followed by the has-weights flag. */
        lcount = *ffPtr++;
        count -= lcount;

        if (*ffPtr++) {
            while (lcount--) {
                /* ff is unsigned int, so the products are computed in
                 * unsigned arithmetic and cannot overflow a signed int. */
                ff = ffPtr[0];
                sumR += ff * radPtr[0];
                sumG += ff * radPtr[1];
                sumB += ff * radPtr[2];
                ffPtr++;
                radPtr += 3;
            }
        } else {
            /* No weights stored for this run: only skip its RGB triples. */
            radPtr += 3 * lcount;
        }
    } while (count > 0);

    /* Keep the high 16 bits of each sum, scale by the material, and keep
     * the high 16 bits of that product. */
    results[0] = ((sumR >> 16) * material[0]) >> 16;
    results[1] = ((sumG >> 16) * material[1]) >> 16;
    results[2] = ((sumB >> 16) * material[2]) >> 16;

    return ffPtr;
#endif
}