#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <math.h> 
#include <stdlib.h>
#include <errno.h>
#include "masm2as.h"

#define MAX_STR_LEN 512

#if !defined _WIN32
# include <stdint.h>
#else
# include <malloc.h>
#endif

#if defined _WIN32 || defined _WIN64
# define __INT64 __int64
# define __UINT64 unsigned __int64
#else
# define __INT64 long long
# define __UINT64 unsigned long long
#endif

#include "ipp.h"

/*
 * Skip leading spaces and '(' in s, then test whether the word a appears
 * (case-insensitively) at that position.
 * Returns the index of the match in s, or -1 if a does not match there.
 *
 * Replaces the MSVC-only _strnicmp() with a portable tolower() loop so the
 * function builds on non-Windows toolchains as well.
 */
static int nextWord( char* s, char* a ){
int i, j, l = (int)strlen( a ), len = (int)strlen( s );

    /* advance past leading blanks and open parens */
    for( j = 0; j < len-l+1; j++ ){
        if(( ' ' != s[j] )&&( '(' != s[j] )) break;
    }
    /* case-insensitive compare of a against s[j..]; stops at either NUL */
    for( i = 0; i < l; i++ ){
        if( tolower((unsigned char)a[i]) != tolower((unsigned char)s[j+i]) ){
            return -1;
        }
    }
    return j;
}

/*
 * Print the IPP library version, then one line per CPU feature bit showing
 * whether the hardware supports it (first column) and whether the selected
 * IPP library level has it enabled (second column).
 *
 * The repeated printf triples were collapsed into a data table; the output
 * format is byte-identical ("  <name padded to 20>= Y\tN\t<description>\n").
 */
int main(int argc, char* argv[])
{
   /* Feature bit, its symbolic name, and the human-readable description. */
   static const struct {
      Ipp64u      bit;
      const char *name;
      const char *desc;
   } features[] = {
      { ippCPUID_MMX,       "ippCPUID_MMX",       "Intel Architecture MMX technology supported" },
      { ippCPUID_SSE,       "ippCPUID_SSE",       "Streaming SIMD Extensions" },
      { ippCPUID_SSE2,      "ippCPUID_SSE2",      "Streaming SIMD Extensions 2" },
      { ippCPUID_SSE3,      "ippCPUID_SSE3",      "Streaming SIMD Extensions 3" },
      { ippCPUID_SSSE3,     "ippCPUID_SSSE3",     "Supplemental Streaming SIMD Extensions 3" },
      { ippCPUID_MOVBE,     "ippCPUID_MOVBE",     "The processor supports MOVBE instruction" },
      { ippCPUID_SSE41,     "ippCPUID_SSE41",     "Streaming SIMD Extensions 4.1" },
      { ippCPUID_SSE42,     "ippCPUID_SSE42",     "Streaming SIMD Extensions 4.2" },
      { ippCPUID_AVX,       "ippCPUID_AVX",       "Advanced Vector Extensions instruction set" },
      { ippAVX_ENABLEDBYOS, "ippAVX_ENABLEDBYOS", "The operating system supports AVX" },
      { ippCPUID_AES,       "ippCPUID_AES",       "AES instruction" },
      { ippCPUID_SHA,       "ippCPUID_SHA",       "SHA new instructions" },
      { ippCPUID_CLMUL,     "ippCPUID_CLMUL",     "PCLMULQDQ instruction" },
      { ippCPUID_RDRAND,    "ippCPUID_RDRAND",    "Read Random Number instructions" },
      { ippCPUID_F16C,      "ippCPUID_F16C",      "Float16 instructions" },
      { ippCPUID_AVX2,      "ippCPUID_AVX2",      "Advanced Vector Extensions 2 instruction set" },
      { ippCPUID_AVX512F,   "ippCPUID_AVX512F",   "Advanced Vector Extensions 3.1 instruction set" },
      { ippCPUID_AVX512CD,  "ippCPUID_AVX512CD",  "Advanced Vector Extensions CD (Conflict Detection) instruction set" },
      { ippCPUID_AVX512ER,  "ippCPUID_AVX512ER",  "Advanced Vector Extensions ER instruction set" },
      { ippCPUID_ADCOX,     "ippCPUID_ADCOX",     "ADCX and ADOX instructions" },
      { ippCPUID_RDSEED,    "ippCPUID_RDSEED",    "The RDSEED instruction" },
      { ippCPUID_PREFETCHW, "ippCPUID_PREFETCHW", "The PREFETCHW instruction" },
      { ippCPUID_KNC,       "ippCPUID_KNC",       "Knights Corner instruction set" },
   };
   const IppLibraryVersion *lib;
   IppStatus status;
   Ipp64u mask, emask;
   size_t i;

   (void)argc;
   (void)argv;

/* Version info */
lib = ippiGetLibVersion();
printf("%s %s\n", lib->Name, lib->Version);

/* Get CPU features and features enabled with selected library level */
   status = ippGetCpuFeatures( &mask, 0 );
   if( ippStsNoErr == status ) {
      emask = ippGetEnabledCpuFeatures();
      for( i = 0; i < sizeof(features)/sizeof(features[0]); i++ ) {
         /* %-20s pads the name to the same column the hand-written version used */
         printf("  %-20s= %c\t%c\t%s\n",
                features[i].name,
                ( mask  & features[i].bit ) ? 'Y' : 'N',
                ( emask & features[i].bit ) ? 'Y' : 'N',
                features[i].desc);
      }
   }
   return 0;
}

/* SIMD register names recognised by the translator: xmm0-7, ymm0-7, mm0-7.
 * A string literal in a char[8] slot zero-pads the remainder, so these are
 * byte-identical to the old explicit {'x','m','m','0',0,...} spellings. */
static char xreg[][8]= {
    "xmm0", "xmm1", "xmm2", "xmm3",
    "xmm4", "xmm5", "xmm6", "xmm7",
    "ymm0", "ymm1", "ymm2", "ymm3",
    "ymm4", "ymm5", "ymm6", "ymm7",
    "mm0",  "mm1",  "mm2",  "mm3",
    "mm4",  "mm5",  "mm6",  "mm7",
};
#define xregSize (sizeof(xreg)/sizeof(xreg[0]))

/* x87 FPU stack register spellings: bare "st" plus st(0)..st(7).
 * String literals zero-pad each char[8] slot exactly like the previous
 * character-by-character initializers. */
static char fpureg[][8]= {
    "st",
    "st(0)", "st(1)", "st(2)", "st(3)",
    "st(4)", "st(5)", "st(6)", "st(7)",
};
#define fpuregSize (sizeof(fpureg)/sizeof(fpureg[0]))

/* General-purpose register table.  Bytes [0..6] hold the NUL-terminated
 * register name; byte [7] holds the AT&T operand-size suffix for that
 * register ('l' = 32-bit, 'w' = 16-bit, 'b' = 8-bit).  The [7]= designated
 * initializer zero-fills the gap, matching the old explicit zeros. */
static char gpr[][8]= {
    { 'e','a','x', [7]='l' },
    { 'a','x',     [7]='w' },
    { 'a','l',     [7]='b' },
    { 'a','h',     [7]='b' },
    { 'e','c','x', [7]='l' },
    { 'c','x',     [7]='w' },
    { 'c','l',     [7]='b' },
    { 'c','h',     [7]='b' },
    { 'e','d','x', [7]='l' },
    { 'd','x',     [7]='w' },
    { 'd','l',     [7]='b' },
    { 'd','h',     [7]='b' },
    { 'e','b','x', [7]='l' },
    { 'b','x',     [7]='w' },
    { 'b','l',     [7]='b' },
    { 'b','h',     [7]='b' },
    { 'e','s','i', [7]='l' },
    { 's','i',     [7]='w' },
    { 'e','d','i', [7]='l' },
    { 'd','i',     [7]='w' },
    { 'e','b','p', [7]='l' },
    { 'b','p',     [7]='w' },
    { 'e','s','p', [7]='l' },
    { 's','p',     [7]='w' }
};
#define gprSize (sizeof(gpr)/sizeof(gpr[0]))

/* MASM operand-size keywords.  Bytes [0..7] hold the NUL-terminated keyword;
 * byte [8] holds the matching AT&T suffix letter ('b','w','l','q','t'), or 0
 * for the sizes that have no single-letter suffix (oword/xmmword/ymmword).
 * Designated initializers replace the old runs of explicit zeros; the
 * resulting bytes are identical. */
static char keyType[][16]= {
    { 'b','y','t','e',             [8]='b' },
    { 's','b','y','t','e',         [8]='b' },
    { 'w','o','r','d',             [8]='w' },
    { 's','w','o','r','d',         [8]='w' },
    { 'd','w','o','r','d',         [8]='l' },
    { 's','d','w','o','r','d',     [8]='l' },
    { 'q','w','o','r','d',         [8]='q' },
    { 'm','m','w','o','r','d',     [8]='q' },
    { 'o','w','o','r','d'                  },
    { 'x','m','m','w','o','r','d'          },
    { 'y','m','m','w','o','r','d'          },
    { 'r','e','a','l','4',         [8]='l' },
    { 'r','e','a','l','8',         [8]='q' },
    { 'r','e','a','l','1','0',     [8]='t' }
};
#define KeyTypesSize (sizeof(keyType)/sizeof(keyType[0]))

/* MASM data-definition directives.  Bytes [0..6]: NUL-terminated MASM
 * directive; byte [7]: element size in bytes (1/2/4/8/10); bytes [8..]:
 * the NUL-terminated GNU as replacement directive.  The [7]= designator
 * both zero-fills the name padding and lets the replacement string start
 * at [8] — same bytes as the old fully spelled-out rows. */
static char dataDef[][16]= {
    { 'd','b',                 [7]= 1, '.','b','y','t','e' },
    { 'b','y','t','e',         [7]= 1, '.','b','y','t','e' },
    { 's','b','y','t','e',     [7]= 1, '.','b','y','t','e' },
    { 'd','w',                 [7]= 2, '.','w','o','r','d' },
    { 'w','o','r','d',         [7]= 2, '.','w','o','r','d' },
    { 's','w','o','r','d',     [7]= 2, '.','w','o','r','d' },
    { 'd','d',                 [7]= 4, '.','l','o','n','g' },
    { 'd','w','o','r','d',     [7]= 4, '.','l','o','n','g' },
    { 's','d','w','o','r','d', [7]= 4, '.','l','o','n','g' },
    { 'r','e','a','l','4',     [7]= 4, '.','l','o','n','g' },
    { 'd','q',                 [7]= 8, '.','q','u','a','d' },
    { 'q','w','o','r','d',     [7]= 8, '.','q','u','a','d' },
    { 'm','m','w','o','r','d', [7]= 8, '.','q','u','a','d' },
    { 'r','e','a','l','8',     [7]= 8, '.','q','u','a','d' },
    { 'd','t',                 [7]=10, '.','t','f','l','o','a','t' },
    { 'r','e','a','l','1','0', [7]=10, '.','t','f','l','o','a','t' },
};
#define dataDefSize (sizeof(dataDef)/sizeof(dataDef[0]))

/*
 * Forward declarations for the Lebos 3x3 convolution helpers defined later
 * in this file.  Each computes the horizontal (accumH) and vertical (accumV)
 * convolution sums for a group of 8 pixels selected by (offset_x, offset_y).
 * NOTE(review): uchar/uint/float8 are OpenCL built-in types — this section
 * must be compiled as OpenCL C, not as host C; confirm the build splits
 * this file accordingly.
 */
void tmpiRetlifLebos3x3_8u32f(const uchar* pixeldata, const uint offset_x, const uint offset_y, const int stride, float8* accumH, float8* accumV);
void tmpiRetlifLebos3x3_8u32f_8x1pix(const uchar* pixeldata, const uint offset_x, const uint offset_y, const int stride, float8* accumH, float8* accumV);

/*
 * Shift a 16-lane vector one pixel to the LEFT neighbourhood:
 * _dst lane 0 gets the last lane of _prev, lanes 1..15 get _cur lanes 0..14.
 * Wrapped in do{}while(0) so the macro behaves as a single statement inside
 * unbraced if/else, and arguments are parenthesized against precedence
 * surprises; each argument is still evaluated exactly once.
 */
#define SHUFFLE_UP_16( _dst, _prev, _cur )                       \
    do {                                                         \
        (_dst).s0               = (_prev).sf;                    \
        (_dst).s123456789abcdef = (_cur).s0123456789abcde;       \
    } while (0)

/*
 * Shift a 16-lane vector one pixel to the RIGHT neighbourhood:
 * _dst lanes 0..14 get _cur lanes 1..15, lane 15 gets the first lane of
 * _next.  do{}while(0) wrapper and parenthesized arguments as for
 * SHUFFLE_UP_16; single evaluation per argument.
 */
#define SHUFFLE_DOWN_16( _dst, _cur, _next )                     \
    do {                                                         \
        (_dst).s0123456789abcde = (_cur).s123456789abcdef;       \
        (_dst).sf               = (_next).s0;                    \
    } while (0)

/*
 * Naive Lebos filter, one 16-pixel vector per work-item, no border offset.
 * Reads the 3x3 neighbourhood of each lane, computes the horizontal and
 * vertical gradients, and stores the per-lane magnitude as ushort16.
 * Floating-point term order is kept exactly as before so results are
 * bit-identical.
 */
__kernel void
LebosRetlifNaiveNoOffset16x1(
    const __global uchar16* src,
    __global ushort16* dst,
    uint fullstride )
{
    /* stride expressed in uchar16 vectors rather than bytes */
    uint vec_stride = fullstride / sizeof(uchar16);

    uint col = get_global_id(0);
    uint row = get_global_id(1);

    uint idx = row * vec_stride + col;

    /* centre column: rows above, at, and below this work-item */
    uchar16 top = src[ idx - vec_stride ];
    uchar16 cur = src[ idx              ];
    uchar16 bot = src[ idx + vec_stride ];

    /* left neighbours of each lane (borrow last lane of the vector to the left) */
    uchar16 top_l, cur_l, bot_l;

    SHUFFLE_UP_16( top_l, src[ idx - vec_stride - 1 ], top );
    SHUFFLE_UP_16( cur_l, src[ idx              - 1 ], cur );
    SHUFFLE_UP_16( bot_l, src[ idx + vec_stride - 1 ], bot );

    /* right neighbours (borrow first lane of the vector to the right) */
    uchar16 top_r, cur_r, bot_r;

    SHUFFLE_DOWN_16( top_r, top, src[ idx - vec_stride + 1 ] );
    SHUFFLE_DOWN_16( cur_r, cur, src[ idx              + 1 ] );
    SHUFFLE_DOWN_16( bot_r, bot, src[ idx + vec_stride + 1 ] );

    /* [Begin Lebos] horizontal gradient: -TL +TR -2L +2R -BL +BR */
    float16 grad_x =
        convert_float16( top_l ) * -1.0f +
        convert_float16( top_r )         +
        convert_float16( cur_l ) * -2.0f +
        convert_float16( cur_r ) *  2.0f +
        convert_float16( bot_l ) * -1.0f +
        convert_float16( bot_r );

    /* vertical gradient: +TL +2T +TR -BL -2B -BR */
    float16 grad_y =
        convert_float16( top_l )         +
        convert_float16( top   ) *  2.0f +
        convert_float16( top_r )         +
        convert_float16( bot_l ) * -1.0f +
        convert_float16( bot   ) * -2.0f +
        convert_float16( bot_r ) * -1.0f;

    dst[ idx ] = convert_ushort16( sqrt( grad_x * grad_x + grad_y * grad_y ) );
}
/*
 * Forward declaration: computes two rows (16x2 block) of Lebos 3x3
 * horizontal/vertical convolution sums starting at pixel_offset.
 * destH/destV each receive two float16 rows; defined below.
 */
void
myowniConvolveLebos3x3_8u32f_16x2block_global(
	const __global uchar16* src,
	const uint stride,
	const uint pixel_offset,
	float16* destH,
	float16* destV
);

/*
 * Lebos 3x3 kernel processing a 16-wide x 2-row block per work-item.
 * Delegates the convolution to the _16x2block_global helper, then stores
 * the gradient magnitude (native_sqrt of H^2+V^2) for both rows.
 */
__kernel void
myowniLebos3x3_8u16s_16x2block_global(
    const __global uchar16* src,
    __global ushort16* dst,
    uint fullstride )
{
    /* stride in uchar16 vectors */
    uint vec_stride = fullstride / sizeof(uchar16);

    uint col  = get_global_id(0);
    uint row  = get_global_id(1);
    /* each work-item owns two consecutive image rows */
    uint base = row * 2 * vec_stride + col;

    float16 grad_h[2];
    float16 grad_v[2];

    myowniConvolveLebos3x3_8u32f_16x2block_global(src, vec_stride, base, grad_h, grad_v);

    vstore16(convert_ushort16(native_sqrt(grad_h[0] * grad_h[0] + grad_v[0] * grad_v[0])), 0, (__global ushort*)(dst + base));
    vstore16(convert_ushort16(native_sqrt(grad_h[1] * grad_h[1] + grad_v[1] * grad_v[1])), 0, (__global ushort*)(dst + base + vec_stride));
}

/*
 * Compute Lebos 3x3 horizontal/vertical convolution sums for a 16x2 block.
 * Loads 4 rows x 16 lanes (plus one pixel of left/right context via the
 * SHUFFLE macros) and writes two float16 results into destH[0..1] and
 * destV[0..1].  Term ordering in the mad() chains is significant for
 * bit-exact float results — do not reassociate.
 */
void
myowniConvolveLebos3x3_8u32f_16x2block_global(
	const __global uchar16* src,
	const uint stride,
	const uint pixel_offset,
	float16* destH,
	float16* destV
)
{
	/* per-tap coefficient vectors */
	float16 neg1 = -1.0;
	float16 pos2 =  2.0;
	float16 neg2 = -2.0;

    /* four consecutive rows: above, centre (row 0), below (row 1), and below+1 */
    float16 u      = convert_float16(vload16(0, (__global uchar*)(src + pixel_offset - stride)));
    float16 c      = convert_float16(vload16(0, (__global uchar*)(src + pixel_offset         )));
    float16 b      = convert_float16(vload16(0, (__global uchar*)(src + pixel_offset + stride)));
	float16 b2     = convert_float16(vload16(0, (__global uchar*)(src + pixel_offset + 2 * stride)));

    /* left-shifted neighbours of each row (bl3/bl4 declared but unused here) */
    float16 ul, l, bl, bl2, bl3, bl4;
    SHUFFLE_UP_16( ul, convert_float16(vload16(0, (__global uchar*)(src + pixel_offset - stride - 1))), u );
    SHUFFLE_UP_16( l,  convert_float16(vload16(0, (__global uchar*)(src + pixel_offset          - 1))), c );
    SHUFFLE_UP_16( bl, convert_float16(vload16(0, (__global uchar*)(src + pixel_offset + stride - 1))), b );
	SHUFFLE_UP_16( bl2, convert_float16(vload16(0, (__global uchar*)(src + pixel_offset + 2 * stride - 1))), b2 );

    /* right-shifted neighbours (br3/br4 declared but unused here) */
    float16 ur, r, br, br2, br3, br4;
    SHUFFLE_DOWN_16( ur, u, convert_float16(vload16(0, (__global uchar*)(src + pixel_offset - stride + 1))) );
    SHUFFLE_DOWN_16( r,  c, convert_float16(vload16(0, (__global uchar*)(src + pixel_offset          + 1))) );
    SHUFFLE_DOWN_16( br, b, convert_float16(vload16(0, (__global uchar*)(src + pixel_offset + stride + 1))) );
	SHUFFLE_DOWN_16( br2, b2, convert_float16(vload16(0, (__global uchar*)(src + pixel_offset + 2 * stride + 1))) );
    
    // Begin Lebos Specific 3x3 Convolution
	/* row 0: H = -ul -2l +2r -bl +br +ur ; V = +ul +2u +ur -bl -2b -br */
	destH[0] = ur + mad(ul, neg1, mad(l, neg2, mad(r, pos2, mad(bl, neg1, br))));
	destV[0] = ul + mad(u, pos2, mad(bl, neg1, mad(b, neg2, mad(br, neg1, ur))));

	/* row 1: same taps shifted one row down (centre is row b) */
	destH[1] = r + mad(l, neg1, mad(bl, neg2, mad(br, pos2, mad(bl2, neg1, br2))));
	destV[1] = l + mad(c, pos2, mad(bl2, neg1, mad(b2, neg2, mad(br2, neg1, r))));
}



/*
    uint    pixel_offset = y * 2 * stride + x;
    float16 u      = convert_float16(vload16(0, (__global uchar*)(src + pixel_offset - stride)));
    float16 c      = convert_float16(vload16(0, (__global uchar*)(src + pixel_offset         )));
    float16 b      = convert_float16(vload16(0, (__global uchar*)(src + pixel_offset + stride)));
	float16 b2     = convert_float16(vload16(0, (__global uchar*)(src + pixel_offset + 2 * stride)));

    float16 ul, l, bl, bl2, bl3, bl4;
    SHUFFLE_UP_16( ul, convert_float16(vload16(0, (__global uchar*)(src + pixel_offset - stride - 1))), u );
    SHUFFLE_UP_16( l,  convert_float16(vload16(0, (__global uchar*)(src + pixel_offset          - 1))), c );
    SHUFFLE_UP_16( bl, convert_float16(vload16(0, (__global uchar*)(src + pixel_offset + stride - 1))), b );
	SHUFFLE_UP_16( bl2, convert_float16(vload16(0, (__global uchar*)(src + pixel_offset + 2 * stride - 1))), b2 );

    float16 ur, r, br, br2, br3, br4;
    SHUFFLE_DOWN_16( ur, u, convert_float16(vload16(0, (__global uchar*)(src + pixel_offset - stride + 1))) );
    SHUFFLE_DOWN_16( r,  c, convert_float16(vload16(0, (__global uchar*)(src + pixel_offset          + 1))) );
    SHUFFLE_DOWN_16( br, b, convert_float16(vload16(0, (__global uchar*)(src + pixel_offset + stride + 1))) );
	SHUFFLE_DOWN_16( br2, b2, convert_float16(vload16(0, (__global uchar*)(src + pixel_offset + 2 * stride + 1))) );

    float16  output_x;
    float16  output_y;

	float16 neg1 = -1.0;
	float16 pos2 =  2.0;
	float16 neg2 = -2.0;
    
    // [Begin Lebos]
	output_x = ur + mad(ul, neg1, mad(l, neg2, mad(r, pos2, mad(bl, neg1, br))));
	output_y = ul + mad(u, pos2, mad(bl, neg1, mad(b, neg2, mad(br, neg1, ur))));
	vstore16(convert_ushort16(native_sqrt(output_x * output_x + output_y * output_y)), 0, (__global ushort*)(dst + pixel_offset));

	output_x = r + mad(l, neg1, mad(bl, neg2, mad(br, pos2, mad(bl2, neg1, br2))));
	output_y = l + mad(c, pos2, mad(bl2, neg1, mad(b2, neg2, mad(br2, neg1, r))));
	vstore16(convert_ushort16(native_sqrt(output_x * output_x + output_y * output_y)), 0, (__global ushort*)(dst + pixel_offset + stride));
	*/





/*
 * Lebos filter processing a 16-wide x 4-row block per work-item.
 * Loads 6 rows of pixels (one above, the 4 output rows, one below) with
 * one pixel of left/right context, then emits the gradient magnitude for
 * each of the 4 rows via vstore16.  mad() chain term order is significant
 * for bit-exact float results — do not reassociate.
 */
__kernel void
LebosRetlifNaiveNoOffset16x4(
    const __global uchar16* src,
    __global ushort16* dst,
    uint fullstride )
{
    /* stride in uchar16 vectors */
    uint    stride = fullstride / sizeof(uchar16);

    uint    x = get_global_id(0);
    uint    y = get_global_id(1);

    /* each work-item owns 4 consecutive rows */
    uint    pixel_offset = y * 4 * stride + x;
    /* rows: u = above, c = row 0, b..b4 = rows 1..4 below */
    float16 u      = convert_float16(src[ pixel_offset - stride ]);
    float16 c      = convert_float16(src[ pixel_offset          ]);
    float16 b      = convert_float16(src[ pixel_offset + stride ]);
	float16 b2     = convert_float16(src[ pixel_offset + 2 * stride ]);
	float16 b3     = convert_float16(src[ pixel_offset + 3 * stride ]);
	float16 b4     = convert_float16(src[ pixel_offset + 4 * stride ]);

    /* left-shifted neighbours of each row */
    float16 ul, l, bl, bl2, bl3, bl4;
    SHUFFLE_UP_16( ul, convert_float16(src[ pixel_offset - stride - 1 ]), u );
    SHUFFLE_UP_16( l,  convert_float16(src[ pixel_offset          - 1 ]), c );
    SHUFFLE_UP_16( bl, convert_float16(src[ pixel_offset + stride - 1 ]), b );
	SHUFFLE_UP_16( bl2, convert_float16(src[ pixel_offset + 2 * stride - 1 ]), b2 );
	SHUFFLE_UP_16( bl3, convert_float16(src[ pixel_offset + 3 * stride - 1 ]), b3 );
	SHUFFLE_UP_16( bl4, convert_float16(src[ pixel_offset + 4 * stride - 1 ]), b4 );

    /* right-shifted neighbours of each row */
    float16 ur, r, br, br2, br3, br4;
    SHUFFLE_DOWN_16( ur, u, convert_float16(src[ pixel_offset - stride + 1 ]) );
    SHUFFLE_DOWN_16( r,  c, convert_float16(src[ pixel_offset          + 1 ]) );
    SHUFFLE_DOWN_16( br, b, convert_float16(src[ pixel_offset + stride + 1 ]) );
	SHUFFLE_DOWN_16( br2, b2, convert_float16(src[ pixel_offset + 2 * stride + 1 ]) );
	SHUFFLE_DOWN_16( br3, b3, convert_float16(src[ pixel_offset + 3 * stride + 1 ]) );
	SHUFFLE_DOWN_16( br4, b4, convert_float16(src[ pixel_offset + 4 * stride + 1 ]) );

    float16  output_x = 0;
    float16  output_y = 0;

	/* per-tap coefficient vectors */
	float16 neg1 = -1.0;
	float16 pos2 =  2.0;
	float16 neg2 = -2.0;
    
    // [Begin Lebos]
	/* output row 0 (centre = c) */
	output_x = ur + mad(ul, neg1, mad(l, neg2, mad(r, pos2, mad(bl, neg1, br))));
	output_y = ul + mad(u, pos2, mad(bl, neg1, mad(b, neg2, mad(br, neg1, ur))));
	vstore16(convert_ushort16(native_sqrt(output_x * output_x + output_y * output_y)), 0, (__global ushort*)(dst + pixel_offset));

	/* output row 1 (centre = b): same taps shifted one row down */
	output_x = r + mad(l, neg1, mad(bl, neg2, mad(br, pos2, mad(bl2, neg1, br2))));
	output_y = l + mad(c, pos2, mad(bl2, neg1, mad(b2, neg2, mad(br2, neg1, r))));
	vstore16(convert_ushort16(native_sqrt(output_x * output_x + output_y * output_y)), 0, (__global ushort*)(dst + pixel_offset + stride));

	/* output row 2 (centre = b2) */
	output_x = br + mad(bl, neg1, mad(bl2, neg2, mad(br2, pos2, mad(bl3, neg1, br3))));
	output_y = bl + mad(b, pos2, mad(bl3, neg1, mad(b3, neg2, mad(br3, neg1, br))));
	vstore16(convert_ushort16(native_sqrt(output_x * output_x + output_y * output_y)), 0, (__global ushort*)(dst + pixel_offset + stride*2));

	/* output row 3 (centre = b3) */
	output_x = br2 + mad(bl2, neg1, mad(bl3, neg2, mad(br3, pos2, mad(bl4, neg1, br4))));
	output_y = bl2 + mad(b2, pos2, mad(bl4, neg1, mad(b4, neg2, mad(br4, neg1, br2))));
	vstore16(convert_ushort16(native_sqrt(output_x * output_x + output_y * output_y)), 0, (__global ushort*)(dst + pixel_offset + stride*3));
}

/*
 * Lebos 3x3 kernel, 8 pixels per work-item, delegating the convolution to
 * tmpiRetlifLebos3x3_8u32f and writing |H|+|V| as the magnitude
 * approximation (the exact sqrt form is kept commented out below).
 */
__kernel
void tmpiLebos3x3_8u16s(
   __global uchar   *imgIn,
   __global ushort8 *imgOut, 
            int		stride
   ) 
{
	//
	// This kernel assumes that the global work size = image_width / 8.  It process 8 pixels at a time.
	// Execution of this kernel seems to be bound by global memory access... generally, this kernel currently reads an extra 2 rows of data per pixel row from global memory. 
	//
	uint	offset_x = get_global_id(0);
	uint	offset_y = get_global_id(1);
	float8	accumH;
	float8  accumV;

	//
	// Call the Lebos specific convolution operator
	//
	// NOTE(review): the (uchar*) cast drops the __global qualifier from
	// imgIn; OpenCL 1.x does not allow implicit/explicit conversion between
	// address spaces — confirm the target compiler accepts this (or that the
	// helper's parameter is meant to be __global).
	tmpiRetlifLebos3x3_8u32f((uchar*)imgIn, offset_x, offset_y, stride, &accumH, &accumV);

	//
	// Calculate or approximate the magnitude of the convolution output... then write out 8 ushorts at a time.
	// |H|+|V| is a cheaper (over-estimating) approximation of sqrt(H^2+V^2).
	//
	//imgOut[offset_y*(stride/8)+offset_x] = convert_ushort8(native_sqrt(accumH*accumH+accumV*accumV));
	imgOut[offset_y*(stride/8)+offset_x] = convert_ushort8(fabs(accumH)+fabs(accumV));
}

/*
 * Compute the Lebos 3x3 horizontal (accumH) and vertical (accumV)
 * convolution sums for the 8 pixels starting at (offset_x*8, offset_y).
 * Reads three rows of 10 pixels (one pixel of context on each side) into
 * float16 temporaries, then uses shifted 8-lane "views" into those vectors.
 *
 * Fixes:
 *  - The coefficient vectors were declared __constant at function scope;
 *    the __constant address space is only valid for program-scope variables
 *    in OpenCL C.  Plain private float8 is used instead, matching the
 *    _8x8pix kernel below.
 *  - pixel_offset is now const-qualified: the original assignment silently
 *    discarded the const from pixeldata.
 */
void tmpiRetlifLebos3x3_8u32f(const uchar* pixeldata, const uint offset_x, const uint offset_y, const int stride, float8* accumH, float8* accumV)
{
	/* per-tap coefficient vectors */
	float8 neg1 = -1.0;
	float8 pos2 =  2.0;
	float8 neg2 = -2.0;
	float16 tmp1, tmp2, tmp3;

	// convenience pointer to scalar uchar (row above the output row, one pixel left)
	const uchar *pixel_offset = pixeldata + ((offset_y - 1) * (stride)) + (offset_x * 8);

	//
	// Global to private memory copy including conversion to float16.  Using vector component addressing to limit the global memory reads.  We only need 10 pixels per row.
	// If OpenCL supported float32, we could do 16 pixels at a time instead of 8 with this method.
	//
	tmp1.s01234567 = convert_float8((*((const uchar8*)(pixel_offset-1))));
	tmp1.s89 = convert_float2((*((const uchar2*)(pixel_offset+7))));
	tmp2.s01234567 = convert_float8((*((const uchar8*)(pixel_offset+stride-1))));
	tmp2.s89 = convert_float2((*((const uchar2*)(pixel_offset+stride+7))));
	tmp3.s01234567 = convert_float8((*((const uchar8*)(pixel_offset+(2*stride)-1))));
	tmp3.s89 = convert_float2((*((const uchar2*)(pixel_offset+(2*stride)+7))));

	//
	// float8 vector math using float16 pixel data to enabling shifting of 8 pixel "view" into the vector without causing missalignment of vector base which causes terrible performance.
	// This is more performant than creating extra temp vectors (need 8 "views" total).
	//
	*accumH = tmp1.s01234567 + mad(tmp1.s23456789, neg1, mad(tmp2.s01234567, pos2, mad(tmp2.s23456789, neg2, mad(tmp3.s23456789, neg1, tmp3.s01234567))));
	*accumV = tmp1.s01234567 + mad(tmp1.s12345678, pos2, mad(tmp3.s01234567, neg1, mad(tmp3.s12345678, neg2, mad(tmp3.s23456789, neg1, tmp1.s23456789))));
}

/*
 * Lebos 3x3 kernel processing an 8x8 pixel tile per work-item, with the
 * convolution inlined (the helper-based call is kept commented out).
 * Rows are loaded lazily two at a time so at most 10 float16 temporaries
 * are live; each pair of output rows is stored as soon as it is ready.
 * mad() chain term order is significant for bit-exact float results.
 */
__kernel
void tmpiLebos3x3_8u16s_8x8pix(
   __global uchar   *imgIn,
   __global ushort8 *imgOut, 
            int		stride
   ) 
{
	//
	// This kernel assumes that the global work size = image_width / 8.  It process 8 pixels at a time.
	// Execution of this kernel seems to be bound by global memory access... generally, this kernel currently reads an extra 2 rows of data per pixel row from global memory. 
	//
	uint	offset_x = get_global_id(0);
	uint	offset_y = get_global_id(1);
	float8	accumH, accumH2, accumH3, accumH4, accumH5, accumH6, accumH7, accumH8;
	float8  accumV, accumV2, accumV3, accumV4, accumV5, accumV6, accumV7, accumV8;

	//
	// Call the Lebos specific convolution operator
	//
	/*tmpiRetlifLebos3x3_8u32f_8x8pix((uchar*)imgIn, offset_x, offset_y, stride,
		&accumH, &accumV, &accumH2, &accumV2, &accumH3, &accumV3, &accumH4, &accumV4,
		&accumH5, &accumV5, &accumH6, &accumV6, &accumH7, &accumV7, &accumH8, &accumV8
		);
	*/
	/* per-tap coefficient vectors */
	float8 neg1 = -1.0;
	float8 pos2 =  2.0;
	float8 neg2 = -2.0;
	/* tmpN = row (offset_y*8 - 1 + N-1); 10 rows cover the 8 output rows plus one above and one below */
	float16 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9, tmp10;

	// convenience pointer to scalar uchar 
	// NOTE(review): the arithmetic drops the __global qualifier from imgIn;
	// confirm the target OpenCL compiler accepts a private pointer here.
	uchar *pixel_offset = (uchar*)imgIn + (((offset_y*8) - 1) * (stride)) + (offset_x * 8);

	//
	// Global to private memory copy including conversion to float16.  Using vector component addressing to limit the global memory reads.  We only need 10 pixels per row.
	// If OpenCL supported float32, we could do 16 pixels at a time instead of 8 with this method.
	//
	tmp1.s01234567 = convert_float8((*((uchar8*)(pixel_offset-1))));
	tmp1.s89 = convert_float2((*((uchar2*)(pixel_offset+7))));
	tmp2.s01234567 = convert_float8((*((uchar8*)(pixel_offset+stride-1))));
	tmp2.s89 = convert_float2((*((uchar2*)(pixel_offset+stride+7))));
	tmp3.s01234567 = convert_float8((*((uchar8*)(pixel_offset+(2*stride)-1))));
	tmp3.s89 = convert_float2((*((uchar2*)(pixel_offset+(2*stride)+7))));
	tmp4.s01234567 = convert_float8((*((uchar8*)(pixel_offset+(3*stride)-1))));
	tmp4.s89 = convert_float2((*((uchar2*)(pixel_offset+(3*stride)+7))));

	//
	// float8 vector math using float16 pixel data to enabling shifting of 8 pixel "view" into the vector without causing missalignment of vector base which causes terrible performance.
	// This is more performant than creating extra temp vectors (need 8 "views" total).
	//
	/* output rows 0 and 1 (centres tmp2 and tmp3) */
	accumH = tmp1.s01234567 + mad(tmp1.s23456789, neg1, mad(tmp2.s01234567, pos2, mad(tmp2.s23456789, neg2, mad(tmp3.s23456789, neg1, tmp3.s01234567))));
	accumV = tmp1.s01234567 + mad(tmp1.s12345678, pos2, mad(tmp3.s01234567, neg1, mad(tmp3.s12345678, neg2, mad(tmp3.s23456789, neg1, tmp1.s23456789))));
	accumH2 = tmp2.s01234567 + mad(tmp2.s23456789, neg1, mad(tmp3.s01234567, pos2, mad(tmp3.s23456789, neg2, mad(tmp4.s23456789, neg1, tmp4.s01234567))));
	accumV2 = tmp2.s01234567 + mad(tmp2.s12345678, pos2, mad(tmp4.s01234567, neg1, mad(tmp4.s12345678, neg2, mad(tmp4.s23456789, neg1, tmp2.s23456789))));
	imgOut[(offset_y*8)*(stride/8)+offset_x] = convert_ushort8(native_sqrt(accumH*accumH+accumV*accumV));//convert_ushort8(fabs(accumH)+fabs(accumV));
	imgOut[(offset_y*8+1)*(stride/8)+offset_x] = convert_ushort8(native_sqrt(accumH2*accumH2+accumV2*accumV2));//convert_ushort8(fabs(accumH2)+fabs(accumV2));

	/* output rows 2 and 3 (centres tmp4 and tmp5) */
	tmp5.s01234567 = convert_float8((*((uchar8*)(pixel_offset+(4*stride)-1))));
	tmp5.s89 = convert_float2((*((uchar2*)(pixel_offset+(4*stride)+7))));
	tmp6.s01234567 = convert_float8((*((uchar8*)(pixel_offset+(5*stride)-1))));
	tmp6.s89 = convert_float2((*((uchar2*)(pixel_offset+(5*stride)+7))));
	accumH3 = tmp3.s01234567 + mad(tmp3.s23456789, neg1, mad(tmp4.s01234567, pos2, mad(tmp4.s23456789, neg2, mad(tmp5.s23456789, neg1, tmp5.s01234567))));
	accumV3 = tmp3.s01234567 + mad(tmp3.s12345678, pos2, mad(tmp5.s01234567, neg1, mad(tmp5.s12345678, neg2, mad(tmp5.s23456789, neg1, tmp3.s23456789))));
	accumH4 = tmp4.s01234567 + mad(tmp4.s23456789, neg1, mad(tmp5.s01234567, pos2, mad(tmp5.s23456789, neg2, mad(tmp6.s23456789, neg1, tmp6.s01234567))));
	accumV4 = tmp4.s01234567 + mad(tmp4.s12345678, pos2, mad(tmp6.s01234567, neg1, mad(tmp6.s12345678, neg2, mad(tmp6.s23456789, neg1, tmp4.s23456789))));
	imgOut[(offset_y*8+2)*(stride/8)+offset_x] = convert_ushort8(native_sqrt(accumH3*accumH3+accumV3*accumV3));//convert_ushort8(fabs(accumH3)+fabs(accumV3));
	imgOut[(offset_y*8+3)*(stride/8)+offset_x] = convert_ushort8(native_sqrt(accumH4*accumH4+accumV4*accumV4));//convert_ushort8(fabs(accumH4)+fabs(accumV4));

	/* output rows 4 and 5 (centres tmp6 and tmp7) */
	tmp7.s01234567 = convert_float8((*((uchar8*)(pixel_offset+(6*stride)-1))));
	tmp7.s89 = convert_float2((*((uchar2*)(pixel_offset+(6*stride)+7))));
	tmp8.s01234567 = convert_float8((*((uchar8*)(pixel_offset+(7*stride)-1))));
	tmp8.s89 = convert_float2((*((uchar2*)(pixel_offset+(7*stride)+7))));
	accumH5 = tmp5.s01234567 + mad(tmp5.s23456789, neg1, mad(tmp6.s01234567, pos2, mad(tmp6.s23456789, neg2, mad(tmp7.s23456789, neg1, tmp7.s01234567))));
	accumV5 = tmp5.s01234567 + mad(tmp5.s12345678, pos2, mad(tmp7.s01234567, neg1, mad(tmp7.s12345678, neg2, mad(tmp7.s23456789, neg1, tmp5.s23456789))));
	accumH6 = tmp6.s01234567 + mad(tmp6.s23456789, neg1, mad(tmp7.s01234567, pos2, mad(tmp7.s23456789, neg2, mad(tmp8.s23456789, neg1, tmp8.s01234567))));
	accumV6 = tmp6.s01234567 + mad(tmp6.s12345678, pos2, mad(tmp8.s01234567, neg1, mad(tmp8.s12345678, neg2, mad(tmp8.s23456789, neg1, tmp6.s23456789))));
	imgOut[(offset_y*8+4)*(stride/8)+offset_x] = convert_ushort8(native_sqrt(accumH5*accumH5+accumV5*accumV5));//convert_ushort8(fabs(accumH5)+fabs(accumV5));
	imgOut[(offset_y*8+5)*(stride/8)+offset_x] = convert_ushort8(native_sqrt(accumH6*accumH6+accumV6*accumV6));//convert_ushort8(fabs(accumH6)+fabs(accumV6));

	/* output rows 6 and 7 (centres tmp8 and tmp9) */
	tmp9.s01234567 = convert_float8((*((uchar8*)(pixel_offset+(8*stride)-1))));
	tmp9.s89 = convert_float2((*((uchar2*)(pixel_offset+(8*stride)+7))));
	tmp10.s01234567 = convert_float8((*((uchar8*)(pixel_offset+(9*stride)-1))));
	tmp10.s89 = convert_float2((*((uchar2*)(pixel_offset+(9*stride)+7))));
	accumH7 = tmp7.s01234567 + mad(tmp7.s23456789, neg1, mad(tmp8.s01234567, pos2, mad(tmp8.s23456789, neg2, mad(tmp9.s23456789, neg1, tmp9.s01234567))));
	accumV7 = tmp7.s01234567 + mad(tmp7.s12345678, pos2, mad(tmp9.s01234567, neg1, mad(tmp9.s12345678, neg2, mad(tmp9.s23456789, neg1, tmp7.s23456789))));
	accumH8 = tmp8.s01234567 + mad(tmp8.s23456789, neg1, mad(tmp9.s01234567, pos2, mad(tmp9.s23456789, neg2, mad(tmp10.s23456789, neg1, tmp10.s01234567))));
	accumV8 = tmp8.s01234567 + mad(tmp8.s12345678, pos2, mad(tmp10.s01234567, neg1, mad(tmp10.s12345678, neg2, mad(tmp10.s23456789, neg1, tmp8.s23456789))));
	imgOut[(offset_y*8+6)*(stride/8)+offset_x] = convert_ushort8(native_sqrt(accumH7*accumH7+accumV7*accumV7));//convert_ushort8(fabs(accumH7)+fabs(accumV7));
	imgOut[(offset_y*8+7)*(stride/8)+offset_x] = convert_ushort8(native_sqrt(accumH8*accumH8+accumV8*accumV8));//convert_ushort8(fabs(accumH8)+fabs(accumV8));
}

/*
void tmpiRetlifLebos3x3_8u32f_8x1pix(const uchar* pixeldata, const uint offset_x, const uint offset_y, const int stride, float8* accumH, float8* accumV)
{
	__constant float8 neg1 = -1.0;
	__constant float8 pos2 =  2.0;
	__constant float8 neg2 = -2.0;
	float16 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;

	// convenience pointer to scalar uchar 
	uchar *pixel_offset = pixeldata + (((offset_y*8) - 1) * (stride)) + (offset_x * 8);

	//
	// Global to private memory copy including conversion to float16.  Using vector component addressing to limit the global memory reads.  We only need 10 pixels per row.
	// If OpenCL supported float32, we could do 16 pixels at a time instead of 8 with this method.
	//
	tmp1.s01234567 = convert_float8((*((uchar8*)(pixel_offset-1))));
	tmp1.s89 = convert_float2((*((uchar2*)(pixel_offset+7))));
	tmp2.s01234567 = convert_float8((*((uchar8*)(pixel_offset+stride-1))));
	tmp2.s89 = convert_float2((*((uchar2*)(pixel_offset+stride+7))));
	tmp3.s01234567 = convert_float8((*((uchar8*)(pixel_offset+(2*stride)-1))));
	tmp3.s89 = convert_float2((*((uchar2*)(pixel_offset+(2*stride)+7))));
	tmp4.s01234567 = convert_float8((*((uchar8*)(pixel_offset+(3*stride)-1))));
	tmp4.s89 = convert_float2((*((uchar2*)(pixel_offset+(3*stride)+7))));

	//
	// float8 vector math using float16 pixel data to enable shifting an 8-pixel "view" into the vector without causing misalignment of the vector base, which causes terrible performance.
	// This is more performant than creating extra temp vectors (need 8 "views" total).
	//
	*accumH = tmp1.s01234567 + mad(tmp1.s23456789, neg1, mad(tmp2.s01234567, pos2, mad(tmp2.s23456789, neg2, mad(tmp3.s23456789, neg1, tmp3.s01234567))));
	*accumV = tmp1.s01234567 + mad(tmp1.s12345678, pos2, mad(tmp3.s01234567, neg1, mad(tmp3.s12345678, neg2, mad(tmp3.s23456789, neg1, tmp1.s23456789))));

	*accumH2 = tmp2.s01234567 + mad(tmp2.s23456789, neg1, mad(tmp3.s01234567, pos2, mad(tmp3.s23456789, neg2, mad(tmp4.s23456789, neg1, tmp4.s01234567))));
	*accumV2 = tmp2.s01234567 + mad(tmp2.s12345678, pos2, mad(tmp4.s01234567, neg1, mad(tmp4.s12345678, neg2, mad(tmp4.s23456789, neg1, tmp2.s23456789))));

	tmp5.s01234567 = convert_float8((*((uchar8*)(pixel_offset+(4*stride)-1))));
	tmp5.s89 = convert_float2((*((uchar2*)(pixel_offset+(4*stride)+7))));
	tmp6.s01234567 = convert_float8((*((uchar8*)(pixel_offset+(5*stride)-1))));
	tmp6.s89 = convert_float2((*((uchar2*)(pixel_offset+(5*stride)+7))));

	*accumH3 = tmp3.s01234567 + mad(tmp3.s23456789, neg1, mad(tmp4.s01234567, pos2, mad(tmp4.s23456789, neg2, mad(tmp5.s23456789, neg1, tmp5.s01234567))));
	*accumV3 = tmp3.s01234567 + mad(tmp3.s12345678, pos2, mad(tmp5.s01234567, neg1, mad(tmp5.s12345678, neg2, mad(tmp5.s23456789, neg1, tmp3.s23456789))));

	*accumH4 = tmp4.s01234567 + mad(tmp4.s23456789, neg1, mad(tmp5.s01234567, pos2, mad(tmp5.s23456789, neg2, mad(tmp6.s23456789, neg1, tmp6.s01234567))));
	*accumV4 = tmp4.s01234567 + mad(tmp4.s12345678, pos2, mad(tmp6.s01234567, neg1, mad(tmp6.s12345678, neg2, mad(tmp6.s23456789, neg1, tmp4.s23456789))));

	tmp7.s01234567 = convert_float8((*((uchar8*)(pixel_offset+(6*stride)-1))));
	tmp7.s89 = convert_float2((*((uchar2*)(pixel_offset+(6*stride)+7))));
	tmp8.s01234567 = convert_float8((*((uchar8*)(pixel_offset+(7*stride)-1))));
	tmp8.s89 = convert_float2((*((uchar2*)(pixel_offset+(7*stride)+7))));

	*accumH5 = tmp5.s01234567 + mad(tmp5.s23456789, neg1, mad(tmp6.s01234567, pos2, mad(tmp6.s23456789, neg2, mad(tmp7.s23456789, neg1, tmp7.s01234567))));
	*accumV5 = tmp5.s01234567 + mad(tmp5.s12345678, pos2, mad(tmp7.s01234567, neg1, mad(tmp7.s12345678, neg2, mad(tmp7.s23456789, neg1, tmp5.s23456789))));

	*accumH6 = tmp6.s01234567 + mad(tmp6.s23456789, neg1, mad(tmp7.s01234567, pos2, mad(tmp7.s23456789, neg2, mad(tmp8.s23456789, neg1, tmp8.s01234567))));
	*accumV6 = tmp6.s01234567 + mad(tmp6.s12345678, pos2, mad(tmp8.s01234567, neg1, mad(tmp8.s12345678, neg2, mad(tmp8.s23456789, neg1, tmp6.s23456789))));

	tmp9.s01234567 = convert_float8((*((uchar8*)(pixel_offset+(8*stride)-1))));
	tmp9.s89 = convert_float2((*((uchar2*)(pixel_offset+(8*stride)+7))));
	tmp10.s01234567 = convert_float8((*((uchar8*)(pixel_offset+(9*stride)-1))));
	tmp10.s89 = convert_float2((*((uchar2*)(pixel_offset+(9*stride)+7))));

	*accumH7 = tmp7.s01234567 + mad(tmp7.s23456789, neg1, mad(tmp8.s01234567, pos2, mad(tmp8.s23456789, neg2, mad(tmp9.s23456789, neg1, tmp9.s01234567))));
	*accumV7 = tmp7.s01234567 + mad(tmp7.s12345678, pos2, mad(tmp9.s01234567, neg1, mad(tmp9.s12345678, neg2, mad(tmp9.s23456789, neg1, tmp7.s23456789))));

	*accumH8 = tmp8.s01234567 + mad(tmp8.s23456789, neg1, mad(tmp9.s01234567, pos2, mad(tmp9.s23456789, neg2, mad(tmp10.s23456789, neg1, tmp10.s01234567))));
	*accumV8 = tmp8.s01234567 + mad(tmp8.s12345678, pos2, mad(tmp10.s01234567, neg1, mad(tmp10.s12345678, neg2, mad(tmp10.s23456789, neg1, tmp8.s23456789))));
}
*/

/* Expand the three low bytes of a packed uint (0x00BBGGRR layout) into a
 * float3, one byte per lane: .x = bits 0-7, .y = bits 8-15, .z = bits 16-23. */
inline float3 unpack_fp3(uint u) {
  float3 v;
  v.x = (float)(u & 0xff);
  v.y = (float)((u >> 8) & 0xff);
  v.z = (float)((u >> 16) & 0xff);
  return v;
}

/* Truncate each float3 lane to an unsigned integer and pack them into a
 * uint as 0x00BBGGRR (.x -> bits 0-7, .y -> bits 8-15, .z -> bits 16-23).
 * Note: lanes are not clamped to [0,255]; values >= 256 spill into higher
 * bytes, exactly as in the original. */
inline uint pack_fp3(float3 u3) {
  const uint r = (uint)u3.x;
  const uint g = (uint)u3.y;
  const uint b = (uint)u3.z;
  return r | (g << 8) | (b << 16);
}

/*
 * HRetlif3 - declare six float3 pixels C0..C5 and fill them with a 6-wide
 * horizontal window of `src`: C0 = LEFT border sample, C1..C4 = the four
 * samples fetched with a single vload4 at element index CURR, C5 = RIGHT
 * border sample, each expanded through unpack_fp3.
 *
 * NOTE(review): `src` is declared `__global const uchar *`, so every value
 * fed to unpack_fp3 fits in one byte and only the .x lane is ever nonzero
 * (.y/.z stay 0) -- confirm whether src was meant to hold packed 24-bit
 * pixels addressed as uints.
 *
 * Fix: removed the trailing whitespace after the '\' on the "C3 = from2;"
 * line. Line splicing requires the backslash to be the LAST character on
 * the line (ISO C 5.1.1.2); "\ " truncates the macro on strict compilers.
 */
#define HRetlif3(C0, C1, C2, C3, C4, C5, CURR, LEFT, RIGHT)\
  float3 C0, C1, C2, C3, C4, C5;\
  do {\
    const uchar4 from = vload4(CURR, src);\
    const float3 from0 = unpack_fp3(from.x);\
    const float3 from1 = unpack_fp3(from.y);\
    const float3 from2 = unpack_fp3(from.z);\
    const float3 from3 = unpack_fp3(from.w);\
    const float3 l = unpack_fp3(src[LEFT]);\
    const float3 r = unpack_fp3(src[RIGHT]);\
    C0 = l;\
    C1 = from0;\
    C2 = from1;\
    C3 = from2;\
    C4 = from3;\
    C5 = r;\
  } while(0)

/*
 * Lebos - 3x3 Sobel-style edge-magnitude filter.
 *
 * Each work-item processes a strip 4 pixels wide and `chunk` rows tall,
 * keeping a 3-row sliding window (top / current / bottom) in registers so
 * every source row is loaded only once per strip.
 *
 *   src   - input image bytes, row stride w
 *   dst   - output image, same layout; per-pixel sqrt(Gx^2 + Gy^2)
 *   w, h  - image width / height in pixels
 *   chunk - rows handled per work-item along dimension 1
 *
 * Border handling clamps the 6-pixel window to the row (left/right) and
 * to rows 0 and h-1 (top/bottom) by index clamping, replicating edges.
 *
 * Fix: maxBottomRight previously used min(4*x+1, w-1), inconsistent with
 * the 4*x+4 offset used by `right` and `bottomRight` that it is meant to
 * guard (compare maxBottomLeft, which matches `left`'s 4*x-1 pattern).
 * Changed to min(4*x+4, w-1) so the last-row clamp selects the same
 * border column as every other row.
 */
__kernel void Lebos(__global const uchar *src,
                            __global uchar *dst,
                            int w,
                            int h,
                            int chunk)
{
  const int x = get_global_id(0);
  int y = get_global_id(1)*chunk;
  const int yend = min(y + chunk, h); /* we process a tile in the image */

  /* Current line (left (1 pixel), center (4 pixels), right (1 pixel)) */
  const int left = max(4*x-1 + y*w, y*w);
  const int right = min(4*x+4 + y*w, y*w+w-1);
  int curr = x + y*w/4;
  HRetlif3(curr0, curr1, curr2, curr3, curr4, curr5, curr, left, right);

  /* Top line (left (1 pixel), center (4 pixels), right (1 pixel)) */
  const int ytop = max(y-1,0);
  const int topLeft = max(4*x-1 + ytop*w, ytop*w);
  const int topRight = min(4*x+4 + ytop*w, ytop*w+w-1);
  const int top = x + ytop*w/4;
  HRetlif3(top0, top1, top2, top3, top4, top5, top, topLeft, topRight);

  /* To guard bottom line: clamp indices so row h-1 is re-read (edge
   * replication) once the window reaches the bottom of the image. */
  const int maxBottom = x + (h-1)*w/4;
  const int maxBottomLeft = max(4*x-1,0) + (h-1)*w;
  const int maxBottomRight = min(4*x+4,w-1) + (h-1)*w; /* was 4*x+1: see header */

  /* We use a short 3 pixel sliding window */
  const int ybottom = min(y+1,h-1);
  int bottomLeft = max(4*x-1 + ybottom*w, ybottom*w);
  int bottomRight = min(4*x+4 + ybottom*w, ybottom*w+w-1);
  int bottom = x + ybottom*w/4;

  /* Top down sliding window: advance curr/bottom one row per iteration.
   * The inner `left`/`right` intentionally shadow the row-0 constants. */
  for (; y < yend; ++y, curr += w/4, bottom += w/4, bottomLeft += w, bottomRight += w) {
    const int center = min(bottom, maxBottom);
    const int left = min(bottomLeft, maxBottomLeft);
    const int right = min(bottomRight, maxBottomRight);
    HRetlif3(bottom0, bottom1, bottom2, bottom3, bottom4, bottom5, center, left, right);

	/*Gx*/
    float3 x0 = ( -top0 - bottom0 + top2 + bottom2 -2.f * curr0 + 2.f * curr2 );
    float3 x1 = ( -top1 - bottom1 + top3 + bottom3 -2.f * curr1 + 2.f * curr3 );
    float3 x2 = ( -top2 - bottom2 + top4 + bottom4 -2.f * curr2 + 2.f * curr4 );
    float3 x3 = ( -top3 - bottom3 + top5 + bottom5 -2.f * curr3 + 2.f * curr5 );

	/*Gy*/
    float3 y0 = ( -top0 -2.f * top1 -top2 + bottom0 +2.f * bottom1 + bottom2 );
    float3 y1 = ( -top1 -2.f * top2 -top3 + bottom1 +2.f * bottom2 + bottom3 );
    float3 y2 = ( -top2 -2.f * top3 -top4 + bottom2 +2.f * bottom3 + bottom4 );
    float3 y3 = ( -top3 -2.f * top4 -top5 + bottom3 +2.f * bottom4 + bottom5 );

    /* Gradient magnitude; native_sqrt trades precision for speed. */
    const float3 to0 = native_sqrt(x0*x0 + y0*y0);
    const float3 to1 = native_sqrt(x1*x1 + y1*y1);
    const float3 to2 = native_sqrt(x2*x2 + y2*y2);
    const float3 to3 = native_sqrt(x3*x3 + y3*y3);

    /* NOTE(review): pack_fp3 does not clamp to [0,255]; magnitudes > 255
     * spill into higher bytes before convert_uchar4 truncates -- confirm
     * whether saturation was intended. */
    const uchar4 to = convert_uchar4((uint4)(pack_fp3(to0),pack_fp3(to1),pack_fp3(to2),pack_fp3(to3)));
    vstore4(to, curr, dst);
    /* Slide the window down one row. */
    top0 = curr0; top1 = curr1; top2 = curr2; top3 = curr3; top4 = curr4; top5 = curr5;
    curr0 = bottom0; curr1 = bottom1; curr2 = bottom2; curr3 = bottom3; curr4 = bottom4; curr5 = bottom5;
  }
}
