Cvim half precision floating point

Download Cvim half precision floating point

Post on 08-Jan-2017

224 views

Category:

Engineering

11 download

TRANSCRIPT

OpenCV float

Half Precision Floating Point Number-half-@tomoaki_teshima

How big is the image ?Multiplying two images (floating point operation)

Size! Size!! Size!!! RGB: 3 bytes / pixel. float: 4 bytes / pixel. Any more space to reduce?

Summary: Explanation of half; Example on ARM; Example on ARM w/ SIMD instruction; Example on Intel, AMD (x86); Example on CUDA

Format of Floating pointsIEEE754

64bit = double, double precision; 32bit = float, single precision; 16bit = half, half precision

Sign bit / Exponent / Significand — double: 1 bit / 11 bit / 52 bit; float: 1 bit / 8 bit / 23 bit; half: 1 bit / 5 bit / 10 bit

ARM has fp16

https://ja.wikipedia.org/wiki/

What to prepare: An ARM machine which runs Linux — Raspberry Pi zero/1/2/3, ODROID XU4/C2, Jetson TK1/TX1, PINE64. Red ones are 64bit architecture. Buy one for better understanding.

Example on ARMint main(int argc, char**argv){ printf("Hello World !!\n"); __fp16 halfPrecision = 1.5f; printf("half precision:%f\n, halfPrecision); printf("half precision:sizeof %d\n, sizeof(halfPrecision)); printf("half precision:0x%04x\n", *(short*)(void*)&halfPrecision);

float original[] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f,10.0f,11.0f,12.0f,13.0f,14.0f,15.0f,16.0f,}; for (unsigned int i = 0;i < 16;i++) { __fp16 stub = original[i]; printf(%2d 0x%04x\n", (int)original[i], *(short*)&stub); } return 0;}https://github.com/tomoaki0705/sampleFp16

Build it

Required to pass the option -mfp16-format. Try it on ARM gcc; otherwise you get an "unknown option" error. $ gcc -std=c99 -mfp16-format=ieee main.c

Result: 0 | 1 0 0 0 1 | 1 1 0 0 0 0 0 0 0 0 (sign | exponent | significand). Significand bit weights: 1/2, 1/4, 1/8, 1/16, 1/32, 1/64, 1/128, 1/256, 1/512, 1/1024

Signed bit(+)Exponent(17)Significand

When exponent is all 0, the number is subnormal.When exponent is all 1, the number is Inf or NaN.

Summary: The floating point format is more complicated than the integer format. Half can express floating point numbers in 2 bytes.

Check in AssemblySoft implemented conversion

Whats the point doing it on SW side ?$ gcc S -std=c99 -mfp16-format=ieee O main.c.s main.cmovw r3, #15872 uint uint32x4_t srcIntegerHigh = vmovl_u16(vget_high_s16(srcIntegerShort)); // ushort -> uint float32x4_t srcFloatLow = vcvtq_f32_u32(srcIntegerLow ); // uint -> float float32x4_t srcFloatHigh = vcvtq_f32_u32(srcIntegerHigh); // uint -> float float32x4_t gainFloatLow = vcvt_f32_f16(gainHalfLow ); // half -> float float32x4_t gainFloatHigh = vcvt_f32_f16(gainHalfHigh); // half -> float float32x4_t dstFloatLow = vmulq_f32(srcFloatLow, gainFloatLow ); // float * float float32x4_t dstFloatHigh = vmulq_f32(srcFloatHigh, gainFloatHigh); // float * float uint32x4_t dstIntegerLow = vcvtq_u32_f32(dstFloatLow ); // float -> uint uint32x4_t dstIntegerHigh = vcvtq_u32_f32(dstFloatHigh); // float -> uint uint16x8_t dstIntegerShort = vcombine_u16(vmovn_u16(dstIntegerLow), vmovn_u16(dstIntegerHigh)); // uint -> ushort uint8x8_t dstInteger = vmovn_u16(dstIntegerShort); // ushort -> uchar vst1_u8(dst+x, dstInteger); // store}

https://github.com/tomoaki0705/sampleFp16Vector

Little bit of improvementsconst unsigned int cParallel = 8;for (unsigned int x = 0;x ushort uint4 srcIntegerLow = convert_ushort8_lo_uint4(srcIntegerShort); // ushort -> uint uint4 srcIntegerHigh = convert_ushort8_hi_uint4(srcIntegerShort); // ushort -> uint float4 srcFloatLow = convert_uint4_float4(srcIntegerLow ); // uint -> float float4 srcFloatHigh = convert_uint4_float4(srcIntegerHigh); // uint -> float float4 gainFloatLow = convert_half4_float4(gainHalfLow ); // half -> float float4 gainFloatHigh = convert_half4_float4(gainHalfHigh); // half -> float float4 dstFloatLow = multiply_float4(srcFloatLow , gainFloatLow ); // float * float float4 dstFloatHigh = multiply_float4(srcFloatHigh, gainFloatHigh); // float * float uint4 dstIntegerLow = convert_float4_uint4(dstFloatLow ); // float -> uint uint4 dstIntegerHigh = convert_float4_uint4(dstFloatHigh); // float -> uint ushort8 dstIntegerShort = convert_uint4_ushort8(dstIntegerLow, dstIntegerHigh); // uint -> ushort uchar8 dstInteger = convert_ushort8_uchar8(dstIntegerShort); // ushort -> uchar store_uchar8(dst + x, dstInteger); // store}

Let's build: Specify one of the red FPU options. The FPU has to support both SIMD and half.

vfp, vfpv3, vfpv3-fp16, vfpv3-d16, vfpv3-d16-fp16, vfpv3xd, vfpv3xd-fp16, neon, neon-fp16, vfpv4, vfpv4-d16, fpv4-sp-d16, neon-vfpv4, fp-armv8, neon-fp-armv8, crypto-neon-fp-armv8. List of FPU options: http://dench.flatlib.jp/opengl/fpu_vfp http://tessy.org/wiki/index.php?ARM%A4%CEFPU

Check in Assembly

VCVT instruction

Summary: ARM — Done. ARM (SIMD) — Specify an FPU which is capable of both SIMD and half. Intel, AMD (x86). CUDA.

half instructions on x86F16C instruction set

https://en.wikipedia.org/wiki/F16C

Try the operation in vectorconst unsigned int cParallel = 8;for (unsigned int x = 0;x ushort __m128i srcIntegerLow = _mm_unpacklo_epi16(srcIntegerShort, v_zero); // ushort -> uint __m128i srcIntegerHigh = _mm_unpackhi_epi16(srcIntegerShort, v_zero); // ushort -> uint __m128i srcFloatLow = _mm_cvtepi32_ps(srcIntegerLow ); // uint -> float __m128i srcFloatHigh = _mm_cvtepi32_ps(srcIntegerHigh); // uint -> float __m128 gainFloatLow = _mm_cvtph_ps(gainHalfLow ); // half -> float __m128 gainFloatHigh = _mm_cvtph_ps(gainHalfHigh); // half -> float __m128 dstFloatLow = _mm_mul_ps(srcFloatLow , gainFloatLow ); // float * float __m128 dstFloatHigh = _mm_mul_ps(srcFloatHigh, gainFloatHigh); // float * float __m128i dstIntegerLow = _mm_cvtps_epi32(dstFloatLow ); // float -> uint __m128i dstIntegerHigh = _mm_cvtps_epi32(dstFloatHigh); // float -> uint __m128i dstIntegerShort = _mm_packs_epi32(dstIntegerLow, dstIntegerHigh); // uint -> ushort __m128i dstInteger = _mm_packus_epi16(dstIntegerShort, v_zero); // ushort -> uchar _mm_storel_epi64((__m128i *)(dst + x), dstInteger); // store}https://github.com/tomoaki0705/sampleFp16Vector

Little bit of improvementsconst unsigned int cParallel = 8;for (unsigned int x = 0;x ushort uint4 srcIntegerLow = convert_ushort8_lo_uint4(srcIntegerShort); // ushort -> uint uint4 srcIntegerHigh = convert_ushort8_hi_uint4(srcIntegerShort); // ushort -> uint float4 srcFloatLow = convert_uint4_float4(srcIntegerLow ); // uint -> float float4 srcFloatHigh = convert_uint4_float4(srcIntegerHigh); // uint -> float float4 gainFloatLow = convert_half4_float4(gainHalfLow ); // half -> float float4 gainFloatHigh = convert_half4_float4(gainHalfHigh); // half -> float float4 dstFloatLow = multiply_float4(srcFloatLow , gainFloatLow ); // float * float float4 dstFloatHigh = multiply_float4(srcFloatHigh, gainFloatHigh); // float * float uint4 dstIntegerLow = convert_float4_uint4(dstFloatLow ); // float -> uint uint4 dstIntegerHigh = convert_float4_uint4(dstFloatHigh); // float -> uint ushort8 dstIntegerShort = convert_uint4_ushort8(dstIntegerLow, dstIntegerHigh);// uint -> ushort uchar8 dstInteger = convert_ushort8_uchar8(dstIntegerShort); // ushort -> uchar store_uchar8(dst + x, dstInteger); // store}

$ gcc -mf16c main.cpp

Check in Assembly

Note that inline functions are not expanded inline when built in Debug mode

Check in Assembly: Build with RelWithDebInfo mode. Instructions are more packed. Conversion instruction (vcvtph2ps).

Check in Assembly(gcc)

Same behavior as Visual Studio, inline functions are kept as function calls

Check in Assembly(gcc)

Assembly of Release modeMuch more packed instructions

Conversion instruction(vcvtph2ps)

ARM — Done. ARM (SIMD) — Done. Intel, AMD (x86) — x86 has half conversion as one of the SIMD instructions; implemented on Ivy Bridge and later CPUs (Intel); implemented on Piledriver and later CPUs (AMD) — Done. CUDA.

CUDAunsigned short a = g_indata[y*imgw+x];float gain;gain = __half2float(a);

float b = imageData[(y*imgw+x)*3 ];float g = imageData[(y*imgw+x)*3+1];float r = imageData[(y*imgw+x)*3+2];

g_odata[(y*imgw+x)*3 ] = clamp(b * gain, 0.0f, 255.0f);g_odata[(y*imgw+x)*3+1] = clamp(g * gain, 0.0f, 255.0f);g_odata[(y*imgw+x)*3+2] = clamp(r * gain, 0.0f, 255.0f);

The best point of using half: the size of the data transferred to the GPU is reduced

GPU memory

Summary: ARM — Done. ARM (SIMD) — Done. Intel, AMD (x86) — Done. CUDA — CUDA 7.5 and later will support half natively. Pascal has been announced to have direct operations on half.

Recommended

View more >