Test NEON-optimized cv::threshold() on mobile device

I have been writing some optimizations for OpenCV’s threshold function, targeting ARM devices (mobile phones). It should work on both Android and iPhone.

However, I do not have a device to test it on, so I am looking for volunteers to give me a little help. If that motivates you more, I am planning to send it to OpenCV to be integrated into the main repository.

  • Multiline TextView in Android?
  • Up navigation broken on JellyBean?
  • Fatal signal 7 (SIGBUS) at 0x00000000 (code=2)
  • Android: HorizontalScrollView inside ScrollView
  • Download a file with Android, and showing the progress in a ProgressDialog
  • Cause of performTraversals recursion, long egl buffer swap times
  • I am interested in code correctness and, if it works as intended, in some statistics comparing original and optimized performance. Do not forget to test all scenarios.

    So, here is the code. To run it, paste it into opencv/modules/imgproc/src/thresh.cpp, at line 228 (as of 2.4.2) – just below the SSE block – and recompile OpenCV.

    Also, add this line at the top of the file

    #include <arm_neon.h>
    

    Main code body:

    #define CV_USE_NEON 1
    #if CV_USE_NEON
        // NEON fast path for 8-bit thresholding.
        //
        // Each row is processed 32 pixels at a time (two 128-bit registers),
        // then 8 pixels at a time (one 64-bit register). j_scalar is set to
        // the row width rounded down to a multiple of 8, i.e. the number of
        // pixels per row handled here.
        // NOTE(review): the remaining (width % 8) pixels are presumably
        // handled by scalar code in the enclosing function starting at
        // j_scalar -- confirm against the surrounding source.
        //
        // The runtime capability check is disabled for testing; the branch is
        // taken unconditionally.
        //if( checkHardwareSupport(CV_CPU_ARM_NEON) )
        if( true )
        {
            // Threshold and max value broadcast to all 16 lanes.
            const uint8x16_t thresh_u = vdupq_n_u8(thresh);
            const uint8x16_t maxval_ = vdupq_n_u8(maxval);

            // 8-lane versions for the tail loops. These must be extracted with
            // the unsigned intrinsic (vget_low_u8): the vectors are uint8x16_t
            // and feed unsigned compare/min/and operations.
            const uint8x8_t thresh_lo = vget_low_u8(thresh_u);
            const uint8x8_t maxval_lo = vget_low_u8(maxval_);

            j_scalar = roi.width & -8;

            for( i = 0; i < roi.height; i++ )
            {
                const uchar* src = (const uchar*)(_src.data + _src.step*i);
                uchar* dst = (uchar*)(_dst.data + _dst.step*i);

                switch( type )
                {
                case THRESH_BINARY:
                    // dst = src > thresh ? maxval : 0
                    for( j = 0; j <= roi.width - 32; j += 32 )
                    {
                        uint8x16_t v0 = vld1q_u8( src + j );
                        uint8x16_t v1 = vld1q_u8( src + j + 16 );
                        // The compare yields 0xFF / 0x00 per lane; AND selects maxval.
                        v0 = vandq_u8( vcgtq_u8( v0, thresh_u ), maxval_ );
                        v1 = vandq_u8( vcgtq_u8( v1, thresh_u ), maxval_ );
                        vst1q_u8( dst + j, v0 );
                        vst1q_u8( dst + j + 16, v1 );
                    }

                    for( ; j <= roi.width - 8; j += 8 )
                    {
                        uint8x8_t v2 = vld1_u8( src + j );
                        v2 = vand_u8( vcgt_u8( v2, thresh_lo ), maxval_lo );
                        vst1_u8( dst + j, v2 );
                    }
                    break;

                case THRESH_BINARY_INV:
                    // dst = src <= thresh ? maxval : 0
                    for( j = 0; j <= roi.width - 32; j += 32 )
                    {
                        uint8x16_t v0 = vld1q_u8( src + j );
                        uint8x16_t v1 = vld1q_u8( src + j + 16 );
                        v0 = vandq_u8( vcleq_u8( v0, thresh_u ), maxval_ );
                        v1 = vandq_u8( vcleq_u8( v1, thresh_u ), maxval_ );
                        vst1q_u8( dst + j, v0 );
                        vst1q_u8( dst + j + 16, v1 );
                    }

                    for( ; j <= roi.width - 8; j += 8 )
                    {
                        uint8x8_t v2 = vld1_u8( src + j );
                        v2 = vand_u8( vcle_u8( v2, thresh_lo ), maxval_lo );
                        vst1_u8( dst + j, v2 );
                    }
                    break;

                case THRESH_TRUNC:
                    // dst = min(src, thresh)
                    for( j = 0; j <= roi.width - 32; j += 32 )
                    {
                        uint8x16_t v0 = vld1q_u8( src + j );
                        uint8x16_t v1 = vld1q_u8( src + j + 16 );
                        v0 = vminq_u8( v0, thresh_u );
                        v1 = vminq_u8( v1, thresh_u );
                        vst1q_u8( dst + j, v0 );
                        vst1q_u8( dst + j + 16, v1 );
                    }

                    for( ; j <= roi.width - 8; j += 8 )
                    {
                        uint8x8_t v2 = vld1_u8( src + j );
                        v2 = vmin_u8( v2, thresh_lo );
                        vst1_u8( dst + j, v2 );
                    }
                    break;

                case THRESH_TOZERO:
                    // dst = src > thresh ? src : 0
                    // Masking src with the compare result is sufficient: where
                    // the mask is set, max(src, thresh) == src anyway.
                    for( j = 0; j <= roi.width - 32; j += 32 )
                    {
                        uint8x16_t v0 = vld1q_u8( src + j );
                        uint8x16_t v1 = vld1q_u8( src + j + 16 );
                        v0 = vandq_u8( vcgtq_u8( v0, thresh_u ), v0 );
                        v1 = vandq_u8( vcgtq_u8( v1, thresh_u ), v1 );
                        vst1q_u8( dst + j, v0 );
                        vst1q_u8( dst + j + 16, v1 );
                    }

                    for( ; j <= roi.width - 8; j += 8 )
                    {
                        uint8x8_t v2 = vld1_u8( src + j );
                        v2 = vand_u8( vcgt_u8( v2, thresh_lo ), v2 );
                        vst1_u8( dst + j, v2 );
                    }
                    break;

                case THRESH_TOZERO_INV:
                    // dst = src <= thresh ? src : 0
                    // Where the mask is set, min(src, thresh) == src, so the
                    // source can be masked directly.
                    for( j = 0; j <= roi.width - 32; j += 32 )
                    {
                        uint8x16_t v0 = vld1q_u8( src + j );
                        uint8x16_t v1 = vld1q_u8( src + j + 16 );
                        v0 = vandq_u8( vcleq_u8( v0, thresh_u ), v0 );
                        v1 = vandq_u8( vcleq_u8( v1, thresh_u ), v1 );
                        vst1q_u8( dst + j, v0 );
                        vst1q_u8( dst + j + 16, v1 );
                    }

                    for( ; j <= roi.width - 8; j += 8 )
                    {
                        uint8x8_t v2 = vld1_u8( src + j );
                        v2 = vand_u8( vcle_u8( v2, thresh_lo ), v2 );
                        vst1_u8( dst + j, v2 );
                    }
                    break;

                default:
                    // No vector path for other types; nothing is written here.
                    // NOTE(review): assumes `type` was validated earlier in the
                    // enclosing function -- confirm.
                    break;
                }
            }
        }
    #endif
    

    Android Babe is a Google Android Fan, All about Android Phones, Android Wear, Android Dev and Android Games Apps and so on.