/* ----------------------------------------------------------------------------- * * Copyright (c) 2010-2022 Alexis Naveros. * Portions developed under contract to the SURVICE Engineering Company. * * This software is provided 'as-is', without any express or implied * warranty. In no event will the authors be held liable for any damages * arising from the use of this software. * * Permission is granted to anyone to use this software for any purpose, * including commercial applications, and to alter it and redistribute it * freely, subject to the following restrictions: * * 1. The origin of this software must not be misrepresented; you must not * claim that you wrote the original software. If you use this software * in a product, an acknowledgment in the product documentation would be * appreciated but is not required. * 2. Altered source versions must be plainly marked as such, and must not be * misrepresented as being the original software. * 3. This notice may not be removed or altered from any source distribution. * * ----------------------------------------------------------------------------- */ #include #include #include #include #include #include #include #include "cpusimd.h" #include "mmheap.h" #define MM_HEAP_DEBUG (0) #if defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__clang__) #define CC_UNLIKELY(x) __builtin_expect(!!(x), 0) #else #define CC_UNLIKELY(x) (x) #endif //// void mmHeap32Init( mmHeap32 *heap, size_t sizehint ) { mmHeap32Unit *unit; heap->size = MM_HEAP_MASK; heap->alloc = ( sizehint + MM_HEAP_MASK ) & ~MM_HEAP_MASK; if( heap->alloc < 1024 ) heap->alloc = 1024; heap->array = malloc( ( heap->alloc >> MM_HEAP_SHIFT ) * sizeof(mmHeap32Unit) ); unit = &heap->array[0]; unit->p[MM_HEAP_MASK] = 0; unit->value[MM_HEAP_MASK] = UINT32_MAX; return; } void mmHeap32Free( mmHeap32 *heap ) { free( heap->array ); memset( heap, 0, sizeof(mmHeap32) ); return; } void *mmHeap32GetFirst( mmHeap32 *heap, uint32_t *retvalue ) { mmHeap32Unit *unit; unit = &heap->array[0]; #if MM_HEAP_DEBUG printf( " ## GetFirst %p %d\n", unit->p[MM_HEAP_MASK], unit->value[MM_HEAP_MASK] ); fflush( stdout ); #endif *retvalue = unit->value[MM_HEAP_MASK]; return unit->p[MM_HEAP_MASK]; } void mmHeap32Insert( mmHeap32 *heap, void *p, uint32_t value ) { int i, itembranch, parentbranch; size_t itemorder, parentorder; mmHeap32Unit *unit, *parent; #if MM_HEAP_DEBUG printf( " ## Insert %p %d\n", p, value ); fflush( stdout ); #endif if( CC_UNLIKELY( heap->size >= heap->alloc ) ) { heap->alloc <<= 1; heap->array = realloc( heap->array, ( heap->alloc >> MM_HEAP_SHIFT ) * sizeof(mmHeap32Unit) ); } itemorder = heap->size; heap->size++; unit = &heap->array[ itemorder >> MM_HEAP_SHIFT ]; itembranch = itemorder & MM_HEAP_MASK; if( itembranch == 0 ) { for( i = 1 ; i < MM_HEAP_WIDTH ; i++ ) unit->value[i] = UINT32_MAX; } for( ; itemorder >= MM_HEAP_WIDTH ; ) { parentorder = ( itemorder >> MM_HEAP_SHIFT ) + MM_HEAP_OFFSET; parent = &heap->array[ parentorder >> MM_HEAP_SHIFT ]; parentbranch = parentorder & MM_HEAP_MASK; if( parent->value[ parentbranch ] < value ) break; unit->value[ itembranch ] = parent->value[ parentbranch ]; unit->p[ itembranch ] = parent->p[ parentbranch ]; unit = parent; itemorder = parentorder; itembranch = parentbranch; } unit->value[ itembranch ] = value; unit->p[ itembranch ] = p; return; } void mmHeap32DeleteFirst( mmHeap32 *heap ) { int i, itembranch, parentbranch; uint32_t insvalue, minval; size_t itemorder, parentorder; void *insp; mmHeap32Unit *unit, *parent; #if CPU_SSE4_1_SUPPORT && ( MM_HEAP_WIDTH == 16 ) uint32_t eqmask0, eqmask1, eqmask2, eqmask3, brmask0, brmask1, brmask2; uint32_t itembranch0, itembranch1, itembranch2, itembranch3; __m128i vunitvalue0, vunitvalue1, vunitvalue2, vunitvalue3, vmin; #endif #if MM_HEAP_DEBUG unit = &heap->array[0]; printf( " ## Delete %p %d\n", unit->p[MM_HEAP_MASK], unit->value[MM_HEAP_MASK] ); fflush( stdout ); #endif heap->size--; unit = &heap->array[ heap->size >> MM_HEAP_SHIFT ]; if( CC_UNLIKELY( heap->size == MM_HEAP_MASK ) ) { unit->p[MM_HEAP_MASK] = 0; unit->value[MM_HEAP_MASK] = UINT32_MAX; return; } itembranch = heap->size & MM_HEAP_MASK; insp = unit->p[ itembranch ]; insvalue = unit->value[ itembranch ]; unit->value[ itembranch ] = UINT32_MAX; parent = &heap->array[0]; parentbranch = MM_HEAP_MASK; parentorder = MM_HEAP_MASK; for( ; ; ) { itemorder = ( parentorder - MM_HEAP_OFFSET ) << MM_HEAP_SHIFT; if( itemorder >= heap->size ) break; unit = &heap->array[ itemorder >> MM_HEAP_SHIFT ]; #if CPU_SSE4_1_SUPPORT && ( MM_HEAP_WIDTH == 16 ) vunitvalue0 = _mm_load_si128( (void *)&unit->value[0] ); vunitvalue1 = _mm_load_si128( (void *)&unit->value[4] ); vunitvalue2 = _mm_load_si128( (void *)&unit->value[8] ); vunitvalue3 = _mm_load_si128( (void *)&unit->value[12] ); vmin = _mm_min_epu32( _mm_min_epu32( vunitvalue0, vunitvalue1 ), _mm_min_epu32( vunitvalue2, vunitvalue3 ) ); vmin = _mm_min_epu32( vmin, _mm_shuffle_epi32( vmin, 0x4e ) ); vmin = _mm_min_epu32( vmin, _mm_shuffle_epi32( vmin, 0x11 ) ); minval = _mm_cvtsi128_si32( vmin ); if( insvalue <= minval ) break; eqmask0 = _mm_movemask_ps( _mm_castsi128_ps( _mm_cmpeq_epi32( vmin, vunitvalue0 ) ) ); eqmask1 = _mm_movemask_ps( _mm_castsi128_ps( _mm_cmpeq_epi32( vmin, vunitvalue1 ) ) ); eqmask2 = _mm_movemask_ps( _mm_castsi128_ps( _mm_cmpeq_epi32( vmin, vunitvalue2 ) ) ); eqmask3 = _mm_movemask_ps( _mm_castsi128_ps( _mm_cmpeq_epi32( vmin, vunitvalue3 ) ) ); itembranch0 = 0x00 + eqmask0; itembranch1 = 0x40 + eqmask1; itembranch2 = 0x80 + eqmask2; itembranch3 = 0xc0 + eqmask3; brmask0 = (((int32_t)eqmask0)-1)>>31; brmask1 = (((int32_t)eqmask1)-1)>>31; brmask2 = (((int32_t)eqmask2)-1)>>31; itembranch = ( itembranch0 & ~brmask0 ) | ( itembranch1 & brmask0 & ~brmask1 ); itembranch |= ( itembranch2 & brmask0 & brmask1 & ~brmask2 ) | ( itembranch3 & brmask0 & brmask1 & brmask2 ); itembranch = ( itembranch >> 4 ) | ( ( 0x12131210 >> ( ( itembranch & 0xf ) << 1 ) ) & 3 ); #else minval = insvalue; itembranch = -1; for( i = 0 ; i < MM_HEAP_WIDTH ; i++ ) { if( unit->value[i] < minval ) { minval = unit->value[i]; itembranch = i; } } if( itembranch < 0 ) break; #endif parent->value[ parentbranch ] = unit->value[ itembranch ]; parent->p[ parentbranch ] = unit->p[ itembranch ]; parent = unit; parentbranch = itembranch; parentorder = itemorder + itembranch; } parent->value[ parentbranch ] = insvalue; parent->p[ parentbranch ] = insp; return; } size_t mmHeap32GetCount( mmHeap32 *heap ) { return heap->size - MM_HEAP_MASK; } void mmHeap32Merge( mmHeap32 *dst, mmHeap32 *src ) { int dstindex, srcindex, dstsub, srcsub; uint32_t dstvalue, srcvalue; mmHeap32 heap; mmHeap32Unit *dstunit, *srcunit; mmHeap32Init( &heap, dst->size + src->size ); dstindex = MM_HEAP_MASK; srcindex = MM_HEAP_MASK; if( dstindex == dst->size ) goto trailsrc; else if( srcindex == src->size ) goto traildst; for( ; ; ) { dstunit = &dst->array[ dstindex >> MM_HEAP_SHIFT ]; srcunit = &src->array[ srcindex >> MM_HEAP_SHIFT ]; dstsub = dstindex & MM_HEAP_MASK; srcsub = srcindex & MM_HEAP_MASK; dstvalue = dstunit->value[dstsub]; srcvalue = srcunit->value[srcsub]; if( dstvalue < srcvalue ) { mmHeap32Insert( &heap, dstunit->p[dstsub], dstvalue ); if( ++dstindex == dst->size ) { trailsrc: for( ; srcindex < src->size ; srcindex++ ) { srcunit = &src->array[ srcindex >> MM_HEAP_SHIFT ]; srcsub = srcindex & MM_HEAP_MASK; mmHeap32Insert( &heap, srcunit->p[srcsub], srcunit->value[srcsub] ); } break; } } else { mmHeap32Insert( &heap, srcunit->p[srcsub], srcvalue ); if( ++srcindex == src->size ) { traildst: for( ; dstindex < dst->size ; dstindex++ ) { dstunit = &dst->array[ dstindex >> MM_HEAP_SHIFT ]; dstsub = dstindex & MM_HEAP_MASK; mmHeap32Insert( &heap, dstunit->p[dstsub], dstunit->value[dstsub] ); } break; } } } free( dst->array ); free( src->array ); memcpy( dst, &heap, sizeof(mmHeap32) ); memset( src, 0, sizeof(mmHeap32) ); return; } void mmHeap32Debug( mmHeap32 *heap, int verbose ) { int i, itembranch, parentorder; uint32_t refvalue; mmHeap32Unit *unit, *parent; startover: if( verbose ) printf( "\n== Heap debug (size %d) ==\n", (int)heap->size ); if( verbose ) { void *p; refvalue = 0.0f; p = mmHeap32GetFirst( heap, &refvalue ); printf( "First: %p %d\n", p, refvalue ); } itembranch = 0; for( i = MM_HEAP_MASK ; i < heap->size ; i++ ) { unit = &heap->array[ i >> MM_HEAP_SHIFT ]; itembranch = i & MM_HEAP_MASK; if( verbose ) { printf( " %u", unit->value[ itembranch ] ); if( verbose >= 2 ) printf( "(%p)", unit->p[ itembranch ] ); } refvalue = unit->value[ itembranch ]; for( parentorder = i ; ; ) { parentorder = ( parentorder >> MM_HEAP_SHIFT ) + MM_HEAP_OFFSET; if( parentorder < MM_HEAP_MASK ) break; parent = &heap->array[ parentorder >> MM_HEAP_SHIFT ]; if( parent->value[ parentorder & MM_HEAP_MASK ] > refvalue ) { if( verbose ) printf( "***" ); else { printf( "Bad heap layout %d > %d", parent->value[ parentorder & MM_HEAP_MASK ], refvalue ); if( !verbose ) { verbose = 1; goto startover; } } } else if( verbose ) printf( "!" ); refvalue = parent->value[ parentorder & MM_HEAP_MASK ]; } if( verbose ) { if( itembranch == MM_HEAP_MASK ) printf( "\n" ); } } if( verbose ) { if( itembranch != MM_HEAP_MASK ) printf( "\n" ); printf( "== Heap debug ==\n\n" ); } fflush( stdout ); return; } //// void mmHeap64Init( mmHeap64 *heap, size_t sizehint ) { mmHeap64Unit *unit; heap->size = MM_HEAP_MASK; heap->alloc = ( sizehint + MM_HEAP_MASK ) & ~MM_HEAP_MASK; if( heap->alloc < 1024 ) heap->alloc = 1024; heap->array = malloc( ( heap->alloc >> MM_HEAP_SHIFT ) * sizeof(mmHeap64Unit) ); unit = &heap->array[0]; unit->p[MM_HEAP_MASK] = 0; unit->value[MM_HEAP_MASK] = UINT64_MAX; return; } void mmHeap64Free( mmHeap64 *heap ) { free( heap->array ); memset( heap, 0, sizeof(mmHeap64) ); return; } void *mmHeap64GetFirst( mmHeap64 *heap, uint64_t *retvalue ) { mmHeap64Unit *unit; unit = &heap->array[0]; #if MM_HEAP_DEBUG printf( " ## GetFirst %p %d\n", unit->p[MM_HEAP_MASK], unit->value[MM_HEAP_MASK] ); fflush( stdout ); #endif *retvalue = unit->value[MM_HEAP_MASK]; return unit->p[MM_HEAP_MASK]; } void mmHeap64Insert( mmHeap64 *heap, void *p, uint64_t value ) { int i, itembranch, parentbranch; size_t itemorder, parentorder; mmHeap64Unit *unit, *parent; if( CC_UNLIKELY( heap->size >= heap->alloc ) ) { heap->alloc <<= 1; heap->array = realloc( heap->array, ( heap->alloc >> MM_HEAP_SHIFT ) * sizeof(mmHeap64Unit) ); } itemorder = heap->size; heap->size++; unit = &heap->array[ itemorder >> MM_HEAP_SHIFT ]; itembranch = itemorder & MM_HEAP_MASK; if( itembranch == 0 ) { for( i = 1 ; i < MM_HEAP_WIDTH ; i++ ) unit->value[i] = UINT64_MAX; } for( ; itemorder >= MM_HEAP_WIDTH ; ) { parentorder = ( itemorder >> MM_HEAP_SHIFT ) + MM_HEAP_OFFSET; parent = &heap->array[ parentorder >> MM_HEAP_SHIFT ]; parentbranch = parentorder & MM_HEAP_MASK; if( parent->value[ parentbranch ] < value ) break; unit->value[ itembranch ] = parent->value[ parentbranch ]; unit->p[ itembranch ] = parent->p[ parentbranch ]; unit = parent; itemorder = parentorder; itembranch = parentbranch; } unit->value[ itembranch ] = value; unit->p[ itembranch ] = p; return; } void mmHeap64DeleteFirst( mmHeap64 *heap ) { int i, itembranch, parentbranch; uint64_t insvalue, minval; size_t itemorder, parentorder; void *insp; mmHeap64Unit *unit, *parent; heap->size--; unit = &heap->array[ heap->size >> MM_HEAP_SHIFT ]; if( CC_UNLIKELY( heap->size == MM_HEAP_MASK ) ) { unit->p[MM_HEAP_MASK] = 0; unit->value[MM_HEAP_MASK] = UINT64_MAX; return; } itembranch = heap->size & MM_HEAP_MASK; insp = unit->p[ itembranch ]; insvalue = unit->value[ itembranch ]; unit->value[ itembranch ] = UINT64_MAX; parent = &heap->array[0]; parentbranch = MM_HEAP_MASK; parentorder = MM_HEAP_MASK; for( ; ; ) { itemorder = ( parentorder - MM_HEAP_OFFSET ) << MM_HEAP_SHIFT; if( itemorder >= heap->size ) break; unit = &heap->array[ itemorder >> MM_HEAP_SHIFT ]; minval = insvalue; itembranch = -1; for( i = 0 ; i < MM_HEAP_WIDTH ; i++ ) { if( unit->value[i] < minval ) { minval = unit->value[i]; itembranch = i; } } if( itembranch < 0 ) break; parent->value[ parentbranch ] = unit->value[ itembranch ]; parent->p[ parentbranch ] = unit->p[ itembranch ]; parent = unit; parentbranch = itembranch; parentorder = itemorder + itembranch; } parent->value[ parentbranch ] = insvalue; parent->p[ parentbranch ] = insp; return; } size_t mmHeap64GetCount( mmHeap64 *heap ) { return heap->size - MM_HEAP_MASK; } void mmHeap64Debug( mmHeap64 *heap ) { int i, itembranch; mmHeap64Unit *unit; printf( "\n== Heap debug ==\n" ); itembranch = 0; for( i = MM_HEAP_MASK ; i < heap->size ; i++ ) { unit = &heap->array[ i >> MM_HEAP_SHIFT ]; itembranch = i & MM_HEAP_MASK; printf( " %llu", (long long)unit->value[ itembranch ] ); if( itembranch == MM_HEAP_MASK ) printf( "\n" ); } if( itembranch != MM_HEAP_MASK ) printf( "\n" ); printf( "== Heap debug ==\n\n" ); fflush( stdout ); return; } //// void mmHeap64sbInit( mmHeap64sb *heap, size_t sizehint ) { mmHeap64sbUnit *unit; heap->size = MM_HEAP_MASK; heap->alloc = ( sizehint + MM_HEAP_MASK ) & ~MM_HEAP_MASK; if( heap->alloc < 1024 ) heap->alloc = 1024; heap->array = malloc( ( heap->alloc >> MM_HEAP_SHIFT ) * sizeof(mmHeap64sbUnit) ); unit = &heap->array[0]; unit->value[MM_HEAP_MASK] = INT64_MAX; return; } void mmHeap64sbFree( mmHeap64sb *heap ) { free( heap->array ); memset( heap, 0, sizeof(mmHeap64sb) ); return; } int mmHeap64sbGetFirst( mmHeap64sb *heap, int64_t *retvalue ) { mmHeap64sbUnit *unit; if( heap->size == MM_HEAP_MASK ) return 0; unit = &heap->array[0]; #if MM_HEAP_DEBUG printf( " ## GetFirst %d\n", unit->value[MM_HEAP_MASK] ); fflush( stdout ); #endif *retvalue = unit->value[MM_HEAP_MASK]; return 1; } void mmHeap64sbInsert( mmHeap64sb *heap, int64_t value ) { int i, itembranch, parentbranch; size_t itemorder, parentorder; mmHeap64sbUnit *unit, *parent; if( CC_UNLIKELY( heap->size >= heap->alloc ) ) { heap->alloc <<= 1; heap->array = realloc( heap->array, ( heap->alloc >> MM_HEAP_SHIFT ) * sizeof(mmHeap64sbUnit) ); } itemorder = heap->size; heap->size++; unit = &heap->array[ itemorder >> MM_HEAP_SHIFT ]; itembranch = itemorder & MM_HEAP_MASK; if( itembranch == 0 ) { for( i = 1 ; i < MM_HEAP_WIDTH ; i++ ) unit->value[i] = INT64_MAX; } for( ; itemorder >= MM_HEAP_WIDTH ; ) { parentorder = ( itemorder >> MM_HEAP_SHIFT ) + MM_HEAP_OFFSET; parent = &heap->array[ parentorder >> MM_HEAP_SHIFT ]; parentbranch = parentorder & MM_HEAP_MASK; if( parent->value[ parentbranch ] < value ) break; unit->value[ itembranch ] = parent->value[ parentbranch ]; unit = parent; itemorder = parentorder; itembranch = parentbranch; } unit->value[ itembranch ] = value; return; } void mmHeap64sbDeleteFirst( mmHeap64sb *heap ) { int i, itembranch, parentbranch; int64_t insvalue, minval; size_t itemorder, parentorder; void *insp; mmHeap64sbUnit *unit, *parent; #if CPU_SSE4_2_SUPPORT && ( MM_HEAP_WIDTH == 16 ) && 1 __m128i vunitvalue0, vunitvalue1, vunitvalue2, vunitvalue3, vunitvalue4, vunitvalue5, vunitvalue6, vunitvalue7, vmin; __m128i vbranch0, vbranch1, vbranch2, vbranch3, vbranch4, vbranch5, vbranch6, vbranch7; __m128i vbranch01, vbranch23, vbranch45,vbranch67; __m128i vbranch0123, vbranch4567; __m128i vbranch; __m128i vminmask01, vminmask23, vminmask45, vminmask67; __m128i vmin01, vmin23, vmin45, vmin67; __m128i vminmask0123, vminmask4567; __m128i vmin0123, vmin4567; __m128i vminhigh, vbranchhigh, vminmask; #endif heap->size--; unit = &heap->array[ heap->size >> MM_HEAP_SHIFT ]; if( CC_UNLIKELY( heap->size == MM_HEAP_MASK ) ) { unit->value[MM_HEAP_MASK] = INT64_MAX; return; } itembranch = heap->size & MM_HEAP_MASK; insvalue = unit->value[ itembranch ]; unit->value[ itembranch ] = INT64_MAX; parent = &heap->array[0]; parentbranch = MM_HEAP_MASK; parentorder = MM_HEAP_MASK; for( ; ; ) { itemorder = ( parentorder - MM_HEAP_OFFSET ) << MM_HEAP_SHIFT; if( itemorder >= heap->size ) break; unit = &heap->array[ itemorder >> MM_HEAP_SHIFT ]; #if CPU_SSE4_2_SUPPORT && ( MM_HEAP_WIDTH == 16 ) && 1 vunitvalue0 = _mm_load_si128( (void *)&unit->value[0] ); vunitvalue1 = _mm_load_si128( (void *)&unit->value[2] ); vunitvalue2 = _mm_load_si128( (void *)&unit->value[4] ); vunitvalue3 = _mm_load_si128( (void *)&unit->value[6] ); vunitvalue4 = _mm_load_si128( (void *)&unit->value[8] ); vunitvalue5 = _mm_load_si128( (void *)&unit->value[10] ); vunitvalue6 = _mm_load_si128( (void *)&unit->value[12] ); vunitvalue7 = _mm_load_si128( (void *)&unit->value[14] ); vbranch0 = _mm_set_epi64x( 1, 0 ); vbranch1 = _mm_set_epi64x( 3, 2 ); vbranch2 = _mm_set_epi64x( 5, 4 ); vbranch3 = _mm_set_epi64x( 7, 6 ); vbranch4 = _mm_set_epi64x( 9, 8 ); vbranch5 = _mm_set_epi64x( 11, 10 ); vbranch6 = _mm_set_epi64x( 13, 12 ); vbranch7 = _mm_set_epi64x( 15, 14 ); /* I wish we had a _mm_min_epi64() / _mm_min_epu64(), *sigh* */ vminmask01 = _mm_cmpgt_epi64( vunitvalue0, vunitvalue1 ); vminmask23 = _mm_cmpgt_epi64( vunitvalue2, vunitvalue3 ); vminmask45 = _mm_cmpgt_epi64( vunitvalue4, vunitvalue5 ); vminmask67 = _mm_cmpgt_epi64( vunitvalue6, vunitvalue7 ); vmin01 = _mm_blendv_epi8( vunitvalue0, vunitvalue1, vminmask01 ); vmin23 = _mm_blendv_epi8( vunitvalue2, vunitvalue3, vminmask23 ); vmin45 = _mm_blendv_epi8( vunitvalue4, vunitvalue5, vminmask45 ); vmin67 = _mm_blendv_epi8( vunitvalue6, vunitvalue7, vminmask67 ); vbranch01 = _mm_blendv_epi8( vbranch0, vbranch1, vminmask01 ); vbranch23 = _mm_blendv_epi8( vbranch2, vbranch3, vminmask23 ); vbranch45 = _mm_blendv_epi8( vbranch4, vbranch5, vminmask45 ); vbranch67 = _mm_blendv_epi8( vbranch6, vbranch7, vminmask67 ); vminmask0123 = _mm_cmpgt_epi64( vmin01, vmin23 ); vminmask4567 = _mm_cmpgt_epi64( vmin45, vmin67 ); vmin0123 = _mm_blendv_epi8( vmin01, vmin23, vminmask0123 ); vmin4567 = _mm_blendv_epi8( vmin45, vmin67, vminmask4567 ); vbranch0123 = _mm_blendv_epi8( vbranch01, vbranch23, vminmask0123 ); vbranch4567 = _mm_blendv_epi8( vbranch45, vbranch67, vminmask4567 ); vminmask = _mm_cmpgt_epi64( vmin0123, vmin4567 ); vmin = _mm_blendv_epi8( vmin0123, vmin4567, vminmask ); vbranch = _mm_blendv_epi8( vbranch0123, vbranch4567, vminmask ); vminhigh = _mm_shuffle_epi32( vmin, 0x4e ); vbranchhigh = _mm_shuffle_epi32( vbranch, 0x4e ); vminmask = _mm_cmpgt_epi64( vmin, vminhigh ); #if 1 /* When the two min values are identical, pick the lowest index branch ~ not worth it? */ /* !!! Required to ensure heap is full before heap->alloc runs out and we realloc() !!! */ vminmask = _mm_or_si128( vminmask, _mm_and_si128( _mm_cmpeq_epi64( vmin, vminhigh ), _mm_cmpgt_epi64( vbranch, vbranchhigh ) ) ); #endif vmin = _mm_blendv_epi8( vmin, vminhigh, vminmask ); vbranch = _mm_blendv_epi8( vbranch, vbranchhigh, vminmask ); minval = _mm_cvtsi128_si64( vmin ); if( insvalue <= minval ) break; itembranch = _mm_cvtsi128_si64( vbranch ); #else minval = insvalue; itembranch = -1; for( i = 0 ; i < MM_HEAP_WIDTH ; i++ ) { if( unit->value[i] < minval ) { minval = unit->value[i]; itembranch = i; } } #endif if( itembranch < 0 ) break; parent->value[ parentbranch ] = unit->value[ itembranch ]; parent = unit; parentbranch = itembranch; parentorder = itemorder + itembranch; } parent->value[ parentbranch ] = insvalue; return; } size_t mmHeap64sbGetCount( mmHeap64sb *heap ) { return heap->size - MM_HEAP_MASK; } /* Only use to build initial heap with already sorted values */ void mmHeap64sbSortedPush( mmHeap64sb *heap, int64_t value ) { int i, itembranch, parentbranch; size_t itemorder, parentorder; mmHeap64sbUnit *unit, *parent; if( CC_UNLIKELY( heap->size >= heap->alloc ) ) { heap->alloc <<= 1; heap->array = realloc( heap->array, ( heap->alloc >> MM_HEAP_SHIFT ) * sizeof(mmHeap64sbUnit) ); } itemorder = heap->size; heap->size++; unit = &heap->array[ itemorder >> MM_HEAP_SHIFT ]; itembranch = itemorder & MM_HEAP_MASK; if( itembranch == 0 ) { for( i = 1 ; i < MM_HEAP_WIDTH ; i++ ) unit->value[i] = INT64_MAX; } unit->value[ itembranch ] = value; return; } /* void mmHeap64sbDebug( mmHeap64sb *heap ) { int i, itembranch; mmHeap64sbUnit *unit; printf( "\n== Heap debug ==\n" ); itembranch = 0; for( i = MM_HEAP_MASK ; i < heap->size ; i++ ) { unit = &heap->array[ i >> MM_HEAP_SHIFT ]; itembranch = i & MM_HEAP_MASK; printf( " %lld", (long long)unit->value[ itembranch ] ); if( itembranch == MM_HEAP_MASK ) printf( "\n" ); } if( itembranch != MM_HEAP_MASK ) printf( "\n" ); printf( "== Heap debug ==\n\n" ); fflush( stdout ); return; } */ void mmHeap64sbDebug( mmHeap64sb *heap, int verbose ) { int i, itembranch, parentorder; int64_t refvalue; mmHeap64sbUnit *unit, *parent; startover: if( verbose ) printf( "\n== Heap debug (size %d) ==\n", (int)heap->size ); if( verbose ) { void *p; refvalue = 0.0f; mmHeap64sbGetFirst( heap, &refvalue ); printf( "First: %lld\n", (long long)refvalue ); } itembranch = 0; for( i = MM_HEAP_MASK ; i < heap->size ; i++ ) { unit = &heap->array[ i >> MM_HEAP_SHIFT ]; itembranch = i & MM_HEAP_MASK; if( verbose ) printf( " %lld", (long long)unit->value[ itembranch ] ); refvalue = unit->value[ itembranch ]; for( parentorder = i ; ; ) { parentorder = ( parentorder >> MM_HEAP_SHIFT ) + MM_HEAP_OFFSET; if( parentorder < MM_HEAP_MASK ) break; parent = &heap->array[ parentorder >> MM_HEAP_SHIFT ]; if( parent->value[ parentorder & MM_HEAP_MASK ] > refvalue ) { if( verbose ) printf( "***" ); else { printf( "Bad heap layout %lld > %lld", (long long)parent->value[ parentorder & MM_HEAP_MASK ], (long long)refvalue ); if( !verbose ) { verbose = 1; goto startover; } } } else if( verbose ) printf( "!" ); refvalue = parent->value[ parentorder & MM_HEAP_MASK ]; } if( verbose ) { if( itembranch == MM_HEAP_MASK ) printf( "\n" ); } } if( verbose ) { if( itembranch != MM_HEAP_MASK ) printf( "\n" ); printf( "== Heap debug ==\n\n" ); } fflush( stdout ); return; } //// void mmHeapfInit( mmHeapf *heap, size_t sizehint ) { mmHeapfUnit *unit; heap->size = MM_HEAP_MASK; heap->alloc = ( sizehint + MM_HEAP_MASK ) & ~MM_HEAP_MASK; if( heap->alloc < 1024 ) heap->alloc = 1024; heap->array = malloc( ( heap->alloc >> MM_HEAP_SHIFT ) * sizeof(mmHeapfUnit) ); unit = &heap->array[0]; unit->p[MM_HEAP_MASK] = 0; unit->value[MM_HEAP_MASK] = FLT_MAX; return; } void mmHeapfFree( mmHeapf *heap ) { free( heap->array ); memset( heap, 0, sizeof(mmHeapf) ); return; } void *mmHeapfGetFirst( mmHeapf *heap, float *retvalue ) { mmHeapfUnit *unit; unit = &heap->array[0]; #if MM_HEAP_DEBUG printf( " ## GetFirst %p %f\n", unit->p[MM_HEAP_MASK], unit->value[MM_HEAP_MASK] ); fflush( stdout ); #endif *retvalue = unit->value[MM_HEAP_MASK]; return unit->p[MM_HEAP_MASK]; } void mmHeapfInsert( mmHeapf *heap, void *p, float value ) { int i, itembranch, parentbranch; size_t itemorder, parentorder; mmHeapfUnit *unit, *parent; #if MM_HEAP_DEBUG printf( " ## Insert %p %f\n", p, value ); fflush( stdout ); #endif if( CC_UNLIKELY( heap->size >= heap->alloc ) ) { heap->alloc <<= 1; heap->array = realloc( heap->array, ( heap->alloc >> MM_HEAP_SHIFT ) * sizeof(mmHeapfUnit) ); } itemorder = heap->size; heap->size++; unit = MM_HEAP_ALIGNED( &heap->array[ itemorder >> MM_HEAP_SHIFT ] ); itembranch = itemorder & MM_HEAP_MASK; if( itembranch == 0 ) { for( i = 0 ; i < MM_HEAP_WIDTH ; i++ ) unit->value[i] = FLT_MAX; } for( ; itemorder >= MM_HEAP_WIDTH ; ) { parentorder = ( itemorder >> MM_HEAP_SHIFT ) + MM_HEAP_OFFSET; parent = &heap->array[ parentorder >> MM_HEAP_SHIFT ]; parentbranch = parentorder & MM_HEAP_MASK; if( parent->value[ parentbranch ] < value ) break; unit->value[ itembranch ] = parent->value[ parentbranch ]; unit->p[ itembranch ] = parent->p[ parentbranch ]; unit = parent; itemorder = parentorder; itembranch = parentbranch; } unit->value[ itembranch ] = value; unit->p[ itembranch ] = p; return; } void mmHeapfDeleteFirst( mmHeapf *heap ) { int itembranch, parentbranch; float insvalue, minval; size_t itemorder, parentorder; void *insp; mmHeapfUnit *unit, *parent; #if CPU_SSE2_SUPPORT && ( MM_HEAP_WIDTH == 4 ) int eqmask; __m128 vunitvalue; #elif CPU_SSE2_SUPPORT && ( MM_HEAP_WIDTH == 8 ) uint32_t eqmask0, eqmask1, brmask0; uint32_t itembranch0, itembranch1; __m128 vunitvalue0, vunitvalue1, vmin; #elif CPU_SSE2_SUPPORT && ( MM_HEAP_WIDTH == 16 ) uint32_t eqmask0, eqmask1, eqmask2, eqmask3, brmask0, brmask1, brmask2; uint32_t itembranch0, itembranch1, itembranch2, itembranch3; __m128 vunitvalue0, vunitvalue1, vunitvalue2, vunitvalue3, vmin; #else int i; #endif #if MM_HEAP_DEBUG unit = &heap->array[0]; printf( " ## Delete %p %f\n", unit->p[MM_HEAP_MASK], unit->value[MM_HEAP_MASK] ); fflush( stdout ); #endif heap->size--; unit = &heap->array[ heap->size >> MM_HEAP_SHIFT ]; if( CC_UNLIKELY( heap->size == MM_HEAP_MASK ) ) { unit->p[MM_HEAP_MASK] = 0; unit->value[MM_HEAP_MASK] = FLT_MAX; return; } itembranch = heap->size & MM_HEAP_MASK; insp = unit->p[ itembranch ]; insvalue = unit->value[ itembranch ]; unit->value[ itembranch ] = FLT_MAX; parent = MM_HEAP_ALIGNED( &heap->array[0] ); parentbranch = MM_HEAP_MASK; parentorder = MM_HEAP_MASK; for( ; ; ) { itemorder = ( parentorder - MM_HEAP_OFFSET ) << MM_HEAP_SHIFT; if( itemorder >= heap->size ) break; unit = MM_HEAP_ALIGNED( &heap->array[ itemorder >> MM_HEAP_SHIFT ] ); #if CPU_SSE2_SUPPORT && ( MM_HEAP_WIDTH == 4 ) vunitvalue = _mm_load_ps( &unit->value[0] ); minval = fminf( fminf( unit->value[0], unit->value[1] ), fminf( unit->value[2], unit->value[3] ) ); if( insvalue <= minval ) break; eqmask = _mm_movemask_ps( _mm_cmpeq_ps( _mm_set1_ps( minval ), vunitvalue ) ); itembranch = ( 0x12131210 >> ( eqmask + eqmask ) ) & 3; #elif CPU_SSE2_SUPPORT && ( MM_HEAP_WIDTH == 8 ) vunitvalue0 = _mm_load_ps( &unit->value[0] ); vunitvalue1 = _mm_load_ps( &unit->value[4] ); vmin = _mm_min_ps( vunitvalue0, vunitvalue1 ); vmin = _mm_min_ps( vmin, _mm_shuffle_ps( vmin, vmin, 0x4e ) ); vmin = _mm_min_ps( vmin, _mm_shuffle_ps( vmin, vmin, 0x11 ) ); minval = _mm_cvtss_f32( vmin ); if( insvalue <= minval ) break; eqmask0 = _mm_movemask_ps( _mm_cmpeq_ps( vmin, vunitvalue0 ) ); eqmask1 = _mm_movemask_ps( _mm_cmpeq_ps( vmin, vunitvalue1 ) ); itembranch0 = ( 0x12131210 >> ( eqmask0 + eqmask0 ) ) & 3; itembranch1 = 4 + ( ( 0x12131210 >> ( eqmask1 + eqmask1 ) ) & 3 ); brmask0 = (((int32_t)eqmask0)-1)>>31; itembranch = ( itembranch0 & ~brmask0 ) | ( itembranch1 & brmask0 ); #elif CPU_SSE2_SUPPORT && ( MM_HEAP_WIDTH == 16 ) vunitvalue0 = _mm_load_ps( &unit->value[0] ); vunitvalue1 = _mm_load_ps( &unit->value[4] ); vunitvalue2 = _mm_load_ps( &unit->value[8] ); vunitvalue3 = _mm_load_ps( &unit->value[12] ); vmin = _mm_min_ps( _mm_min_ps( vunitvalue0, vunitvalue1 ), _mm_min_ps( vunitvalue2, vunitvalue3 ) ); vmin = _mm_min_ps( vmin, _mm_shuffle_ps( vmin, vmin, 0x4e ) ); vmin = _mm_min_ps( vmin, _mm_shuffle_ps( vmin, vmin, 0x11 ) ); minval = _mm_cvtss_f32( vmin ); if( insvalue <= minval ) break; eqmask0 = _mm_movemask_ps( _mm_cmpeq_ps( vmin, vunitvalue0 ) ); eqmask1 = _mm_movemask_ps( _mm_cmpeq_ps( vmin, vunitvalue1 ) ); eqmask2 = _mm_movemask_ps( _mm_cmpeq_ps( vmin, vunitvalue2 ) ); eqmask3 = _mm_movemask_ps( _mm_cmpeq_ps( vmin, vunitvalue3 ) ); itembranch0 = 0x00 + eqmask0; itembranch1 = 0x40 + eqmask1; itembranch2 = 0x80 + eqmask2; itembranch3 = 0xc0 + eqmask3; brmask0 = (((int32_t)eqmask0)-1)>>31; brmask1 = (((int32_t)eqmask1)-1)>>31; brmask2 = (((int32_t)eqmask2)-1)>>31; itembranch = ( itembranch0 & ~brmask0 ) | ( itembranch1 & brmask0 & ~brmask1 ); itembranch |= ( itembranch2 & brmask0 & brmask1 & ~brmask2 ) | ( itembranch3 & brmask0 & brmask1 & brmask2 ); itembranch = ( itembranch >> 4 ) | ( ( 0x12131210 >> ( ( itembranch & 0xf ) << 1 ) ) & 3 ); #else minval = insvalue; itembranch = -1; for( i = 0 ; i < MM_HEAP_WIDTH ; i++ ) { if( unit->value[i] < minval ) { minval = unit->value[i]; itembranch = i; } } if( itembranch < 0 ) break; #endif parent->value[ parentbranch ] = minval; parent->p[ parentbranch ] = unit->p[ itembranch ]; parent = unit; parentbranch = itembranch; parentorder = itemorder + itembranch; } parent->value[ parentbranch ] = insvalue; parent->p[ parentbranch ] = insp; return; } size_t mmHeapfGetCount( mmHeapf *heap ) { return heap->size - MM_HEAP_MASK; } void mmHeapfDebug( mmHeapf *heap, int verbose ) { int i, itembranch, parentorder; float refvalue; mmHeapfUnit *unit, *parent; startover: if( verbose ) printf( "\n== Heap debug (size %d) ==\n", (int)heap->size ); if( verbose ) { void *p; refvalue = 0.0f; p = mmHeapfGetFirst( heap, &refvalue ); printf( "First: %p %f\n", p, refvalue ); } itembranch = 0; for( i = MM_HEAP_MASK ; i < heap->size ; i++ ) { unit = &heap->array[ i >> MM_HEAP_SHIFT ]; itembranch = i & MM_HEAP_MASK; if( verbose ) printf( " %.12f", unit->value[ itembranch ] ); refvalue = unit->value[ itembranch ]; for( parentorder = i ; ; ) { parentorder = ( parentorder >> MM_HEAP_SHIFT ) + MM_HEAP_OFFSET; if( parentorder < MM_HEAP_MASK ) break; parent = &heap->array[ parentorder >> MM_HEAP_SHIFT ]; if( parent->value[ parentorder & MM_HEAP_MASK ] > refvalue ) { if( verbose ) printf( "***" ); else { printf( "Bad heap layout %.12f > %.12f", parent->value[ parentorder & MM_HEAP_MASK ], refvalue ); if( !verbose ) { verbose = 1; goto startover; } } } else if( verbose ) printf( "!" ); refvalue = parent->value[ parentorder & MM_HEAP_MASK ]; } if( verbose ) { if( itembranch == MM_HEAP_MASK ) printf( "\n" ); } } if( verbose ) { if( itembranch != MM_HEAP_MASK ) printf( "\n" ); printf( "== Heap debug ==\n\n" ); } fflush( stdout ); return; } ////