Efficient integer compare function Efficient integer compare function c c

Efficient integer compare function


This one has no branches, and doesn't suffer from overflow or underflow:

return (a > b) - (a < b);

With gcc -O2 -S, this compiles down to the following six instructions:

xorl    %eax, %eaxcmpl    %esi, %edisetl    %dlsetg    %almovzbl  %dl, %edxsubl    %edx, %eax

Here's some code to benchmark various compare implementations:

#include <stdio.h>#include <stdlib.h>#define COUNT 1024#define LOOPS 500#define COMPARE compare2#define USE_RAND 1int arr[COUNT];int compare1 (int a, int b){    if (a < b) return -1;    if (a > b) return 1;    return 0;}int compare2 (int a, int b){    return (a > b) - (a < b);}int compare3 (int a, int b){    return (a < b) ? -1 : (a > b);}int compare4 (int a, int b){    __asm__ __volatile__ (        "sub %1, %0 \n\t"        "jno 1f \n\t"        "cmc \n\t"        "rcr %0 \n\t"        "1: "    : "+r"(a)    : "r"(b)    : "cc");    return a;}int main (){    for (int i = 0; i < COUNT; i++) {#if USE_RAND        arr[i] = rand();#else        for (int b = 0; b < sizeof(arr[i]); b++) {            *((unsigned char *)&arr[i] + b) = rand();        }#endif    }    int sum = 0;    for (int l = 0; l < LOOPS; l++) {        for (int i = 0; i < COUNT; i++) {            for (int j = 0; j < COUNT; j++) {                sum += COMPARE(arr[i], arr[j]);            }        }    }    printf("%d=0\n", sum);    return 0;}

The results on my 64-bit system, compiled with gcc -std=c99 -O2, for positive integers (USE_RAND=1):

compare1: 0m1.118scompare2: 0m0.756scompare3: 0m1.101scompare4: 0m0.561s

Out of C-only solutions, the one I suggested was the fastest. user315052's solution was slower despite compiling to only 5 instructions. The slowdown is likely because, despite having one less instruction, there is a conditional instruction (cmovge).

Overall, FredOverflow's 4-instruction assembly implementation was the fastest when used with positive integers. However, this code only benchmarked the integer range RAND_MAX, so the 4-instuction test is biased, because it handles overflows separately, and these don't occur in the test; the speed may be due to successful branch prediction.

With a full range of integers (USE_RAND=0), the 4-instruction solution is in fact very slow (others are the same):

compare4: 0m1.897s


The following has always proven to be fairly efficient for me:

return (a < b) ? -1 : (a > b);

With gcc -O2 -S, this compiles down to the following five instructions:

xorl    %edx, %edxcmpl    %esi, %edimovl    $-1, %eaxsetg    %dlcmovge  %edx, %eax

As a follow-up to Ambroz Bizjak's excellent companion answer, I was not convinced that his program tested the same assembly code what was posted above. And, when I was studying the compiler output more closely, I noticed that the compiler was not generating the same instructions as was posted in either of our answers. So, I took his test program, hand modified the assembly output to match what we posted, and compared the resulting times. It seems the two versions compare roughly identically.

./opt_cmp_branchless: 0m1.070s./opt_cmp_branch:     0m1.037s

I am posting the assembly of each program in full so that others may attempt the same experiment, and confirm or contradict my observation.

The following is the version with the cmovge instruction ((a < b) ? -1 : (a > b)):

        .file   "cmp.c"        .text        .section        .rodata.str1.1,"aMS",@progbits,1.LC0:        .string "%d=0\n"        .text        .p2align 4,,15.globl main        .type   main, @functionmain:.LFB20:        .cfi_startproc        pushq   %rbp        .cfi_def_cfa_offset 16        .cfi_offset 6, -16        pushq   %rbx        .cfi_def_cfa_offset 24        .cfi_offset 3, -24        movl    $arr.2789, %ebx        subq    $8, %rsp        .cfi_def_cfa_offset 32.L9:        leaq    4(%rbx), %rbp.L10:        call    rand        movb    %al, (%rbx)        addq    $1, %rbx        cmpq    %rbx, %rbp        jne     .L10        cmpq    $arr.2789+4096, %rbp        jne     .L9        xorl    %r8d, %r8d        xorl    %esi, %esi        orl     $-1, %edi.L12:        xorl    %ebp, %ebp        .p2align 4,,10        .p2align 3.L18:        movl    arr.2789(%rbp), %ecx        xorl    %eax, %eax        .p2align 4,,10        .p2align 3.L15:        movl    arr.2789(%rax), %edx        xorl    %ebx, %ebx        cmpl    %ecx, %edx        movl    $-1, %edx        setg    %bl        cmovge  %ebx, %edx        addq    $4, %rax        addl    %edx, %esi        cmpq    $4096, %rax        jne     .L15        addq    $4, %rbp        cmpq    $4096, %rbp        jne     .L18        addl    $1, %r8d        cmpl    $500, %r8d        jne     .L12        movl    $.LC0, %edi        xorl    %eax, %eax        call    printf        addq    $8, %rsp        .cfi_def_cfa_offset 24        xorl    %eax, %eax        popq    %rbx        .cfi_def_cfa_offset 16        popq    %rbp        .cfi_def_cfa_offset 8        ret        .cfi_endproc.LFE20:        .size   main, .-main        .local  arr.2789        .comm   arr.2789,4096,32        .section        .note.GNU-stack,"",@progbits

The version below uses the branchless method ((a > b) - (a < b)):

        .file   "cmp.c"        .text        .section        .rodata.str1.1,"aMS",@progbits,1.LC0:        .string "%d=0\n"        .text        .p2align 4,,15.globl main        .type   main, @functionmain:.LFB20:        .cfi_startproc        pushq   %rbp        .cfi_def_cfa_offset 16        .cfi_offset 6, -16        pushq   %rbx        .cfi_def_cfa_offset 24        .cfi_offset 3, -24        movl    $arr.2789, %ebx        subq    $8, %rsp        .cfi_def_cfa_offset 32.L9:        leaq    4(%rbx), %rbp.L10:        call    rand        movb    %al, (%rbx)        addq    $1, %rbx        cmpq    %rbx, %rbp        jne     .L10        cmpq    $arr.2789+4096, %rbp        jne     .L9        xorl    %r8d, %r8d        xorl    %esi, %esi.L19:        movl    %ebp, %ebx        xorl    %edi, %edi        .p2align 4,,10        .p2align 3.L24:        movl    %ebp, %ecx        xorl    %eax, %eax        jmp     .L22        .p2align 4,,10        .p2align 3.L20:        movl    arr.2789(%rax), %ecx.L22:        xorl    %edx, %edx        cmpl    %ebx, %ecx        setg    %cl        setl    %dl        movzbl  %cl, %ecx        subl    %ecx, %edx        addl    %edx, %esi        addq    $4, %rax        cmpq    $4096, %rax        jne     .L20        addq    $4, %rdi        cmpq    $4096, %rdi        je      .L21        movl    arr.2789(%rdi), %ebx        jmp     .L24.L21:        addl    $1, %r8d        cmpl    $500, %r8d        jne     .L19        movl    $.LC0, %edi        xorl    %eax, %eax        call    printf        addq    $8, %rsp        .cfi_def_cfa_offset 24        xorl    %eax, %eax        popq    %rbx        .cfi_def_cfa_offset 16        popq    %rbp        .cfi_def_cfa_offset 8        ret        .cfi_endproc.LFE20:        .size   main, .-main        .local  arr.2789        .comm   arr.2789,4096,32        .section        .note.GNU-stack,"",@progbits


Okay, I managed to get it down to four instructions :) The basic idea is as follows:

Half the time, the difference is small enough to fit into an integer. In that case, just return the difference. Otherwise, shift the number one to the right. The crucial question is what bit to shift into the MSB then.

Let's look at two extreme examples, using 8 bits instead of 32 bits for the sake of simplicity:

 10000000 INT_MIN 01111111 INT_MAX---------000000001 difference 00000000 shifted 01111111 INT_MAX 10000000 INT_MIN---------111111111 difference 11111111 shifted

Shifting the carry bit in would yield 0 for the first case (although INT_MIN is not equal to INT_MAX) and some negative number for the second case (although INT_MAX is not smaller than INT_MIN).

But if we flip the carry bit before doing the shift, we get sensible numbers:

 10000000 INT_MIN 01111111 INT_MAX---------000000001 difference100000001 carry flipped 10000000 shifted 01111111 INT_MAX 10000000 INT_MIN---------111111111 difference011111111 carry flipped 01111111 shifted

I'm sure there's a deep mathematical reason why it makes sense to flip the carry bit, but I don't see it yet.

int compare_int(int a, int b){    __asm__ __volatile__ (        "sub %1, %0 \n\t"        "jno 1f \n\t"        "cmc \n\t"        "rcr %0 \n\t"        "1: "    : "+r"(a)    : "r"(b)    : "cc");    return a;}

I have tested the code with one million random inputs plus every combination of INT_MIN, -INT_MAX, INT_MIN/2, -1, 0, 1, INT_MAX/2, INT_MAX/2+1, INT_MAX. All tests passed. Can you proove me wrong?