John's research (and his daily life): performance

Recently my friend Yan posted something about static function:

And people start discussing about the performance different between this with the normal switch statement. And some saying that complier should be smart enough to optimize that. A simple tests showing that the runtime different seems negotiable.

To see how complier handle that, the best way to do is going to see the assembly. I rewrite the program in C because it is easier to map C code with assembly. (source code : https://github.com/johnlcf/test/downloads)

# gcc -g -o f1 f1.c
# objdump -d -M intel -S f1

Use static function:

0000000000400514 <foo>:

void foo(int i) {
  400514: 55                    push   rbp
  400515: 48 89 e5              mov    rbp,rsp
  400518: 48 83 ec 10           sub    rsp,0x10
  40051c: 89 7d fc              mov    DWORD PTR [rbp-0x4],edi
    static void (*lookup[])() = {zero, one, two, three, four};
    lookup[i]();
  40051f: 8b 45 fc              mov    eax,DWORD PTR [rbp-0x4]
  400522: 48 98                 cdqe   
  400524: 48 8b 14 c5 20 0a 60  mov    rdx,QWORD PTR [rax*8+0x600a20]
  40052b: 00 
  40052c: b8 00 00 00 00        mov    eax,0x0
  400531: ff d2                 call   rdx
}
  400533: c9                    leave  
  400534: c3                    ret

# gcc -g -o f2 f2.c
# objdump -d -M intel -S f2
Use switch statement:

0000000000400514 <foo>:

void foo(int i) {
  400514: 55                    push   rbp
  400515: 48 89 e5              mov    rbp,rsp
  400518: 48 83 ec 10           sub    rsp,0x10
  40051c: 89 7d fc              mov    DWORD PTR [rbp-0x4],edi
    switch (i) 
  40051f: 83 7d fc 04           cmp    DWORD PTR [rbp-0x4],0x4
  400523: 77 48                 ja     40056d <foo+0x59>
  400525: 8b 45 fc              mov    eax,DWORD PTR [rbp-0x4]
  400528: 48 8b 04 c5 a8 06 40  mov    rax,QWORD PTR [rax*8+0x4006a8]
  40052f: 00 
  400530: ff e0                 jmp    rax
    {
        case 0:
            zero();
  400532: b8 00 00 00 00        mov    eax,0x0
  400537: e8 88 ff ff ff        call   4004c4 <zero>
            break;
  40053c: eb 2f                 jmp    40056d <foo+0x59>
        case 1:
            one();
  40053e: b8 00 00 00 00        mov    eax,0x0
  400543: e8 8c ff ff ff        call   4004d4 <one>
            break;
  400548: eb 23                 jmp    40056d <foo+0x59>
        case 2:
            two();
  40054a: b8 00 00 00 00        mov    eax,0x0
  40054f: e8 90 ff ff ff        call   4004e4 <two>
            break;
  400554: eb 17                 jmp    40056d <foo+0x59>
        case 3:
            three();
  400556: b8 00 00 00 00        mov    eax,0x0
  40055b: e8 94 ff ff ff        call   4004f4 <three>
            break;
  400560: eb 0b                 jmp    40056d <foo+0x59>
        case 4:
            four();
  400562: b8 00 00 00 00        mov    eax,0x0
  400567: e8 98 ff ff ff        call   400504 <four>
            break;
  40056c: 90                    nop
    }
}
  40056d: c9                    leave  
  40056e: c3                    ret

It is interesting to see that actually gcc optimize the switch statement with a similar approach, which called "jump tables".

// Check  if i > 4, if yes, jump to the end of switch statement
  40051f: 83 7d fc 04           cmp    DWORD PTR [rbp-0x4],0x4
  400523: 77 48                 ja     40056d <foo+0x59>
// rax = i * (pointer size: 8) + (the start address of "jump table: 0x400678)
  400525: 8b 45 fc              mov    eax,DWORD PTR [rbp-0x4]
  400528: 48 8b 04 c5 a8 06 40  mov    rax,QWORD PTR [rax*8+0x4006a8]
  40052f: 00 
// jump to there
  400530: ff e0                 jmp    rax

And the jump table is already initialized with the address of the "cases": 400532, 40053c, 400546, 400550. (not shown here).

Furthermore, I find a gcc option that can tune the behaviour: -fno-jump-tables. Here is the assembly after using that option with switch statement, which do a little bit more compare (cmp) and jump-if-equal (eq).

# gcc -g -fno-jump-tables -o f2_no_jump_tables f2.c
# objdump -d -M intel -S f2

0000000000400514 <foo>:

void foo(int i) {
  400514: 55                    push   rbp
  400515: 48 89 e5              mov    rbp,rsp
  400518: 48 83 ec 10           sub    rsp,0x10
  40051c: 89 7d fc              mov    DWORD PTR [rbp-0x4],edi
    switch (i) 
  40051f: 8b 45 fc              mov    eax,DWORD PTR [rbp-0x4]
  400522: 83 f8 02              cmp    eax,0x2
  400525: 74 34                 je     40055b <foo+0x47>
  400527: 83 f8 02              cmp    eax,0x2
  40052a: 7f 0b                 jg     400537 <foo+0x23>
  40052c: 85 c0                 test   eax,eax
  40052e: 74 13                 je     400543 <foo+0x2f>
  400530: 83 f8 01              cmp    eax,0x1
  400533: 74 1a                 je     40054f <foo+0x3b>
  400535: eb 47                 jmp    40057e <foo+0x6a>
  400537: 83 f8 03              cmp    eax,0x3
  40053a: 74 2b                 je     400567 <foo+0x53>
  40053c: 83 f8 04              cmp    eax,0x4
  40053f: 74 32                 je     400573 <foo+0x5f>
  400541: eb 3b                 jmp    40057e <foo+0x6a>
    {
        case 0:
            zero();
  400543: b8 00 00 00 00        mov    eax,0x0
  400548: e8 77 ff ff ff        call   4004c4 <zero>
            break;
  40054d: eb 2f                 jmp    40057e <foo+0x6a>
        case 1:
            one();
  40054f: b8 00 00 00 00        mov    eax,0x0
  400554: e8 7b ff ff ff        call   4004d4 <one>
            break;
  400559: eb 23                 jmp    40057e <foo+0x6a>
        case 2:
            two();
  40055b: b8 00 00 00 00        mov    eax,0x0
  400560: e8 7f ff ff ff        call   4004e4 <two>
            break;
  400565: eb 17                 jmp    40057e <foo+0x6a>
        case 3:
            three();
  400567: b8 00 00 00 00        mov    eax,0x0
  40056c: e8 83 ff ff ff        call   4004f4 <three>
            break;
  400571: eb 0b                 jmp    40057e <foo+0x6a>
        case 4:
            four();
  400573: b8 00 00 00 00        mov    eax,0x0
  400578: e8 87 ff ff ff        call   400504 <four>
            break;
  40057d: 90                    nop
    }
}
  40057e: c9                    leave  
  40057f: c3                    ret

But actually it has done a (may be) simpler optimization. It use 2 as a pivot, after compare if it is 2, it will check if it is greater than 2 or less than 2.

After these analysis, I believed that the runtime of f1 is similar to that of f2, and f2_no_jump_tables would be a little bit slower than them. But when I test it:


$ time ./f1 > /dev/null 
real 0m0.349s
user 0m0.344s
sys 0m0.003s

$ time ./f2 > /dev/null 
real 0m0.272s
user 0m0.270s
sys 0m0.001s

$ time ./f2_no_jump_tables > /dev/null 
real 0m0.287s
user 0m0.285s
sys 0m0.001s

I run it a few times and got similar result. It is quite strange that f1 is significantly slower on my machine, while we already saw that it has similar assembly code in f2. I have to dive deeper and find out the truth (as far as I know) behind this. I will write a part 2 for this.

The more you know, the more you realize you don't know...

John's research (and his daily life)

Sunday, June 17, 2012

Assembly analysis (Part 1)