Showing posts with label performance. Show all posts
Showing posts with label performance. Show all posts

Sunday, June 17, 2012

Assembly analysis (Part 1)

Recently my friend Yan posted something about static function:



And people start discussing about the performance different between this with the normal switch statement. And some saying that complier should be smart enough to optimize that. A simple tests showing that the runtime different seems negotiable.

To see how complier handle that, the best way to do is going to see the assembly. I rewrite the program in C because it is easier to map C code with assembly. (source code : https://github.com/johnlcf/test/downloads)

# gcc -g -o f1 f1.c
# objdump -d -M intel -S f1

Use static function:

0000000000400514 <foo>:

void foo(int i) {
  400514: 55                    push   rbp
  400515: 48 89 e5              mov    rbp,rsp
  400518: 48 83 ec 10           sub    rsp,0x10
  40051c: 89 7d fc              mov    DWORD PTR [rbp-0x4],edi
    static void (*lookup[])() = {zero, one, two, three, four};
    lookup[i]();
  40051f: 8b 45 fc              mov    eax,DWORD PTR [rbp-0x4]
  400522: 48 98                 cdqe   
  400524: 48 8b 14 c5 20 0a 60  mov    rdx,QWORD PTR [rax*8+0x600a20]
  40052b: 00 
  40052c: b8 00 00 00 00        mov    eax,0x0
  400531: ff d2                 call   rdx
}
  400533: c9                    leave  
  400534: c3                    ret 

# gcc -g -o f2 f2.c
# objdump -d -M intel -S f2
Use switch statement:
0000000000400514 <foo>:

void foo(int i) {
  400514: 55                    push   rbp
  400515: 48 89 e5              mov    rbp,rsp
  400518: 48 83 ec 10           sub    rsp,0x10
  40051c: 89 7d fc              mov    DWORD PTR [rbp-0x4],edi
    switch (i) 
  40051f: 83 7d fc 04           cmp    DWORD PTR [rbp-0x4],0x4
  400523: 77 48                 ja     40056d <foo+0x59>
  400525: 8b 45 fc              mov    eax,DWORD PTR [rbp-0x4]
  400528: 48 8b 04 c5 a8 06 40  mov    rax,QWORD PTR [rax*8+0x4006a8]
  40052f: 00 
  400530: ff e0                 jmp    rax
    {
        case 0:
            zero();
  400532: b8 00 00 00 00        mov    eax,0x0
  400537: e8 88 ff ff ff        call   4004c4 <zero>
            break;
  40053c: eb 2f                 jmp    40056d <foo+0x59>
        case 1:
            one();
  40053e: b8 00 00 00 00        mov    eax,0x0
  400543: e8 8c ff ff ff        call   4004d4 <one>
            break;
  400548: eb 23                 jmp    40056d <foo+0x59>
        case 2:
            two();
  40054a: b8 00 00 00 00        mov    eax,0x0
  40054f: e8 90 ff ff ff        call   4004e4 <two>
            break;
  400554: eb 17                 jmp    40056d <foo+0x59>
        case 3:
            three();
  400556: b8 00 00 00 00        mov    eax,0x0
  40055b: e8 94 ff ff ff        call   4004f4 <three>
            break;
  400560: eb 0b                 jmp    40056d <foo+0x59>
        case 4:
            four();
  400562: b8 00 00 00 00        mov    eax,0x0
  400567: e8 98 ff ff ff        call   400504 <four>
            break;
  40056c: 90                    nop
    }
}
  40056d: c9                    leave  
  40056e: c3                    ret    

It is interesting to see that actually gcc optimize the switch statement with a similar approach, which called "jump tables".

// Check  if i > 4, if yes, jump to the end of switch statement
  40051f: 83 7d fc 04           cmp    DWORD PTR [rbp-0x4],0x4
  400523: 77 48                 ja     40056d <foo+0x59>
// rax = i * (pointer size: 8) + (the start address of "jump table: 0x400678)
  400525: 8b 45 fc              mov    eax,DWORD PTR [rbp-0x4]
  400528: 48 8b 04 c5 a8 06 40  mov    rax,QWORD PTR [rax*8+0x4006a8]
  40052f: 00 
// jump to there
  400530: ff e0                 jmp    rax


And the jump table is already initialized with the address of the "cases": 400532, 40053c, 400546, 400550. (not shown here).

Furthermore, I find a gcc option that can tune the behaviour: -fno-jump-tables. Here is the assembly after using that option with switch statement, which do a little bit more compare (cmp) and jump-if-equal (eq).


# gcc -g -fno-jump-tables -o f2_no_jump_tables f2.c
# objdump -d -M intel -S f2

 
0000000000400514 <foo>:

void foo(int i) {
  400514: 55                    push   rbp
  400515: 48 89 e5              mov    rbp,rsp
  400518: 48 83 ec 10           sub    rsp,0x10
  40051c: 89 7d fc              mov    DWORD PTR [rbp-0x4],edi
    switch (i) 
  40051f: 8b 45 fc              mov    eax,DWORD PTR [rbp-0x4]
  400522: 83 f8 02              cmp    eax,0x2
  400525: 74 34                 je     40055b <foo+0x47>
  400527: 83 f8 02              cmp    eax,0x2
  40052a: 7f 0b                 jg     400537 <foo+0x23>
  40052c: 85 c0                 test   eax,eax
  40052e: 74 13                 je     400543 <foo+0x2f>
  400530: 83 f8 01              cmp    eax,0x1
  400533: 74 1a                 je     40054f <foo+0x3b>
  400535: eb 47                 jmp    40057e <foo+0x6a>
  400537: 83 f8 03              cmp    eax,0x3
  40053a: 74 2b                 je     400567 <foo+0x53>
  40053c: 83 f8 04              cmp    eax,0x4
  40053f: 74 32                 je     400573 <foo+0x5f>
  400541: eb 3b                 jmp    40057e <foo+0x6a>
    {
        case 0:
            zero();
  400543: b8 00 00 00 00        mov    eax,0x0
  400548: e8 77 ff ff ff        call   4004c4 <zero>
            break;
  40054d: eb 2f                 jmp    40057e <foo+0x6a>
        case 1:
            one();
  40054f: b8 00 00 00 00        mov    eax,0x0
  400554: e8 7b ff ff ff        call   4004d4 <one>
            break;
  400559: eb 23                 jmp    40057e <foo+0x6a>
        case 2:
            two();
  40055b: b8 00 00 00 00        mov    eax,0x0
  400560: e8 7f ff ff ff        call   4004e4 <two>
            break;
  400565: eb 17                 jmp    40057e <foo+0x6a>
        case 3:
            three();
  400567: b8 00 00 00 00        mov    eax,0x0
  40056c: e8 83 ff ff ff        call   4004f4 <three>
            break;
  400571: eb 0b                 jmp    40057e <foo+0x6a>
        case 4:
            four();
  400573: b8 00 00 00 00        mov    eax,0x0
  400578: e8 87 ff ff ff        call   400504 <four>
            break;
  40057d: 90                    nop
    }
}
  40057e: c9                    leave  
  40057f: c3                    ret   

But actually it has done a (may be) simpler optimization. It use 2 as a pivot, after compare if it is 2, it will check if it is greater than 2 or less than 2.

After these analysis, I believed that the runtime of f1 is similar to that of f2, and f2_no_jump_tables would be a little bit slower than them. But when I test it:


$ time ./f1 > /dev/null 
real 0m0.349s
user 0m0.344s
sys 0m0.003s

$ time ./f2 > /dev/null 
real 0m0.272s
user 0m0.270s
sys 0m0.001s

$ time ./f2_no_jump_tables > /dev/null 
real 0m0.287s
user 0m0.285s
sys 0m0.001s

I run it a few times and got similar result. It is quite strange that f1 is significantly slower on my machine, while we already saw that it has similar assembly code in f2. I have to dive deeper and find out the truth (as far as I know) behind this. I will write a part 2 for this.

The more you know, the more you realize you don't know...