And people start discussing about the performance different between this with the normal switch statement. And some saying that complier should be smart enough to optimize that. A simple tests showing that the runtime different seems negotiable.
To see how complier handle that, the best way to do is going to see the assembly. I rewrite the program in C because it is easier to map C code with assembly. (source code : https://github.com/johnlcf/test/downloads)
# gcc -g -o f1 f1.c
# objdump -d -M intel -S f1
Use static function:
0000000000400514 <foo>:
void foo(int i) {
400514: 55 push rbp
400515: 48 89 e5 mov rbp,rsp
400518: 48 83 ec 10 sub rsp,0x10
40051c: 89 7d fc mov DWORD PTR [rbp-0x4],edi
static void (*lookup[])() = {zero, one, two, three, four};
lookup[i]();
40051f: 8b 45 fc mov eax,DWORD PTR [rbp-0x4]
400522: 48 98 cdqe
400524: 48 8b 14 c5 20 0a 60 mov rdx,QWORD PTR [rax*8+0x600a20]
40052b: 00
40052c: b8 00 00 00 00 mov eax,0x0
400531: ff d2 call rdx
}
400533: c9 leave
400534: c3 ret
# gcc -g -o f2 f2.c
# objdump -d -M intel -S f2
Use switch statement:
0000000000400514 <foo>:
void foo(int i) {
400514: 55 push rbp
400515: 48 89 e5 mov rbp,rsp
400518: 48 83 ec 10 sub rsp,0x10
40051c: 89 7d fc mov DWORD PTR [rbp-0x4],edi
switch (i)
40051f: 83 7d fc 04 cmp DWORD PTR [rbp-0x4],0x4
400523: 77 48 ja 40056d <foo+0x59>
400525: 8b 45 fc mov eax,DWORD PTR [rbp-0x4]
400528: 48 8b 04 c5 a8 06 40 mov rax,QWORD PTR [rax*8+0x4006a8]
40052f: 00
400530: ff e0 jmp rax
{
case 0:
zero();
400532: b8 00 00 00 00 mov eax,0x0
400537: e8 88 ff ff ff call 4004c4 <zero>
break;
40053c: eb 2f jmp 40056d <foo+0x59>
case 1:
one();
40053e: b8 00 00 00 00 mov eax,0x0
400543: e8 8c ff ff ff call 4004d4 <one>
break;
400548: eb 23 jmp 40056d <foo+0x59>
case 2:
two();
40054a: b8 00 00 00 00 mov eax,0x0
40054f: e8 90 ff ff ff call 4004e4 <two>
break;
400554: eb 17 jmp 40056d <foo+0x59>
case 3:
three();
400556: b8 00 00 00 00 mov eax,0x0
40055b: e8 94 ff ff ff call 4004f4 <three>
break;
400560: eb 0b jmp 40056d <foo+0x59>
case 4:
four();
400562: b8 00 00 00 00 mov eax,0x0
400567: e8 98 ff ff ff call 400504 <four>
break;
40056c: 90 nop
}
}
40056d: c9 leave
40056e: c3 ret
It is interesting to see that actually gcc optimize the switch statement with a similar approach, which called "jump tables".// Check if i > 4, if yes, jump to the end of switch statement 40051f: 83 7d fc 04 cmp DWORD PTR [rbp-0x4],0x4 400523: 77 48 ja 40056d <foo+0x59> // rax = i * (pointer size: 8) + (the start address of "jump table: 0x400678) 400525: 8b 45 fc mov eax,DWORD PTR [rbp-0x4] 400528: 48 8b 04 c5 a8 06 40 mov rax,QWORD PTR [rax*8+0x4006a8] 40052f: 00 // jump to there 400530: ff e0 jmp rax
And the jump table is already initialized with the address of the "cases": 400532, 40053c, 400546, 400550. (not shown here).
Furthermore, I find a gcc option that can tune the behaviour: -fno-jump-tables. Here is the assembly after using that option with switch statement, which do a little bit more compare (cmp) and jump-if-equal (eq).
# gcc -g -fno-jump-tables -o f2_no_jump_tables f2.c
# objdump -d -M intel -S f2
0000000000400514 <foo>:
void foo(int i) {
400514: 55 push rbp
400515: 48 89 e5 mov rbp,rsp
400518: 48 83 ec 10 sub rsp,0x10
40051c: 89 7d fc mov DWORD PTR [rbp-0x4],edi
switch (i)
40051f: 8b 45 fc mov eax,DWORD PTR [rbp-0x4]
400522: 83 f8 02 cmp eax,0x2
400525: 74 34 je 40055b <foo+0x47>
400527: 83 f8 02 cmp eax,0x2
40052a: 7f 0b jg 400537 <foo+0x23>
40052c: 85 c0 test eax,eax
40052e: 74 13 je 400543 <foo+0x2f>
400530: 83 f8 01 cmp eax,0x1
400533: 74 1a je 40054f <foo+0x3b>
400535: eb 47 jmp 40057e <foo+0x6a>
400537: 83 f8 03 cmp eax,0x3
40053a: 74 2b je 400567 <foo+0x53>
40053c: 83 f8 04 cmp eax,0x4
40053f: 74 32 je 400573 <foo+0x5f>
400541: eb 3b jmp 40057e <foo+0x6a>
{
case 0:
zero();
400543: b8 00 00 00 00 mov eax,0x0
400548: e8 77 ff ff ff call 4004c4 <zero>
break;
40054d: eb 2f jmp 40057e <foo+0x6a>
case 1:
one();
40054f: b8 00 00 00 00 mov eax,0x0
400554: e8 7b ff ff ff call 4004d4 <one>
break;
400559: eb 23 jmp 40057e <foo+0x6a>
case 2:
two();
40055b: b8 00 00 00 00 mov eax,0x0
400560: e8 7f ff ff ff call 4004e4 <two>
break;
400565: eb 17 jmp 40057e <foo+0x6a>
case 3:
three();
400567: b8 00 00 00 00 mov eax,0x0
40056c: e8 83 ff ff ff call 4004f4 <three>
break;
400571: eb 0b jmp 40057e <foo+0x6a>
case 4:
four();
400573: b8 00 00 00 00 mov eax,0x0
400578: e8 87 ff ff ff call 400504 <four>
break;
40057d: 90 nop
}
}
40057e: c9 leave
40057f: c3 ret
But actually it has done a (may be) simpler optimization. It use 2 as a pivot, after compare if it is 2, it will check if it is greater than 2 or less than 2.
After these analysis, I believed that the runtime of f1 is similar to that of f2, and f2_no_jump_tables would be a little bit slower than them. But when I test it:
$ time ./f1 > /dev/null real 0m0.349s user 0m0.344s sys 0m0.003s $ time ./f2 > /dev/null real 0m0.272s user 0m0.270s sys 0m0.001s $ time ./f2_no_jump_tables > /dev/null real 0m0.287s user 0m0.285s sys 0m0.001sI run it a few times and got similar result. It is quite strange that f1 is significantly slower on my machine, while we already saw that it has similar assembly code in f2. I have to dive deeper and find out the truth (as far as I know) behind this. I will write a part 2 for this.
The more you know, the more you realize you don't know...
