Sunday, June 17, 2012

Assembly analysis (Part 1)

Recently my friend Yan posted something about static function:



And people start discussing about the performance different between this with the normal switch statement. And some saying that complier should be smart enough to optimize that. A simple tests showing that the runtime different seems negotiable.

To see how complier handle that, the best way to do is going to see the assembly. I rewrite the program in C because it is easier to map C code with assembly. (source code : https://github.com/johnlcf/test/downloads)

# gcc -g -o f1 f1.c
# objdump -d -M intel -S f1

Use static function:

0000000000400514 <foo>:

void foo(int i) {
  400514: 55                    push   rbp
  400515: 48 89 e5              mov    rbp,rsp
  400518: 48 83 ec 10           sub    rsp,0x10
  40051c: 89 7d fc              mov    DWORD PTR [rbp-0x4],edi
    static void (*lookup[])() = {zero, one, two, three, four};
    lookup[i]();
  40051f: 8b 45 fc              mov    eax,DWORD PTR [rbp-0x4]
  400522: 48 98                 cdqe   
  400524: 48 8b 14 c5 20 0a 60  mov    rdx,QWORD PTR [rax*8+0x600a20]
  40052b: 00 
  40052c: b8 00 00 00 00        mov    eax,0x0
  400531: ff d2                 call   rdx
}
  400533: c9                    leave  
  400534: c3                    ret 

# gcc -g -o f2 f2.c
# objdump -d -M intel -S f2
Use switch statement:
0000000000400514 <foo>:

void foo(int i) {
  400514: 55                    push   rbp
  400515: 48 89 e5              mov    rbp,rsp
  400518: 48 83 ec 10           sub    rsp,0x10
  40051c: 89 7d fc              mov    DWORD PTR [rbp-0x4],edi
    switch (i) 
  40051f: 83 7d fc 04           cmp    DWORD PTR [rbp-0x4],0x4
  400523: 77 48                 ja     40056d <foo+0x59>
  400525: 8b 45 fc              mov    eax,DWORD PTR [rbp-0x4]
  400528: 48 8b 04 c5 a8 06 40  mov    rax,QWORD PTR [rax*8+0x4006a8]
  40052f: 00 
  400530: ff e0                 jmp    rax
    {
        case 0:
            zero();
  400532: b8 00 00 00 00        mov    eax,0x0
  400537: e8 88 ff ff ff        call   4004c4 <zero>
            break;
  40053c: eb 2f                 jmp    40056d <foo+0x59>
        case 1:
            one();
  40053e: b8 00 00 00 00        mov    eax,0x0
  400543: e8 8c ff ff ff        call   4004d4 <one>
            break;
  400548: eb 23                 jmp    40056d <foo+0x59>
        case 2:
            two();
  40054a: b8 00 00 00 00        mov    eax,0x0
  40054f: e8 90 ff ff ff        call   4004e4 <two>
            break;
  400554: eb 17                 jmp    40056d <foo+0x59>
        case 3:
            three();
  400556: b8 00 00 00 00        mov    eax,0x0
  40055b: e8 94 ff ff ff        call   4004f4 <three>
            break;
  400560: eb 0b                 jmp    40056d <foo+0x59>
        case 4:
            four();
  400562: b8 00 00 00 00        mov    eax,0x0
  400567: e8 98 ff ff ff        call   400504 <four>
            break;
  40056c: 90                    nop
    }
}
  40056d: c9                    leave  
  40056e: c3                    ret    

It is interesting to see that actually gcc optimize the switch statement with a similar approach, which called "jump tables".

// Check  if i > 4, if yes, jump to the end of switch statement
  40051f: 83 7d fc 04           cmp    DWORD PTR [rbp-0x4],0x4
  400523: 77 48                 ja     40056d <foo+0x59>
// rax = i * (pointer size: 8) + (the start address of "jump table: 0x400678)
  400525: 8b 45 fc              mov    eax,DWORD PTR [rbp-0x4]
  400528: 48 8b 04 c5 a8 06 40  mov    rax,QWORD PTR [rax*8+0x4006a8]
  40052f: 00 
// jump to there
  400530: ff e0                 jmp    rax


And the jump table is already initialized with the address of the "cases": 400532, 40053c, 400546, 400550. (not shown here).

Furthermore, I find a gcc option that can tune the behaviour: -fno-jump-tables. Here is the assembly after using that option with switch statement, which do a little bit more compare (cmp) and jump-if-equal (eq).


# gcc -g -fno-jump-tables -o f2_no_jump_tables f2.c
# objdump -d -M intel -S f2

 
0000000000400514 <foo>:

void foo(int i) {
  400514: 55                    push   rbp
  400515: 48 89 e5              mov    rbp,rsp
  400518: 48 83 ec 10           sub    rsp,0x10
  40051c: 89 7d fc              mov    DWORD PTR [rbp-0x4],edi
    switch (i) 
  40051f: 8b 45 fc              mov    eax,DWORD PTR [rbp-0x4]
  400522: 83 f8 02              cmp    eax,0x2
  400525: 74 34                 je     40055b <foo+0x47>
  400527: 83 f8 02              cmp    eax,0x2
  40052a: 7f 0b                 jg     400537 <foo+0x23>
  40052c: 85 c0                 test   eax,eax
  40052e: 74 13                 je     400543 <foo+0x2f>
  400530: 83 f8 01              cmp    eax,0x1
  400533: 74 1a                 je     40054f <foo+0x3b>
  400535: eb 47                 jmp    40057e <foo+0x6a>
  400537: 83 f8 03              cmp    eax,0x3
  40053a: 74 2b                 je     400567 <foo+0x53>
  40053c: 83 f8 04              cmp    eax,0x4
  40053f: 74 32                 je     400573 <foo+0x5f>
  400541: eb 3b                 jmp    40057e <foo+0x6a>
    {
        case 0:
            zero();
  400543: b8 00 00 00 00        mov    eax,0x0
  400548: e8 77 ff ff ff        call   4004c4 <zero>
            break;
  40054d: eb 2f                 jmp    40057e <foo+0x6a>
        case 1:
            one();
  40054f: b8 00 00 00 00        mov    eax,0x0
  400554: e8 7b ff ff ff        call   4004d4 <one>
            break;
  400559: eb 23                 jmp    40057e <foo+0x6a>
        case 2:
            two();
  40055b: b8 00 00 00 00        mov    eax,0x0
  400560: e8 7f ff ff ff        call   4004e4 <two>
            break;
  400565: eb 17                 jmp    40057e <foo+0x6a>
        case 3:
            three();
  400567: b8 00 00 00 00        mov    eax,0x0
  40056c: e8 83 ff ff ff        call   4004f4 <three>
            break;
  400571: eb 0b                 jmp    40057e <foo+0x6a>
        case 4:
            four();
  400573: b8 00 00 00 00        mov    eax,0x0
  400578: e8 87 ff ff ff        call   400504 <four>
            break;
  40057d: 90                    nop
    }
}
  40057e: c9                    leave  
  40057f: c3                    ret   

But actually it has done a (may be) simpler optimization. It use 2 as a pivot, after compare if it is 2, it will check if it is greater than 2 or less than 2.

After these analysis, I believed that the runtime of f1 is similar to that of f2, and f2_no_jump_tables would be a little bit slower than them. But when I test it:


$ time ./f1 > /dev/null 
real 0m0.349s
user 0m0.344s
sys 0m0.003s

$ time ./f2 > /dev/null 
real 0m0.272s
user 0m0.270s
sys 0m0.001s

$ time ./f2_no_jump_tables > /dev/null 
real 0m0.287s
user 0m0.285s
sys 0m0.001s

I run it a few times and got similar result. It is quite strange that f1 is significantly slower on my machine, while we already saw that it has similar assembly code in f2. I have to dive deeper and find out the truth (as far as I know) behind this. I will write a part 2 for this.

The more you know, the more you realize you don't know...

Tuesday, February 1, 2011

How to copy a 5G file to large number of servers in shortest time?

One day my ex-colleague asked me this question. It seems like a common interview question. I don't know the best answer (if exist) but I would like to share my solution and my view point.

First I share some common answer and their problem:

1. A tree-type algorithm:

First round: Copy the file to 2nd server.
2nd round: Copy the file from 1st server to 3rd server AND from 2nd server to 4th server at the same time.
3rd round: 1->5, 2->6, 3->7, 4->8
And so on.

So log2(N) rounds of file transfer can sync the file on N systems. Assume t is the time to transfer 5G file over network, it will take log2(N) * t.

It is nice from algorithm point of view. But in real system, if the 5G file cannot fit in the system cache, it will be re-read from disk for N times on 1st system, which can be slow. (Ref. Numbers Everyone Should Know)

2. "Install BT and let BT do it" solution

It can be a simple answer for a lazy guy and a very complex real environment. But actually BT would do many thing that is unnecessary for this simple task.

My solution: A pipeline streaming solution

Stream the file from 1st to 2nd system (e.g. use nc). And stream the file from 2nd system to 3rd system at the same time and so on.

In theory, it will take N*(initial transfer delay) + t. (see the graph below)
And the biggest benefit is, the part of the file that the system operate on will be most likely in the system cache when it is writing and reading from disk, which will greatly increase the efficiency.

Here is a graph to illustrate. The x-axis is the time. The upper part is the tree-algorithm and the lower part is the streaming solution:


Moreover, if the program can utilize zero-copy (A good reference from IBM: Efficient data transfer through zero copy), it may be even faster.

But one major problem of this solution is that if one node is broken, the transfer to the node after it will be affected.

But frankly I haven't test it in real environment yet. I would like to know how it would perform in real environment.

Thursday, January 13, 2011

pipe, split, rotate

Recently I have to strace to monitor a long-running daemon. strace will generate a lot of output but it doesn't have a built-in way to split the output into files. At first, I have a few idea to solve the problem:

1. "split" command can split the output to files but it has a pre-set limit. For example, default it names the output file with suffix "aa", "ab"...., "zz" and then it will stop. You can increase suffix length but it will eventually stop.

2. Normal "logrotate" method would not work because strace will not accept SIGHUP (like httpd does) to close and open the log file.

3. Use a cronjob to "stop, rotate file and restart strace" - it will lost the trace between the stop and restart of strace.

I then try to write my own program to read the input and write to file and rotate. But after I finished, I search on web and find an existing program which can solve my problem: "rotatelogs". It comes with httpd package.

For example:

# strace -f -t -p | rotatelogs output_log 86400


It will write the output to output_log and rotate it every 86400s (24hrs). You can also specify the size of each output file.

Monday, June 14, 2010

Fedora 12 + SELinux + Firefox 3.6 + Sun Java plugin

Because of a legacy application, I need to install Sun Java plugin in my Fedora 12 system. There is a few tricks worth notice:

1. After Firefox 3.6, it doesn't support the original Java plugin format OJI and only support the standard NPAPI and NPRuntime interfaces that come with Sun Java 6 update 10 or later.

Ref. www.java.com/en/download/faq/firefox_newplugin.xml

You have to remove all libjavaplugin_oji.so link from firefox plugin directories (e.g. /usr/lib/mozilla/plugins, /usr/lib/firefox/plugins, ~/.mozilla/plugins, etc) and create this new link:

# ln -s /usr/java/jdk1.6.0_18/jre/lib/i386/libnpjp2.so libnpjp2.so


And restart firefox to take effect.

2.If you SELinux is disabled, it is all you need. But if SELinux is enabled, you will get this message when you start firefox in shell:

#./firefox
LoadPlugin: failed to initialize shared library /usr/java/jre1.6.0_20/lib/i386/libnpjp2.so [/usr/java/jre1.6.0_20/lib/i386/libnpjp2.so: cannot enable executable stack as shared object requires: Permission denied]

A quick and dirty way is to clear the "executable stack" flag for the libnpjp2.so:

# execstack -c /usr/java/jdk1.6.0_18/jre/lib/i386/libnpjp2.so


(Don't do execstack on your symbolic link or it will copy the real file to replace the symlink.)

But after further research, I found that it may be related to a policy issue of SELinux. And upgrade selinux-policy would solve the problem. Ref.

https://bugzilla.redhat.com/show_bug.cgi?id=533486

Monday, February 1, 2010

網癮戰爭

引自維基百科:
《網癮戰爭》是中國大陸個人組織製作的網路視訊,整部作品以網路遊戲《魔獸世界》為藍本,由玩家遊戲中人物「出演」,視訊畫面《魔獸世界》里的遊戲場景截取編輯而成。電影主要反映出玩家對於遊戲審批和主流媒體不公正報導的不滿,並調侃電擊治網癮的楊永信。

這裏是Youtube上的版本

看完後,我有很沉重的感覺,上次有這個感覺應該是看six four片段時吧……

雖然主題是惡攪,我也沒有玩過魔獸世界,我一直到很後來才聽說到有關它在中國的風風雨雨,不過這套片所描寫的卻是2009年在社會中發生的種種不平事。片中人物的吶喊,就是他們對這個政府不平的制度下的吶喊。雖然沒有流血,沒有死人,但是這件事和六四是相似的地方是:高官們的權力爭鬥,被害的是小市民,所有的反對聲音被打壓,大家只能在被和諧的環境下默默地生活著。

我建議香港人,都看一看這部電影。雖然可能很多地方不明白(網上有解說),但聽聽內地網民的呼聲。

還有,那些在香港抗議這個抗議那個的人,來大陸吧,這裏才是問題的核心,別浪響時間在香港那個傀儡政府上。孫中山當年也是人在香港,志在中國。各位年青的香港人,放眼中國吧,這可能就是香港的“歴史使命”。

說多了。

Wednesday, October 14, 2009

Convert ogg to mp4

Recently I bought a new G-phone (HTC Hero) which is the actually the first smart phone for me (finally...) I can download a lot of interesting program from Android Market to it.
And one thing I want to do is downloading some training video to the phone so that I can see it in the subway. But it seems that there are no OGG-ready video player in Android Market yet (may be I should write one?). So I have to convert ogg video to mp4.
I search for such application on web but I find the most simple way is using command line:
# ffmpeg -s vga -i video.ogg video.mp4
And the converted video can play on my G-phone seamlessly.
ffmpeg is a famous video libraries for Linux which is available in RPM format in DAG repo.

Friday, July 17, 2009

Google reader public page

I like to use google reader to read blog post and I would share some articles I like. If you are interested, you can have a look:

http://www.google.com/reader/shared/00610968397564927697