GCC code optimization

Recently, I came to know about dead code elimination technique used by compilers in-order to remove the code which does not effects the program results. The benefit of removing such a code has two benefits: it shrinks the program size, and it avoids running irrelevant stuffs such as to reduce the run time. Lets see an example in each case:

Example1

#include 
#define _ -F<00||--F-OO--;

int F=00,OO=00;

main() {
        F_OO();
        printf("%1.3f\n",4.*-F/OO/OO);
}

F_OO() {
            _-_-_-_
       _-_-_-_-_-_-_-_-_
    _-_-_-_-_-_-_-_-_-_-_-_
  _-_-_-_-_-_-_-_-_-_-_-_-_-_
 _-_-_-_-_-_-_-_-_-_-_-_-_-_-_
 _-_-_-_-_-_-_-_-_-_-_-_-_-_-_
_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_
_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_
_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_
_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_
 _-_-_-_-_-_-_-_-_-_-_-_-_-_-_
 _-_-_-_-_-_-_-_-_-_-_-_-_-_-_
  _-_-_-_-_-_-_-_-_-_-_-_-_-_
    _-_-_-_-_-_-_-_-_-_-_-_
        _-_-_-_-_-_-_-_
            _-_-_-_
}

When we execute this program we get a value 0.250.

In C/C++, only during the time of pre-processing, the macros are substituted with the corresponding string defined. The code has a macro defined

#define _ -F<00||--F-OO--;

. which checks if ‘-F’ is less than zero. According the OR operator, the second condition will not be checked until or unless if the first condition (i.e the condition on the left side of the OR operator) returns false.

This is how the code looks upon pre-processing:

F_OO() {
            -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;
       -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;
    -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;
  -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;
 -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;
 -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;
-F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;
-F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;
-F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;
-F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;
 -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;
 -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;
  -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;
    -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;
        -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;
            -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;- -F<00||--F-OO--;
}

Lets inspect the corresponding assembly code of the above given program without any optimization flag set is given below:

➜  A#1 (master) gcc circle.c -S 
➜  A#1 (master) less circle.s	
	.file	"circle.c"
	.globl	F
	.bss
	.align 4
	.type	F, @object
	.size	F, 4
F:
	.zero	4
	.globl	OO
	.align 4
	.type	OO, @object
	.size	OO, 4
OO:
	.zero	4
	.section	.rodata
.LC1:
	.string	"%1.3f\n"
	.text
	.globl	main
	.type	main, @function
main:
.LFB0:
	.cfi_startproc
	pushl	%ebp
	.cfi_def_cfa_offset 8
	.cfi_offset 5, -8
	movl	%esp, %ebp
	.cfi_def_cfa_register 5
	andl	$-16, %esp
	subl	$32, %esp
	call	F_OO
	movl	F, %eax
	negl	%eax
	movl	%eax, 28(%esp)
	fildl	28(%esp)
	fldl	.LC0
	fmulp	%st, %st(1)
	movl	OO, %eax
	movl	%eax, 28(%esp)
	fildl	28(%esp)
	fdivrp	%st, %st(1)
	movl	OO, %eax
	movl	%eax, 28(%esp)
	fildl	28(%esp)
	fdivrp	%st, %st(1)
	movl	$.LC1, %eax
	fstpl	4(%esp)
	movl	%eax, (%esp)
	call	printf
	leave
	.cfi_restore 5
	.cfi_def_cfa 4, 4
	ret
	.cfi_endproc
.LFE0:
	.size	main, .-main
	.globl	F_OO
	.type	F_OO, @function
F_OO:
.LFB1:
	.cfi_startproc
	pushl	%ebp
	.cfi_def_cfa_offset 8
	.cfi_offset 5, -8
	movl	%esp, %ebp
	.cfi_def_cfa_register 5
	movl	F, %eax
	negl	%eax
	testl	%eax, %eax
	js	.L607
	movl	F, %eax
	subl	$1, %eax
	movl	%eax, F
	movl	F, %edx
	movl	OO, %eax
	cmpl	%eax, %edx
	setne	%dl
	subl	$1, %eax
	movl	%eax, OO
	testb	%dl, %dl
.L607:
	nop
L5:
	movl	F, %eax
	testl	%eax, %eax
	js	.L608
	movl	F, %eax
	subl	$1, %eax
	movl	%eax, F
	movl	F, %edx
	movl	OO, %eax
	cmpl	%eax, %edx
	setne	%dl
	subl	$1, %eax
	movl	%eax, OO
	testb	%dl, %dl
.L608:
	nop
.
.
.
.L606:
	popl	%ebp
	.cfi_def_cfa 4, 4
	.cfi_restore 5
	ret
	.cfi_endproc
.LFE1:
	.size	F_OO, .-F_OO
	.section	.rodata
	.align 8
.LC0:
	.long	0
	.long	1074790400
	.ident	"GCC: (Ubuntu/Linaro 4.6.4-1ubuntu1~12.04) 4.6.4"
	.section	.note.GNU-stack,"",@progbits

Inside F_OO function, as every ‘_’ is replaced by -F<00||–F-OO–; statement. When the code is compiled with no optimization, as you can see the .L5: block in the above given assembly

L5:
	movl	F, %eax     // loads the value of F into eax
	testl	%eax, %eax  // checks if -F is less than zero
	js	.L608       // if true jumps to .L608 
	movl	F, %eax     // loads F into eax register
	subl	$1, %eax    // decrements the content of eax register by q
	movl	%eax, F     // moves back the value of eax register to F
	movl	F, %edx     // moves the value of F to edx register
	movl	OO, %eax    // moves the value of OO to eax register
	cmpl	%eax, %edx  // compares the values of eax and edx
	setne	%dl         // dl is set to 1 if the cmpl result is zero
	subl	$1, %eax    // decrements the value of eax register by 1
	movl	%eax, OO    // moves the decremented value of eax to OO 
	testb	%dl, %dl    // checks if higher bit is set or not
.L608:
	nop

The right side of the (-F<00||–F-OO–) OR operator is skipped if the first part becomes true.
So these lines will be unused if the first condition becomes true. Such lines are called dead code.

        js	.L608       // if true jumps to .L608 
	movl	F, %eax     // loads F into eax register
	subl	$1, %eax    // decrements the content of eax register by q
	movl	%eax, F     // moves back the value of eax register to F
	movl	F, %edx     // moves the value of F to edx register
	movl	OO, %eax    // moves the value of OO to eax register
	cmpl	%eax, %edx  // compares the values of eax and edx
	setne	%dl         // dl is set to 1 if the cmpl result is zero
	subl	$1, %eax    // decrements the value of eax register by 1
	movl	%eax, OO    // moves the decremented value of eax to OO 
	testb	%dl, %dl    // checks if higher bit is set or not

Now, lets compile the code with highest level of code optimization:

➜  A#1 (master) gcc circle.c -S -O3
➜  A#1 (master) less circle.s
	.file	"circle.c"
	.text
	.p2align 4,,15
	.globl	F_OO
	.type	F_OO, @function
F_OO:
.LFB23:
	.cfi_startproc
	movl	F, %eax   // F is loaded into eax register
	pushl	%esi      // value of esi is pushed into the stack
//The call frame is identified by an address on the stack. We refer to this address as the Canonical Frame Address or CFA. Typically, the CFA is defined to be the 
//value of the stack pointer at the call site in the previous frame (which may be different from its value on entry to the current frame).
	.cfi_def_cfa_offset 8  
	.cfi_offset 6, -8
	pushl	%ebx
	.cfi_def_cfa_offset 12
	.cfi_offset 3, -12
	movl	%eax, %edx  // content of F which is moved to eax is later moved to edx
	negl	%edx        // Sign of the value of edx is inversed
	testl	%edx, %edx  // checks if highest bit is enabled 
	js	.L2         // if not enabled Jumps to .L2 block
	subl	$1, OO      // decrements value of OO by 1
	subl	$1, %eax    // decrements the value of the eax by 1
	movl	%eax, F     // moves the decremented value of eax to F
.L2:
	testl	%eax, %eax  // tests if value of F is negative or not
	js	.L186       // if negative skips the right side of '||' condition
	movl	OO, %edx
	leal	-1(%eax), %ecx
	cmpl	$-1, %ecx
	movl	%ecx, F
	leal	-1(%edx), %ebx
	movl	%ebx, OO
	je	.L201
	leal	-2(%edx), %ecx
	cmpl	$1, %eax
	movl	%ecx, OO
	je	.L202
	subl	$3, %eax
	subl	$3, %edx
	movl	%eax, F
	movl	%edx, OO
.L186:
	movl	%eax, %edx
	negl	%edx
	testl	%edx, %edx
	js	.L5
.
.
.
.
.
.
L202:
.
.
L201:
	.cfi_restore_state
	orl	$-1, %eax
	jmp	.L186
.LFE23:
	.size	F_OO, .-F_OO
	.section	.rodata.str1.1,"aMS",@progbits,1
.LC1:
	.string	"%1.3f\n"
	.section	.text.startup,"ax",@progbits
	.p2align 4,,15
	.globl	main
	.type	main, @function
main:
.LFB22:
	.cfi_startproc
	pushl	%ebp
	.cfi_def_cfa_offset 8
	.cfi_offset 5, -8
	movl	%esp, %ebp
	.cfi_def_cfa_register 5
	andl	$-16, %esp
	subl	$32, %esp
	call	F_OO
	movl	F, %eax
	movl	$.LC1, 4(%esp)
	movl	$1, (%esp)
	negl	%eax
	movl	%eax, 28(%esp)
	fildl	OO
	fildl	28(%esp)
	fmuls	.LC0
	fdiv	%st(1), %st
	fdivp	%st, %st(1)
	fstpl	8(%esp)
	call	__printf_chk
	leave
	.cfi_restore 5
	.cfi_def_cfa 4, 4
	ret
	.cfi_endproc
.LFE22:
	.size	main, .-main
	.globl	OO
	.bss
	.align 4
	.type	OO, @object
	.size	OO, 4
OO:
	.zero	4
	.globl	F
	.align 4
	.type	F, @object
	.size	F, 4
F:
	.zero	4
	.section	.rodata.cst4,"aM",@progbits,4
	.align 4
.LC0:
	.long	1082130432
	.ident	"GCC: (Ubuntu/Linaro 4.6.4-1ubuntu1~12.04) 4.6.4"
	.section	.note.GNU-stack,"",@progbits

As could see the the right side of the OR is written not near the left side of the same in-order to push away the unused the code. This is how the gcc optimizes the code.

Example 2

This is a sample code in which count is not used anywhere else than in the for loop.

#include 
main() {
    int i, count = 0;
    for(i = 0; i < 2000000000; i++)
        count = count + 1;
}
➜  test  gcc test.c -S
➜  test  less test.s
        .file   "test.c"
        .text
        .globl  main
        .type   main, @function
main:
.LFB0:
        .cfi_startproc
        pushl   %ebp
        .cfi_def_cfa_offset 8
        .cfi_offset 5, -8
        movl    %esp, %ebp
        .cfi_def_cfa_register 5
        subl    $16, %esp
        movl    $0, -4(%ebp) // i = 0
        movl    $0, -8(%ebp) // count = 0
        jmp     .L2          // get into the for loop
.L3:
        addl    $1, -4(%ebp) // i++
        addl    $1, -8(%ebp) // count = count + 1
.L2:
        cmpl    $1999999999, -8(%ebp) // i< 1999999999
        jle     .L3                   // if cmpl is true jumps and increments count and i
        leave                         // end of one iteration  
        .cfi_restore 5                       
        .cfi_def_cfa 4, 4
        ret
        .cfi_endproc
.LFE0:
        .size   main, .-main
        .ident  "GCC: (Ubuntu/Linaro 4.6.4-1ubuntu1~12.04) 4.6.4"
        .section        .note.GNU-stack,"",@progbits

Lets use the highest level of optimization in the same code while compiling.

➜  test  gcc test.c -S -O3
➜  test  less test.s
       .file   "test.c"
        .section        .text.startup,"ax",@progbits
        .p2align 4,,15
        .globl  main
        .type   main, @function
main:
.LFB22:
        .cfi_startproc // main starts
        rep
        ret // main ends
        .cfi_endproc
.LFE22:
        .size   main, .-main
        .ident  "GCC: (Ubuntu/Linaro 4.6.4-1ubuntu1~12.04) 4.6.4"
        .section        .note.GNU-stack,"",@progbits

In this case, the compiler removes the dead lines from the compiled code. See you cannot find the declaration of i, count and even the for loop has disappeared.

Lets see what’s the difference between the execution time when we run the code with and without optimization

Without compiler optimization

➜  test  gcc test.c  
➜  test  time ./a.out
./a.out  4.23s user 0.00s system 99% cpu 4.241 total

With compiler optimization

➜  test  gcc test.c -O3
➜  test  time ./a.out 
./a.out  0.00s user 0.00s system 0% cpu 0.002 total

In the first case, for loop is executed hence it took a lot of time to execute when as in the second case, the compiler has optimized and removed the for loop and due to that the execution time is approximately zero.

References:
[1] http://en.wikipedia.org/wiki/International_Obfuscated_C_Code_Contest
[2] http://stackoverflow.com/questions/8841865/how-does-gcc-optimize-c-code

Advertisements

One comment

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s