ARM optimization code
DESCRIPTION
ARM optimization code in C and assembly
TRANSCRIPT
-
, &
Conditional execution examples
C source code:
if (r0 == 0) {
    r1 = r1 + 1;
} else {
    r2 = r2 + 1;
}

ARM instructions, unconditional:
    CMP r0, #0
    BNE else
    ADD r1, r1, #1
    B end
else
    ADD r2, r2, #1
end
    ...

ARM instructions, conditional:
    CMP r0, #0
    ADDEQ r1, r1, #1
    ADDNE r2, r2, #1
    ...
Compile all three cases and measure: instruction count, byte count, cycle count.
-
, &
Loop Interchange
Loop Interchange: change the nesting of loops to access data in the order it is stored in memory.
/* Before */
for (j = 0; j < 100; j = j+1)
    for (i = 0; i < 5000; i = i+1)
        x[i][j] = 2 * x[i][j];
100
-
, &
Loop Interchange
Loop Interchange: change the nesting of loops to access data in the order it is stored in memory.
/* After */
for (i = 0; i < 5000; i = i+1)
    for (j = 0; j < 100; j = j+1)
        x[i][j] = 2 * x[i][j];
cache block
spatial locality
-
, &
Loop Fusion
Loop Fusion: combine 2 independent loops that have the same looping and some overlapping variables.
/* Before */
for (i = 0; i < N; i = i+1)
    for (j = 0; j < N; j = j+1)
        a[i][j] = 1/b[i][j] * c[i][j];
for (i = 0; i < N; i = i+1)
    for (j = 0; j < N; j = j+1)
        d[i][j] = a[i][j] + c[i][j];
-
, &
Loop Fusion
Loop Fusion: combine 2 independent loops that have the same looping and some overlapping variables.
/* After */
for (i = 0; i < N; i = i+1)
    for (j = 0; j < N; j = j+1) {
        a[i][j] = 1/b[i][j] * c[i][j];
        d[i][j] = a[i][j] + c[i][j];
    }
improve spatial locality
-
, &
Reduce costly computations
Before:
for (int i = 0; i < N; i++) {
    z += x/y;
}
After:
double denom = 1/y;
for (int i = 0; i < N; i++) {
    z += x*denom;
}
-
, &
Remove Branch overheads
float *data = ...;
int length = ...;

// Slow version
float total = 0.0f;
int i;
for (i=0; i < length; i++) {
    total += data[i];
}

// Fast version
float total1, total2, total3, total4;
for (i=0; i < length-3; i += 4) {
    total1 += data[i];
    total2 += data[i+1];
    total3 += data[i+2];
    total4 += data[i+3];
}
total += (total1 + total2) +
         (total3 + total4);
-
, &
Eliminate branches
Eliminating branches (if/elses) by using boolean math:
if (x == 0) x = 5;
// becomes:
x += (x == 0) * 5;
// if '5' were a power of 2, let's say 4:
x += (x == 0) << 2;
-
, &
Using array indices
switch ( queue ) {
    case 0 : letter = 'W'; break;
    case 1 : letter = 'S'; break;
    case 2 : letter = 'U'; break;
}
if ( queue == 0 ) letter = 'W'; else if ( queue == 1 ) letter = 'S'; else letter = 'U';
static char *classes="WSU"; letter = classes[queue];
-
, &
Global Variables
Global variables are never allocated to registers.
int f(void);
int g(void);
int errs;
void test1(void) { errs += f(); errs += g(); }
void test2(void) { int localerrs = errs; localerrs += f(); localerrs += g(); errs = localerrs; }
-
, &
Local variables
Avoid char and short as local variables. Avoid sign-extending for signed variables and zero-extending for unsigned variables.
int wordinc (int a) { return a + 1; }
char charinc (char a) { return a + 1; }
The results will be identical, but the first code segment will run faster than the other.
-
, &
Loop termination
Use count-down-to-zero loops and use simple termination conditions.
int fact1_func (int n) { int i, fact = 1; for (i = 1; i ... [the rest of this example was truncated in the transcript, apparently at the '<' character]
-
, &
More unrolling
//Example - 1
int countbit1(uint n) {
    int bits = 0;
    while (n != 0) {
        if (n & 1) bits++;
        n >>= 1;
    }
    return bits;
}
//Example - 2
int countbit2(uint n) {
    int bits = 0;
    while (n != 0) {
        if (n & 1) bits++;
        if (n & 2) bits++;
        if (n & 4) bits++;
        if (n & 8) bits++;
        n >>= 4;
    }
    return bits;
}