亲密数 - ARM汇编语言编程

These are notes taken while studying ARM assembler in Raspberry Pi. Most of the content is copied from there so that I remember how far I have read. You should read the original text by the author.

Reference

ARM assembler in Raspberry Pi

01 - first arm assembly program

first.s

.global main                /* export main so that the startup code can call it */

main: 
    mov r0, #3 /* r0 is a register, #3 is an immediate value; r0 carries the value main returns */
    bx lr /* branch and exchange to the address held in the link register, i.e. return to the caller */

02 - Registers and basic arithmetic

sum01.s

/*
    sum01.s - registers and basic arithmetic

    The Raspberry Pi has 16 integer registers, r0 - r15, each 32 bits wide,
    and 32 floating point registers.

    This program adds r1 and r2 and puts the result in r0.
    (sum02.s is the variant that adds r0 and r1 and puts the result in r0.)
*/

.global main

main:
    mov r2, #4              /* r2 <- 4, second addend */
    mov r1, #3              /* r1 <- 3, first addend */
    add r0, r2, r1          /* r0 <- r2 + r1 = 7 */
    bx lr                   /* return through the link register */

sum02.s


/*
    sum02.s - registers and basic arithmetic

    The Raspberry Pi has 16 integer registers, r0 - r15, each 32 bits wide,
    and 32 floating point registers.

    This program adds r0 and r1 and puts the result in r0, so the
    destination register is also one of the two sources.
    (sum01.s is the variant that adds r1 and r2 and puts the result in r0.)
*/

.global main

main:
    mov r1, #5              /* r1 <- 5 */
    mov r0, #4              /* r0 <- 4 */
    add r0, r1, r0          /* r0 <- r1 + r0 = 9 */
    bx lr                   /* return through the link register */

03 - Memory, addresses. Load and store.

load01.s

/*
    load01.s - Memory, addresses. Load and store.
    (store01.s is the companion program that also writes to memory.)

    Difference between x86 and ARM (Advanced RISC Machines): on ARM,
    memory is only reached through load and store instructions.

    ldr: load from memory into a register
    str: store a register into memory

    Labels stand for addresses, both of code and of data.

    .balign N   pad so that the next item starts at a multiple of N bytes
    .word       emit one 32-bit value

    Sections: .text holds code, .data holds data.
*/

.data

.balign 4
var1:
    .word 3                 /* first operand */

.balign 4
var2:
    .word 6                 /* second operand */

.text
.balign 4
.global main
main:
    ldr r2, var2_addr       /* r2 <- &var2 */
    ldr r2, [r2]            /* r2 <- *r2 = 6 */

    ldr r1, var1_addr       /* r1 <- &var1 */
    ldr r1, [r1]            /* r1 <- *r1 = 3 */

    add r0, r2, r1          /* r0 <- 6 + 3 */
    bx lr                   /* return to the caller */


/* Words placed in .text that hold the addresses of the variables in .data */
var1_addr:
    .word var1
var2_addr:
    .word var2

store01.s

/*
    store01.s - Memory, addresses. Load and store.
    (load01.s is the companion program that only reads memory.)

    Difference between x86 and ARM (Advanced RISC Machines): on ARM,
    memory is only reached through load and store instructions.

    ldr: load from memory into a register
    str: store a register into memory

    Labels stand for addresses, both of code and of data.

    .balign N   pad so that the next item starts at a multiple of N bytes
    .word       emit one 32-bit value

    Sections: .text holds code, .data holds data.

    The program stores 6 into var1 and 9 into var2, reads both back
    and returns their sum in r0.
*/

.data

.balign 4
var1:
    .word 0                 /* overwritten with 6 at run time */

.balign 4
var2:
    .word 0                 /* overwritten with 9 at run time */

.text
.balign 4
.global main
main:
    /* var1 <- 6 */
    mov r3, #6
    ldr r1, var1_addr       /* r1 <- &var1 */
    str r3, [r1]            /* *r1 <- r3 */

    /* var2 <- 9 */
    mov r3, #9
    ldr r1, var2_addr       /* r1 <- &var2 */
    str r3, [r1]            /* *r1 <- r3 */

    /* read both values back from memory */
    ldr r1, var1_addr
    ldr r1, [r1]            /* r1 <- var1 */

    ldr r2, var2_addr
    ldr r2, [r2]            /* r2 <- var2 */

    add r0, r1, r2          /* r0 <- 6 + 9 */
    bx lr                   /* return to the caller */


/* Words placed in .text that hold the addresses of the variables in .data */
var1_addr:
    .word var1
var2_addr:
    .word var2

04 - GDB

# get help 
help 

# get help of a command 
help disassemble 

help breakpoints 

start

stepi 

info registers 

# continue to run to the next breakpoint or the end 
continue 

# print the addr of a 
print &a 

# print the value of a with casting 
print (int)a 

# examine 10 decimal values starting from the address &a 
x/10d &a

disassemble 

# disassemble instructions from 0x000103d0 to the addr with offset +40 bytes 
disassemble 0x000103d0,+40 

# set breakpoints at the address 
break *0x000103f0

# query the current breakpoints 
i b 

# delete the 3rd breakpoint 
delete 3

05 - Branches

branch01.s

/*
    branch01.s - unconditional branch

    r0 - r15
    r13: sp, stack pointer
    r14: lr, link register
    r15: pc, program counter (IP, instruction pointer)

    b:    branch
    cmp:  compare
    cpsr: current program status register, holding the flags
          N (negative), Z (zero), C (carry) and V (overflow)
*/

.text
.global main
main:
    mov r0, #2              /* r0 <- 2 */
    b finish                /* jump over the next instruction */
    mov r0, #3              /* skipped, so r0 stays 2 */
finish:
    bx lr                   /* return to the caller */

compare00.s

/*
demonstrate how subtract operation affects the Carry bit of cpsr 

cmp r1, r2 computes r1 - r2 only to update the N, Z, C and V flags; the
difference itself is discarded. Stop after the cmp in gdb and look at cpsr
with "info registers" to see the flags.
*/
.text 
.global main 
main: 
    mov r1, #2          /* r1 <- 2 */
    mov r2, #1          /* r2 <- 1 */
    cmp r1, r2          /* flags <- r1 - r2 = 1: no borrow is needed, so C is set */
    mov r0, #0          /* r0 <- 0, the value main returns; mov leaves the flags alone */
    bx lr               /* return to the caller */

compare01.s

/*
    compare01.s - conditional branch after a comparison

    Compares r1 and r2 and returns 5 in r0 when they are equal,
    6 when they differ.
*/
.text
.global main
main:
    mov r1, #3
    mov r2, #2
    cmp r1, r2              /* update the flags with r1 - r2 */
    bne differ              /* Z clear: the values are not equal */
same:
    mov r0, #5              /* r1 == r2 */
    b done
differ:
    mov r0, #6              /* r1 != r2 */
done:
    bx lr                   /* return to the caller */

cpsr_decode.py

cpsr = 0x20000010
#cpsr is an integer 
def cpsr_decode(cpsr):
    """Print whether each of the N, Z, C and V condition flags is set in cpsr.

    cpsr is the integer value of the current program status register.
    One line is printed per flag, in the order n_bit, z_bit, c_bit, v_bit.
    """
    # (flag name, mask of the flag bit), from bit 31 down to bit 28
    flag_masks = (
        ("n_bit", 1 << 31),  # negative
        ("z_bit", 1 << 30),  # zero
        ("c_bit", 1 << 29),  # carry
        ("v_bit", 1 << 28),  # overflow
    )

    for flag, mask in flag_masks:
        state = "is set" if (cpsr & mask) != 0 else "is not set"
        print(flag, state)

cpsr_decode(cpsr)

06 - Control structures

if01.s

.text 
.global main 
/*
    if01.s - check if the number in r1 is odd or even.
    main returns 2 in r0 when r1 is even and 1 when r1 is odd.
*/
main:
    ldr r1, =2022   /* r1 <- 2022. 2022 (0x7E6) is not an 8-bit value rotated
                       by an even amount, so it is not a valid mov immediate
                       on every ARM core; with the ldr = pseudo-instruction
                       the assembler chooses a suitable encoding or loads the
                       constant from a literal pool */

if:
    tst r1, #1      /* flags <- r1 AND 1 (cpsr: current program status register) */
    bne else        /* result not zero: bit 0 is set, r1 is odd */
then: /* r1 is even */
    mov r0, #2
    b end_if 
else: /* r1 is odd */ 
    mov r0, #1
end_if:
    bx lr           /* return to the caller */

loop01.s

.text
.global main
/*
    loop01.s - calculate the sum 1..22 and return it in r0.
    The loop condition is tested at the bottom; the first pass enters
    through the test, so the body only runs while the counter is <= 22.
*/
main:
    mov r1, #0              /* r1: running sum */
    mov r2, #1              /* r2: counter */
    b check
accumulate:
    add r1, r1, r2          /* sum <- sum + counter */
    add r2, r2, #1          /* counter <- counter + 1 */
check:
    cmp r2, #22             /* cpsr updated with counter - 22 */
    ble accumulate          /* keep going while counter <= 22 */

    mov r0, r1              /* return the sum */
    bx lr

07 - Indexing modes

shift01.s

/*
ARM assembler in Raspberry Pi

07 Indexing modes: 
Allowed operands in instructions are collectively called indexing modes 


shifted operand 

operator/operand 

ldr, str, bxx 

mov

add, sub, rsb, cmp, and, tst

register/immediate values 

Syntax of most of the ARM instructions:
instruction Rdest, Rsource1, source2

source2 is either a register or an immediate value

When source2 is a register, we can combine it with a shift operation. 

LSL #n
LSL Rsource3 

LSR #n
LSR Rsource3 

ASR #n
ASR Rsource3 

ROR #n 
ROR Rsource3 

#n can be 1..31. shift is an operation instead of an instruction. shift operation can be used to perform multiplication and division. 
*/

.text 
.global main 
main: 
    mov r0,#2                   /* r0 <- 2 */
    /*add r0, r0, r0, lsl #1*/  /* disabled alternative: r0 <- r0 + (r0 << 1) */
    mov r0,r0,lsl #2            /* r0 <- r0 << 2, i.e. 2 * 4 = 8 */
    bx lr                       /* return; r0 holds the result */

08-数组和结构体

以下内容为学习https://thinkingeek.com/2013/01/27/arm-assembler-raspberry-pi-chapter-8/的摘录.

索引模式的定义

These sets of allowed operands in instructions are collectively called indexing modes.

什么是数组

An array is a sequence of items of the same kind in memory.

int a[100];   /* an array: 100 items of the same kind (int), one after another in memory */

什么是结构体

A structure (or record or tuple) is a sequence of items of possibly different kind.

/* A structure whose two fields are of different kinds; b is a variable of this type. */
struct my_struct
{
  char f0;   /* first field, a char */
  int f1;    /* second field, an int */
} b;

Naive approach without indexing modes

array01.s, a program to set initial values 0..9 to a 10 elements array

/* array01.s - store the values 0..9 into a 10 element array of words,
   computing the address of every element by hand. */
.data
.balign 4
a: .skip 40                     /* 10 elements of 4 bytes each */

.text
.global main
main:
    ldr r0, a_addr              /* r0 <- &a */
    mov r1, #0                  /* r1 <- i = 0 */

fill:
    cmp r1, #10                 /* all 10 elements written? */
    beq fill_done
    add r2, r0, r1, lsl #2      /* r2 <- r0 + 4*i, the address of a[i] */
    str r1, [r2]                /* a[i] <- i */
    add r1, r1, #1              /* i <- i + 1 */
    b fill
fill_done:
    bx lr                       /* return to the caller */
a_addr: .word a

Indexing modes

Non updating indexing modes

immediate value. The immediate cannot be larger than 12 bits (0..4095)

[Rsource1, #+immediate] 
[Rsource1, #-immediate]

mov r2, #3
str r2, [r1, #+12]

[Rsource1, +Rsource2] 
[Rsource1, -Rsource2]

mov r2, #3         /* r2 ← 3 */
mov r3, #12        /* r3 ← 12 */
str r2, [r1,+r3]   /* *(r1 + r3) ← r2 */

[Rsource1, +Rsource2, shift_operation #immediate]
[Rsource1, -Rsource2, shift_operation #immediate]

str r2, [r1, +r2, LSL #2]

Updating indexing modes

In these indexing modes the Rsource1 register is updated with the address synthesized by the load or store instruction.

Post-indexing modes

The value of Rsource1 is used as the address for the load or store. Then Rsource1 is updated with the value of immediate after adding (or subtracting) it to Rsource1.

[Rsource1], #+immediate
[Rsource1], #-immediate

str r2, [r1], #4        /* *r1 ← r2 then r1 ← r1 + 4 */

[Rsource1], +Rsource2
[Rsource1], -Rsource2

[Rsource1], +Rsource2, shift_operation #immediate 
[Rsource1], -Rsource2, shift_operation #immediate

Pre-indexing modes

immediate

[Rsource1, #+immediate]! 
[Rsource1, #-immediate]!

ldr r2, [r1, #+12]!  /* r1 ← r1 + 12 then r2 ← *r1 */

[Rsource1, +Rsource2]! 
[Rsource1, -Rsource2]!

[Rsource1, +Rsource2, shift_operation #immediate]! 
[Rsource1, -Rsource2, shift_operation #immediate]!

approach using indexing modes

array02.s, a program to set initial values 0..9 to a 10 elements array, using indexing modes

/* array02.s - store the values 0..9 into a 10 element array of words,
   letting the indexing mode of str compute each element address. */
.data
.balign 4
a: .skip 40                     /* 10 elements of 4 bytes each */

.text
.global main
main:
    ldr r0, a_addr              /* r0 <- &a */
    mov r1, #0                  /* r1 <- i = 0 */

fill:
    cmp r1, #10                 /* all 10 elements written? */
    beq fill_done
    /* replaces the separate "add r2, r0, r1, lsl #2" used in array01.s */
    str r1, [r0, r1, lsl #2]    /* *(r0 + 4*i) <- i */
    add r1, r1, #1              /* i <- i + 1 */
    b fill
fill_done:
    bx lr                       /* return to the caller */
a_addr: .word a

结构体

略

09-函数

以下内容为学习https://thinkingeek.com/2013/02/02/arm-assembler-raspberry-pi-chapter-9/的摘录.

Functions are a way to reuse code.

AAPCS:Procedure Call Standard for ARM Architecture

Register	Register Alias	Description
r15	pc	Program Counter
r14	lr	Link Register
r13	sp	Stack Pointer
	cpsr	Current Program Status Register

Functions can receive parameters. The first 4 parameters must be stored, sequentially, in the registers r0, r1, r2 and r3.

Well behaved functions

A function must adhere, at least, to the following rules if we want it to be AAPCS compliant.

A function should not make any assumption on the contents of the cpsr. So, at the entry of a function condition codes N, Z, C and V are unknown.
A function can freely modify registers r0, r1, r2 and r3.
A function cannot assume anything on the contents of r0, r1, r2 and r3 unless they are playing the role of a parameter.
A function can freely modify lr but the value upon entering the function will be needed when leaving the function (so such value must be kept somewhere).
A function can modify all the remaining registers as long as their values are restored upon leaving the function. This includes sp and registers r4 to r11.
This means that, after calling a function, we have to assume that (only) registers r0, r1, r2, r3 and lr have been overwritten.

Calling a function

direct call:

bl label

indirect call(first storing the address of the function into a register):

blx Rsource1 /* Rsource1 means register operand1 */

In both cases the behaviour is as follows: the address of the function (immediately encoded in the bl or using the value of the register in blx) is stored in pc. The address of the instruction following the bl or blx instruction is kept in lr.

Leaving a function

A well behaved function, as stated above, will have to keep the initial value of lr somewhere. When leaving the function, we will retrieve that value and put it in some register (it can be lr again but this is not mandatory). Then we will bx Rsource1 (we could use blx as well but the latter would update lr which is useless here).

Returning data from functions

Functions must use r0 for data that fits in 32 bit (or less). This is, C types char, short, int, long (and float though we have not seen floating point yet) will be returned in r0. For basic types of 64 bit, like C types long long and double, they will be returned in r1 and r0. Any other data is returned through the stack unless it is 32 bit or less, where it will be returned in r0.

Hello world (puts)

/* -- hello01.s */
/* Prints "Hello world" by calling puts.
   bl overwrites lr, so main first saves its own lr in the global word
   `return` and restores it from there before returning. */
.data

greeting:
 .asciz "Hello world"

.balign 4
return: .word 0

.text

.global main
main:
    ldr r1, address_of_return     /*   r1 ← &return */
    str lr, [r1]                  /*   *r1 ← lr */

    ldr r0, address_of_greeting   /* r0 ← &greeting */
                                  /* First parameter of puts */

    bl puts                       /* Call to puts */
                                  /* lr ← address of next instruction */

    ldr r1, address_of_return     /* r1 ← &return */
    ldr lr, [r1]                  /* lr ← *r1, the lr main was entered with */
    bx lr                         /* return from main */
address_of_greeting: .word greeting
address_of_return: .word return

/* External */
.global puts

Interaction (scanf, printf)

/* -- printf01.s */
/* Prompts for a number with printf, reads it with scanf, prints it back
   with printf and returns the number read in r0.
   lr is kept in the global word `return` across the calls. */
.data

/* First message */
.balign 4
message1: .asciz "Hey, type a number: "

/* Second message */
.balign 4
message2: .asciz "I read the number %d\n"

/* Format pattern for scanf */
.balign 4
scan_pattern : .asciz "%d"

/* Where scanf will store the number read */
.balign 4
number_read: .word 0

/* Where main keeps its lr while it calls other functions */
.balign 4
return: .word 0

.text

.global main
main:
    ldr r1, address_of_return        /* r1 ← &return */
    str lr, [r1]                     /* *r1 ← lr */

    ldr r0, address_of_message1      /* r0 ← &message1 */
    bl printf                        /* call to printf */

    ldr r0, address_of_scan_pattern  /* r0 ← &scan_pattern */
    ldr r1, address_of_number_read   /* r1 ← &number_read */
    bl scanf                         /* call to scanf */

    ldr r0, address_of_message2      /* r0 ← &message2 */
    ldr r1, address_of_number_read   /* r1 ← &number_read */
    ldr r1, [r1]                     /* r1 ← *r1 */
    bl printf                        /* call to printf */

    ldr r0, address_of_number_read   /* r0 ← &number_read */
    ldr r0, [r0]                     /* r0 ← *r0, the value main returns */

    ldr lr, address_of_return        /* lr ← &return */
    ldr lr, [lr]                     /* lr ← *lr */
    bx lr                            /* return from main using lr */
address_of_message1 : .word message1
address_of_message2 : .word message2
address_of_scan_pattern : .word scan_pattern
address_of_number_read : .word number_read
address_of_return : .word return

/* External */
.global printf
.global scanf

First function (multiply by 5)

/* -- printf02.s */
/* Reads a number with scanf, multiplies it by 5 in the function mult_by_5
   and prints both the number and the product with printf. */
.data

/* First message */
.balign 4
message1: .asciz "Hey, type a number: "

/* Second message */
.balign 4
message2: .asciz "%d times 5 is %d\n"

/* Format pattern for scanf */
.balign 4
scan_pattern : .asciz "%d"

/* Where scanf will store the number read */
.balign 4
number_read: .word 0

/* Where main keeps its lr */
.balign 4
return: .word 0

/* Where mult_by_5 keeps its lr */
.balign 4
return2: .word 0

.text

/*
mult_by_5 function
  In:  r0 = number to multiply
  Out: r0 = 5 * number
  Overwrites r1. lr is kept in the single global word return2, so only
  one activation of the function can be in progress at a time.
*/
mult_by_5: 
    ldr r1, address_of_return2       /* r1 ← &return2 */
    str lr, [r1]                     /* *r1 ← lr */

    add r0, r0, r0, LSL #2           /* r0 ← r0 + 4*r0 */

    ldr lr, address_of_return2       /* lr ← &return2 */
    ldr lr, [lr]                     /* lr ← *lr */
    bx lr                            /* return from mult_by_5 using lr */
address_of_return2 : .word return2

.global main
main:
    ldr r1, address_of_return        /* r1 ← &return */
    str lr, [r1]                     /* *r1 ← lr */

    ldr r0, address_of_message1      /* r0 ← &message1 */
    bl printf                        /* call to printf */

    ldr r0, address_of_scan_pattern  /* r0 ← &scan_pattern */
    ldr r1, address_of_number_read   /* r1 ← &number_read */
    bl scanf                         /* call to scanf */

    ldr r0, address_of_number_read   /* r0 ← &number_read */
    ldr r0, [r0]                     /* r0 ← *r0 */
    bl mult_by_5                     /* r0 ← 5 * r0 */

    mov r2, r0                       /* r2 ← r0, third parameter of printf */
    ldr r1, address_of_number_read   /* r1 ← &number_read */
    ldr r1, [r1]                     /* r1 ← *r1, second parameter of printf */
    ldr r0, address_of_message2      /* r0 ← &message2 */
    bl printf                        /* call to printf */

    ldr lr, address_of_return        /* lr ← &return */
    ldr lr, [lr]                     /* lr ← *lr */
    bx lr                            /* return from main using lr */
address_of_message1 : .word message1
address_of_message2 : .word message2
address_of_scan_pattern : .word scan_pattern
address_of_number_read : .word number_read
address_of_return : .word return

/* External */
.global printf
.global scanf

10 Functions. The stack

// stm -- store multiple, i -- increase, a -- after, d -- decrease, b -- before 
stmia, stmib, stmda, stmdb, ldmia, ldmib, ldmda, ldmdb 

// push is equivalent to stmdb, pop is equivalent to ldmia when operating on sp 
push, pop

// multiplication instruction 
mul Rdest, Rsource1, Rsource2.

Dynamic activation

One of the benefits of functions is being able to call them more than once. A function may call itself.

A function is dynamically activated each time it is called. The span of a dynamic activation goes from the point where the function is called until it returns. At a given time, more than one function is dynamically activated. The whole dynamic activation set of functions includes the current function and the dynamic activation set of the function that called it (the current function).

// Example of a recursive function: factorial calls itself.
// Returns n! for n >= 0. The recursion only stops at n == 0, so a
// negative n never reaches the base case.
int factorial(int n)
{
   if (n == 0)
      return 1;                    // base case: 0! = 1
   else
      return n * factorial(n-1);   // recursive case: n! = n * (n-1)!
}

Well behaved functions(Refer to Chap9), a brief recall:

Only r0, r1, r2 and r3 can be freely modified.
lr(r14) value at the entry of the function must be kept somewhere because we will need it to leave the function (to return to the caller).
All other registers r4 to r11 and sp(r13) can be modified but they must be restored to their original values upon leaving the function.

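Note: the pc register is r15. TBD: What is the role of r12, and why is it not mentioned here? (In the AAPCS, r12 is ip, the intra-procedure-call scratch register; like r0 to r3 it is not preserved across a call.)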

we need some way to keep at least the value of lr per each dynamic activation. And not only lr, if we wanted to use registers from r4 to r11 we also need to keep somehow per each dynamic activation, a global variable would not be enough either. This is where the stack comes into play.

The stack

In computing, a stack is a data structure (a way to organize data that provides some interesting properties). A stack typically has three operations: access the top of the stack, push onto the top, pop from the top. Depending on the context you can only access the top of the stack, in our case we will be able to access more elements than just the top.

But, what is the stack? I already said in chapter 9 that the stack is a region of memory owned solely by the function. We can now reword this a bit better: the stack is a region of memory owned solely by the current dynamic activation. register sp stands for stack pointer. This register will contain the top of the stack. The region of memory owned by the dynamic activation is the extent of bytes contained between the current value of sp and the initial value that sp had at the beginning of the function. We will call that region the local memory of a function (more precisely, of a dynamic activation of it). We will put there whatever has to be saved at the beginning of a function and restored before leaving. We will also keep there the local variables of a function (dynamic activation).

Our function also has to adhere to some rules when handling the stack.

The stack pointer (sp) is always 4 byte aligned. This is absolutely mandatory. However, due to the Procedure Call Standard for the ARM architecture (AAPCS), the stack pointer will have to be 8 byte aligned, otherwise funny things may happen when we call what the AAPCS calls public interfaces (that is, code written by other people). (Note: this means we can stay with 4 byte alignment if the function we are writing is only called by ourselves and we do not call functions written by other people; otherwise, we should keep the stack 8 byte aligned.)
The value of sp when leaving the function should be the same value it had upon entering the function.

It is a convention how the stack, and thus the local memory, has its size defined. The stack can grow upwards or downwards. If it grows upwards it means that we have to increase the value of the sp register in order to enlarge the local memory. If it grows downwards we have to do the opposite, the value of the sp register must be subtracted as many bytes as the size of the local storage. In Linux ARM, the stack grows downwards, towards zero (although it never should reach zero). Addresses of local variables have very large values in the 32 bit range. They are usually close to \(2^{32}\).

Another convention when using the stack concerns whether the sp register contains the address of the top of the stack or some bytes above. In Linux ARM the sp register directly points to the top of the stack: in the memory addressed by sp there is useful information.

Example snippets of keep lr register and restore lr register. (Note: We can use add and sub to operate sp, we may also use push and pop or stmxx and ldmxx instructions.)

sub sp, sp, #8  /* sp ← sp - 8. This enlarges the stack by 8 bytes */
str lr, [sp]    /* *sp ← lr */
... // Code of the function
ldr lr, [sp]    /* lr ← *sp */
add sp, sp, #8  /* sp ← sp + 8. This reduces the stack by 8 bytes,
                   effectively restoring the stack
                   pointer to its original value */
bx lr           /* return to the caller */

Shortened snippets of keep lr and restore lr, making use of indexing modes.

str lr, [sp, #-8]!  /* preindex: sp ← sp - 8; *sp ← lr */
... // Code of the function
ldr lr, [sp], #+8   /* postindex; lr ← *sp; sp ← sp + 8 */
bx lr

First approach to implement the factorial function

It uses the mul Rdest, Rsource1, Rsource2 instruction.

This instruction only computes the lower 32 bits. Because we are not going to use 64 bit values in this example, the maximum factorial we will be able to compute is 12! (13! is bigger than \(2^{32}\)).

This approach directly operates on the sp register.

/* -- factorial01.s */
.data

message1: .asciz "Type a number: "              /* prompt that main passes to printf */
format:   .asciz "%d"                           /* pattern that main passes to scanf */
message2: .asciz "The factorial of %d is %d\n"  /* result line that main passes to printf */

.text

/*
    factorial - recursive factorial
      In:  r0 = n
      Out: r0 = n! (lower 32 bits of the product)
    lr and the incoming n are kept on the stack for the whole activation.
*/
factorial:
    str lr, [sp,#-4]!       /* push lr */
    str r0, [sp,#-4]!       /* push n; two words pushed, so sp is 8 byte aligned again */

    cmp r0, #0              /* n == 0 ? */
    beq base_case

                            /* recursive case: n * factorial(n-1) */
    sub r0, r0, #1          /* r0 <- n - 1 */
    bl factorial            /* r0 <- factorial(n-1) */
    ldr r1, [sp]            /* r1 <- n, reloaded from the top of the stack */
    mul r0, r0, r1          /* r0 <- factorial(n-1) * n */
    b leave

base_case:
    mov r0, #1              /* factorial(0) is 1 */

leave:
    add sp, sp, #+4         /* drop the saved n */
    ldr lr, [sp], #+4       /* pop lr */
    bx lr                   /* return to the caller */

/*
    main - ask for a number, compute its factorial and print the result.
    The number read by scanf lives in a 4 byte slot on the stack.
*/
.global main
main:
    str lr, [sp,#-4]!            /* push lr */
    sub sp, sp, #4               /* reserve 4 bytes for the number typed by the user; */
                                 /* together with lr the stack stays 8 byte aligned */

    ldr r0, message1_addr        /* printf(message1) */
    bl printf

    mov r1, sp                   /* second parameter: address of the slot on the stack */
    ldr r0, format_addr          /* first parameter: &format */
    bl scanf                     /* scanf(format, sp) */

    ldr r0, [sp]                 /* first parameter: the number read by scanf */
    bl factorial                 /* r0 <- factorial(number) */

    mov r2, r0                   /* third parameter: the factorial */
    ldr r0, message2_addr        /* first parameter: &message2 */
    ldr r1, [sp]                 /* second parameter: the number read by scanf */
    bl printf                    /* printf(message2, number, factorial) */


    add sp, sp, #+4              /* release the slot of the number */
    ldr lr, [sp], #+4            /* pop lr */
    bx lr                        /* leave main */

message1_addr: .word message1
message2_addr: .word message2
format_addr: .word format

Do it better using `ldm` and `stm`

Note that the number of instructions that we need to push and pop data to and from the stack grows linearly with respect to the number of data items. Since ARM was designed for embedded systems, ARM designers devised a way to reduce the number of instructions we need for the «bookkeeping» of the stack. These instructions are load multiple, ldm, and store multiple, stm.

These two instructions are rather powerful and allow in a single instruction perform a lot of things. Their syntax is shown as follows. Elements enclosed in curly braces { and } may be omitted from the syntax (the effect of the instruction will vary, though).

ldm addressing-mode Rbase{!}, register-set
stm addressing-mode Rbase{!}, register-set

We will consider addressing-mode later. Rbase is the base address used to load to or store from the register-set. All 16 ARM registers may be specified in register-set (except pc in stm). A set of addresses is generated when executing these instructions. One address per register in the register-set. Then, each register, in ascending order, is paired with each of these addresses, also in ascending order. This way the lowest-numbered register gets the lowest memory address, and the highest-numbered register gets the highest memory address. Each pair register-address is then used to perform the memory operation: load or store. Specifying ! means that Rbase will be updated. The updated value depends on addressing-mode.

Note that, if the registers are paired with addresses depending on their register number, it seems that they will always be loaded and stored in the same way. For instance a register-set containing r4, r5 and r6 will always store r4 in the lowest address generated by the instruction and r6 in the highest one(Note: Since it is a set, the order does not matter. r4, r5, r6 and r4, r6, r5 mean the same thing in a set). We can, though, specify what is considered the lowest address or the highest address. So, is Rbase actually the highest address or the lowest address of the multiple load/store? This is one of the two aspects that is controlled by addressing-mode. The second aspect relates to when the address of the memory operation changes between each memory operation.

If the value in Rbase is to be considered the highest address it means that we should first decrease Rbase as many bytes as required by the number of registers in the register-set (this is 4 times the number of registers) to form the lowest address. Then we can load or store each register consecutively starting from that lowest address, always in ascending order of the register number. This addressing mode is called decreasing and is specified using a d. Conversely, if Rbase is to be considered the lowest address, then this is a bit easier as we can use its value as the lowest address already. We proceed as usual, loading or storing each register in ascending order of their register number. This addressing mode is called increasing and is specified using an i.

At each load or store, the address generated for the memory operation may be updated after or before the memory operation itself. We can specify this using a or b, respectively.

If we specify !, after the instruction, Rbase will have the highest address generated in the increasing mode and the lowest address generated in the decreasing mode. The final value of Rbase will include the final addition or subtraction if we use a mode that updates after (an a mode).

So we have four addressing modes, namely: ia, ib, da and db. These addressing modes are specified as suffixes of the stm and ldm instructions. So the full set of names is stmia, stmib, stmda, stmdb, ldmia, ldmib, ldmda, ldmdb. Now you may think that this is overly complicated, but we need not use all the eight modes. Only two of them are of interest to us now.

When we push something onto the stack we actually decrease the stack pointer (because in Linux the stack grows downwards). More precisely, we first decrease the stack pointer as many bytes as needed before doing the actual store on that just computed stack pointer. So the appropriate addressing-mode when pushing onto the stack is stmdb. Conversely when popping from the stack we will use ldmia: we increment the stack pointer after we have performed the load.

Factorial again using `stm or push` and `ldm or pop`

Before illustrating these two instructions, we will first slightly rewrite our factorial.

If you go back to the code of our factorial, there is a moment, when computing n * factorial(n-1), where the initial value of r0 is required. The value of n was in r0 at the beginning of the function, but r0 can be freely modified by called functions.

In our second version of factorial, we will keep a copy of the initial value of r0 into r4. But r4 is a register the value of which must be restored upon leaving a function. So we will keep the value of r4 at the entry of the function in the stack. At the end we will restore it back from the stack. This way we can use r4 without breaking the rules of well-behaved functions.

/*
    factorial - recursive factorial, keeping n in r4
      In:  r0 = n
      Out: r0 = n! (lower 32 bits of the product)
    r4 must be preserved for the caller, so it is saved on the stack
    together with lr and restored before returning.
*/
factorial:
    str lr, [sp,#-4]!       /* push lr */
    str r4, [sp,#-4]!       /* push r4; two words pushed, so sp is 8 byte aligned */
    mov r4, r0              /* r4 <- n, it survives the recursive call */

    cmp r0, #0              /* n == 0 ? */
    beq base_case

                            /* recursive case: n * factorial(n-1) */
    sub r0, r0, #1          /* r0 <- n - 1 */
    bl factorial            /* r0 <- factorial(n-1) */
    mov r1, r4              /* r1 <- n */
    mul r0, r0, r1          /* r0 <- factorial(n-1) * n */
    b leave

base_case:
    mov r0, #1              /* factorial(0) is 1 */

leave:
    ldr r4, [sp], #+4       /* pop r4 */
    ldr lr, [sp], #+4       /* pop lr */
    bx lr                   /* return to the caller */

The above program can be improved via stmdb and ldmia as below:

/*
    factorial - recursive factorial, keeping n in r4
      In:  r0 = n
      Out: r0 = n! (lower 32 bits of the product)
    r4 and lr are saved and restored with one stmdb / ldmia pair.
*/
factorial:
    stmdb sp!, {r4, lr}     /* push r4 and lr; sp stays 8 byte aligned */
    mov r4, r0              /* r4 <- n, it survives the recursive call */

    cmp r0, #0              /* n == 0 ? */
    beq base_case

                            /* recursive case: n * factorial(n-1) */
    sub r0, r0, #1          /* r0 <- n - 1 */
    bl factorial            /* r0 <- factorial(n-1) */
    mov r1, r4              /* r1 <- n */
    mul r0, r0, r1          /* r0 <- factorial(n-1) * n */
    b leave

base_case:
    mov r0, #1              /* factorial(0) is 1 */

leave:
    ldmia sp!, {r4, lr}     /* pop r4 and lr */
    bx lr                   /* return to the caller */

Note that the order of the registers in the set of registers is not relevant(the order of elements in a set does not matter), but the processor will handle them in ascending order, so we should write them in ascending order. GNU assembler will emit a warning otherwise. Since lr is actually r14 it must go after r4. This means that our code is 100% equivalent to the previous one since r4 will end in a lower address than lr: remember our stack grows toward lower addresses, thus r4 which is in the top of the stack in factorial has the lowest address.

Remembering stmdb sp! and ldmia sp! may be a bit hard. Also, given that these two instructions will be relatively common when entering and leaving functions, GNU assembler provides two mnemonics push and pop for stmdb sp! and ldmia sp!, respectively. Note that these are not ARM instructions actually, just convenience names that are easier to remember.

The above program can be improved via push and pop as below:

/*
    factorial - recursive factorial, keeping n in r4
      In:  r0 = n
      Out: r0 = n! (lower 32 bits of the product)
    r4 and lr are saved and restored with push / pop.
*/
factorial:
    push {r4, lr}           /* push r4 and lr; sp stays 8 byte aligned */
    mov r4, r0              /* r4 <- n, it survives the recursive call */

    cmp r0, #0              /* n == 0 ? */
    beq base_case

                            /* recursive case: n * factorial(n-1) */
    sub r0, r0, #1          /* r0 <- n - 1 */
    bl factorial            /* r0 <- factorial(n-1) */
    mov r1, r4              /* r1 <- n */
    mul r0, r0, r1          /* r0 <- factorial(n-1) * n */
    b leave

base_case:
    mov r0, #1              /* factorial(0) is 1 */

leave:
    pop {r4, lr}            /* pop r4 and lr */
    bx lr                   /* return to the caller */

11 Predication

Predication on wikipedia

Instruction suffix

eq, ne, le, lt, ge, gt, al (always, can be omitted)

12 loops and the status register

adds(add and update the cpsr)

subs

bpl(branch if plus, branch if the negative flag is clear)

Operating 64 bit numbers

adcs(add with carry, update the cpsr)

sbc(subtract with carry)

multiplication