Automatic Generation of Peephole Superoptimizers — theory.stanford.edu/~sbansal/talks/msr-talk.pdf

Automatic Generation

of Peephole Superoptimizers

Sorav Bansal and Alex Aiken

Stanford University

{sbansal, aiken}@cs.stanford.edu

Produced with LATEX seminar style & PSTricks 1

SUPEROPTIMIZATION

SUPEROPTIMIZATION 3

SUPEROPTIMIZATION

Classical Meaning: To find the optimal code sequence for a

single, loop-free assembly sequence of instructions

TargetSequence

ArchitectureSpecification

SUPEROPTIMIZATION 3-A

SUPEROPTIMIZATION

TargetSequence

Exhaustive Search

SUPEROPTIMIZATION 3-B

SUPEROPTIMIZATION

TargetSequence

Exhaustive Search

OptimalSequence

SUPEROPTIMIZATION 3-C

SUPEROPTIMIZATION

Common Point-of-View

➜ Superoptimization is Sloowww..

➜ Superoptimization is useful only for occasional optimization of

the critical inner loop

SUPEROPTIMIZATION 4

SUPEROPTIMIZATION

movl %eax , %ecx

in %al ,shll %eax ,leal (%ecx), %eax

push (%ebx),movl %ecx , (%ebx)

shll (%ebx),addl $9 , (%ebx)

call func ,mov %eax , %ecx

shll %al ,out %al ,push (%ebx),mov %ecx , (%ebx)

push %eax ,mov %eax , %ecx

SUPEROPTIMIZATION 5

SUPEROPTIMIZATION

movl %eax , %ecx

Exhaustive Search

SUPEROPTIMIZATION

movl %eax , %ecx

Exhaustive Search

Optimization Table (columns: target | optimal)

1. movl %ecx (%ebx) leal 9(%ecx) %ebx

shll (%ebx) movl %ebx (%ebx)

addl $9 (%ebx)

SUPEROPTIMIZATION

movl %eax , %ecx

Exhaustive Search

addl $9 (%ebx)

2. shll %eax movl %ecx %eax

leal (%ecx) (%eax)

SUPEROPTIMIZATION 5-C

SUPEROPTIMIZATION

movl %eax , %ecx

mov %eax , %ecx

push (%ebx),movl %ecx , %eax

shll %ecx ,movl %ecx , %eax

Exhaustive Search

addl $9 (%ebx)

leal (%ecx) (%eax)

addl $9 (%ebx)

leal (%ecx) (%eax)

3. movl %ecx %eax shll %ecx

shll %ecx movl %ecx %eax

movl %ecx %eax

SUPEROPTIMIZATION 5-D

SUPEROPTIMIZATION

movl %eax , %ecx

mov %eax , %ecx

Exhaustive Search

addl $9 (%ebx)

leal (%ecx) (%eax)

addl $9 (%ebx)

leal (%ecx) (%eax)

movl %ecx %eax

SUPEROPTIMIZATION 5-E

SUPEROPTIMIZATION

movl %eax , %ecx

mov %eax , %ecx

Exhaustive Search

addl $9 (%ebx)

leal (%ecx) (%eax)

addl $9 (%ebx)

leal (%ecx) (%eax)

movl %ecx %eax

Optimization Database

SUPEROPTIMIZATION 5-F

SUPEROPTIMIZATION

movl %eax , %ecx

mov %eax , %ecx

Exhaustive Search

addl $9 (%ebx)

leal (%ecx) (%eax)

addl $9 (%ebx)

leal (%ecx) (%eax)

movl %ecx %eax

• To what length of instruction sequences can we scale?

SUPEROPTIMIZATION 5-G

SUPEROPTIMIZATION

movl %eax , %ecx

mov %eax , %ecx

Exhaustive Search

addl $9 (%ebx)

leal (%ecx) (%eax)

addl $9 (%ebx)

leal (%ecx) (%eax)

movl %ecx %eax

• To what length of instruction sequences can we scale?

• Can the database be made large enough to capture

most “traditional” optimizations?

SUPEROPTIMIZATION 5-H

Traversing a Linked List

struct node {
    int val;
    struct node *next;
};

void traverse(struct node *head) {
    while (head) {
        head->val *= 2;
        head = head->next;
    }
}

SUPEROPTIMIZATION 6

struct node {
    int val;
    struct node *next;
};

void traverse(struct node *head) {
    while (head) {
        head->val *= 2;
        head = head->next;
    }
}

Traversing a Linked List

Naive assembly code generated by gcc:
1: movl head, %edx
2: movl head, %eax
3: movl (%eax), %eax
4: sall %eax
5: movl %eax, (%edx)
6: movl head, %eax
7: movl 4(%eax), %eax
8: movl %eax, head
9: cmpl $0, head

Step One:
1: movl head, %edx
2: movl %edx, %eax
3: movl (%eax), %eax
4: sall %eax
5: movl %eax, (%edx)
6: movl head, %eax
7: movl 4(%eax), %eax
8: movl %eax, head
9: cmpl $0, head

SUPEROPTIMIZATION 7

Automatic Removal of Redundant Load

Step Two:
1: movl head, %edx
2: movl %edx, %eax
3: movl (%eax), %eax
4: sall %eax
5: movl %eax, (%edx)
6: movl head, %eax
7: movl 4(%eax), %eax
8: movl %eax, head
9: cmpl $0, %eax

SUPEROPTIMIZATION 8

Automatic Elimination of Memory Access

Step Three:
1: movl head, %edx
2:
3:
4: sall (%edx)
5:
6: movl head, %eax
7: movl 4(%eax), %eax
8: movl %eax, head
9: cmpl $0, %eax

Automatic Instruction Selection

SUPEROPTIMIZATION 9

Step Three:
1: movl head, %edx
2: sall (%edx)
3: movl head, %eax
4: movl 4(%eax), %eax
5: movl %eax, head
6: cmpl $0, %eax
7: 8: 9:

Step Four:
1: movl head, %edx
2: sall (%edx)
3: movl %edx, %eax
4: movl 4(%eax), %eax
5: movl %eax, head
6: cmpl $0, %eax
7: 8: 9:

Automatic Removal of Redundant Load

SUPEROPTIMIZATION 10

Step Four:
1: movl head, %edx
2: sall (%edx)
3: movl %edx, %eax
4: movl 4(%eax), %eax
5: movl %eax, head
6: cmpl $0, %eax
7: 8: 9:

Step Five:
1: movl head, %edx
2: sall (%edx)
3:
4: movl 4(%edx), %eax
5: movl %eax, head
6: cmpl $0, %eax
7: 8: 9:

Automatic Copy Propagation

Step Five:
1: movl head, %edx
2: sall (%edx)
3: movl 4(%edx), %eax
4: movl %eax, head
5: cmpl $0, %eax
6: 7: 8: 9:

Step Six:
1: movl head, %edx
2: sall (%edx)
3: movl 4(%edx), %edx
4: movl %edx, head
5: cmpl $0, %edx
6: 7: 8: 9:

Automatic Register Usage Optimization

Our optimizer:
1: movl head, %edx
2: sall (%edx)
3: movl 4(%edx), %edx
4: movl %edx, head
5: cmpl $0, %edx
6: 7: 8: 9:

gcc-optimized:
1: sall (%edx)
2: movl 4(%edx), %edx
3: cmpl $0, %edx
4: 5: 6: 7: 8: 9:

Our optimizer:
1: movl head, %edx
2: sall (%edx)
3: movl 4(%edx), %edx
4: movl %edx, head
5: cmpl $0, %edx
6: 7: 8: 9:

gcc-optimized:
1: sall (%edx)
2: movl 4(%edx), %edx
3: cmpl $0, %edx
4: 5: 6: 7: 8: 9:

Global Optimizations involving loop carried dependencies

cannot be handled by peephole optimizations

1: movl head, %edx
2: movl %edx, %eax
3: movl (%eax), %eax
4: sall %eax
5: movl %eax, (%edx)
6: movl head, %eax
7: movl 4(%eax), %eax
8: movl %eax, head
9: cmpl $0, head

1: movl head, %edx
2: movl %edx, %eax
3: movl (%eax), %eax
4: sall %eax
5: movl %eax, (%edx)
6: movl head, %eax
7: movl 4(%eax), %eax
8: movl %eax, head
9: cmpl $0, %eax

1: movl head, %edx
2: sall (%edx)
3: movl head, %eax
4: movl 4(%eax), %eax
5: movl %eax, head
6: cmpl $0, %eax
7: 8: 9:

1: movl head, %edx
2: sall (%edx)
3: movl 4(%edx), %edx
4: movl %edx, head
5: cmpl $0, %edx
6: 7: 8: 9:

1: movl head, %edx
2: sall (%edx)
3: movl 4(%edx), %eax
4: movl %eax, head
5: cmpl $0, %eax
6: 7: 8: 9:

1: movl head, %edx
2: sall (%edx)
3: movl %edx, %eax
4: movl 4(%eax), %eax
5: movl %eax, head
6: cmpl $0, %eax
7: 8: 9:

An automatically generated “peephole” optimizer can

capture many traditional optimizations

➜ Removal of Redundant Loads

➜ Instruction Selection

➜ Copy Propagation

➜ Reducing Register Usage

SYSTEM ARCHITECTURE

SYSTEM ARCHITECTURE 17

SYSTEM ARCHITECTURE

Training

Programs

SYSTEM ARCHITECTURE 17-A

SYSTEM ARCHITECTURE

Training

Programs

Harvester

SYSTEM ARCHITECTURE 17-B

SYSTEM ARCHITECTURE

Training

Programs

Harvester Canonicalizer

SYSTEM ARCHITECTURE 17-C

SYSTEM ARCHITECTURE

Training

Programs

Harvester Canonicalizer Fingerprinter

SYSTEM ARCHITECTURE 17-D

SYSTEM ARCHITECTURE

Training

Programs

Fingerprint Hashtable

SYSTEM ARCHITECTURE 17-E

SYSTEM ARCHITECTURE

Training

Programs

SYSTEM ARCHITECTURE 17-F

SYSTEM ARCHITECTURE

Training

Programs

Fingerprinter

SYSTEM ARCHITECTURE 17-G

SYSTEM ARCHITECTURE

Training

Programs

Fingerprinter match?

SYSTEM ARCHITECTURE 17-H

SYSTEM ARCHITECTURE

Training

Programs

Fingerprinter match?

Boolean Equivalence Test

SYSTEM ARCHITECTURE 17-I

HARVESTER

Harvests only loop-free instruction sequences that have a

single entry point

- no middle instruction is a jump target

0: movl %esp, %ebp

1: movl 8(%ebp), %edx

2: sall %edx

3: movl %eax, (%edx)

: . . .

i : jne instruction 1

HARVESTER 18

HARVESTER

Harvests only loop-free instruction sequences that have a

single entry point

- no middle instruction is a jump target

0: movl %esp, %ebp

1: movl 8(%ebp), %edx

2: sall %edx

3: movl %eax, (%edx)

: . . .

i : jne instruction 1

HARVESTER 18-A

CANONICALIZER

movl %esp, %ebp

movl %ebp, %esp   ⇒   movl %esp, %ebp

CANONICALIZER 19

CANONICALIZER

movl %esp, %ebp

movl %eax, %ebp

movl %ebp, %eax   ⇒   movl %eax, %ebp

CANONICALIZER 19-A

CANONICALIZER

movl %esp, %ebp

movl %eax, %ebp

movl %ebp, %eax   ⇒   movl %eax, %ebp

movl %esp, %esi

movl %esi, %esp   ⇒   movl %esp, %esi

CANONICALIZER 19-B

CANONICALIZER

movl %esp, %ebp

There are 56 variants of this rule

CANONICALIZER 20

CANONICALIZER

movl %esp, %ebp

There are 56 variants of this rule

movl reg1, reg2

movl reg2, reg1   ⇒   movl reg1, reg2

Reduce it to one rule

CANONICALIZER 20-A

CANONICALIZER

addl $1234, %ebp

addl $5678, %ebp   ⇒   addl $6912, %ebp

CANONICALIZER 21

CANONICALIZER

addl $1234, %ebp

addl $5678, %ebp   ⇒   addl $6912, %ebp

Use symbolic constants to deal with immediates

addl cons0, %ebp

addl cons1, %ebp   ⇒   addl cons0+cons1, %ebp

CANONICALIZER 21-A

CANONICALIZER

Rename the target sequence to get it in its canonical form:

Registers

➜ First register used is renamed to r0

➜ Second distinct register used is renamed to r1

➜ and so on...

Constants

➜ First constant used is renamed to c0 (a pre-defined constant)

➜ Second distinct constant is renamed to c1

Memory Addresses

➜ Rename a memory access in s-i-b form to a dereference of the

canonical name of its base (and index) register

CANONICALIZER 22

CANONICALIZER

Only instruction sequences in canonical form are enumerated

0 1 0 1 0 + 1 0 - 1 0 + 1 0 - 1

CANONICALIZER 23

CANONICALIZER

movl r3, r4

movl r5, r3

0 1 0 1 0 + 1 0 - 1 0 + 1 0 - 1

CANONICALIZER 23-A

CANONICALIZER

movl r3, r4

movl r5, r3

movl r0, r1

movl r2, r0

0 1 0 1 0 + 1 0 - 1 0 + 1 0 - 1

CANONICALIZER 23-B

CANONICALIZER

movl r3, r4

movl r5, r3

movl r0, r1

movl r2, r0

Special Constants are enumerated for immediate operands

➜ c0 ; c1

➜ c0 + c1 ; c0 - c1

➜ c0 + c1 ; c0 - c1

CANONICALIZER 23-C

CANONICALIZER

movl r3, r4

movl r5, r3

movl r0, r1

movl r2, r0

Special Constants are enumerated for immediate operands

➜ c0 ; c1

➜ c0 + c1 ; c0 - c1

➜ c0 + c1 ; c0 - c1

LHS of an optimization rule is always in canonical form

CANONICALIZER 23-D

CANONICALIZER

addl $47, %ebp

addl $53, %ebp

addl c0, %r0

addl c1, %r0

addl $100, %ebp addl c0+c1, %r0

Canonicalize

Optimization

Database

Un-Canonicalize

CANONICALIZER 24

FINGERPRINTER

Execute the instruction sequences on random-bit states

Use hash of result to compute a fingerprint

FINGERPRINTER 25

FINGERPRINTER

Memory is approximated by a 256 byte array

➜ Two equivalent accesses guaranteed to access the same

location in array

FINGERPRINTER 25-A

FINGERPRINTER

Memory is approximated by a 256 byte array

➜ Two equivalent accesses guaranteed to access the same

location in array

Fingerprint computed by direct execution on hardware

➜ Fingerprint of a sequence computed in <3µs

FINGERPRINTER 25-B

SYSTEM ARCHITECTURE

Training

Programs

Fingerprinter match?

SYSTEM ARCHITECTURE 26

SYSTEM ARCHITECTURE

Training

Programs

Fingerprinter match?

Boolean Equivalence Test

SYSTEM ARCHITECTURE 26-A

EQUIVALENCE TEST

Execution Test

(fast)

Boolean Test

(slower)

EQUIVALENCE TEST 27

EQUIVALENCE TEST

Execution Test

(fast)

Boolean Test

(slower)

Execution Test

Fingerprint on many different states

EQUIVALENCE TEST 27-A

EQUIVALENCE TEST

Execution Test

(fast)

Boolean Test

(slower)

Execution Test

Fingerprint on many different states

Boolean Test

Use SAT Solver

EQUIVALENCE TEST 27-B

BOOLEAN TEST

Machine state represented by a set of registers and a model

of memory

BOOLEAN TEST 28

BOOLEAN TEST

of memory

Memory model

➜ Map from address expressions to data expressions

➜ Aliasing handled by comparing addresses of multiple accesses

BOOLEAN TEST 28-A

BOOLEAN TEST

of memory

Memory model

Instructions encoded as boolean circuits

input state output state

BOOLEAN TEST 28-B

BOOLEAN TEST

of memory

Memory model

Instructions encoded as boolean circuits

input state output state

Use a SAT Solver to compute equivalence

BOOLEAN TEST 28-C

CONTEXT SENSITIVITY

Equivalence of two instruction sequences is defined under

the set of registers live beyond the sequence itself

CONTEXT SENSITIVITY 29

EXPERIMENTAL RESULTS

Length | Original Search Space | Search Space After Canonicalize and Prune

1 | 5,606 | 654

2 | 31 m | 1.09 m

3 | 176 b | 2.8 b

Exhaustively enumerate sequences up to length 3

EXPERIMENTAL RESULTS 30

Integer addition

for (i = 0; i < 64; i++)
    sum += a[i];

Integer addition

for (i = 0; i < 64; i++)
    sum += a[i];

psadbw: sum of absolute differences of 8 consecutive

integers

psubb %mm0, %mm0

psadbw &a[i], %mm0

movd %mm0, sum

3x faster

EXPERIMENTAL RESULTS 31-A

Comparison c[i] = (a[i] < b[i]) ? c0 : c1 7x

Minimum c[i] = (a[i] < b[i]) ? a[i] : b[i] 8x

Pixel-difference sum += abs(a[i]− b[i]) 10x

XOR c[i] = a[i]⊕ b[i] 2x

Sprite Copy c[i] = (a[i] == 0)? b[i] : a[i] 2x

MMX and conditional-move instructions are still under-used

We evaluate our optimizer on SPEC benchmarks compiled

using gcc

Use two cost functions

➜ Codesize

➜ Runtime

• Number of memory accesses

• Number of jump instructions

• Instruction costs

Program Codesize

Improvement

gzip 3.95%

mcf 5.86%

crafty 1.71%

bzip2 4.58%

gcc 1.12%

twolf 1.47%

parser 3.06%

Codesize improvement on executables

already optimized for size (-Os)

Codesize Improvement: 1 – 6%

Program Instruction Count Improvement

gzip 4.16%

mcf 3.73%

crafty 2.19%

bzip2 4.11%

gcc 2.44%

twolf 2.17%

parser 3.84%

Instruction count improvement on

optimized executables (-O2)

Instruction Count Improvement: 2 – 4%

Speedup: 1 – 5%

Number of Distinct Optimization Rules Learnt

Codesize 3000

Runtime 2100

Re-use of Optimization Rules

18,679

Frequency of Use

201-1000

51-200

Total: 28,593 optimizations

Initial Machine State

r0r1r2r3r4memory

Target Machine State

r0r1r2r3r4memory

Exhaustive Search over

Length-3 sequences

r0r1r2r3r4memory

Length-3 sequences

EXPERIMENTAL RESULTS 37-B

Plain Text Cipher Text

Length-3 sequences

56-bit keys

$CT = \mathrm{Encrypt}_{56}(PT)$

If $\mathrm{Key}_{56} = \mathrm{Key}_{28} \circ \mathrm{Key}_{28}$

s.t. $CT = \mathrm{Encrypt}^{1}_{28}(\mathrm{Encrypt}^{2}_{28}(PT))$

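$CT = \mathrm{Encrypt}_{56}(PT)$

If $\mathrm{Key}_{56} = \mathrm{Key}_{28} \circ \mathrm{Key}_{28}$

s.t. $CT = \mathrm{Encrypt}^{1}_{28}(\mathrm{Encrypt}^{2}_{28}(PT))$

$\mathrm{Decrypt}^{1}_{28}(CT) = \mathrm{Encrypt}^{2}_{28}(PT)$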

EXPERIMENTAL RESULTS 39-B

MEET-IN-MIDDLE

Encrypt using all

possible 28-bit keys

Decrypt using all

MEET-IN-MIDDLE 40

MEET-IN-MIDDLE

Encrypt using all

Decrypt using all

MEET-IN-MIDDLE 40-A

MEET-IN-MIDDLE

Encrypt using all

Decrypt using all

$O(2^{28})$

MEET-IN-MIDDLE 40-B

MEET-IN-MIDDLE

Initial

r0r1r2r3r4memory

Target

r0r1r2r3r4memory

Enumerate all

length n sequences

Enumerate all

length m inverse-sequences

MEET-IN-MIDDLE 41

MEET-IN-MIDDLE

Initial

r0r1r2r3r4memory

Target

r0r1r2r3r4memory

Enumerate all

length n sequences

Enumerate all

MEET-IN-MIDDLE 41-A

MEET-IN-MIDDLE

Initial

r0r1r2r3r4memory

Target

r0r1r2r3r4memory

Enumerate all

length n sequences

Enumerate all

Effective Enumerated Length: m + n

MEET-IN-MIDDLE 41-B

MEET-IN-MIDDLE

What is the inverse of an instruction sequence?

How fast can we match?

MEET-IN-MIDDLE 42

INVERSE OF AN INSTRUCTION SEQUENCE

Initial

r0r1r2r3r4memory

Finalr0r1r2r3r4

memoryflagsExecute

Sequence

Tightest Approximation

to Initial

r0r1r2r3r4memory

flags

Execute Sequence$^{-1}$

Approximate using don’t-know bits

INVERSE OF AN INSTRUCTION SEQUENCE 43

INSTRUCTION INVERSE

Instruction Inverse

ddd d1d11d d0d0dd0 ddd d

INSTRUCTION INVERSE 44

INSTRUCTION INVERSE

Instruction Inverse

add r0, r1   →   sub r0, r1; flags ← d

dd d1d11d d0d0dd0 ddd d

INSTRUCTION INVERSE 44-A

INSTRUCTION INVERSE

Instruction Inverse

add r0, r1   →   sub r0, r1; flags ← d
add r0, r0   →   shr r0; r0[31] ← d; flags ← d

d d1d11d d0d0dd0 ddd d

INSTRUCTION INVERSE 44-B

INSTRUCTION INVERSE

Instruction Inverse

add r0, r1   →   sub r0, r1; flags ← d
add r0, r0   →   shr r0; r0[31] ← d; flags ← d
mov r0, r1   →   r0 ← d

d1d11d d0d0dd0 ddd d

INSTRUCTION INVERSE 44-C

INSTRUCTION INVERSE

Instruction Inverse

add r0, r1        →   sub r0, r1; flags ← d
add r0, r0        →   shr r0; r0[31] ← d; flags ← d
mov r0, r1        →   r0 ← d
and r0, $010110   →   r0 ← r0 & d1d11d; flags ← d

0d0dd0 ddd d

INSTRUCTION INVERSE 44-D

INSTRUCTION INVERSE

Instruction Inverse